Extract links with Scrapy
Using Scrapy’s LinkExtractor class, you can extract the links from any page you want.
What are Link Extractors?
“A link extractor is an object that extracts links from responses.”
# -*- coding: utf-8 -*-
# +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
# |r|e|d|a|n|d|g|r|e|e|n|.|c|o|.|u|k|
# +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
import scrapy
from scrapy import Spider
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
import os
class Ebayspider(Spider):
    """Spider that records matching links found on the eBay UK deals page.

    For every response handled by :meth:`parse`, the links accepted by
    ``self.link_extractor`` are appended to ``ebay2.txt``, each one on its
    own line, using the string form of the extracted link object.
    """

    name = "ebayspider"
    allowed_domains = ["ebay.co.uk"]
    start_urls = ["https://www.ebay.co.uk/deals"]

    # Start from a clean output file. NOTE: this statement lives in the class
    # body, so it runs once when the class is defined (i.e. at import time),
    # not when the crawl starts. A missing file is not an error.
    try:
        os.remove("ebay2.txt")
    except OSError:
        pass

    # Per-spider settings: keep the crawl slow and polite.
    custom_settings = {
        "CONCURRENT_REQUESTS": 2,
        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_DEBUG": True,
        "DOWNLOAD_DELAY": 1,
    }

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and build the link extractor.

        Positional and keyword arguments are forwarded to ``Spider.__init__``
        so that the base class is set up properly and spider arguments
        supplied at construction time are accepted instead of raising
        ``TypeError``.
        """
        super().__init__(*args, **kwargs)
        self.link_extractor = LinkExtractor(
            # ``allow`` is a regular expression, so the dots are escaped to
            # match the literal host name rather than any character.
            allow=r"https://www\.ebay\.co\.uk/e/fashion/up-to-50-off-superdry",
            unique=True,
        )

    def parse(self, response):
        """Append every link extracted from ``response`` to ``ebay2.txt``.

        Nothing is written (and the file is not created) when the response
        contains no matching links.
        """
        links = self.link_extractor.extract_links(response)
        if not links:
            return
        # Open the file once per response instead of once per link; UTF-8 is
        # explicit so non-ASCII link text does not depend on the platform
        # default encoding.
        with open("ebay2.txt", "a", encoding="utf-8") as f:
            f.writelines(f"\n{link}" for link in links)
Summary
The above code gets all of the hrefs very quickly and gives you the flexibility to omit or include very specific attributes.
Watch the video Extract Links | how to scrape website urls | Python + Scrapy Link Extractors