Scrapy stops scraping yet continues to run
My project uses SerpAPI to generate a list of sites, scrapes them for any about/contact pages, and then scrapes the emails from those pages.
It had been working completely fine until I decided to pickle the generated list of URLs and then load that list into my spider.
My main.py:
import pickle
import re
from pathlib import Path

from serpapi import GoogleSearch

# Search Google using SerpAPI
search = GoogleSearch({"q": input("What are you searching? "), "location": input("Where is the location? "),
                       "api_key": input("What is your API key? "), "output": "html",
                       "num": "200", "gl": "us"})

# Filter the HTML response for links
results = search.get_dict()
organic_results = results['organic_results']
links = []
for result in organic_results:
    links.append(str(result['link']))

# Filter links to remove unwanted sites
to_remove = [
    'wikipedia', 'yelp', 'google', 'britannica', 'tripadvisor', 'amazon', 'ebay', 'craigslist', 'apple',
    'microsoft', 'homeadvisor', 'bing', 'businessinsider'
]
links = [i for i in links if not re.search("|".join(to_remove), i)]
links = list(set(links))  # deduplicate the links

# Pickle the list and dump it into a txt file
base_path = Path(__file__).parent
file_path = (base_path / "../sites1.txt").resolve()
with open(file_path, 'wb') as fp:
    pickle.dump(links, fp)

# process = CrawlerProcess(get_project_settings())
#
# process.crawl(EmailSpider)
#
# process.start()
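As a quick sanity check of the handoff between the two scripts (this snippet is not part of the project, just an illustration), the pickled file can be read back with the same pickle.load call the spider uses:

# standalone check (hypothetical, not in the project)
import pickle
from pathlib import Path

file_path = (Path(__file__).parent / "../sites1.txt").resolve()
with open(file_path, 'rb') as fp:
    print(pickle.load(fp))  # should print the deduplicated list of links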
Spider:
import pickle
import re
from pathlib import Path

import tldextract
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import EmailscrapeItem


class EmailSpider(CrawlSpider):
    name = 'email'
    start_urls = []
    allowed_domains = []

    # Load the pickled list of links produced by main.py
    base_path = Path(__file__).parents[2]
    file_path = (base_path / "../sites1.txt").resolve()
    with open(file_path, 'rb') as fp:
        for i in pickle.load(fp):
            start_urls.append(i)

    # Restrict crawling to the registered domain of each start URL
    for url in start_urls:
        extracted_domain = tldextract.extract(url)
        domain = "{}.{}".format(extracted_domain.domain, extracted_domain.suffix)
        allowed_domains.append(domain)

    rules = [
        Rule(LinkExtractor(allow=r'contact/'), callback='parse'),
        Rule(LinkExtractor(allow=r'contact-us/'), callback='parse'),
        Rule(LinkExtractor(allow=r'about'), callback='parse'),
        Rule(LinkExtractor(allow=r'about-us'), callback='parse')
    ]

    def parse(self, response, **kwargs):
        items = EmailscrapeItem()
        # Non-capturing groups so findall returns whole addresses
        regex = re.compile(
            r'(?:[A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(?:\.[A-Za-z]{2,})+'
        )
        # Extract emails from mailto: links
        for res in response.xpath("//a[starts-with(@href, 'mailto')]/text()"):
            items['email'] = res.get()
            yield items
        # Extract emails from the raw HTML using the regex
        html = str(response.text)
        mail_list = re.findall(regex, html)
        for mail in mail_list:
            items['email'] = mail
            yield items
And pipelines:
import re

from scrapy import signals
from scrapy.exporters import CsvItemExporter
from scrapy.exceptions import DropItem


class EmailscrapePipeline(object):
    def __init__(self):
        self.exporter = None
        self.email_list = set()
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('emails.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        regex = re.compile(
            r'(?:[A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(?:\.[A-Za-z]{2,})+'
        )
        if not item['email']:
            raise DropItem("Item is None or empty")
        if not re.search(regex, str(item['email'])):
            raise DropItem("Item is not an email.")
        if item['email'] in self.email_list:
            raise DropItem("Duplicate item email found: %s" % item)
        else:
            self.email_list.add(item['email'])
        return item
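For completeness, this pipeline only runs if it is enabled in the project settings; a minimal sketch, assuming the Scrapy project package is named emailscrape (inferred from EmailscrapeItem/EmailscrapePipeline, not shown in the post):

# settings.py (sketch; the emailscrape package name is an assumption)
ITEM_PIPELINES = {
    'emailscrape.pipelines.EmailscrapePipeline': 300,
}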
No errors appear when I run the spider from the command line.
"Most" sites return a DEBUG (200) response.
I've reduced the timeout to 15 seconds, so I'm not sure why it freezes. If anyone could point me in a good direction, that'd be great.
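For reference, the spider is started with the standard Scrapy CLI (scrapy crawl email), and the timeout reduction mentioned above would live in the project settings. A minimal sketch, assuming the setting in question is Scrapy's built-in DOWNLOAD_TIMEOUT (the post only says "Timeout"):

# settings.py (excerpt; DOWNLOAD_TIMEOUT is an assumption)
DOWNLOAD_TIMEOUT = 15  # the downloader gives up on each request after 15 seconds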
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow