Scrapy stops scraping yet continues to run
My project uses SerpAPI to generate a list of sites, scrapes them for any about/contact pages, and then scrapes the emails from those pages.
It had been working completely fine until I decided to pickle the generated list of URLs and then load that list into my spider.
My main.py:
import pickle
import re
from pathlib import Path

from serpapi import GoogleSearch

# Search Google using SerpAPI
search = GoogleSearch({"q": input("What are you searching? "), "location": input("Where is the location? "),
                       "api_key": input("What is your API key? "), "output": "html",
                       "num": "200", "gl": "us"})

# Filter the HTML response for links
results = search.get_dict()
organic_results = results['organic_results']
links = []
for result in organic_results:
    links.append(str(result['link']))

# Filter links to remove unwanted sites
to_remove = [
    'wikipedia', 'yelp', 'google', 'britannica', 'tripadvisor', 'amazon', 'ebay', 'craigslist', 'apple',
    'microsoft', 'homeadvisor', 'bing', 'businessinsider'
]
links = [i for i in links if not re.search("|".join(to_remove), i)]
links = list(set(links))  # deduplicate the links

# Pickle the list and dump it into a txt file
base_path = Path(__file__).parent
file_path = (base_path / "../sites1.txt").resolve()
with open(file_path, 'wb') as fp:
    pickle.dump(links, fp)

# process = CrawlerProcess(get_project_settings())
#
# process.crawl(EmailSpider)
#
# process.start()
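As a quick sanity check of the handoff between the two scripts (this snippet is not part of the project, just an illustration), the pickled file can be read back with the same pickle.load call the spider uses:

# standalone check (hypothetical, not in the project)
import pickle
from pathlib import Path

file_path = (Path(__file__).parent / "../sites1.txt").resolve()
with open(file_path, 'rb') as fp:
    print(pickle.load(fp))  # should print the deduplicated list of links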
Spider:
import pickle
import re
from pathlib import Path

import tldextract
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import EmailscrapeItem


class EmailSpider(CrawlSpider):
    name = 'email'
    start_urls = []
    allowed_domains = []

    # Load the pickled list of links produced by main.py
    base_path = Path(__file__).parents[2]
    file_path = (base_path / "../sites1.txt").resolve()
    with open(file_path, 'rb') as fp:
        for i in pickle.load(fp):
            start_urls.append(i)

    # Restrict crawling to the registered domain of each start URL
    for url in start_urls:
        extracted_domain = tldextract.extract(url)
        domain = "{}.{}".format(extracted_domain.domain, extracted_domain.suffix)
        allowed_domains.append(domain)

    rules = [
        Rule(LinkExtractor(allow=r'contact/'), callback='parse'),
        Rule(LinkExtractor(allow=r'contact-us/'), callback='parse'),
        Rule(LinkExtractor(allow=r'about'), callback='parse'),
        Rule(LinkExtractor(allow=r'about-us'), callback='parse')
    ]

    def parse(self, response, **kwargs):
        items = EmailscrapeItem()
        # Non-capturing groups so findall returns whole addresses
        regex = re.compile(
            r'(?:[A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(?:\.[A-Za-z]{2,})+'
        )
        # Extract emails from mailto: links
        for res in response.xpath("//a[starts-with(@href, 'mailto')]/text()"):
            items['email'] = res.get()
            yield items
        # Extract emails from the raw HTML using the regex
        html = str(response.text)
        mail_list = re.findall(regex, html)
        for mail in mail_list:
            items['email'] = mail
            yield items
And pipelines:
import re

from scrapy import signals
from scrapy.exporters import CsvItemExporter
from scrapy.exceptions import DropItem


class EmailscrapePipeline(object):
    def __init__(self):
        self.exporter = None
        self.email_list = set()
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('emails.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        regex = re.compile(
            r'(?:[A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(?:\.[A-Za-z]{2,})+'
        )
        if not item['email']:
            raise DropItem("Item is None or empty")
        if not re.search(regex, str(item['email'])):
            raise DropItem("Item is not an email.")
        if item['email'] in self.email_list:
            raise DropItem("Duplicate item email found: %s" % item)
        else:
            self.email_list.add(item['email'])
        return item
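For completeness, this pipeline only runs if it is enabled in the project settings; a minimal sketch, assuming the Scrapy project package is named emailscrape (inferred from EmailscrapeItem/EmailscrapePipeline, not shown in the post):

# settings.py (sketch; the emailscrape package name is an assumption)
ITEM_PIPELINES = {
    'emailscrape.pipelines.EmailscrapePipeline': 300,
}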
No errors appear when I run the spider from the command line.
"Most" sites return a DEBUG (200) response.
I've reduced the timeout to 15 seconds, so I'm not sure why it freezes. If anyone could point me in a good direction, that'd be great.
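For reference, the spider is started with the standard Scrapy CLI (scrapy crawl email), and the timeout reduction mentioned above would live in the project settings. A minimal sketch, assuming the setting in question is Scrapy's built-in DOWNLOAD_TIMEOUT (the post only says "Timeout"):

# settings.py (excerpt; DOWNLOAD_TIMEOUT is an assumption)
DOWNLOAD_TIMEOUT = 15  # the downloader gives up on each request after 15 seconds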
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow