Remove duplicates based on a unique ID

I want to remove duplicate data based on unique IDs. Each listing on the site has a unique ID, so I want to filter the scraped data to drop any duplicates.

I have looked at the Scrapy documentation, specifically the Duplicates filter example in the item pipeline docs.

I have tried to implement a similar pipeline to remove the duplicates; however, I am not sure how to get it to work.
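
For reference, the duplicates filter from the Scrapy docs looks roughly like this (it assumes each item carries an id field and reads it through ItemAdapter):

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class DuplicatesPipeline:

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        # ItemAdapter gives uniform field access for dicts and Item classes
        adapter = ItemAdapter(item)
        if adapter["id"] in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        else:
            self.ids_seen.add(adapter["id"])
            return item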

Here's what I have tried:

import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import json
from itertools import zip_longest
from collections import defaultdict
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

headers = {
    'authority': 'www.theparking.eu',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'accept': '*/*',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'x-requested-with': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
    'sec-ch-ua-platform': '"macOS"',
    'origin': 'https://www.theparking.eu',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    #'referer': 'https://www.theparking.eu/used-cars/used-cars/',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
class DuplicatesPipeline:

    def __init__(self):
        # self.ids_seen = set()
        self.titles_seen = set()

    def process_item(self, unique_id, spider):
        if unique_id in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % unique_id)
        else:
            self.titles_seen.add(unique_id)
            return unique_id

class Countryitem(scrapy.Item):
    make = Field(output_processor = TakeFirst())
    unique_id = Field(output_processor = TakeFirst())
    page_number = Field(output_processor = TakeFirst())

class CountrySpider(scrapy.Spider):
    name = "country"
    test_dict={'country_id': [4,5,109,7,6,8,87],
               'country': ['australia','austria','bahamas','belarus','belgium','bosnia and herzegovina','brasil'],
               'make': [20, 13, 131, 113, 32, 62, 104],
               'model': [1108, 4655, 687, 492, 499, 702, 6143],
               'engine': [5, 11, 10, 7, 14, 21, 170]}
    #for links, pages, id, country in zip(url_data.links, url_data.pages, url_data.id, url_data.country):

    def start_requests(self):
        for id_ in self.test_dict['country_id']:
            for id_marque in self.test_dict['make']:
                for models in self.test_dict['model']:
                    for engine in self.test_dict['engine']:            
                        for page in range(1, 10000):
                            yield scrapy.FormRequest(
                                url = f'https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_pays%3D{id_}%26id_marque%3D{id_marque}%26id_modele%3D{models}%26id_motorisation%3D{engine}',
                                method="POST",
                                callback = self.parse,
                                formdata =  {
                                    'ajax': '{"tab_id":"t0","cur_page":%s,"cur_trie":"distance","query":"","critere":{"id_pays":[%s],"id_marque":[%s], "id_modele":[%s], "id_motorisation":[%s]},"sliders":{"prix":{"id":"#range_prix","face":"prix","max_counter":983615,"min":"1","max":"400000"},"km":{"id":"#range_km","face":"km","max_counter":1071165,"min":"1","max":"500000"},"millesime":{"id":"#range_millesime","face":"millesime","max_counter":1163610,"min":"1900","max":"2022"}},"req_num":1,"nb_results":"11795660","current_location_distance":-1,"logged_in":false}' % (page,id_, id_marque, models, engine),
                                    'tabs': '["t0"]'
                                    },
                                headers=headers,
                                cb_kwargs = {'page_number': page},
                            )

    def parse(self, response, page_number):
        container = json.loads(response.text)
        test = container['#lists']
        soup = BeautifulSoup(test, 'lxml')
        for i in soup:
            
            carMake = i.select("a.external.tag_f_titre > span.title-block.brand:nth-child(1)")
            carUnique = i.select('li[tref]')

            for make, unique in zip_longest(
                carMake, carUnique
                ):
                loader = ItemLoader(Countryitem())
                loader.add_value("unique_id", unique['tref'])
                loader.add_value("page_number", page_number)

                if make is not None:
                    loader.add_value('make', make.text)
                else:
                    loader.add_value('make', "None")

                yield loader.load_item()
        
process = CrawlerProcess(
    settings = {
        'FEED_URI':'park.jl',
        'FEED_FORMAT':'jsonlines'
    }
)
process.crawl(CountrySpider)
process.start()


Solution 1:[1]

class DuplicatesPipeline:

    def __init__(self):
        self.titles_seen = set()

    def process_item(self, item, spider):
        if item['unique_id'] in self.titles_seen:
            raise DropItem("Duplicate item found: %s" % item['unique_id'])
        else:
            self.titles_seen.add(item['unique_id'])
            return item

The key fix is that process_item receives the whole item, not just the unique_id field, so the ID has to be read from item['unique_id']. Raising DropItem makes Scrapy discard that item before it reaches the feed export. Also register the pipeline in the spider's custom_settings:

custom_settings = {
    'ITEM_PIPELINES': {
        'myproject.path_to_your_file.DuplicatesPipeline': 300
    }
}
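
The value in ITEM_PIPELINES is the dotted import path of the pipeline class, and 300 is its order (pipelines run from low to high in the 0-1000 range). The myproject.path_to_your_file part applies when the pipeline lives in a Scrapy project; since the script in the question is a single file run directly with CrawlerProcess, its module is __main__, so a minimal sketch for that case would be:

class CountrySpider(scrapy.Spider):
    name = "country"
    custom_settings = {
        'ITEM_PIPELINES': {
            # DuplicatesPipeline is defined in the same script,
            # and a script run directly imports as __main__
            '__main__.DuplicatesPipeline': 300,
        }
    }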

Sources

[1] Stack Overflow, licensed under CC BY-SA 3.0, per Stack Overflow's attribution requirements.