'Removing duplicates from response
I am looking to try and remove duplicate timestamps for when I scrape the following site for data on BTC. I want to remove the duplicates after every time requests are sent, so that scrapy can remove the duplicates.
However, I cannot understand how the duplicates are removed when it involves the json response. I had thought it would remove the duplicates when I put the json into a dataframe, however it will not do this.
Here's the items pipeline:
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class DuplicatesPipeline:
def __init__(self):
self.ids_seen = set()
def process_item(self, item, spider):
adapter = ItemAdapter(item)
for time in adapter['data']['timestamp']:
if time in self.ids_seen:
raise DropItem(f"Duplicate item found: {item!r}")
else:
self.ids_seen.add(time)
return item
The pipelines seems to not produce any errors however it is not removing the duplicate timestamps, so it is not working.
Here's the script that I am using to grab the data.
import scrapy
import numpy as np
from collections import defaultdict
import pandas as pd
import time
def storeBitcoin(response):
bitcoin = defaultdict(list)
resp = response.json()['data']['KdataInfo']
for row in range(0, len(resp)):
bitcoin['timestamp'].append(resp[row]['T'])
bitcoin['open'].append(resp[row]['O'])
bitcoin['closed'].append(resp[row]['C'])
bitcoin['high'].append(resp[row]['H'])
bitcoin['low'].append(resp[row]['L'])
return bitcoin
sec_begin = [55, 75]
sec_end = [15, 35]
class BtcSpider(scrapy.Spider):
name = 'btcc2'
start_urls = ['https://www.btcc.com/quot/history?']
custom_settings = {
'DOWNLOAD_DELAY':0.2
}
def start_requests(self):
for urls in self.start_urls:
for begin, end in zip(sec_begin, sec_end):
yield scrapy.FormRequest(
url=urls,
method="GET",
formdata = {
'codeid': '3223607',
'token': 'm19JU98eIFQjRgwsf9b3eXXI1jmDSW9N',
'interval': '35',
'from': f'16517697{begin}',
'to': f'16518562{end}',
},
callback = self.parse,
)
def parse(self, response):
data = pd.DataFrame(storeBitcoin(response))
data = data.drop_duplicates(subset=['timestamp'])
yield data
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
