How can I sort a JSON file by one element and remove the duplicates from it?
We are supposed to extract the Billboard Artist 100 chart into a JSON file with Scrapy (Python), grab the first 5 artists on each page, sort them alphabetically, and remove the duplicates. Then, load them into a new Google Sheet. This is what I've done so far:
import json
import scrapy
import datetime
from datetime import datetime
from datetime import timedelta, date
class BillboardWeeklySpider(scrapy.Spider):
    """Scrape the top-5 artists from the Billboard Artist 100 chart for the
    week shown on the landing page and the three preceding weeks.

    Yields one item per artist: ``{'name', 'rank', 'date'}`` where ``date``
    is the chart-page URL the entry was scraped from.
    """

    name = 'billboard-weekly'
    allowed_domains = ['www.billboard.com']
    start_urls = ['https://www.billboard.com/charts/artist-100/']

    def __init__(self, *args, **kwargs):
        # Chain to scrapy.Spider.__init__ so spider arguments and the
        # name/start_urls plumbing keep working; the original override
        # silently dropped them.
        super().__init__(*args, **kwargs)
        self.last_week_str = ""

    def parse(self, response):
        """Schedule the chart pages for this week and the previous 3 weeks."""
        # The chart date is fixed for this response — read and parse it once
        # instead of re-querying it on every loop iteration.
        string_date = response.css('#chart-date-picker::attr(data-date)').get()
        real_date = datetime.strptime(string_date, '%Y-%m-%d')
        one_week = timedelta(weeks=1)
        for i in range(4):
            self.last_week_str = (real_date - i * one_week).strftime('%Y-%m-%d')
            # An f-string is never empty, so the original `if next_page:`
            # guard was always true and has been dropped.
            next_page = f"https://www.billboard.com/charts/artist-100/{self.last_week_str}"
            yield response.follow(next_page, callback=self.week_parse)

    def week_parse(self, response):
        """Yield name/rank for the first five entries of one chart page."""
        for element in response.css('.o-chart-results-list-row-container'):
            name = element.css('#title-of-a-story::text').get()
            # Raw string: `\@` escapes '@' in CSS, but in a plain Python
            # string it is an invalid escape sequence (SyntaxWarning on 3.12+).
            number = element.css(
                r'span.c-label.a-font-primary-bold-l.u-font-size-32\@tablet.u-letter-spacing-0080\@tablet::text').get()
            clean_name = name.strip()
            clean_number = number.strip()
            # Entries arrive in rank order, so stop at the first rank > 5.
            if int(clean_number) > 5:
                break
            yield {
                'name': clean_name,
                'rank': clean_number,
                'date': response.url,
            }
Solution 1:[1]
import scrapy
import json
import datetime
from datetime import datetime
from datetime import timedelta, date
class QuestionSpider(scrapy.Spider):
    """Billboard Artist 100 spider that also appends each scraped item to
    ``billboard.json`` as one JSON object per line (JSON Lines).

    NOTE(review): writing the file by hand duplicates what Scrapy's FEEDS
    setting / item pipelines already do — consider moving this there.
    """

    name = 'billboard-weekly'
    allowed_domains = ['www.billboard.com']
    start_urls = ['https://www.billboard.com/charts/artist-100/']
    # Class attribute instead of the original `global f`: the `global`
    # declaration at class-body scope leaked the handle into module scope.
    # Opened at class-definition time; closed in `closed()` below.
    f = open("billboard.json", "w")

    def __init__(self, *args, **kwargs):
        # Chain to scrapy.Spider.__init__ so spider arguments keep working.
        super().__init__(*args, **kwargs)
        self.last_week_str = ""

    def parse(self, response):
        """Schedule the chart pages for this week and the previous 3 weeks."""
        # The chart date is fixed for this response — parse it once, not 4×.
        string_date = response.css('#chart-date-picker::attr(data-date)').get()
        real_date = datetime.strptime(string_date, '%Y-%m-%d')
        one_week = timedelta(weeks=1)
        for i in range(4):
            self.last_week_str = (real_date - i * one_week).strftime('%Y-%m-%d')
            next_page = f"https://www.billboard.com/charts/artist-100/{self.last_week_str}"
            yield response.follow(next_page, callback=self.week_parse)

    def week_parse(self, response):
        """Record name/rank for the first five entries of one chart page."""
        for element in response.css('.o-chart-results-list-row-container'):
            name = element.css('#title-of-a-story::text').get()
            # Raw string: `\@` is an invalid escape in a plain Python string
            # (SyntaxWarning on 3.12+); the backslash escapes '@' in CSS.
            number = element.css(
                r'span.c-label.a-font-primary-bold-l.u-font-size-32\@tablet.u-letter-spacing-0080\@tablet::text').get()
            clean_name = name.strip()
            clean_number = number.strip()
            # Entries arrive in rank order, so stop at the first rank > 5.
            if int(clean_number) > 5:
                break
            item = {
                "name": clean_name,
                "rank": clean_number,
                "date": response.url,
            }
            # Newline-delimited: the original concatenated JSON objects with
            # no separator, producing a file no JSON parser can read back.
            self.f.write(json.dumps(item) + "\n")
            print(item)

    def closed(self, reason):
        """Scrapy lifecycle hook — flush and release the output file."""
        self.f.close()
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | studymakesmebetter |
