How do I make pairs from lists with different numbers of elements?
I would like to pair a + d, c + e, and b + x, but I don't know how to do it.
I thought I could do it using zip_longest, but it did not work.
combine_list = zip_longest(list1, list2)
I am scraping a BBS and getting information across multiple pages.
Thread title : Shop A
Thread URL : https://thread1, thread2
Page1 comment : blah blah blah
Page2 comment : blah blah blah
Thread title : Shop B
Thread URL : https://thread1
Page1 comment: blah blah blah
Thread title : Shop C
Thread URL : https://thread1, thread2
Page1 comment : blah blah blah
Page2 comment : blah blah blah
list1 = [['ShopA', 'thread_url', 'Page1 comment'], ['ShopB', 'thread_url', 'Page1 comment'], ['ShopC', 'thread_url', 'Page1 comment']]
list2 = [['ShopA', 'thread_url', 'Page2 comment'], ['ShopC', 'thread_url', 'Page1 comment']]
I have a situation like this, and I want to combine the Page1 and Page2 comments of Shop A into one comment. Shop B has only one page, so its comment should stay as-is. For Shop C, I likewise want to combine the Page1 and Page2 comments.
What is the best way to do this?
Waiting for help.
Code
import requests
from bs4 import BeautifulSoup
import pymongo
import re
import time
import itertools
def browse_header():
    """Return the HTTP request headers (desktop Firefox User-Agent) used for scraping."""
    user_agent = (
        "Mozilla/5.0 (X11; Mac OS X x86_64; rv:57.0) "
        "Gecko/20100101 Firefox/57.0"
    )
    return {"User-Agent": user_agent}
def domain():
    """Return the root URL of the BBS being scraped."""
    return 'https://bakusai.com'
def bbs_url_out():
    """Read thread URLs from ./thread_url.csv (one per line, stripped)."""
    with open('./thread_url.csv', mode='r', encoding='utf-8') as f:
        return [line.strip() for line in f]
def thread_requests_parse():
    """Return the subset of URLs from thread_url.csv that respond successfully.

    Each URL is requested once, 1 second apart (politeness delay). URLs whose
    request raises, or which return an HTTP error status, are skipped.

    :return: list of reachable thread URLs.
    """
    headers = browse_header()
    url_list = bbs_url_out()
    thread_url_lists = []
    for thread_url in url_list:
        time.sleep(1)
        try:
            r = requests.get(thread_url, headers=headers)
            # Bug fix: the response was previously fetched and discarded, so
            # 404/500 pages were still treated as reachable threads. Raise on
            # HTTP error status so such URLs are filtered out here.
            r.raise_for_status()
            thread_url_lists.append(thread_url)
        except Exception as ex:
            print('Except:', ex)
    return thread_url_lists
def thread_article_parse(domain):
    """Scrape page 1 of every thread, then (via nested helpers) page 2, and
    pair the two result lists positionally.

    :param domain: site root (e.g. 'https://bakusai.com') used to build
        absolute next-page URLs inside ``next_page_url_parse``.
    :return: ``list(itertools.zip_longest(page1_results, page2_results))`` —
        a list of 2-tuples; the missing partner is None when a thread has no
        next page. Each element is ``[title, url, [formatted post strings]]``.
    """
    thread_url_lists = thread_requests_parse()
    headers = browse_header()
    domain = domain  # no-op self-assignment, kept as-is
    shop_info_list_1 = []  # page-1 result: one [title, url, posts] per thread
    for thread_url in thread_url_lists:
        try:
            r = requests.get(thread_url, headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            htmls = soup.find_all('div', {'class': 'article'})  # one div per post
            title1 = soup.find('div', {'id': 'title_thr'}).text  # thread title
            shop_texts_1 = []
            for html in htmls:
                post = html.get_text()
                # Split each post at its first HH:MM timestamp: posts[0] is the
                # text before it, posts[1] the body after it.
                time_pat = r'\d\d:\d\d'
                posts = re.split(time_pat, post)
                time_post = re.search(time_pat, post)
                try:
                    # Re-assemble header + timestamp + enlarged body as HTML.
                    seikei1_1 = str(posts[0]) + time_post.group() + '\n' + '<br><br><span style="font-size: 200%;"><b>' + str(posts[1]) + '\n' + '</span></b><br>[匿名さん]<br><br>'
                    # Strip the "最新レス" (latest-post) marker and the
                    # "[匿名さん]" (anonymous) labels from the scraped text.
                    seikei1_2 = re.sub('最新レス', '', seikei1_1)
                    seikei1_3 = seikei1_2.replace('[匿名さん]', '')
                    # print(seikei1_3)
                    shop_texts_1.append(seikei1_3)
                except Exception as ex:
                    # Raised when the post has no timestamp (time_post is None)
                    # or no text after it (posts[1] missing); skip that post.
                    print('Except:', ex)
                    pass
            thread_info_list = list([title1, thread_url, shop_texts_1])
            shop_info_list_1.append(thread_info_list)
            print(shop_info_list_1)
        except Exception as ex:
            print('Except:', ex)
            pass

    def next_page_url_parse():
        """Return the absolute 'next page' link of every thread.

        NOTE(review): relies on ``headers`` and ``domain`` from the enclosing
        scope — this function must stay nested inside thread_article_parse.
        """
        thread_url_lists = thread_requests_parse()
        next_url_lists = []
        for thread_url in thread_url_lists:
            try:
                r = requests.get(thread_url, headers=headers)
                soup = BeautifulSoup(r.text, 'html.parser')
                nexts = soup.find('div', {'class': 'paging'}).find('span', {'class': 'paging_nextlink'}).find('a')
                b = nexts.get('href')  # relative link; prefixed with domain below
                next_url = domain + b
                print('[thread_parse] : next_url', f'{next_url}')
                next_url_lists.append(next_url)
            except Exception as ex:
                # Threads without a next-page link raise AttributeError on
                # the chained .find() calls; such threads are simply skipped.
                print('Except:', ex)
                pass
        return next_url_lists

    def next_page_thread_parse():
        """Parse posts on each thread's next page; mirrors the page-1 loop above."""
        next_url_lists = next_page_url_parse()
        shop_info_list_next = []
        for next_url in next_url_lists:
            r2 = requests.get(next_url, headers=headers)
            soup2 = BeautifulSoup(r2.text, 'html.parser')
            html2s = soup2.find_all('div', {'class': 'article'})
            title2 = soup2.find('div', {'id': 'title_thr'}).text
            shop_texts2 = []
            for html2 in html2s:
                post2 = html2.get_text()
                time_pat = r'\d\d:\d\d'
                posts2 = re.split(time_pat, post2)
                time_post = re.search(time_pat, post2)
                try:
                    seikei2_1 = str(posts2[0]) + time_post.group() + '\n' + '<br><br><span style="font-size: 200%;"><b>' + str(posts2[1]) + '\n' + '</span></b><br>[匿名さん]<br><br>'
                    seikei2_2 = re.sub('最新レス', '', seikei2_1)
                    seikei2_3 = seikei2_2.replace('[匿名さん]', '')
                    # print('[thread_parse]', seikei2_3)
                    shop_texts2.append(seikei2_3)
                except Exception as ex:
                    print('Except:', ex)
                    pass
            thread_info_list_next = list([title2, next_url, shop_texts2])
            shop_info_list_next.append(thread_info_list_next)
            print(shop_info_list_next)
            # write_text2 = '\n'.join(texts2)
            # print(write_text2)
            # with open('./thread.txt', mode='a', encoding='utf-8') as f:
            #     f.write('\n\n' + write_text2 + '\n\n')
        return shop_info_list_next

    shop_info_list_next = next_page_thread_parse()
    # NOTE(review): zip_longest pairs the two lists positionally, which only
    # matches a thread with its own page 2 when both lists are in the same
    # order — this is the mismatch the question is about.
    shop_info_list_matome = list(itertools.zip_longest(shop_info_list_1, shop_info_list_next))
    # thread_info_list = list([title1, thread_url, shop_texts_1, shop_texts2])
    # print(thread_info_list)
    return shop_info_list_matome
def text_mix(shop_info_list_matome):
    """Print every element of each paired entry together with its type."""
    for pair in shop_info_list_matome:
        for item in pair:
            print(item, type(item))
if __name__ == '__main__':
    # Fix: browse_header() and bbs_url_out() were called here with their
    # results discarded — pure redundancy, since thread_article_parse (via
    # thread_requests_parse) calls both internally.
    domain = domain()  # rebinds the module name to the site-root string
    shop_info_list_matome = thread_article_parse(domain)
    text_mix(shop_info_list_matome)
Solution 1:[1]
What I think you want to do is build a dict keyed on the shop name (the first element of each sublist) so that you can combine all the info from different pages for a given shop. Here's some example code:
# Merge per-shop info from both pages into one record, keyed on shop name.
page1 = [['ShopA', 'thread_url', 'Page1 comment'], ['ShopB', 'thread_url', 'Page1 comment'], ['ShopC', 'thread_url', 'Page1 comment']]
page2 = [['ShopA', 'thread_url', 'Page2 comment'], ['ShopC', 'thread_url', 'Page1 comment']]

shops = {}
for listing in (page1, page2):
    for name, url, text in listing:
        if name not in shops:
            shops[name] = []
        shops[name].extend([url, text])

combined = [[name, *info] for name, info in shops.items()]
print("\n".join(str(row) for row in combined))
# ['ShopA', 'thread_url', 'Page1 comment', 'thread_url', 'Page2 comment']
# ['ShopB', 'thread_url', 'Page1 comment']
# ['ShopC', 'thread_url', 'Page1 comment', 'thread_url', 'Page1 comment']
You'll probably need to modify this code for your purposes, since you didn't give an example of exactly how you wanted to combine the lists, but hopefully this is enough to point you in the right direction.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Samwise |
