How do I pair up two lists with different numbers of elements?

I would like to pair a + d, c + e, and b + x, but I don't know how to do it.

I thought I could do it using zip_longest, but it did not work.

combine_list = zip_longest(list1, list2)
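
To illustrate why that fails: zip_longest pairs elements purely by position and pads the shorter list with None, so as soon as one shop is missing from list2, every later pair drifts out of alignment:

from itertools import zip_longest

list1 = [['ShopA', 'url', 'Page1'], ['ShopB', 'url', 'Page1'], ['ShopC', 'url', 'Page1']]
list2 = [['ShopA', 'url', 'Page2'], ['ShopC', 'url', 'Page1']]

for a, b in zip_longest(list1, list2):
    print(a and a[0], b and b[0])
# ShopA ShopA   <- aligned only by coincidence
# ShopB ShopC   <- ShopB is paired with ShopC
# ShopC None    <- ShopC is paired with nothing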

I am scraping a BBS and getting information across multiple pages.


Thread title : Shop A

Thread URL : https://thread1, thread2

Page1 comment : blah blah blah

Page2 comment : blah blah blah


Thread title : Shop B

Thread URL : https://thread1

Page1 comment : blah blah blah


Thread title : Shop C

Thread URL : https://thread1, thread2

Page1 comment : blah blah blah

Page2 comment : blah blah blah


list1 = [['ShopA', 'thread_url', 'Page1 comment'], ['ShopB', 'thread_url', 'Page1 comment'], ['ShopC', 'thread_url', 'Page1 comment']]

list2 = [['ShopA', 'thread_url', 'Page2 comment'], ['ShopC', 'thread_url', 'Page1 comment']]

I have a situation like this: I want to combine the Page1 and Page2 comments of Shop A into one comment, keep Shop B's single comment as it is, and likewise combine the Page1 and Page2 comments of Shop C.

What is the best way to do this?

Any help would be appreciated.

Code

import requests
from bs4 import BeautifulSoup
import pymongo
import re
import time
import itertools



def browse_header():
    headers = {"User-Agent": "Mozilla/5.0 (X11; Mac OS X x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"}

    return headers


def domain():
    domain = 'https://bakusai.com'

    return domain


def bbs_url_out():

    thread_list = []

    with open('./thread_url.csv', mode='r', encoding='utf-8') as f:
        for urls in f:
            thread_list.append(urls.strip())

    return thread_list


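# Probe every URL from the CSV once and keep the ones that respond;
# note that a non-200 response is still kept, since only exceptions are filtered.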
def thread_requests_parse():
    headers = browse_header()
    url_list = bbs_url_out()

    thread_url_lists = []

    for thread_url in url_list:
        time.sleep(1)
        try:
            r = requests.get(thread_url, headers=headers)
            thread_url_lists.append(thread_url)

        except Exception as ex:
            print('Except:', ex)
            pass

    return thread_url_lists


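# Scrape page 1 of every thread, then (via the nested helpers below) page 2,
# and pair the two result lists positionally with zip_longest.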
def thread_article_parse(domain):

    thread_url_lists = thread_requests_parse()
    headers = browse_header()
    shop_info_list_1 = []


    for thread_url in thread_url_lists:
        try:
            r = requests.get(thread_url, headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            htmls = soup.find_all('div', {'class': 'article'})
            title1 = soup.find('div', {'id': 'title_thr'}).text


            shop_texts_1 = []
            for html in htmls:
                post = html.get_text()
                time_pat = r'\d\d:\d\d'
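                # Split the post at its first HH:MM timestamp: posts[0] is the
                # header before the time, posts[1] is the comment body after it.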
                posts = re.split(time_pat, post)
                time_post = re.search(time_pat, post)
                try:
                    seikei1_1 = str(posts[0]) + time_post.group() + '\n' + '<br><br><span style="font-size: 200%;"><b>' + str(posts[1]) + '\n' + '</span></b><br>[匿名さん]<br><br>'
                    seikei1_2 = re.sub('最新レス', '', seikei1_1)
                    seikei1_3 = seikei1_2.replace('[匿名さん]', '')
                    # print(seikei1_3)
                    shop_texts_1.append(seikei1_3)

                except Exception as ex:
                    print('Except:', ex)
                    pass

            thread_info_list = list([title1, thread_url, shop_texts_1])
            shop_info_list_1.append(thread_info_list)
            print(shop_info_list_1)

        except Exception as ex:
            print('Except:', ex)
            pass

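    # Collect the 'next page' link of each thread, if one exists.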
    def next_page_url_parse():
        thread_url_lists = thread_requests_parse()
        next_url_lists = []

        for thread_url in thread_url_lists:
            try:
                r = requests.get(thread_url, headers=headers)
                soup = BeautifulSoup(r.text, 'html.parser')

                nexts = soup.find('div', {'class': 'paging'}).find('span', {'class': 'paging_nextlink'}).find('a')
                b = nexts.get('href')
                next_url = domain + b
                print('[thread_parse] : next_url', f'{next_url}')
                next_url_lists.append(next_url)

            except Exception as ex:
                print('Except:', ex)
                pass

        return next_url_lists


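    # Parse the articles on each page-2 URL the same way as page 1.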
    def next_page_thread_parse():
        next_url_lists = next_page_url_parse()
        shop_info_list_next = []

        for next_url in next_url_lists:
            r2 = requests.get(next_url, headers=headers)
            soup2 = BeautifulSoup(r2.text, 'html.parser')
            html2s = soup2.find_all('div', {'class': 'article'})
            title2 = soup2.find('div', {'id': 'title_thr'}).text


            shop_texts2 = []
            for html2 in html2s:
                post2 = html2.get_text()
                time_pat = r'\d\d:\d\d'
                posts2 = re.split(time_pat, post2)
                time_post = re.search(time_pat, post2)

                try:
                    seikei2_1 = str(posts2[0]) + time_post.group() + '\n' + '<br><br><span style="font-size: 200%;"><b>' + str(posts2[1]) + '\n' + '</span></b><br>[匿名さん]<br><br>'
                    seikei2_2 = re.sub('最新レス', '', seikei2_1)
                    seikei2_3 = seikei2_2.replace('[匿名さん]', '')
                    # print('[thread_parse]', seikei2_3)
                    shop_texts2.append(seikei2_3)
                except Exception as ex:
                    print('Except:', ex)
                    pass

            thread_info_list_next = list([title2, next_url, shop_texts2])
            shop_info_list_next.append(thread_info_list_next)
            print(shop_info_list_next)

            # write_text2 = '\n'.join(texts2)
            # print(write_text2)
        # with open('./thread.txt', mode='a', encoding='utf-8') as f:
        #     f.write('\n\n' + write_text2 + '\n\n')

        return shop_info_list_next

    shop_info_list_next = next_page_thread_parse()


    shop_info_list_matome = list(itertools.zip_longest(shop_info_list_1, shop_info_list_next))
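    # NOTE: zip_longest pairs purely by position, so once any thread lacks a
    # page 2, later pairs are misaligned; see Solution 1 for a key-based merge.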


    # thread_info_list = list([title1, thread_url, shop_texts_1, shop_texts2])
    # print(thread_info_list)
    return shop_info_list_matome

def text_mix(shop_info_list_matome):

    for shop_info_list in shop_info_list_matome:
        for texts in shop_info_list:
            print(texts, type(texts))


if __name__ == '__main__':
    domain = domain()
    shop_info_list_matome = thread_article_parse(domain)
    text_mix(shop_info_list_matome)


Solution 1:[1]

What I think you want to do is build a dict keyed on the shop name (the first element of each sublist) so that you can combine all the info from different pages for a given shop. Here's some example code:

page1 = [['ShopA', 'thread_url', 'Page1 comment'], ['ShopB', 'thread_url', 'Page1 comment'], ['ShopC', 'thread_url', 'Page1 comment']]
page2 = [['ShopA', 'thread_url', 'Page2 comment'], ['ShopC', 'thread_url', 'Page1 comment']]

shops = {}
for page in (page1, page2):
    for shop, thread_url, comment in page:
        shops.setdefault(shop, []).extend([thread_url, comment])

combined = [[k] + v for k, v in shops.items()]
print("\n".join(map(str, combined)))
# ['ShopA', 'thread_url', 'Page1 comment', 'thread_url', 'Page2 comment']
# ['ShopB', 'thread_url', 'Page1 comment']
# ['ShopC', 'thread_url', 'Page1 comment', 'thread_url', 'Page1 comment']

You'll probably need to modify this code for your purposes, since you didn't give an example of exactly how you wanted to combine the lists, but hopefully this is enough to point you in the right direction.
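
To adapt the same idea to the scraper above, here is a minimal sketch, assuming the thread title string is identical on page 1 and page 2 of the same thread (otherwise the dict keys won't match) and that each entry has the [title, url, comment_list] shape that thread_article_parse builds. It could replace the zip_longest call:

merged = {}
for info_list in (shop_info_list_1, shop_info_list_next):
    for title, url, texts in info_list:
        entry = merged.setdefault(title, {'urls': [], 'texts': []})
        entry['urls'].append(url)      # keep every page URL of the thread
        entry['texts'].extend(texts)   # concatenate comments across pages

# One [title, urls, combined_comment] record per shop, however many pages it has
shop_info_list_matome = [[title, ', '.join(e['urls']), '\n'.join(e['texts'])]
                         for title, e in merged.items()]

This keeps Shop B with only its page-1 comment and merges the pages of Shop A and Shop C automatically, with no positional alignment to go wrong.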

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution 1: Samwise