How to improve my web scraping code by using multithreading?

This is my code. It scrapes the site page by page and writes the extracted data to a CSV file that I open in Excel. It gets the next page's URL by extracting the anchor tag from the pagination of the current page.

Currently it is slow; can someone please help me make it faster by using multithreading or anything else?

import requests
from urllib3.exceptions import InsecureRequestWarning
import csv

from bs4 import BeautifulSoup as bs

# The site is fetched with verify=False, so silence the TLS warning.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

csv_file = open('GEM.csv', 'w', newline='')
f = csv.writer(csv_file)
f.writerow(['Bidnumber', 'Items', 'Quantity', 'Department', 'Enddate', 'pageNumber'])


def scrap_bid_data():
    page_no = 1
    url = 'https://bidplus.gem.gov.in/bidlists?bidlists'  # page 1
    while page_no <= 532:
        if page_no == 2:
            # Page 2 uses a hardcoded token; every later page takes its token
            # from the "next" link of the page just scraped.
            url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + 'AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI'

        print('Fetching page ' + str(page_no) + ': ' + url)
        scraped_data = requests.get(url, verify=False)
        soup_data = bs(scraped_data.text, 'lxml')

        # Stop when the results container is missing or empty.
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})
        if extracted_data is None or len(extracted_data) == 0:
            break

        # Every second child of the container is one bid block.
        for idx in range(len(extracted_data)):
            if idx % 2 == 1:
                bid_data = extracted_data.contents[idx].text.strip().split('\n')
                # Guard the length before indexing up to bid_data[21].
                if len(bid_data) > 21:
                    if len(bid_data[8]) > 1 and len(bid_data[10].split(':')) > 1:
                        bidno = bid_data[0].split(':')[-1]
                        items = bid_data[9].strip().split('Items:')[-1]
                        qnty = int(bid_data[10].split(':')[1].strip())
                        dept = (bid_data[11] + bid_data[16].strip()).split(':')[-1]
                        edate = bid_data[21].split('End Date:')[-1]
                        f.writerow([bidno, items, qnty, dept, edate, page_no])

        # The "next" link of the current page holds the token for the next URL.
        nextlink = soup_data.find('a', {'rel': 'next'})
        if nextlink is None:
            break
        nxt = nextlink['href'].split('=')[1]
        url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + nxt
        page_no += 1


scrap_bid_data()
csv_file.close()
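
For reference, here is a minimal sketch of how threads could help in this situation. Because each page's URL is only discoverable from the previous page's "next" link, the 532 downloads form a chain and cannot simply be fanned out across a thread pool; what can be taken off the main thread is the slow BeautifulSoup parsing, so the next download starts while earlier pages are still being parsed. The parse_bids helper, the regex used to pull the next-page token out of the raw HTML, and the worker count of 4 are illustrative assumptions rather than anything from the original code, and a requests.Session is used so the TLS connection is reused between pages.

import csv
import concurrent.futures
import re

import requests
from urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup as bs

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

BASE = 'https://bidplus.gem.gov.in/bidlists?bidlists'
# Assumed markup: <a rel="next" ... href="...page_no=TOKEN">; adjust to the real HTML.
NEXT_RE = re.compile(r'rel="next"[^>]*href="[^"]*page_no=([^"&]+)"')


def parse_bids(page_no, html):
    # Runs in a worker thread: extract the bid rows from one page's HTML.
    rows = []
    container = bs(html, 'lxml').find('div', {'id': 'pagi_content'})
    if container is None:
        return rows
    for idx in range(len(container)):
        if idx % 2 == 1:
            bid = container.contents[idx].text.strip().split('\n')
            if len(bid) > 21 and len(bid[8]) > 1 and len(bid[10].split(':')) > 1:
                rows.append([bid[0].split(':')[-1],
                             bid[9].strip().split('Items:')[-1],
                             int(bid[10].split(':')[1].strip()),
                             (bid[11] + bid[16].strip()).split(':')[-1],
                             bid[21].split('End Date:')[-1],
                             page_no])
    return rows


def scrape(max_pages=532):
    session = requests.Session()   # keep-alive: one TLS handshake for all pages
    session.verify = False
    futures = []
    url = BASE
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        for page_no in range(1, max_pages + 1):
            html = session.get(url).text
            # Hand the slow parsing to a worker so the next download
            # can start immediately.
            futures.append(pool.submit(parse_bids, page_no, html))
            # A cheap regex is enough to find the next-page token without
            # waiting for a full BeautifulSoup parse.
            m = NEXT_RE.search(html)
            if m is None:
                break
            url = BASE + '&page_no=' + m.group(1)

        # Collect results in page order and write them from one thread,
        # since csv writers are not thread-safe.
        with open('GEM.csv', 'w', newline='') as out:
            w = csv.writer(out)
            w.writerow(['Bidnumber', 'Items', 'Quantity', 'Department', 'Enddate', 'pageNumber'])
            for fut in futures:
                w.writerows(fut.result())


scrape()

If profiling shows that parsing rather than the network is the bottleneck, the same structure works with concurrent.futures.ProcessPoolExecutor, since BeautifulSoup parsing is CPU-bound and mostly serialized by the GIL when run in threads.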

