'can i make my scraping (pandas read html) script faster?
i have a very simple script, it just scrapes some tables off the internet and inserts it into a db. however: tickerlist contains about 8000 rows. and the script takes about 2 hours to complete (80ms per ticker). is there anyway to speed it up?
this is my code:
import pandas as pd import weslib as wl from datetime import datetime
wl.joblog('AQstats', '', '','Starting AQ scrape stats')
con = wl.condb()
tickerlist = wl.tickerlist(['NASDAQ','NYSE'])
#tickerlist = ['AAPL', 'MSFT', 'T','INTC']
scrapedate = datetime.now()
for n, stock in enumerate(tickerlist):
wl.joblog('AQstats', stock, '','Starting: ' + str(n+1) + '/' + str(len(tickerlist)) + ' ' + str(stock))
try:
url = 'https://www.alphaquery.com/stock/' + str(stock) + '/all-data-variables/'
df = pd.read_html(url)[0]
df['numvalue'] = wl.convertnumbers(df[1])
df['scrapedate'] = scrapedate
df['symbol'] = stock
df.columns=['Label','orgvalue', 'numvalue','scrapedate','symbol']
df.reset_index(inplace=True)
except (Exception) as error:
wl.joblog('AQstats', stock, '','error: scraping \n' + str(error) )
continue
try:
wl.insert_values(con,df,'AQstats',stock,'AQstats')
except (Exception) as error:
wl.joblog('AQstats', stock, '','error: scraping \n' + str(error) )
continue
con.close()
wl.joblog('AQstats', '', '','Ending AQ scrape stats')
and these are the two relevant functions from my own lib.
import psycopg2 as db
import psycopg2.extras as extras
def insert_values(conn, df, table, ticker, call):
""" Using psycopg2.extras.execute_values() to insert the dataframe """
df.replace('None',0,inplace=True)
# Create a list of tupples from the dataframe values
tuples = [tuple(x) for x in df.to_numpy()]
# Comma-separated dataframe columns
cols = ','.join(list(df.columns))
# SQL quert to execute
query = "INSERT INTO %s(%s) VALUES %%s on conflict do nothing" % (table, cols)
cursor = conn.cursor()
try:
extras.execute_values(cursor, query, tuples)
conn.commit()
except (Exception, db.DatabaseError) as error:
joblog(call, ticker, table, "db error: %s" % error)
print("DB Error: %s" % error)
conn.rollback()
cursor.close()
return 1
joblog(call, ticker, table,"insert values done for: %s %s", call, ticker)
cursor.close()
def joblog(job, ticker, table, note):
print('JOBLOG', job, ticker, table, note)
conn = condb()
query = "insert into log (logdate, logtable, logjob, logticker, note) values ( current_timestamp, %s, %s, %s, %s ) on conflict do nothing"
cursor = conn.cursor()
try:
cursor.execute(query, (table, job, ticker, note))
except (Exception, db.DatabaseError) as error:
print(query)
dblog(job, ticker, 'log' ,"DB Error: %s" % error)
conn.rollback()
cursor.close()
return 1
conn.commit()
cursor.close()
conn.close()
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|