Python scraping google finance
I am trying to scrape stock prices from google finance's new interface.
I am using XPath to navigate to the location of the price, but upon printing it, it always returns an empty list, i.e. a pair of square brackets [].
I have tried fixing it but nothing seems to work, and it also does this for all of the other objects on the screen.
from lxml import html
import requests
import time
def parse(ticker):
    """Fetch the Google Finance page for *ticker* and print the scraped price.

    Args:
        ticker: Stock symbol to look up, e.g. ``'AAPL'``.

    Returns:
        The list of text nodes matched by the price XPath (the same value
        that is printed).
    """
    # Let requests build and URL-encode the query string rather than
    # interpolating the ticker into the URL by hand. TLS certificate
    # verification is left at the requests default (enabled), and a timeout
    # keeps the call from hanging on an unresponsive server.
    response = requests.get(
        "http://google.com/finance",
        params={"q": ticker},
        timeout=30,
    )
    parser = html.fromstring(response.content)
    # NOTE: this is the XPath the question is about; xpath() returns a list
    # of matches, so an expression that matches nothing yields [].
    priceO = parser.xpath('//*[@id="fac-ut"]/div[1]/div[4]/div[1]/span[1]/text()')
    print(priceO)
    return priceO


parse('AAPL')
Output:
[]
[Finished in 1.2s]
Solution 1:[1]
Your XPath seems to be incorrect.
Try to replace
priceO = parser.xpath('//*[@id="fac-ut"]/div[1]/div[4]/div[1]/span[1]/text()')
with below line
# Keep the name priceO (letter O) so the existing print of priceO still works.
priceO = parser.xpath('//div[@id="price-panel"]//span')[0].text_content().strip()
output:
172.50
Solution 2:[2]
Why not do it like this?
import datetime
import pandas as pd
import numpy as np
import pylab as pl
import datetime
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from matplotlib.collections import LineCollection
from pandas_datareader import data as wb
from sklearn import cluster, covariance, manifold
# Date range passed to the data reader for the historical price download.
start = '2019-02-01'
end = '2020-02-01'

tickers = [
    'MMM',
    'ABT',
    'ABBV',
    'ABMD',
    'ACN',
    'ATVI',
]
thelen = len(tickers)

# Collect one DataFrame per ticker, each tagged with its ticker symbol so the
# rows stay identifiable after concatenation.
price_data = []
for ticker in tickers:
    prices = wb.DataReader(
        ticker, start=start, end=end, data_source='yahoo'
    )[['Open', 'Adj Close']]
    price_data.append(prices.assign(ticker=ticker)[['ticker', 'Open', 'Adj Close']])

# Stack the per-ticker frames into one long table. reset_index() returns a
# new frame instead of modifying in place, so its result must be assigned;
# it moves the index (presumably the dates - confirm against the reader's
# output) into a regular column.
names = pd.concat(price_data).reset_index()

# Uncomment to show every column when printing wide frames:
# pd.set_option('display.max_columns', 500)

# The np.float alias was removed in NumPy 1.24; the builtin float is the
# drop-in replacement. The arrays are not named open/close so the builtin
# open() is not shadowed.
open_prices = np.array([q['Open'] for q in price_data]).astype(float)
close_prices = np.array([q['Adj Close'] for q in price_data]).astype(float)

print(names)
Solution 3:[3]
| Problem (besides wrong XPath) | Solution |
|---|---|
| No user-agent in request headers. The default requests user-agent is python-requests, so Google understands that it's a script. | Check what your user-agent is. Pass a user-agent to the request headers. |
You can also try to use parsel with CSS selectors since it might be a bit easier and it translates every CSS selector query to XPath under the hood.
Code and full example in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml, json
from itertools import zip_longest
def scrape_google_finance(ticker: str):
    """Scrape title, current price and right-panel data for a ticker.

    Args:
        ticker: Quote identifier placed in the Google Finance quote URL,
            e.g. ``"GOOGL:NASDAQ"``.

    Returns:
        A dict with two keys: ``"ticker_info"`` (``"title"`` and
        ``"current_price"``) and ``"right_panel_data"`` (panel labels,
        lower-cased with spaces replaced by underscores, mapped to their
        values). A value is ``None`` when the corresponding element is not
        found in the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
    params = {
        "hl": "en"  # language
    }

    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    }

    response = requests.get(
        f"https://www.google.com/finance/quote/{ticker}",
        params=params,
        headers=headers,
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    def text_or_none(node):
        # select_one() returns None when the selector matches nothing.
        return node.text if node is not None else None

    ticker_data = {"right_panel_data": {},
                   "ticker_info": {}}

    ticker_data["ticker_info"]["title"] = text_or_none(soup.select_one(".zzDege"))
    ticker_data["ticker_info"]["current_price"] = text_or_none(
        soup.select_one(".AHmHk .fxKbKc")
    )

    right_panel_keys = soup.select(".gyFHrc .mfs7Fc")
    right_panel_values = soup.select(".gyFHrc .P6K39c")

    # zip_longest pads the shorter list with None, so both sides must be
    # checked before their .text is read.
    for key, value in zip_longest(right_panel_keys, right_panel_values):
        if key is None:
            # A value without a label cannot be stored under any key.
            continue
        key_value = key.text.lower().replace(" ", "_")
        ticker_data["right_panel_data"][key_value] = text_or_none(value)

    return ticker_data


data = scrape_google_finance(ticker="GOOGL:NASDAQ")
print(json.dumps(data, indent=2))
Output:
{
"right_panel_data": {
"previous_close": "$2,803.01",
"day_range": "$2,806.21 - $2,874.24",
"year_range": "$2,091.43 - $3,030.93",
"market_cap": "1.89T USD",
"volume": "1.68M",
"p/e_ratio": "25.49",
"dividend_yield": "-",
"primary_exchange": "NASDAQ",
"ceo": "Sundar Pichai",
"founded": "Oct 2, 2015",
"headquarters": "Mountain View, CaliforniaUnited States",
"website": "abc.xyz",
"employees": "156,500"
},
"ticker_info": {
"title": "Alphabet Inc Class A",
"current_price": "$2,859.43"
}
}
If you want to scrape more data with a line-by-line explanation, there's a Scrape Google Finance Ticker Quote Data in Python blog post of mine that also covers scraping time-series data using Nasdaq API.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Andersson |
| Solution 2 | |
| Solution 3 | |

