Python request `get` method raising a TypeError about a missing positional argument
I'm using snscrape to scrape Facebook, but I'm getting a TypeError caused by the `get` method, saying it is missing a required positional argument named `url`. In the module, the URL is passed as `baseUrl`, but it isn't working for some reason. Please help me find out where the problem actually is.
script
import snscrape.modules.facebook as snfacebook

# NOTE(review): _FacebookUserAndCommunityScraper is a private class (leading
# underscore) — its interface may change without notice; prefer the library's
# public scraper class if one is available.
for post in snfacebook._FacebookUserAndCommunityScraper('Houston Mhlongo').get_items():
    print(post)
TypeError Traceback (most recent call last)
<ipython-input-18-127eb1633a58> in <module>
----> 1 for post in snfacebook._FacebookUserAndCommunityScraper('Houston Mhlongo').get_items():
2 print(post)
~\anaconda3\lib\site-packages\snscrape\modules\facebook.py in get_items(self)
181 nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
182 spuriousForLoopPattern = re.compile(r'^for \(;;\);')
--> 183
184 r, soup = self._initial_page()
185 if r.status_code == 404:
~\anaconda3\lib\site-packages\snscrape\modules\facebook.py in _initial_page(self)
170 def _initial_page(self):
171 if self._initialPage is None:
--> 172 _logger.info('Retrieving initial data')
173 r = self._get(self._baseUrl,self._username, headers = self._headers)
174 if r.status_code not in (200, 404):
~\anaconda3\lib\site-packages\snscrape\base.py in _get(self, *args, **kwargs)
214
215 def _get(self, *args, **kwargs):
--> 216 return self._request('GET', *args, **kwargs)
217
218 def _post(self, *args, **kwargs):
TypeError: _request() missing 1 required positional argument: 'url'
The relevant class from snscrape's `facebook` module (as modified locally):
class _FacebookUserAndCommunityScraper(_FacebookCommonScraper):
	"""Scrapes posts from a Facebook user or community page.

	Fetches the profile's landing page, yields its items, then follows the
	"see more" AJAX pagination links until none remain.
	"""

	def __init__(self, username, **kwargs):
		super().__init__(**kwargs)
		self._username = username
		# Desktop UA + English locale so Facebook serves the markup the
		# parsing code below expects.
		self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:78.0) Gecko/20100101 Firefox/78.0', 'Accept-Language': 'en-US,en;q=0.5'}
		# BUG FIX: was 'https://www.facebook/com/...' — a slash where the
		# TLD dot belongs, i.e. an invalid hostname. Requests built from it
		# could never reach Facebook.
		self._baseUrl = f'https://www.facebook.com/{self._username}'
		# Lazily-populated cache for the initial page fetch (see _initial_page).
		self._initialPage = None
		self._initialPageSoup = None

	def _initial_page(self):
		"""Fetch the profile's landing page once and cache it.

		Returns:
			(response, soup): the HTTP response and its parsed BeautifulSoup.

		Raises:
			snscrape.base.ScraperException: on any status other than 200/404.
			(404 is let through so get_items can report a missing user.)
		"""
		if self._initialPage is None:
			_logger.info('Retrieving initial data')
			r = self._get(self._baseUrl, headers=self._headers)
			if r.status_code not in (200, 404):
				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
			self._initialPage = r
			self._initialPageSoup = bs4.BeautifulSoup(r.text, 'lxml')
		return self._initialPage, self._initialPageSoup

	def get_items(self):
		"""Yield posts from the page, following pagination until exhausted."""
		nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
		# Facebook prefixes JSON responses with "for (;;);" as an anti-JSON-
		# hijacking measure; it must be stripped before parsing.
		spuriousForLoopPattern = re.compile(r'^for \(;;\);')

		r, soup = self._initial_page()
		if r.status_code == 404:
			_logger.warning('User does not exist')
			return
		yield from self._soup_to_items(soup, self._baseUrl, 'user')
		while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)):
			_logger.info('Retrieving next page')
			# The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
			# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
			r = self._get(urllib.parse.urljoin(self._baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers=self._headers)
			if r.status_code != 200:
				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
			response = json.loads(spuriousForLoopPattern.sub('', r.text))
			# Sanity-check the expected "replace this DOM node" operation shape
			# before trusting the embedded HTML payload.
			assert 'domops' in response
			assert len(response['domops']) == 1
			assert len(response['domops'][0]) == 4
			assert response['domops'][0][0] == 'replace', f'{response["domops"][0]} is not "replace"'
			assert response['domops'][0][1] in ('#www_pages_reaction_see_more_unitwww_pages_home', '#www_pages_reaction_see_more_unitwww_pages_community_tab')
			assert response['domops'][0][2] == False
			assert '__html' in response['domops'][0][3]
			soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml')
			yield from self._soup_to_items(soup, self._baseUrl, 'user')

	@classmethod
	def cli_setup_parser(cls, subparser):
		"""Register the positional `username` CLI argument."""
		subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A Facebook username or user ID')

	@classmethod
	def cli_from_args(cls, args):
		"""Construct a scraper instance from parsed CLI arguments."""
		return cls.cli_construct(args, args.username)
Solution 1:[1]
Looks to me like you are trying to use one of their private classes that you modified to include the _baseUrl attribute. Looking at the github, the private class does not have that attribute.
Are you installing your modified copy of the library before you import and use it? My best guess is that your modifications are not being applied, so `self._baseUrl` is defaulting to `None`.
Why not use their provided class instead. https://github.com/JustAnotherArchivist/snscrape/blob/3a92b5bf0d93142e75b64cfb3828d69143bd106c/snscrape/modules/facebook.py#L216
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | myz540 |
