Python requests get method raising a TypeError about a missing positional argument

I'm using snscrape to scrape Facebook, but I'm getting a positional-argument error caused by the get method, which says it needs a positional argument named 'url'. In the module, the URL is being passed as baseUrl, but for some reason it isn't working. Please help me find out where the problem actually is.

script

import snscrape.modules.facebook as snfacebook


for post in snfacebook._FacebookUserAndCommunityScraper('Houston Mhlongo').get_items():
    print(post)
TypeError                                 Traceback (most recent call last)
<ipython-input-18-127eb1633a58> in <module>
----> 1 for post in snfacebook._FacebookUserAndCommunityScraper('Houston Mhlongo').get_items():
      2     print(post)

~\anaconda3\lib\site-packages\snscrape\modules\facebook.py in get_items(self)
    181                 nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
    182                 spuriousForLoopPattern = re.compile(r'^for \(;;\);')
--> 183 
    184                 r, soup = self._initial_page()
    185                 if r.status_code == 404:

~\anaconda3\lib\site-packages\snscrape\modules\facebook.py in _initial_page(self)
    170         def _initial_page(self):
    171                 if self._initialPage is None:
--> 172                         _logger.info('Retrieving initial data')
    173                         r = self._get(self._baseUrl,self._username, headers = self._headers)
    174                         if r.status_code not in (200, 404):

~\anaconda3\lib\site-packages\snscrape\base.py in _get(self, *args, **kwargs)
  214 
    215         def _get(self, *args, **kwargs):
--> 216                 return self._request('GET', *args, **kwargs)
    217 
    218         def _post(self, *args, **kwargs):

TypeError: _request() missing 1 required positional argument: 'url'

The Facebook scraper class in snscrape

class _FacebookUserAndCommunityScraper(_FacebookCommonScraper):
    """Scrapes posts from a Facebook user or community page.

    Fetches the public profile page at https://www.facebook.com/<username>
    once, then follows the AJAX "see more" pagination endpoint
    (/pages_reaction_units/more/) to yield further batches of posts.
    """

    def __init__(self, username, **kwargs):
        super().__init__(**kwargs)
        self._username = username
        # A real-browser User-Agent is required; Facebook serves different
        # (or no) markup to unknown clients.
        self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:78.0) Gecko/20100101 Firefox/78.0', 'Accept-Language': 'en-US,en;q=0.5'}
        # Bug fix: the original read 'https://www.facebook/com/...' — a slash
        # instead of the dot before the TLD — which yields the invalid host
        # 'www.facebook' and breaks every request built from this base URL.
        self._baseUrl = f'https://www.facebook.com/{self._username}'
        self._initialPage = None      # cached response for the profile page
        self._initialPageSoup = None  # cached BeautifulSoup of that response

    def _initial_page(self):
        """Fetch the profile page once and cache both response and soup.

        Returns:
            (response, soup): the HTTP response and its parsed document.

        Raises:
            snscrape.base.ScraperException: on any status other than 200/404.
            (404 is passed through so get_items can report a missing user.)
        """
        if self._initialPage is None:
            _logger.info('Retrieving initial data')
            r = self._get(self._baseUrl, headers = self._headers)
            if r.status_code not in (200, 404):
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            self._initialPage = r
            self._initialPageSoup = bs4.BeautifulSoup(r.text, 'lxml')
        return self._initialPage, self._initialPageSoup

    def get_items(self):
        """Yield post items from the profile page and all "next" pages."""
        nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
        # Facebook prefixes its JSON responses with 'for (;;);' as an
        # anti-JSON-hijacking measure; it must be stripped before parsing.
        spuriousForLoopPattern = re.compile(r'^for \(;;\);')

        r, soup = self._initial_page()
        if r.status_code == 404:
            _logger.warning('User does not exist')
            return
        yield from self._soup_to_items(soup, self._baseUrl, 'user')

        # Follow pagination links as long as the current page contains one.
        while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)):
            _logger.info('Retrieving next page')

            # The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
            # Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
            r = self._get(urllib.parse.urljoin(self._baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = self._headers)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            response = json.loads(spuriousForLoopPattern.sub('', r.text))
            # Sanity-check the expected DOM-operations payload shape:
            # a single ['replace', <selector>, False, {'__html': ...}] entry.
            assert 'domops' in response
            assert len(response['domops']) == 1
            assert len(response['domops'][0]) == 4
            assert response['domops'][0][0] == 'replace', f'{response["domops"][0]} is not "replace"'
            assert response['domops'][0][1] in ('#www_pages_reaction_see_more_unitwww_pages_home', '#www_pages_reaction_see_more_unitwww_pages_community_tab')
            assert response['domops'][0][2] == False
            assert '__html' in response['domops'][0][3]
            soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml')
            yield from self._soup_to_items(soup, self._baseUrl, 'user')

    @classmethod
    def cli_setup_parser(cls, subparser):
        """Register the positional 'username' argument for the CLI."""
        subparser.add_argument('username', type = snscrape.base.nonempty_string('username'), help = 'A Facebook username or user ID')

    @classmethod
    def cli_from_args(cls, args):
        """Construct the scraper from parsed CLI arguments."""
        return cls.cli_construct(args, args.username)


Solution 1:[1]

It looks to me like you are trying to use one of their private classes, which you modified to include the _baseUrl attribute. Looking at the GitHub repository, the private class does not have that attribute.

Are you installing your modified copy of the library before you import and use it? My best guess is that your modifications are not being applied, so self._baseUrl is defaulting to None.

Why not use their provided class instead. https://github.com/JustAnotherArchivist/snscrape/blob/3a92b5bf0d93142e75b64cfb3828d69143bd106c/snscrape/modules/facebook.py#L216

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 myz540