Webscraping using nested loops
I'm struggling to web-scrape this search and was wondering if anyone has an idea of how I should organize the nested loop. At one point I ran into the problem that read_html() can't read multiple rows of a data frame at once. I tried to get around this with how I set up the loop, but I have been unsuccessful. (I could also use some pointers on how to collect loop output.) Thanks in advance.
library(purrr)
library(rvest)
library(data.table)
library(tidyverse)
library(quanteda)
library(quanteda.textstats)
#search results, first page
url_1 <- "https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B%5D=0&congresses%5B%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B%5D=hsif00&q={%22chamber%22:%22House%22,%22type%22:%22bills%22,%22subject%22:%22Health%22,%22house-committee%22:%22Energy+and+Commerce%22}&pageSize=250"
#search results, second page
url_2 <- "https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=2"
read_html(url_1) #quick check that the first search page can be read
#css_selector <- ".result-heading a"
#scrape all 250 bill hyperlinks on first page
urlLinks <- url_1 %>%
  read_html() %>%
  html_nodes(".result-heading a") %>%
  html_attr("href")
urlLinks <- unique(urlLinks)
as.data.frame(urlLinks) #note: this conversion is not assigned to anything, so urlLinks stays a character vector
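Before looping, a quick sanity check on the scraped links can confirm the selector worked (the expected count of roughly 250 is my inference from pageSize=250, not something verified in the post):
length(urlLinks) #should be roughly 250 if the ".result-heading a" selector matched
head(urlLinks, 3) #relative hrefs; the full URLs are built below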
#pull text from the first bill hyperlink
first_link <- urlLinks[1]
first_link <- gsub("\\?.*", "", first_link) #remove the query string (everything from "?")
first_link <- paste0("https://www.congress.gov", first_link, "/text") #add /text to build the bill-text URL
#Get text from the first bill hyperlink
get_text <- read_html(first_link) %>%
  html_nodes(".generated-html-container") %>%
  html_text(trim = TRUE)
get_text
Sys.sleep(5)
#loop above for all 250 bill hyperlinks on one page
billTexts <- c()
for (i in 1:length(urlLinks)) {
  rest_of_links <- urlLinks[i]
  rest_of_links <- gsub("\\?.*", "", rest_of_links)
  rest_of_links <- paste0("https://www.congress.gov", rest_of_links, "/text")
  billText <- read_html(rest_of_links) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
  billTexts <- c(billTexts, billText)
}
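As an aside, the same single-page loop can collect its results into a data frame and pause between requests. The following is only a sketch, assuming each bill-text page has at most one .generated-html-container block (bills with no posted text get NA); billTable is my own name:
#sketch: collect link/text pairs into a data frame instead of a bare vector
billTable <- data.frame(url = character(0), text = character(0), stringsAsFactors = FALSE)
for (i in seq_along(urlLinks)) {
  link <- gsub("\\?.*", "", urlLinks[i])
  link <- paste0("https://www.congress.gov", link, "/text")
  txt <- read_html(link) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
  if (length(txt) == 0) txt <- NA_character_ #keep one row per link even if no text is posted
  billTable <- rbind(billTable, data.frame(url = link, text = txt[1], stringsAsFactors = FALSE))
  Sys.sleep(5) #pause between requests, as in the single-link example above
}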
#Loop for each page (34)
final <- c() #final table for the loop output
output <- c() #inner loop output
pageNumber <- c(2:34)
#urls for search pages
urls <- url_1
for (i in 1:length(pageNumber)) {
  urls <- c(urls, paste0("https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=", pageNumber[i]))
  #read the 250 hyperlinks on each of the 34 pages
  urlLinks <- urls[i] %>%
    read_html() %>%
    html_nodes(".result-heading a") %>%
    html_attr("href")
  urlLinks <- unique(urlLinks)
  billTexts <- c()
  #loop for pulling bill text for each of the 250 hyperlinks
  for (j in 1:length(urlLinks)) {
    rest_of_links <- urlLinks[j]
    rest_of_links <- gsub("\\?.*", "", rest_of_links)
    rest_of_links <- paste0("https://www.congress.gov", rest_of_links, "/text")
    billText <- read_html(rest_of_links) %>%
      html_nodes(".generated-html-container") %>%
      html_text(trim = TRUE)
    billTexts <- c(billTexts, billText)
    #taking bill output and putting in table
    output <- c(output, billTexts)
  }
  #taking inner loop output and combining with outer loop output and putting it in table
  final <- c(output, urlLinks)
  #return the final dataset here
}
I was expecting to end up with a data frame of bill texts: one text per hyperlink, 250 hyperlinks per page, across all 34 pages of the search.
Solution 1:[1]
We can split the nested loop into simple loops using lapply().
First, generate the search URLs for all 34 pages:
urls <- c(url_1, paste0("https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=", 2:34))
Second, get the bill links from each of the 34 pages:
df <- lapply(urls, function(x) {
  urlLinks <- x %>%
    read_html() %>%
    html_nodes(".result-heading a") %>%
    html_attr("href")
  urlLinks <- unique(urlLinks)
  bill_links <- gsub("\\?.*", "", urlLinks) #drop the query string from each href
  paste0("https://www.congress.gov", bill_links, "/text") #returned value: full bill-text URLs for this page
})
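If any of the 34 requests fails, the whole lapply() call errors out. Since purrr is already loaded, one optional hardening (my own addition, not part of the original answer; safe_read is my own name) is to wrap read_html() in possibly() so a failed page simply yields no links:
#sketch: fault-tolerant variant of the page-link step
safe_read <- possibly(read_html, otherwise = NULL)
df <- lapply(urls, function(x) {
  page <- safe_read(x)
  if (is.null(page)) return(character(0)) #skip pages that could not be fetched
  links <- page %>%
    html_nodes(".result-heading a") %>%
    html_attr("href") %>%
    unique()
  paste0("https://www.congress.gov", gsub("\\?.*", "", links), "/text")
})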
Third, get the text from each of those links:
text <- lapply(df, function(page_links) lapply(page_links, function(x) {
  read_html(x) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
}))
We now have the text from all 34 pages stored in a nested list: one element per page, and within each page one element per bill link.
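To get from that nested list to the data frame the question asked for (one row per bill, with its page number, URL, and text), the two lists can be flattened together. A sketch, assuming each bill contributes at most one text block (missing texts become NA); bill_df is my own name:
#sketch: one row per bill link, using the df and text objects built above
bill_df <- tibble(
  page = rep(seq_along(df), lengths(df)),
  url  = unlist(df),
  text = unlist(lapply(text, function(p)
    vapply(p, function(t) if (length(t) == 0) NA_character_ else t[1], character(1))
  ))
)
From there, the text column could be passed to quanteda, e.g. corpus(bill_df, text_field = "text"), which the loaded packages suggest is the next step.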
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
[1] Source: Stack Overflow
