Webscraping using nested loops
I'm struggling to web-scrape this search and was wondering if anyone has an idea of how I should organize the nested loop. At one point I ran into the problem that read_html() can't read multiple rows of a data frame at once. I tried to get around this with how I set up the loop, but I have been unsuccessful. (I could also use some pointers on how to collect loop output.) Thanks in advance.
library(purrr)
library(rvest)
library(data.table)
library(tidyverse)
library(quanteda)
library(quanteda.textstats)
#search results, first page
url_1 <- "https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B%5D=0&congresses%5B%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B%5D=hsif00&q={%22chamber%22:%22House%22,%22type%22:%22bills%22,%22subject%22:%22Health%22,%22house-committee%22:%22Energy+and+Commerce%22}&pageSize=250"
#search results, second page
url_2 <- "https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=2"
read_html(url_1) #quick check that the first search page can be read
#css_selector <- ".result-heading a"
#scrape all 250 bill hyperlinks on first page
urlLinks <- url_1 %>%
  read_html() %>%
  html_nodes(".result-heading a") %>%
  html_attr("href")
urlLinks <- unique(urlLinks)
as.data.frame(urlLinks) #note: this conversion is not assigned to anything, so urlLinks stays a character vector
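Before looping, a quick sanity check on the scraped links can confirm the selector worked (the expected count of roughly 250 is my inference from pageSize=250, not something verified in the post):
length(urlLinks) #should be roughly 250 if the ".result-heading a" selector matched
head(urlLinks, 3) #relative hrefs; the full URLs are built below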
#pull text from the first bill hyperlink
first_link <- urlLinks[1]
first_link <- gsub("\\?.*", "", first_link) #remove the query string (everything from "?")
first_link <- paste0("https://www.congress.gov", first_link, "/text") #add /text to build the bill-text URL
#Get text from the first bill hyperlink
get_text <- read_html(first_link) %>%
  html_nodes(".generated-html-container") %>%
  html_text(trim = TRUE)
get_text
Sys.sleep(5)
#loop above for all 250 bill hyperlinks on one page
billTexts <- c()
for (i in 1:length(urlLinks)) {
  rest_of_links <- urlLinks[i]
  rest_of_links <- gsub("\\?.*", "", rest_of_links)
  rest_of_links <- paste0("https://www.congress.gov", rest_of_links, "/text")
  billText <- read_html(rest_of_links) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
  billTexts <- c(billTexts, billText)
}
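As an aside, the same single-page loop can collect its results into a data frame and pause between requests. The following is only a sketch, assuming each bill-text page has at most one .generated-html-container block (bills with no posted text get NA); billTable is my own name:
#sketch: collect link/text pairs into a data frame instead of a bare vector
billTable <- data.frame(url = character(0), text = character(0), stringsAsFactors = FALSE)
for (i in seq_along(urlLinks)) {
  link <- gsub("\\?.*", "", urlLinks[i])
  link <- paste0("https://www.congress.gov", link, "/text")
  txt <- read_html(link) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
  if (length(txt) == 0) txt <- NA_character_ #keep one row per link even if no text is posted
  billTable <- rbind(billTable, data.frame(url = link, text = txt[1], stringsAsFactors = FALSE))
  Sys.sleep(5) #pause between requests, as in the single-link example above
}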
#Loop for each page (34)
final <- c() #final table for the loop output
output <- c() #inner loop output
pageNumber <- c(2:34)
#urls for search pages
urls <- url_1
for (i in 1:length(pageNumber)) {
  urls <- c(urls, paste0("https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=", pageNumber[i]))
  #read the 250 hyperlinks on each of the 34 pages
  urlLinks <- urls[i] %>%
    read_html() %>%
    html_nodes(".result-heading a") %>%
    html_attr("href")
  urlLinks <- unique(urlLinks)
  billTexts <- c()
  #loop for pulling bill text for each of the 250 hyperlinks
  for (j in 1:length(urlLinks)) {
    rest_of_links <- urlLinks[j]
    rest_of_links <- gsub("\\?.*", "", rest_of_links)
    rest_of_links <- paste0("https://www.congress.gov", rest_of_links, "/text")
    billText <- read_html(rest_of_links) %>%
      html_nodes(".generated-html-container") %>%
      html_text(trim = TRUE)
    billTexts <- c(billTexts, billText)
    #taking bill output and putting in table
    output <- c(output, billTexts)
  }
  #taking inner loop output and combining with outer loop output and putting it in table
  final <- c(output, urlLinks)
  #return the final dataset here
}
I was expecting to end up with a data frame of bill texts: one text per hyperlink, 250 hyperlinks per page, across all 34 pages of the search.
Solution 1:[1]
We can split the nested loop into simple loops using lapply().
First, generate the search URLs for all 34 pages:
urls <- c(url_1, paste0("https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=", 2:34))
Second, get the bill links from each of the 34 pages:
df <- lapply(urls, function(x) {
  urlLinks <- x %>%
    read_html() %>%
    html_nodes(".result-heading a") %>%
    html_attr("href")
  urlLinks <- unique(urlLinks)
  bill_links <- gsub("\\?.*", "", urlLinks) #drop the query string from each href
  paste0("https://www.congress.gov", bill_links, "/text") #returned value: full bill-text URLs for this page
})
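If any of the 34 requests fails, the whole lapply() call errors out. Since purrr is already loaded, one optional hardening (my own addition, not part of the original answer; safe_read is my own name) is to wrap read_html() in possibly() so a failed page simply yields no links:
#sketch: fault-tolerant variant of the page-link step
safe_read <- possibly(read_html, otherwise = NULL)
df <- lapply(urls, function(x) {
  page <- safe_read(x)
  if (is.null(page)) return(character(0)) #skip pages that could not be fetched
  links <- page %>%
    html_nodes(".result-heading a") %>%
    html_attr("href") %>%
    unique()
  paste0("https://www.congress.gov", gsub("\\?.*", "", links), "/text")
})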
Third, get the text from each of those links:
text <- lapply(df, function(page_links) lapply(page_links, function(x) {
  read_html(x) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
}))
We now have the text from all 34 pages stored in a nested list: one element per page, and within each page one element per bill link.
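To get from that nested list to the data frame the question asked for (one row per bill, with its page number, URL, and text), the two lists can be flattened together. A sketch, assuming each bill contributes at most one text block (missing texts become NA); bill_df is my own name:
#sketch: one row per bill link, using the df and text objects built above
bill_df <- tibble(
  page = rep(seq_along(df), lengths(df)),
  url  = unlist(df),
  text = unlist(lapply(text, function(p)
    vapply(p, function(t) if (length(t) == 0) NA_character_ else t[1], character(1))
  ))
)
From there, the text column could be passed to quanteda, e.g. corpus(bill_df, text_field = "text"), which the loaded packages suggest is the next step.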
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
[1] Source: Stack Overflow
