Web scraping with the right html_element in R
This code worked two weeks ago but is not working now. Is it possible that the html_elements were changed? How can I get the updated ones without having to change the code all over again?
# To get $rooms, $m2, $price, $location, $link for every listing on the
# result pages, collected into the data frame `flat_I`.
library(rvest)
library(dplyr)
library(stringr)

base_url <- "https://www.willhaben.at"
n_pages <- 3 # 300

# One data frame per page, bound together once after the loop; calling
# rbind() inside the loop re-copies everything collected so far each time.
page_tables <- vector("list", n_pages)

for (i in seq_len(n_pages)) {
  page_url <- paste0(
    base_url, "/iad/immobilien/eigentumswohnung/wien?page=", i
  )
  page <- read_html(page_url)

  # Parse out the parent nodes (one per listing).
  # NOTE(review): these class names look auto-generated, so they presumably
  # change whenever the site is redeployed -- confirm in the browser.
  results <- page %>% html_elements(".faMxZw")

  if (length(results) == 0) {
    warning(
      "No listings matched '.faMxZw' on page ", i,
      "; the CSS class names have probably changed.",
      call. = FALSE
    )
    next
  }

  # Retrieve the fields from each parent. html_element() (singular) yields
  # exactly one value per parent, NA where the child is missing, so these
  # vectors always have the same length as `results`.
  rooms <- results %>%
    html_element(".iLQwFF+ .iLQwFF .jXuiQ") %>%
    html_text()
  m2 <- results %>%
    html_element(".iLQwFF:nth-child(1) .jXuiQ") %>%
    html_text()
  price <- results %>%
    html_element(".kswHcP , .eRKVmh") %>%
    html_text()
  location <- results %>%
    html_element(".kSOEKM .khvLsE") %>%
    html_text()

  # Listing links are taken from the whole page, not from each parent.
  # str_c() returns character(0) for empty input, whereas paste0() would
  # produce a single bogus "https://www.willhaben.at" entry.
  link <- page %>%
    html_elements("a") %>%
    html_attr("href") %>%
    unique() %>%
    str_subset("expose")
  link <- str_c(base_url, link)

  # data.frame() would silently recycle `link` if its length happened to
  # divide the number of listings, pairing links with the wrong rows.
  if (length(link) != length(results)) {
    stop(
      "Page ", i, ": found ", length(link), " listing links for ",
      length(results), " listings; the selectors need updating.",
      call. = FALSE
    )
  }

  page_tables[[i]] <- data.frame(
    rooms, m2, price, location, link,
    stringsAsFactors = FALSE
  )
  print(paste("page: ", i))
}

# Skipped pages are NULL entries, which bind_rows() ignores.
flat_I <- bind_rows(page_tables)
Solution 1:[1]
I think I have been able to extract the information you need with the following code:
library(RSelenium)

# Start a detached Selenium Firefox container, mapping host port 4445 to the
# container's port 4444. system() is used because shell() only exists in R
# for Windows.
system("docker run -d -p 4445:4444 selenium/standalone-firefox")

# `docker run -d` returns as soon as the container is created; give the
# Selenium server inside it a moment to start before connecting.
Sys.sleep(5)

remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4445L,
  browserName = "firefox"
)
remDr$open()
remDr$navigate(
  "https://www.willhaben.at/iad/immobilien/eigentumswohnung/wien?page=1"
)

# Accept cookies. findElement() raises an error when the element is absent,
# so a missing consent banner is reported and skipped instead of aborting.
tryCatch(
  {
    web <- remDr$findElement("id", "didomi-notice-agree-button")
    web$clickElement()
  },
  error = function(e) {
    message("Cookie banner not clicked: ", conditionMessage(e))
  }
)

# One web element per listing box; getElementText() returns the visible text
# of each one, giving a list of one-element lists.
# NOTE(review): "dfxdhY" looks like a generated class name and may change.
web_Obj <- remDr$findElements("class", "dfxdhY")
list_Text <- lapply(X = web_Obj, FUN = function(x) x$getElementText())
list_Text
Here is the result:
[[1]]
[[1]][[1]]
[1] "280 m²\n7 Zimmer\nTerrasse, Balkon, Dachterrasse\nVon Poll Immobilien Wien\n€ 3.500.000"
[[2]]
[[2]][[1]]
[1] "55 m²\n2 Zimmer\nERA Team Immobilienmanagement, RES Real Estate Services GmbH\n€ 299.900"
[[3]]
[[3]][[1]]
[1] "111 m²\nBalkon\nERA Team Immobilienmanagement, RES Real Estate Services GmbH\n€ 949.000"
[[4]]
[[4]][[1]]
[1] "60 m²\n3 Zimmer\nTerrasse\nERA Team Immobilienmanagement, RES Real Estate Services GmbH\n€ 289.000"
[[5]]
[[5]][[1]]
[1] "62 m²\n2 Zimmer\nTerrasse, Balkon\nDECUS Immobilien GmbH\n€ 513.152"
[[6]]
[[6]][[1]]
[1] "60 m²\n3 Zimmer\nBalkon\nDECUS Immobilien GmbH\n€ 432.337,50"
[[7]]
[[7]][[1]]
[1] "125 m²\n4 Zimmer\nTerrasse\nDECUS Immobilien GmbH\n€ 772.855,20"
Here is another approach that does not require Docker:
library(RDCOMClient)

# Run `script` in the page's window `n_steps` times, printing the step number
# and pausing briefly after each call.
scroll_steps <- function(document, script, n_steps = 100L) {
  for (step in seq_len(n_steps)) {
    print(step)
    document$parentWindow()$execScript(script, "javascript")
    Sys.sleep(0.02)
  }
  invisible(NULL)
}

# Open the first result page in a visible Internet Explorer window driven
# through COM, and wait a fixed ten seconds before reading the document.
url <- "https://www.willhaben.at/iad/immobilien/eigentumswohnung/wien?page=1"
IEApp <- COMCreate("InternetExplorer.Application")
IEApp[["Visible"]] <- TRUE
IEApp$Navigate(url)
Sys.sleep(10)
doc <- IEApp$Document()

# Dispatch a synthetic click on the cookie-consent button; any error is
# turned into NA so the script carries on.
mouseEvent <- doc$createEvent("MouseEvent")
mouseEvent$initEvent("click", TRUE, FALSE)
web_Obj <- doc$getElementByID("didomi-notice-agree-button")
tryCatch(web_Obj$dispatchEvent(mouseEvent), error = function(e) NA)

# Force the page to load by scrolling down, then back up again.
scroll_steps(doc, "window.scrollBy(0, 75);")
scroll_steps(doc, "window.scrollBy(0, -75);")

# Read the text of up to 100 listing rows. Positions that raise an error
# are stored as NA and dropped afterwards.
list_Elem <- doc$getElementsByClassName("Box-sc-wfmb7k-0 ResultListAdRowLayout___StyledBox-sc-1rmys2w-0 iXdpLu bHgpCD")
list_Text <- list()
for (idx in seq_len(100L)) {
  print(idx)
  list_Text[[idx]] <- tryCatch(
    list_Elem[[idx]]$InnerText(),
    error = function(e) NA
  )
}

has_text <- !is.na(list_Text)
list_Text <- list_Text[has_text]
list_Text[seq_len(10L)]
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 |