Web scraping with the right html_element in R

This code worked two weeks ago but is not working now. Is it possible that the html_elements have changed? How can I get the updated ones without having to rewrite the code all over again?


# To get $rooms, $m2, $price, $link
library(rvest)
library(dplyr)
library(stringr)

flat_I <- data.frame()

for (i in 1:3) { # first 3 pages for testing; 1:300 for the full run
  links <- paste0("https://www.willhaben.at/iad/immobilien/eigentumswohnung/wien?page=", i)  
  page <- read_html(links)
  
  #parse out the parent nodes
  results <- page %>% html_elements(".faMxZw")
  
  #retrieve the rooms, m2 and price from each parent
  rooms <- results %>% html_element(".iLQwFF+ .iLQwFF .jXuiQ") %>%
    html_text()
  
  m2 <- results %>% html_element(".iLQwFF:nth-child(1) .jXuiQ") %>%
    html_text()
  
  price <- results %>% html_element(".kswHcP , .eRKVmh") %>%
    html_text()
  
  location <- results %>% html_element(".kSOEKM .khvLsE") %>%
    html_text()
  
  link <- page %>% html_elements('a') %>% 
    html_attr('href') %>% unique() %>% 
    str_subset('expose') %>% 
    paste0("https://www.willhaben.at", .)
  
  flat_I <- rbind(flat_I, data.frame(rooms, m2, price, location, link, stringsAsFactors = FALSE))
  print(paste("page: ",i))
}
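
One quick way to confirm that the selectors have simply gone stale is to count how many nodes they still match. The class names (.faMxZw, .iLQwFF, .jXuiQ) look like auto-generated styled-components hashes, so they can change whenever the site is redeployed. A minimal check, assuming the listing page still returns server-rendered markup to read_html():

library(rvest)

page <- read_html("https://www.willhaben.at/iad/immobilien/eigentumswohnung/wien?page=1")

# zero matches means the old parent class is gone from the markup
length(page %>% html_elements(".faMxZw"))

# list the class attributes that are actually present now, to pick new selectors from
page %>% html_elements("div") %>% html_attr("class") %>% unique() %>% head(20)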




Solution 1:[1]

I think I have been able to extract the information you need with the following code:

library(RSelenium)
# start a standalone Selenium/Firefox server in Docker (shell() is the Windows command wrapper)
shell('docker run -d -p 4445:4444 selenium/standalone-firefox')
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4445L, browserName = "firefox")
remDr$open()
remDr$navigate("https://www.willhaben.at/iad/immobilien/eigentumswohnung/wien?page=1")

# accept cookies ...
web <- remDr$findElement("id", "didomi-notice-agree-button")
web$clickElement()

# each listing card is matched by its (auto-generated) CSS class
web_Obj <- remDr$findElements("class", "dfxdhY")
list_Text <- lapply(X = web_Obj, FUN = function(x) x$getElementText())
list_Text

Here is the result:

[[1]]
[[1]][[1]]
[1] "280 m²\n7 Zimmer\nTerrasse, Balkon, Dachterrasse\nVon Poll Immobilien Wien\n€ 3.500.000"


[[2]]
[[2]][[1]]
[1] "55 m²\n2 Zimmer\nERA Team Immobilienmanagement, RES Real Estate Services GmbH\n€ 299.900"


[[3]]
[[3]][[1]]
[1] "111 m²\nBalkon\nERA Team Immobilienmanagement, RES Real Estate Services GmbH\n€ 949.000"


[[4]]
[[4]][[1]]
[1] "60 m²\n3 Zimmer\nTerrasse\nERA Team Immobilienmanagement, RES Real Estate Services GmbH\n€ 289.000"


[[5]]
[[5]][[1]]
[1] "62 m²\n2 Zimmer\nTerrasse, Balkon\nDECUS Immobilien GmbH\n€ 513.152"


[[6]]
[[6]][[1]]
[1] "60 m²\n3 Zimmer\nBalkon\nDECUS Immobilien GmbH\n€ 432.337,50"


[[7]]
[[7]][[1]]
[1] "125 m²\n4 Zimmer\nTerrasse\nDECUS Immobilien GmbH\n€ 772.855,20"

Here is another approach that does not require Docker; it drives Internet Explorer through COM with RDCOMClient, so it is Windows-only:

library(RDCOMClient)
url <- "https://www.willhaben.at/iad/immobilien/eigentumswohnung/wien?page=1"
IEApp <- COMCreate("InternetExplorer.Application")
IEApp[['Visible']] <- TRUE
IEApp$Navigate(url)

Sys.sleep(10)
doc <- IEApp$Document()
# accept the cookie banner by dispatching a synthetic click on the consent button
mouseEvent <- doc$createEvent("MouseEvent")
mouseEvent$initEvent("click", TRUE, FALSE)
web_Obj <- doc$getElementByID("didomi-notice-agree-button")
tryCatch(web_Obj$dispatchEvent(mouseEvent), error = function(e) NA)

# Force to load the page by scrolling down
for(i in 1 : 100)
{
  print(i)
  doc$parentWindow()$execScript("window.scrollBy(0, 75);", "javascript")
  Sys.sleep(0.02)
}

for(i in 1 : 100)
{
  print(i)
  doc$parentWindow()$execScript("window.scrollBy(0, -75);", "javascript")
  Sys.sleep(0.02)
}

# listing cards, matched by their full (auto-generated) class string
list_Elem <- doc$getElementsByClassName("Box-sc-wfmb7k-0 ResultListAdRowLayout___StyledBox-sc-1rmys2w-0 iXdpLu bHgpCD")
list_Text <- list()

for(i in 1 : 100)
{
  print(i)
  list_Text[[i]] <- tryCatch(list_Elem[[i]]$InnerText(), error = function(e) NA)
}
  
list_Text <- list_Text[!is.na(list_Text)]
list_Text[1 : 10]
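
With either approach, each element of list_Text is the full text of one listing card, so the rooms, m² and price columns from the question can be pulled out with a few regular expressions. A rough sketch; the patterns are guesses based on the sample output above and may need tweaking for cards that omit a field:

library(stringr)

txt <- unlist(list_Text)

flat_I <- data.frame(
  rooms = str_extract(txt, "\\d+(?= Zimmer)"),
  m2    = str_extract(txt, "\\d+(?= m²)"),
  price = str_extract(txt, "€ [0-9.,]+"),
  stringsAsFactors = FALSE
)
flat_I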

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1