# NOTE(review): "download education plans to vary text" was fused onto this
# line; it looks like the document title - confirm.
# Global switch: when TRUE, the scraping loop queries robots.txt for each link
# to check whether scraping that web page is allowed.
check_robotxt <- TRUE
# 1 Notes ----
# 2 global variables ----
# Define your global variables if needed:
# 3 load packages ----
# sets the directory of location of this script as the current directory
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))
### load packages
# Attach pacman itself. library() stops with an error when pacman is missing,
# whereas require() would only return FALSE and let p_load() fail afterwards.
library(pacman)
# Attach every package used by this script in one call (pacman documents that
# p_load() installs packages that are not available yet).
p_load(
  "qdapRegex", "rvest", "rSHAPE", "httr",
  "tidyverse",
  "robotstxt",
  "xml2",
  "xlsx", "DT"
)
# 4 get list of links (load .xlsx) ----
# Switch into the "data" folder (relative to the current working directory).
# The working directory stays changed for the rest of the script, so later
# relative paths are resolved from here.
setwd(dir = "data")
# Read the first sheet of links.xlsx; the scraping loop below reads the URLs
# from its `Link` column.
links <- xlsx::read.xlsx2(file = "links.xlsx", sheetIndex = 1)
# 5 get education plans for every link ----
# 5.1 function to scrape webpage ----
#' Scrape the competence descriptions of one education-plan page
#'
#' Opens the page, reads its headline and the competence texts and numbers of
#' the levels G, M and E, and pauses before returning so that repeated calls
#' do not run into HTTP error 429 (too many requests).
#'
#' @param url_educationplan URL of the page to scrape.
#' @param crawl_delay Base pause in seconds. The actual pause is
#'   `crawl_delay` plus three times the measured response time plus a random
#'   0.5 to 1 second.
#' @return A data.frame with the columns `Titel` (page headline), `Niveau`
#'   ("G", "M" or "E"), `Kompetenznummer` (numeric, parentheses removed) and
#'   `Kompetenz` (competence text), ordered G, M, E.
getEducationPlans <- function(url_educationplan, crawl_delay = 2) {
  t0 <- Sys.time()
  session <- rvest::session(url_educationplan)
  t1 <- Sys.time()
  # Back-off time. Ask for seconds explicitly: a bare `t1 - t0` lets difftime
  # choose the unit and would report minutes for a response slower than 60 s.
  response_delay <- as.numeric(difftime(t1, t0, units = "secs"))

  wbpage <- rvest::read_html(session)

  # Avoid HTTP error 429 due to too many requests - use crawl delay & back off
  Sys.sleep(crawl_delay + 3 * response_delay + runif(n = 1, min = 0.5, max = 1))

  # Text of every node on the page that matches a CSS selector.
  extract_text <- function(css) {
    rvest::html_text(rvest::html_elements(wbpage, css))
  }

  # get title of page
  tmp_title <- extract_text(".basecontent-pbsbw---headline-h1")

  # get competence texts and numbers for the levels G, M and E (in this order)
  niveaus <- c("G", "M", "E")
  level_texts <- lapply(
    niveaus,
    function(lvl) extract_text(paste0(".ITK_", lvl, " p"))
  )
  level_numbers <- lapply(
    niveaus,
    function(lvl) extract_text(paste0(".ITK_", lvl, " .teilkompetenzNumber"))
  )

  # Titel is recycled by data.frame() over all rows (presumably there is one
  # headline per page - TODO confirm).
  data.frame(
    Titel = tmp_title,
    Niveau = rep(x = niveaus, times = lengths(level_texts)),
    Kompetenznummer = as.numeric(x = stringr::str_remove_all(
      string = do.call(c, level_numbers),
      pattern = "\\)|\\("
    )),
    Kompetenz = do.call(c, level_texts)
  )
}
# 5.2 run function to scrape webpage ----
# Scrape every link in `links`, collect the per-page results in `tmp_out` and
# store each page title in `links$Kompetenz`.

# Create the title column up front so that assigning a single row works even
# when earlier rows were skipped.
if (!"Kompetenz" %in% names(links)) {
  links$Kompetenz <- rep(NA_character_, nrow(links))
}

# One slot per link; slots of skipped links stay NULL.
scraped <- vector("list", nrow(links))
for (i in seq_len(nrow(links))) {
  link_i <- links$Link[i]
  # Honour robots.txt: skip pages that must not be scraped instead of only
  # querying the permission and ignoring the answer.
  if (check_robotxt && !isTRUE(robotstxt::paths_allowed(paths = link_i))) {
    warning("robots.txt does not allow scraping, skipped: ", link_i,
            call. = FALSE)
    next
  }
  tmp_EP <- getEducationPlans(url_educationplan = link_i, crawl_delay = 2)
  scraped[[i]] <- tmp_EP
  links$Kompetenz[i] <- unique(tmp_EP$Titel)
}

# Bind all scraped pages in one step instead of growing the result per
# iteration.
tmp_out <- do.call(rbind, Filter(Negate(is.null), scraped))
if (is.null(tmp_out)) {
  stop("No education plans were scraped.", call. = FALSE)
}
DT::datatable(data = tmp_out)
# 5.3 save data ----
# Change into the "outputs" folder (resolved relative to the current working
# directory) and write the collected education plans to an Excel file there.
setwd("outputs")
xlsx::write.xlsx2(tmp_out, "Bildungsplaene.xlsx")