download education plans to vary text
1 Notes
2 global variables
Define your global variables if needed:
check_robotxt <- TRUE # check whether scraping the web page is allowed (robots.txt)
3 load packages
# set the working directory to the folder containing this script
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))
### load packages
require(pacman)
p_load('qdapRegex', 'rvest', 'rSHAPE', 'httr',
       'tidyverse',
       'robotstxt',
       'xml2',
       'xlsx', 'DT')
4 get list of links (load .xlsx)
setwd(dir = "data")
links <- xlsx::read.xlsx2(file = "links.xlsx", sheetIndex = 1)
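The loop in step 5.2 below reads the URLs from a column named Link and later writes a Kompetenz column back into links; those column names are inferred from that code, so a quick structural check right after loading can catch a differently named column early:

# sanity check: step 5.2 relies on a "Link" column in links.xlsx (name inferred from that code)
stopifnot("Link" %in% colnames(links))
str(links)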
5 get education plans for every link
5.1 function to scrape webpage
getEducationPlans <- function(url_educationplan, crawl_delay = 2) {
  t0 <- Sys.time()
  session <- rvest::session(url_educationplan) # , ... > session$config$options$useragent
  t1 <- Sys.time()
  response_delay <- as.numeric(t1 - t0) # backing off time
  wbpage <- rvest::read_html(session)
  # Avoid HTTP error 429 due to too many requests - use crawl delay & back off
  Sys.sleep(crawl_delay + 3 * response_delay + runif(n = 1, min = 0.5, max = 1))

  wbpage %>%
    html_elements(".tktable") %>%
    html_text() %>%
    str_extract(string = ., pattern = "[:digit:]{4}")
  # get title of page
  tmp_title <- wbpage %>%
    html_elements(".basecontent-pbsbw---headline-h1") %>%
    html_text()

  # get level G
  tmp_level_G <- wbpage %>%
    html_elements(".ITK_G p") %>%
    html_text()
  tmp_level_G_number <- wbpage %>%
    html_elements(".ITK_G .teilkompetenzNumber") %>%
    html_text()

  # get level M
  tmp_level_M <- wbpage %>%
    html_elements(".ITK_M p") %>%
    html_text()
  tmp_level_M_number <- wbpage %>%
    html_elements(".ITK_M .teilkompetenzNumber") %>%
    html_text()

  # get level E
  tmp_level_E <- wbpage %>%
    html_elements(".ITK_E p") %>%
    html_text()
  tmp_level_E_number <- wbpage %>%
    html_elements(".ITK_E .teilkompetenzNumber") %>%
    html_text()
  tmp <- data.frame(
    Titel = tmp_title,
    Niveau = c(
      rep(x = "G", times = length(tmp_level_G)),
      rep(x = "M", times = length(tmp_level_M)),
      rep(x = "E", times = length(tmp_level_E))
    ),
    Kompetenznummer = as.numeric(x = str_remove_all(
      string = c(tmp_level_G_number, tmp_level_M_number, tmp_level_E_number),
      pattern = "\\)|\\("
    )),
    Kompetenz = c(tmp_level_G, tmp_level_M, tmp_level_E)
  )

  return(tmp)
}
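Before looping over every link, the function can be sanity-checked on a single page; the URL below is only a placeholder, not an actual entry from links.xlsx:

# quick test of the scraper on one link (placeholder URL - replace with a real education-plan URL)
test_EP <- getEducationPlans(url_educationplan = "https://www.example.org/bildungsplan", crawl_delay = 2)
str(test_EP) # expected columns: Titel, Niveau, Kompetenznummer, Kompetenz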
5.2 run function to scrape webpage
for(i in 1:nrow(links)){
  if(check_robotxt){
    # check if scraping this page is allowed; stop if robots.txt disallows it
    # browseURL(url = link)
    stopifnot(paths_allowed(paths = links$Link[i]))
  }
  tmp_EP <- getEducationPlans(url_educationplan = links$Link[i], crawl_delay = 2)

  if(i == 1){
    tmp_out <- tmp_EP
  } else{
    tmp_out <- rbind(tmp_out, tmp_EP)
  }

  links$Kompetenz[i] <- unique(tmp_EP$Titel)
}

DT::datatable(data = tmp_out)
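robotstxt::paths_allowed() also accepts a vector of paths, so the permission check can alternatively be run once for all links before the loop; a minimal sketch, assuming links$Link holds full URLs:

# optional up-front robots.txt check over all links (alternative to the per-link check inside the loop)
if(check_robotxt){
  allowed <- robotstxt::paths_allowed(paths = links$Link)
  if(!all(allowed)){
    warning("robots.txt disallows scraping: ", paste(links$Link[!allowed], collapse = ", "))
  }
}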
5.3 save data
setwd(dir = "outputs")
::write.xlsx2(x = tmp_out, file = "Bildungsplaene.xlsx") xlsx
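An alternative to the second setwd() call is to write with an explicit path; a minimal sketch, assuming "outputs" and "data" are siblings under the project root:

# alternative: keep the working directory in "data" and write via a relative path
# (assumes "outputs" sits next to "data"; adjust the path if your layout differs)
xlsx::write.xlsx2(x = tmp_out, file = file.path("..", "outputs", "Bildungsplaene.xlsx"))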