Web Scraping Programmer Jobs With R and rvest
Background
Pendahuluan
Berdasarkan laporan The Future of Jobs dari World Economic Forum, terdapat sejumlah profesi yang paling dibutuhkan di masa depan, khususnya profesi yang melibatkan ilmu sains dan teknologi. Beberapa profesi yang paling menjanjikan yaitu Software Developer, Web/Mobile App Developer, dan profesi yang berkaitan dengan dunia IT.
Tidak bisa dipungkiri di era digital saat ini, hampir semua perusahaan membutuhkan sistem komputerisasi untuk memajukan setiap unit bisnis mereka. Maka tidak heran profesi di dunia IT semakin dicari dan dibutuhkan. Namun, apakah jumlah kebutuhan sumber daya manusia (SDM) untuk profesi ini di Indonesia sudah merata pada masing-masing daerah? Selain itu, skill pemrograman apa saja yang paling banyak dibutuhkan perusahaan? Untuk menjawab pertanyaan tersebut dilakukan riset dengan cara web scraping pada salah satu situs pencari kerja yaitu Indeed. Riset dilakukan pada bulan Mei 2019 dengan pencarian kata kunci “programmer” dan menghasilkan sebanyak 387 hasil pencarian.
Script Scraping
library(tidyverse)
library(rvest)
library(xml2)
# Exploratory scrape of the first result page for the keyword "programmer".
# Each snippet prints one field so the selector can be checked by eye before
# it is reused in the pagination loop.
url <- "https://id.indeed.com/lowongan-kerja?q=programmer&l="
page <- xml2::read_html(url)

# An XPath that starts with `//` is evaluated from the document root, so
# pre-selecting "div"/"span" nodes does not narrow the match; the XPath
# queries are therefore run directly on `page`.

# get the job title (XPath)
job_title <- page %>%
  rvest::html_nodes(xpath = '//*[@data-tn-element = "jobTitle"]') %>%
  rvest::html_attr("title")

# get the job title (CSS, any element carrying the attribute)
page %>%
  rvest::html_nodes('[data-tn-element="jobTitle"]') %>%
  rvest::html_attr("title")

# or (CSS, restricted to <a> elements)
page %>%
  rvest::html_nodes('a[data-tn-element="jobTitle"]') %>%
  rvest::html_attr("title")

# get company location (XPath)
page %>%
  rvest::html_nodes(xpath = '//*[@class="location"]') %>%
  rvest::html_text() %>%
  stringi::stri_trim_both()

# get company name (XPath)
page %>%
  rvest::html_nodes(xpath = '//*[@class="company"]') %>%
  rvest::html_text() %>%
  stringi::stri_trim_both()

# using CSS
# The attribute selector [class="..."] requires the whole class attribute to
# equal the value, which mirrors the XPath test @class="..." used above.

# get company location (CSS)
page %>%
  rvest::html_nodes('[class="location"]') %>%
  rvest::html_text() %>%
  stringi::stri_trim_both()

# get company name (CSS)
page %>%
  rvest::html_nodes('[class="company"]') %>%
  rvest::html_text() %>%
  stringi::stri_trim_both()

# get links (XPath)
page %>%
  rvest::html_nodes(xpath = '//*[@data-tn-element="jobTitle"]') %>%
  rvest::html_attr("href")

# get links (CSS selectors)
page %>%
  rvest::html_nodes('[data-tn-element="jobTitle"]') %>%
  rvest::html_attr("href")

# get job description (XPath)
page %>%
  rvest::html_nodes(xpath = '//*[@class="jobsearch-jobDescriptionText"]') %>%
  rvest::html_text() %>%
  stringi::stri_trim_both()
# Scrape every result page of the "programmer" search and, for each listing,
# the job description from its detail page. One data frame per result page is
# collected and all of them are bound into `full_df` at the end.
page_result_start <- 10 # `start` offset of the first result page requested
page_result_end <- 1510 # `start` offset of the last result page requested
# NOTE(review): offset 10 looks like the second result page (offset 0 is never
# requested here) -- confirm that skipping the first page is intended.
page_results <- seq(from = page_result_start, to = page_result_end, by = 10)
first_page_url <- "https://id.indeed.com/lowongan-kerja?q=programmer&l="
site_root <- "https://id.indeed.com/"

# Pre-allocated so the result does not have to be re-copied on every page.
page_dfs <- vector("list", length(page_results))

for (i in seq_along(page_results)) {
  url <- paste0(first_page_url, "&start=", page_results[i])
  page <- xml2::read_html(url)

  # Sys.sleep pauses R for two seconds before it resumes.
  # Putting it there avoids error messages such as
  # "Error in open.connection(con, "rb") : Timeout was reached"
  Sys.sleep(2)

  # The XPath expressions start with `//`, i.e. they search the whole
  # document, so they are applied to `page` directly.

  # get the job title
  job_title <- page %>%
    rvest::html_nodes(xpath = '//a[@data-tn-element = "jobTitle"]') %>%
    rvest::html_attr("title")

  # get the company name
  company_name <- page %>%
    rvest::html_nodes(xpath = '//*[@class="company"]') %>%
    rvest::html_text() %>%
    stringi::stri_trim_both()

  # get job location
  job_location <- page %>%
    rvest::html_nodes(xpath = '//*[@class="location"]') %>%
    rvest::html_text() %>%
    stringi::stri_trim_both()

  # get links
  links <- page %>%
    rvest::html_nodes(xpath = '//*[@data-tn-element="jobTitle"]') %>%
    rvest::html_attr("href")

  # Fetch each detail page. The inner loop has its own index and variables so
  # it does not overwrite `i`, `url` or `page` of the outer loop.
  job_description <- character(length(links))
  for (j in seq_along(links)) {
    # url_absolute() resolves the href against the site root, which avoids a
    # doubled slash and also copes with hrefs that are already absolute.
    job_url <- xml2::url_absolute(links[j], site_root)
    job_page <- xml2::read_html(job_url)
    description <- job_page %>%
      rvest::html_nodes(
        xpath = '//*[@class="jobsearch-JobComponent-description icl-u-xs-mt--md"]'
      ) %>%
      rvest::html_text() %>%
      stringi::stri_trim_both()
    # Exactly one value per listing: NA when nothing matched, and several
    # matches joined into one string.
    job_description[j] <- if (length(description) > 0) {
      paste(description, collapse = " ")
    } else {
      NA_character_
    }
  }

  # data.frame() silently recycles vectors whose lengths divide each other,
  # which would pair titles with the wrong company or location. Skip such a
  # page with a warning instead.
  field_lengths <- c(
    length(job_title), length(company_name),
    length(job_location), length(job_description)
  )
  if (length(unique(field_lengths)) != 1) {
    warning(
      "Skipping ", url, ": fields have differing lengths (",
      paste(field_lengths, collapse = ", "), ")",
      call. = FALSE
    )
    next
  }

  page_dfs[[i]] <- data.frame(
    job_title, company_name, job_location, job_description,
    stringsAsFactors = FALSE
  )
}

# Bind all pages at once; the empty data frame keeps `full_df` a data frame
# even when no page produced rows.
full_df <- do.call(rbind, c(list(data.frame()), page_dfs))
Deskripsi Data
Wordcloud
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 6
## Cleaning data
# Text-scrubbing helpers: each applies one fixed gsub() pattern.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
removeNL <- function(y) gsub("\n", " ", y)
replacecomma <- function(y) gsub(",", "", y)
removeRT <- function(y) gsub("RT ", "", y)
removetitik2 <- function(y) gsub(":", "", y)
removetitikkoma <- function(y) gsub(";", " ", y)
removetitik3 <- function(y) gsub("p…", "", y)
removeamp <- function(y) gsub("&", "", y)
removeUN <- function(z) gsub("@\\w+", "", z)
remove.all <- function(xy) gsub("[^[:alpha:][:space:]]*", "", xy)

# Fold the transformations over the `komenc` corpus, one tm_map() call per
# step, in the order listed.
twitclean <- Reduce(
  function(corpus, step) tm_map(corpus, step),
  list(
    removeURL, removeNL, replacecomma, removeRT, removetitik2,
    removetitikkoma, removetitik3, removeamp, removeUN, remove.all,
    removePunctuation, tolower
  ),
  komenc
)
## Warning in tm_map.SimpleCorpus(twitclean, removeWords, c("and")):
## transformation drops documents
Grafik
Most in-Demand Programming Skill
library(dplyr)
library(ggplot2)

# Take the first 20 rows of the frequency table `d` and rename its columns.
app <- d[seq_len(20), ]
colnames(app) <- c("tools", "freq")

# Sort by ascending frequency, then freeze that order in the factor levels so
# the plot keeps it.
app <- app[order(app[["freq"]]), ]
app[["tools"]] <- factor(app[["tools"]], levels = app[["tools"]])

theme_set(theme_minimal())

# Bar chart of frequency per tool.
ggplot(app, aes(tools, freq)) +
  geom_col(fill = "#FF9999", colour = "black") +
  ggtitle(
    "Most in-Demand Programming Skill",
    subtitle = "source : indeed.com - May, 2019"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

# Corpus built from the `loc` vector, used by the cleaning steps that follow.
location <- Corpus(VectorSource(loc))
## Cleaning data
# Text-scrubbing helpers: each applies one fixed gsub() pattern.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
removeNL <- function(y) gsub("\n", " ", y)
replacecomma <- function(y) gsub(",", "", y)
removeRT <- function(y) gsub("RT ", "", y)
removetitik2 <- function(y) gsub(":", "", y)
removetitikkoma <- function(y) gsub(";", " ", y)
removetitik3 <- function(y) gsub("p…", "", y)
removeamp <- function(y) gsub("&", "", y)
removeUN <- function(z) gsub("@\\w+", "", z)
remove.all <- function(xy) gsub("[^[:alpha:][:space:]]*", "", xy)

# Fold the transformations over the `location` corpus, one tm_map() call per
# step, in the order listed.
loc2 <- Reduce(
  function(corpus, step) tm_map(corpus, step),
  list(
    removeURL, removeNL, replacecomma, removeRT, removetitik2,
    removetitikkoma, removetitik3, removeamp, removeUN, remove.all,
    removePunctuation, tolower
  ),
  location
)
# Build a term-document matrix
# Term frequencies over the cleaned location corpus, largest first.
dtm <- TermDocumentMatrix(loc2)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
k <- data.frame(word = names(v), freq = v)
Programmer Job Vacancy based on Location
# Rename the columns, sort by ascending frequency and freeze that order in the
# factor levels so the plot keeps it.
names(k) <- c("Area", "Freq")
k <- k[order(k[["Freq"]]), ]
k[["Area"]] <- factor(k[["Area"]], levels = k[["Area"]])

library(ggplot2)
theme_set(theme_minimal())

# Bar chart of postings per area.
ggplot(k, aes(Area, Freq)) +
  geom_col(fill = "#FF9999", colour = "black") +
  labs(
    title = "Where the programmer jobs are",
    subtitle = "area with the most job posting for programmer roles",
    caption = "source : indeed.com - May, 2019"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 6
## Cleaning data
# Text-scrubbing helpers: each applies one fixed gsub() pattern.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
removeNL <- function(y) gsub("\n", " ", y)
replacecomma <- function(y) gsub(",", "", y)
removeRT <- function(y) gsub("RT ", "", y)
removetitik2 <- function(y) gsub(":", "", y)
removetitikkoma <- function(y) gsub(";", " ", y)
removetitik3 <- function(y) gsub("p…", "", y)
removeamp <- function(y) gsub("&", "", y)
removeUN <- function(z) gsub("@\\w+", "", z)
remove.all <- function(xy) gsub("[^[:alpha:][:space:]]*", "", xy)

# Fold the transformations over the `jkt` corpus, one tm_map() call per step,
# in the order listed.
jkt <- Reduce(
  function(corpus, step) tm_map(corpus, step),
  list(
    removeURL, removeNL, replacecomma, removeRT, removetitik2,
    removetitikkoma, removetitik3, removeamp, removeUN, remove.all,
    removePunctuation, tolower
  ),
  jkt
)
# Build a term-document matrix
# Term frequencies over the cleaned Jakarta corpus, largest first.
dtm <- TermDocumentMatrix(jkt)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
jkt <- data.frame(word = names(v), freq = v)
Jakarta
# Keep the 20 most frequent terms of the Jakarta table itself, so the chart
# below shows Jakarta data rather than the overall table `d`.
jkt <- head(jkt, 20)
colnames(jkt) <- c("Tools", "Freq") # change column names
jkt <- jkt[order(jkt$Freq), ] # sort ascending by frequency
# Freeze the sorted order on the plotted column (`Tools`); otherwise the bars
# would not follow the frequency order.
jkt$Tools <- factor(jkt$Tools, levels = jkt$Tools)

library(ggplot2)
theme_set(theme_minimal())

# Draw plot
ggplot(jkt, aes(x = Tools, y = Freq)) +
  geom_bar(stat = "identity", fill = "#FF9999", colour = "black") +
  labs(
    title = "Programming skill that are in demand in Jakarta",
    caption = "source : indeed.com - May, 2019"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
library(tm)
# Read the file, split every line on tabs and wrap the resulting pieces in a
# corpus.
tn <- Corpus(
  VectorSource(
    unlist(strsplit(readLines("E:/BELAJAR/Scrape Indeed/tng.txt"), "\t"))
  )
)

## Cleaning data
# Text-scrubbing helpers: each applies one fixed gsub() pattern.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
removeNL <- function(y) gsub("\n", " ", y)
replacecomma <- function(y) gsub(",", "", y)
removeRT <- function(y) gsub("RT ", "", y)
removetitik2 <- function(y) gsub(":", "", y)
removetitikkoma <- function(y) gsub(";", " ", y)
removetitik3 <- function(y) gsub("p…", "", y)
removeamp <- function(y) gsub("&", "", y)
removeUN <- function(z) gsub("@\\w+", "", z)
remove.all <- function(xy) gsub("[^[:alpha:][:space:]]*", "", xy)

# Fold the transformations over the `tn` corpus, one tm_map() call per step,
# in the order listed.
tn <- Reduce(
  function(corpus, step) tm_map(corpus, step),
  list(
    removeURL, removeNL, replacecomma, removeRT, removetitik2,
    removetitikkoma, removetitik3, removeamp, removeUN, remove.all,
    removePunctuation, tolower
  ),
  tn
)
# Build a term-document matrix and derive term frequencies, largest first.
dtm <- TermDocumentMatrix(tn)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
tn <- data.frame(word = names(v), freq = v)

# Keep the 10 most frequent terms.
tn <- head(tn, 10)
colnames(tn) <- c("Tools", "Freq") # change column names
tn <- tn[order(tn$Freq), ] # sort ascending by frequency
# Freeze the sorted order on the plotted column (`Tools`); otherwise the bars
# would not follow the frequency order.
tn$Tools <- factor(tn$Tools, levels = tn$Tools)
Tangerang
library(ggplot2)
theme_set(theme_minimal())

# Draw plot
ggplot(tn, aes(x = Tools, y = Freq)) +
  geom_bar(stat = "identity", fill = "#FF9999", colour = "black") +
  labs(
    title = "Programming skill that are in demand in Tangerang",
    caption = "source : indeed.com - May, 2019"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 6
## Cleaning data
# Text-scrubbing helpers: each applies one fixed gsub() pattern.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
removeNL <- function(y) gsub("\n", " ", y)
replacecomma <- function(y) gsub(",", "", y)
removeRT <- function(y) gsub("RT ", "", y)
removetitik2 <- function(y) gsub(":", "", y)
removetitikkoma <- function(y) gsub(";", " ", y)
removetitik3 <- function(y) gsub("p…", "", y)
removeamp <- function(y) gsub("&", "", y)
removeUN <- function(z) gsub("@\\w+", "", z)
remove.all <- function(xy) gsub("[^[:alpha:][:space:]]*", "", xy)

# Fold the transformations over the `sby` corpus, one tm_map() call per step,
# in the order listed.
sby <- Reduce(
  function(corpus, step) tm_map(corpus, step),
  list(
    removeURL, removeNL, replacecomma, removeRT, removetitik2,
    removetitikkoma, removetitik3, removeamp, removeUN, remove.all,
    removePunctuation, tolower
  ),
  sby
)
# Finally drop the word "and".
sby <- tm_map(sby, removeWords, c("and"))
# Build a term-document matrix
# Term frequencies over the cleaned Surabaya corpus, largest first.
dtm <- TermDocumentMatrix(sby)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
sby <- data.frame(word = names(v), freq = v)

# Keep the 10 most frequent terms.
sby <- head(sby, 10)
colnames(sby) <- c("Tools", "Freq") # change column names
sby <- sby[order(sby$Freq), ] # sort ascending by frequency
# Freeze the sorted order on the plotted column (`Tools`); otherwise the bars
# would not follow the frequency order.
sby$Tools <- factor(sby$Tools, levels = sby$Tools)
Surabaya
library(ggplot2)
theme_set(theme_minimal())

# Draw plot
ggplot(sby, aes(x = Tools, y = Freq)) +
  geom_bar(stat = "identity", fill = "#FF9999", colour = "black") +
  labs(
    title = "Programming skill that are in demand in Surabaya",
    caption = "source : indeed.com - May, 2019"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 6
## Cleaning data
# Text-scrubbing helpers: each applies one fixed gsub() pattern.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
removeNL <- function(y) gsub("\n", " ", y)
replacecomma <- function(y) gsub(",", "", y)
removeRT <- function(y) gsub("RT ", "", y)
removetitik2 <- function(y) gsub(":", "", y)
removetitikkoma <- function(y) gsub(";", " ", y)
removetitik3 <- function(y) gsub("p…", "", y)
removeamp <- function(y) gsub("&", "", y)
removeUN <- function(z) gsub("@\\w+", "", z)
remove.all <- function(xy) gsub("[^[:alpha:][:space:]]*", "", xy)

# Fold the transformations over the `bdg` corpus, one tm_map() call per step,
# in the order listed.
bdg <- Reduce(
  function(corpus, step) tm_map(corpus, step),
  list(
    removeURL, removeNL, replacecomma, removeRT, removetitik2,
    removetitikkoma, removetitik3, removeamp, removeUN, remove.all,
    removePunctuation, tolower
  ),
  bdg
)
# Finally drop the word "and".
bdg <- tm_map(bdg, removeWords, c("and"))
# Build a term-document matrix
# Term frequencies over the cleaned Bandung corpus, largest first.
dtm <- TermDocumentMatrix(bdg)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
bdg <- data.frame(word = names(v), freq = v)

# Keep the 10 most frequent terms.
bdg <- head(bdg, 10)
colnames(bdg) <- c("Tools", "Freq") # change column names
bdg <- bdg[order(bdg$Freq), ] # sort ascending by frequency
# Freeze the sorted order on the plotted column (`Tools`); otherwise the bars
# would not follow the frequency order.
bdg$Tools <- factor(bdg$Tools, levels = bdg$Tools)
Bandung
library(ggplot2)
theme_set(theme_minimal())

# Draw plot
ggplot(bdg, aes(x = Tools, y = Freq)) +
  geom_bar(stat = "identity", fill = "#FF9999", colour = "black") +
  labs(
    title = "Programming skill that are in demand in Bandung",
    caption = "source : indeed.com - May, 2019"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 6
## Cleaning data
# Text-scrubbing helpers: each applies one fixed gsub() pattern.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
removeNL <- function(y) gsub("\n", " ", y)
replacecomma <- function(y) gsub(",", "", y)
removeRT <- function(y) gsub("RT ", "", y)
removetitik2 <- function(y) gsub(":", "", y)
removetitikkoma <- function(y) gsub(";", " ", y)
removetitik3 <- function(y) gsub("p…", "", y)
removeamp <- function(y) gsub("&", "", y)
removeUN <- function(z) gsub("@\\w+", "", z)
remove.all <- function(xy) gsub("[^[:alpha:][:space:]]*", "", xy)

# Fold the transformations over the `jog` corpus, one tm_map() call per step,
# in the order listed.
jog <- Reduce(
  function(corpus, step) tm_map(corpus, step),
  list(
    removeURL, removeNL, replacecomma, removeRT, removetitik2,
    removetitikkoma, removetitik3, removeamp, removeUN, remove.all,
    removePunctuation, tolower
  ),
  jog
)
# Finally drop the word "and".
jog <- tm_map(jog, removeWords, c("and"))
# Build a term-document matrix
# Term frequencies over the cleaned Yogyakarta corpus, largest first.
dtm <- TermDocumentMatrix(jog)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
jog <- data.frame(word = names(v), freq = v)

# Keep the 10 most frequent terms.
jog <- head(jog, 10)
colnames(jog) <- c("Tools", "Freq") # change column names
jog <- jog[order(jog$Freq), ] # sort ascending by frequency
# Freeze the sorted order on the plotted column (`Tools`); otherwise the bars
# would not follow the frequency order.
jog$Tools <- factor(jog$Tools, levels = jog$Tools)
Yogyakarta
library(ggplot2)
theme_set(theme_minimal())

# Draw plot
ggplot(jog, aes(x = Tools, y = Freq)) +
  geom_bar(stat = "identity", fill = "#FF9999", colour = "black") +
  labs(
    title = "Programming skill that are in demand in Yogyakarta",
    caption = "source : indeed.com - May, 2019"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
Peta
# Library
library(leaflet)

# load data (semicolon-separated; the code below reads columns lon, lat, Freq)
new <- read.csv("E:/BELAJAR/Scrape Indeed/k.csv", sep = ";")

# Create a color palette with handmade bins of width 50.
# The bins cover at least 0-250 and are extended upwards in steps of 50 when
# the data contain a larger frequency: a value outside the bins gets no bin
# and would be drawn with the transparent `na.color`.
bin_width <- 50
upper_bin <- ceiling(max(c(250, new$Freq), na.rm = TRUE) / bin_width) * bin_width
mybins <- seq(0, upper_bin, by = bin_width)
mypalette <- colorBin(
  palette = "YlOrBr", domain = new$Freq,
  na.color = "transparent", bins = mybins
)

# Prepare the text for the tooltip: one HTML label per row of `new`.
mytext <- paste("Job vacancy: ", new$Freq) %>%
  lapply(htmltools::HTML)

# Final Map
z <- leaflet(new) %>%
  addTiles() %>%
  setView(lat = -5, lng = 115, zoom = 4.5) %>%
  addProviderTiles("Esri.WorldImagery") %>%
  addCircleMarkers(
    ~lon, ~lat,
    fillColor = ~mypalette(Freq), fillOpacity = 0.7, color = "white",
    radius = 8, stroke = FALSE,
    label = mytext,
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "13px", direction = "auto"
    )
  ) %>%
  addLegend(
    pal = mypalette, values = ~Freq, opacity = 0.9,
    title = "Frequency", position = "bottomright"
  )
z
Ringkasan
- Berdasarkan lowongan pekerjaan programmer yang dimuat pada web Indeed diketahui bahwa 66% di antaranya berada di Jakarta. Hal ini tidak mengherankan karena Jakarta merupakan pusat industri dan ekonomi di wilayah Indonesia. Sementara itu apabila dilihat dari peta maka dapat terlihat ketimpangan lowongan pekerjaan antara pulau Jawa dengan pulau lainnya. Hal ini juga mengindikasikan adanya ketimpangan industri dan ekonomi yang masih kurang merata pada masing-masing daerah.
- Kebutuhan pekerjaan programmer paling banyak ditemukan pada lima wilayah berikut secara berurutan yaitu: Jakarta, Surabaya, Bandung, Tangerang, Yogyakarta.
- Secara umum dapat dilihat pada output wordcloud dan grafik bahwa skill pemrograman yang paling banyak dicari oleh perusahaan di Indonesia yaitu: SQL, HTML, PHP, JavaScript, dan CSS.