#=======================================
# Collect data from timviecnhanh.com
#=======================================
rm(list = ls()) # Clear workspace.
library(rvest)
library(tidyverse)
library(lubridate)
library(stringi)
# Helper function: read a page, retrying when the request times out.
read_html_NE <- function(x) {
  Page.src <- try(read_html(x), silent = TRUE)
  # Test if Page.src is erroneous:
  if (class(Page.src)[1] == "try-error") {
    error.cond <- attr(Page.src, "condition")
    # Check if the error message contains the phrase "Timed out". If regexpr cannot
    # find a match it returns -1:
    timed.out <- regexpr("Timed out", conditionMessage(error.cond), ignore.case = TRUE) != -1
    # We want to retry only on a "timed out" error:
    if (timed.out == TRUE) {
      # Print information in the console:
      print(paste(x, ": Timed out. Trying to reconnect in 1s. Please wait..."))
      Sys.sleep(1)
      return(read_html_NE(x))
    }
  }
  return(Page.src)
}
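# Quick sanity check (sketch, commented out): run the helper on a single
# search-results page before launching the full scrape. The URL reuses the
# search query used further below:
# read_html_NE("https://www.timviecnhanh.com/vieclam/timkiem?&page=1")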
# Function extracts the job description for a specific job link:
get_job_timviecnhanh <- function(job_link) {
  job_link %>%
    read_html_NE() -> content
  # First block of job attributes (salary, experience, qualification, location, job type):
  content %>%
    html_nodes(xpath = '//*[@id="left-content"]/article/div[5]/div[1]') %>%
    html_text(trim = TRUE) -> inf1
  inf1 %>%
    stri_trans_general("Latin-ASCII") %>%
    str_replace_all(" ", "") %>%
    str_split("\n", simplify = TRUE) %>%
    as.data.frame() %>%
    select(V2, V6, V9, V14, V19) %>%
    rename(salary = V2, experience = V6, qualification = V9, location = V14, job_type = V19) -> df1
  # Second block of job attributes (headcount, gender, job nature, work form, probation):
  content %>%
    html_nodes(xpath = '//*[@id="left-content"]/article/div[5]/div[2]') %>%
    html_text(trim = TRUE) -> inf2
  inf2 %>%
    stri_trans_general("Latin-ASCII") %>%
    str_replace_all(" ", "") %>%
    str_split("\n", simplify = TRUE) %>%
    as.data.frame() %>%
    select(V2, V5, V8, V11, V14) %>%
    rename(numbers_hired = V2, gender = V5, tinh_chat = V8, hinh_thuc = V11, thu_viec = V14) -> df2
  # Posting date and number of views:
  content %>%
    html_nodes(xpath = '//*[@id="left-content"]/article/div[1]') %>%
    html_text() %>%
    stri_trans_general("Latin-ASCII") %>%
    str_replace_all(" ", "") %>%
    str_split("\n", simplify = TRUE) %>%
    as.data.frame() %>%
    pull(V3) -> update_and_Nviews
  # Employer name and address:
  content %>%
    html_nodes(xpath = '//*[@id="left-content"]/article/div[2]/h3') %>%
    html_text() %>%
    stri_trans_general("Latin-ASCII") %>%
    str_squish() -> firm_name
  content %>%
    html_nodes(xpath = '//*[@id="left-content"]/article/div[2]/span') %>%
    html_text() %>%
    stri_trans_general("Latin-ASCII") %>%
    str_squish() -> firm_add
  # Job requirements, benefits and application deadline:
  content %>%
    html_nodes(xpath = '//*[@id="left-content"]/article/table/tbody/tr[2]/td[2]/p') %>%
    html_text() %>%
    stri_trans_general("Latin-ASCII") %>%
    str_squish() -> requirement
  content %>%
    html_nodes(xpath = '//*[@id="left-content"]/article/table/tbody/tr[3]') %>%
    html_text() %>%
    stri_trans_general("Latin-ASCII") %>%
    str_squish() -> benefits
  content %>%
    html_nodes(xpath = '//*[@id="left-content"]/article/table/tbody/tr[4]') %>%
    html_text() %>%
    stri_trans_general("Latin-ASCII") %>%
    str_squish() -> deadline
  # Combine all pieces into one data frame:
  bind_cols(df1, df2) %>%
    mutate(update_and_Nviews = update_and_Nviews,
           firm_name = firm_name,
           firm_add = firm_add,
           requirement = requirement,
           benefits = benefits,
           deadline = deadline) %>%
    mutate_all(function(x) {stri_trans_general(x, "Latin-ASCII")}) -> df_info
  return(df_info %>% mutate(link = job_link, data_date = now() %>% as.character()))
}
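# Example (sketch, commented out): parse a single posting first to confirm the
# XPaths still match the page layout. The URL is a placeholder; substitute any
# element of all_job_links collected below:
# get_job_timviecnhanh("https://www.timviecnhanh.com/<some-job-posting>") %>% glimpse()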
# Function extracts all job links from a search-results page:
extract_all_links_page <- function(page_link) {
  page_link %>%
    read_html_NE() %>%
    html_nodes(xpath = '/html/body/section/div/div[1]/div/div/article/table/tbody') %>%
    html_nodes("a") %>%
    html_attr("href")
}
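# Example (sketch, commented out): links from the first results page only, as a
# quick check of the XPath above:
# extract_all_links_page("https://www.timviecnhanh.com/vieclam/timkiem?&page=1") %>% head()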
# Get all job links:
lapply(paste0("https://www.timviecnhanh.com/vieclam/timkiem?&page=", 1:300), extract_all_links_page) -> all_links
all_links %>% unlist() %>% unique() -> all_job_links
# Collect descriptions for all jobs:
n <- length(all_job_links)
list_job_info <- vector("list", n)
system.time(for(i in 1:n) {
list_job_info[[i]] <- tryCatch(get_job_timviecnhanh(all_job_links[i]), error = function(e) {data.frame(link = all_job_links[i])})
})
do.call("bind_rows", list_job_info) -> df_timviecnhanh
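# Note: links that failed inside the loop above end up as rows carrying only `link`
# (all other columns NA), so a retry pass could be built from them, e.g. (sketch):
# failed_links <- df_timviecnhanh %>% filter(is.na(firm_name)) %>% pull(link)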
# Save data:
today_time <- today() %>% as.character()
write_csv(df_timviecnhanh, paste0("df_timviecnhanh", today_time, ".csv"))
#=======================================
# Collect data from vieclam24h.vn
#=======================================
# Function extracts all job links from a search-results page:
extract_all_links_page <- function(page_link) {
  page_link %>%
    read_html_NE() %>%
    html_nodes(".font12+ .box_shadow .text_grey2") %>%
    html_attr("href") -> pref_link
  # The hrefs are relative, so prepend the site domain:
  paste0("https://vieclam24h.vn", pref_link) -> all_links
  return(all_links)
}
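# Example (sketch, commented out): check the CSS selector on the first results
# page; the URL reuses the search query defined further below:
# extract_all_links_page(paste0("https://vieclam24h.vn/tim-kiem-viec-lam-nhanh/?hdn_tu_khoa=&hdn_nganh_nghe_cap1=&hdn_dia_diem=&key=ttv_nangcao&page=", 1)) %>% head()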
# Function extracts the job description for a specific job link:
get_job_vieclam24h <- function(job_link) {
  job_link %>%
    read_html() -> content
  # Main block of job attributes:
  content %>%
    html_nodes(xpath = '//*[@id="block_body_main"]/div[2]/div[4]') %>%
    html_text(trim = TRUE) -> inf1
  inf1 %>%
    stri_trans_general("Latin-ASCII") %>%
    str_replace_all(" ", "") %>%
    str_replace_all("\n", "") %>%
    str_split(":", simplify = TRUE) %>%
    as.data.frame() %>%
    select(salary = V2, experience = V3, qualification = V4,
           numbers_hired = V5, job_type = V6, location = V7,
           hinh_thuc = V8, tinh_chat = V9, gender = V10, age_requirement = V11) -> df_info
  # Application deadline and posting date / number of views:
  content %>%
    html_nodes(xpath = '//*[@id="block_body_main"]/div[2]/div[2]/div[1]/span') %>%
    html_text(trim = TRUE) -> deadline
  content %>%
    html_nodes(xpath = '//*[@id="block_body_main"]/div[2]/div[2]/div[1]/p') %>%
    html_text(trim = TRUE) -> update_and_Nviews
  # Employer name and address:
  content %>%
    html_nodes(xpath = '//*[@id="block_body_main"]/div[2]/div[1]/div/p/a') %>%
    html_text(trim = TRUE) -> firm_name
  content %>%
    html_nodes(xpath = '//*[@id="ttd_detail"]/div[2]/div[3]/p') %>%
    html_text(trim = TRUE) -> firm_add
  # Job requirements and benefits:
  content %>%
    html_nodes(xpath = '//*[@id="ttd_detail"]/div[1]/div[2]/div[3]/p') %>%
    html_text(trim = TRUE) -> requirement
  content %>%
    html_nodes(xpath = '//*[@id="ttd_detail"]/div[1]/div[2]/div[2]/p') %>%
    html_text(trim = TRUE) -> benefits
  # Combine all pieces into one data frame:
  df_info %>%
    mutate(deadline = deadline, update_and_Nviews = update_and_Nviews,
           firm_name = firm_name, firm_add = firm_add, requirement = requirement,
           benefits = benefits, link = job_link, data_date = as.character(now())) -> job_info
  return(job_info %>% mutate_all(function(x) {stri_trans_general(x, "Latin-ASCII")}))
}
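# Example (sketch, commented out): parse a single posting before the full run.
# The URL is a placeholder; substitute any element of list_all_job_links
# collected below:
# get_job_vieclam24h("https://vieclam24h.vn/<some-job-posting>") %>% glimpse()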
# Extract all job links:
paste0("https://vieclam24h.vn/tim-kiem-viec-lam-nhanh/?hdn_tu_khoa=&hdn_nganh_nghe_cap1=&hdn_dia_diem=&key=ttv_nangcao&page=", 1:130) -> all_pages
lapply(all_pages, extract_all_links_page) %>% unlist() %>% unique() -> list_all_job_links
# Collect descriptions for all jobs:
n <- length(list_all_job_links)
list_job_info <- vector("list", n)
system.time(
for(i in 1:n) {list_job_info[[i]] <- tryCatch(get_job_vieclam24h(list_all_job_links[i]), error = function(e) {data.frame(link = list_all_job_links[i])})}
)
do.call("bind_rows", list_job_info) -> df_vieclam24h
# Save data:
today_time <- today() %>% as.character()
write_csv(df_vieclam24h, paste0("df_vieclam24h", today_time, ".csv"))
#========================================================================================================================
# Collect data from viectotnhat
# References:
# 1. https://stackoverflow.com/questions/45414913/gsub-and-remove-all-characters-between-and-in-r
# 2. https://datascience.stackexchange.com/questions/8922/removing-strings-after-a-certain-character-in-a-given-text
#========================================================================================================================
# Function extracts all job links from a search-results page:
extract_all_links_page <- function(page_link) {
  page_link %>%
    read_html() %>%
    html_nodes(".col-sm-6 .margin0 a") %>%
    html_attr("href") -> all_links
  return(all_links)
}
# Function extracts the job description for a specific job link:
get_job_viectotnhat <- function(job_link) {
  job_link %>%
    read_html() -> content
  # Main block of job attributes:
  content %>%
    html_nodes(xpath = '//*[@id="style-ipad"]/ul') %>%
    html_text(trim = TRUE) -> inf1
  # Drop parenthesised notes (parentheses are turned into <...> tags and removed,
  # see reference 1), strip line breaks, then split the "label: value" string on colons:
  inf1 %>%
    stri_trans_general("Latin-ASCII") %>%
    str_replace_all("\\(", "<") %>%
    str_replace_all("\\)", ">") %>%
    str_replace_all("<[^>]+>", "") %>%
    str_replace_all("\n", "") %>%
    str_flatten() %>%
    str_split(":", simplify = TRUE) %>%
    as.data.frame() %>%
    mutate_all(function(x) {str_squish(x)}) -> df1
  df1 %>%
    select(salary = V2, experience = V3, qualification = V4,
           numbers_hired = V5, job_type = V6, location = V7,
           hinh_thuc = V8, tinh_chat = V9, gender = V10) -> df1
  # Application deadline and posting date / number of views:
  content %>%
    html_nodes(xpath = '//*[@id="main-content"]/div/div/div[1]/div[4]/div[1]/div[2]/span[2]') %>%
    html_text(trim = TRUE) -> deadline
  content %>%
    html_nodes(xpath = '//*[@id="main-content"]/div/div/div[1]/div[4]/div[3]') %>%
    html_text(trim = TRUE) -> update_and_Nviews
  # Employer name and address:
  content %>%
    html_nodes(xpath = '//*[@id="main-content"]/div/div/div[1]/div[4]/div[1]/a[1]/h2') %>%
    html_text(trim = TRUE) -> firm_name
  content %>%
    html_nodes(xpath = '//*[@id="main-content"]/div/div/div[1]/div[6]/div/div[2]/div[1]/div[2]') %>%
    html_text(trim = TRUE) -> firm_add
  # Job requirements and benefits:
  content %>%
    html_nodes(xpath = '//*[@id="main-content"]/div/div/div[1]/div[6]/div/div[1]/div[1]/div[7]') %>%
    html_text(trim = TRUE) -> requirement
  content %>%
    html_nodes(xpath = '//*[@id="main-content"]/div/div/div[1]/div[6]/div/div[1]/div[1]/div[9]') %>%
    html_text(trim = TRUE) -> benefits
  # Combine all pieces into one data frame:
  df1 %>%
    mutate(deadline = deadline, update_and_Nviews = update_and_Nviews,
           firm_name = firm_name, firm_add = firm_add, requirement = requirement,
           benefits = benefits, link = job_link, data_date = as.character(now())) -> job_info
  return(job_info %>% mutate_all(function(x) {stri_trans_general(x, "Latin-ASCII")}))
}
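# Example (sketch, commented out): same single-posting check as for the other
# two sites; replace the placeholder with any element of all_job_links
# collected below:
# get_job_viectotnhat("https://viectotnhat.com/<some-job-posting>") %>% glimpse()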
# All pages:
paste0("https://viectotnhat.com/viec-lam/tim-kiem?tu_khoa=&nganh_nghe=0&muc_luong=0&tinh_thanh=0&loai_hinh=0&kinh_nghiem=0&trinh_do=0&gioi_tinh=0&page=", 1:500) -> all_pages
# Get all job links:
system.time(
lapply(all_pages, extract_all_links_page) -> list_all_jobs
)
list_all_jobs %>% unlist() %>% unique() -> all_job_links
# Collect descriptions for all jobs:
n <- length(all_job_links)
list_job_info <- vector("list", n)
system.time(
for(i in 1:n) {list_job_info[[i]] <- tryCatch(get_job_viectotnhat(all_job_links[i]), error = function(e) {data.frame(link = all_job_links[i])})}
)
do.call("bind_rows", list_job_info) -> df_vieclamtotnhat
# Save data (ideally this should be saved in .rds format):
today_time <- today() %>% as.character()
write_csv(df_vieclamtotnhat, paste0("df_vieclamtotnhat", today_time, ".csv"))
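# Sketch of the .rds option mentioned above (preserves column types on reload;
# the file name mirrors the CSV one):
# saveRDS(df_vieclamtotnhat, paste0("df_vieclamtotnhat", today_time, ".rds"))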