library(rvest)
library(tidyverse)
# Read page one of the full-time "data scientist" search results for London
reed_data <- read_html("https://www.reed.co.uk/jobs/data-scientist-jobs-in-london?fulltime=True&proximity=1000")
job <- reed_data %>%
  html_nodes(xpath = '//*[@class="page-counter"]') %>%
  html_text() %>%
  strsplit(" ")
# The per-page and total counts sit at fixed positions in the split counter
# text (brittle: this depends on Reed's exact wording)
current_job <- as.numeric(job[[1]][27])
total_job <- as.numeric(job[[1]][29])
paste('On this page there are', current_job, 'jobs out of a total of', total_job, "jobs")
## [1] "On this page there are 25 jobs out of a total of 508 jobs"
Our plan is to scrape each job page individually. When we tried to scrape the search-results page directly, it only gave partial descriptions, so we will loop through the results and collect the ‘data-id’ attributes, which identify each posting’s page in its url. First we need the ids from page one.
# Collect the data-id attribute from each job card's link, dropping links
# that don't carry one
job_url <- reed_data %>%
  html_nodes("div") %>%
  html_nodes(xpath = 'a') %>%
  html_attr('data-id')
job_url <- job_url[!is.na(job_url)]
head(job_url)
## [1] "42227997" "42066794" "42081661" "42079755" "42170498" "42287570"
Now we can get the rest of the pages.
# We already got page one, so start the loop on page 2
n_page <- 2
start_time <- Sys.time()
while (current_job < total_job){
  # Build the url for this page, keeping the same query parameters as page one
  p <- str_c('https://www.reed.co.uk/jobs/data-scientist-jobs-in-london?pageno=', n_page, '&fulltime=True&proximity=1000', sep = "")
  URL_p <- read_html(p)
  # Collect this page's data-ids
  url <- URL_p %>%
    html_nodes("div") %>%
    html_nodes(xpath = 'a') %>%
    html_attr('data-id')
  url <- url[!is.na(url)]
  # Append them to the running list
  job_url <- append(job_url, url)
  # Update the running job count from this page's counter
  job <- URL_p %>% html_nodes(xpath = '//*[@class="page-counter"]') %>% html_text() %>% strsplit(" ")
  current_job <- as.numeric(job[[1]][27])
  # Move on to the next page
  n_page <- n_page + 1
}
end_time <- Sys.time()
paste("There are now", current_job, "jobs out of a total of", total_job, "jobs, and it took" ,round(end_time - start_time), "seconds to complete.")
## [1] "There are now 508 jobs out of a total of 508 jobs, and it took 40 seconds to complete."
# An empty frame with one column per field we are about to scrape
all_jobs <- data.frame(description = character(),
                       position = character(),
                       posted = as.Date(character()),
                       salary = character(),
                       location = character(),
                       contract = character(),
                       company = character(),
                       company_type = character(),
                       industry = character(),
                       url = character())
start_time <- Sys.time()
for (i in unique(job_url)) {
  p <- str_c('https://www.reed.co.uk/jobs/data-scientist/', i, sep = "")
  URL_p <- read_html(p)
  # Let's get the description
  Desc <- URL_p %>% html_nodes("[itemprop='description']") %>%
    html_text()
  Desc <- str_trim(Desc, side = "left")
  # Let's get the position (from the page title)
  Pos <- URL_p %>% html_node("title") %>%
    html_text()
  # Let's get the posted date
  Post <- URL_p %>% html_nodes("[itemprop='datePosted']") %>%
    html_attr('content')
  # Let's get the salary
  Sal <- URL_p %>% html_nodes("[data-qa='salaryLbl']") %>%
    html_text()
  Sal <- str_trim(Sal, side = "left")
  # Let's get the location
  Loc <- URL_p %>% html_nodes("[data-qa='regionLbl']") %>%
    html_text()
  # Let's get the contract
  Cont <- URL_p %>% html_nodes("[data-qa='jobTypeMobileLbl']") %>%
    html_text()
  # Let's get the company name
  Comp <- URL_p %>% html_nodes(css = "[itemprop='hiringOrganization']") %>%
    html_nodes(css = "[itemprop='name']") %>%
    html_text()
  Comp <- str_trim(Comp, side = "left")
  # The company type lives in embedded JavaScript, so we regex it out of the raw page text
  Compt <- URL_p %>% str_extract("(jobRecruiterType: )'(\\w+\\s\\w+\\s\\w+|\\w+\\s\\w+|\\w+|\\s)") %>%
    str_extract("(?<=\\')\\D+")
  # The industry is also in the JavaScript, so we extract it the same way
  Ind <- URL_p %>% str_extract("(jobKnowledgeDomain: )'(\\w+\\s\\w+\\s\\w+|\\w+\\s\\w+|\\w+|\\s)") %>%
    str_extract("(?<=\\')\\D+")
  url <- p
  # Prepend this posting's fields as a new row
  temp <- c(Desc, Pos, Post, Sal, Loc, Cont, Comp, Compt, Ind, url)
  all_jobs <- rbind(temp, all_jobs)
}
end_time <- Sys.time()
paste("Your dataframe has been built and it took",round(end_time - start_time), "minutes to complete.")
## [1] "Your dataframe has been built and it took 7 minutes to complete."
colnames(all_jobs) <- c("description", "position","posted","salary","location","contract","company","company_type","industry", "url")
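A structural note before cleaning: growing a data frame with rbind() on every iteration re-copies it each time. A sketch of an alternative using purrr, shown for three of the ten fields; scrape_job() is a hypothetical helper, not part of the original code, and it assumes each selector matches exactly once per page.
scrape_job <- function(id) {
  page_url <- str_c("https://www.reed.co.uk/jobs/data-scientist/", id)
  page <- read_html(page_url)
  # One row per posting; map_dfr() binds all rows in a single pass
  tibble(description = page %>% html_nodes("[itemprop='description']") %>% html_text() %>% str_trim("left"),
         posted = page %>% html_nodes("[itemprop='datePosted']") %>% html_attr("content"),
         url = page_url)
}
# all_jobs_alt <- purrr::map_dfr(unique(job_url), scrape_job)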
clean_jobs <- all_jobs
# Drop rows where a location landed in the salary column (these postings had
# no advertised salary, which shifted the scraped fields by one)
clean_jobs <- clean_jobs %>% filter(!grepl("Luton|Wimbledon|London|Enfield Town", salary))
# Tidy the position titles scraped from the page <title>
clean_jobs$position <- str_remove(clean_jobs$position, "- reed.co.uk")
clean_jobs$position <- str_trim(clean_jobs$position)
# Split the advertised range into its lower and upper bounds
clean_jobs <- clean_jobs %>% separate(salary, into = c("min.salary", "max.salary"), sep = "-")
# parse_number() strips the currency symbols and commas, so no regex is needed
clean_jobs <- clean_jobs %>% mutate(Min.Salary.Lbs = readr::parse_number(as.character(min.salary)))
clean_jobs <- clean_jobs %>% mutate(Max.Salary.Lbs = readr::parse_number(as.character(max.salary)))
# We could not find a regex that splits max.salary at its first space only, so we
# split on "0 " (a zero followed by a space), which works here because the advertised
# figures all end in a zero. (A first-space alternative is sketched after the next block.)
clean_jobs <- clean_jobs %>% separate(max.salary, into = c("X1", "X2"), sep = "0 ")
clean_jobs <- clean_jobs %>% select(-min.salary, -X1)
# X2 now begins with the salary period ("per annum", "per day", ...)
clean_jobs <- clean_jobs %>% separate(X2, into = c("X3", "X4", "X5"), sep = ", ")
# Put the parsed salary bounds next to the other posting details and drop the leftovers
tidy_jobs <- clean_jobs %>% relocate(Min.Salary.Lbs, .after = c(3)) %>%
  relocate(Max.Salary.Lbs, .after = c(4)) %>%
  rename('Salary.Period' = c(6)) %>%
  select(-'X4', -'X5')
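For reference, the first-space split the comments above mention is available directly in tidyr: separate() with extra = "merge" splits on the first separator only and keeps the remainder together. A minimal sketch on toy values (illustrative, not the scraped data):
# extra = "merge" splits at the first space and leaves the rest intact
tibble(max.salary = c("£50,000 per annum", "£400 per day")) %>%
  separate(max.salary, into = c("amount", "period"), sep = " ", extra = "merge")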
library(DT)
library(plotly)
library(ggplot2)
Many different job titles describe data science work. With so many names in use, which ones appear most often?
df <- tidy_jobs
positions <- df %>%
  count(position)
ggplotly(ggplot(positions %>% filter(n > 3), aes(x = reorder(position, -n), y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "position title",
       y = "number of postings",
       title = "Position titles posted more than three times"))
What is the average advertised annual salary, and how many companies offer the same average?
# subset of data: [position], [company], [avg_annual_salary], [company_type]
avg_annum <- df %>%
  filter(Salary.Period == "per annum" & !is.na(Salary.Period)) %>%
  # the midpoint of the advertised range, (min + max)/2
  mutate(avg_annual_salary = (Min.Salary.Lbs + Max.Salary.Lbs)/2) %>%
  select(position, avg_annual_salary, company, company_type) %>%
  arrange(desc(avg_annual_salary))
# number of companies offering the same average annual salary
count_company <- avg_annum %>%
  count(avg_annual_salary) %>%
  arrange(desc(n))
ggplotly(
  ggplot(count_company, aes(x = avg_annual_salary, y = n)) +
    geom_bar(stat = "identity") +
    labs(y = "number of companies",
         title = "Average annual salary across companies"))
Which companies advertise the widest and the narrowest annual salary spans?
# postings with the highest and lowest average annual salary
a <- avg_annum %>% filter(avg_annual_salary == max(avg_annual_salary, na.rm = TRUE) | avg_annual_salary == min(avg_annual_salary, na.rm = TRUE))
datatable(a)
# postings with the widest and narrowest salary spans, computed directly rather
# than from hard-coded values
b <- df %>%
  filter(Salary.Period == "per annum") %>%
  mutate(span = Max.Salary.Lbs - Min.Salary.Lbs) %>%
  filter(span == max(span, na.rm = TRUE) | span == min(span, na.rm = TRUE)) %>%
  select(-c(description, posted, location, contract, industry, Salary.Period))
datatable(b)
What are the major company types, and what share of the postings does each account for?
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
company_share <- df %>%
  count(company_type) %>%
  mutate(per = n/sum(n),
         label = str_c(percent(per), "(", n, ")")) %>%
  arrange(desc(n))
ggplot(data = company_share) +
  geom_bar(aes(x = "", y = per, fill = company_type), stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(x = 1.7, y = cumsum(per) - per/2, label = label)) +
  theme_void()
Among recruitment consultancies, which companies have the most influence? We take breadth of business as a proxy for influence: a consultancy with a wider scope has more client work and therefore posts more jobs. As the plot shows, the number of jobs posted by Harnham far exceeds that of any other company.
# consultancies with the widest business scope, by number of postings
d1 <- df %>%
  filter(company_type == "Recruitment consultancy") %>%
  count(company) %>%
  arrange(desc(n))
ggplot(d1 %>% filter(n > 2), aes(x = reorder(company, -n), y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(y = "number of job posts",
       title = "Recruitment consultancies with more than two job posts")
How are the job locations spread?
We selected the companies with at least four job posts in one location. To explore the relationship between companies and locations, the scatter plot below shows that most companies post jobs in London, while some, such as Harnham and Blue Pelican, list more than one location.
# job locations by company, keeping companies with at least 4 posts in one location
d2 <- df %>%
  group_by(company) %>%
  count(location) %>%
  filter(n > 3)
ggplot(d2, aes(x = location, y = company)) +
  geom_point(aes(color = location)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Below, we apply some NLP to distill the job descriptions down to a few key skills. After reading in and cleaning the column, we converted it to tokens, ran the tokens against our stop-words list, and found the most prevalent words in the job descriptions. Single words don’t provide much analytical insight on their own, so, after enhancing our stop-words list, we determined the most common word pairings, which proved far more informative. Data Science and Machine Learning were the overwhelming leaders among in-demand skills. Since Data Science names a group of skills rather than a single one, we can safely conclude that Machine Learning is the most sought-after skill in London Data Scientist postings. Other in-demand skills are still worth knowing about, so we filtered Data Science and Machine Learning out of the dataset to better visualize the remaining popular skills. “Computer Science,” “Data Analytics,” and the ever-important “Communication Skills” topped the list of sought-after characteristics in London-area Data Scientist job descriptions. The graphics below illustrate these findings.
Load the packages
library(RColorBrewer)
library(wordcloud2)
library(openintro)
library(wordcloud)
library(tidytext)
library(magrittr)
library(tinytex)
library(stringr)
library(ggpubr)
library(knitr)
library(fmsb)
library(DBI)
library(NLP)
library(tm)
# First, read in and clean up the job descriptions
jobs_df <- all_jobs$description
# A tibble is easier to manipulate than a bare character vector
jobs_tbl <- tibble(txt = jobs_df)
#jobs_tbl
# Next, tokenize the descriptions and count words to find the most prevalent ones.
# We run the result against a stop-words list to exclude words that add no value
# to the analysis, such as "the", "and", "that", etc.
token <- jobs_tbl %>%
  unnest_tokens(word, txt) %>%
  anti_join(stop_words)
## Joining, by = "word"
token_count <- token %>%
  count(word) %>%
  arrange(desc(n))
token_count
## # A tibble: 9,238 x 2
## word n
## <chr> <int>
## 1 data 3721
## 2 experience 1419
## 3 scientist 982
## 4 team 905
## 5 role 848
## 6 science 753
## 7 learning 712
## 8 company 673
## 9 machine 625
## 10 business 597
## # ... with 9,228 more rows
token_pairs <- jobs_tbl %>%
  unnest_tokens(pairs, txt, token = "ngrams", n = 2)
token_pairs %>%
  count(pairs) %>%
  arrange(desc(n))
## # A tibble: 53,116 x 2
## pairs n
## <chr> <int>
## 1 data scientist 801
## 2 you will 794
## 3 will be 686
## 4 data science 590
## 5 machine learning 584
## 6 of the 551
## 7 in the 437
## 8 in a 429
## 9 for a 348
## 10 as a 315
## # ... with 53,106 more rows
pairs_separated <- token_pairs %>%
  separate(pairs, c("word1", "word2"), sep = " ")
pairs_df <- pairs_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
pairs_count <- pairs_df %>%
  count(word1, word2, sort = TRUE)
head(pairs_count)
## # A tibble: 6 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 data scientist 801
## 2 data science 590
## 3 machine learning 584
## 4 data scientists 232
## 5 senior data 202
## 6 data engineer 123
# Custom stop words: job-ad boilerplate, recruiter first names, and role words
# that would otherwise dominate the pairing counts
new_stop <- data.frame(word = c("apply", "london", "remote", "remotely", "interview",
                                "salary", "contract", "candidate", "scientist", "scientists",
                                "team", "analyst", "engineer", "engineers", "manager",
                                "managers", "senior", "employment", "experienced", "consultant",
                                "junior", "month", "level", "masters", "rosie", "months",
                                "experience", "orientation", "opportunity", "principal",
                                "benefits", "nick", "days", "day", "role", "francesca",
                                "goldman", "luke", "anna", "date", "charlotte", "driven"),
                       lexicon = "custom")
my_stopwords <- rbind(new_stop, stop_words)
pairs_df <- pairs_separated %>%
  filter(!word1 %in% my_stopwords$word) %>%
  filter(!word2 %in% my_stopwords$word)
# Let's now reunite the columns into single terms for analysis
pairs_united <- pairs_df %>%
  unite(term, word1, word2, sep = " ")
df_terms <- pairs_united$term
terms_tbl <- tibble(txt = df_terms)
united_count <- pairs_united %>%
  count(term, sort = TRUE)
head(united_count)
## # A tibble: 6 x 2
## term n
## <chr> <int>
## 1 data science 590
## 2 machine learning 584
## 3 computer science 86
## 4 data analytics 83
## 5 communication skills 81
## 6 data engineering 64
# Terms appearing more than 30 times
Results <- dplyr::filter(united_count, n > 30)
colnames(Results) <- c("term", "frequency")
# Polar bar chart of the top terms
ggplot2::ggplot(Results, aes(x = reorder(term, -frequency), y = frequency, fill = term)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  ggtitle("Term Frequency (min: 30)") +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)
# The same counts as an interactive bar chart
plotly::ggplotly(
  ggplot2::ggplot(Results, aes(x = reorder(term, -frequency), y = frequency, fill = term)) +
    geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
    ggtitle("Term Frequency (min: 30)") +
    theme(legend.position = "none") +
    labs(x = NULL, y = NULL) +
    theme(plot.subtitle = element_text(vjust = 1),
          plot.caption = element_text(vjust = 1),
          axis.text.x = element_text(angle = 90)) +
    theme(panel.background = element_rect(fill = "honeydew1"),
          plot.background = element_rect(fill = "antiquewhite"))) %>%
  config(displaylogo = F) %>%
  config(showLink = F)
# Terms appearing more than 30 but fewer than 100 times, dropping the two runaway
# leaders ("data science" and "machine learning") so the rest are visible
Results2 <- dplyr::filter(united_count, n > 30, n < 100)
colnames(Results2) <- c("term", "frequency")
ggplot2::ggplot(Results2, aes(x = reorder(term, -frequency), y = frequency, fill = term)) +
  geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
  coord_polar(theta = "x") +
  ggtitle("Term Frequency (min: 30, max: 100)") +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)
plotly::ggplotly(
  ggplot2::ggplot(Results2, aes(x = reorder(term, -frequency), y = frequency, fill = term)) +
    geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
    ggtitle("Term Frequency (min: 30, max: 100)") +
    theme(legend.position = "none") +
    labs(x = NULL, y = NULL) +
    theme(plot.subtitle = element_text(vjust = 1),
          plot.caption = element_text(vjust = 1),
          axis.text.x = element_text(angle = 90)) +
    theme(panel.background = element_rect(fill = "honeydew1"),
          plot.background = element_rect(fill = "antiquewhite"))) %>%
  config(displaylogo = F) %>%
  config(showLink = F)
# Word cloud of terms appearing more than 10 and fewer than 600 times
Results3 <- dplyr::filter(united_count, n > 10, n < 600)
wordcloud2(Results3, color = "random-light", backgroundColor = "grey", size = .75)