Jill Anderson, Alvaro Bueno Castillo, Nathan Cooper, Silverio Vasquez, Sarah Wigodsky
October 20, 2017
This project addresses the question: “What are the most valued data science skills?”
To answer this question, the team took the following approach:
COLLECTING: mining the job board Glassdoor.com to identify which skills are requested most frequently in job descriptions. (Alvaro Bueno & Silverio Vasquez)
TIDYING: creating databases from the raw data. (Sarah Wigodsky)
ANALYZING: developing relevant data frames from the databases. (Nathan Cooper)
VISUALIZING: creating graphs and descriptions of the conclusions drawn from the analysis. (Jill Anderson & Nathan Cooper)
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(RCurl)))
suppressWarnings(suppressMessages(library(tidyr)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(rvest)))
suppressWarnings(suppressMessages(library(tm)))
suppressWarnings(suppressMessages(library(wordcloud)))
suppressWarnings(suppressMessages(library(SnowballC)))
# empty data frame to collect one row per scraped job posting
jobOffers <- data.frame(job_title=character(), company=character(), location=character(), description=character(), age=character())
# Glassdoor search URLs for Boston, New York, San Francisco, and Chicago; this run pages through the Chicago search (main_url)
base_url <- "https://www.glassdoor.com"
main_url_boston <- "/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword=data+sc&sc.keyword=data+scientist&locT=C&locId=1154532&jobType="
main_url_ny <- "/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=data+scientist&sc.keyword=data+scientist&locT=C&locId=1132348&jobType="
main_url_sf <- "/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=data+scientist&sc.keyword=data+scientist&locT=C&locId=1147401&jobType="
main_url <- "/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword=data+sci&sc.keyword=data+scientist&locT=C&locId=1128808&jobType="
# collect the job links on the first results page, plus the link to the next page
links <- read_html(paste(base_url, main_url, sep="")) %>% html_nodes(xpath='//a[contains(@class, "jobLink")]') %>% xml_attr("href")
next_link <- read_html(paste(base_url, main_url, sep="")) %>% html_nodes(xpath='//li[contains(@class, "next")]//a') %>% xml_attr("href")
count <- 0
# get links from 20 pages only
while(count < 20) {
  newLinks <- read_html(paste(base_url, next_link, sep="")) %>% html_nodes(xpath='//a[contains(@class, "jobLink")]') %>% xml_attr("href")
  links <- c(links, newLinks)
  next_link <- read_html(paste(base_url, next_link, sep="")) %>% html_nodes(xpath='//li[contains(@class, "next")]//a') %>% xml_attr("href")
  count <- count + 1
}
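# sanity check (our addition): 20 paginated requests should yield well over 400 links
length(links)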
# process only the first 400 links
for(the_link in links[1:400]){
  tryCatch({
    jobOffer <- list()
    the_html <- read_html(paste(base_url, the_link, sep=""))
    jobOffer$job_title <- the_html %>% html_nodes(xpath='//div[contains(@class, "header")]//h2') %>% html_text()
    jobOffer$company <- the_html %>% html_nodes(xpath='//div[contains(@class, "header")]//span[contains(@class, "ib padRtSm")]') %>% html_text()
    jobOffer$location <- the_html %>% html_nodes(xpath='//div[contains(@class, "header")]//span[contains(@class, "subtle ib")]') %>% html_text()
    jobOffer$description <- the_html %>% html_nodes(xpath='//div[contains(@class, "jobDescriptionContent")]') %>% html_text()
    jobOffer$age <- the_html %>% html_nodes(xpath='//div[contains(@class, "cell alignRt showDesk")]//span[contains(@class, "minor nowrap")]') %>% html_text()
    jobOffers <- rbind(jobOffers, data.frame(job_title=jobOffer$job_title, company=jobOffer$company, location=jobOffer$location, description=jobOffer$description, age=jobOffer$age))
    print("job added.")
  }, error=function(e){cat("ERROR :", conditionMessage(e), "\n")})  # skip postings that fail to parse
}
# write.csv(jobOffers, file = "jobOffers_CHI.csv")
# load the scraped postings for Boston, New York, San Francisco, and Chicago
joblistma <- read.csv("https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_final_with_dupes_reduced.csv", stringsAsFactors = FALSE)
joblistny <- read.csv("https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_ny.csv", stringsAsFactors=FALSE)
joblistsf <- read.csv("https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_SF.csv", stringsAsFactors=FALSE)
joblistsf <- joblistsf[,-1]   # drop the row-number column written by write.csv
joblistchi <- read.csv("https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_CHI.csv", stringsAsFactors=FALSE)
joblistchi <- joblistchi[,-1] # drop the row-number column written by write.csv
joblisttidy <- rbind(joblistma, joblistny, joblistsf, joblistchi)
Since some companies post the same job on different days, the posted-date column is eliminated prior to testing for duplicate entries.
joblisttidy <- joblisttidy[,-5]  # drop the posted-date column
joblisttidy <- subset(joblisttidy, duplicated(joblisttidy)==FALSE)  # keep only unique postings
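# sanity check (our addition): 498 unique postings remain, matching the summary counts below
nrow(joblisttidy)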
# normalize each location to a "City, ST" string
joblisttidy$location <- unlist(str_extract_all(joblisttidy$location, "[[:upper:]]{1}[[:lower:]]{2,}, [[:alpha:]]{2}|[[:upper:]]{1}[[:lower:]]{2,} [[:alpha:]]{2,}, [[:alpha:]]{2}"))
# strip non-ASCII characters from the descriptions
joblisttidy$description <- gsub("[^[:ascii:]]", "", joblisttidy$description, perl=T)
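As a quick illustration of the location clean-up (the input string below is made up), the pattern keeps only the "City, ST" portion:
loc_pattern <- "[[:upper:]]{1}[[:lower:]]{2,}, [[:alpha:]]{2}|[[:upper:]]{1}[[:lower:]]{2,} [[:alpha:]]{2,}, [[:alpha:]]{2}"
str_extract("Data Scientist - Boston, MA", loc_pattern)  # returns "Boston, MA"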
compskills <- joblisttidy %>%
mutate(python = grepl("python", description, ignore.case=TRUE)) %>%
mutate(perl = grepl("perl", description, ignore.case=TRUE)) %>%
mutate(Cplusplus = grepl("C++", description, fixed=TRUE)) %>%
mutate(SQL = grepl("SQL", description)) %>%
mutate(java = grepl("java\\b", description, ignore.case=TRUE)) %>%
mutate(javascript = grepl("javascript", description, ignore.case=TRUE)) %>%
mutate(R = grepl("\\bR\\b,", description)) %>%  # the trailing comma limits false matches on the bare letter "R"
mutate(hadoop = grepl("hadoop", description, ignore.case=TRUE)) %>%
mutate(spark = grepl("spark", description, ignore.case=TRUE)) %>%
mutate(scala = grepl("scala", description, ignore.case=TRUE)) %>%
select(job_title, company, python, perl, Cplusplus, SQL, java, javascript, R, hadoop, spark, scala)
summary(compskills)
## job_title company python perl
## Length:498 Length:498 Mode :logical Mode :logical
## Class :character Class :character FALSE:149 FALSE:475
## Mode :character Mode :character TRUE :349 TRUE :23
## Cplusplus SQL java javascript
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:431 FALSE:248 FALSE:369 FALSE:467
## TRUE :67 TRUE :250 TRUE :129 TRUE :31
## R hadoop spark scala
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:318 FALSE:347 FALSE:336 FALSE:385
## TRUE :180 TRUE :151 TRUE :162 TRUE :113
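The repeated mutate()/grepl() calls above could also be generated from a named vector of patterns; a minimal sketch (the patterns vector and flags matrix are our own illustration, not part of the analysis):
patterns <- c(python = "python", perl = "perl", hadoop = "hadoop")
flags <- sapply(patterns, grepl, x = joblisttidy$description, ignore.case = TRUE)  # 498 x 3 logical matrix
colSums(flags)  # python 349, perl 23, hadoop 151, matching the summary above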
skills <- joblisttidy %>%
mutate(machinelearning = grepl("machine learning", description, ignore.case=TRUE)) %>%
mutate(statisticalmodeling = grepl("statistical model", description, ignore.case=TRUE)) %>%
mutate(techwriting = grepl("technical writing", description, ignore.case=TRUE)) %>%
mutate(plateau = grepl("plateau", description, ignore.case=TRUE)) %>%  # zero matches in the data; the visualization tool "Tableau" may have been intended
mutate(d3 = grepl("D3", description)) %>%
select(job_title, company, machinelearning, statisticalmodeling, techwriting, plateau, d3)
summary(skills)
## job_title company machinelearning statisticalmodeling
## Length:498 Length:498 Mode :logical Mode :logical
## Class :character Class :character FALSE:193 FALSE:416
## Mode :character Mode :character TRUE :305 TRUE :82
## techwriting plateau d3
## Mode :logical Mode :logical Mode :logical
## FALSE:492 FALSE:498 FALSE:480
## TRUE :6 TRUE :18
softskills <- joblisttidy %>%
mutate(collaborative = grepl("collaborat", description, ignore.case=TRUE)) %>%
mutate(organized = grepl("organized", description, ignore.case=TRUE)) %>%
mutate(selfstarter = grepl("self starter", description, ignore.case=TRUE)) %>%  # does not match the hyphenated "self-starter"
mutate(attndetail = grepl("attention to detail", description, ignore.case=TRUE)) %>%
mutate(communication = grepl("communicat", description, ignore.case=TRUE)) %>%
mutate(creative = grepl("creativ", description, ignore.case=TRUE)) %>%
mutate(visualization = grepl("visualization", description, ignore.case=TRUE)) %>%
select(job_title, company, collaborative, organized, selfstarter, attndetail, communication, creative, visualization)
summary(softskills)
## job_title company collaborative organized
## Length:498 Length:498 Mode :logical Mode :logical
## Class :character Class :character FALSE:254 FALSE:475
## Mode :character Mode :character TRUE :244 TRUE :23
## selfstarter attndetail communication creative
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:496 FALSE:446 FALSE:165 FALSE:363
## TRUE :2 TRUE :52 TRUE :333 TRUE :135
## visualization
## Mode :logical
## FALSE:361
## TRUE :137
# total the TRUE flags for each skill and sort in descending order of frequency
cskills <- compskills %>% select(-(1:2)) %>% summarise_all(sum) %>% gather(variable,value) %>% arrange(desc(value))
skills2 <- skills %>% select(-(1:2)) %>% summarise_all(sum) %>% gather(variable,value) %>% arrange(desc(value))
sskills <- softskills %>% select(-(1:2)) %>% summarise_all(sum) %>% gather(variable,value) %>% arrange(desc(value))
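For reference, a peek at the top of the ordered computer-skill counts (values match the summaries above):
head(cskills, 3)  # python 349, SQL 250, R 180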
ggplot(cskills,aes(x=reorder(variable, value), y=value)) + geom_bar(stat='identity',fill="steelblue") + xlab('') + ylab('Count') + labs(title='Computer Skills') + coord_flip() + theme_minimal()
ggplot(skills2,aes(x=reorder(variable, value), y=value)) + geom_bar(stat='identity',fill="steelblue") + xlab('') + ylab('Count') + labs(title='Analytical Skills') + coord_flip() + theme_minimal()
ggplot(sskills,aes(x=reorder(variable, value), y=value)) + geom_bar(stat='identity',fill="steelblue") + xlab('') + ylab('Count') + labs(title='Soft Skills') + coord_flip() + theme_minimal()
The description column was used to create a word cloud. A few words, such as "data", "science", "scientist", "you", and "will", were removed along with standard English stop words (e.g., "the"). Text stemming was not useful in this application because it truncated words such as "experience" and "business".
dsCloud <- Corpus(VectorSource(joblisttidy$description))
dsCloud <- tm_map(dsCloud, removePunctuation)
dsCloud <- tm_map(dsCloud, content_transformer(tolower))  # wrap base functions so tm keeps the corpus structure
dsCloud <- tm_map(dsCloud, removeWords, c("data", "science", "scientist", "you", "will", stopwords('english')))
wordcloud(dsCloud, max.words = 50, random.order = FALSE, scale = c(3, .3), random.color = FALSE, colors = palette())
As one might expect, the most frequently referenced soft skills are communication and collaboration. The least frequently referenced of the seven soft skills studied here is being a self starter (though note that the pattern misses the hyphenated "self-starter").
Among computer skills, Python appears most often and is referenced nearly twice as often as R. SQL is the second most frequently appearing computer skill.
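That ratio can be read straight off the counts computed above:
py <- cskills$value[cskills$variable == "python"]  # 349
r  <- cskills$value[cskills$variable == "R"]       # 180
round(py / r, 2)  # 1.94, i.e., nearly twice as often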
Among analytical skills, machine learning is the most in demand.
Our recommendation for universities offering data science/analytics programs is to focus on the skills that employers seek most, such as Python, SQL, machine learning, communication, and teamwork. This will help graduates enter the workforce promptly.