Preliminary searches using the search term “Data Scientist” were performed on the job listing sections of http://dice.com and http://glassdoor.com, and the returned results were observed. Appropriate investigations were then carried out to determine the structure of the HTML nodes containing the links to the individual job pages.
# Keyword patterns for the skills to look for in job postings. Each entry is
# used as a regular expression, so regex metacharacters that are meant
# literally (the "+" in "c++") are escaped.
tags <- c(
  "Math", "Computer Science", "Information Systems", "Machine Learning",
  "D3.Js", "D3", "statistics", "econometrics", "applied mathematics",
  "Operations Research", "analytical modeling", "statistical models",
  "machine learning", "algorithms", "data modeling", "SAS",
  "[^[:alnum:]]r[^[:alnum:]]", " r ", "Python", "Azure ML", "KNIME", "SQL",
  "Agile", "software development", "SAS", "Tableau", "Power BI", "power bi",
  "Statistics", "customer focused", "analytical", "problem-solving skills",
  "Highly motivated", "self-starter", "innovative", "quick to learn",
  "Excellent communication", "communication", "interpersonal skills",
  "DNN", "CNN", "RNN", "logistic regression", "neural networks",
  "cloudformation", "statistics", "MATLAB", "mathematics", "economics",
  "engineering", "java", "ruby", "javascript", "scala", "tableau", "hadoop",
  "HADOOP", "mapreduce", "spark", "pig", "hive", "shark", "oozie",
  "zookeeper", "flume", "mahout", "nosql", "NOSQL", "hbase", "cassandra",
  "mongodb", "amazon s3", "intellectual curiosity", "business acumen",
  "communication", "data visualization", "data munging", "calculus",
  "linear algebra", "software engineering", "scientific method", "math",
  "product design", "product development", "database administration",
  "project management", "data mining", "predictive modeling",
  "predictive analytics", "business intelligence", "optimization",
  "text mining", "cloud management", "big data", " viz ",
  "bayesian statistics", "bayesian analysis", "n.l.p ", "nlp", "NLP",
  "natural language processing", "simulation", "simulations",
  "classification", "clustering", "regression", "glm", "glms",
  "generalized linear models", "entrepreneurial", "entrepreneur",
  "least squares", " roc ", "data wrangling", "storyteller", "storytelling",
  "hacking", "deep learning", "neural network", "neural networks",
  "sci-kit learn", "pandas", "numpy", "microsoft power bi", "knime",
  "octave", "rapidminer", "minitab", "stata", "h20", "curious", "xlstat",
  "keras", "random forest", "decision tree", "time series", "random tree",
  "probability", "dato", "ggplot", " C# ", " c# ", " C\\+\\+ ", " c\\+\\+ ",
  "ggplot2", "ggplt", "ggvis", "predictive analysis", "Java Script", "HBase"
)

# Build one lower-case alternation pattern from the tags.
# * Lower-casing first and de-duplicating drops case-only repeats
#   ("SAS"/"sas", "NLP"/"nlp", ...).
# * Alternation is tried left to right, so longer patterns are placed first;
#   otherwise a short tag shadows a longer one that starts the same way
#   (e.g. "java" would always win over "javascript").
tag_patterns <- unique(tolower(tags))
tag_patterns <- tag_patterns[order(-nchar(tag_patterns))]
tag_ex <- paste0(tag_patterns, collapse = "|")

# Extracted strings that are discarded before the keywords are tabulated.
remove <- c("bein", "buil", "brin", "blis", "brig", "blic")
#gldLinks <-vector()
#gldDst <-vector()
#gldUrl <- 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14_IP'
#for(i in 1: 30)
#{
# gldUri <- paste(gldUrl, i, '.htm', sep = "")
# gLinks <- html_attr(html_nodes(read_html(curl(gldUri, handle = new_handle("useragent" = "Mozilla/5.0"))), 'div.jobTitle a.jobLink'), "href")
# gldLinks <- c(gldLinks, gLinks)
#}
#glData <- vector()
#dScientist <- vector()
#dScience <- vector()
#dstUrl <- 'https://www.dice.com/jobs/q-Data+Scientist-jobs?p='
#for(i in 1: 50)
#{
# diceUri <- paste(dstUrl, i, sep = "")
# dLinks <- html_attr(html_nodes(read_html(curl(diceUri, handle = new_handle("useragent" = "Mozilla/5.0"))), 'ul.list-inline:not(ul.details) a'), "href")
# dScientist <- c(dScientist, dLinks)
#}
#dStData <- vector()
#dscData <- vector()
# write.csv(dStData, "diceStripped.csv", row.names=FALSE)
# Load the text previously saved to diceStripped.csv.
diceRaw <- read.csv("diceStripped.csv", header = TRUE, stringsAsFactors = FALSE) # Read the .csv file

# Work on the text column instead of the whole data frame. Piping the data
# frame itself into str_replace_all() produced the warning
# "argument is not an atomic vector; coercing", because the data frame had to
# be coerced to character before any matching could happen.
# NOTE(review): assumes the text lives in the first (only) column - confirm.
dice_text <- diceRaw[[1]]

# Strip line breaks and tabs, normalise case, then pull out every keyword
# that matches the tag pattern.
diceData <- dice_text %>%
  str_replace_all("[\n\t\r]", "") %>%
  str_trim(side = "both") %>%
  tolower() %>%
  str_extract_all(tag_ex) %>%
  unlist()

# Drop unwanted matches, then strip leftover punctuation, digits and
# surrounding non-letters so that variants of a keyword collapse to one label.
diceClean <- diceData %>%
  .[!(. %in% remove)] %>%
  {gsub("c(", "", ., fixed = TRUE)} %>%
  {gsub(")", "", ., fixed = TRUE)} %>%
  {gsub('"', "", ., fixed = TRUE)} %>%
  str_replace_all("&", "") %>%
  str_replace_all("/", "") %>%
  str_replace_all(",", "") %>%
  str_trim(side = "both") %>%
  {gsub("\\)|\\]|\\(|[0-9]", "", .)} %>%
  {gsub("^[^a-z]*", "", .)} %>%
  {gsub("[^[:alpha:]]+$", "", .)}

# Discard entries that ended up empty (or missing) after cleaning.
diceClean <- diceClean[!is.na(diceClean) & diceClean != ""]
head(diceClean)
## [1] "analytical" "machine learning" "algorithms"
## [4] "machine learning" "innovative" "predictive modeling"
write.csv(diceClean, "diceClean.csv", row.names = FALSE)
# glD <- vector()
#for (i in 1:length(glData))
#{
# u <- tryCatch(glData[i] %>% str_replace_all('\n', '')%>%str_replace_all('\t', '')%>%str_replace_all('\r', '')%>%str_trim(side='both')%>%tolower()%>% str_extract_all(tag_ex),
# error = function(e){list(result = NA, error = e)})
# glD <- c(glD, u)
#}
#glDClean1 <- glD%>%str_split(',')%>%unlist()
# glDClean1 <- glDClean1 %>% .[!(. %in% remove)] %>% {gsub("c(", "", ., fixed="TRUE")} %>% {gsub(")", "", ., fixed="TRUE") } %>% { gsub('"', "", ., fixed="TRUE") }%>%str_replace_all("&", "")%>%str_replace_all("/", "")%>%str_replace_all(",", "")%>%str_trim(side='both')%>%{gsub("\\)|\\]|\\(|[0-9]", "", .)}%>%{gsub("^[^a-z]*", "", .)}%>%{gsub("[^[:alpha:]]+$", "", .)}
#glDClean1 <- glDClean1[glDClean1 != ""]
#head(glDClean1)
# write.csv(glDClean1, "glassD_cleaned1.csv", row.names=FALSE)
glassdoor <- read.csv("glassD_cleaned1.csv", header = TRUE, stringsAsFactors = FALSE) # Read the .csv file
# Count how often each keyword occurs by calling the table function on the
# glassdoor data, then sort the keywords from most to least frequent.
d <- as.data.frame(table(glassdoor))
colnames(d) <- c("skill", "frequency")
d <- d %>% arrange(desc(frequency))
head(d)
## skill frequency
## 1 machine learning 1291
## 2 python 757
## 3 statistics 690
## 4 sql 621
## 5 r 620
## 6 engineering 598
# Count how often each keyword occurs by calling the table function on the
# diceClean vector, then sort the keywords from most to least frequent.
dd <- as.data.frame(table(diceClean))
colnames(dd) <- c("skill", "frequency")
dd <- dd %>% arrange(desc(frequency))
head(dd)
## skill frequency
## 1 sql 1752
## 2 machine learning 1744
## 3 python 1706
## 4 engineering 1648
## 5 big data 1264
## 6 r 1044
# Combine diceClean and glassdoor into a single data frame, with each word's
# frequency of occurrence computed by calling the table function on the
# resulting vector allVec.
glassdoor <- glassdoor$x  # keep only the keyword column as a plain vector
allVec <- c(diceClean, glassdoor)
alldata <- as.data.frame(table(allVec))
colnames(alldata) <- c("skill", "frequency")
alldata <- alldata %>% arrange(desc(frequency))
head(alldata)
## skill frequency
## 1 machine learning 3035
## 2 python 2463
## 3 sql 2373
## 4 engineering 2246
## 5 r 1664
## 6 analytical 1509
# Bar chart of the 20 most frequent glassdoor keywords, followed by a word
# cloud of all glassdoor keywords.
top20 <- head(d, n = 20)
# The bar colour is a fixed setting, so it belongs in geom_bar() rather than
# aes(): inside aes() the string "steelblue" is treated as a one-level
# variable, which draws the bars in the default palette colour and adds a
# legend.
dplt <- ggplot(data = top20, aes(x = reorder(skill, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  xlab("Keywords in Data science Jobs") + ylab("Frequency") +
  ggtitle("Most valuable Data Science Skills on glassdoor.com") +
  theme(plot.title = element_text(lineheight = .8, face = "bold")) +
  theme(axis.text.x = element_text(angle = 90, vjust = .5, size = 9)) +
  coord_flip()
dplt + theme(legend.position = "none")
wordcloud(words = d$skill, freq = d$frequency, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Bar chart of the 20 most frequent dice.com keywords, followed by a word
# cloud of all dice.com keywords.
top20d <- head(dd, n = 20)
# The bar colour is a fixed setting, so it belongs in geom_bar() rather than
# aes(): inside aes() the string "steelblue" is treated as a one-level
# variable, which draws the bars in the default palette colour and adds a
# legend.
dplt <- ggplot(data = top20d, aes(x = reorder(skill, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  xlab("Keywords in Data science Jobs") + ylab("Frequency") +
  ggtitle("Most valuable Data Science Skills on dice.com") +
  theme(plot.title = element_text(lineheight = .8, face = "bold")) +
  theme(axis.text.x = element_text(angle = 90, vjust = .5, size = 9)) +
  coord_flip()
dplt + theme(legend.position = "none")
wordcloud(words = dd$skill, freq = dd$frequency, min.freq = 1,
          max.words = 500, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))