Web Scraping
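The code in this section assumes the following packages are attached; the list is inferred from the functions used below:

library(rvest)         # read_html(), html_nodes(), html_attr() for scraping
library(stringr)       # str_replace_all(), str_trim(), str_extract_all()
library(dplyr)         # %>% pipes, arrange(), mutate()
library(ggplot2)       # bar charts of the skill frequencies
library(wordcloud)     # wordcloud()
library(RColorBrewer)  # brewer.pal() colour palettes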

Preliminary searches using the term “Data Scientist” were performed in the job-listing sections of http://dice.com and http://glassdoor.com, and the returned results were observed. The pages were then inspected to determine the structure of the HTML nodes containing the links to the individual job postings.
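As an illustration of that step, a minimal rvest sketch for pulling the job links off a results page; the query URL and CSS selector here are placeholders, not the exact values used, since the real selectors came from inspecting the live pages:

# Minimal sketch of the link-extraction step (URL and selector are illustrative assumptions)
searchUrl <- "https://www.dice.com/jobs?q=Data+Scientist"  # hypothetical query URL
results   <- read_html(searchUrl)
jobLinks  <- results %>%
  html_nodes("a.job-title-link") %>%  # assumed selector for the job-title anchors
  html_attr("href")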

Form a list of data science keywords so as to limit the number of unwanted words extracted from the pages.

tags <- c("Math", "Computer Science", "Information Systems", "Machine Learning","D3.Js","D3", "statistics", "econometrics", "applied aathematics", "Operations Research", "analytical modeling" , "statistical models", "mmachine learning","algorithms","data modeling", "SAS","[^[:alnum:]]r[^[:alnum:]]"," r ","Python", "Azure ML", "KNIME", "SQL","Agile", "software development","SAS", "Tableau", "Power BI", "power bi","Statistics","customer focused","analytical","problem-solving skills", "Highly motivated", "self-starter","innovative", "quick to learn", "Excellent communication", "communication", "interpersonal skills","DNN","CNN","RNN", "logistic rgression", "neural networks", "cloudformation", "statistics", "MATLAB", "mathematics" , "economics", "engineering", "java", "ruby", "javascript", "scala", "tableau", "hadoop", "HADOOP", "mapreduce",    "spark", "pig", "hive", "shark","oozie", "zookeeper", "flume",   "mahout",  "nosql","NOSQL","hbase","cassandra", "mongodb", "amazon s3", "intellectual curiosity", "business acumen", "communication", "data visualization", "data munging", "calculus", "linear algebra", "software engineering", "scientific method", "math", "product design","product development", "database administration", "project management", "data mining", "predictive modeling", "predictive analytics", "business intelligence", "optimization", "text mining","cloud management", "big data", " viz ", "bayesian statistics","bayesian analysis","n.l.p ", "nlp", "NLP", "natural language processing", "simulation", "simulations", "classification", "clustering",  "regression", "glm", "glms", "generalized linear models", "entrepreneurial", "entrepreneur", "least squares", " roc ", "data wrangling", "storyteller", "storytelling", "hacking","deep learning", "neural network", "neural networks", "sci-kit learn", "pandas", "numpy", "cicrosoft power bi", "knime", "octave", "rapidminer", "minitab", "stata", "h20", "curious", "xlstat", "keras", "random forest", "decision tree", "time series", "random tree", "probability", "dato", "ggplot", " C# ", " c# "," C++ ", " c++ ", "ggplot2","ggplt", "ggvis", "predictive analysis", "Java Script", "HBase")

tag_ex <- paste0(tags, collapse = '|')  # collapse the keywords into one alternation pattern
tag_ex <- tolower(tag_ex)
remove <- c("bein", "buil", "brin", "blis", "brig", "blic")  # stray fragments to drop after extraction
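A quick sanity check of the combined pattern on a made-up posting fragment:

sample_post <- tolower("Experience with Python, SQL and machine learning required")
str_extract_all(sample_post, tag_ex)
# expected to return "python", "sql" and "machine learning"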

————————- Glassdoor ———————————

# gldLinks <- vector()
# gldDst <- vector()
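The Glassdoor scraping itself was run offline and only the commented stubs above remain; a sketch of what the link-collection loop might have looked like, with the page count, URL pattern, and selector all assumptions:

# Hypothetical link-collection loop (page count, URL pattern, and selector are assumptions)
gldLinks <- vector()
for (p in 1:30) {
  url <- paste0("https://www.glassdoor.com/Job/data-scientist-jobs_P", p, ".htm")
  pg  <- tryCatch(read_html(url), error = function(e) NULL)
  if (is.null(pg)) next
  gldLinks <- c(gldLinks, pg %>% html_nodes("a.jobLink") %>% html_attr("href"))
}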

————————- DICE.COM —————————————-

Write the extracted and partially cleaned dice.com job data to a .csv file.

# write.csv(dStData, "diceStripped.csv", row.names=FALSE)
diceRaw <- read.csv("diceStripped.csv", header = TRUE, stringsAsFactors = FALSE) # Read the .csv file
diceData <- diceRaw %>%
  str_replace_all('\n', '') %>%
  str_replace_all('\t', '') %>%
  str_replace_all('\r', '') %>%
  str_trim(side = 'both') %>%
  tolower() %>%
  str_extract_all(tag_ex) %>%
  unlist()
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
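The warning appears because the stringr calls receive the whole data frame, which stringi coerces to one long character vector. Applying the chain to a single text column avoids it; a sketch, assuming the job text sits in a column named description (the column name is an assumption):

# Column-wise variant that avoids the coercion warning
diceData <- diceRaw$description %>%
  str_replace_all('[\n\t\r]', '') %>%
  str_trim(side = 'both') %>%
  tolower() %>%
  str_extract_all(tag_ex) %>%
  unlist()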

Final cleaning of dice.com data

diceClean <- diceData %>%
  .[!(. %in% remove)] %>%
  {gsub("c(", "", ., fixed = TRUE)} %>%
  {gsub(")", "", ., fixed = TRUE)} %>%
  {gsub('"', "", ., fixed = TRUE)} %>%
  str_replace_all("&", "") %>%
  str_replace_all("/", "") %>%
  str_replace_all(",", "") %>%
  str_trim(side = 'both') %>%
  {gsub("\\)|\\]|\\(|[0-9]", "", .)} %>%  # drop stray brackets and digits
  {gsub("^[^a-z]*", "", .)} %>%           # strip leading non-letters
  {gsub("[^[:alpha:]]+$", "", .)}         # strip trailing non-letters

diceClean <- diceClean[diceClean != ""]
head(diceClean)
## [1] "analytical"          "machine learning"    "algorithms"         
## [4] "machine learning"    "innovative"          "predictive modeling"
write.csv(diceClean, "diceClean.csv", row.names=FALSE)
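The c( and quote stripping above deals with two kinds of artifacts: deparsed-list fragments such as c(" that survive the coercion to character, and punctuation captured by patterns like [^[:alnum:]]r[^[:alnum:]]. A toy element shows the effect (the example strings are illustrative):

messy <- c('c("machine learning"', '"r",')
messy %>%
  {gsub("c(", "", ., fixed = TRUE)} %>%
  {gsub(")", "", ., fixed = TRUE)} %>%
  {gsub('"', "", ., fixed = TRUE)} %>%
  str_replace_all(",", "") %>%
  str_trim(side = 'both')
# yields "machine learning" and "r"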

Extract keywords from Glassdoor data

# glD <- vector()
# for (i in seq_along(glData)) {
#   u <- tryCatch(
#     glData[i] %>%
#       str_replace_all('\n', '') %>%
#       str_replace_all('\t', '') %>%
#       str_replace_all('\r', '') %>%
#       str_trim(side = 'both') %>%
#       tolower() %>%
#       str_extract_all(tag_ex),
#     error = function(e) list(result = NA, error = e)
#   )
#   glD <- c(glD, u)
# }
# glDClean1 <- glD %>% str_split(',') %>% unlist()
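An equivalent, more compact way to guard each page's extraction against errors is purrr's possibly(); this is an alternative formulation, not the code that produced the data:

library(purrr)
safe_extract <- possibly(function(txt) {
  txt %>%
    str_replace_all('[\n\t\r]', '') %>%
    str_trim(side = 'both') %>%
    tolower() %>%
    str_extract_all(tag_ex)
}, otherwise = NA)
glD <- map(glData, safe_extract)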

Further remove the stray elements from the resulting vector.

# glDClean1 <- glDClean1 %>%
#   .[!(. %in% remove)] %>%
#   {gsub("c(", "", ., fixed = TRUE)} %>%
#   {gsub(")", "", ., fixed = TRUE)} %>%
#   {gsub('"', "", ., fixed = TRUE)} %>%
#   str_replace_all("&", "") %>%
#   str_replace_all("/", "") %>%
#   str_replace_all(",", "") %>%
#   str_trim(side = 'both') %>%
#   {gsub("\\)|\\]|\\(|[0-9]", "", .)} %>%
#   {gsub("^[^a-z]*", "", .)} %>%
#   {gsub("[^[:alpha:]]+$", "", .)}

# glDClean1 <- glDClean1[glDClean1 != ""]
# head(glDClean1)

Write the cleaned Glassdoor keywords to .csv.

# write.csv(glDClean1, "glassD_cleaned1.csv", row.names=FALSE)
glassdoor <- read.csv("glassD_cleaned1.csv", header = TRUE, stringsAsFactors = FALSE) # Read the .csv file

For the glassdoor.com data, form a data frame with each keyword's frequency of occurrence, computed by calling the table function on the glassdoor vector.

d <- as.data.frame(table(glassdoor))
colnames(d) <- c("skill", "frequency")
d <- d %>% arrange(desc(frequency))
head(d)
##              skill frequency
## 1 machine learning      1291
## 2           python       757
## 3       statistics       690
## 4              sql       621
## 5                r       620
## 6      engineering       598

For the dice.com data, form a data frame with each keyword's frequency of occurrence, computed by calling the table function on the diceClean vector.

dd <- as.data.frame(table(diceClean))
colnames(dd) <- c("skill", "frequency")
dd <- dd %>% arrange(desc(frequency))
head(dd)
##              skill frequency
## 1              sql      1752
## 2 machine learning      1744
## 3           python      1706
## 4      engineering      1648
## 5         big data      1264
## 6                r      1044

Merge the two vectors diceClean and glassdoor and form a single data frame with the word frequencies computed by calling the table function on the resulting vector allVec.

glassdoor <- glassdoor$x  # write.csv stored the keyword vector in a column named "x"
allVec <- c(diceClean, glassdoor)
alldata <- as.data.frame(table(allVec))
colnames(alldata) <- c("skill", "frequency")
alldata <- alldata %>% arrange(desc(frequency))
head(alldata)
##              skill frequency
## 1 machine learning      3035
## 2           python      2463
## 3              sql      2373
## 4      engineering      2246
## 5                r      1664
## 6       analytical      1509

How the skills compare: glassdoor.com

top20 <- head(d, n = 20)

dplt <- ggplot(data = top20, aes(x = reorder(skill, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # constant fill belongs outside aes()
  xlab("Keywords in Data Science Jobs") + ylab("Frequency") +
  ggtitle("Most Valuable Data Science Skills on glassdoor.com") +
  theme(plot.title = element_text(lineheight = .8, face = "bold")) +
  theme(axis.text.x = element_text(angle = 90, vjust = .5, size = 9)) +
  coord_flip()
dplt

wordcloud(words = d$skill, freq = d$frequency, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

How the skills compare: dice.com

top20d <- head(dd, n = 20)

dplt <- ggplot(data = top20d, aes(x = reorder(skill, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # constant fill belongs outside aes()
  xlab("Keywords in Data Science Jobs") + ylab("Frequency") +
  ggtitle("Most Valuable Data Science Skills on dice.com") +
  theme(plot.title = element_text(lineheight = .8, face = "bold")) +
  theme(axis.text.x = element_text(angle = 90, vjust = .5, size = 9)) +
  coord_flip()
dplt

wordcloud(words = dd$skill, freq = dd$frequency, min.freq = 1,
          max.words=500, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

Next steps:

1. Do a weighted ranking of the skills from the two sites for a side-by-side comparison.

2. Upload the data to a cloud database and run the queries from there.
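For step 1, one plausible scheme is to normalise each site's counts to proportions before averaging, so the larger dice.com corpus does not dominate; a sketch (the equal 50/50 weighting is an assumption):

ranked <- merge(d, dd, by = "skill", suffixes = c(".gld", ".dice")) %>%
  mutate(share.gld  = frequency.gld  / sum(frequency.gld),   # each site's counts as proportions
         share.dice = frequency.dice / sum(frequency.dice),
         weighted   = 0.5 * share.gld + 0.5 * share.dice) %>%
  arrange(desc(weighted))
head(ranked)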