Web Scraping
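The code in this section assumes the following packages are attached; the list is inferred from the functions used below:

library(rvest)         # read_html(), html_nodes(), html_attr() for scraping
library(stringr)       # str_replace_all(), str_trim(), str_extract_all()
library(dplyr)         # %>% pipes, arrange(), mutate()
library(ggplot2)       # bar charts of the skill frequencies
library(wordcloud)     # wordcloud()
library(RColorBrewer)  # brewer.pal() colour palettes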

Preliminary searches using the term “Data Scientist” were performed in the job-listing sections of http://dice.com and http://glassdoor.com, and the returned results were observed. The pages were then inspected to determine the structure of the HTML nodes containing the links to the individual job postings.
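As an illustration of that step, a minimal rvest sketch for pulling the job links off a results page; the query URL and CSS selector here are placeholders, not the exact values used, since the real selectors came from inspecting the live pages:

# Minimal sketch of the link-extraction step (URL and selector are illustrative assumptions)
searchUrl <- "https://www.dice.com/jobs?q=Data+Scientist"  # hypothetical query URL
results   <- read_html(searchUrl)
jobLinks  <- results %>%
  html_nodes("a.job-title-link") %>%  # assumed selector for the job-title anchors
  html_attr("href")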

Form a list of data science keywords so as to limit the number of unwanted words extracted from the pages.

tags <- c("Math", "Computer Science", "Information Systems", "Machine Learning","D3.Js","D3", "statistics", "econometrics", "applied aathematics", "Operations Research", "analytical modeling" , "statistical models", "mmachine learning","algorithms","data modeling", "SAS","[^[:alnum:]]r[^[:alnum:]]"," r ","Python", "Azure ML", "KNIME", "SQL","Agile", "software development","SAS", "Tableau", "Power BI", "power bi","Statistics","customer focused","analytical","problem-solving skills", "Highly motivated", "self-starter","innovative", "quick to learn", "Excellent communication", "communication", "interpersonal skills","DNN","CNN","RNN", "logistic rgression", "neural networks", "cloudformation", "statistics", "MATLAB", "mathematics" , "economics", "engineering", "java", "ruby", "javascript", "scala", "tableau", "hadoop", "HADOOP", "mapreduce",    "spark", "pig", "hive", "shark","oozie", "zookeeper", "flume",   "mahout",  "nosql","NOSQL","hbase","cassandra", "mongodb", "amazon s3", "intellectual curiosity", "business acumen", "communication", "data visualization", "data munging", "calculus", "linear algebra", "software engineering", "scientific method", "math", "product design","product development", "database administration", "project management", "data mining", "predictive modeling", "predictive analytics", "business intelligence", "optimization", "text mining","cloud management", "big data", " viz ", "bayesian statistics","bayesian analysis","n.l.p ", "nlp", "NLP", "natural language processing", "simulation", "simulations", "classification", "clustering",  "regression", "glm", "glms", "generalized linear models", "entrepreneurial", "entrepreneur", "least squares", " roc ", "data wrangling", "storyteller", "storytelling", "hacking","deep learning", "neural network", "neural networks", "sci-kit learn", "pandas", "numpy", "cicrosoft power bi", "knime", "octave", "rapidminer", "minitab", "stata", "h20", "curious", "xlstat", "keras", "random forest", "decision tree", "time series", "random tree", "probability", "dato", "ggplot", " C# ", " c# "," C++ ", " c++ ", "ggplot2","ggplt", "ggvis", "predictive analysis", "Java Script", "HBase")

tag_ex <- paste0(tags, collapse = '|')  # collapse the keywords into one alternation pattern
tag_ex <- tolower(tag_ex)
remove <- c("bein", "buil", "brin", "blis", "brig", "blic")  # stray fragments to drop after extraction
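A quick sanity check of the combined pattern on a made-up posting fragment:

sample_post <- tolower("Experience with Python, SQL and machine learning required")
str_extract_all(sample_post, tag_ex)
# expected to return "python", "sql" and "machine learning"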

————————- Glassdoor ———————————

# gldLinks <- vector()
# gldDst <- vector()
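The Glassdoor scraping itself was run offline and only the commented stubs above remain; a sketch of what the link-collection loop might have looked like, with the page count, URL pattern, and selector all assumptions:

# Hypothetical link-collection loop (page count, URL pattern, and selector are assumptions)
gldLinks <- vector()
for (p in 1:30) {
  url <- paste0("https://www.glassdoor.com/Job/data-scientist-jobs_P", p, ".htm")
  pg  <- tryCatch(read_html(url), error = function(e) NULL)
  if (is.null(pg)) next
  gldLinks <- c(gldLinks, pg %>% html_nodes("a.jobLink") %>% html_attr("href"))
}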

————————- DICE.COM —————————————-

Write the extracted and partially cleaned dice.com job data to a .csv file.

# write.csv(dStData, "diceStripped.csv", row.names=FALSE)
diceRaw <- read.csv("diceStripped.csv", header = TRUE, stringsAsFactors = FALSE) # Read the .csv file
diceData <- diceRaw %>%
  str_replace_all('\n', '') %>%
  str_replace_all('\t', '') %>%
  str_replace_all('\r', '') %>%
  str_trim(side = 'both') %>%
  tolower() %>%
  str_extract_all(tag_ex) %>%
  unlist()
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
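The warning appears because the stringr calls receive the whole data frame, which stringi coerces to one long character vector. Applying the chain to a single text column avoids it; a sketch, assuming the job text sits in a column named description (the column name is an assumption):

# Column-wise variant that avoids the coercion warning
diceData <- diceRaw$description %>%
  str_replace_all('[\n\t\r]', '') %>%
  str_trim(side = 'both') %>%
  tolower() %>%
  str_extract_all(tag_ex) %>%
  unlist()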

Final cleaning of dice.com data

diceClean <- diceData %>%
  .[!(. %in% remove)] %>%
  {gsub("c(", "", ., fixed = TRUE)} %>%
  {gsub(")", "", ., fixed = TRUE)} %>%
  {gsub('"', "", ., fixed = TRUE)} %>%
  str_replace_all("&", "") %>%
  str_replace_all("/", "") %>%
  str_replace_all(",", "") %>%
  str_trim(side = 'both') %>%
  {gsub("\\)|\\]|\\(|[0-9]", "", .)} %>%  # drop stray brackets and digits
  {gsub("^[^a-z]*", "", .)} %>%           # strip leading non-letters
  {gsub("[^[:alpha:]]+$", "", .)}         # strip trailing non-letters

diceClean <- diceClean[diceClean != ""]
head(diceClean)
## [1] "analytical"          "machine learning"    "algorithms"         
## [4] "machine learning"    "innovative"          "predictive modeling"
write.csv(diceClean, "diceClean.csv", row.names=FALSE)
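The c( and quote stripping above deals with two kinds of artifacts: deparsed-list fragments such as c(" that survive the coercion to character, and punctuation captured by patterns like [^[:alnum:]]r[^[:alnum:]]. A toy element shows the effect (the example strings are illustrative):

messy <- c('c("machine learning"', '"r",')
messy %>%
  {gsub("c(", "", ., fixed = TRUE)} %>%
  {gsub(")", "", ., fixed = TRUE)} %>%
  {gsub('"', "", ., fixed = TRUE)} %>%
  str_replace_all(",", "") %>%
  str_trim(side = 'both')
# yields "machine learning" and "r"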

Extract keywords from Glassdoor data

# glD <- vector()
# for (i in seq_along(glData)) {
#   u <- tryCatch(
#     glData[i] %>%
#       str_replace_all('\n', '') %>%
#       str_replace_all('\t', '') %>%
#       str_replace_all('\r', '') %>%
#       str_trim(side = 'both') %>%
#       tolower() %>%
#       str_extract_all(tag_ex),
#     error = function(e) list(result = NA, error = e)
#   )
#   glD <- c(glD, u)
# }
# glDClean1 <- glD %>% str_split(',') %>% unlist()
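An equivalent, more compact way to guard each page's extraction against errors is purrr's possibly(); this is an alternative formulation, not the code that produced the data:

library(purrr)
safe_extract <- possibly(function(txt) {
  txt %>%
    str_replace_all('[\n\t\r]', '') %>%
    str_trim(side = 'both') %>%
    tolower() %>%
    str_extract_all(tag_ex)
}, otherwise = NA)
glD <- map(glData, safe_extract)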

Further remove the stray elements from the resulting vector.

# glDClean1 <- glDClean1 %>%
#   .[!(. %in% remove)] %>%
#   {gsub("c(", "", ., fixed = TRUE)} %>%
#   {gsub(")", "", ., fixed = TRUE)} %>%
#   {gsub('"', "", ., fixed = TRUE)} %>%
#   str_replace_all("&", "") %>%
#   str_replace_all("/", "") %>%
#   str_replace_all(",", "") %>%
#   str_trim(side = 'both') %>%
#   {gsub("\\)|\\]|\\(|[0-9]", "", .)} %>%
#   {gsub("^[^a-z]*", "", .)} %>%
#   {gsub("[^[:alpha:]]+$", "", .)}

# glDClean1 <- glDClean1[glDClean1 != ""]
# head(glDClean1)

Write the cleaned Glassdoor keywords to .csv.

# write.csv(glDClean1, "glassD_cleaned1.csv", row.names=FALSE)
glassdoor <- read.csv("glassD_cleaned1.csv", header = TRUE, stringsAsFactors = FALSE) # Read the .csv file

For the glassdoor.com data, form a data frame with each keyword's frequency of occurrence, computed by calling the table function on the glassdoor vector.

d <- as.data.frame(table(glassdoor))
colnames(d) <- c("skill", "frequency")
d <- d %>% arrange(desc(frequency))
head(d)
##              skill frequency
## 1 machine learning      1291
## 2           python       757
## 3       statistics       690
## 4              sql       621
## 5                r       620
## 6      engineering       598

For the dice.com data, form a data frame with each keyword's frequency of occurrence, computed by calling the table function on the diceClean vector.

dd <- as.data.frame(table(diceClean))
colnames(dd) <- c("skill", "frequency")
dd <- dd %>% arrange(desc(frequency))
head(dd)
##              skill frequency
## 1              sql      1752
## 2 machine learning      1744
## 3           python      1706
## 4      engineering      1648
## 5         big data      1264
## 6                r      1044

Merge the two vectors diceClean and glassdoor and form a single data frame with the word frequencies computed by calling the table function on the resulting vector allVec.

glassdoor <- glassdoor$x  # write.csv stored the keyword vector in a column named "x"
allVec <- c(diceClean, glassdoor)
alldata <- as.data.frame(table(allVec))
colnames(alldata) <- c("skill", "frequency")
alldata <- alldata %>% arrange(desc(frequency))
head(alldata)
##              skill frequency
## 1 machine learning      3035
## 2           python      2463
## 3              sql      2373
## 4      engineering      2246
## 5                r      1664
## 6       analytical      1509

How the skills compare: glassdoor.com

top20 <- head(d, n = 20)

dplt <- ggplot(data = top20, aes(x = reorder(skill, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # constant fill belongs outside aes()
  xlab("Keywords in Data Science Jobs") + ylab("Frequency") +
  ggtitle("Most Valuable Data Science Skills on glassdoor.com") +
  theme(plot.title = element_text(lineheight = .8, face = "bold")) +
  theme(axis.text.x = element_text(angle = 90, vjust = .5, size = 9)) +
  coord_flip()
dplt

wordcloud(words = d$skill, freq = d$frequency, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

How the skills compare: dice.com

top20d <- head(dd, n = 20)

dplt <- ggplot(data = top20d, aes(x = reorder(skill, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # constant fill belongs outside aes()
  xlab("Keywords in Data Science Jobs") + ylab("Frequency") +
  ggtitle("Most Valuable Data Science Skills on dice.com") +
  theme(plot.title = element_text(lineheight = .8, face = "bold")) +
  theme(axis.text.x = element_text(angle = 90, vjust = .5, size = 9)) +
  coord_flip()
dplt

wordcloud(words = dd$skill, freq = dd$frequency, min.freq = 1,
          max.words=500, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

Next steps:

1. Do a weighted ranking of the skills from the two sites for a side-by-side comparison.

2. Upload the data to a cloud database and run the queries from there.
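For step 1, one plausible scheme is to normalise each site's counts to proportions before averaging, so the larger dice.com corpus does not dominate; a sketch (the equal 50/50 weighting is an assumption):

ranked <- merge(d, dd, by = "skill", suffixes = c(".gld", ".dice")) %>%
  mutate(share.gld  = frequency.gld  / sum(frequency.gld),   # each site's counts as proportions
         share.dice = frequency.dice / sum(frequency.dice),
         weighted   = 0.5 * share.gld + 0.5 * share.dice) %>%
  arrange(desc(weighted))
head(ranked)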