After the data has been scraped from the job websites and saved as .csv files, the next step is to read the files in, merge them into one data set, and prepare the text for analysis.
# Load packages
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(rvest)
## Loading required package: rvest
## Loading required package: xml2
require(stringr)
## Loading required package: stringr
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(SnowballC)
## Loading required package: SnowballC
require(tidytext)
## Loading required package: tidytext
require(textdata)
## Loading required package: textdata
require(tidyverse)
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v readr 1.3.1
## v tibble 2.1.3 v purrr 0.3.2
## v tidyr 1.0.0 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
require(ggplot2)
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
require(widyr)
## Loading required package: widyr
require(igraph)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
require(ggraph)
## Loading required package: ggraph
Because each scrape used different column names, we rename the columns so that all three data frames match before we merge them.
## Read in the SimplyHired .csv
ltcancel<-read.csv("https://raw.githubusercontent.com/ltcancel/Project3/master/SimplyHiredJobs.csv", stringsAsFactors = FALSE)
colnames(ltcancel)<-c("Position", "Company","Location","Salary","URL","Job_Description")
str(ltcancel)
## 'data.frame': 209 obs. of 6 variables:
## $ Position : chr "Data Scientist, Marketplace – all levels" "Data Engineer" "Senior Data Scientist" "Ecological Wetland Scientist" ...
## $ Company : chr "Spotify" "Noom Inc." "HVH Precision Analytics" "PS&S" ...
## $ Location : chr "New York, NY" "New York, NY" "New York, NY" "Mineola, NY" ...
## $ Salary : chr "Estimated: $110,000 - $160,000 a year" "Estimated: $73,000 - $96,000 a year" "Estimated: $110,000 - $140,000 a yearSimply Apply" "Estimated: $54,000 - $66,000 a year" ...
## $ URL : chr "https://www.simplyhired.com/job/-q6yR-atece9p8LQvm2yP8xIX3VcYfRC9wsdPgSS0nWHIG3f2EZOxA?q=data+scientist" "https://www.simplyhired.com/job/pg680lk5W0WVpIIE7QQXRgEim6bP-NKuilb64EfQF80SDIp_X1ufSA?q=data+scientist" "https://www.simplyhired.com/job/YPL5f6DfJcFxTqZiKpIW3UWlZ0bqR5UKLLVHPAMbe3OnTxpIEdlCpg?q=data+scientist" "https://www.simplyhired.com/job/uip2DCa3k_ke2JyWB08F8Gm7MRFCNdXkFqw4nCdD5nq6tJiCPp7oZA?q=data+scientist" ...
## $ Job_Description: chr "Marketplace is the home for Spotify’s music industry products, such as Spotify for Artists, Spotify Label Analy"| __truncated__ "At Noom, we use scientifically proven methods to help our users create healthier lifestyles, and manage importa"| __truncated__ "Job Description: Data science professional to design, implement and deployadvanced machine learning / artificia"| __truncated__ "Overview\nPS&S is an award-winning “one-stop shop” of architecture and engineering excellence. The depth and br"| __truncated__ ...
selshahawy<-read.csv("https://raw.githubusercontent.com/salma71/MSDS_2019/master/Fall2019/aquisition_management_607/project_3/jobs_detailsInfo.csv", stringsAsFactors = FALSE)
colnames(selshahawy)<-c("Position", "Company","Location","URL","Job_Description")
str(selshahawy)
## 'data.frame': 60 obs. of 5 variables:
## $ Position : chr "Data EngineerMongoDB" "Machine Learning EngineerMedium" "Senior Data EngineerManulife" "Research ScientistAllen Institute for Artificial Intelligence (AI2)" ...
## $ Company : chr "MongoDB" "Medium" "Manulife" "Allen Institute for Artificial Intelligence (AI2)" ...
## $ Location : chr "New York City" "San Francisco" "Toronto, ON CA" "Seattle, WA" ...
## $ URL : chr "https://ai-jobs.net/job/data-engineer-40/" "https://ai-jobs.net/job/machine-learning-engineer-64/" "https://ai-jobs.net/job/senior-data-engineer-8/" "https://ai-jobs.net/job/research-scientist-6/" ...
## $ Job_Description: chr "MongoDB is growing rapidly and seeking a Data Engineer to be a key contributor to the overall internal data pla"| __truncated__ "At Medium, words matter. We are building the best place for reading and writing on the internetâ\200”a place wh"| __truncated__ "Are you looking for unlimited opportunities to develop and succeed? With work that challenges and makes a diffe"| __truncated__ "The Allen Institute for Artificial Intelligence (AI2) is a non-profit research institute in Seattle founded by "| __truncated__ ...
ssufian<-read.csv("https://raw.githubusercontent.com/Luz917/data607project3_ssufian_monster_jobs/master/monsterjobs.csv", stringsAsFactors = FALSE)
colnames(ssufian)<-c("Position", "Company","Location","Salary","URL","Job_Description")
str(ssufian)
## 'data.frame': 26 obs. of 6 variables:
## $ Position : chr "Principal Data Scientist (Facilities Analytics)" "Data Scientist" "Data Scientist" "Lead Data Scientist" ...
## $ Company : chr "Northrop Grumman" "Eaton Corporation" "LRS" "CenturyLink" ...
## $ Location : chr "Redondo Beach, CA" "Eden Prairie, MN" "Maryland Heights, MO" "BROOMFIELD, CO" ...
## $ Salary : logi NA NA NA NA NA NA ...
## $ URL : chr "https://job-openings.monster.com/principal-data-scientist-facilities-analytics-redondo-beach-ca-us-northrop-gru"| __truncated__ "https://job-openings.monster.com/data-scientist-eden-prairie-mn-us-eaton-corporation/2577f928-ed07-402e-bfd1-3e8a72ecb61e" "https://job-openings.monster.com/data-scientist-maryland-heights-mo-us-lrs/6c688631-a3d3-4044-a2a7-4ecfb69f4be6" "https://job-openings.monster.com/lead-data-scientist-broomfield-co-us-centurylink/8a9ed3aa-2fe3-48b4-86a7-4d3b7da8cd8a" ...
## $ Job_Description: chr "At Northrop Grumman we develop cutting-edge technology that preserves freedom and advances human discovery. Our"| __truncated__ "Eaton’s Hydraulics division is currently seeking a DataScientist to join our team. This position is based at ou"| __truncated__ "Our client is in need of a Data Scientist. This is a 6 month right to hire opportunity in St. Louis, MO. If you"| __truncated__ "CenturyLink (NYSE: CTL) is a global communications and IT services company focused on connecting its customers "| __truncated__ ...
Since merge() only combines two data frames at a time, we first merge the first two .csvs into one. The column sets and rows are not identical (for example, selshahawy has no Salary column), so we set all = TRUE to keep every row and column from both frames.
twocsv<-merge(ltcancel,selshahawy,all= TRUE)
str(twocsv)
## 'data.frame': 269 obs. of 6 variables:
## $ Position : chr "2020 Data Science Intern" "2020 Machine Learning Internship â\200“ Amazon SearchAmazon.com" "Administrative NP Coordinator - Stroke Program, Bellevue Hospital" "Administrative NP Coordinator - Stroke Program, Bellevue Hospital" ...
## $ Company : chr "Guardian Life Insurance Company" "Amazon.com" "NYU Langone Health" "NYU Langone Medical Center" ...
## $ Location : chr "New York, NY" "Berlin, Germany" "New York, NY" "New York, NY" ...
## $ URL : chr "https://www.simplyhired.com/job/W0oTT9LF3Cfy9OZtnD5FBVyj1P52XNQnt_6JxZW83BvgYoo2zz2uJQ?q=data+scientist" "https://ai-jobs.net/job/2020-machine-learning-internship-amazon-search/" "https://www.simplyhired.com/job/vM0UOOBVm4K93DvY7BZORwTyO-0LDcZSRcQqg55e1Fvxf3gnaooqQw?q=data+scientist" "https://www.simplyhired.com/job/8EoewrddDdBqY2rnCl00C898-taFoO084O_BzpMdVtde014pojcrTQ?q=data+scientist" ...
## $ Job_Description: chr "2020 Data Science Intern - (19001927)\nDescription\n\nInternship Overview\nOur Internship Program is a paid 10-"| __truncated__ "We are looking for PhD students to join Amazon Search in Berlin for a 3-6 month internship in 2020.Hundreds of "| __truncated__ "NYU School of Medicine is one of the nation's top-ranked medical schools. For 175 years, NYU School of Medicine"| __truncated__ "NYU School of Medicine is one of the nation's top-ranked medical schools. For 175 years, NYU School of Medicine"| __truncated__ ...
## $ Salary : chr "5d" NA "Estimated: $44,000 - $57,000 a year8d" "Estimated: $40,000 - $57,000 a year8d" ...
Merging in the third .csv completes the combined data set.
allcsv<-merge(twocsv,ssufian, all=TRUE)
str(allcsv)
## 'data.frame': 295 obs. of 6 variables:
## $ Position : chr "2020 Citizen Data Scientist Internship" "2020 Data Science Intern" "2020 Machine Learning Internship â\200“ Amazon SearchAmazon.com" "Administrative NP Coordinator - Stroke Program, Bellevue Hospital" ...
## $ Company : chr "FCA" "Guardian Life Insurance Company" "Amazon.com" "NYU Langone Health" ...
## $ Location : chr "Detroit, MI" "New York, NY" "Berlin, Germany" "New York, NY" ...
## $ URL : chr "https://job-openings.monster.com/2020-citizen-data-scientist-internship-detroit-mi-us-fca/212893769" "https://www.simplyhired.com/job/W0oTT9LF3Cfy9OZtnD5FBVyj1P52XNQnt_6JxZW83BvgYoo2zz2uJQ?q=data+scientist" "https://ai-jobs.net/job/2020-machine-learning-internship-amazon-search/" "https://www.simplyhired.com/job/vM0UOOBVm4K93DvY7BZORwTyO-0LDcZSRcQqg55e1Fvxf3gnaooqQw?q=data+scientist" ...
## $ Job_Description: chr "FCA US LLC College Intern Program offers a unique opportunity for highly motivated, innovative, and inspired in"| __truncated__ "2020 Data Science Intern - (19001927)\nDescription\n\nInternship Overview\nOur Internship Program is a paid 10-"| __truncated__ "We are looking for PhD students to join Amazon Search in Berlin for a 3-6 month internship in 2020.Hundreds of "| __truncated__ "NYU School of Medicine is one of the nation's top-ranked medical schools. For 175 years, NYU School of Medicine"| __truncated__ ...
## $ Salary : chr NA "5d" NA "Estimated: $44,000 - $57,000 a year8d" ...
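Equivalently, since merge() is pairwise, the three data frames can be folded in a single call. A minimal sketch, assuming the three renamed data frames from above are in memory:
# Fold all three frames with a pairwise full merge (same result as above)
allcsv <- Reduce(function(x, y) merge(x, y, all = TRUE), list(ltcancel, selshahawy, ssufian))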
Keep only the Position, Company, and Job_Description columns (columns 1, 2, and 5) to make the text processing lighter.
allcsv2 <-allcsv[c(1,2,5)]
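An equivalent name-based selection is less fragile than positional indexing if the column order ever changes; a sketch using dplyr:
# Same three columns, selected by name rather than position
allcsv2 <- dplyr::select(allcsv, Position, Company, Job_Description)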
This step builds a tm corpus from the job descriptions so the text can be cleaned with tm_map transformations.
descriptionofjobs = Corpus(VectorSource(allcsv2$Job_Description))
descriptionofjobs = tm_map(descriptionofjobs, content_transformer(tolower)) ## convert to lowercase
## Warning in tm_map.SimpleCorpus(descriptionofjobs,
## content_transformer(tolower)): transformation drops documents
descriptionofjobs = tm_map(descriptionofjobs, content_transformer(gsub), pattern = "\\W", replacement = " ") ## replace non-word characters with spaces
## Warning in tm_map.SimpleCorpus(descriptionofjobs,
## content_transformer(gsub), : transformation drops documents
Remove URLs (any whitespace-delimited token beginning with "http"). Note that the non-word substitution above has already split URLs on their punctuation, so on a rerun this step is best placed before that substitution.
removeURL <- function(x) gsub("http\\S+", "", x)
descriptionofjobs <- tm_map(descriptionofjobs, content_transformer(removeURL))
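A quick spot check of the pattern on a made-up string:
removeURL("See https://example.com/jobs?q=data for the posting")
## [1] "See  for the posting"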
descriptionofjobs = tm_map(descriptionofjobs, removeNumbers) ## remove numbers
## Warning in tm_map.SimpleCorpus(descriptionofjobs, removeNumbers):
## transformation drops documents
descriptionofjobs = tm_map(descriptionofjobs, removePunctuation) ## remove punctuation
## Warning in tm_map.SimpleCorpus(descriptionofjobs, removePunctuation):
## transformation drops documents
descriptionofjobs = tm_map(descriptionofjobs, removeWords, stopwords(kind = "english")) ## remove standard English stopwords
## Warning in tm_map.SimpleCorpus(descriptionofjobs, removeWords,
## stopwords(kind = "english")): transformation drops documents
extraStopwords <- c(setdiff(stopwords('english'), c("r", "big")), "used", "will", "time", "can", "sex", "role", "new", "job", "etc", "one", "looking", "well", "use", "best", "also", "high", "real", "please", "key", "able", "must", "like", "full", "include", "good", "non", "need", "plus", "day", "year", "com", "want", "age", "using", "sexual", "help", "apply", "race", "orientation")
descriptionofjobs <- tm_map(descriptionofjobs, removeWords, extraStopwords) ## remove additional boilerplate words common to job postings
## Warning in tm_map.SimpleCorpus(descriptionofjobs, removeWords,
## extraStopwords): transformation drops documents
descriptionofjobs = tm_map(descriptionofjobs, stripWhitespace) ## collapse extra whitespace
## Warning in tm_map.SimpleCorpus(descriptionofjobs, stripWhitespace):
## transformation drops documents
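The “transformation drops documents” warnings are a known quirk of tm_map() on a SimpleCorpus; no documents are actually removed by these steps.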
allwords2 <- DocumentTermMatrix(descriptionofjobs) ## one row per posting, one column per term
sparsewords = removeSparseTerms(allwords2, .80) ## drop terms missing from more than 80% of postings
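As a rough check on how aggressively the vocabulary was pruned, compare the matrix dimensions before and after; a quick sketch:
dim(allwords2) ## postings x full vocabulary
dim(sparsewords) ## postings x retained terms (208 survive, per the output below)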
Convert the pruned document-term matrix into a tidy table with one row per document-term pair.
tidywords<-tidy(sparsewords)
tidywords
## # A tibble: 21,137 x 3
## document term count
## <chr> <chr> <dbl>
## 1 1 advanced 1
## 2 1 analysis 1
## 3 1 analytics 1
## 4 1 areas 1
## 5 1 based 2
## 6 1 business 9
## 7 1 candidate 1
## 8 1 candidates 1
## 9 1 communication 2
## 10 1 company 1
## # ... with 21,127 more rows
totalwords <- tidywords %>%
  count(term, sort = TRUE) ## n = number of postings containing each term
totalwords
## # A tibble: 208 x 2
## term n
## <chr> <int>
## 1 data 287
## 2 experience 279
## 3 work 249
## 4 team 243
## 5 science 219
## 6 years 217
## 7 python 207
## 8 skills 193
## 9 business 189
## 10 learning 184
## # ... with 198 more rows
tidywords %>%
  count(term, sort = TRUE) %>%
  filter(n > 180) %>%
  mutate(term = reorder(term, n)) %>% ## order the bars by count
  ggplot(aes(term, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
summaryofwords <- tidywords %>%
  group_by(term) %>%
  summarize(total = n()) %>% ## number of postings containing each term
  arrange(desc(total))
totalwords<-left_join(totalwords, summaryofwords)
## Joining, by = "term"
Frequency of each word
tfrequency <-summaryofwords %>%
group_by(term)%>%
mutate(rank = row_number(), 'frequencyofterm' = n()/total)%>%
arrange(desc(total))
tfrequency
## # A tibble: 208 x 4
## # Groups: term [208]
## term total rank frequencyofterm
## <chr> <int> <int> <dbl>
## 1 data 287 1 0.00348
## 2 experience 279 1 0.00358
## 3 work 249 1 0.00402
## 4 team 243 1 0.00412
## 5 science 219 1 0.00457
## 6 years 217 1 0.00461
## 7 python 207 1 0.00483
## 8 skills 193 1 0.00518
## 9 business 189 1 0.00529
## 10 learning 184 1 0.00543
## # ... with 198 more rows
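If a per-document term frequency is wanted instead (each term's share of the words in a posting), the tidy triplets support it directly. A sketch, assuming tidywords from above:
tidywords %>%
  group_by(document) %>%
  mutate(doc_total = sum(count), ## retained words in the posting
         tf = count / doc_total) %>% ## term's share of the posting
  ungroup()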
Count how often each pair of words appears together in the same job description.
tidy_word_pairs <- tidywords %>%
  pairwise_count(term, document, sort = TRUE)
tidy_word_pairs
## # A tibble: 43,056 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 data business 15
## 2 business data 15
## 3 experience data 14
## 4 data experience 14
## 5 research data 13
## 6 research experience 13
## 7 data research 13
## 8 experience research 13
## 9 experience business 12
## 10 business experience 12
## # ... with 43,046 more rows
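A toy illustration of how pairwise_count() works, on hypothetical data: each pair of words is counted once per document they share.
toy <- tibble::tibble(doc = c(1, 1, 2, 2, 2),
                      word = c("data", "python", "data", "python", "sql"))
pairwise_count(toy, word, doc)
## "data" and "python" co-occur in 2 documents; pairs involving "sql" in 1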
set.seed(1234)
tidy_word_pairs %>%
filter(n >= 8) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan2") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
Correlation between the most common words, based on which job descriptions they appear in together; we keep only terms that appear in at least 180 descriptions.
tidywords_cors <- tidywords %>%
  group_by(term) %>%
  filter(n() >= 180) %>%
  pairwise_cor(term, document, sort = TRUE, upper = FALSE)
tidywords_cors
## # A tibble: 45 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 learning science 0.930
## 2 learning work 0.930
## 3 learning team 0.930
## 4 science work 0.852
## 5 science team 0.852
## 6 work team 0.852
## 7 experience learning 0.817
## 8 skills years 0.815
## 9 science skills 0.783
## 10 skills work 0.783
## # ... with 35 more rows
set.seed(1234)
tidywords_cors %>%
filter(correlation > .3) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "green") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
library(wordcloud)
dtm = DocumentTermMatrix(descriptionofjobs)
dtm = removeSparseTerms(dtm, 0.70) ## keep terms present in at least 30% of postings
dataset = as.matrix(dtm)
v = sort(colSums(dataset), decreasing = TRUE) ## total occurrences of each term
myNames = names(v)
d = data.frame(word = myNames, freq = v)
wordcloud(words = d$word, freq = d$freq, min.freq = 100, colors = c(1:4), random.color = TRUE)
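Because word clouds use random placement, the layout changes on every knit. Fixing the seed first makes the figure reproducible; an optional sketch:
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 100, colors = c(1:4), random.color = TRUE)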