Text Analysis of Simplyhired, Glassdoor, and Monster

After all of the data has been scraped from the job websites and saved as .csv files, the next step is to read in each .csv, merge them into a single data set, and prepare the text for analysis.

Load packages

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(rvest)
## Loading required package: rvest
## Loading required package: xml2
require(stringr)
## Loading required package: stringr
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(SnowballC)
## Loading required package: SnowballC
require(tidytext)
## Loading required package: tidytext
require(textdata)
## Loading required package: textdata
require(tidyverse)
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v readr   1.3.1
## v tibble  2.1.3     v purrr   0.3.2
## v tidyr   1.0.0     v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x ggplot2::annotate()     masks NLP::annotate()
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
require(ggplot2)
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
require(widyr)
## Loading required package: widyr
require(igraph)
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
require(ggraph)
## Loading required package: ggraph

Since the column names differed across the three files, we rename them to a common set so the data sets line up when we merge them.

Read in .csv of SimplyHired

ltcancel<-read.csv("https://raw.githubusercontent.com/ltcancel/Project3/master/SimplyHiredJobs.csv", stringsAsFactors = FALSE)
colnames(ltcancel)<-c("Position", "Company","Location","Salary","URL","Job_Description")
str(ltcancel)
## 'data.frame':    209 obs. of  6 variables:
##  $ Position       : chr  "Data Scientist, Marketplace – all levels" "Data Engineer" "Senior Data Scientist" "Ecological Wetland Scientist" ...
##  $ Company        : chr  "Spotify" "Noom Inc." "HVH Precision Analytics" "PS&S" ...
##  $ Location       : chr  "New York, NY" "New York, NY" "New York, NY" "Mineola, NY" ...
##  $ Salary         : chr  "Estimated: $110,000 - $160,000 a year" "Estimated: $73,000 - $96,000 a year" "Estimated: $110,000 - $140,000 a yearSimply Apply" "Estimated: $54,000 - $66,000 a year" ...
##  $ URL            : chr  "https://www.simplyhired.com/job/-q6yR-atece9p8LQvm2yP8xIX3VcYfRC9wsdPgSS0nWHIG3f2EZOxA?q=data+scientist" "https://www.simplyhired.com/job/pg680lk5W0WVpIIE7QQXRgEim6bP-NKuilb64EfQF80SDIp_X1ufSA?q=data+scientist" "https://www.simplyhired.com/job/YPL5f6DfJcFxTqZiKpIW3UWlZ0bqR5UKLLVHPAMbe3OnTxpIEdlCpg?q=data+scientist" "https://www.simplyhired.com/job/uip2DCa3k_ke2JyWB08F8Gm7MRFCNdXkFqw4nCdD5nq6tJiCPp7oZA?q=data+scientist" ...
##  $ Job_Description: chr  "Marketplace is the home for Spotify’s music industry products, such as Spotify for Artists, Spotify Label Analy"| __truncated__ "At Noom, we use scientifically proven methods to help our users create healthier lifestyles, and manage importa"| __truncated__ "Job Description: Data science professional to design, implement and deployadvanced machine learning / artificia"| __truncated__ "Overview\nPS&S is an award-winning “one-stop shop” of architecture and engineering excellence. The depth and br"| __truncated__ ...

Read in .csv of Glassdoor

selshahawy<-read.csv("https://raw.githubusercontent.com/salma71/MSDS_2019/master/Fall2019/aquisition_management_607/project_3/jobs_detailsInfo.csv", stringsAsFactors = FALSE) 
colnames(selshahawy)<-c("Position", "Company","Location","URL","Job_Description")  
str(selshahawy)
## 'data.frame':    60 obs. of  5 variables:
##  $ Position       : chr  "Data EngineerMongoDB" "Machine Learning EngineerMedium" "Senior Data EngineerManulife" "Research ScientistAllen Institute for Artificial Intelligence (AI2)" ...
##  $ Company        : chr  "MongoDB" "Medium" "Manulife" "Allen Institute for Artificial Intelligence (AI2)" ...
##  $ Location       : chr  "New York City" "San Francisco" "Toronto, ON CA" "Seattle, WA" ...
##  $ URL            : chr  "https://ai-jobs.net/job/data-engineer-40/" "https://ai-jobs.net/job/machine-learning-engineer-64/" "https://ai-jobs.net/job/senior-data-engineer-8/" "https://ai-jobs.net/job/research-scientist-6/" ...
##  $ Job_Description: chr  "MongoDB is growing rapidly and seeking a Data Engineer to be a key contributor to the overall internal data pla"| __truncated__ "At Medium, words matter. We are building the best place for reading and writing on the internetâ\200”a place wh"| __truncated__ "Are you looking for unlimited opportunities to develop and succeed? With work that challenges and makes a diffe"| __truncated__ "The Allen Institute for Artificial Intelligence (AI2) is a non-profit research institute in Seattle founded by "| __truncated__ ...

Read in .csv of Monster

ssufian<-read.csv("https://raw.githubusercontent.com/Luz917/data607project3_ssufian_monster_jobs/master/monsterjobs.csv", stringsAsFactors = FALSE) 
colnames(ssufian)<-c("Position", "Company","Location","Salary","URL","Job_Description")  
str(ssufian)
## 'data.frame':    26 obs. of  6 variables:
##  $ Position       : chr  "Principal Data Scientist (Facilities Analytics)" "Data Scientist" "Data Scientist" "Lead Data Scientist" ...
##  $ Company        : chr  "Northrop Grumman" "Eaton Corporation" "LRS" "CenturyLink" ...
##  $ Location       : chr  "Redondo Beach, CA" "Eden Prairie, MN" "Maryland Heights, MO" "BROOMFIELD, CO" ...
##  $ Salary         : logi  NA NA NA NA NA NA ...
##  $ URL            : chr  "https://job-openings.monster.com/principal-data-scientist-facilities-analytics-redondo-beach-ca-us-northrop-gru"| __truncated__ "https://job-openings.monster.com/data-scientist-eden-prairie-mn-us-eaton-corporation/2577f928-ed07-402e-bfd1-3e8a72ecb61e" "https://job-openings.monster.com/data-scientist-maryland-heights-mo-us-lrs/6c688631-a3d3-4044-a2a7-4ecfb69f4be6" "https://job-openings.monster.com/lead-data-scientist-broomfield-co-us-centurylink/8a9ed3aa-2fe3-48b4-86a7-4d3b7da8cd8a" ...
##  $ Job_Description: chr  "At Northrop Grumman we develop cutting-edge technology that preserves freedom and advances human discovery. Our"| __truncated__ "Eaton’s Hydraulics division is currently seeking a DataScientist to join our team. This position is based at ou"| __truncated__ "Our client is in need of a Data Scientist. This is a 6 month right to hire opportunity in St. Louis, MO. If you"| __truncated__ "CenturyLink (NYSE: CTL) is a global communications and IT services company focused on connecting its customers "| __truncated__ ...
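
Before merging, it is worth confirming that the renamed columns actually line up across the three data sets. The one-liner below is a sketch of such a check and was not part of the original workflow:

Reduce(intersect, list(names(ltcancel), names(selshahawy), names(ssufian))) ## should list the five columns shared by all three files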

Merge all the .csv’s into one.

Since merge() combines only two data frames at a time, we start by merging the first two .csv files. The data sets do not share identical columns or rows (Glassdoor, for example, has no Salary column), so we set all = TRUE to perform a full outer join and keep every row regardless of matches.

twocsv<-merge(ltcancel,selshahawy,all= TRUE)
str(twocsv)
## 'data.frame':    269 obs. of  6 variables:
##  $ Position       : chr  "2020 Data Science Intern" "2020 Machine Learning Internship â\200“ Amazon SearchAmazon.com" "Administrative NP Coordinator - Stroke Program, Bellevue Hospital" "Administrative NP Coordinator - Stroke Program, Bellevue Hospital" ...
##  $ Company        : chr  "Guardian Life Insurance Company" "Amazon.com" "NYU Langone Health" "NYU Langone Medical Center" ...
##  $ Location       : chr  "New York, NY" "Berlin, Germany" "New York, NY" "New York, NY" ...
##  $ URL            : chr  "https://www.simplyhired.com/job/W0oTT9LF3Cfy9OZtnD5FBVyj1P52XNQnt_6JxZW83BvgYoo2zz2uJQ?q=data+scientist" "https://ai-jobs.net/job/2020-machine-learning-internship-amazon-search/" "https://www.simplyhired.com/job/vM0UOOBVm4K93DvY7BZORwTyO-0LDcZSRcQqg55e1Fvxf3gnaooqQw?q=data+scientist" "https://www.simplyhired.com/job/8EoewrddDdBqY2rnCl00C898-taFoO084O_BzpMdVtde014pojcrTQ?q=data+scientist" ...
##  $ Job_Description: chr  "2020 Data Science Intern - (19001927)\nDescription\n\nInternship Overview\nOur Internship Program is a paid 10-"| __truncated__ "We are looking for PhD students to join Amazon Search in Berlin for a 3-6 month internship in 2020.Hundreds of "| __truncated__ "NYU School of Medicine is one of the nation's top-ranked medical schools. For 175 years, NYU School of Medicine"| __truncated__ "NYU School of Medicine is one of the nation's top-ranked medical schools. For 175 years, NYU School of Medicine"| __truncated__ ...
##  $ Salary         : chr  "5d" NA "Estimated: $44,000 - $57,000 a year8d" "Estimated: $40,000 - $57,000 a year8d" ...

Here we merge in the third .csv, so that all three data sets are combined into one.

allcsv<-merge(twocsv,ssufian, all=TRUE)
str(allcsv)
## 'data.frame':    295 obs. of  6 variables:
##  $ Position       : chr  "2020 Citizen Data Scientist Internship" "2020 Data Science Intern" "2020 Machine Learning Internship â\200“ Amazon SearchAmazon.com" "Administrative NP Coordinator - Stroke Program, Bellevue Hospital" ...
##  $ Company        : chr  "FCA" "Guardian Life Insurance Company" "Amazon.com" "NYU Langone Health" ...
##  $ Location       : chr  "Detroit, MI" "New York, NY" "Berlin, Germany" "New York, NY" ...
##  $ URL            : chr  "https://job-openings.monster.com/2020-citizen-data-scientist-internship-detroit-mi-us-fca/212893769" "https://www.simplyhired.com/job/W0oTT9LF3Cfy9OZtnD5FBVyj1P52XNQnt_6JxZW83BvgYoo2zz2uJQ?q=data+scientist" "https://ai-jobs.net/job/2020-machine-learning-internship-amazon-search/" "https://www.simplyhired.com/job/vM0UOOBVm4K93DvY7BZORwTyO-0LDcZSRcQqg55e1Fvxf3gnaooqQw?q=data+scientist" ...
##  $ Job_Description: chr  "FCA US LLC College Intern Program offers a unique opportunity for highly motivated, innovative, and inspired in"| __truncated__ "2020 Data Science Intern - (19001927)\nDescription\n\nInternship Overview\nOur Internship Program is a paid 10-"| __truncated__ "We are looking for PhD students to join Amazon Search in Berlin for a 3-6 month internship in 2020.Hundreds of "| __truncated__ "NYU School of Medicine is one of the nation's top-ranked medical schools. For 175 years, NYU School of Medicine"| __truncated__ ...
##  $ Salary         : chr  NA "5d" NA "Estimated: $44,000 - $57,000 a year8d" ...
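
As an aside, dplyr can stack all three files in a single step. The line below is a sketch of an alternative to the pairwise merge() calls, not part of the original workflow; bind_rows() fills in columns missing from a given file (such as Glassdoor's Salary) with NA.

allcsv_alt <- dplyr::bind_rows(ltcancel, selshahawy, ssufian) ## stack rows; missing columns become NA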

Keep only the columns needed for the text analysis (Position, Company, and Job_Description) to make the corpus easier to work with.

allcsv2 <- allcsv[c(1, 2, 5)] ## columns 1, 2, 5 = Position, Company, Job_Description
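
The same selection can be written by column name, which is less fragile if the column order ever changes (a sketch using dplyr):

allcsv2 <- dplyr::select(allcsv, Position, Company, Job_Description)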

Prepare the merged data for text analysis.

This step builds a tm corpus from the character vector of job descriptions.

descriptionofjobs <- Corpus(VectorSource(allcsv2$Job_Description))
descriptionofjobs <- tm_map(descriptionofjobs, content_transformer(tolower)) ## convert to lower case
## Warning in tm_map.SimpleCorpus(descriptionofjobs,
## content_transformer(tolower)): transformation drops documents
descriptionofjobs <- tm_map(descriptionofjobs, content_transformer(gsub), pattern = "\\W", replacement = " ") ## replace non-word characters with spaces
## Warning in tm_map.SimpleCorpus(descriptionofjobs,
## content_transformer(gsub), : transformation drops documents

Remove URLs

removeURL <- function(x) gsub("http\\S*", "", x) ## drop anything that starts with "http"
descriptionofjobs <- tm_map(descriptionofjobs, content_transformer(removeURL))
descriptionofjobs <- tm_map(descriptionofjobs, removeNumbers) ## remove numbers
## Warning in tm_map.SimpleCorpus(descriptionofjobs, removeNumbers):
## transformation drops documents
descriptionofjobs <- tm_map(descriptionofjobs, removePunctuation) ## remove punctuation
## Warning in tm_map.SimpleCorpus(descriptionofjobs, removePunctuation):
## transformation drops documents
descriptionofjobs <- tm_map(descriptionofjobs, removeWords, stopwords(kind = "english")) ## remove English stop words
## Warning in tm_map.SimpleCorpus(descriptionofjobs, removeWords,
## stopwords(kind = "english")): transformation drops documents
extraStopwords <- c(setdiff(stopwords("english"), c("r", "big")), "used", "will", "time", "can", "sex", "role", "new", "job", "etc", "one", "looking", "well", "use", "best", "also", "high", "real", "please", "key", "able", "must", "like", "full", "include", "good", "non", "need", "plus", "day", "year", "com", "want", "age", "using", "sexual", "help", "apply", "race", "orientation")
descriptionofjobs <- tm_map(descriptionofjobs, removeWords, extraStopwords) ## additional unwanted words
## Warning in tm_map.SimpleCorpus(descriptionofjobs, removeWords,
## extraStopwords): transformation drops documents
descriptionofjobs <- tm_map(descriptionofjobs, stripWhitespace) ## collapse extra whitespace
## Warning in tm_map.SimpleCorpus(descriptionofjobs, stripWhitespace):
## transformation drops documents
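
As a quick sanity check (optional, not part of the original code), we can peek at the start of the first cleaned description:

substr(as.character(descriptionofjobs[[1]]), 1, 200) ## first 200 characters of document 1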

Creating the Bag of Words

allwords2 <- DocumentTermMatrix(descriptionofjobs)
sparsewords <- removeSparseTerms(allwords2, .80) ## drop terms absent from more than 80% of documents
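
In other words, removeSparseTerms(allwords2, .80) keeps only terms that appear in at least roughly 20% of the job descriptions. A small check of that rule (a sketch, not in the original code):

m <- as.matrix(allwords2)
doc_freq <- colSums(m > 0) ## number of documents containing each term
sum(doc_freq / nrow(m) >= 0.20) ## should be close to nTerms(sparsewords)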

Begin the analysis

Convert the document-term matrix into a tidy table

tidywords <- tidy(sparsewords)
tidywords
## # A tibble: 21,137 x 3
##    document term          count
##    <chr>    <chr>         <dbl>
##  1 1        advanced          1
##  2 1        analysis          1
##  3 1        analytics         1
##  4 1        areas             1
##  5 1        based             2
##  6 1        business          9
##  7 1        candidate         1
##  8 1        candidates        1
##  9 1        communication     2
## 10 1        company           1
## # ... with 21,127 more rows
totalwords <- tidywords %>%
  count(term, sort = TRUE)
totalwords
## # A tibble: 208 x 2
##    term           n
##    <chr>      <int>
##  1 data         287
##  2 experience   279
##  3 work         249
##  4 team         243
##  5 science      219
##  6 years        217
##  7 python       207
##  8 skills       193
##  9 business     189
## 10 learning     184
## # ... with 198 more rows
tidywords %>%
  count(term, sort = TRUE) %>%
  filter(n > 180) %>%
  ggplot(aes(term, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

summaryofwords <- tidywords %>%
  group_by(term) %>%
  summarize(total = n()) %>%
  arrange(desc(total))
totalwords <- left_join(totalwords, summaryofwords)
## Joining, by = "term"

Frequency of each word

tfrequency <- summaryofwords %>%
  group_by(term) %>%
  mutate(rank = row_number(), frequencyofterm = n() / total) %>%
  arrange(desc(total))
tfrequency
## # A tibble: 208 x 4
## # Groups:   term [208]
##    term       total  rank frequencyofterm
##    <chr>      <int> <int>           <dbl>
##  1 data         287     1         0.00348
##  2 experience   279     1         0.00358
##  3 work         249     1         0.00402
##  4 team         243     1         0.00412
##  5 science      219     1         0.00457
##  6 years        217     1         0.00461
##  7 python       207     1         0.00483
##  8 skills       193     1         0.00518
##  9 business     189     1         0.00529
## 10 learning     184     1         0.00543
## # ... with 198 more rows

Pairing of the words

tidy_word_pairs <- tidywords %>%
  pairwise_count(term, count, sort = TRUE)

tidy_word_pairs
## # A tibble: 43,056 x 3
##    item1      item2          n
##    <chr>      <chr>      <dbl>
##  1 data       business      15
##  2 business   data          15
##  3 experience data          14
##  4 data       experience    14
##  5 research   data          13
##  6 research   experience    13
##  7 data       research      13
##  8 experience research      13
##  9 experience business      12
## 10 business   experience    12
## # ... with 43,046 more rows
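
To make the pairwise_count() call concrete: it counts how often two items share the same value of its second (grouping) argument. A toy illustration on made-up data (the demo tibble below is hypothetical, not taken from the job postings):

demo <- tibble(doc = c("a", "a", "b", "b", "b"),
               word = c("data", "python", "data", "python", "skills"))
pairwise_count(demo, word, doc, sort = TRUE) ## data and python co-occur in two docs
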
set.seed(1234)
tidy_word_pairs %>%
  filter(n >= 8) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan2") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, 
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Correlation of the words

tidywords_cors <- tidywords %>% 
  group_by(term) %>%
  filter(n() >= 180) %>%
  pairwise_cor(term, count, sort = TRUE, upper = FALSE)

tidywords_cors
## # A tibble: 45 x 3
##    item1      item2    correlation
##    <chr>      <chr>          <dbl>
##  1 learning   science        0.930
##  2 learning   work           0.930
##  3 learning   team           0.930
##  4 science    work           0.852
##  5 science    team           0.852
##  6 work       team           0.852
##  7 experience learning       0.817
##  8 skills     years          0.815
##  9 science    skills         0.783
## 10 skills     work           0.783
## # ... with 35 more rows
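
Unlike pairwise_count(), pairwise_cor() computes the phi coefficient: a correlation between the presence/absence patterns of two terms across groups, so a pair that always appears together scores near 1. Another hypothetical toy example:

demo2 <- tibble(doc = c("a", "a", "b", "b", "c"),
                word = c("data", "python", "data", "python", "skills"))
pairwise_cor(demo2, word, doc, sort = TRUE, upper = FALSE) ## data and python always co-occur: correlation 1
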
set.seed(1234)
tidywords_cors %>%
  filter(correlation > .3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "green") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()

WordCloud

dtm <- DocumentTermMatrix(descriptionofjobs)
dtm <- removeSparseTerms(dtm, 0.70) ## a tighter sparsity cut than before
dataset <- as.matrix(dtm)
v <- sort(colSums(dataset), decreasing = TRUE) ## total frequency of each term
d <- data.frame(word = names(v), freq = v)
wordcloud(d$word, d$freq, min.freq = 100, colors = 1:4, random.color = TRUE)