title: "Project 3b - Data Analysis and Visualizations"

author: "Sufian"

date: "10/16/2019"

output: html_document


RPubs link:

http://rpubs.com/ssufian/540285


# Load packages
library(rvest)
library(stringr)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(xml2)
library(httr)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(tidytext)
library(magrittr)
library(ggthemes)
library(sentimentr)

Reading each teammate's scraped job listings

#read from Leticia
ltcancel<-read.csv("https://raw.githubusercontent.com/ltcancel/Project3/master/SimplyHiredJobs.csv", stringsAsFactors = FALSE)
colnames(ltcancel)<-c("Position", "Company","Location","Salary","URL","Job_Description")

#read from Salma
selshahawy<-read.csv("https://raw.githubusercontent.com/salma71/MSDS_2019/master/Fall2019/aquisition_management_607/project_3/jobs_detailsInfo.csv", stringsAsFactors = FALSE) 
colnames(selshahawy)<-c("Position", "Company","Location","URL","Job_Description") 

#read from Sufian
ssufian<-read.csv("https://raw.githubusercontent.com/Luz917/data607project3_ssufian_monster_jobs/master/monsterjobs.csv", stringsAsFactors = FALSE) 
colnames(ssufian)<-c("Position", "Company","Location","Salary","URL","Job_Description")  

Merging all scraped files into one

twocsv<-merge(ltcancel,selshahawy,all= TRUE)

allcsv<-merge(twocsv,ssufian, all=TRUE)
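
Since merge() with all = TRUE performs a full outer join on the shared columns, the same combination can be sketched with dplyr::bind_rows, assuming the renamed frames are meant to be appended row-wise (columns missing from a source, such as Salary, are filled with NA):

# Hypothetical row-wise alternative to the two merge() calls above
allcsv_alt <- bind_rows(ltcancel, selshahawy, ssufian)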

Deleting unnecessary columns

library("magrittr")
# REmove Salary column

clean_allcsv <- allcsv[,-c(4,6)]

Data cleaning using regular expressions

# Lower-case fields, remove brackets & general cleanup of job summaries

clean_allcsv <- clean_allcsv %>% mutate_at(vars(Position, Company, Location, Job_Description), tolower)

# Chain the replacements so each step builds on the previous result;
# assigning from clean_allcsv$Job_Description each time would discard the earlier fixes
clean_allcsv1 <- clean_allcsv$Job_Description %>%
  str_replace_all("[\r\n]", "") %>%
  str_replace_all("\\.", "") %>%
  str_replace_all("\\*", "") %>%
  str_replace_all("\\+", "") %>%
  str_replace_all("\\\\", "") %>%
  str_replace_all("\\)", "") %>%
  str_replace_all("\\(", "") %>%
  str_replace_all("[[:punct:]]+", "")
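
Because the POSIX class [[:punct:]] already covers the individual characters handled above, the chain could in principle collapse to a single pass; a minimal sketch, assuming it is acceptable to strip every punctuation character in one go:

# Hypothetical single-pass equivalent of the chained replacements above
clean_allcsv1 <- str_replace_all(clean_allcsv$Job_Description, "[[:punct:]\r\n]+", "")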


#head(clean_allcsv1)

clean_allcsv2 <- clean_allcsv[,-c(4)] # drop the raw Job_Description column; this frame is reused for sentiment later

# Attach the cleaned job descriptions
clean_allcsv2$Job_Description <- clean_allcsv1

clean_allcsv2$Job_Description[1] # spot check the first entry (R vectors are 1-indexed, so [0] returns nothing)

Creating a control list to pass into TermDocumentMatrix (using TF-IDF weighting)

tfidf <- clean_allcsv2$Job_Description

# Control list shared by all corpora
control_list <- list(removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE,
                     weighting = weightTfIdf)
# TF-IDF based on the 3 job-posting sources

corpus.all <- VCorpus(VectorSource(tfidf))
tdm.all <- TermDocumentMatrix(corpus.all, control = control_list)
#inspect(tdm.all)
# Drop sparse terms: keep only terms that appear in at least 40% of documents (sparse = 0.60)
tdm.60 <- removeSparseTerms(tdm.all, sparse = 0.60)
#inspect(tdm.60)
# Sum rows for totals & build a data frame (avoids the deprecated tidy() method for numeric vectors)
v_all <- sort(rowSums(as.matrix(tdm.60)))
df_all <- data.frame(words = names(v_all), count = v_all, row.names = NULL)
#View(df_all)
# Graph of the top 25 TF-IDF terms across the 3 job postings
ggplot(tail(df_all, 25), aes(reorder(words, count), count)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "TF-IDF of 3 Job Sites: ai-job.net, SimplyhiredJobs.com, Monster.com",
       x = "Words", y = "TF-IDF weight") +
  coord_flip()
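
For a per-document view instead of the aggregate above, tidytext's tidy() method for term-document matrices reshapes the weights into a long data frame; a minimal sketch, assuming tidytext's tidier columns (term, document, count, where count here carries the TF-IDF weight):

# Hypothetical per-posting inspection of the strongest TF-IDF terms
tdm_tidy <- tidy(tdm.all)
tdm_tidy %>%
  group_by(document) %>%
  top_n(5, count) %>%   # five highest-weighted terms per posting
  arrange(document, desc(count))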

Text mining and word cloud, built from a term-document matrix

docs <- Corpus(VectorSource(clean_allcsv2$Job_Description))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Eliminate extra white space
docs <- tm_map(docs, stripWhitespace)
# Text stemming (optional)
#docs <- tm_map(docs, stemDocument)

dtmw <- TermDocumentMatrix(docs)
m <- as.matrix(dtmw)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
#head(d, 10)

Generate the Word cloud

The most important skill sets can be illustrated in a word cloud as follows:

set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))


Explore frequent terms and their associations


Top 10 most frequent words in the job descriptions of the 3 job sites

barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
        col = "lightblue", main = "Most frequent words in Data Scientist Job Boards",
        ylab = "Word frequencies")

Frequency of Technical skills

toolskills <- clean_allcsv2 %>%
    mutate(R = grepl("\\bR\\b,", Job_Description, ignore.case=TRUE)) %>%
    mutate(python = grepl("Python", Job_Description, ignore.case=TRUE)) %>%
    mutate(SQL = grepl("SQL", Job_Description, ignore.case=TRUE)) %>%
    mutate(hadoop = grepl("hadoop", Job_Description, ignore.case=TRUE)) %>%
    mutate(perl = grepl("perl", Job_Description, ignore.case=TRUE)) %>%
    mutate(matplotlib = grepl("matplotlib", Job_Description, ignore.case=TRUE)) %>%
    mutate(Cplusplus = grepl("C++", Job_Description, fixed=TRUE)) %>%
    mutate(VB = grepl("VB", Job_Description, ignore.case=TRUE)) %>%
    mutate(java = grepl("java\\b", Job_Description, ignore.case=TRUE)) %>%
    mutate(scala = grepl("scala", Job_Description, ignore.case=TRUE)) %>%
    mutate(tensorflow = grepl("tensorflow",Job_Description, ignore.case=TRUE)) %>%
    mutate(javascript = grepl("javascript", Job_Description, ignore.case=TRUE)) %>%
    mutate(spark = grepl("spark", Job_Description, ignore.case=TRUE)) %>%
   # mutate(spark = grepl("Hadoop", Job_Description, ignore.case=TRUE)) %>%  
    # mutate(spark = grepl("nosql", Job_Description, ignore.case=TRUE)) %>%  
     # mutate(spark = grepl("spark", Job_Description, ignore.case=TRUE)) %>%  
     # mutate(spark = grepl("sas", Job_Description, ignore.case=TRUE)) %>%  
     # mutate(spark = grepl("excel", Job_Description, ignore.case=TRUE)) %>%  
     # mutate(spark = grepl("aws", Job_Description, ignore.case=TRUE)) %>%  
      #mutate(spark = grepl("azure", Job_Description, ignore.case=TRUE)) %>%  
      mutate(spark = grepl("java", Job_Description, ignore.case=TRUE)) %>%   
      #mutate(spark = grepl("tableau", Job_Description, ignore.case=TRUE)) %>% 
  
  
select(Position, Company, R, python, SQL, hadoop, perl, matplotlib, Cplusplus, VB, java, scala, tensorflow, javascript,spark,java)
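
Since the per-skill mutate() calls all follow one template, the same flags can be produced by looping over a named vector of patterns; a minimal sketch, assuming clean_allcsv2 from above (the pattern list is abbreviated for illustration):

# Hypothetical compact alternative: one regex per skill, applied in a loop
skill_patterns <- c(R = "\\bR\\b", python = "python", SQL = "sql",
                    Cplusplus = "c\\+\\+", spark = "spark")
skill_flags <- sapply(skill_patterns, function(p)
  grepl(p, clean_allcsv2$Job_Description, ignore.case = TRUE))
colSums(skill_flags)  # number of postings mentioning each skill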

Set up tool skills for plotting

toolskills2 <- toolskills %>% select(-(1:2)) %>% summarise_all(sum) %>% gather(variable,value) %>% arrange(desc(value))

Visualizing the most in-demand tool skills:

ggplot(toolskills2, aes(x=reorder(variable, value), y=value)) + 
  geom_bar(stat='identity', fill="green", color="black") + 
  xlab('') + 
  ylab('Frequency') + 
  labs(title='Tool Skills') + 
  coord_flip() + 
  theme_minimal()

Frequency of Hard skills

hardskills <- clean_allcsv2 %>%
    mutate(machinelearning = grepl("machine learning", Job_Description, ignore.case=TRUE)) %>%
    mutate(modeling = grepl("model", Job_Description, ignore.case=TRUE)) %>%
    mutate(statistics = grepl("statistics", Job_Description, ignore.case=TRUE)) %>%
    mutate(programming = grepl("programming", Job_Description, ignore.case=TRUE)) %>%
    mutate(quantitative = grepl("quantitative", Job_Description, ignore.case=TRUE)) %>%
    mutate(debugging = grepl("debugging", Job_Description, ignore.case=TRUE)) %>%
    mutate(statistical = grepl("statistical",  Job_Description, ignore.case=TRUE)) %>%
    mutate(regression = grepl("regression", Job_Description, ignore.case=TRUE)) %>%
    select(Position, Company, machinelearning, modeling, statistics, programming, quantitative, debugging, statistical, regression)

Set up hard skills for plotting

hardskills2 <- hardskills %>% 
               select(-(1:2)) %>% 
               summarise_all(sum) %>% 
               gather(variable,value) %>% 
               arrange(desc(value))

Visualizing the most in-demand hard skills:

ggplot(hardskills2,aes(x=reorder(variable, value), y=value)) + 
  geom_bar(stat='identity',fill="yellow",color="black") + 
  xlab('') + 
  ylab('Frequency') + 
  labs(title='Hard Skills') + 
  coord_flip() + 
  theme_minimal()

Frequency of Soft skills

softskills <- clean_allcsv2 %>%
    mutate(workingremote = grepl("working remote", Job_Description, ignore.case=TRUE)) %>% 
    mutate(communication = grepl("communication", Job_Description, ignore.case=TRUE)) %>%
    mutate(collaborative = grepl("collaborate", Job_Description, ignore.case=TRUE)) %>%
    mutate(creative = grepl("creative", Job_Description, ignore.case=TRUE)) %>%
    mutate(critical = grepl("critical", Job_Description, ignore.case=TRUE)) %>%
    mutate(problemsolving = grepl("problem[- ]?solving", Job_Description, ignore.case=TRUE)) %>%
    mutate(activelearning = grepl("active learning", Job_Description, ignore.case=TRUE)) %>%
    mutate(hypothesis = grepl("hypothesis", Job_Description, ignore.case=TRUE)) %>%
    mutate(organized = grepl("organize", Job_Description, ignore.case=TRUE)) %>%
    mutate(judgement = grepl("judgement", Job_Description, ignore.case=TRUE)) %>%
    mutate(selfstarter = grepl("self[- ]?starter", Job_Description, ignore.case=TRUE)) %>%
    mutate(interpersonalskills = grepl("interpersonal skills", Job_Description, ignore.case=TRUE)) %>%
    mutate(detail_oriented = grepl("attention to detail", Job_Description, ignore.case=TRUE)) %>%
    mutate(visualization = grepl("visualization", Job_Description, ignore.case=TRUE)) %>%
    mutate(leadership = grepl("leadership", Job_Description, ignore.case=TRUE)) %>%
    mutate(presentation = grepl("presentation", Job_Description, ignore.case=TRUE)) %>%
      mutate(passion = grepl("passion", Job_Description, ignore.case=TRUE)) %>%
      mutate(research = grepl("research", Job_Description, ignore.case=TRUE)) %>%
      mutate(teamwork = grepl("teamwork", Job_Description, ignore.case=TRUE)) %>%
      mutate(integrity = grepl("integrity", Job_Description, ignore.case=TRUE)) %>%
    mutate(passionate = grepl("passionate", Job_Description, ignore.case=TRUE)) %>%
    select(Position, Company, workingremote, communication, collaborative, creative, critical, problemsolving, 
      activelearning, hypothesis, organized, judgement, selfstarter, interpersonalskills, detail_oriented, 
      visualization, leadership, presentation, passion, research, teamwork, integrity, passionate)

Set up soft skills for plotting

softskills2 <- softskills %>% 
               select(-(1:2)) %>% 
               summarise_all(sum) %>% 
               gather(variable,value) %>% 
               arrange(desc(value))

Visualizing the most in-demand soft skills:

ggplot(softskills2, aes(x=reorder(variable, value), y=value)) + 
  geom_bar(stat='identity', fill="orange", color="blue") + 
  xlab('') + 
  ylab('Frequency') + 
  labs(title='Soft Skills') + 
  coord_flip() + 
  theme_minimal()

Sentiment analysis using the sentimentr package

  • Sentence-level sentiment averaged over each job description, by individual company posting
scores1 <- sentiment_by(clean_allcsv2$Job_Description, by=NULL) %>%
  pull(ave_sentiment)
head(scores1)
## [1] 0.2539239 0.4040890 0.4040890 0.5159519 0.3710542 0.1971468
clean_allcsv2$sentiment_scores <- scores1
clean_allcsv3 <- clean_allcsv2[,-c(4)] # drop the Job_Description text, keeping the scores
m_wide2 <- clean_allcsv3  %>% 
       select(Company, sentiment_scores)
head(m_wide2)
##                           Company sentiment_scores
## 1 guardian life insurance company        0.2539239
## 2              nyu langone health        0.4040890
## 3      nyu langone medical center        0.4040890
## 4                 epic pharma llc        0.5159519
## 5              nyu langone health        0.3710542
## 6                       macmillan        0.1971468
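
sentimentr can also aggregate sentence-level sentiment directly by a grouping variable, avoiding the manual join of scores back onto the data frame; a minimal sketch, assuming the same clean_allcsv2:

# Hypothetical one-call aggregation of average sentiment per company
by_company <- with(clean_allcsv2,
                   sentiment_by(get_sentences(Job_Description), list(Company)))
head(by_company)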

Plotting the sentiment findings

# Take the 10 highest- and 10 lowest-scoring job listings by company:
# Top 10 job listings
df_top <- m_wide2 %>% 
  top_n(10, sentiment_scores)
# Bottom 10 job listings
df_bottom <- m_wide2 %>% 
  top_n(-10, sentiment_scores)
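In dplyr 1.0.0 and later, slice_max() and slice_min() express the same top/bottom selection more explicitly; a sketch, assuming that newer dplyr is available:

# Hypothetical equivalent using slice_max()/slice_min() (requires dplyr >= 1.0.0)
df_top    <- m_wide2 %>% slice_max(sentiment_scores, n = 10)
df_bottom <- m_wide2 %>% slice_min(sentiment_scores, n = 10)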
  
  
# Bar chart of the 10 highest (most positive) sentiment scores by company

ptop <- ggplot(df_top, aes(x=Company, y=sentiment_scores, fill=Company)) +
  geom_bar(color="black", stat="identity") +
  ggtitle("Top 10 Positive Sentiment Scores of Job Listings by Companies") +
  theme_excel()
ptop <- ptop + theme(axis.text = element_text(size = 10, angle = 90, hjust = 1)) +
  theme(legend.position="bottom")

ptop

# Bar chart of the 10 lowest (least positive) sentiment scores by company
pbot <- ggplot(df_bottom, aes(x=Company, y=sentiment_scores, fill=Company)) +
  geom_bar(color="black", stat="identity") +
  ggtitle("Bottom 10 Lowest Sentiment Scores of Job Listings by Companies") +
  theme_excel()
pbot <- pbot + theme(axis.text = element_text(size = 10, angle = 90, hjust = 1)) +
  theme(legend.position="bottom")

pbot