---
title: "Project 3b - Data Analysis and Visualizations"
author: "Sufian"
date: "10/16/2019"
output: html_document
---
RPubs link:
http://rpubs.com/ssufian/540285
# Load packages
library(rvest)
library(stringr)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(xml2)
library(httr)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(tidytext)
library(magrittr)
library(ggthemes)
library(sentimentr)
# Read Leticia's scrape (SimplyHired postings)
ltcancel <- read.csv("https://raw.githubusercontent.com/ltcancel/Project3/master/SimplyHiredJobs.csv", stringsAsFactors = FALSE)
colnames(ltcancel) <- c("Position", "Company", "Location", "Salary", "URL", "Job_Description")
# Read Salma's scrape (ai-jobs.net postings)
selshahawy <- read.csv("https://raw.githubusercontent.com/salma71/MSDS_2019/master/Fall2019/aquisition_management_607/project_3/jobs_detailsInfo.csv", stringsAsFactors = FALSE)
colnames(selshahawy) <- c("Position", "Company", "Location", "URL", "Job_Description")
# Read Sufian's scrape (Monster postings)
ssufian <- read.csv("https://raw.githubusercontent.com/Luz917/data607project3_ssufian_monster_jobs/master/monsterjobs.csv", stringsAsFactors = FALSE)
colnames(ssufian) <- c("Position", "Company", "Location", "Salary", "URL", "Job_Description")
# Combine the three sources with full outer joins on their shared columns
twocsv <- merge(ltcancel, selshahawy, all = TRUE)
allcsv <- merge(twocsv, ssufian, all = TRUE)
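With all = TRUE, merge() performs a full outer join on whatever columns the frames share, filling NA where a source lacks a field (Salma's file, for example, has no Salary column). A minimal sketch with made-up rows, not project data:

a <- data.frame(Position = "analyst", Salary = 90000)
b <- data.frame(Position = "engineer")
merge(a, b, all = TRUE)
##   Position Salary
## 1  analyst  90000
## 2 engineer     NA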
library("magrittr")
# Drop the URL and Salary columns (positions 4 and 6 after the merges)
clean_allcsv <- allcsv[, -c(4, 6)]
# Lowercase the text columns (mutate_at replaces the deprecated mutate_each/funs idiom)
clean_allcsv <- clean_allcsv %>% mutate_at(vars(Position, Company, Location, Job_Description), tolower)
# Strip line breaks, then replace punctuation runs with single spaces;
# [[:punct:]] covers the periods, asterisks, plus signs, backslashes, and
# parentheses, and str_squish() collapses the leftover whitespace
clean_allcsv1 <- str_replace_all(clean_allcsv$Job_Description, "[\r\n]", " ")
clean_allcsv1 <- str_replace_all(clean_allcsv1, "[[:punct:]]+", " ")
clean_allcsv1 <- str_squish(clean_allcsv1)
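As a quick sanity check of the punctuation rule (a toy string, not project data), replacing punctuation runs with spaces keeps slash- and hyphen-separated terms as separate words, which the skill matching below relies on:

str_squish(str_replace_all("Self-starter (R/Python) + SQL.", "[[:punct:]]+", " "))
## [1] "Self starter R Python SQL"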
#head(clean_allcsv1 )
# Rebuild the frame around the cleaned text (used for sentiment analysis later)
clean_allcsv2 <- clean_allcsv[, -c(4)]
clean_allcsv2$Job_Description <- clean_allcsv1
clean_allcsv2$Job_Description[1] # spot-check the first cleaned summary
tfidf <- clean_allcsv2$Job_Description
# Control list shared by all corpora: strip punctuation and stopwords,
# lowercase, and weight terms by TF-IDF
control_list <- list(removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE,
                     weighting = weightTfIdf)
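To see what weightTfIdf does, here is a minimal sketch on a made-up two-document corpus: a term that occurs in every document gets an inverse-document-frequency of zero, so only distinguishing terms carry weight.

# Toy corpus (not project data): "sql" appears in both documents, so its
# TF-IDF weight is zero; "python" and "spark" keep positive weights
toy <- VCorpus(VectorSource(c("python sql python", "sql spark")))
as.matrix(TermDocumentMatrix(toy, control = list(weighting = weightTfIdf)))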
# TF-IDF over the combined postings from the three job boards
corpus.all <- VCorpus(VectorSource(tfidf))
tdm.all <- TermDocumentMatrix(corpus.all, control = control_list)
#inspect(tdm.all)
# Prune very rare terms: keep only terms present in at least ~40% of documents
tdm.60 <- removeSparseTerms(tdm.all, sparse = 0.60)
#inspect(tdm.60)
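To gauge how aggressive the pruning is, compare the vocabulary sizes before and after (the exact counts depend on the scraped data, so none are shown here):

# nTerms() is a tm accessor for the number of distinct terms in a TDM
nTerms(tdm.all)
nTerms(tdm.60)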
# Sum TF-IDF scores across documents and build a data frame, sorted ascending
v_all <- sort(rowSums(as.matrix(tdm.60)))
df_all <- data.frame(words = names(v_all), count = v_all, stringsAsFactors = FALSE)
#View(df_all)
# Top 25 terms by summed TF-IDF across the three job boards
ggplot(tail(df_all, 25), aes(reorder(words, count), count)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "TF-IDF of 3 Job Boards: ai-jobs.net, SimplyHired.com, Monster.com",
       x = "Words", y = "Summed TF-IDF weight") +
  coord_flip()
# Build a volatile corpus for the raw word-frequency analysis
docs <- VCorpus(VectorSource(clean_allcsv2$Job_Description))
# Replace separators with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Collapse extra white space
docs <- tm_map(docs, stripWhitespace)
# Text stemming
#docs <- tm_map(docs, stemDocument)
# Raw term frequencies for the word cloud and bar chart
dtmw <- TermDocumentMatrix(docs)
m <- as.matrix(dtmw)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
#head(d, 10)
The key skill sets can be illustrated in a word cloud as follows:
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
barplot(d[1:10, ]$freq, las = 2, names.arg = d[1:10, ]$word,
        col = "lightblue", main = "Most Frequent Words in Data Scientist Job Postings",
        ylab = "Word frequencies")
# Flag tool skills mentioned in each job description; \b word boundaries keep
# the single-letter "R" and "java" from matching inside longer words
toolskills <- clean_allcsv2 %>%
mutate(R = grepl("\\bR\\b", Job_Description, ignore.case=TRUE)) %>%
mutate(python = grepl("Python", Job_Description, ignore.case=TRUE)) %>%
mutate(SQL = grepl("SQL", Job_Description, ignore.case=TRUE)) %>%
mutate(hadoop = grepl("hadoop", Job_Description, ignore.case=TRUE)) %>%
mutate(perl = grepl("perl", Job_Description, ignore.case=TRUE)) %>%
mutate(matplotlib = grepl("matplotlib", Job_Description, ignore.case=TRUE)) %>%
# C++ is matched on the pre-stripping text, since "+" was removed above
mutate(Cplusplus = grepl("c++", clean_allcsv$Job_Description, fixed=TRUE)) %>%
mutate(VB = grepl("VB", Job_Description, ignore.case=TRUE)) %>%
mutate(java = grepl("java\\b", Job_Description, ignore.case=TRUE)) %>%
mutate(scala = grepl("scala", Job_Description, ignore.case=TRUE)) %>%
mutate(tensorflow = grepl("tensorflow", Job_Description, ignore.case=TRUE)) %>%
mutate(javascript = grepl("javascript", Job_Description, ignore.case=TRUE)) %>%
mutate(spark = grepl("spark", Job_Description, ignore.case=TRUE)) %>%
select(Position, Company, R, python, SQL, hadoop, perl, matplotlib, Cplusplus,
       VB, java, scala, tensorflow, javascript, spark)
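Matching the single-letter language R is the trickiest flag: without the \b word boundaries, any word containing an "r" would count. A toy check on made-up strings (not project data):

grepl("\\bR\\b", c("experience in r and python", "particular attention"), ignore.case = TRUE)
## [1]  TRUE FALSE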
toolskills2 <- toolskills %>%
select(-(1:2)) %>%
summarise_all(sum) %>%
gather(variable,value) %>%
arrange(desc(value))
ggplot(toolskills2, aes(x=reorder(variable, value), y=value)) +
geom_bar(stat='identity', fill="green", color="black") +
xlab('') +
ylab('Frequency') +
labs(title='Tool Skills') +
coord_flip() +
theme_minimal()
# Flag hard skills mentioned in each job description
hardskills <- clean_allcsv2 %>%
mutate(machinelearning = grepl("machine learning", Job_Description, ignore.case=TRUE)) %>%
mutate(modeling = grepl("model", Job_Description, ignore.case=TRUE)) %>%
mutate(statistics = grepl("statistics", Job_Description, ignore.case=TRUE)) %>%
mutate(programming = grepl("programming", Job_Description, ignore.case=TRUE)) %>%
mutate(quantitative = grepl("quantitative", Job_Description, ignore.case=TRUE)) %>%
mutate(debugging = grepl("debugging", Job_Description, ignore.case=TRUE)) %>%
mutate(statistical = grepl("statistical", Job_Description, ignore.case=TRUE)) %>%
mutate(regression = grepl("regression", Job_Description, ignore.case=TRUE)) %>%
select(Position, Company, machinelearning, modeling, statistics, programming, quantitative, debugging, statistical, regression)
hardskills2 <- hardskills %>%
select(-(1:2)) %>%
summarise_all(sum) %>%
gather(variable,value) %>%
arrange(desc(value))
ggplot(hardskills2,aes(x=reorder(variable, value), y=value)) +
geom_bar(stat='identity',fill="yellow",color="black") +
xlab('') +
ylab('Frequency') +
labs(title='Hard Skills') +
coord_flip() +
theme_minimal()
# Flag soft skills mentioned in each job description
softskills <- clean_allcsv2 %>%
mutate(workingremote = grepl("working remote", Job_Description, ignore.case=TRUE)) %>%
mutate(communication = grepl("communication", Job_Description, ignore.case=TRUE)) %>%
mutate(collaborative = grepl("collaborat", Job_Description, ignore.case=TRUE)) %>% # stem catches collaborate/collaborative/collaboration
mutate(creative = grepl("creative", Job_Description, ignore.case=TRUE)) %>%
mutate(critical = grepl("critical", Job_Description, ignore.case=TRUE)) %>%
mutate(problemsolving = grepl("problem solving", Job_Description, ignore.case=TRUE)) %>%
mutate(activelearning = grepl("active learning", Job_Description, ignore.case=TRUE)) %>%
mutate(hypothesis = grepl("hypothesis", Job_Description, ignore.case=TRUE)) %>%
mutate(organized = grepl("organize", Job_Description, ignore.case=TRUE)) %>%
mutate(judgement = grepl("judge?ment", Job_Description, ignore.case=TRUE)) %>% # matches both spellings
mutate(selfstarter = grepl("self starter", Job_Description, ignore.case=TRUE)) %>%
mutate(interpersonalskills = grepl("interpersonal skills", Job_Description, ignore.case=TRUE)) %>%
mutate(detail_oriented = grepl("attention to detail", Job_Description, ignore.case=TRUE)) %>%
mutate(visualization = grepl("visualization", Job_Description, ignore.case=TRUE)) %>%
mutate(leadership = grepl("leadership", Job_Description, ignore.case=TRUE)) %>%
mutate(presentation = grepl("presentation", Job_Description, ignore.case=TRUE)) %>%
mutate(passion = grepl("passion", Job_Description, ignore.case=TRUE)) %>%
mutate(research = grepl("research", Job_Description, ignore.case=TRUE)) %>%
mutate(teamwork = grepl("teamwork", Job_Description, ignore.case=TRUE)) %>%
mutate(integrity = grepl("integrity", Job_Description, ignore.case=TRUE)) %>%
mutate(passionate= grepl("passionate", Job_Description, ignore.case=TRUE)) %>%
select(Position, Company, workingremote, communication, collaborative, creative, critical, problemsolving,
activelearning, hypothesis, organized, judgement, selfstarter, interpersonalskills, detail_oriented,
visualization, leadership,presentation,passion,research,teamwork,integrity,passionate)
softskills2 <- softskills %>%
select(-(1:2)) %>%
summarise_all(sum) %>%
gather(variable,value) %>%
arrange(desc(value))
ggplot(softskills2, aes(x=reorder(variable, value), y=value)) +
geom_bar(stat='identity', fill="orange", color="blue") +
xlab('') +
ylab('Frequency') +
labs(title='Soft Skills') +
coord_flip() +
theme_minimal()
# Average sentiment score per job description (sentence-level average from sentimentr)
scores1 <- sentiment_by(clean_allcsv2$Job_Description, by = NULL) %>%
  pull(ave_sentiment)
head(scores1)
## [1] 0.2539239 0.4040890 0.4040890 0.5159519 0.3710542 0.1971468
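ave_sentiment is the average sentence-level polarity sentimentr assigns to each description. A minimal sketch on made-up sentences (not project data) shows the shape of the result:

# Returns one row per element with word_count, sd, and ave_sentiment columns
sentiment_by(c("We offer excellent benefits and great mentorship.",
               "Expect tight deadlines and high stress."))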
clean_allcsv2$sentiment_scores <- scores1
clean_allcsv3 <- clean_allcsv2[, -c(4)] # drop the job text now that scores are attached
m_wide2 <- clean_allcsv3 %>%
  select(Company, sentiment_scores)
head(m_wide2)
## Company sentiment_scores
## 1 guardian life insurance company 0.2539239
## 2 nyu langone health 0.4040890
## 3 nyu langone medical center 0.4040890
## 4 epic pharma llc 0.5159519
## 5 nyu langone health 0.3710542
## 6 macmillan 0.1971468
# Compare the 10 highest and 10 lowest sentiment scores across companies
# Top 10 job listings by sentiment
df_top <- m_wide2 %>%
  top_n(10, sentiment_scores)
# Bottom 10 job listings by sentiment
df_bottom <- m_wide2 %>%
  top_n(-10, sentiment_scores)
# Bar chart of the 10 most positive job-listing sentiment scores by company
ptop <- ggplot(df_top, aes(x = Company, y = sentiment_scores, fill = Company)) +
  geom_bar(color = "black", stat = "identity") +
  ggtitle("Top 10 Positive Sentiment Scores of Job Listings by Company") +
  theme_excel()
ptop <- ptop + theme(axis.text = element_text(size = 10, angle = 90, hjust = 1)) +
  theme(legend.position = "bottom")
ptop
# Bar chart of the 10 least positive job-listing sentiment scores by company
pbot <- ggplot(df_bottom, aes(x = Company, y = sentiment_scores, fill = Company)) +
  geom_bar(color = "black", stat = "identity") +
  ggtitle("Bottom 10 Sentiment Scores of Job Listings by Company") +
  theme_excel()
pbot <- pbot + theme(axis.text = element_text(size = 10, angle = 90, hjust = 1)) +
  theme(legend.position = "bottom")
pbot