Help the Human Resources department recruit employees with similar characteristics based on the pros (positive) reviews, and improve working conditions based on the cons (negative) reviews
Employees at both Google and Amazon praise their workplaces for similar traits, for example "Smart people", "Fast paced" and "great pay". These traits could help the Human Resources department recruit employees who value them. Conversely, employees at both companies complain about similar traits, for example "long hours", "work life" and "life balance". These traits can help the Human Resources department detect problems in the current working conditions.
The data set is composed of two CSV files (Google and Amazon) scraped from Glassdoor; each file contains 500 pros (positive) reviews and 500 cons (negative) reviews.
library(RCurl)  # for getURL()
amazon_url <- getURL('http://s3.amazonaws.com/assets.datacamp.com/production/course_935/datasets/500_amzn.csv',
                     ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
amazon <- read.csv(textConnection(amazon_url), header = TRUE)
str(amazon) ### the data contains both a pros and a cons column of employee reviews
## 'data.frame': 500 obs. of 4 variables:
## $ pg_num: int 50 50 50 50 50 50 50 50 50 50 ...
## $ url : Factor w/ 58 levels "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 44 44 44 44 44 44 44 44 44 44 ...
## $ pros : Factor w/ 496 levels "- Learn a lot, haven't been bored yet.",..: 477 56 152 349 359 367 183 417 210 352 ...
## $ cons : Factor w/ 495 levels "#NAME?","*Depending on your manager, might work long hours",..: 156 276 246 89 288 187 374 212 112 165 ...
google_url <- getURL('http://s3.amazonaws.com/assets.datacamp.com/production/course_935/datasets/500_goog.csv',
                     ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
google <- read.csv(textConnection(google_url), header = TRUE)
str(google) ### the data contains both a pros and a cons column of employee reviews
## 'data.frame': 501 obs. of 4 variables:
## $ pg_num: int 1 1 1 1 1 1 1 1 1 1 ...
## $ url : Factor w/ 50 levels "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ pros : Factor w/ 492 levels "- Access to a vast wealth of technical resources and people",..: 21 24 486 12 410 233 413 376 314 384 ...
## $ cons : Factor w/ 491 levels "- Bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow innovation and d"| __truncated__,..: 17 22 176 6 296 63 453 447 186 113 ...
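One detail worth flagging in the str() output above: the Amazon cons column contains the literal string "#NAME?" among its factor levels, a spreadsheet export artifact rather than a real review. A quick count of the affected rows, as a sanity check before cleaning:
sum(as.character(amazon$cons) == "#NAME?", na.rm = TRUE)  # artifact rows in Amazon cons
sum(as.character(google$cons) == "#NAME?", na.rm = TRUE)  # same check for Google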
# Extract Amazon pros reviews
amazon_pros <- amazon$pros
# Extract Amazon cons reviews
amazon_cons <- amazon$cons
# Extract Google pros reviews
google_pros <- google$pros
# Extract Google cons reviews
google_cons <- google$cons
library(qdap)
# qdap_clean: text-level cleaning with qdap, applied before building a corpus
qdap_clean <- function(x) {
  x <- replace_abbreviation(x)  # expand common abbreviations
  x <- replace_contraction(x)   # e.g. "isn't" -> "is not"
  x <- replace_number(x)        # digits to words, e.g. "60" -> "sixty"
  x <- replace_ordinal(x)       # e.g. "1st" -> "first"
  x <- replace_symbol(x)        # e.g. "&" -> "and"
  x <- tolower(x)
  return(x)
}
library(tm)
# tm_clean: corpus-level cleaning with tm
tm_clean <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  # remove standard English stopwords plus the company names and the generic
  # word "company", which would otherwise dominate every frequency plot
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "Google", "google", "amazon", "Amazon", "company"))
  return(corpus)
}
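To see what these two cleaning steps actually do, here is a quick sanity check on a made-up review (the sample text is purely illustrative):
sample_review <- "Great pay & smart people, but you will work 60+ hour weeks."
sample_clean <- qdap_clean(sample_review)
sample_corp <- tm_clean(VCorpus(VectorSource(sample_clean)))
content(sample_corp[[1]])  # numbers spelled out, symbols expanded, stopwords removed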
# Clean the raw reviews with qdap, build corpora, then clean with tm
amzn_pros_corp <- tm_clean(VCorpus(VectorSource(qdap_clean(amazon_pros))))
amzn_cons_corp <- tm_clean(VCorpus(VectorSource(qdap_clean(amazon_cons))))
goog_pros_corp <- tm_clean(VCorpus(VectorSource(qdap_clean(google_pros))))
goog_cons_corp <- tm_clean(VCorpus(VectorSource(qdap_clean(google_cons))))
# Create the Amazon cons term-document matrix
amzn_cons_tdm <- TermDocumentMatrix(amzn_cons_corp)
# Matrix version of amzn_cons_tdm
amzn_cons_tdm_m <- as.matrix(amzn_cons_tdm)
# Term frequencies, then the 25 most common cons terms
amzn_cons_freq <- rowSums(amzn_cons_tdm_m)
tp25_amzn_cons_freq <- sort(amzn_cons_freq, decreasing = TRUE)[1:25]
# Create wordcloud of negative Amazon unigrams
suppressWarnings(library(wordcloud))
pal2 <- brewer.pal(8, "Dark2")
wordcloud(names(tp25_amzn_cons_freq), tp25_amzn_cons_freq,
          max.words = 25, colors = pal2)
# Plot interpretation: the larger a word appears, the more frequently it occurs in the reviews.
# The wordcloud suggests "work", "hours" and "people" are what Amazon employees complain about most.
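# Word size only ranks the terms; to read the exact counts behind the cloud:
head(tp25_amzn_cons_freq, 3)  # counts of the three most frequent cons terms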
# Repeat for the Amazon pros reviews
amzn_pros_tdm <- TermDocumentMatrix(amzn_pros_corp)
amzn_pros_tdm_m <- as.matrix(amzn_pros_tdm)
amzn_pros_freq <- rowSums(amzn_pros_tdm_m)
tp25_amzn_pros_freq <- sort(amzn_pros_freq, decreasing = TRUE)[1:25]
wordcloud(names(tp25_amzn_pros_freq), tp25_amzn_pros_freq,
          max.words = 25, colors = pal2)
# Google cons: same pipeline, with a blue-green palette
goog_cons_tdm <- TermDocumentMatrix(goog_cons_corp)
goog_cons_tdm_m <- as.matrix(goog_cons_tdm)
goog_cons_freq <- rowSums(goog_cons_tdm_m)
tp25_goog_cons_freq <- sort(goog_cons_freq, decreasing = TRUE)[1:25]
pal <- brewer.pal(9, "BuGn")
wordcloud(names(tp25_goog_cons_freq), tp25_goog_cons_freq,
          max.words = 25, colors = pal)
# Google pros
goog_pros_tdm <- TermDocumentMatrix(goog_pros_corp)
goog_pros_tdm_m <- as.matrix(goog_pros_tdm)
goog_pros_freq <- rowSums(goog_pros_tdm_m)
tp25_goog_pros_freq <- sort(goog_pros_freq, decreasing = TRUE)[1:25]
wordcloud(names(tp25_goog_pros_freq), tp25_goog_pros_freq,
          max.words = 25, colors = pal2)
# Collapse each company's pros reviews into one document, then build a
# two-document corpus (Amazon vs Google) for the commonality cloud
amazon_pros_all <- paste(amazon_pros, collapse = " ")
google_pros_all <- paste(google_pros, collapse = " ")
all_pros <- qdap_clean(c(amazon_pros_all, google_pros_all))
all_pros_corps <- tm_clean(VCorpus(VectorSource(all_pros)))
all_pros_tdm <- TermDocumentMatrix(all_pros_corps)
all_pros_m <- as.matrix(all_pros_tdm)
all_pros_freq <- rowSums(all_pros_m)
commonality.cloud(all_pros_m, colors = "#D82016", max.words = 25)
tp5_all_pros_freq <- sort(all_pros_freq, decreasing = TRUE)[1:5]
names(tp5_all_pros_freq)
## [1] "great" "work" "good" "people" "benefits"
tp5_all_pros_freq_m <- as.matrix(tp5_all_pros_freq)
library(plotly)
plot_ly(
  x = row.names(tp5_all_pros_freq_m),
  y = tp5_all_pros_freq_m[, 1],
  name = "tp5_all_pros",
  type = "bar",
  color = I("red")
) %>%
  layout(title = "Top 5 common unigram words in pros reviews",
         yaxis = list(title = "Count"))
# Same procedure for the cons reviews
amazon_cons_all <- paste(amazon_cons, collapse = " ")
google_cons_all <- paste(google_cons, collapse = " ")
all_cons <- qdap_clean(c(amazon_cons_all, google_cons_all))
all_cons_corps <- tm_clean(VCorpus(VectorSource(all_cons)))
all_cons_tdm <- TermDocumentMatrix(all_cons_corps)
all_cons_m <- as.matrix(all_cons_tdm)
all_cons_freq <- rowSums(all_cons_m)
commonality.cloud(all_cons_m, colors = "#4682b4", max.words = 25)
***
#### 3.4 Bar chart of top 5 common unigram words in "cons" reviews
##### Hover over the bars to see the exact counts.
tp5_all_cons_freq <- sort(all_cons_freq, decreasing = TRUE)[1:5]
tp5_all_cons_freq_m <- as.matrix(tp5_all_cons_freq)
plot_ly(
  x = row.names(tp5_all_cons_freq_m),
  y = tp5_all_cons_freq_m[, 1],
  name = "tp5_all_cons",
  type = "bar"
) %>%
  layout(title = "Top 5 common unigram words in cons reviews",
         yaxis = list(title = "Count"))
library(RWeka)
# Flatten the cleaned Amazon pros corpus back to a data frame of text
amzn_pros_df <- data.frame(text = unlist(sapply(amzn_pros_corp, '[', "content")),
                           stringsAsFactors = FALSE)
# Tokenize into bigrams (min = 2, max = 2), splitting on whitespace and punctuation
token_delim <- " \\t\\r\\n.!?,;\"()"
bi_amzn_pros <- NGramTokenizer(amzn_pros_df, Weka_control(min = 2, max = 2, delimiters = token_delim))
# Count each bigram, sort by frequency, and keep the top 10
bi_amzn_pros_df <- data.frame(table(bi_amzn_pros))
bi_amzn_pros_order <- bi_amzn_pros_df[order(bi_amzn_pros_df$Freq, decreasing = TRUE), ]
top10_bi_amzn_pros <- bi_amzn_pros_order[1:10, ]
wordcloud(words = top10_bi_amzn_pros$bi_amzn_pros, freq = top10_bi_amzn_pros$Freq,
          colors = brewer.pal(9, "Purples"))
# Amazon cons bigrams
amzn_cons_df <- data.frame(text = unlist(sapply(amzn_cons_corp, '[', "content")),
                           stringsAsFactors = FALSE)
bi_amzn_cons <- NGramTokenizer(amzn_cons_df, Weka_control(min = 2, max = 2, delimiters = token_delim))
bi_amzn_cons_df <- data.frame(table(bi_amzn_cons))
bi_amzn_cons_order <- bi_amzn_cons_df[order(bi_amzn_cons_df$Freq, decreasing = TRUE), ]
top10_bi_amzn_cons <- bi_amzn_cons_order[1:10, ]
wordcloud(words = top10_bi_amzn_cons$bi_amzn_cons, freq = top10_bi_amzn_cons$Freq,
          colors = brewer.pal(9, "Purples"))
# Google pros bigrams (delimiters added here for consistency with the other blocks)
goog_pros_df <- data.frame(text = unlist(sapply(goog_pros_corp, '[', "content")),
                           stringsAsFactors = FALSE)
bi_goog_pros <- NGramTokenizer(goog_pros_df, Weka_control(min = 2, max = 2, delimiters = token_delim))
bi_goog_pros_df <- data.frame(table(bi_goog_pros))
bi_goog_pros_order <- bi_goog_pros_df[order(bi_goog_pros_df$Freq, decreasing = TRUE), ]
top10_bi_goog_pros <- bi_goog_pros_order[1:10, ]
wordcloud(words = top10_bi_goog_pros$bi_goog_pros, freq = top10_bi_goog_pros$Freq, colors = pal)
# Google cons bigrams
goog_cons_df <- data.frame(text = unlist(sapply(goog_cons_corp, '[', "content")),
                           stringsAsFactors = FALSE)
bi_goog_cons <- NGramTokenizer(goog_cons_df, Weka_control(min = 2, max = 2, delimiters = token_delim))
bi_goog_cons_df <- data.frame(table(bi_goog_cons))
bi_goog_cons_order <- bi_goog_cons_df[order(bi_goog_cons_df$Freq, decreasing = TRUE), ]
top10_bi_goog_cons <- bi_goog_cons_order[1:10, ]
wordcloud(words = top10_bi_goog_cons$bi_goog_cons, freq = top10_bi_goog_cons$Freq, colors = pal)
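The four bigram blocks above repeat the same steps. A small helper could replace them; this is purely a refactoring sketch, and top_bigrams is not part of the original analysis:
# Hypothetical helper: top-n bigram frequency table for a cleaned corpus
top_bigrams <- function(corpus, n = 10) {
  df <- data.frame(text = unlist(sapply(corpus, '[', "content")),
                   stringsAsFactors = FALSE)
  grams <- NGramTokenizer(df, Weka_control(min = 2, max = 2, delimiters = token_delim))
  freqs <- data.frame(table(grams))
  head(freqs[order(freqs$Freq, decreasing = TRUE), ], n)
}
# e.g. top_bigrams(goog_cons_corp) reproduces top10_bi_goog_cons
# (with columns named grams/Freq instead of bi_goog_cons/Freq)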
library(dplyr)
# Join the Amazon and Google pros bigram counts to find bigrams common to both companies
all_pros_df <- left_join(bi_amzn_pros_df, bi_goog_pros_df,
                         by = c("bi_amzn_pros" = "bi_goog_pros"))
colnames(all_pros_df) <- c("Term", "Amazon Pros", "Google Pros")
all_pros_df <- na.omit(all_pros_df)  # keep only bigrams that appear in both companies
all_pros_df$Total <- all_pros_df[, 2] + all_pros_df[, 3]
all_pros_total <- all_pros_df %>% arrange(desc(Total)) %>% head(5)
plot_ly(all_pros_total, x = ~Term, y = ~`Amazon Pros`, type = 'bar', name = 'Amazon Pros') %>%
  add_trace(y = ~`Google Pros`, name = 'Google Pros') %>%
  layout(yaxis = list(title = 'Count'),
         xaxis = list(title = 'Top 5 common bigram words in pros reviews'),
         barmode = 'stack')
# Same join for the cons bigram counts
all_cons_df <- left_join(bi_amzn_cons_df, bi_goog_cons_df,
                         by = c("bi_amzn_cons" = "bi_goog_cons"))
colnames(all_cons_df) <- c("Term", "Amazon Cons", "Google Cons")
all_cons_df <- na.omit(all_cons_df)  # keep only bigrams that appear in both companies
all_cons_df$Total <- all_cons_df[, 2] + all_cons_df[, 3]
all_cons_total <- all_cons_df %>% arrange(desc(Total)) %>% head(5)
plot_ly(all_cons_total, x = ~Term, y = ~`Amazon Cons`, type = 'bar', name = 'Amazon Cons') %>%
  add_trace(y = ~`Google Cons`, name = 'Google Cons') %>%
  layout(yaxis = list(title = 'Count'),
         xaxis = list(title = 'Top 5 common bigram words in cons reviews'),
         barmode = 'stack')
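If the Human Resources department wants the underlying numbers rather than the interactive charts, the joined tables can be exported directly (a minimal sketch; the file names are arbitrary):
write.csv(all_pros_total, "common_pros_bigrams.csv", row.names = FALSE)
write.csv(all_cons_total, "common_cons_bigrams.csv", row.names = FALSE)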