Objective:

Help the Human Resources Department recruit employees with similar characteristics, based on the pros (positive) reviews, and improve working conditions, based on the cons (negative) reviews.

Summary :


Employees at both Google and Amazon praise similar aspects of their working conditions, for example: “Smart people”, “Fast paced” and “great pay”. These traits could help the Human Resources Department recruit employees with similar traits.
However, employees at both Google and Amazon also report similar complaints about their working conditions, for example: “long hours”, “work life”, and “life balance”. These complaints can help the Human Resources Department detect problems in the current working conditions.

Data set introduction:

The data set is composed of two CSV files (Google and Amazon) from Glassdoor; each file contains 500 pros (positive) reviews and 500 cons (negative) reviews.

Method:


Code:

1 Import data and data cleaning

1.1 Loading the package

1.2 Fetch Amazon Employee Reviews data from URL

# Download the Amazon review CSV as a single string.
# The URL is plain http, so no TLS handshake takes place and the former
# ssl.verifyhost / ssl.verifypeer = FALSE overrides had no effect; they are
# dropped so that certificate checking is not disabled by example.
amazon_url <- getURL("http://s3.amazonaws.com/assets.datacamp.com/production/course_935/datasets/500_amzn.csv")

# Parse the downloaded text. read.csv() only closes connections it opened
# itself, and a textConnection is already open when created, so it is closed
# explicitly here instead of being leaked.
amazon_con <- textConnection(amazon_url)
amazon <- read.csv(amazon_con, header = TRUE)
close(amazon_con)

str(amazon) ### There are both pros and cons in the employee reviews from the data
## 'data.frame':    500 obs. of  4 variables:
##  $ pg_num: int  50 50 50 50 50 50 50 50 50 50 ...
##  $ url   : Factor w/ 58 levels "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 44 44 44 44 44 44 44 44 44 44 ...
##  $ pros  : Factor w/ 496 levels "- Learn a lot, haven't been bored yet.",..: 477 56 152 349 359 367 183 417 210 352 ...
##  $ cons  : Factor w/ 495 levels "#NAME?","*Depending on your manager, might work long hours",..: 156 276 246 89 288 187 374 212 112 165 ...

1.3 Fetch Google Employee Reviews data from URL

# Download the Google review CSV as a single string.
# The URL is plain http, so no TLS handshake takes place and the former
# ssl.verifyhost / ssl.verifypeer = FALSE overrides had no effect; they are
# dropped so that certificate checking is not disabled by example.
google_url <- getURL("http://s3.amazonaws.com/assets.datacamp.com/production/course_935/datasets/500_goog.csv")

# Parse the downloaded text. read.csv() only closes connections it opened
# itself, and a textConnection is already open when created, so it is closed
# explicitly here instead of being leaked.
google_con <- textConnection(google_url)
google <- read.csv(google_con, header = TRUE)
close(google_con)

str(google) ### There are both pros and cons in the employee reviews from the data
## 'data.frame':    501 obs. of  4 variables:
##  $ pg_num: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ url   : Factor w/ 50 levels "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ pros  : Factor w/ 492 levels "- Access to a vast wealth of technical resources and people",..: 21 24 486 12 410 233 413 376 314 384 ...
##  $ cons  : Factor w/ 491 levels "- Bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow innovation and d"| __truncated__,..: 17 22 176 6 296 63 453 447 186 113 ...

1.4 Create the Pros and Cons review vectors for both companies

# Pull the positive ("pros") and negative ("cons") review columns out of each
# company's data frame so they can be cleaned and analysed separately.

# Amazon reviews
amazon_pros <- amazon[["pros"]]
amazon_cons <- amazon[["cons"]]

# Google reviews
google_pros <- google[["pros"]]
google_cons <- google[["cons"]]

1.5 Build cleaning function based on “qdap” package for Text organization

# Normalise raw review text with the qdap replacement helpers.
#
# x: the text to clean.
# Returns x after it has been passed, in order, through
# replace_abbreviation(), replace_contraction(), replace_number(),
# replace_ordinal(), replace_symbol() and finally tolower().
qdap_clean <- function(x) {
  cleaning_steps <- list(
    replace_abbreviation,
    replace_contraction,
    replace_number,
    replace_ordinal,
    replace_symbol,
    tolower
  )
  # Feed the output of each step into the next one, starting from x.
  Reduce(function(text, step) step(text), cleaning_steps, x)
}

1.6 Build cleaning function based on “tm” package for Text organization

  # Clean a tm corpus for term counting.
  #
  # corpus: a tm corpus (built with VCorpus() elsewhere in this script).
  # Returns the corpus with punctuation removed, English stop words and the
  # company-specific words removed, and runs of whitespace collapsed.
  tm_clean <- function(corpus) {
    corpus <- tm_map(corpus, removePunctuation)
    # The company names and "company" are treated as stop words because they
    # appear in most reviews and would otherwise dominate the frequency counts.
    corpus <- tm_map(corpus, removeWords,
                     c(stopwords("en"), "Google", "google", "amazon", "Amazon", "company"))
    # Collapse whitespace last: removing words leaves gaps behind, so running
    # stripWhitespace before removeWords (as before) left them in the text.
    corpus <- tm_map(corpus, stripWhitespace)
    corpus
  }

1.8 Text Organization for Amazon.

To keep the code tidy and readable, the names are abbreviated as follows: amzn = amazon
# Amazon pros: normalise the raw text with qdap_clean(), wrap it in a corpus
# (kept in amzn_pros), then apply the tm-based cleaning.
amzn_pros <- VCorpus(VectorSource(qdap_clean(amazon_pros)))
amzn_pros_corp <- tm_clean(amzn_pros)

# Amazon cons: same pipeline.
amzn_cons <- VCorpus(VectorSource(qdap_clean(amazon_cons)))
amzn_cons_corp <- tm_clean(amzn_cons)

1.9 Text Organization for Google

To keep the code tidy and readable, the names are abbreviated as follows: goog = google
# Google reviews: normalise the raw text with qdap_clean() first ...
goog_pros <- qdap_clean(google_pros)
goog_cons <- qdap_clean(google_cons)

# ... then wrap each cleaned vector in a corpus and apply the tm-based cleaning.
goog_pros_corp <- tm_clean(VCorpus(VectorSource(goog_pros)))
goog_cons_corp <- tm_clean(VCorpus(VectorSource(goog_cons)))

2 Unigram word analysis

2.1 Top 25 unigram words in Amazon cons reviews

# Term-document matrix for the cleaned Amazon "cons" corpus
amzn_cons_tdm <- TermDocumentMatrix(amzn_cons_corp)

# Plain matrix version of the term-document matrix
amzn_cons_tdm_m <- as.matrix(amzn_cons_tdm)

# Total count of every term across all reviews, then the 25 most frequent.
# head() is used instead of [1:25] so that a corpus with fewer than 25 distinct
# terms yields a shorter vector rather than one padded with NA entries.
amzn_cons_freq <- rowSums(amzn_cons_tdm_m)
tp25_amzn_cons_freq <- head(sort(amzn_cons_freq, decreasing = TRUE), 25)

# Word cloud of the top unigrams in Amazon's negative reviews
suppressWarnings(library(wordcloud))
pal2 <- brewer.pal(8, "Dark2")
wordcloud(names(tp25_amzn_cons_freq), tp25_amzn_cons_freq,
          max.words = 25, colors = pal2)

# Reading the plot: the bigger a word is drawn, the more often it occurs in the reviews.
# As the word cloud shows, "work", "hours" and "people" are the terms Amazon
# employees mention most often in their negative reviews.

2.2 Top 25 unigram words in Amazon pros reviews

# Count how often each term occurs across Amazon's positive reviews.
amzn_pros_tdm <- TermDocumentMatrix(amzn_pros_corp)
amzn_pros_tdm_m <- as.matrix(amzn_pros_tdm)
amzn_pros_freq <- rowSums(amzn_pros_tdm_m)

# Keep the 25 highest counts (largest first) and draw them as a word cloud.
tp25_amzn_pros_freq <- sort(amzn_pros_freq, decreasing = TRUE)[seq_len(25)]

wordcloud(
  words = names(tp25_amzn_pros_freq),
  freq = tp25_amzn_pros_freq,
  max.words = 25,
  colors = pal2
)


2.3 Top 25 unigram words in google cons reviews

# Count how often each term occurs across Google's negative reviews.
goog_cons_tdm <- TermDocumentMatrix(goog_cons_corp)
goog_cons_tdm_m <- as.matrix(goog_cons_tdm)
goog_cons_freq <- rowSums(goog_cons_tdm_m)

# Keep the 25 highest counts, largest first.
tp25_goog_cons_freq <- sort(goog_cons_freq, decreasing = TRUE)[seq_len(25)]

# `pal` is defined here for the Google bigram clouds further down;
# this unigram cloud itself is drawn with `pal2`.
pal <- brewer.pal(9, "BuGn")
wordcloud(
  words = names(tp25_goog_cons_freq),
  freq = tp25_goog_cons_freq,
  max.words = 25,
  colors = pal2
)


2.4 Top 25 unigram words in google pros reviews

# Count how often each term occurs across Google's positive reviews.
goog_pros_tdm <- TermDocumentMatrix(goog_pros_corp)
goog_pros_tdm_m <- as.matrix(goog_pros_tdm)
goog_pros_freq <- rowSums(goog_pros_tdm_m)

# Keep the 25 highest counts (largest first) and draw them as a word cloud.
tp25_goog_pros_freq <- sort(goog_pros_freq, decreasing = TRUE)[seq_len(25)]

wordcloud(
  words = names(tp25_goog_pros_freq),
  freq = tp25_goog_pros_freq,
  max.words = 25,
  colors = pal2
)


3 Common unigram word analysis

3.1 Find the top common unigram words in the “pros” reviews from both companies and visualize them

# Collapse each company's positive reviews into one long document.
amazon_pros_all <- paste(amazon_pros, collapse = " ")
google_pros_all <- paste(google_pros, collapse = " ")

# Two-document corpus (Amazon first, Google second), cleaned with the same
# qdap and tm helpers used for the per-company analysis.
all_pros <- qdap_clean(c(amazon_pros_all, google_pros_all))
all_pros_corps <- tm_clean(VCorpus(VectorSource(all_pros)))

# Term counts per document, and summed over both documents.
all_pros_tdm <- TermDocumentMatrix(all_pros_corps)
all_pros_m <- as.matrix(all_pros_tdm)
all_pros_freq <- rowSums(all_pros_m)

# Cloud of the words the two companies' positive reviews share.
commonality.cloud(all_pros_m, colors = "#D82016", max.words = 25)


3.2 Bar Chart of top 5 common unigram words in both “pros” reviews.

Kindly hover the mouse over the diagram for your perusal.
# Combined count of every term over both companies' pros documents.
all_pros_freq <- rowSums(all_pros_m)

# The chart is about words the two companies have in COMMON, so rank only the
# terms that occur in every column (document) of the matrix. Ranking the raw
# sums, as before, could let a word used by just one company into the top 5.
in_both_pros <- rowSums(all_pros_m > 0) == ncol(all_pros_m)
tp5_all_pros_freq <- head(sort(all_pros_freq[in_both_pros], decreasing = TRUE), 5)

names(tp5_all_pros_freq)
## [1] "great"    "work"     "good"     "people"   "benefits"
tp5_all_pros_freq_m <- as.matrix(tp5_all_pros_freq)

# Interactive bar chart: one bar per word, height = combined count.
plot_ly(
  x = row.names(tp5_all_pros_freq_m),
  y = tp5_all_pros_freq_m[, 1],
  name = "tp5_all_pros",
  type = "bar",
  color = I("red")
) %>%
  layout(title = "Top 5 common unigram words in pros reviews ",
         yaxis = list(title = "Count"))

3.3 Find the top common unigram words in the “cons” reviews from both companies and visualize them

# Collapse each company's negative reviews into one long document.
amazon_cons_all <- paste(amazon_cons, collapse = " ")
google_cons_all <- paste(google_cons, collapse = " ")

# Two-document corpus (Amazon first, Google second), cleaned with the same
# qdap and tm helpers used for the per-company analysis.
all_cons <- qdap_clean(c(amazon_cons_all, google_cons_all))
all_cons_corps <- tm_clean(VCorpus(VectorSource(all_cons)))

# Term counts per document, and summed over both documents.
all_cons_tdm <- TermDocumentMatrix(all_cons_corps)
all_cons_m <- as.matrix(all_cons_tdm)
all_cons_freq <- rowSums(all_cons_m)

# Cloud of the words the two companies' negative reviews share.
commonality.cloud(all_cons_m, colors = "#4682b4", max.words = 25)

3.4 Bar Chart of top 5 common unigram words in “cons” reviews.

Kindly hover the mouse over the diagram for your perusal.

# Combined count of every term over both companies' cons documents.
all_cons_freq <- rowSums(all_cons_m)

# The chart is about words the two companies have in COMMON, so rank only the
# terms that occur in every column (document) of the matrix. Ranking the raw
# sums, as before, could let a word used by just one company into the top 5.
in_both_cons <- rowSums(all_cons_m > 0) == ncol(all_cons_m)
tp5_all_cons_freq <- head(sort(all_cons_freq[in_both_cons], decreasing = TRUE), 5)

tp5_all_cons_freq_m <- as.matrix(tp5_all_cons_freq)

# Interactive bar chart: one bar per word, height = combined count.
plot_ly(
  x = row.names(tp5_all_cons_freq_m),
  y = tp5_all_cons_freq_m[, 1],
  name = "tp5_all_cons",
  type = "bar"
) %>%
  layout(title = "Top 5 common unigram words in cons reviews ",
         yaxis = list(title = "Count"))

4 Bigram analysis

4.1 Bigram analysis for Pros reviews in Amazon

# Flatten the cleaned corpus into a data frame with one review per row.
amzn_pros_df <- data.frame(text = unlist(sapply(amzn_pros_corp, "[", "content")),
                           stringsAsFactors = FALSE)
token_delim <- " \\t\\r\\n.!?,;\"()"

# Tokenize the text column, not the whole data frame. RWeka tokenizers coerce
# their input with as.character(), which turns a data frame column into one
# deparsed 'c("...", "...")' string; bigrams then run across review boundaries
# and a stray "c" token is produced. A character vector is tokenized per review.
bi_amzn_pros <- NGramTokenizer(amzn_pros_df$text,
                               Weka_control(min = 2, max = 2, delimiters = token_delim))

# Frequency table of the bigrams (columns: bi_amzn_pros, Freq).
bi_amzn_pros_df <- data.frame(table(bi_amzn_pros))

# Ten most frequent bigrams, drawn as a word cloud.
bi_amzn_pros_order <- bi_amzn_pros_df[order(bi_amzn_pros_df$Freq, decreasing = TRUE), ]
top10_bi_amzn_pros <- head(bi_amzn_pros_order, 10)
wordcloud(words = top10_bi_amzn_pros$bi_amzn_pros, freq = top10_bi_amzn_pros$Freq,
          colors = brewer.pal(9, "Purples"))


4.2 Bigram analysis for Cons reviews in Amazon

# Flatten the cleaned corpus into a data frame with one review per row.
amzn_cons_df <- data.frame(text = unlist(sapply(amzn_cons_corp, "[", "content")),
                           stringsAsFactors = FALSE)
token_delim <- " \\t\\r\\n.!?,;\"()"

# Tokenize the text column, not the whole data frame. RWeka tokenizers coerce
# their input with as.character(), which turns a data frame column into one
# deparsed 'c("...", "...")' string; bigrams then run across review boundaries
# and a stray "c" token is produced. A character vector is tokenized per review.
bi_amzn_cons <- NGramTokenizer(amzn_cons_df$text,
                               Weka_control(min = 2, max = 2, delimiters = token_delim))

# Frequency table of the bigrams (columns: bi_amzn_cons, Freq).
bi_amzn_cons_df <- data.frame(table(bi_amzn_cons))

# Ten most frequent bigrams, drawn as a word cloud.
bi_amzn_cons_order <- bi_amzn_cons_df[order(bi_amzn_cons_df$Freq, decreasing = TRUE), ]
top10_bi_amzn_cons <- head(bi_amzn_cons_order, 10)
wordcloud(words = top10_bi_amzn_cons$bi_amzn_cons, freq = top10_bi_amzn_cons$Freq,
          colors = brewer.pal(9, "Purples"))


4.3 Bigram analysis for Pros reviews in Google

# Flatten the cleaned corpus into a data frame with one review per row.
goog_pros_df <- data.frame(text = unlist(sapply(goog_pros_corp, "[", "content")),
                           stringsAsFactors = FALSE)
token_delim <- " \\t\\r\\n.!?,;\"()"

# Pass `delimiters = token_delim` like the other three bigram blocks do; it was
# defined here but never used, so Google pros were tokenized with different
# delimiters than the data they are later compared against.
# Also tokenize the text column, not the whole data frame. RWeka tokenizers
# coerce their input with as.character(), which turns a data frame column into
# one deparsed 'c("...", "...")' string; bigrams then run across review
# boundaries and a stray "c" token is produced.
bi_goog_pros <- NGramTokenizer(goog_pros_df$text,
                               Weka_control(min = 2, max = 2, delimiters = token_delim))

# Frequency table of the bigrams (columns: bi_goog_pros, Freq).
bi_goog_pros_df <- data.frame(table(bi_goog_pros))

# Ten most frequent bigrams, drawn as a word cloud.
bi_goog_pros_order <- bi_goog_pros_df[order(bi_goog_pros_df$Freq, decreasing = TRUE), ]
top10_bi_goog_pros <- head(bi_goog_pros_order, 10)
wordcloud(words = top10_bi_goog_pros$bi_goog_pros, freq = top10_bi_goog_pros$Freq,
          colors = pal)


4.4 Bigram analysis for Cons reviews in Google

# Flatten the cleaned corpus into a data frame with one review per row.
goog_cons_df <- data.frame(text = unlist(sapply(goog_cons_corp, "[", "content")),
                           stringsAsFactors = FALSE)
token_delim <- " \\t\\r\\n.!?,;\"()"

# Tokenize the text column, not the whole data frame. RWeka tokenizers coerce
# their input with as.character(), which turns a data frame column into one
# deparsed 'c("...", "...")' string; bigrams then run across review boundaries
# and a stray "c" token is produced. A character vector is tokenized per review.
bi_goog_cons <- NGramTokenizer(goog_cons_df$text,
                               Weka_control(min = 2, max = 2, delimiters = token_delim))

# Frequency table of the bigrams (columns: bi_goog_cons, Freq).
bi_goog_cons_df <- data.frame(table(bi_goog_cons))

# Ten most frequent bigrams, drawn as a word cloud.
bi_goog_cons_order <- bi_goog_cons_df[order(bi_goog_cons_df$Freq, decreasing = TRUE), ]
top10_bi_goog_cons <- head(bi_goog_cons_order, 10)
wordcloud(words = top10_bi_goog_cons$bi_goog_cons, freq = top10_bi_goog_cons$Freq,
          colors = pal)


5 Common Bigram words analysis

5.1 Top 5 common bigram words in Pros reviews

Kindly hover the mouse over the diagram for your perusal.
# Keep only the bigrams that occur in BOTH companies' pros reviews.
# inner_join() replaces the former left_join() + na.omit() pair, which produced
# the same rows in two steps. The stray `stringsAsFactors` argument is removed:
# it is not a join argument, so it was ignored at best and may be rejected as
# an unknown argument by recent dplyr versions.
all_pros_df <- inner_join(bi_amzn_pros_df, bi_goog_pros_df,
                          by = c("bi_amzn_pros" = "bi_goog_pros"))
colnames(all_pros_df) <- c("Term", "Amazon Pros", "Google Pros")

# Combined count per bigram, then the five largest.
all_pros_df$Total <- all_pros_df[["Amazon Pros"]] + all_pros_df[["Google Pros"]]
all_pros_total <- all_pros_df %>% arrange(desc(Total)) %>% head(5)

# Stacked bars: one bar per bigram, split by company. The formulas refer to
# columns of the `data` argument by name instead of re-indexing the data frame.
plot_ly(all_pros_total, x = ~Term, y = ~`Amazon Pros`, type = "bar", name = "Amazon Pros") %>%
  add_trace(y = ~`Google Pros`, name = "Google Pros") %>%
  layout(yaxis = list(title = "Count"),
         xaxis = list(title = "Top 5 common bigram words in Pros reviews "),
         barmode = "stack")

5.2 Top 5 common bigram words in Cons reviews

Kindly hover the mouse over the diagram for your perusal.
# Keep only the bigrams that occur in BOTH companies' cons reviews.
# inner_join() replaces the former left_join() + na.omit() pair, which produced
# the same rows in two steps. The stray `stringsAsFactors` argument is removed:
# it is not a join argument, so it was ignored at best and may be rejected as
# an unknown argument by recent dplyr versions.
all_cons_df <- inner_join(bi_amzn_cons_df, bi_goog_cons_df,
                          by = c("bi_amzn_cons" = "bi_goog_cons"))
colnames(all_cons_df) <- c("Term", "Amazon Cons", "Google Cons")

# Combined count per bigram, then the five largest.
all_cons_df$Total <- all_cons_df[["Amazon Cons"]] + all_cons_df[["Google Cons"]]
all_cons_total <- all_cons_df %>% arrange(desc(Total)) %>% head(5)

# Stacked bars: one bar per bigram, split by company. The formulas refer to
# columns of the `data` argument by name instead of re-indexing the data frame.
plot_ly(all_cons_total, x = ~Term, y = ~`Amazon Cons`, type = "bar", name = "Amazon Cons") %>%
  add_trace(y = ~`Google Cons`, name = "Google Cons") %>%
  layout(yaxis = list(title = "Count"),
         xaxis = list(title = "Top 5 common bigram words in Cons reviews"),
         barmode = "stack")