Disclaimer: The content of this RMarkdown note comes from the DataCamp course Text Mining: Bag of Words.

rm(list=ls(all=TRUE))


# Load packages
library(qdap)
library(tm)
library(RWeka)
library(wordcloud)

# Make tokenizer function
# It extracts two-word tokens (bigrams) instead of single words.
# It's passed to DocumentTermMatrix() or TermDocumentMatrix() via the control argument.
tokenizer <- function(x) 
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
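
A quick sanity check of the tokenizer on a toy sentence (my own example, not from the course; the exact output depends on RWeka's whitespace tokenization):

# Tokenize a sample sentence into bigrams
tokenizer("great pay and smart people")
## e.g., "great pay" "pay and" "and smart" "smart people"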

Chapter 4: Battle of the tech giants for talent

This chapter ties everything together with a case study in text mining for HR analytics.

Amazon versus Google

Text mining has a well-defined six-step process.

  1. Define problem
  2. Identify text sources
  3. Organize texts (e.g., remove stop words, stem words)
  4. Extract information (e.g., frequent terms)
  5. Analyze
  6. Conclude

Step 1: Problem definition

  • Does Amazon or Google have a better perceived pay according to online reviews?
  • Does Amazon or Google have a better work-life balance according to current employees?

Step 2: Identifying the text sources

Employee reviews can come from various sources.

  1. A third party that interviews employees, both internally and at competitors
  2. Articles from Forbes and others about the “best places to work”
  3. Anonymous online reviews from websites like Indeed, Glassdoor, or CareerBliss

Here, we’ll focus on a collection of anonymous online reviews.

# Import data
amzn <- read.csv("amzn.csv")
goog <- read.csv("goog.csv")

# Print the structure of amzn
str(amzn)
## 'data.frame':    500 obs. of  4 variables:
##  $ pg_num: int  50 50 50 50 50 50 50 50 50 50 ...
##  $ url   : Factor w/ 59 levels "<NA>","https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 45 45 45 45 45 45 45 45 45 45 ...
##  $ pros  : Factor w/ 496 levels "\"- Learn a lot, haven't been bored yet.\"",..: 492 56 151 349 359 367 183 417 210 352 ...
##  $ cons  : Factor w/ 495 levels "\"*Depending on your manager, might work long hours\"",..: 156 276 246 89 288 186 374 212 112 160 ...

amzn <- amzn[complete.cases(amzn), ] # NAs appear to cause errors in the tokenizer function
str(amzn)
## 'data.frame':    496 obs. of  4 variables:
##  $ pg_num: int  50 50 50 50 50 50 50 50 50 50 ...
##  $ url   : Factor w/ 59 levels "<NA>","https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 45 45 45 45 45 45 45 45 45 45 ...
##  $ pros  : Factor w/ 496 levels "\"- Learn a lot, haven't been bored yet.\"",..: 492 56 151 349 359 367 183 417 210 352 ...
##  $ cons  : Factor w/ 495 levels "\"*Depending on your manager, might work long hours\"",..: 156 276 246 89 288 186 374 212 112 160 ...

# Create amzn_pros
amzn_pros <- amzn$pros

# Create amzn_cons
amzn_cons <- amzn$cons

# Print the structure of goog
str(goog)
## 'data.frame':    501 obs. of  4 variables:
##  $ pg_num: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ url   : Factor w/ 50 levels "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ pros  : Factor w/ 492 levels "\"- Access to a vast wealth of technical resources and people\"",..: 21 27 488 12 410 233 413 376 314 384 ...
##  $ cons  : Factor w/ 491 levels "\"- Bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow innovation and"| __truncated__,..: 18 26 176 6 296 62 453 447 186 113 ...

# Create goog_pros
goog_pros <- goog$pros

# Create goog_cons
goog_cons <- goog$cons

Step 3: Text organization

  • qdap_clean() applies a series of qdap functions to a text vector, and
  • tm_clean() applies a series of tm functions to a corpus object.

a Create cleanup functions

# Create qdap_clean()
qdap_clean <- function(x) {     # x is a vector of employee reviews
  x <- replace_abbreviation(x)
  x <- replace_contraction(x)
  x <- replace_number(x)
  x <- replace_ordinal(x)
  x <- replace_symbol(x)
  x <- tolower(x)
  return(x)
}
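
To see what these replacements do, here is a quick check on a made-up review (my own example, not from the course; exact output may vary slightly across qdap versions):

# Demo qdap_clean() on a toy review
qdap_clean("I can't complain, the 1st year paid 100%!")
## roughly: "i cannot complain, the first year paid one hundred percent!"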

# Create tm_clean()
tm_clean <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  # Use lowercase custom stop words: qdap_clean() has already lowercased the text,
  # and removeWords() is case-sensitive
  corpus <- tm_map(corpus, removeWords,
              c(stopwords("en"), "google", "amazon", "company"))
  return(corpus)
}
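
Likewise, a minimal check of tm_clean() on a one-document corpus (again my own example; removeWords() runs after stripWhitespace(), so the cleaned text may contain leftover double spaces):

# Demo tm_clean() on a tiny corpus
tiny <- VCorpus(VectorSource("great pay, and great people at google!"))
content(tm_clean(tiny)[[1]])
## roughly: "great pay  great people " (stop words and "google" dropped)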

b Clean up Amazon reviews

# Alter amzn_pros
amzn_pros <- qdap_clean(amzn_pros)

# Alter amzn_cons
amzn_cons <- qdap_clean(amzn_cons)

# Create az_p_corp 
az_p_corp <- VCorpus(VectorSource(amzn_pros))

# Create az_c_corp
az_c_corp <- VCorpus(VectorSource(amzn_cons))
#str(az_c_corp)
# Create amzn_pros_corp
amzn_pros_corp <- tm_clean(az_p_corp)

# Create amzn_cons_corp
amzn_cons_corp <- tm_clean(az_c_corp)
#str(amzn_cons_corp)

c Clean up Google reviews

# Apply qdap_clean to goog_pros
goog_pros <- qdap_clean(goog_pros)

# Apply qdap_clean to goog_cons
goog_cons <- qdap_clean(goog_cons)

# Create goog_p_corp
goog_p_corp <- VCorpus(VectorSource(goog_pros))

# Create goog_c_corp
goog_c_corp <- VCorpus(VectorSource(goog_cons))

# Create goog_pros_corp
goog_pros_corp <- tm_clean(goog_p_corp)

# Create goog_cons_corp
goog_cons_corp <- tm_clean(goog_c_corp)

Step 4: Feature extraction

a Create amzn_p_freq, which counts the number of appearances of each bigram in the positive reviews

# Unlike on DataCamp, NAs in the amzn data cause an error in this section,
# so they were removed right after importing the data (see Step 2).

# Create amzn_p_tdm
amzn_p_tdm <- TermDocumentMatrix(amzn_pros_corp, 
                                 control = list(tokenize = tokenizer))

# Create amzn_p_tdm_m
amzn_p_tdm_m <- as.matrix(amzn_p_tdm)

# Create amzn_p_freq
amzn_p_freq <- rowSums(amzn_p_tdm_m)

b Create amzn_c_freq, which counts the number of appearances of each bigram in the negative reviews

# Create amzn_c_tdm
amzn_c_tdm <- TermDocumentMatrix(amzn_cons_corp, control = list(tokenize = tokenizer))

# Create amzn_c_tdm_m
amzn_c_tdm_m <- as.matrix(amzn_c_tdm)

# Create amzn_c_freq
amzn_c_freq <- rowSums(amzn_c_tdm_m)

c Repeat the same for Google’s reviews
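
The same pipeline applies to the Google corpora. This step is not shown in the course, so the code below is a sketch mirroring the Amazon code above; the object names goog_p_tdm, goog_p_freq, goog_c_tdm, and goog_c_freq are my own.

# Create goog_p_tdm and goog_p_freq
goog_p_tdm <- TermDocumentMatrix(goog_pros_corp,
                                 control = list(tokenize = tokenizer))
goog_p_freq <- rowSums(as.matrix(goog_p_tdm))

# Create goog_c_tdm and goog_c_freq
goog_c_tdm <- TermDocumentMatrix(goog_cons_corp,
                                 control = list(tokenize = tokenizer))
goog_c_freq <- rowSums(as.matrix(goog_c_tdm))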

Step 5: Analyze

a Plot a wordcloud of Amazon’s positive and negative reviews

# Plot a wordcloud using amzn_p_freq values
wordcloud(names(amzn_p_freq), amzn_p_freq, max.words = 25, colors = "blue")


# Plot a wordcloud of negative Amazon bigrams
wordcloud(names(amzn_c_freq), amzn_c_freq, max.words = 25, colors = "red")

b Plot a dendrogram of Amazon’s negative reviews

It seems there is a strong indication of long working hours and poor work-life balance in the reviews. As a simple clustering technique, you decide to perform hierarchical clustering and create a dendrogram to see how connected these phrases are.

Note that the dendrogram uses amzn_c_tdm, a term-document matrix with bigrams (two-word tokens) in rows and documents (individual reviews) in columns.

# Create amzn_c_tdm2 by removing sparse terms 
amzn_c_tdm2 <- removeSparseTerms(amzn_c_tdm, sparse = .993)

# Create hc as a cluster of distance values
hc <- hclust(dist(amzn_c_tdm2, method = "euclidean"), method = "complete")

# Produce a plot of hc
plot(hc)

c Word association

Examine the top phrases that appeared in the word clouds. Use findAssocs() from tm to find associated terms, and check for anything surprising now that you have learned of long hours and a lack of work-life balance.

# Create term_frequency
term_frequency <- sort(amzn_p_freq, decreasing = TRUE)

# Print the 5 most common terms
head(term_frequency, 5)
##       good pay great benefits   smart people     place work     fast paced 
##             25             24             20             17             16

# Find associations with fast paced
findAssocs(amzn_p_tdm, "fast paced", 0.2)
## $`fast paced`
##        paced environment        environments ever               learn fast 
##                     0.49                     0.35                     0.35 
##           paced friendly               paced work               able excel 
##                     0.35                     0.35                     0.25 
##           activity ample              advance one                also well 
##                     0.25                     0.25                     0.25 
##              amazon fast            amazon noting               amazon one 
##                     0.25                     0.25                     0.25 
##              amount time        ample opportunity        assistance ninety 
##                     0.25                     0.25                     0.25 
##       benefits including           break computer            call activity 
##                     0.25                     0.25                     0.25 
##               can choose            catchy cheers            center things 
##                     0.25                     0.25                     0.25 
##       challenging expect       cheers opportunity           choose success 
##                     0.25                     0.25                     0.25 
##   combined encouragement competitive environments            computer room 
##                     0.25                     0.25                     0.25 
##              cool things          deliver results               dock makes 
##                     0.25                     0.25                     0.25 
##           driven deliver               easy learn        emphasis shipping 
##                     0.25                     0.25                     0.25 
## encouragement innovation     environment benefits       environment catchy 
##                     0.25                     0.25                     0.25 
##       environment center         environment fast         environment help 
##                     0.25                     0.25                     0.25 
##        environment smart               ever known           ever witnessed 
##                     0.25                     0.25                     0.25 
##        everchanging fast    everyones preferences            excel advance 
##                     0.25                     0.25                     0.25 
##       excel everchanging     exciting environment             expect learn 
##                     0.25                     0.25                     0.25 
##           extremely fast             facility top          fail successful 
##                     0.25                     0.25                     0.25 
##           fantastic able               fired part             five percent 
##                     0.25                     0.25                     0.25 
##           freindly place      friendly atmosphere      friendly management 
##                     0.25                     0.25                     0.25 
##             full medical                get fired             go extremely 
##                     0.25                     0.25                     0.25 
##             great plenty           great teamwork     happening technology 
##                     0.25                     0.25                     0.25 
##          hassle benefits                 help get             help workers 
##                     0.25                     0.25                     0.25 
##             high quality              high volume           including full 
##                     0.25                     0.25                     0.25 
##        innovation owning         job requirements               leader can 
##                     0.25                     0.25                     0.25 
##               line break       lot responsibility         maintaining high 
##                     0.25                     0.25                     0.25 
##               makes time          management nice            nice facility 
##                     0.25                     0.25                     0.25 
##              ninety five             noting short       offers opportunity 
##                     0.25                     0.25                     0.25 
##          one competitive                 one fast     opportunity overtime 
##                     0.25                     0.25                     0.25 
##         opportunity yell           ownership fast              owning work 
##                     0.25                     0.25                     0.25 
##           paced emphasis           paced exciting               paced high 
##                     0.25                     0.25                     0.25 
##              paced never          paced rewarding               paced ship 
##                     0.25                     0.25                     0.25 
##           paced software             paid upfront           people focused 
##                     0.25                     0.25                     0.25 
##             percent paid            plenty shifts            position fast 
##                     0.25                     0.25                     0.25 
##           possible still         preferences fast         products quickly 
##                     0.25                     0.25                     0.25 
##              quality bar         quickly possible        readily available 
##                     0.25                     0.25                     0.25 
##        requirements easy responsibility ownership            results great 
##                     0.25                     0.25                     0.25 
##             results team         rewarding people         shifts everyones 
##                     0.25                     0.25                     0.25 
##                ship dock        shipping products             short amount 
##                     0.25                     0.25                     0.25 
##          short fantastic          smart coworkers        still maintaining 
##                     0.25                     0.25                     0.25 
##             success fail          successful also              team driven 
##                     0.25                     0.25                     0.25 
##         technology today         things happening               things lot 
##                     0.25                     0.25                     0.25 
##                time fast                  time go                 top line 
##                     0.25                     0.25                     0.25 
##       upfront experience              vision well              volume call 
##                     0.25                     0.25                     0.25 
##            well rewarded             well tuition       witnessed combined 
##                     0.25                     0.25                     0.25 
##                 work can                work cool        work environments 
##                     0.25                     0.25                     0.25 
##                work fast                 work job          workers readily 
##                     0.25                     0.25                     0.25 
##              yell leader 
##                     0.25

Interpretation

For Amazon:

  • Earlier you were surprised to see “fast paced” among the pros despite other reviews mentioning work-life balance.
  • Given the abbreviated results of the associated phrases, you would recommend that Amazon HR recruiters look for candidates who view an intense workload as an opportunity to learn fast, and then give those candidates ample opportunity to do so.

d Plot a comparison cloud of Google’s positive and negative reviews

Create a comparison.cloud() of Google’s positive and negative reviews to compare with Amazon. This gives a quick view of the top terms without spending as much time as you did examining the Amazon reviews in the previous exercises.

# Create all_goog_pros
all_goog_pros <- paste(goog_pros, collapse = " ") # collapse all individual reviews into a single string
#str(all_goog_pros)

# Create all_goog_cons
all_goog_cons <- paste(goog_cons, collapse = " ")
#str(all_goog_cons)

# Create all_goog
all_goog <- c(all_goog_pros, all_goog_cons)
#str(all_goog)

# Convert to a vector source
all_goog <- VectorSource(all_goog)
#str(all_goog)

# Create all_goog_corpus
all_goog_corpus <- VCorpus(all_goog)
#str(all_goog_corpus)
# Create all_goog_corp
all_goog_corp <- tm_clean(all_goog_corpus)

# Create all_tdm
all_tdm <- TermDocumentMatrix(all_goog_corp)

# Name the columns of all_tdm
colnames(all_tdm) <- c("Goog_Pros", "Goog_Cons")

# Create all_m
all_m <- as.matrix(all_tdm)

# Build a comparison cloud
comparison.cloud(all_m, colors = c("#F44336", "#2196f3"), max.words = 100)

Interpretation

For Google:

  • Positive reviews mention “great food”, “perks”, “smart people”, and “fun culture”.
  • Negative reviews discuss “politics”, “getting big”, “bureaucracy”, and “middle management”.

e Make a pyramid plot of Amazon and Google positive reviews

Make a pyramid plot lining up positive reviews for Amazon and Google so you can clearly see the differences in any shared bigrams.

Steps to prep texts for analysis:

  1. Get text vector
  2. Convert to a vector source
  3. Create a corpus
  4. Clean texts (e.g., remove stop words, stem words)
  5. Create a term-document matrix or a document-term matrix
  6. Convert to a matrix

# Create all_amzn_pros
all_amzn_pros <- paste(amzn_pros, collapse = " ")
#str(all_amzn_pros)

# Create all_goog_pros
all_goog_pros <- paste(goog_pros, collapse = " ") # collapse all individual reviews into a single string
#str(all_goog_pros)

# Create all
all <- c(all_amzn_pros, all_goog_pros)
#str(all)

# Convert to a vector source
all <- VectorSource(all)
#str(all)

# Create all_corpus
all_corpus <- VCorpus(all)
#str(all_corpus)

# Create all_corp
all_corp <- tm_clean(all_corpus)

# Create all_tdm
all_tdm <- TermDocumentMatrix(all_corp, 
                              control = list(tokenize = tokenizer)) #bigram

# Name the columns of all_tdm
colnames(all_tdm) <- c("Amzn", "Google")

# Create all_m
all_m <- as.matrix(all_tdm)
# Load package
library(plotrix)

# Create common_words
common_words <- subset(all_m, all_m[, 1] > 0 & all_m[, 2] > 0)
head(common_words)
##                   Docs
## Terms              Amzn Google
##   ability make        1      1
##   able move           1      1
##   affect millions     1      1
##   always something    2      1
##   amazing great       1      2
##   amazing people      1      6
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
head(difference)
##     ability make        able move  affect millions always something 
##                0                0                0                1 
##    amazing great   amazing people 
##                1                5
# Add difference to common_words
common_words <- cbind(common_words, difference)
head(common_words)
##                  Amzn Google difference
## ability make        1      1          0
## able move           1      1          0
## affect millions     1      1          0
## always something    2      1          1
## amazing great       1      2          1
## amazing people      1      6          5
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[,3], decreasing = TRUE), ]
head(common_words)
##               Amzn Google difference
## smart people    20     42         22
## good pay        26      9         17
## fast paced      16      2         14
## great culture    2     14         12
## people great     6     18         12
## work great       6     17         11
# Create top15_df
top15_df <- data.frame(x = common_words[1:15, 1], y = common_words[1:15, 2], labels = rownames(common_words[1:15, ]))

# Create the pyramid plot
pyramid.plot(top15_df$x, top15_df$y, 
             labels = top15_df$labels, gap = 12, 
             top.labels = c("Amzn", "Pro Words", "Google"), 
             main = "Words in Common", unit = NULL)

## [1] 5.1 4.1 4.1 2.1

(The printed vector is just pyramid.plot() returning the previous par margin settings; it is not part of the analysis.)

Interpretation The graph shows the top 15 shared bigrams, ranked by the difference in their number of appearances between Google and Amazon.

  • While “smart people” is a positive at both Google and Amazon, it is more so at Google.
  • While “good pay” is a positive at both, it is more so at Amazon.
  • While “great culture” is a positive at both, it is more so at Google.
  • It appears that Amazon should work on its culture, while Google should consider raising pay and benefits to compete with Amazon.

f Make a pyramid plot of Amazon and Google negative reviews

# Create all_amzn_cons
all_amzn_cons <- paste(amzn_cons, collapse = " ")
#str(all_amzn_cons)

# Create all_goog_cons
all_goog_cons <- paste(goog_cons, collapse = " ") # collapse all individual reviews into a single string
#str(all_goog_cons)

# Create all
all <- c(all_amzn_cons, all_goog_cons)
#str(all)

# Convert to a vector source
all <- VectorSource(all)
#str(all)

# Create all_corpus
all_corpus <- VCorpus(all)
#str(all_corpus)

# Create all_corp
all_corp <- tm_clean(all_corpus)

# Create all_tdm
all_tdm <- TermDocumentMatrix(all_corp, 
                              control = list(tokenize = tokenizer)) #bigram

# Name the columns of all_tdm
colnames(all_tdm) <- c("Amzn", "Google")

# Create all_m
all_m <- as.matrix(all_tdm)
# Create common_words
common_words <- subset(all_m, all_m[, 1] > 0 & all_m[, 2] > 0)

# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])

# Bind difference to common_words
common_words <- cbind(common_words, difference)

# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing = TRUE), ]

# Create top15_df
top15_df <- data.frame(x = common_words[1:15, 1], y = common_words[1:15, 2], labels = rownames(common_words[1:15, ])) 

# Create the pyramid plot
pyramid.plot(top15_df$x, top15_df$y, labels = top15_df$labels, gap = 12, top.labels = c("Amzn", "Cons Words", "Google"), main = "Words in Common", unit = NULL)

## [1] 5.1 4.1 4.1 2.1

Interpretation The graph shows the top 15 shared bigrams, ranked by the difference in their number of appearances between Google and Amazon.

  • While “long hours” is a problem at both Google and Amazon, it is more serious at Amazon.
  • While “work-life balance” is a problem at both, it is more serious at Amazon.
  • It appears that Amazon should address long work hours and work-life balance to compete with Google for talent in the labor market.

Step 6: Conclude

A few recommendations for Amazon’s HR department to remain competitive with Google in recruiting talent:

  • Address the long hours and work-life balance issues.
  • Look for candidates who view an intense workload as an opportunity to learn fast, and give them ample opportunity to do so.