Disclaimer: The content of this RMarkdown note comes from the DataCamp course Text Mining: Bag of Words.
rm(list=ls(all=TRUE))
# Load packages
library(qdap)
library(tm)
library(RWeka)
library(wordcloud)
# Make tokenizer function
# It's used to extract two-word tokens (bigrams) instead of single words.
# It's passed to DocumentTermMatrix() or TermDocumentMatrix() via the control argument.
tokenizer <- function(x)
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
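A quick sanity check of the tokenizer on a made-up sentence (not from the data):
# Extract all consecutive two-word tokens from a string
tokenizer("text mining is fun")
# expect: "text mining" "mining is" "is fun"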
This chapter ties everything together with a case study in text mining for HR analytics.
Text mining has a well-defined six-step process: problem definition, identifying the text sources, text organization, feature extraction, analysis, and reaching a conclusion.
Employee reviews can come from various sources; here, we'll focus on a collection of anonymous online reviews.
# Import data
amzn <- read.csv("amzn.csv")
goog <- read.csv("goog.csv")
# Print the structure of amzn
str(amzn)
## 'data.frame': 500 obs. of 4 variables:
## $ pg_num: int 50 50 50 50 50 50 50 50 50 50 ...
## $ url : Factor w/ 59 levels "<NA>","https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 45 45 45 45 45 45 45 45 45 45 ...
## $ pros : Factor w/ 496 levels "\"- Learn a lot, haven't been bored yet.\"",..: 492 56 151 349 359 367 183 417 210 352 ...
## $ cons : Factor w/ 495 levels "\"*Depending on your manager, might work long hours\"",..: 156 276 246 89 288 186 374 212 112 160 ...
amzn <- amzn[complete.cases(amzn), ]  # NAs appear to cause errors in the tokenizer function
str(amzn)
## 'data.frame': 496 obs. of 4 variables:
## $ pg_num: int 50 50 50 50 50 50 50 50 50 50 ...
## $ url : Factor w/ 59 levels "<NA>","https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 45 45 45 45 45 45 45 45 45 45 ...
## $ pros : Factor w/ 496 levels "\"- Learn a lot, haven't been bored yet.\"",..: 492 56 151 349 359 367 183 417 210 352 ...
## $ cons : Factor w/ 495 levels "\"*Depending on your manager, might work long hours\"",..: 156 276 246 89 288 186 374 212 112 160 ...
# Create amzn_pros
amzn_pros <- amzn$pros
# Create amzn_cons
amzn_cons <- amzn$cons
# Print the structure of goog
str(goog)
## 'data.frame': 501 obs. of 4 variables:
## $ pg_num: int 1 1 1 1 1 1 1 1 1 1 ...
## $ url : Factor w/ 50 levels "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ pros : Factor w/ 492 levels "\"- Access to a vast wealth of technical resources and people\"",..: 21 27 488 12 410 233 413 376 314 384 ...
## $ cons : Factor w/ 491 levels "\"- Bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow innovation and"| __truncated__,..: 18 26 176 6 296 62 453 447 186 113 ...
# Create goog_pros
goog_pros <- goog$pros
# Create goog_cons
goog_cons <- goog$cons
qdap_clean() applies a series of qdap functions to a text vector, and tm_clean() applies a series of tm functions to a corpus object.
# Create qdap_clean()
qdap_clean <- function(x) {  # x is a vector of employee reviews
  x <- replace_abbreviation(x)
  x <- replace_contraction(x)
  x <- replace_number(x)
  x <- replace_ordinal(x)
  x <- replace_symbol(x)
  x <- tolower(x)
  return(x)
}
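To see what qdap_clean() does, apply it to a made-up review (not from the data):
# Hypothetical example: abbreviations and contractions are expanded, numbers and
# symbols are written out, and the result is lower-cased
qdap_clean("I can't believe Dr. Smith approved a 5% raise!")
# expect roughly: "i cannot believe doctor smith approved a five percent raise!"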
# Create tm_clean()
tm_clean <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
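  # Note: removeWords() is case-sensitive. Since qdap_clean() has already
  # lower-cased the text, the capitalized "Google" and "Amazon" below do not
  # actually match "google"/"amazon" in the reviews (notice the "amazon ..."
  # bigrams that survive into the findAssocs() output later).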
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "Google", "Amazon", "company"))
  return(corpus)
}
# Alter amzn_pros
amzn_pros <- qdap_clean(amzn_pros)
# Alter amzn_cons
amzn_cons <- qdap_clean(amzn_cons)
# Create az_p_corp
az_p_corp <- VCorpus(VectorSource(amzn_pros))
# Create az_c_corp
az_c_corp <- VCorpus(VectorSource(amzn_cons))
#str(az_c_corp)
# Create amzn_pros_corp
amzn_pros_corp <- tm_clean(az_p_corp)
# Create amzn_cons_corp
amzn_cons_corp <- tm_clean(az_c_corp)
#str(amzn_cons_corp)
# Apply qdap_clean to goog_pros
goog_pros <- qdap_clean(goog_pros)
# Apply qdap_clean to goog_cons
goog_cons <- qdap_clean(goog_cons)
# Create goog_p_corp
goog_p_corp <- VCorpus(VectorSource(goog_pros))
# Create goog_c_corp
goog_c_corp <- VCorpus(VectorSource(goog_cons))
# Create goog_pros_corp
goog_pros_corp <- tm_clean(goog_p_corp)
# Create goog_cons_corp
goog_cons_corp <- tm_clean(goog_c_corp)
Create amzn_p_freq, which counts the number of appearances of each positive bigram.
# Unlike in DataCamp, NAs in amzn appear to cause an error in this section,
# so NAs were removed after importing the amzn data. See Step 3: Text organization.
# Create amzn_p_tdm
amzn_p_tdm <- TermDocumentMatrix(amzn_pros_corp,
                                 control = list(tokenize = tokenizer))
# Create amzn_p_tdm_m
amzn_p_tdm_m <- as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq <- rowSums(amzn_p_tdm_m)
Create amzn_c_freq, which counts the number of appearances of each negative bigram.
# Create amzn_c_tdm
amzn_c_tdm <- TermDocumentMatrix(amzn_cons_corp, control = list(tokenize = tokenizer))
# Create amzn_c_tdm_m
amzn_c_tdm_m <- as.matrix(amzn_c_tdm)
# Create amzn_c_freq
amzn_c_freq <- rowSums(amzn_c_tdm_m)
# Plot a wordcloud using amzn_p_freq values
wordcloud(names(amzn_p_freq), amzn_p_freq, max.words = 25, colors = "blue")
# Plot a wordcloud of negative Amazon bigrams
wordcloud(names(amzn_c_freq), amzn_c_freq, max.words = 25, colors = "red")
The reviews strongly indicate long working hours and poor work-life balance. As a simple clustering technique, you decide to perform hierarchical clustering and create a dendrogram to see how connected these phrases are.
Note that the dendrogram uses amzn_c_tdm, a term-document matrix with bigrams (two-word tokens) in rows and documents (individual reviews) in columns.
# Create amzn_c_tdm2 by removing sparse terms
amzn_c_tdm2 <- removeSparseTerms(amzn_c_tdm, sparse = .993)
# Create hc as a cluster of distance values
hc <- hclust(dist(amzn_c_tdm2, method = "euclidean"), method = "complete")
# Produce a plot of hc
plot(hc)
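As a quick check (not part of the course code), compare the matrix dimensions before and after removeSparseTerms(); with sparse = .993, a bigram must appear in at least roughly 0.7% of the reviews to survive.
# Number of bigrams (rows) before and after sparse-term removal
dim(amzn_c_tdm)
dim(amzn_c_tdm2)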
Examine the top phrases that appeared in the word clouds. You hope to find associated terms using the findAssocs() function from tm, checking for anything surprising now that you have learned of long hours and a lack of work-life balance.
# Create term_frequency
term_frequency <- sort(amzn_p_freq, decreasing = TRUE)
# Print the 5 most common terms
head(term_frequency, 5)
## good pay great benefits smart people place work fast paced
## 25 24 20 17 16
# Find associations with fast paced
findAssocs(amzn_p_tdm, "fast paced", 0.2)
## $`fast paced`
## paced environment environments ever learn fast
## 0.49 0.35 0.35
## paced friendly paced work able excel
## 0.35 0.35 0.25
## activity ample advance one also well
## 0.25 0.25 0.25
## amazon fast amazon noting amazon one
## 0.25 0.25 0.25
## amount time ample opportunity assistance ninety
## 0.25 0.25 0.25
## benefits including break computer call activity
## 0.25 0.25 0.25
## can choose catchy cheers center things
## 0.25 0.25 0.25
## challenging expect cheers opportunity choose success
## 0.25 0.25 0.25
## combined encouragement competitive environments computer room
## 0.25 0.25 0.25
## cool things deliver results dock makes
## 0.25 0.25 0.25
## driven deliver easy learn emphasis shipping
## 0.25 0.25 0.25
## encouragement innovation environment benefits environment catchy
## 0.25 0.25 0.25
## environment center environment fast environment help
## 0.25 0.25 0.25
## environment smart ever known ever witnessed
## 0.25 0.25 0.25
## everchanging fast everyones preferences excel advance
## 0.25 0.25 0.25
## excel everchanging exciting environment expect learn
## 0.25 0.25 0.25
## extremely fast facility top fail successful
## 0.25 0.25 0.25
## fantastic able fired part five percent
## 0.25 0.25 0.25
## freindly place friendly atmosphere friendly management
## 0.25 0.25 0.25
## full medical get fired go extremely
## 0.25 0.25 0.25
## great plenty great teamwork happening technology
## 0.25 0.25 0.25
## hassle benefits help get help workers
## 0.25 0.25 0.25
## high quality high volume including full
## 0.25 0.25 0.25
## innovation owning job requirements leader can
## 0.25 0.25 0.25
## line break lot responsibility maintaining high
## 0.25 0.25 0.25
## makes time management nice nice facility
## 0.25 0.25 0.25
## ninety five noting short offers opportunity
## 0.25 0.25 0.25
## one competitive one fast opportunity overtime
## 0.25 0.25 0.25
## opportunity yell ownership fast owning work
## 0.25 0.25 0.25
## paced emphasis paced exciting paced high
## 0.25 0.25 0.25
## paced never paced rewarding paced ship
## 0.25 0.25 0.25
## paced software paid upfront people focused
## 0.25 0.25 0.25
## percent paid plenty shifts position fast
## 0.25 0.25 0.25
## possible still preferences fast products quickly
## 0.25 0.25 0.25
## quality bar quickly possible readily available
## 0.25 0.25 0.25
## requirements easy responsibility ownership results great
## 0.25 0.25 0.25
## results team rewarding people shifts everyones
## 0.25 0.25 0.25
## ship dock shipping products short amount
## 0.25 0.25 0.25
## short fantastic smart coworkers still maintaining
## 0.25 0.25 0.25
## success fail successful also team driven
## 0.25 0.25 0.25
## technology today things happening things lot
## 0.25 0.25 0.25
## time fast time go top line
## 0.25 0.25 0.25
## upfront experience vision well volume call
## 0.25 0.25 0.25
## well rewarded well tuition witnessed combined
## 0.25 0.25 0.25
## work can work cool work environments
## 0.25 0.25 0.25
## work fast work job workers readily
## 0.25 0.25 0.25
## yell leader
## 0.25
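The same call works for any other top phrase from term_frequency; for example, to probe "good pay" (output omitted here):
# Find terms correlated with "good pay" at 0.2 or above
findAssocs(amzn_p_tdm, "good pay", 0.2)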
Interpretation
For Amazon: the most frequent positive bigrams are "good pay", "great benefits", "smart people", "place work", and "fast paced", while the negative bigrams echo the long working hours and poor work-life balance seen in the dendrogram. "Fast paced" is most strongly associated with phrases like "paced environment" and "learn fast".
Create a comparison.cloud() of Google’s positive and negative reviews for comparison to Amazon. This will give you a quick understanding of top terms without having to spend as much time as you did examining the Amazon reviews in the previous exercises.
# Create all_goog_pros
all_goog_pros <- paste(goog_pros, collapse = " ")  # collapse all individual reviews into a single string
#str(all_goog_pros)
# Create all_goog_cons
all_goog_cons <- paste(goog_cons, collapse = " ")
#str(all_goog_cons)
# Create all_goog
all_goog <- c(all_goog_pros, all_goog_cons)
#str(all_goog)
# Convert to a vector source
all_goog <- VectorSource(all_goog)
#str(all_goog)
# Create all_goog_corpus
all_goog_corpus <- VCorpus(all_goog)
#str(all_goog_corpus)
# Create all_goog_corp
all_goog_corp <- tm_clean(all_goog_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_goog_corp)
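# Note: no tokenizer control here, so all_tdm contains single words (unigrams),
# unlike the bigram TDMs built for the Amazon word clouds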
# Name the columns of all_tdm
colnames(all_tdm) <- c("Goog_Pros", "Goog_Cons")
# Create all_m
all_m <- as.matrix(all_tdm)
# Build a comparison cloud
comparison.cloud(all_m, colors = c("#F44336", "#2196f3"), max.words = 100)
Interpretation
For Google:
Make a pyramid plot lining up positive reviews for Amazon and Google so you can adequately see the differences between any shared bigrams.
The following steps prepare the texts for analysis.
# Create all_amzn_pros
all_amzn_pros <- paste(amzn_pros, collapse = " ")
#str(all_amzn_pros)
# Create all_goog_pros
all_goog_pros <- paste(goog_pros, collapse = " ")  # collapse all individual reviews into a single string
#str(all_goog_pros)
# Create all
all <- c(all_amzn_pros, all_goog_pros)
#str(all)
# Convert to a vector source
all <- VectorSource(all)
#str(all)
# Create all_corpus
all_corpus <- VCorpus(all)
#str(all_corpus)
# Create all_corp
all_corp <- tm_clean(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_corp,
                              control = list(tokenize = tokenizer))  # bigram tokenizer
# Name the columns of all_tdm
colnames(all_tdm) <- c("Amzn", "Google")
# Create all_m
all_m <- as.matrix(all_tdm)
# Load package
library(plotrix)
# Create common_words
common_words <- subset(all_m, all_m[, 1] > 0 & all_m[, 2] > 0)
head(common_words)
## Docs
## Terms Amzn Google
## ability make 1 1
## able move 1 1
## affect millions 1 1
## always something 2 1
## amazing great 1 2
## amazing people 1 6
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
head(difference)
## ability make able move affect millions always something
## 0 0 0 1
## amazing great amazing people
## 1 5
# Add difference to common_words
common_words <- cbind(common_words, difference)
head(common_words)
## Amzn Google difference
## ability make 1 1 0
## able move 1 1 0
## affect millions 1 1 0
## always something 2 1 1
## amazing great 1 2 1
## amazing people 1 6 5
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[,3], decreasing = TRUE), ]
head(common_words)
## Amzn Google difference
## smart people 20 42 22
## good pay 26 9 17
## fast paced 16 2 14
## great culture 2 14 12
## people great 6 18 12
## work great 6 17 11
# Create top15_df
top15_df <- data.frame(x = common_words[1:15, 1],
                       y = common_words[1:15, 2],
                       labels = rownames(common_words[1:15, ]))
# Create the pyramid plot
pyramid.plot(top15_df$x, top15_df$y,
             labels = top15_df$labels, gap = 12,
             top.labels = c("Amzn", "Pro Words", "Google"),
             main = "Words in Common", unit = NULL)
## [1] 5.1 4.1 4.1 2.1
Interpretation: The plot shows the 15 shared positive bigrams with the largest difference in counts between Amazon and Google. Amazon reviews lean toward "good pay" and "fast paced", while Google reviews lean toward "smart people" and "great culture". (The printed vector is pyramid.plot()'s return value: the par margin settings in effect before plotting.)
# Create all_amzn_cons
all_amzn_cons <- paste(amzn_cons, collapse = " ")
#str(all_amzn_cons)
# Create all_goog_cons
all_goog_cons <- paste(goog_cons, collapse = " ")  # collapse all individual reviews into a single string
#str(all_goog_cons)
# Create all
all <- c(all_amzn_cons, all_goog_cons)
#str(all)
# Convert to a vector source
all <- VectorSource(all)
#str(all)
# Create all_corpus
all_corpus <- VCorpus(all)
#str(all_corpus)
# Create all_corp
all_corp <- tm_clean(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_corp,
                              control = list(tokenize = tokenizer))  # bigram tokenizer
# Name the columns of all_tdm
colnames(all_tdm) <- c("Amzn", "Google")
# Create all_m
all_m <- as.matrix(all_tdm)
# Create common_words
common_words <- subset(all_m, all_m[, 1] > 0 & all_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
# Bind difference to common_words
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing = TRUE), ]
# Create top15_df
top15_df <- data.frame(x = common_words[1:15, 1],
                       y = common_words[1:15, 2],
                       labels = rownames(common_words[1:15, ]))
# Create the pyramid plot
pyramid.plot(top15_df$x, top15_df$y,
             labels = top15_df$labels, gap = 12,
             top.labels = c("Amzn", "Cons Words", "Google"),
             main = "Words in Common", unit = NULL)
## [1] 5.1 4.1 4.1 2.1
Interpretation: The plot shows the 15 shared negative bigrams with the largest difference in counts between Amazon and Google.
A few recommendations for Amazon's HR department to remain competitive with Google in recruiting talent: keep emphasizing the strengths reviewers already cite, such as good pay, great benefits, and smart colleagues, and address the long working hours and poor work-life balance that dominate the negative reviews, since Google's positive reviews skew toward culture ("great culture", "smart people").