Disclaimer: The content of this RMarkdown note comes from the DataCamp course Text Mining: Bag of Words.
rm(list=ls(all=TRUE))
# Load packages
library(qdap)
library(tm)
library(RWeka)
library(wordcloud)
# Make tokenizer function
# It extracts two-word tokens (bigrams) instead of single words.
# It's passed to DocumentTermMatrix() or TermDocumentMatrix() via the control list.
tokenizer <- function(x)
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
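As a quick sanity check, the tokenizer splits a toy sentence into overlapping bigrams (the input string is invented for illustration):
# Illustrative call on a made-up sentence
tokenizer("text mining is fun")
## [1] "text mining" "mining is"   "is fun"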
This chapter ties everything together with a case study in text mining for HR analytics.
Text mining follows a well-defined six-step process: problem definition, identifying the text to be collected, text organization, feature extraction, analysis, and reaching an insight or recommendation.
Employee reviews can come from various sources.
Here, we’ll focus on a collection of anonymous online reviews.
# Import data
amzn <- read.csv("amzn.csv")
goog <- read.csv("goog.csv")
# Print the structure of amzn
str(amzn)
## 'data.frame': 500 obs. of 4 variables:
## $ pg_num: int 50 50 50 50 50 50 50 50 50 50 ...
## $ url : Factor w/ 59 levels "<NA>","https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 45 45 45 45 45 45 45 45 45 45 ...
## $ pros : Factor w/ 496 levels "\"- Learn a lot, haven't been bored yet.\"",..: 492 56 151 349 359 367 183 417 210 352 ...
## $ cons : Factor w/ 495 levels "\"*Depending on your manager, might work long hours\"",..: 156 276 246 89 288 186 374 212 112 160 ...
amzn <- amzn[complete.cases(amzn), ] # NAs appear to cause errors in the tokenizer function
str(amzn)
## 'data.frame': 496 obs. of 4 variables:
## $ pg_num: int 50 50 50 50 50 50 50 50 50 50 ...
## $ url : Factor w/ 59 levels "<NA>","https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 45 45 45 45 45 45 45 45 45 45 ...
## $ pros : Factor w/ 496 levels "\"- Learn a lot, haven't been bored yet.\"",..: 492 56 151 349 359 367 183 417 210 352 ...
## $ cons : Factor w/ 495 levels "\"*Depending on your manager, might work long hours\"",..: 156 276 246 89 288 186 374 212 112 160 ...
# Create amzn_pros
amzn_pros <- amzn$pros
# Create amzn_cons
amzn_cons <- amzn$cons
# Print the structure of goog
str(goog)
## 'data.frame': 501 obs. of 4 variables:
## $ pg_num: int 1 1 1 1 1 1 1 1 1 1 ...
## $ url : Factor w/ 50 levels "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ pros : Factor w/ 492 levels "\"- Access to a vast wealth of technical resources and people\"",..: 21 27 488 12 410 233 413 376 314 384 ...
## $ cons : Factor w/ 491 levels "\"- Bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow innovation and"| __truncated__,..: 18 26 176 6 296 62 453 447 186 113 ...
# Create goog_pros
goog_pros <- goog$pros
# Create goog_cons
goog_cons <- goog$cons
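Note that str() shows the pros and cons columns are factors (the read.csv() default before R 4.0). Coercing them to character before cleaning is a safe precaution; this is an extra step, not part of the original course code:
# Coerce factors to character so the qdap string functions behave predictably
amzn_pros <- as.character(amzn_pros)
amzn_cons <- as.character(amzn_cons)
goog_pros <- as.character(goog_pros)
goog_cons <- as.character(goog_cons)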
qdap_clean() applies a series of qdap functions to a text vector, and tm_clean() applies a series of tm functions to a corpus object.
# Create qdap_clean()
qdap_clean <- function(x) { # x is a vector of employee reviews
  x <- replace_abbreviation(x)
  x <- replace_contraction(x)
  x <- replace_number(x)
  x <- replace_ordinal(x)
  x <- replace_symbol(x)
  x <- tolower(x)
  return(x)
}
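A quick check of what these replacements do on a made-up review (the expected output is approximate; exact results depend on qdap's dictionaries):
# Illustrative input; the output should look roughly like:
# "i worked for two years and it is the first job i loved"
qdap_clean("I worked for 2 years & it's the 1st job I loved")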
# Create tm_clean()
tm_clean <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  # qdap_clean() has already lowercased the text, so the custom
  # stopwords must be lowercase to match
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "google", "amazon", "company"))
  return(corpus)
}
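A minimal sketch of tm_clean() on a toy corpus (the review text is invented for illustration):
# Toy corpus to illustrate the cleaning steps
toy_corp <- VCorpus(VectorSource("google pays well, but hours are long!"))
content(tm_clean(toy_corp)[[1]])
## roughly: "pays well  hours  long"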
# Alter amzn_pros
amzn_pros <- qdap_clean(amzn_pros)
# Alter amzn_cons
amzn_cons <- qdap_clean(amzn_cons)
# Create az_p_corp
az_p_corp <- VCorpus(VectorSource(amzn_pros))
# Create az_c_corp
az_c_corp <- VCorpus(VectorSource(amzn_cons))
#str(az_c_corp)
# Create amzn_pros_corp
amzn_pros_corp <- tm_clean(az_p_corp)
# Create amzn_cons_corp
amzn_cons_corp <- tm_clean(az_c_corp)
#str(amzn_cons_corp)
# Apply qdap_clean to goog_pros
goog_pros <- qdap_clean(goog_pros)
# Apply qdap_clean to goog_cons
goog_cons <- qdap_clean(goog_cons)
# Create goog_p_corp
goog_p_corp <- VCorpus(VectorSource(goog_pros))
# Create goog_c_corp
goog_c_corp <- VCorpus(VectorSource(goog_cons))
# Create goog_pros_corp
goog_pros_corp <- tm_clean(goog_p_corp)
# Create goog_cons_corp
goog_cons_corp <- tm_clean(goog_c_corp)
Create amzn_p_freq, a frequency vector that counts the number of appearances of each positive bigram.
# Unlike in DataCamp, NAs in AMZN appear to cause an error in this section,
# so NAs were removed after importing the AMZN data. See Step 3: Text organization.
# Create amzn_p_tdm
amzn_p_tdm <- TermDocumentMatrix(amzn_pros_corp,
                                 control = list(tokenize = tokenizer))
# Create amzn_p_tdm_m
amzn_p_tdm_m <- as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq <- rowSums(amzn_p_tdm_m)
Create amzn_c_freq, a frequency vector that counts the number of appearances of each negative bigram.
# Create amzn_c_tdm
amzn_c_tdm <- TermDocumentMatrix(amzn_cons_corp, control = list(tokenize = tokenizer))
# Create amzn_c_tdm_m
amzn_c_tdm_m <- as.matrix(amzn_c_tdm)
# Create amzn_c_freq
amzn_c_freq <- rowSums(amzn_c_tdm_m)
# Plot a wordcloud using amzn_p_freq values
wordcloud(names(amzn_p_freq), amzn_p_freq, max.words = 25, color = "blue")
# Plot a wordcloud of negative Amazon bigrams
wordcloud(names(amzn_c_freq), amzn_c_freq, max.words = 25, color = "red")
It seems there is a strong indication of long working hours and poor work-life balance in the reviews. As a simple clustering technique, you decide to perform a hierarchical cluster and create a dendrogram to see how connected these phrases are.
Note that the dendrogram uses amzn_c_tdm, a term-document matrix with bigrams (two-word tokens) in rows and documents (individual reviews) in columns.
# Create amzn_c_tdm2 by removing sparse terms: keep only bigrams whose
# sparsity is below 99.3%, i.e. that appear in more than ~0.7% of reviews
amzn_c_tdm2 <- removeSparseTerms(amzn_c_tdm, sparse = .993)
# Create hc as a cluster of distance values
hc <- hclust(dist(amzn_c_tdm2, method = "euclidean"), method = "complete")
# Produce a plot of hc
plot(hc)
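Beyond eyeballing the dendrogram, the tree can be cut into a fixed number of clusters to see which bigrams group together (the choice of k = 4 here is arbitrary, for illustration):
# Cut the dendrogram into k clusters and tabulate cluster sizes
clusters <- cutree(hc, k = 4)
table(clusters)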
Examine the top phrases that appeared in the word clouds. You hope to find associated terms using the findAssocs() function from tm. You want to check for something surprising now that you have learned of long hours and a lack of work-life balance.
# Create term_frequency
term_frequency <- sort(amzn_p_freq, decreasing = TRUE)
# Print the 5 most common terms
head(term_frequency, 5)
## good pay great benefits smart people place work fast paced
## 25 24 20 17 16
# Find associations with fast paced
findAssocs(amzn_p_tdm, "fast paced", 0.2)
## $`fast paced`
## paced environment environments ever learn fast
## 0.49 0.35 0.35
## paced friendly paced work able excel
## 0.35 0.35 0.25
## activity ample advance one also well
## 0.25 0.25 0.25
## amazon fast amazon noting amazon one
## 0.25 0.25 0.25
## amount time ample opportunity assistance ninety
## 0.25 0.25 0.25
## benefits including break computer call activity
## 0.25 0.25 0.25
## can choose catchy cheers center things
## 0.25 0.25 0.25
## challenging expect cheers opportunity choose success
## 0.25 0.25 0.25
## combined encouragement competitive environments computer room
## 0.25 0.25 0.25
## cool things deliver results dock makes
## 0.25 0.25 0.25
## driven deliver easy learn emphasis shipping
## 0.25 0.25 0.25
## encouragement innovation environment benefits environment catchy
## 0.25 0.25 0.25
## environment center environment fast environment help
## 0.25 0.25 0.25
## environment smart ever known ever witnessed
## 0.25 0.25 0.25
## everchanging fast everyones preferences excel advance
## 0.25 0.25 0.25
## excel everchanging exciting environment expect learn
## 0.25 0.25 0.25
## extremely fast facility top fail successful
## 0.25 0.25 0.25
## fantastic able fired part five percent
## 0.25 0.25 0.25
## freindly place friendly atmosphere friendly management
## 0.25 0.25 0.25
## full medical get fired go extremely
## 0.25 0.25 0.25
## great plenty great teamwork happening technology
## 0.25 0.25 0.25
## hassle benefits help get help workers
## 0.25 0.25 0.25
## high quality high volume including full
## 0.25 0.25 0.25
## innovation owning job requirements leader can
## 0.25 0.25 0.25
## line break lot responsibility maintaining high
## 0.25 0.25 0.25
## makes time management nice nice facility
## 0.25 0.25 0.25
## ninety five noting short offers opportunity
## 0.25 0.25 0.25
## one competitive one fast opportunity overtime
## 0.25 0.25 0.25
## opportunity yell ownership fast owning work
## 0.25 0.25 0.25
## paced emphasis paced exciting paced high
## 0.25 0.25 0.25
## paced never paced rewarding paced ship
## 0.25 0.25 0.25
## paced software paid upfront people focused
## 0.25 0.25 0.25
## percent paid plenty shifts position fast
## 0.25 0.25 0.25
## possible still preferences fast products quickly
## 0.25 0.25 0.25
## quality bar quickly possible readily available
## 0.25 0.25 0.25
## requirements easy responsibility ownership results great
## 0.25 0.25 0.25
## results team rewarding people shifts everyones
## 0.25 0.25 0.25
## ship dock shipping products short amount
## 0.25 0.25 0.25
## short fantastic smart coworkers still maintaining
## 0.25 0.25 0.25
## success fail successful also team driven
## 0.25 0.25 0.25
## technology today things happening things lot
## 0.25 0.25 0.25
## time fast time go top line
## 0.25 0.25 0.25
## upfront experience vision well volume call
## 0.25 0.25 0.25
## well rewarded well tuition witnessed combined
## 0.25 0.25 0.25
## work can work cool work environments
## 0.25 0.25 0.25
## work fast work job workers readily
## 0.25 0.25 0.25
## yell leader
## 0.25
Interpretation
For Amazon: the most frequent pro bigrams are good pay, great benefits, smart people, place work, and fast paced, and "fast paced" is associated mostly with phrases describing the work environment. Together with the cons word cloud, this suggests employees value the pay, benefits, and colleagues but face long hours and poor work-life balance.
Create a comparison.cloud() of Google’s positive and negative reviews for comparison to Amazon. This will give you a quick understanding of top terms without having to spend as much time as you did examining the Amazon reviews in the previous exercises.
# Create all_goog_pros
all_goog_pros <- paste(goog_pros, collapse = " ") # collapse all individual reviews into a single string
#str(all_goog_pros)
# Create all_goog_cons
all_goog_cons <- paste(goog_cons, collapse = " ")
#str(all_goog_cons)
# Create all_goog
all_goog <- c(all_goog_pros, all_goog_cons)
#str(all_goog)
# Convert to a vector source
all_goog <- VectorSource(all_goog)
#str(all_goog)
# Create all_goog_corpus
all_goog_corpus <- VCorpus(all_goog)
#str(all_goog_corpus)
# Create all_goog_corp
all_goog_corp <- tm_clean(all_goog_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_goog_corp)
# Name the columns of all_tdm
colnames(all_tdm) <- c("Goog_Pros", "Goog_Cons")
# Create all_m
all_m <- as.matrix(all_tdm)
# Build a comparison cloud
comparison.cloud(all_m, colors = c("#F44336", "#2196f3"), max.words = 100)
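As a complementary view, a commonality cloud (also from the wordcloud package) would show the terms shared by the pros and cons columns rather than the terms that distinguish them:
# Terms common to both Goog_Pros and Goog_Cons
commonality.cloud(all_m, max.words = 50, colors = "steelblue1")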
Interpretation
For Google:
Make a pyramid plot lining up positive reviews for Amazon and Google so you can clearly see the differences between any shared bigrams. The same text-preparation steps as before are repeated below.
# Create all_amzn_pros
all_amzn_pros <- paste(amzn_pros, collapse = " ")
#str(all_amzn_pros)
# Create all_goog_pros
all_goog_pros <- paste(goog_pros, collapse = " ") # collapse all individual reviews into a single string
#str(all_goog_pros)
# Create all
all <- c(all_amzn_pros, all_goog_pros)
#str(all)
# Convert to a vector source
all <- VectorSource(all)
#str(all)
# Create all_corpus
all_corpus <- VCorpus(all)
#str(all_corpus)
# Create all_corp
all_corp <- tm_clean(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_corp,
                              control = list(tokenize = tokenizer)) # bigram tokenizer
# Name the columns of all_tdm
colnames(all_tdm) <- c("Amzn", "Google")
# Create all_m
all_m <- as.matrix(all_tdm)
# Load package
library(plotrix)
# Create common_words
common_words <- subset(all_m, all_m[, 1] > 0 & all_m[, 2] > 0)
head(common_words)
## Docs
## Terms Amzn Google
## ability make 1 1
## able move 1 1
## affect millions 1 1
## always something 2 1
## amazing great 1 2
## amazing people 1 6
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
head(difference)
## ability make able move affect millions always something
## 0 0 0 1
## amazing great amazing people
## 1 5
# Add difference to common_words
common_words <- cbind(common_words, difference)
head(common_words)
## Amzn Google difference
## ability make 1 1 0
## able move 1 1 0
## affect millions 1 1 0
## always something 2 1 1
## amazing great 1 2 1
## amazing people 1 6 5
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[,3], decreasing = TRUE), ]
head(common_words)
## Amzn Google difference
## smart people 20 42 22
## good pay 26 9 17
## fast paced 16 2 14
## great culture 2 14 12
## people great 6 18 12
## work great 6 17 11
# Create top15_df
top15_df <- data.frame(x = common_words[1:15, 1],
                       y = common_words[1:15, 2],
                       labels = rownames(common_words[1:15, ]))
# Create the pyramid plot
pyramid.plot(top15_df$x, top15_df$y,
             labels = top15_df$labels, gap = 12,
             top.labels = c("Amzn", "Pro Words", "Google"),
             main = "Words in Common", unit = NULL)
## [1] 5.1 4.1 4.1 2.1
Interpretation
The plot shows the top 15 pro bigrams shared by both companies, ranked by the difference in the number of appearances between Amazon and Google.
# Create all_amzn_cons
all_amzn_cons <- paste(amzn_cons, collapse = " ")
#str(all_amzn_cons)
# Create all_goog_cons
all_goog_cons <- paste(goog_cons, collapse = " ") # collapse all individual reviews into a single string
#str(all_goog_cons)
# Create all
all <- c(all_amzn_cons, all_goog_cons)
#str(all)
# Convert to a vector source
all <- VectorSource(all)
#str(all)
# Create all_corpus
all_corpus <- VCorpus(all)
#str(all_corpus)
# Create all_corp
all_corp <- tm_clean(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_corp,
                              control = list(tokenize = tokenizer)) # bigram tokenizer
# Name the columns of all_tdm
colnames(all_tdm) <- c("Amzn", "Google")
# Create all_m
all_m <- as.matrix(all_tdm)
# Create common_words
common_words <- subset(all_m, all_m[, 1] > 0 & all_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
# Bind difference to common_words
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing = TRUE), ]
# Create top15_df
top15_df <- data.frame(x = common_words[1:15, 1],
                       y = common_words[1:15, 2],
                       labels = rownames(common_words[1:15, ]))
# Create the pyramid plot
pyramid.plot(top15_df$x, top15_df$y,
             labels = top15_df$labels, gap = 12,
             top.labels = c("Amzn", "Cons Words", "Google"),
             main = "Words in Common", unit = NULL)
## [1] 5.1 4.1 4.1 2.1
Interpretation
The plot shows the top 15 con bigrams shared by both companies, ranked by the difference in the number of appearances between Amazon and Google.
A few recommendations for Amazon’s HR department to remain competitive against Google in recruiting talent: