# data manipulation
library(dplyr)
# text-mining
library(qdap)
library(tm)
library(RWeka)
# visualization
library(ggplot2)
library(ggthemes)
library(viridisLite)
library(wordcloud)
library(plotrix)
library(dendextend)
Datasets are downloaded from:
https://assets.datacamp.com/production/course_935/datasets/500_amzn.csv
https://assets.datacamp.com/production/course_935/datasets/500_goog.csv
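They can be fetched once from R into the local Datasets/ folder used below (a minimal sketch):
# download the review files into a local Datasets/ folder (folder name matches the read.csv calls below)
dir.create("Datasets", showWarnings = FALSE)
download.file("https://assets.datacamp.com/production/course_935/datasets/500_amzn.csv",
              "Datasets/500_amzn.csv")
download.file("https://assets.datacamp.com/production/course_935/datasets/500_goog.csv",
              "Datasets/500_goog.csv")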
Import reviews (from the local files) as data frames, keep only the pros and cons columns, and remove NAs:
amzn <- read.csv("Datasets/500_amzn.csv", stringsAsFactors = FALSE) %>%
select(pros, cons) %>%
na.omit()
goog <- read.csv("Datasets/500_goog.csv", stringsAsFactors = FALSE) %>%
select(pros, cons) %>%
na.omit()
Make each set of reviews a vector:
amzn_pros <- amzn$pros # Create amzn_pros
amzn_cons <- amzn$cons # Create amzn_cons
goog_pros <- goog$pros # Create goog_pros
goog_cons <- goog$cons # Create goog_cons
To make a corpus, the data (a vector or data frame) must first be interpreted as documents:
VectorSource() interprets each element of the vector x as a document.
DataframeSource() interprets each row of the data frame x as a document. The first column must be named "doc_id" and contain a unique string identifier for each document. The second column must be named "text" and contain a UTF-8 encoded string representing the document’s content. Optional additional columns are used as document level metadata.
The tm package provides two kinds of corpora: the permanent corpus, PCorpus, and the volatile corpus, VCorpus. VCorpus() creates volatile corpora held in memory and is used here (a DataframeSource() sketch follows the corpus code below):
# two lines:
amzn_pros_source <- VectorSource(amzn_pros)
amzn_pros_corpus <- VCorpus(amzn_pros_source)
# or as one liners:
amzn_cons_corpus <- VCorpus(VectorSource(amzn_cons))
goog_pros_corpus <- VCorpus(VectorSource(goog_pros))
goog_cons_corpus <- VCorpus(VectorSource(goog_cons))
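As noted above, DataframeSource() needs a data frame with doc_id and text columns. It is not used in this analysis, but a minimal sketch built from amzn_pros (the doc_id values are made up) would be:
amzn_pros_df <- data.frame(
  doc_id = paste0("amzn_pro_", seq_along(amzn_pros)),  # unique string id per document
  text = amzn_pros,                                    # document content
  stringsAsFactors = FALSE
)
amzn_pros_df_corpus <- VCorpus(DataframeSource(amzn_pros_df))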
Bag of words: collapse all words from a vector into one string, then combine the strings:
# each vector contains two "bags": pros and cons
all_amzn <- c(paste(amzn_pros, collapse = " "), paste(amzn_cons, collapse = " "))
names(all_amzn) <- c("amzn_pros", "amzn_cons")
all_goog <- c(paste(goog_pros, collapse = " "), paste(goog_cons, collapse = " "))
names(all_goog) <- c("goog_pros", "goog_cons")
# bags can be combined together
reviews <- c(all_amzn, all_goog)
# make corpus
reviews_corpus <- VCorpus(VectorSource(reviews))
inspect(reviews_corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 4
##
## $amzn_pros
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 53667
##
## $amzn_cons
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 60053
##
## $goog_pros
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 45420
##
## $goog_cons
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 52513
Listing the most common words is a good way to initially inspect the bag of words:
# Find the 10 most frequent terms: term_count
term_count <- freq_terms(reviews_corpus, 10)
plot(term_count)
The graph above shows that the most common words are stop words. Removing them is essential for most of the analysis.
The qdap package offers text-cleaning functions that are applied to a text variable (a quick example follows the list):
- bracketX(): remove all text within brackets (e.g. "It's (so) cool" becomes "It's cool")
- replace_number(): replace numbers with their word equivalents (e.g. "2" becomes "two")
- replace_abbreviation(): replace abbreviations with their full-text equivalents (e.g. "Sr" becomes "Senior")
- replace_contraction(): convert contractions back to their base words (e.g. "shouldn't" becomes "should not")
- replace_symbol(): replace common symbols with their word equivalents (e.g. "$" becomes "dollar")
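For example, on a short made-up string (outputs follow the descriptions above):
x <- "It's (so) cool that I saved $100 in 2 years"
bracketX(x)             # drops the bracketed "(so)"
replace_number(x)       # numerals become words ("2" -> "two")
replace_symbol(x)       # "$" becomes "dollar"
replace_contraction(x)  # "It's" becomes "It is"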
These qdap functions can be wrapped into a custom qdap_clean() function:
qdap_clean <- function(x){
  x <- replace_abbreviation(x)
  x <- replace_contraction(x)
  x <- replace_number(x)
  x <- replace_ordinal(x)
  x <- replace_symbol(x)
  x <- tolower(x)
  return(x)
}
For compatibility with tm_map(), base R and qdap functions need to be wrapped in content_transformer():
corpus <- tm_map(corpus, content_transformer(replace_abbreviation))
The tm package offers another set of useful functions, e.g.:
- removePunctuation(): removes punctuation
- stripWhitespace(): removes extra white space
- removeWords(): removes specified words (e.g. a, an, the)
Use them with tm_map(). removeWords() can be used to remove stop words:
# remove default stopwords + custom "Google" and "Amazon"
stopWordsLib <- c(stopwords("en"), "Google", "Amazon")
corpus <- tm_map(corpus, removeWords, stopWordsLib)
Cleaning functions can be wrapped into a custom clean_text() function, which can also include qdap_clean(). NOTE THE ORDER of the cleaning functions, as it can matter (e.g. if tolower() runs first, the custom stop words "amazon" and "google" must be lowercase):
clean_text <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(qdap_clean))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "google", "amazon", "company"))
  return(corpus)
}
reviews_clean <- clean_text(reviews_corpus)
Again look at the most common words:
# Find the 10 most frequent terms
term_count <- freq_terms(reviews_clean, 10)
plot(term_count)
Make a term-document matrix (TDM):
- each row contains a term (word)
- each column contains a document
# Create a term-document matrix from the corpus
reviews_tdm <- TermDocumentMatrix(reviews_clean)
# Print reviews_tdm data
reviews_tdm
## <<TermDocumentMatrix (terms: 3714, documents: 4)>>
## Non-/sparse entries: 6240/8616
## Sparsity : 58%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Convert reviews_tdm to a matrix
reviews_m <- as.matrix(reviews_tdm)
colnames(reviews_m) <- names(reviews)
# random 10 words
reviews_m[sample(nrow(reviews_m), 10),]
## Docs
## Terms amzn_pros amzn_cons goog_pros goog_cons
## pond 0 0 0 1
## annoying 0 0 1 1
## getter 0 1 0 0
## lacks 0 0 0 1
## leaders 3 1 2 5
## engaged 0 1 1 2
## said 2 2 0 1
## leadership 8 19 6 10
## teammanagement 1 0 0 0
## welcome 1 0 0 1
DTM (document-term matrix) is the transpose of a TDM:
- each row contains a document
- each column contains a term (word)
Create one with DocumentTermMatrix():
reviews_dtm <- DocumentTermMatrix(reviews_clean)
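A quick sanity check of the transposition (the TDM above had 3714 terms and 4 documents):
dim(reviews_tdm)  # terms x documents
dim(reviews_dtm)  # documents x terms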
Word frequency in a single document can be visualized with a word cloud:
color_pal <- cividis(n = 10) # other interesting palettes: magma, plasma, inferno
wordcloud(rownames(reviews_m), reviews_m[,"amzn_pros"], max.words = 70, colors = color_pal)
Comparison across documents:
comparison.cloud(reviews_m[,c("amzn_pros", "goog_pros")], max.words = 100)
Commonality cloud: words shared across documents
commonality.cloud(reviews_m[,c("amzn_cons", "goog_cons")], max.words = 100)
top_df <- reviews_m[, c("amzn_cons", "goog_cons")] %>%
  # Convert to data frame
  as_data_frame(rownames = "word") %>%
  # Keep rows where the word appears in both documents
  filter_all(all_vars(. > 0)) %>%
  # Get difference in counts
  mutate(difference = amzn_cons - goog_cons) %>%
  # Keep rows with the biggest difference
  top_n(15, wt = difference) %>%
  # Arrange by descending difference
  arrange(desc(difference))
# library(plotrix) is loaded
pyramid.plot(
  # Amazon counts
  top_df$amzn_cons,
  # Google counts
  top_df$goog_cons,
  # Words
  labels = top_df$word,
  top.labels = c("Amazon", "Words", "Google"),
  main = "Words in Common",
  unit = NULL,
  gap = 40
)
## 158 158
## [1] 5.1 4.1 4.1 2.1
word_associate() finds words associated with a given word or phrase. Results can be output as a network graph and/or a word cloud. Note that word_associate() takes a text variable as input, so here the amzn_pros vector is used (cleaned with qdap_clean()):
amzn_pros_cleaned <- qdap_clean(amzn_pros)
word_associate(amzn_pros_cleaned, match.string = "balance",
               stopwords = c(stopwords("en"), Top200Words, "amazon"),
               wordcloud = TRUE, cloud.colors = c("gray55", "darkred")
)
## row group unit text
## 1 36 all 36 good work and life balance
## 2 284 all 284 great opportunities to work on far-reaching and impactful projects. definitely worth working there. lots to learn. could be worth the challenge to your work/life balance.
## 3 292 all 292 pay is great if you overlook the complete lack of work/life balance; opportunity for advancement within the company - good luck promoting out of this building. sr management will go behind your back to stop you from moving to another building
## 4 330 all 330 good location, work atmosphere, nice colleagues and team! i learned a lot here. good work and life balance.
## 5 431 all 431 many growth opportunities, work life balance, many fun activities and other benefits for employees, great career growth.
##
## Match Terms
## ===========
##
## List 1:
## balance
##
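word_associate() can also draw the association network instead of the word cloud; the sketch below assumes its network.plot argument with an otherwise identical call:
word_associate(amzn_pros_cleaned, match.string = "balance",
               stopwords = c(stopwords("en"), Top200Words, "amazon"),
               network.plot = TRUE)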
# create TDM on cleaned data
amzn_cons_tdm <- TermDocumentMatrix(clean_text(amzn_cons_corpus))
# remove sparse terms
amzn_cons_tdm_filtered <- removeSparseTerms(amzn_cons_tdm, sparse = 0.95)
amzn_cons_tdm_filtered
## <<TermDocumentMatrix (terms: 23, documents: 496)>>
## Non-/sparse entries: 1038/10370
## Sparsity : 91%
## Maximal term length: 10
## Weighting : term frequency (tf)
# h-cluster based on euclidean distance matrix
hc <- hclust(d = dist(amzn_cons_tdm_filtered, method = "euclidean"), method = "complete")
# Plot a dendrogram
plot(hc)
hclust() works on the distance matrix produced by dist(). The dendrogram aesthetics can be extended with the dendextend package:
# library(dendextend) is loaded
# Create hcd
hcd <- as.dendrogram(hc)
# Print the labels in hcd
labels(hcd)
## [1] "work" "will" "management" "get" "people"
## [6] "time" "employees" "balance" "life" "job"
## [11] "high" "like" "managers" "pay" "one"
## [16] "hard" "working" "lot" "many" "team"
## [21] "can" "hours" "long"
# Change the branch color to red
hcd <- branches_attr_by_labels(hcd, c("long", "hours"), color = "red")
# Plot hcd
plot(hcd)
# Add cluster rectangles
rect.dendrogram(hcd, k = 4, border = "grey50")
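To list which terms fall in each of the four clusters marked by the rectangles, cutree() on the hclust object is one option (a small sketch):
# cluster membership for each term (k matches the rectangles above)
clusters <- cutree(hc, k = 4)
split(names(clusters), clusters)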
Define a tokenizer to construct bigrams (n = 2), trigrams (n = 3), etc.:
# library(RWeka) is already loaded
tokenizer <- function(x) {
  n <- 2
  NGramTokenizer(x, Weka_control(min = n, max = n))
}
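For example, applied to a short made-up phrase:
# the bigram tokenizer on a toy phrase (not from the data)
tokenizer("very smart people and free food")
# yields the consecutive word pairs: "very smart", "smart people", "people and", ...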
The tokenizer can be passed to the TermDocumentMatrix() function via the control = list(tokenize = tokenizer) argument:
tdm <- TermDocumentMatrix(x, control = list(tokenize = tokenizer))
Convert the result to a matrix with as.matrix(), then compute term frequencies with rowSums() (for a TDM) or colSums() (for a DTM):
# clean
goog_pros_cleaned <- clean_text(goog_pros_corpus)
# DTM
goog_pros_dtm <- DocumentTermMatrix(goog_pros_cleaned,
                                    control = list(tokenize = tokenizer))
# make matrix
goog_pros_m <- as.matrix(goog_pros_dtm)
# calc freq
goog_pros_freq <- colSums(goog_pros_m)
# Plot a wordcloud
wordcloud(names(goog_pros_freq), goog_pros_freq, max.words = 20)
## Warning in wordcloud(names(goog_pros_freq), goog_pros_freq, max.words = 20):
## smart people could not be fit on page. It will not be plotted.
Another way to handle high-frequency words is TfIdf weighting (term frequency-inverse document frequency). This de-emphasises words that show up in a lot of documents; the idea is that such words are either common words or words that give little helpful information, like 'coffee' in a set of coffee tweets.
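As a rough sketch of the idea (a standard tf-idf computation on a toy matrix, not tm's exact weightTfIdf implementation):
# toy term-document matrix: "work" appears in every document, "pond" in only one
m <- matrix(c(3, 1, 2,
              4, 0, 0),
            nrow = 2, byrow = TRUE,
            dimnames = list(c("work", "pond"), c("d1", "d2", "d3")))
tf  <- t(t(m) / colSums(m))            # term frequency, normalized per document
idf <- log2(ncol(m) / rowSums(m > 0))  # inverse document frequency per term
tf * idf                               # "work" gets weight 0, "pond" keeps its weight
In tm, this weighting is requested through the control list: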
tdm <- TermDocumentMatrix(x, control = list(weighting = weightTfIdf))
# DTM weighted
goog_pros_dtm_weighted <- DocumentTermMatrix(goog_pros_cleaned,
                                             control = list(tokenize = tokenizer,
                                                            weighting = weightTfIdf))
## Warning in weighting(x): empty document(s): 190 240 243 377 446
# make matrix
goog_pros_weighted_m <- as.matrix(goog_pros_dtm_weighted)
# calc freq
goog_pros_weighted_freq <- colSums(goog_pros_weighted_m)
freq_comparison <- cbind(goog_pros_freq, goog_pros_weighted_freq = goog_pros_weighted_freq[names(goog_pros_freq)])
freq_comparison <- as.data.frame(freq_comparison) %>%
  tibble::rownames_to_column(var = "bi_word") %>%
  arrange(desc(goog_pros_freq))
head(freq_comparison, 10)
## bi_word goog_pros_freq goog_pros_weighted_freq
## 1 smart people 42 28.55821
## 2 free food 41 20.29911
## 3 place work 26 22.49935
## 4 great benefits 22 16.34358
## 5 great perks 20 17.29900
## 6 great work 18 27.60250
## 7 great people 16 16.22425
## 8 people great 16 12.72962
## 9 work environment 16 18.12284
## 10 great place 15 15.69811
# Plot a wordcloud
wordcloud(names(goog_pros_weighted_freq), goog_pros_weighted_freq, max.words = 20)
## Warning in wordcloud(names(goog_pros_weighted_freq), goog_pros_weighted_freq, :
## great perks could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(goog_pros_weighted_freq), goog_pros_weighted_freq, :
## great environment could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(goog_pros_weighted_freq), goog_pros_weighted_freq, :
## great place could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(goog_pros_weighted_freq), goog_pros_weighted_freq, :
## perks benefits could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(goog_pros_weighted_freq), goog_pros_weighted_freq, :
## benefits amazing could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(goog_pros_weighted_freq), goog_pros_weighted_freq, :
## great culture could not be fit on page. It will not be plotted.