Install and Load R Packages
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
library("syuzhet")
library("ggplot2")
Reading my Resume data in R
# Choose the resume text file through an interactive dialog, then read it
# line by line into a character vector
resume_path <- file.choose()
text <- readLines(resume_path)
# Wrap the lines in a tm corpus; VectorSource() treats each element of the
# vector (here: each line of the file) as a separate document
TextDoc <- Corpus(VectorSource(text))
Clean up the text data in my resume
# Transformer that substitutes every match of `pattern` with a single blank
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Turn "/", "@" and "|" into spaces ("|" is escaped because it is a regex
# metacharacter)
for (separator in c("/", "@", "\\|")) {
  TextDoc <- tm_map(TextDoc, toSpace, separator)
}
# Lower-case all text
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
# Strip digits
TextDoc <- tm_map(TextDoc, removeNumbers)
# Drop the common English stop words
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
# Drop additional, document-specific stop words
# (edit this character vector to supply your own)
custom_stopwords <- c("s", "company", "team")
TextDoc <- tm_map(TextDoc, removeWords, custom_stopwords)
# Strip punctuation
TextDoc <- tm_map(TextDoc, removePunctuation)
# Collapse runs of white space
TextDoc <- tm_map(TextDoc, stripWhitespace)
# Text stemming (reducing words to their root form) is not applied in this script;
# to enable it, add: TextDoc <- tm_map(TextDoc, stemDocument)
Build the Term Document matrix
# Build the term-document matrix and convert it to a plain matrix
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
dtm_m <- as.matrix(TextDoc_dtm)
# Total count of each term across all documents, sorted by decreasing frequency
term_totals <- rowSums(dtm_m)
dtm_v <- sort(term_totals, decreasing = TRUE)
# One row per term: the word itself and its total frequency
dtm_d <- data.frame(word = names(dtm_v), freq = dtm_v)
# Display the 30 most frequent words
head(dtm_d, n = 30)
## word freq
## data data 11
## management management 7
## program program 7
## project project 6
## google google 5
## health health 5
## analytics analytics 5
## training training 4
## wide wide 4
## science science 4
## medical medical 4
## software software 4
## operational operational 3
## technical technical 3
## compliance compliance 3
## improvement improvement 3
## standards standards 3
## public public 3
## staff staff 3
## tracking tracking 3
## managed managed 3
## october october 3
## coordinator coordinator 3
## information information 3
## sales sales 3
## infusion infusion 3
## coaching coaching 2
## driven driven 2
## education education 2
## experience experience 2
Plotting the most frequent words
# Bar chart of the five most frequent words (dtm_d is already sorted by
# decreasing frequency, so these are its first five rows)
top_words <- dtm_d[1:5, ]
barplot(
  top_words$freq,
  names.arg = top_words$word,
  las = 2,
  col = "lightgreen",
  main = "Top 5 most frequent words",
  ylab = "Word frequencies"
)
Generating a word cloud
# Generate the word cloud; the RNG seed is set first (presumably so that the
# cloud layout is repeatable between runs)
set.seed(1234)
cloud_palette <- brewer.pal(8, "Dark2")
wordcloud(
  words = dtm_d$word,
  freq = dtm_d$freq,
  min.freq = 4,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.40,
  colors = cloud_palette
)
Correlation - Word Association
# For each query word, list the terms whose correlation with it is at
# least 0.25
query_terms <- c("good", "work", "health")
findAssocs(TextDoc_dtm, terms = query_terms, corlimit = 0.25)
## $good
## numeric(0)
##
## $work
## numeric(0)
##
## $health
## public information tarrant accreditation associated
## 0.49 0.49 0.43 0.43 0.43
## creating detailed facilitating meetings obtaining
## 0.43 0.43 0.43 0.43 0.43
## schedule tasks associate college collin
## 0.43 0.43 0.43 0.43 0.43
## plano alabama bachelor sciences tuscaloosa
## 0.43 0.43 0.43 0.43 0.43
## ahima foundation registered rhit science
## 0.43 0.43 0.43 0.43 0.41
## complete technician county university
## 0.28 0.28 0.28 0.28
Word Association for terms occurring at least 10 times
# Find associations (correlation of at least 0.25) for every term that
# occurs at least 10 times
frequent_terms <- findFreqTerms(TextDoc_dtm, lowfreq = 10)
findAssocs(TextDoc_dtm, terms = frequent_terms, corlimit = 0.25)
## $data
## professional extracted communication continued dedicated
## 0.53 0.53 0.51 0.51 0.51
## demonstrated driven improvements integrity rehearsed
## 0.51 0.51 0.51 0.51 0.51
## stakeholders success variety vast generated
## 0.51 0.51 0.51 0.51 0.51
## internet meaningful platforms sales sorted
## 0.51 0.51 0.51 0.51 0.51
## strategy supported traffic coaching education
## 0.51 0.51 0.51 0.33 0.33
## experience matter proficiencies subject training
## 0.33 0.33 0.33 0.33 0.33
## wide states united
## 0.33 0.33 0.33
Sentiment Scores
# Sentiment score per line of the resume, using get_sentiment() with the
# "syuzhet" method; note that different methods may use different scales
syuzhet_vector <- get_sentiment(text, method = "syuzhet")
# Show the first six scores
head(syuzhet_vector, n = 6)
## [1] 0.00 4.40 0.00 0.00 -0.15 0.40
# Summary statistics of all scores
summary(syuzhet_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1500 0.0000 0.0000 0.4621 0.8000 4.4000
# Same scoring with the "bing" method: first six scores, then a summary
bing_vector <- get_sentiment(text, method = "bing")
head(bing_vector, n = 6)
## [1] 0 4 0 0 0 0
summary(bing_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2576 0.0000 4.0000
# Same scoring with the "afinn" method: first six scores, then a summary
afinn_vector <- get_sentiment(text, method = "afinn")
head(afinn_vector, n = 6)
## [1] 0 7 0 0 0 0
summary(afinn_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7121 0.0000 8.0000
# Compare the three methods on the first six scores by looking only at the
# sign of each score (-1, 0 or 1); one row per method, in the order
# syuzhet, bing, afinn
sign(
  rbind(
    head(syuzhet_vector),
    head(bing_vector),
    head(afinn_vector)
  )
)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0 1 0 0 -1 1
## [2,] 0 1 0 0 0 0
## [3,] 0 1 0 0 0 0
Emotion Classification
# Run the NRC sentiment analysis. Instead of a single score it returns a data
# frame with one row per input line and one count column per emotion:
# anger, anticipation, disgust, fear, joy, sadness, surprise, trust
# plus two further columns counting negative and positive sentiment
d <- get_nrc_sentiment(text)
## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## Please use `spread()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Show the first 10 rows of the get_nrc_sentiment data frame
head(d, n = 10)
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 1 0 0 4 1 4
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 1 0 0 0 1 1 1
## 6 0 0 0 0 0 0 0 1 0 1
## 7 0 0 0 0 0 0 0 1 0 1
## 8 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 1 0 0 1 0 1
## 10 0 0 0 0 0 0 0 0 0 0
Visualize the emotion findings
# Share of emotion-word counts per emotion, expressed as a percentage.
# prop.table() divides every cell of the eight emotion columns by their grand
# total, so the column sums are fractions of 1; they are scaled by 100 so that
# the plotted values agree with the "Percentage" axis label.
emotion_share <- colSums(prop.table(d[, 1:8])) * 100
barplot(
  sort(emotion_share),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Text", xlab = "Percentage"
)