Text Mining My Resume #2

Install and Load R Packages

library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
library("syuzhet")
library("ggplot2")

Reading my Resume data in R

# Pick the resume text file through an interactive file dialog and read it
# into a character vector, one element per line
text <- readLines(con = file.choose())
# Wrap that character vector in a tm corpus for the cleaning steps that follow
TextDoc <- Corpus(x = VectorSource(x = text))

Clean up the text data in my resume

# Replace "/", "@" and "|" with a space
# Transformer that swaps every match of `pattern` for a single space
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})
# Apply it in turn to "/", "@" and "|" (the pipe is escaped because gsub()
# treats the pattern as a regular expression)
TextDoc <- Reduce(
  function(corpus, pattern) tm_map(corpus, toSpace, pattern),
  c("/", "@", "\\|"),
  TextDoc
)
# Lower-case all text
TextDoc <- tm_map(TextDoc, FUN = content_transformer(tolower))
# Drop digits
TextDoc <- tm_map(TextDoc, FUN = removeNumbers)
# Drop the standard English stop words
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
# Drop resume-specific stop words; extend this character vector as needed
TextDoc <- tm_map(TextDoc, removeWords, c("s", "company", "team"))
# Drop punctuation
TextDoc <- tm_map(TextDoc, FUN = removePunctuation)
# Collapse runs of whitespace
TextDoc <- tm_map(TextDoc, FUN = stripWhitespace)
# NOTE: stemming (reducing words to their root form) is NOT applied here;
# the term counts reported further down are for unstemmed words.

Build the Term Document matrix

# Term-document matrix built from the cleaned corpus (terms are the rows)
TextDoc_dtm <- TermDocumentMatrix(x = TextDoc)
dtm_m <- as.matrix(x = TextDoc_dtm)
# Total count of each term across all documents, sorted in decreasing order
dtm_v <- sort(x = rowSums(x = dtm_m), decreasing = TRUE)
dtm_d <- data.frame(word = names(dtm_v), freq = dtm_v)
# Display the 30 most frequent words
head(x = dtm_d, n = 30)

##                    word freq
## data               data   11
## management   management    7
## program         program    7
## project         project    6
## google           google    5
## health           health    5
## analytics     analytics    5
## training       training    4
## wide               wide    4
## science         science    4
## medical         medical    4
## software       software    4
## operational operational    3
## technical     technical    3
## compliance   compliance    3
## improvement improvement    3
## standards     standards    3
## public           public    3
## staff             staff    3
## tracking       tracking    3
## managed         managed    3
## october         october    3
## coordinator coordinator    3
## information information    3
## sales             sales    3
## infusion       infusion    3
## coaching       coaching    2
## driven           driven    2
## education     education    2
## experience   experience    2

Plotting the most frequent words

# Bar chart of the first five rows of dtm_d, i.e. the five most frequent
# words, with axis labels drawn perpendicular to the axis (las = 2)
barplot(
  height = dtm_d[["freq"]][1:5],
  names.arg = dtm_d[["word"]][1:5],
  las = 2,
  col = "lightgreen",
  main = "Top 5 most frequent words",
  ylab = "Word frequencies"
)

Generating a word cloud

# Seed the random number generator before drawing so repeated runs produce
# the same word cloud
set.seed(1234)
# Word cloud of the term frequencies: minimum frequency 4, at most 100 words,
# 40% of words rotated, coloured with the 8-colour "Dark2" palette
wordcloud(
  words = dtm_d[["word"]],
  freq = dtm_d[["freq"]],
  min.freq = 4,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.40,
  colors = brewer.pal(n = 8, name = "Dark2")
)

Correlation - Word Association

# Terms whose correlation with "good", "work" or "health" is at least 0.25
findAssocs(
  x = TextDoc_dtm,
  terms = c("good", "work", "health"),
  corlimit = 0.25
)

## $good
## numeric(0)
## 
## $work
## numeric(0)
## 
## $health
##        public   information       tarrant accreditation    associated 
##          0.49          0.49          0.43          0.43          0.43 
##      creating      detailed  facilitating      meetings     obtaining 
##          0.43          0.43          0.43          0.43          0.43 
##      schedule         tasks     associate       college        collin 
##          0.43          0.43          0.43          0.43          0.43 
##         plano       alabama      bachelor      sciences    tuscaloosa 
##          0.43          0.43          0.43          0.43          0.43 
##         ahima    foundation    registered          rhit       science 
##          0.43          0.43          0.43          0.43          0.41 
##      complete    technician        county    university 
##          0.28          0.28          0.28          0.28

Word Association for the most frequent terms (at least 10 occurrences)

# Associations (correlation of at least 0.25) for every term that occurs
# at least 10 times
findAssocs(
  x = TextDoc_dtm,
  terms = findFreqTerms(x = TextDoc_dtm, lowfreq = 10),
  corlimit = 0.25
)

## $data
##  professional     extracted communication     continued     dedicated 
##          0.53          0.53          0.51          0.51          0.51 
##  demonstrated        driven  improvements     integrity     rehearsed 
##          0.51          0.51          0.51          0.51          0.51 
##  stakeholders       success       variety          vast     generated 
##          0.51          0.51          0.51          0.51          0.51 
##      internet    meaningful     platforms         sales        sorted 
##          0.51          0.51          0.51          0.51          0.51 
##      strategy     supported       traffic      coaching     education 
##          0.51          0.51          0.51          0.33          0.33 
##    experience        matter proficiencies       subject      training 
##          0.33          0.33          0.33          0.33          0.33 
##          wide        states        united 
##          0.33          0.33          0.33

Sentiment Scores

# Sentiment score for each element of `text`, using the "syuzhet" method.
# Note that different methods may use different scales.
syuzhet_vector <- get_sentiment(char_v = text, method = "syuzhet")
# Peek at the first six scores
head(x = syuzhet_vector)

## [1]  0.00  4.40  0.00  0.00 -0.15  0.40

# Min, quartiles, median, mean and max of the syuzhet scores
summary(object = syuzhet_vector)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.1500  0.0000  0.0000  0.4621  0.8000  4.4000

# Same scoring with the "bing" method, then the first six scores
bing_vector <- get_sentiment(char_v = text, method = "bing")
head(x = bing_vector)

## [1] 0 4 0 0 0 0

# Min, quartiles, median, mean and max of the bing scores
summary(object = bing_vector)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2576  0.0000  4.0000

# Same scoring with the "afinn" method, then the first six scores
afinn_vector <- get_sentiment(char_v = text, method = "afinn")
head(x = afinn_vector)

## [1] 0 7 0 0 0 0

# Min, quartiles, median, mean and max of the afinn scores
summary(object = afinn_vector)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.7121  0.0000  8.0000

# Compare the methods on the first six scores: take the sign (-1, 0 or 1) of
# each score and stack the results, one row per method in the order
# syuzhet, bing, afinn
do.call(
  rbind,
  lapply(
    list(syuzhet_vector, bing_vector, afinn_vector),
    function(scores) sign(head(scores))
  )
)

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    0    1    0    0   -1    1
## [2,]    0    1    0    0    0    0
## [3,]    0    1    0    0    0    0

Emotion Classification

# Run the NRC sentiment analysis on `text`. The result is a data frame with
# one count column per emotion:
#   anger, anticipation, disgust, fear, joy, sadness, surprise, trust
# plus two further count columns, negative and positive.
d <- get_nrc_sentiment(char_v = text)

## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## Please use `spread()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Show the first 10 rows of the get_nrc_sentiment data frame
head(x = d, n = 10)

##    anger anticipation disgust fear joy sadness surprise trust negative positive
## 1      0            0       0    0   0       0        0     0        0        0
## 2      0            1       0    0   1       0        0     4        1        4
## 3      0            0       0    0   0       0        0     0        0        0
## 4      0            0       0    0   0       0        0     0        0        0
## 5      0            0       0    1   0       0        0     1        1        1
## 6      0            0       0    0   0       0        0     1        0        1
## 7      0            0       0    0   0       0        0     1        0        1
## 8      0            0       0    0   0       0        0     0        0        0
## 9      0            0       0    0   1       0        0     1        0        1
## 10     0            0       0    0   0       0        0     0        0        0

Visualize the emotion findings

# Plot two - share of emotion-word counts per emotion, as a percentage.
# The eight emotion columns are selected by name so the "negative" and
# "positive" columns of `d` are excluded regardless of column order.
emotion_cols <- c("anger", "anticipation", "disgust", "fear",
                  "joy", "sadness", "surprise", "trust")
emotion_totals <- colSums(d[, emotion_cols])
emotion_sum <- sum(emotion_totals)
# Scale to 0-100 so the values match the "Percentage" axis label (the
# unscaled proportions only range over 0-1). If no emotion words were found,
# keep the all-zero totals instead of dividing by zero.
emotion_pct <- if (emotion_sum > 0) {
  100 * emotion_totals / emotion_sum
} else {
  emotion_totals
}
barplot(
  sort(emotion_pct),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Text", xlab = "Percentage"
)

Text Mining My Resume #2

Colleen

5/09/2022