Text Analytics and Visualization with WordCloud

Read the data file

# Point the session at the workshop folder, then load the survey responses.
# NOTE(review): the absolute path is machine-specific; adjust before running elsewhere.
setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Text Analytic/WK 1/Workshops/Day 1")
# `stringsAsFactors` is spelled out in full instead of relying on partial
# argument matching; the free-text answers must stay character, not factor.
data_music <- read.csv("music_survey.csv", stringsAsFactors = FALSE)

Install the required packages

# One-time setup: install every package this analysis needs that is not yet
# present. Package names must be wrapped in straight quotes; typographic
# (curly) quotes are a syntax error in R.
# ggplot2 is listed because the frequency chart further down loads it.
required_pkgs <- c("NLP", "tm", "SnowballC", "RColorBrewer", "wordcloud", "ggplot2")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

Load the libraries

library(NLP)
library(tm)
library(SnowballC)
library(RColorBrewer)
library(wordcloud)

Extract the comment column

# Pull out the free-text answers to survey question 1 (one string per respondent).
comment1 <- data_music[["Q1..What.do.you.like.most.about.this.portable.music.player."]]
# Preview the first six answers.
head(comment1, n = 6L)

## [1] "little, light"                                 
## [2] "The battery power is great."                   
## [3] "cost and size"                                 
## [4] "Having all my CDs in the palm of my hand!"     
## [5] "The shuffle mode."                             
## [6] "Battery life. Portability. Accessories. Style."

Convert to a VCorpus. A VCorpus is generally used to create a volatile (in-memory) corpus object when handling multiple documents for text processing.

# Treat each element of the comment vector as one document, then build the
# volatile corpus from that source.
vector1 <- VectorSource(x = comment1)
corpus1 <- VCorpus(x = vector1)

Check documents 1 to 6 of the corpus

# Print the content of the first six documents. The upper bound is capped at
# the corpus size so that a corpus with fewer than six documents does not fail
# with a "subscript out of bounds" error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "little, light"
## 
## $content
## [1] "The battery power is great."
## 
## $content
## [1] "cost and size"
## 
## $content
## [1] "Having all my CDs in the palm of my hand!"
## 
## $content
## [1] "The shuffle mode."
## 
## $content
## [1] "Battery life. Portability. Accessories. Style."

Convert upper-case text to lower case

# Lower-case every document. tolower() is a plain character function, so it is
# wrapped in content_transformer() before being handed to tm_map().
corpus1 <- tm_map(corpus1, content_transformer(tolower))
# Preview up to six documents; the bound is capped at the corpus size so a
# short corpus does not trigger a "subscript out of bounds" error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "little, light"
## 
## $content
## [1] "the battery power is great."
## 
## $content
## [1] "cost and size"
## 
## $content
## [1] "having all my cds in the palm of my hand!"
## 
## $content
## [1] "the shuffle mode."
## 
## $content
## [1] "battery life. portability. accessories. style."

Remove numbers and English stopwords

# Strip digits from every document.
corpus1 <- tm_map(corpus1, removeNumbers)
# Stopword list: tm's English list plus two explicit extras.
myStopwords <- c(stopwords("english"), "the", "and")
corpus1 <- tm_map(corpus1, removeWords, myStopwords)
# Preview up to six documents; the bound is capped at the corpus size so a
# short corpus does not trigger a "subscript out of bounds" error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "little, light"
## 
## $content
## [1] " battery power  great."
## 
## $content
## [1] "cost  size"
## 
## $content
## [1] "   cds   palm   hand!"
## 
## $content
## [1] " shuffle mode."
## 
## $content
## [1] "battery life. portability. accessories. style."

Remove the punctuation

# Remove punctuation characters from every document.
corpus1 <- tm_map(corpus1, removePunctuation)

# Preview up to six documents; the bound is capped at the corpus size so a
# short corpus does not trigger a "subscript out of bounds" error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "little light"
## 
## $content
## [1] " battery power  great"
## 
## $content
## [1] "cost  size"
## 
## $content
## [1] "   cds   palm   hand"
## 
## $content
## [1] " shuffle mode"
## 
## $content
## [1] "battery life portability accessories style"

Perform stemming

# Reduce every word to its stem (e.g. "battery" becomes "batteri").
corpus1 <- tm_map(corpus1, stemDocument)
# Preview up to six documents; the bound is capped at the corpus size so a
# short corpus does not trigger a "subscript out of bounds" error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "littl light"
## 
## $content
## [1] "batteri power great"
## 
## $content
## [1] "cost size"
## 
## $content
## [1] "cds palm hand"
## 
## $content
## [1] "shuffl mode"
## 
## $content
## [1] "batteri life portabl accessori style"

Remove the whitespace

# Collapse the runs of spaces left behind by the earlier removal steps.
corpus1 <- tm_map(corpus1, stripWhitespace)
# Preview up to six documents; the bound is capped at the corpus size so a
# short corpus does not trigger a "subscript out of bounds" error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "littl light"
## 
## $content
## [1] "batteri power great"
## 
## $content
## [1] "cost size"
## 
## $content
## [1] "cds palm hand"
## 
## $content
## [1] "shuffl mode"
## 
## $content
## [1] "batteri life portabl accessori style"

Convert to document term matrix (frequency count)

# Build the document-term matrix of term counts from the cleaned corpus,
# then expand it into an ordinary matrix (documents in rows, terms in columns).
doc <- corpus1
dtm <- DocumentTermMatrix(x = doc)
m1 <- as.matrix(dtm)

Generate a TF-IDF-weighted DTM. `weightTfIdf` weights a term-document matrix by term frequency - inverse document frequency (TF-IDF).

# Same corpus, but each cell is weighted by TF-IDF without normalisation.
dtm2 <- DocumentTermMatrix(
  doc,
  control = list(
    weighting = function(x) {
      weightTfIdf(x, normalize = FALSE)
    }
  )
)
m2 <- as.matrix(dtm2)

Generate a binary DTM - whether each term appears in the document (Yes/No)

# Same corpus with binary weighting: a cell records presence, not a count.
dtm3 <- DocumentTermMatrix(
  doc,
  control = list(weighting = weightBin)
)
m3 <- as.matrix(dtm3)

Count the frequency of each concept

# Column sums give one total per term across all documents.
# Raw counts: how many times each term occurs in the whole corpus.
freq <- colSums(x = as.matrix(dtm))
head(freq, n = 6L)

##      abil       abl    access accessori     activ    actual 
##         8         3         2         4         1         1

# TF-IDF weights: summed weight of each term.
freq1 <- colSums(x = as.matrix(dtm2))
head(freq1, n = 6L)

##      abil       abl    access accessori     activ    actual 
## 46.835385 21.230447 15.323556 26.647112  8.661778  8.661778

# Binary weights: number of documents that contain each term.
freq2 <- colSums(x = as.matrix(dtm3))
head(freq2, n = 6L)

##      abil       abl    access accessori     activ    actual 
##         7         3         2         4         1         1

#### Count the concepts: number of distinct terms in each frequency vector
# Raw-count vector
length(freq)

## [1] 522

# TF-IDF vector
length(freq1)

## [1] 522

# Binary (document-presence) vector
length(freq2)

## [1] 522

Sort the frequencies from highest to lowest

# Order each frequency vector from the largest total to the smallest.
# NOTE(review): the first result is stored in a variable called `sort`, which
# shadows the name of base::sort(). Calls to sort() keep working because R
# skips non-function objects when resolving a call, but a more descriptive
# name would be clearer.
sort <- sort(x = freq, decreasing = TRUE)
sort1 <- sort(x = freq1, decreasing = TRUE)
sort2 <- sort(x = freq2, decreasing = TRUE)
# Top six terms by raw count.
head(sort, n = 6L)

## music   can   use small  like  easi 
##    80    71    62    56    51    50

# Top six terms by TF-IDF weight.
head(sort1, n = 6L)

##    music      can      use     like    small     easi 
## 194.6368 187.3981 169.3245 166.7425 159.8477 152.3534

# Top six terms by number of documents containing them.
head(sort2, n = 6L)

## music   can   use small  easi  like 
##    75    65    61    56    49    42

Save the DTMs as CSV files (which can be opened in Excel)

# Expand the three document-term matrices into ordinary matrices for export.
m <- as.matrix(dtm)
# Rows are documents, columns are terms.
dim(m)

## [1] 405 522

# NOTE: m1 and m2 are reassigned here; from this point on they hold the TF-IDF
# and binary matrices respectively.
m1 <- as.matrix(dtm2)
m2 <- as.matrix(dtm3)
# Write to explicit paths inside the workshop folder instead of calling
# setwd() a second time, so the export does not depend on, or change, the
# session's working directory.
# NOTE(review): the absolute path is machine-specific; adjust before running elsewhere.
out_dir <- "C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Text Analytic/WK 1/Workshops/Day 1"
write.csv(m, file = file.path(out_dir, "dtm1_Ng_Soo_Kuan.csv"))
write.csv(m1, file = file.path(out_dir, "dtm2_Ng_Soo_Kuan.csv"))
write.csv(m2, file = file.path(out_dir, "dtm3_Ng_Soo_Kuan.csv"))

Generate a data frame of concepts with frequency > 15

# Keep only the terms whose total count exceeds 15; which() drops any NA
# comparison, matching what subset() does.
subfreq <- freq[which(freq > 15)]
# One row per retained term: its name and its count.
diffreq <- data.frame(
  term = names(subfreq),
  fre = subfreq
)

Plot the frequency chart

library(ggplot2)
# Horizontal bar chart of the frequent terms. Columns are referenced by bare
# name inside aes(): ggplot2 looks them up in the `data` argument, whereas
# `diffreq$term` inside aes() is discouraged and yields awkward axis labels.
ggplot(diffreq, aes(x = term, y = fre)) +
  geom_bar(stat = "identity") +
  coord_flip()

Generate the WordCloud

# Word cloud of every term that occurs at least 10 times (raw counts).
wordcloud(names(freq), freq, min.freq = 10)

# Six-colour "Dark2" palette shared by the clouds below.
dark2 <- brewer.pal(6, "Dark2")
# Top 30 terms by raw count.
wordcloud(names(freq), freq, max.words = 30, rot.per = 0.6, colors = dark2)

# Top 30 terms by TF-IDF weight. The weights must come from freq1; passing
# `freq` here would simply redraw the raw-count cloud.
wordcloud(names(freq1), freq1, max.words = 30, rot.per = 0.6, colors = dark2)

# Top 30 terms by document presence. The weights must come from freq2 for the
# same reason.
wordcloud(names(freq2), freq2, max.words = 30, rot.per = 0.6, colors = dark2)