Install the required packages
# Packages needed by this workflow: tm (corpus handling, which builds on NLP),
# SnowballC (stemming), RColorBrewer (palettes) and wordcloud (plots).
# The original used typographic quotes (“ ”), which R cannot parse, and never
# attached the packages, so later calls such as VectorSource() would fail.
required_pkgs <- c("NLP", "tm", "SnowballC", "RColorBrewer", "wordcloud")
for (pkg in required_pkgs) {
  # Install only when the package is not already available.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
Extract the column of comments
# Pull the free-text answers to question 1 out of the survey data and
# preview the first few entries.
comment1 <- data_music[["Q1..What.do.you.like.most.about.this.portable.music.player."]]
head(comment1)
## [1] "little, light"
## [2] "The battery power is great."
## [3] "cost and size"
## [4] "Having all my CDs in the palm of my hand!"
## [5] "The shuffle mode."
## [6] "Battery life. Portability. Accessories. Style."
Convert to a VCorpus. A VCorpus is a volatile corpus (held fully in memory), generally used when handling multiple documents for text processing.
# Treat every comment as its own document, then build the corpus from
# that source.
vector1 <- VectorSource(x = comment1)
corpus1 <- VCorpus(x = vector1)
Check documents 1 to 6 of the corpus
# Show the first element (the text content) of the first six documents.
for (i in seq_len(6)) {
  print(corpus1[[i]][1])
}
## $content
## [1] "little, light"
##
## $content
## [1] "The battery power is great."
##
## $content
## [1] "cost and size"
##
## $content
## [1] "Having all my CDs in the palm of my hand!"
##
## $content
## [1] "The shuffle mode."
##
## $content
## [1] "Battery life. Portability. Accessories. Style."
Convert upper-case text to lower case
# Lower-case every document. tolower() is wrapped in content_transformer()
# so that tm_map() applies it to each document's content.
corpus1 <- tm_map(corpus1, content_transformer(tolower))

# Preview the first six documents after the transformation.
for (i in seq_len(6)) {
  print(corpus1[[i]][1])
}
## $content
## [1] "little, light"
##
## $content
## [1] "the battery power is great."
##
## $content
## [1] "cost and size"
##
## $content
## [1] "having all my cds in the palm of my hand!"
##
## $content
## [1] "the shuffle mode."
##
## $content
## [1] "battery life. portability. accessories. style."
Remove numbers and the English stopwords
# Stop-word list: tm's English list plus 'the' and 'and'.
myStopwords <- c(stopwords("english"), "the", "and")

# Drop digits first, then the stop words.
corpus1 <- tm_map(corpus1, removeNumbers)
corpus1 <- tm_map(corpus1, removeWords, myStopwords)

# Preview the first six documents after the transformation.
for (i in seq_len(6)) {
  print(corpus1[[i]][1])
}
## $content
## [1] "little, light"
##
## $content
## [1] " battery power great."
##
## $content
## [1] "cost size"
##
## $content
## [1] " cds palm hand!"
##
## $content
## [1] " shuffle mode."
##
## $content
## [1] "battery life. portability. accessories. style."
Remove the punctuation
# Strip punctuation characters from every document.
corpus1 <- tm_map(corpus1, removePunctuation)

# Preview the first six documents after the transformation.
for (i in seq_len(6)) {
  print(corpus1[[i]][1])
}
## $content
## [1] "little light"
##
## $content
## [1] " battery power great"
##
## $content
## [1] "cost size"
##
## $content
## [1] " cds palm hand"
##
## $content
## [1] " shuffle mode"
##
## $content
## [1] "battery life portability accessories style"
Remove the whitespace
# Collapse runs of whitespace left behind by the removals above.
corpus1 <- tm_map(corpus1, stripWhitespace)

# Reduce every word to its stem. The output recorded below ("littl",
# "batteri", "accessori") and the stemmed term names in the document-term
# matrices show that stemming was part of the original run, but the call
# was missing from the script.
corpus1 <- tm_map(corpus1, stemDocument)

# Preview the first six documents after the transformation.
for (i in seq_len(6)) {
  print(corpus1[[i]][1])
}
## $content
## [1] "littl light"
##
## $content
## [1] "batteri power great"
##
## $content
## [1] "cost size"
##
## $content
## [1] "cds palm hand"
##
## $content
## [1] "shuffl mode"
##
## $content
## [1] "batteri life portabl accessori style"
Convert to document term matrix (frequency count)
# Build the document-term matrix with the default weighting and keep a
# dense matrix copy of it.
doc <- corpus1
dtm <- DocumentTermMatrix(x = doc)
m1 <- as.matrix(dtm)
weightTfIdf: weights a term-document matrix by term frequency–inverse document frequency (TF-IDF)
# Document-term matrix weighted by TF-IDF, with normalization switched off.
dtm2 <- DocumentTermMatrix(
  doc,
  control = list(
    weighting = function(x) weightTfIdf(x, normalize = FALSE)
  )
)
m2 <- as.matrix(dtm2)
Generate the binary DTM: whether each term appears in the document (yes/no)
# Document-term matrix with binary weighting (weightBin).
dtm3 <- DocumentTermMatrix(
  doc,
  control = list(weighting = weightBin)
)
m3 <- as.matrix(dtm3)
Count the frequency of each concept
# Column sums give one total per term for each of the three matrices.
freq <- colSums(as.matrix(dtm))    # default-weighted matrix
freq1 <- colSums(as.matrix(dtm2))  # TF-IDF-weighted matrix
freq2 <- colSums(as.matrix(dtm3))  # binary-weighted matrix

head(freq)
## abil abl access accessori activ actual
## 8 3 2 4 1 1
head(freq1)
## abil abl access accessori activ actual
## 46.835385 21.230447 15.323556 26.647112 8.661778 8.661778
head(freq2)
## abil abl access accessori activ actual
## 7 3 2 4 1 1
#### Count the number of concepts
# Each vector holds one entry per column of its document-term matrix, so
# its length is the number of terms in that matrix.
length(freq)
## [1] 522
length(freq1)
## [1] 522
length(freq2)
## [1] 522
Sort the frequency from highest frequency to lowest
# Order each per-term total from largest to smallest.
# NOTE(review): the variable `sort` has the same name as the base function
# sort(). The calls below still work because R skips non-function objects
# when looking up a function, but a distinct name would be clearer.
sort <- sort(freq, decreasing = TRUE)
sort1 <- sort(freq1, decreasing = TRUE)
sort2 <- sort(freq2, decreasing = TRUE)
# Show the six highest-ranked terms under each weighting.
head(sort)
## music can use small like easi
## 80 71 62 56 51 50
head(sort1)
## music can use like small easi
## 194.6368 187.3981 169.3245 166.7425 159.8477 152.3534
head(sort2)
## music can use small easi like
## 75 65 61 56 49 42
Save the DTMs as CSV files (which can be opened in Excel)
# Dense copies of the three document-term matrices. Note that m1 and m2 are
# re-bound here: m1 now holds the TF-IDF matrix and m2 the binary matrix.
m <- as.matrix(dtm)
m1 <- as.matrix(dtm2)
m2 <- as.matrix(dtm3)

# Rows are documents, columns are terms.
dim(m)
## [1] 405 522

# NOTE(review): setwd() with an absolute, user-specific path makes the
# script non-portable; confirm whether a relative output directory would do.
setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Text Analytic/WK 1/Workshops/Day 1")

# One CSV file per matrix, written into the working directory set above.
write.csv(x = m, file = "dtm1_Ng_Soo_Kuan.csv")
write.csv(x = m1, file = "dtm2_Ng_Soo_Kuan.csv")
write.csv(x = m2, file = "dtm3_Ng_Soo_Kuan.csv")
Generate a data frame of the concepts whose frequency is greater than 15
# Keep only the terms whose total is above 15, then pair each term with
# its total in a data frame for plotting.
subfreq <- freq[freq > 15]
diffreq <- data.frame(
  term = names(subfreq),
  fre = subfreq
)
Plot the frequency chart
library(ggplot2)

# Horizontal bar chart of term totals. Columns are referenced by bare name
# inside aes(): using `diffreq$term` there is discouraged by ggplot2 and
# produces axis labels such as "diffreq$term".
ggplot(diffreq, aes(x = term, y = fre)) +
  geom_bar(stat = "identity") +
  coord_flip()

Generate the WordCloud
# Word cloud of every term whose total is at least 10.
wordcloud(names(freq), freq, min.freq = 10)

# Six-colour "Dark2" palette shared by the clouds below.
dark2 <- brewer.pal(6, "Dark2")

# At most 30 words, sized by the totals of the default-weighted matrix.
wordcloud(names(freq), freq, max.words = 30, rot.per = 0.6, colors = dark2)

# At most 30 words, sized by the TF-IDF totals. The original passed `freq`
# here, so this cloud was drawn from the same data as the previous one.
wordcloud(names(freq1), freq1, max.words = 30, rot.per = 0.6, colors = dark2)

# At most 30 words, sized by the binary-weighted totals. The original
# passed `freq` here as well.
wordcloud(names(freq2), freq2, max.words = 30, rot.per = 0.6, colors = dark2)
