http://handsondatascience.com/TextMiningO.pdf
Set the working directory to the folder that contains the file “music_survey.csv”.
Read the data in and name it data_music.
# Read the survey responses. The argument is spelled out in full
# (`stringsAsFactors`) instead of relying on partial argument matching, so
# text columns are kept as character vectors rather than converted to factors.
data_music <- read.csv("music_survey.csv", stringsAsFactors = FALSE)
View your data
# Open the data frame read above in R's interactive spreadsheet-style viewer.
View(data_music)
Install and load library
#install.packages("NLP")
#install.packages("tm")
library(NLP)
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
Read Question 1 into a new variable called comment1. Print the first 6 comments from comment1.
# Pull the free-text answers to survey question 1 out of the data frame.
comment1 <- data_music[["Q1..What.do.you.like.most.about.this.portable.music.player."]]
# Show the first six answers.
head(comment1, n = 6L)
## [1] "little, light"
## [2] "The battery power is great."
## [3] "cost and size"
## [4] "Having all my CDs in the palm of my hand!"
## [5] "The shuffle mode."
## [6] "Battery life. Portability. Accessories. Style."
Create a corpus, which can be processed by the tm package. Print the first 6 comments.
# Wrap the character vector of comments as a tm source (one document per
# element) and build an in-memory corpus from it.
vector1 <- VectorSource(comment1)
corpus1 <- VCorpus(vector1)
# Preview up to the first six documents. Bounding the index by the corpus
# length keeps the loop from failing with "subscript out of bounds" when the
# corpus holds fewer than six documents.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "little, light"
##
## $content
## [1] "The battery power is great."
##
## $content
## [1] "cost and size"
##
## $content
## [1] "Having all my CDs in the palm of my hand!"
##
## $content
## [1] "The shuffle mode."
##
## $content
## [1] "Battery life. Portability. Accessories. Style."
Remove punctuation
# Strip punctuation characters from every document in the corpus.
corpus1 <- tm_map(corpus1, removePunctuation)
# Preview up to the first six documents. Bounding the index by the corpus
# length keeps the loop from failing with "subscript out of bounds" when the
# corpus holds fewer than six documents.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "little light"
##
## $content
## [1] "The battery power is great"
##
## $content
## [1] "cost and size"
##
## $content
## [1] "Having all my CDs in the palm of my hand"
##
## $content
## [1] "The shuffle mode"
##
## $content
## [1] "Battery life Portability Accessories Style"
Convert all letters to lower case
# Lower-case every document. `tolower` is a plain base-R function, so it is
# wrapped in content_transformer() to apply it to each document's content.
corpus1 <- tm_map(corpus1, content_transformer(tolower))
# Preview up to the first six documents. Bounding the index by the corpus
# length keeps the loop from failing with "subscript out of bounds" when the
# corpus holds fewer than six documents.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "little light"
##
## $content
## [1] "the battery power is great"
##
## $content
## [1] "cost and size"
##
## $content
## [1] "having all my cds in the palm of my hand"
##
## $content
## [1] "the shuffle mode"
##
## $content
## [1] "battery life portability accessories style"
Remove numbers and stopwords
# Drop digits from every document.
corpus1 <- tm_map(corpus1, removeNumbers)
# Stop-word list: tm's built-in English list plus two explicit additions.
# NOTE(review): "the" and "and" look like they are already part of the
# built-in English list, so the additions are probably redundant but harmless.
myStopwords <- c(stopwords("english"), "the", "and")
# Remove every stop word from every document.
corpus1 <- tm_map(corpus1, removeWords, myStopwords)
# Preview up to the first six documents. Bounding the index by the corpus
# length keeps the loop from failing with "subscript out of bounds" when the
# corpus holds fewer than six documents.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "little light"
##
## $content
## [1] " battery power great"
##
## $content
## [1] "cost size"
##
## $content
## [1] " cds palm hand"
##
## $content
## [1] " shuffle mode"
##
## $content
## [1] "battery life portability accessories style"
Install and load library
#install.packages("SnowballC")
library(SnowballC)
Word stemming
# Reduce every word in every document to its stem (e.g. "battery" becomes
# "batteri"); relies on the SnowballC package being installed.
corpus1 <- tm_map(corpus1, stemDocument)
# Preview up to the first six documents. Bounding the index by the corpus
# length keeps the loop from failing with "subscript out of bounds" when the
# corpus holds fewer than six documents.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "littl light"
##
## $content
## [1] "batteri power great"
##
## $content
## [1] "cost size"
##
## $content
## [1] "cds palm hand"
##
## $content
## [1] "shuffl mode"
##
## $content
## [1] "batteri life portabl accessori style"
Remove extra whitespace
# Collapse runs of whitespace left behind by the earlier removals into
# single blanks.
corpus1 <- tm_map(corpus1, stripWhitespace)
# Preview up to the first six documents. Bounding the index by the corpus
# length keeps the loop from failing with "subscript out of bounds" when the
# corpus holds fewer than six documents.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "littl light"
##
## $content
## [1] "batteri power great"
##
## $content
## [1] "cost size"
##
## $content
## [1] "cds palm hand"
##
## $content
## [1] "shuffl mode"
##
## $content
## [1] "batteri life portabl accessori style"
1. Generate dtm (frequency count)
# Keep a separate handle on the cleaned corpus for building the matrices.
doc <- corpus1
# Document-term matrix holding raw term counts.
dtm <- DocumentTermMatrix(doc)
# Dense copy of the matrix: one row per document, one column per term.
m1 <- as.matrix(dtm)
2. Generate dtm (TfIdf)
# Document-term matrix weighted by TF-IDF, with normalisation of the term
# frequencies switched off.
dtm2 <- DocumentTermMatrix(
  doc,
  control = list(
    weighting = function(x) {
      weightTfIdf(x, normalize = FALSE)
    }
  )
)
# Dense copy of the TF-IDF matrix.
m2 <- as.matrix(dtm2)
3. Generate dtm (binary)
# Document-term matrix with binary weights: 1 when the term occurs in the
# document, 0 otherwise.
dtm3 <- DocumentTermMatrix(doc, control = list(weighting = weightBin))
# Dense copy of the binary matrix.
m3 <- as.matrix(dtm3)
Count the frequency of each concept
# Corpus-wide frequency of each term: convert the count-based dtm to a dense
# matrix and sum every column (one column per term).
freq <- colSums(as.matrix(dtm)) # Sum each column
Count the number of distinct concepts (terms)
# Number of distinct terms: freq has one entry per column of the dtm.
length(freq)
## [1] 530
Sort the frequencies in decreasing order
# Order the term frequencies from most to least frequent.
sor <- sort(freq, decreasing = TRUE)
# Show the six most frequent terms.
head(sor, n = 6L)
## music can use small like easi
## 80 71 62 56 51 50
Save the dtm to a CSV file (which can be opened in Excel)
# Dense matrix form of the count-based dtm.
m <- as.matrix(dtm)
# Report its size as (number of documents, number of terms).
dim(m)
# Export the matrix to a CSV file in the working directory.
write.csv(x = m, file = "dtm1.csv")
Generate a data frame of concepts and their frequencies (only frequencies greater than 15)
# Keep only the terms that occur more than 15 times.
subfreq <- freq[which(freq > 15)]
# One row per retained term: its name and its frequency.
dffreq <- data.frame(term = names(subfreq), fre = subfreq)
Plot the frequency
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Horizontal bar chart of term frequencies. Columns are referenced by bare
# name inside aes() so they are looked up in `dffreq`; writing `dffreq$term`
# there bypasses the plot's data argument and puts "dffreq$term" on the axis
# label.
ggplot(dffreq, aes(x = term, y = fre)) +
  geom_bar(stat = "identity") +
  coord_flip()
###word cloud
Generate word cloud
#install.packages("RColorBrewer")
#install.packages("wordcloud")
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
# Word cloud of every term that occurs at least 10 times.
wordcloud(words = names(freq), freq = freq, min.freq = 10)
Add color to the word cloud
# Six colours from RColorBrewer's "Dark2" palette.
dark2 <- brewer.pal(n = 6, name = "Dark2")
# Word cloud capped at the 100 most frequent terms, with a 0.6 proportion of
# words rotated, coloured with the palette above.
wordcloud(
  words = names(freq),
  freq = freq,
  max.words = 100,
  rot.per = 0.6,
  colors = dark2
)