http://handsondatascience.com/TextMiningO.pdf
Set the working directory to the folder that contains the file “music_survey.csv”.
Read the data in and name it data_music.
# Load the survey responses, keeping free-text answers as character vectors.
# The argument is spelled out in full (stringsAsFactors) instead of relying
# on R's partial argument matching.
data_music <- read.csv("music_survey.csv", stringsAsFactors = FALSE)
View your data
# Open the loaded data frame in the interactive data viewer for inspection.
View(data_music)
Install and load library
#install.packages("NLP")
#install.packages("tm")
library(NLP)
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
Read the answers stored in column Q2 (“What do you like least about this portable music player”) into a new variable called comment1. Print the first 6 comments from comment1.
# Pull the free-text answers out of the survey data frame by exact column name.
comment1 <- data_music[["Q2..What.do.you.like.least.about.this.portable.music.player."]]
# Show the first six answers.
head(comment1, n = 6L)
## [1] "expensive"
## [2] "The screen is hard to see when outside."
## [3] "difficult software"
## [4] "Nothing, I love it!"
## [5] "Battery life seems shorter than advertised."
## [6] "Ubiquitousness; everyone has one."
Create a corpus that can be processed by the tm package. Print the first 6 comments.
# Wrap the raw comments in a tm source and build a corpus from it.
vector1 <- VectorSource(comment1)
corpus1 <- VCorpus(vector1)
# Preview at most the first 6 documents. seq_len() keeps the loop safe when
# the corpus holds fewer than 6 comments (1:6 would index out of bounds).
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "expensive"
##
## $content
## [1] "The screen is hard to see when outside."
##
## $content
## [1] "difficult software"
##
## $content
## [1] "Nothing, I love it!"
##
## $content
## [1] "Battery life seems shorter than advertised."
##
## $content
## [1] "Ubiquitousness; everyone has one."
Remove punctuation
# Strip punctuation characters from every document in the corpus.
corpus1 <- tm_map(corpus1, removePunctuation)
# Preview at most the first 6 documents; guarded so a corpus with fewer
# than 6 documents does not trigger an out-of-bounds subscript.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "expensive"
##
## $content
## [1] "The screen is hard to see when outside"
##
## $content
## [1] "difficult software"
##
## $content
## [1] "Nothing I love it"
##
## $content
## [1] "Battery life seems shorter than advertised"
##
## $content
## [1] "Ubiquitousness everyone has one"
Convert all letters to lower case
# Lower-case every document. tolower() is a plain character function, so it
# is wrapped in content_transformer() to be applied to document content.
corpus1 <- tm_map(corpus1, content_transformer(tolower))
# Preview at most the first 6 documents; guarded so a corpus with fewer
# than 6 documents does not trigger an out-of-bounds subscript.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "expensive"
##
## $content
## [1] "the screen is hard to see when outside"
##
## $content
## [1] "difficult software"
##
## $content
## [1] "nothing i love it"
##
## $content
## [1] "battery life seems shorter than advertised"
##
## $content
## [1] "ubiquitousness everyone has one"
Remove numbers and stopwords
# Drop digits from every document.
corpus1 <- tm_map(corpus1, removeNumbers)
# English stop-word list plus custom additions; extend the trailing entries
# with any domain-specific words that should be ignored.
# NOTE(review): punctuation was stripped in an earlier step, so contractions
# (e.g. "dont") may no longer match apostrophised stop-word entries - confirm
# this is acceptable for the analysis.
myStopwords <- c(stopwords("english"), "the", "and")
corpus1 <- tm_map(corpus1, removeWords, myStopwords)
# Preview at most the first 6 documents; guarded so a corpus with fewer
# than 6 documents does not trigger an out-of-bounds subscript.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "expensive"
##
## $content
## [1] " screen hard see outside"
##
## $content
## [1] "difficult software"
##
## $content
## [1] "nothing love "
##
## $content
## [1] "battery life seems shorter advertised"
##
## $content
## [1] "ubiquitousness everyone one"
Install and load library
#install.packages("SnowballC")
library(SnowballC)
Word stemming
# Reduce each word to its stem (e.g. "expensive" becomes "expens").
corpus1 <- tm_map(corpus1, stemDocument)
# Preview at most the first 6 documents; guarded so a corpus with fewer
# than 6 documents does not trigger an out-of-bounds subscript.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "expens"
##
## $content
## [1] "screen hard see outsid"
##
## $content
## [1] "difficult softwar"
##
## $content
## [1] "noth love"
##
## $content
## [1] "batteri life seem shorter advertis"
##
## $content
## [1] "ubiquit everyon one"
Remove White space
# Collapse the extra whitespace left behind by the earlier removal steps.
corpus1 <- tm_map(corpus1, stripWhitespace)
# Preview at most the first 6 documents; guarded so a corpus with fewer
# than 6 documents does not trigger an out-of-bounds subscript.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}
## $content
## [1] "expens"
##
## $content
## [1] "screen hard see outsid"
##
## $content
## [1] "difficult softwar"
##
## $content
## [1] "noth love"
##
## $content
## [1] "batteri life seem shorter advertis"
##
## $content
## [1] "ubiquit everyon one"
1. Generate the document-term matrix (dtm) with raw frequency counts
# Build the raw-count document-term matrix from the cleaned corpus and keep
# a dense matrix copy of it.
doc <- corpus1
dtm <- DocumentTermMatrix(doc)
m1 <- as.matrix(dtm)
2. Generate the document-term matrix with TF-IDF weighting
# Document-term matrix weighted by TF-IDF, with term-frequency normalisation
# switched off (normalize = FALSE), plus a dense matrix copy.
dtm2 <- DocumentTermMatrix(
  doc,
  control = list(
    weighting = function(term_doc) weightTfIdf(term_doc, normalize = FALSE)
  )
)
m2 <- as.matrix(dtm2)
3. Generate the document-term matrix with binary (presence/absence) weighting
# Document-term matrix built with the binary weighting function (weightBin),
# plus a dense matrix copy.
dtm3 <- DocumentTermMatrix(
  doc,
  control = list(weighting = weightBin)
)
m3 <- as.matrix(dtm3)
Count the frequency of each concept
# Total occurrences of each term across all documents (column sums of the
# raw-count matrix).
freq <- colSums(as.matrix(dtm))
Count the number of distinct concepts (terms)
# Number of entries in the frequency vector, i.e. one per term column.
length(freq)
## [1] 488
Sort the frequencies in decreasing order
# Order terms from most to least frequent and show the top six.
sor <- sort(freq, decreasing = TRUE)
head(sor, n = 6L)
## batteri life noth expens music song
## 75 44 36 26 22 20
Save the dtm to a CSV file (which can be opened in Excel)
# Dense copy of the raw-count matrix; dim() reports its rows and columns.
m <- as.matrix(dtm)
dim(m)
# Export the matrix as CSV in the working directory.
# NOTE(review): the file is named "dtm2.csv" although it holds the raw-count
# matrix `dtm`, not the TF-IDF matrix `dtm2` - confirm the intended name.
write.csv(m, file = "dtm2.csv")
Generate a data frame of concepts and their frequencies (only concepts with a frequency greater than 15)
# Keep only the terms that occur more than 15 times.
subfreq <- freq[freq > 15 & !is.na(freq > 15)]
# Two-column table for plotting: the term and its frequency.
dffreq <- data.frame(
  term = names(subfreq),
  fre = subfreq
)
Plot the frequency
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Horizontal bar chart of term frequencies.
# Columns are referenced by bare name inside aes(): writing `dffreq$term`
# there bypasses the `data` argument, is flagged by ggplot2, and produces
# axis labels such as "dffreq$term".
ggplot(dffreq, aes(x = term, y = fre)) +
  geom_bar(stat = "identity") +
  coord_flip()
###word cloud
Generate word cloud
#install.packages("RColorBrewer")
#install.packages("wordcloud")
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
# Word cloud of all terms that occur at least 10 times.
wordcloud(
  words = names(freq),
  freq = freq,
  min.freq = 10
)
Add color to the word cloud
# Six-colour "Dark2" palette from RColorBrewer.
dark2 <- brewer.pal(n = 6, name = "Dark2")
# Coloured word cloud limited to 100 words, with rot.per set to 0.6.
wordcloud(
  words = names(freq),
  freq = freq,
  max.words = 100,
  rot.per = 0.6,
  colors = dark2
)