From text to Words

http://handsondatascience.com/TextMiningO.pdf

Get the data

Set the working directory to the folder containing the file “music_survey.csv”.
Read the data in and name it data_music.

# Load the survey responses. The argument name is spelled out in full
# (stringsAsFactors) rather than relying on partial argument matching, so
# text columns are explicitly kept as character strings instead of factors.
data_music <- read.csv("music_survey.csv", stringsAsFactors = FALSE)

View your data

# Open the loaded survey data in R's interactive data viewer.
View(data_music)

Create Text Corpus

Install and load library

#install.packages("NLP")
#install.packages("tm")
library(NLP)
library(tm)

## Warning: package 'tm' was built under R version 3.3.3

Read Question 1 into a new variable called comment1. Print the first 6 comments from comment1.

# Extract the answers to survey question 1 as a plain vector.
comment1 <- data_music[[
  "Q1..What.do.you.like.most.about.this.portable.music.player."
]]
# Show the first six answers.
head(comment1, n = 6L)

## [1] "little, light"                                 
## [2] "The battery power is great."                   
## [3] "cost and size"                                 
## [4] "Having all my CDs in the palm of my hand!"     
## [5] "The shuffle mode."                             
## [6] "Battery life. Portability. Accessories. Style."

Create a corpus that can be processed by the tm package. Print the first 6 comments.

# Wrap the comment vector in a tm source object and build a corpus from it,
# so that the tm transformations below can be applied to every comment.
vector1 <- VectorSource(comment1)
corpus1 <- VCorpus(vector1)

# Preview the content of the first six documents. The count is capped at the
# corpus length so a corpus with fewer than six documents does not fail with
# a subscript-out-of-bounds error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "little, light"
## 
## $content
## [1] "The battery power is great."
## 
## $content
## [1] "cost and size"
## 
## $content
## [1] "Having all my CDs in the palm of my hand!"
## 
## $content
## [1] "The shuffle mode."
## 
## $content
## [1] "Battery life. Portability. Accessories. Style."

Normalize text

Remove punctuation

# Strip punctuation characters from every document in the corpus.
corpus1 <- tm_map(corpus1, removePunctuation)

# Preview the first six documents; the count is capped at the corpus length
# so a shorter corpus does not raise a subscript-out-of-bounds error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "little light"
## 
## $content
## [1] "The battery power is great"
## 
## $content
## [1] "cost and size"
## 
## $content
## [1] "Having all my CDs in the palm of my hand"
## 
## $content
## [1] "The shuffle mode"
## 
## $content
## [1] "Battery life Portability Accessories Style"

Change the letters to lower case

# Lower-case every document. tolower() is a base R function, so it is wrapped
# in content_transformer() before being handed to tm_map().
corpus1 <- tm_map(corpus1, content_transformer(tolower))

# Preview the first six documents; the count is capped at the corpus length
# so a shorter corpus does not raise a subscript-out-of-bounds error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "little light"
## 
## $content
## [1] "the battery power is great"
## 
## $content
## [1] "cost and size"
## 
## $content
## [1] "having all my cds in the palm of my hand"
## 
## $content
## [1] "the shuffle mode"
## 
## $content
## [1] "battery life portability accessories style"

Remove numbers and stopwords

# Drop digits from every document.
corpus1 <- tm_map(corpus1, removeNumbers)

# Stop-word list: tm's English list plus "the" and "and" listed explicitly.
# NOTE(review): punctuation was already stripped in an earlier step, so
# contractions (e.g. "dont") presumably no longer match the apostrophised
# entries of the English list - confirm whether that is intended.
myStopwords <- c(stopwords("english"), "the", "and")
corpus1 <- tm_map(corpus1, removeWords, myStopwords)

# Preview the first six documents; the count is capped at the corpus length
# so a shorter corpus does not raise a subscript-out-of-bounds error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "little light"
## 
## $content
## [1] " battery power  great"
## 
## $content
## [1] "cost  size"
## 
## $content
## [1] "   cds   palm   hand"
## 
## $content
## [1] " shuffle mode"
## 
## $content
## [1] "battery life portability accessories style"

Install and load library

#install.packages("SnowballC")
library(SnowballC)

Word stemming

# Reduce every word to its stem (e.g. "battery" becomes "batteri").
corpus1 <- tm_map(corpus1, stemDocument)

# Preview the first six documents; the count is capped at the corpus length
# so a shorter corpus does not raise a subscript-out-of-bounds error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "littl light"
## 
## $content
## [1] "batteri power great"
## 
## $content
## [1] "cost size"
## 
## $content
## [1] "cds palm hand"
## 
## $content
## [1] "shuffl mode"
## 
## $content
## [1] "batteri life portabl accessori style"

Remove extra whitespace

# Collapse the extra whitespace left behind by the earlier removal steps.
corpus1 <- tm_map(corpus1, stripWhitespace)

# Preview the first six documents; the count is capped at the corpus length
# so a shorter corpus does not raise a subscript-out-of-bounds error.
for (i in seq_len(min(6L, length(corpus1)))) {
  print(corpus1[[i]][1])
}

## $content
## [1] "littl light"
## 
## $content
## [1] "batteri power great"
## 
## $content
## [1] "cost size"
## 
## $content
## [1] "cds palm hand"
## 
## $content
## [1] "shuffl mode"
## 
## $content
## [1] "batteri life portabl accessori style"

Generate dtm

1. Generate the document-term matrix (dtm) with frequency counts

# Build the document-term matrix from the cleaned corpus using the default
# weighting, then convert it to an ordinary matrix.
doc <- corpus1
dtm <- DocumentTermMatrix(x = doc)
m1 <- as.matrix(dtm)

2. Generate the dtm with TF-IDF weights

# Document-term matrix weighted by TF-IDF, with weightTfIdf()'s `normalize`
# option switched off, then converted to an ordinary matrix.
dtm2 <- DocumentTermMatrix(
  doc,
  control = list(
    weighting = function(x) {
      weightTfIdf(x, normalize = FALSE)
    }
  )
)
m2 <- as.matrix(dtm2)

3. Generate the dtm with binary (present/absent) weights

# Document-term matrix using tm's binary weighting (weightBin), then
# converted to an ordinary matrix.
dtm3 <- DocumentTermMatrix(
  doc,
  control = list(weighting = weightBin)
)
m3 <- as.matrix(dtm3)

Count the frequency of each concept

# Sum each column of the count matrix: one total per term across documents.
freq <- colSums(x = as.matrix(dtm))

Count the number of distinct concepts

# Number of entries in freq, i.e. how many distinct terms were counted.
length(x = freq)

## [1] 530

Frequency

Sort the frequencies in decreasing order

# Rank the terms from most to least frequent and show the top six.
sor <- sort(x = freq, decreasing = TRUE)
head(x = sor, n = 6L)

## music   can   use small  like  easi 
##    80    71    62    56    51    50

Save the dtm to a CSV file (which can be opened in Excel)

# Convert the count dtm to an ordinary matrix and report its dimensions
# (rows are documents, columns are terms).
m <- as.matrix(x = dtm)
dim(x = m)

## [1] 405 530

# Write the matrix to "dtm1.csv" in the current working directory.
write.csv(x = m, file = "dtm1.csv")

Generate a data frame of concepts and their frequencies (only frequencies greater than 15)

# Keep only the terms that occur more than 15 times; which() skips any NA
# comparisons, matching what subset() does.
subfreq <- freq[which(freq > 15)]
# Two-column data frame: the term and its frequency.
dffreq <- data.frame(
  term = names(subfreq),
  fre = subfreq
)

Plot the frequency

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.3.3

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

# Horizontal bar chart of term frequencies. Columns are referenced by bare
# name inside aes() so ggplot2 looks them up in `dffreq`; using `dffreq$term`
# there is discouraged by ggplot2 and leaks "dffreq$..." into the axis titles.
ggplot(dffreq, aes(x = term, y = fre)) +
  geom_bar(stat = "identity") +
  coord_flip()

### Word cloud
Generate word cloud

#install.packages("RColorBrewer")
#install.packages("wordcloud")

library(RColorBrewer)  
library(wordcloud)

## Warning: package 'wordcloud' was built under R version 3.3.3

# Draw a word cloud of the terms, passing 10 as the minimum frequency.
wordcloud(words = names(freq), freq = freq, min.freq = 10)

Add color to the word cloud

# Six colours from the ColorBrewer "Dark2" palette.
dark2 <- brewer.pal(n = 6, name = "Dark2")
# Coloured word cloud with max.words = 100 and rot.per = 0.6.
wordcloud(
  words = names(freq),
  freq = freq,
  max.words = 100,
  rot.per = 0.6,
  colors = dark2
)