library(NLP)
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(tm)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# ---- Data acquisition ----
# Download the review-polarity archive, unpack it, and build one corpus per
# sentiment class from the files whose names match "cv" in pos/ and neg/.
URL <- "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
# Use ONE archive path for both the download and the extraction. Previously the
# file was saved as "reviews.tar.gz" in the current working directory but read
# back from "~/reviews.tar.gz", which only worked when the script was started
# from the home directory.
archive_path <- path.expand("~/reviews.tar.gz")
# mode = "wb": the archive is binary and must not be written in text mode.
download.file(URL, destfile = archive_path, mode = "wb")
# Extract into the home directory explicitly so that the setwd() call below
# finds "~/txt_sentoken" regardless of where the script was launched.
untar(archive_path, exdir = path.expand("~"))
setwd("~/txt_sentoken")
SourcePos <- DirSource(file.path(".", "pos"), pattern = "cv")
SourceNeg <- DirSource(file.path(".", "neg"), pattern = "cv")
pos <- Corpus(SourcePos)
neg <- Corpus(SourceNeg)
# ---- Positive reviews: text cleaning ----
# Space1 deletes every match of `pattern` from a document's text. Matches are
# replaced by the empty string (not by a space), so the text on either side of
# a match is joined together.
Space1 <- content_transformer(function(x, pattern) gsub(pattern, "", x))
pos <- tm_map(pos, Space1, "-")
pos <- tm_map(pos, Space1, ":")
# Strip the remaining punctuation; ucp = FALSE means Unicode character
# properties are not used to decide what counts as punctuation.
pos <- tm_map(pos, removePunctuation, ucp = FALSE)
# NOTE(review): every "-" was already deleted above, so " -" cannot match here.
pos <- tm_map(pos, Space1, " -")
# Lower-case, then drop English stop words, digits and surplus whitespace.
pos <- tm_map(pos, content_transformer(tolower))
pos <- tm_map(pos, removeWords, stopwords("english"))
pos <- tm_map(pos, removeNumbers)
pos <- tm_map(pos, stripWhitespace)
pos <- tm_map(pos, stemDocument)
# Rewrite a few stems into readable words, applied in this order.
# NOTE(review): the patterns are unanchored, so gsub also rewrites longer
# tokens that merely contain one of these stems.
stem_repairs <- c(
  theoriz = "theory",
  movi = "movie",
  charact = "character",
  stori = "story"
)
for (stem in names(stem_repairs)) {
  pos <- tm_map(
    pos,
    content_transformer(gsub),
    pattern = stem,
    replacement = stem_repairs[[stem]]
  )
}
# ---- Negative reviews: text cleaning ----
# Space1 (defined earlier in this script) deletes every match of the pattern.
neg <- tm_map(neg, Space1, "-")
neg <- tm_map(neg, Space1, ":")
# Strip the remaining punctuation; ucp = FALSE means Unicode character
# properties are not used to decide what counts as punctuation.
neg <- tm_map(neg, removePunctuation, ucp = FALSE)
# NOTE(review): every "-" was already deleted above, so "- " cannot match here.
neg <- tm_map(neg, Space1, "- ")
# Lower-case, then drop digits, English stop words and surplus whitespace.
neg <- tm_map(neg, content_transformer(tolower))
neg <- tm_map(neg, removeNumbers)
neg <- tm_map(neg, removeWords, stopwords("english"))
neg <- tm_map(neg, stripWhitespace)
neg <- tm_map(neg, stemDocument)
# Rewrite a few stems into readable words, applied in this order.
# NOTE(review): the patterns are unanchored, so gsub also rewrites longer
# tokens that merely contain one of these stems.
stem_repairs <- c(
  theoriz = "theory",
  movi = "movie",
  charact = "character",
  stori = "story"
)
for (stem in names(stem_repairs)) {
  neg <- tm_map(
    neg,
    content_transformer(gsub),
    pattern = stem,
    replacement = stem_repairs[[stem]]
  )
}
# ---- Positive reviews: document-term matrix and term frequencies ----
dtmpos <- DocumentTermMatrix(pos)
dtmpos
## <<DocumentTermMatrix (documents: 1000, terms: 22435)>>
## Non-/sparse entries: 262314/22172686
## Sparsity : 99%
## Maximal term length: 61
## Weighting : term frequency (tf)
# Total count of each term over all documents (column sums of the matrix).
freqpos <- colSums(as.matrix(dtmpos))
# Term positions ordered from the highest to the lowest total count.
ordpos <- order(freqpos, decreasing = TRUE)
# Start of the ordering: the most frequent terms.
head(freqpos[ordpos])
## film movie one character like make
## 6145 3126 3044 2053 1948 1650
# End of the ordering: the least frequent terms.
tail(freqpos[ordpos])
## powaqqatsi snoot tangerin timbr trueman westworld
## 1 1 1 1 1 1
# ---- Negative reviews: document-term matrix and term frequencies ----
dtmneg <- DocumentTermMatrix(neg)
dtmneg
## <<DocumentTermMatrix (documents: 1000, terms: 21259)>>
## Non-/sparse entries: 241021/21017979
## Sparsity : 99%
## Maximal term length: 55
## Weighting : term frequency (tf)
# Total count of each term over all documents (column sums of the matrix).
freqneg <- colSums(as.matrix(dtmneg))
# Term positions ordered from the highest to the lowest total count.
ordneg <- order(freqneg, decreasing = TRUE)
# Start of the ordering: the most frequent terms.
head(freqneg[ordneg])
## film movie one like character get
## 4964 3731 2715 2050 1802 1684
# End of the ordering: the least frequent terms.
tail(freqneg[ordneg])
## lorn pelvic pointand rift stonili
## 1 1 1 1 1
## uncreditedcan
## 1
# ---- Positive reviews: terms of 4 to 20 characters ----
# Short terms such as "can" and "one" carry little information, so build a
# second matrix that keeps only terms whose length lies between 4 and 20.
dtmrpos4 <- DocumentTermMatrix(pos, control = list(wordLengths = c(4, 20)))
dtmrpos4
## <<DocumentTermMatrix (documents: 1000, terms: 21535)>>
## Non-/sparse entries: 239245/21295755
## Sparsity : 99%
## Maximal term length: 20
## Weighting : term frequency (tf)
# Total count of each retained term, and the ordering from most to least
# frequent.
freqrpos4 <- colSums(as.matrix(dtmrpos4))
ordrpos4 <- order(freqrpos4, decreasing = TRUE)
head(freqrpos4[ordrpos4])
## film movie character like make time
## 6145 3126 2053 1948 1650 1525
tail(freqrpos4[ordrpos4])
## powaqqatsi snoot tangerin timbr trueman westworld
## 1 1 1 1 1 1
# ---- Negative reviews: terms of 4 to 20 characters ----
# Short terms such as "can" and "one" carry little information, so build a
# second matrix, dtmrneg4, that keeps only terms whose length lies between
# 4 and 20.
dtmrneg4 <- DocumentTermMatrix(neg, control = list(wordLengths = c(4, 20)))
dtmrneg4
## <<DocumentTermMatrix (documents: 1000, terms: 20388)>>
## Non-/sparse entries: 218576/20169424
## Sparsity : 99%
## Maximal term length: 20
## Weighting : term frequency (tf)
# Total count of each retained term, and the ordering from most to least
# frequent.
freqrneg4 <- colSums(as.matrix(dtmrneg4))
ordrneg4 <- order(freqrneg4, decreasing = TRUE)
head(freqrneg4[ordrneg4])
## film movie like character just make
## 4964 3731 2050 1802 1561 1502
tail(freqrneg4[ordrneg4])
## lorn pelvic pointand rift stonili
## 1 1 1 1 1
## uncreditedcan
## 1
# ---- Negative reviews: terms of 5 to 20 characters ----
# Same as the 4-20 matrix, but with the minimum term length raised to 5.
dtmrneg5 <- DocumentTermMatrix(neg, control = list(wordLengths = c(5, 20)))
dtmrneg5
## <<DocumentTermMatrix (documents: 1000, terms: 18008)>>
## Non-/sparse entries: 154443/17853557
## Sparsity : 99%
## Maximal term length: 20
## Weighting : term frequency (tf)
# Total count of each retained term, and the ordering from most to least
# frequent.
freqrneg5 <- colSums(as.matrix(dtmrneg5))
ordrneg5 <- order(freqrneg5, decreasing = TRUE)
head(freqrneg5[ordrneg5])
## movie character scene story thing first
## 3731 1802 1288 975 848 809
tail(freqrneg5[ordrneg5])
## jumpstreet koren pelvic pointand stonili
## 1 1 1 1 1
## uncreditedcan
## 1
# ---- Positive reviews: terms of 5 to 20 characters ----
# Same as the 4-20 matrix, but with the minimum term length raised to 5.
dtmrpos5 <- DocumentTermMatrix(pos, control = list(wordLengths = c(5, 20)))
dtmrpos5
## <<DocumentTermMatrix (documents: 1000, terms: 19094)>>
## Non-/sparse entries: 172669/18921331
## Sparsity : 99%
## Maximal term length: 20
## Weighting : term frequency (tf)
# Total count of each retained term, and the ordering from most to least
# frequent.
freqrpos5 <- colSums(as.matrix(dtmrpos5))
ordrpos5 <- order(freqrpos5, decreasing = TRUE)
head(freqrpos5[ordrpos5])
## movie character scene story first perform
## 3126 2053 1350 1348 970 941
tail(freqrpos5[ordrpos5])
## powaqqatsi snoot tangerin timbr trueman westworld
## 1 1 1 1 1 1
### Frequency bar charts (terms occurring more than 900 times)
# One row per term with its total count, for each sentiment class.
wfpos <- data.frame(term = names(freqpos), occurrences = freqpos)
wfneg <- data.frame(term = names(freqneg), occurrences = freqneg)

# Positive reviews: bar per term, x-axis labels rotated 45 degrees.
# `occurrences` holds the same values as freqpos, so filtering on the column
# selects the same rows.
ppos <- ggplot(subset(wfpos, occurrences > 900), aes(term, occurrences)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ppos

# Negative reviews: same chart built from the negative-corpus counts.
pneg <- ggplot(subset(wfneg, occurrences > 900), aes(term, occurrences)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
pneg

# Word clouds of terms that occur at least 800 times.
# Seed the random number generator before drawing the clouds.
set.seed(42)

# All terms: positive, then negative reviews.
wordcloud(names(freqpos), freqpos, min.freq = 800)

wordcloud(names(freqneg), freqneg, min.freq = 800)

### Terms of 4 to 20 characters
# The words and their counts must come from the SAME frequency vector.
# Previously the names of the length-filtered vectors were paired with the
# counts of the unfiltered ones (freqpos / freqneg); those vectors have
# different lengths, so words were matched with the wrong frequencies.
wordcloud(names(freqrpos4), freqrpos4, min.freq = 800)

wordcloud(names(freqrneg4), freqrneg4, min.freq = 800)

### Terms of 5 to 20 characters
wordcloud(names(freqrpos5), freqrpos5, min.freq = 800)

wordcloud(names(freqrneg5), freqrneg5, min.freq = 800)
