library(NLP)
library(SnowballC)
library(tm)
## Warning: package 'tm' was built under R version 3.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.3
# Fetch the movie-review polarity archive and load the positive and negative
# reviews as two separate tm corpora (`pos` and `neg`).
URL <- "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz" # get a set
archive <- "reviews.tar.gz"
# Download only when the archive is not already present, so that re-running
# the script does not fetch the same file again.
if (!file.exists(archive)) {
  status <- tryCatch(
    # mode = "wb": the archive is binary and must not be written in text mode.
    download.file(URL, destfile = archive, mode = "wb"),
    error = function(e) {
      unlink(archive) # do not leave a partial archive behind
      stop(e)
    }
  )
  # download.file() reports failure through a non-zero return code for some
  # download methods instead of raising an error.
  if (status != 0) {
    unlink(archive)
    stop("Download of ", URL, " failed with status ", status, call. = FALSE)
  }
}
untar(archive) # unpack the archive into the current directory
# NOTE(review): setwd() changes global state for everything that follows in
# this script; it is kept so that any later code relying on the working
# directory keeps working.
setwd("txt_sentoken") # set the working directory
# Load only those files with "cv" in the file name.
SourcePos <- DirSource(file.path(".", "pos"), pattern = "cv")
SourceNeg <- DirSource(file.path(".", "neg"), pattern = "cv")
# Create two corpora: one for positive reviews and the other for negative reviews.
pos <- Corpus(SourcePos)
neg <- Corpus(SourceNeg)
############## POS
# Cleaning pipeline for the positive-review corpus. The steps run in a fixed
# order: stop words are removed after lower-casing, and stemming happens after
# stop-word removal.

# Space1: content transformer that deletes every match of `pattern` from a
# document.
# NOTE(review): matches are replaced by the empty string, not by a space, so
# the text on both sides of a match is joined together - confirm this is
# intended.
Space1 <- content_transformer(function(x, pattern) gsub(pattern, "", x))

# Delete hyphens and colons, then strip the remaining standard punctuation.
for (junk in c("-", ":")) {
  pos <- tm_map(pos, Space1, junk)
}
pos <- tm_map(pos, removePunctuation)
# Extra pass for " -" sequences.
# NOTE(review): punctuation has already been removed at this point, so this
# pass presumably matches nothing; kept to preserve the original pipeline.
pos <- tm_map(pos, Space1, " -")

pos <- tm_map(pos, content_transformer(tolower)) # convert corpus to lower case
pos <- tm_map(pos, removeNumbers) # drop digits
pos <- tm_map(pos, removeWords, stopwords("english")) # drop English stop words
pos <- tm_map(pos, stemDocument) # reduce words to their stems
pos <- tm_map(pos, stripWhitespace) # collapse runs of whitespace

# Inspect the cleaned-up corpus by printing the text of document 10.
writeLines(as.character(pos[[10]]))
## american action film slowli drown death sea asian wirefu copycat pretti death leav like schwartznag stallon van damm wear cement galosh bottom kung fu sea sometim mix result mindblow spectacl unlik qualiti action amaz excit stunt work s matrix can real gem often hollywood get wrong even pay chines director fli ninja float karat master replac soar bronx detect slow motion kick scientist most laughabl hollywood rush emul success matrix trademark asian stunt choreographi becom joke art form iron monkey latest asian import show us get right iron monkey actual reissu film stori th chines vigilant rongguang yu fight uniqu style shaolin kung fu right oppress belli hungri also piec narrat legendari chines film hero wong feihong recent seen one overlook possibl best film drunken master releas u s legend drunken master unlik drunken master star jacki chan adult feihong iron monkey find much younger feihong szeman tsang father wong keiy yen chi dan thrust middl iron monkey fight oppress iron monkey succeed kung fu film sinc drunken master time fight style especi monkey devolv ridicul twinkleto float film like crouch tiger hidden dragon director yuen wo ping eventu rememb bring action scene back earth iron monkey heart hardcor kung fu action film rather kind drama la crouch tiger howev brief moment profound share charact pass outlaw hero goodheart misguid enemi chief fox moment other iron monkey manag transcend mindless kung fu natur touch heart mind audienc way equal masterpiec like drunken master iron monkey danc quit nice invad kung fu tune aka siunin wong feihung tsi titmalau
# Remove corpus-specific filler words. The corpus has already been stemmed,
# so the list uses stemmed forms (e.g. "movi").
pos <- tm_map(pos, removeWords, c("can", "one", "film", "movi", "like", "get", "make", "just", "see", "will"))
# Further cleaning: map selected stems back to readable words.
# Each stem is anchored with \\b so that only whole tokens are rewritten.
# Without the anchors gsub() also rewrites the stem inside longer tokens:
# an existing token "character" would become "characterer".
pos_stem_fixes <- c(
  charact   = "character",
  stori     = "story",
  theoriz   = "theory",
  westworld = "west world",
  trueman   = "true man"
)
for (stem in names(pos_stem_fixes)) {
  pos <- tm_map(
    pos,
    content_transformer(gsub),
    pattern = paste0("\\b", stem, "\\b"),
    replacement = pos_stem_fixes[[stem]]
  )
}
#### POS Frequency
# Term statistics for the cleaned positive corpus. Lines starting with "##"
# are output recorded from an earlier run.

# Build the term matrix in both orientations (documents x terms and
# terms x documents).
dtmpos <- DocumentTermMatrix(pos)
tdmpos <- TermDocumentMatrix(pos)

# Total number of occurrences of each term, summed over all documents.
freqpos <- colSums(as.matrix(dtmpos))
length(freqpos)
## [1] 22421
head(freqpos, 14)
## abberlin abli absinth accent act actual adapt alan
## 2 9 1 49 533 481 99 54
## albert allen almost amount anoth anyon
## 30 93 452 85 565 186

# Indices of the terms, ordered from most to least frequent.
ordpos <- order(freqpos, decreasing = TRUE)

# How many terms occur exactly 1, 2, 3, ... times (first 20 counts).
head(table(freqpos), 20)
## freqpos
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 9104 2998 1635 1108 834 628 550 432 321 313 256 225 201 191 163
## 16 17 18 19 20
## 157 139 115 116 89

# Most frequent terms.
head(freqpos[ordpos])
## character time scene story good play
## 2053 1525 1350 1348 1229 1227

# Least frequent terms.
tail(freqpos[ordpos])
## obstruct overfli powaqqatsi snoot tangerin timbr
## 1 1 1 1 1 1

# Same ranking after dropping sparse terms: only terms whose sparsity is
# below 0.4 (i.e. terms present in most documents) are kept.
dtmspos1 <- removeSparseTerms(dtmpos, 0.4)
freqpos1 <- colSums(as.matrix(dtmspos1))
ordpos1 <- order(freqpos1, decreasing = TRUE)
head(freqpos1[ordpos1])
## character time even also
## 2053 1525 1202 1200
### Pos Plot Word Frequencies
# Bar chart of every term that occurs more than 500 times in the positive
# corpus, with one bar per term.
wfpos <- data.frame(term = names(freqpos), occurrences = freqpos)
# Filter on the data frame's own `occurrences` column instead of the global
# `freqpos` vector, so the condition is always evaluated against the rows
# being subset.
ppos <- ggplot(subset(wfpos, occurrences > 500), aes(term, occurrences)) +
  geom_bar(stat = "identity") +
  # Rotate the term labels so they do not overlap on the x axis.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ppos
