rm(list=ls())
library("rvest")
## Loading required package: xml2
library(text2vec)
library(data.table)
library(stringr)
library(tm)
## Loading required package: NLP
library(RWeka)
library(tokenizers)
##
## Attaching package: 'tokenizers'
## The following object is masked from 'package:tm':
##
## stopwords
library(slam)
library(wordcloud)
## Loading required package: RColorBrewer
library(NLP)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
## The following object is masked from 'package:data.table':
##
## shift
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:stringr':
##
## %>%
## The following object is masked from 'package:text2vec':
##
## %>%
## The following object is masked from 'package:rvest':
##
## %>%
## The following object is masked from 'package:base':
##
## Filter
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:qdapRegex':
##
## %+%
## The following object is masked from 'package:NLP':
##
## annotate
Kungfu Hustle Movie Reviews
#####################################
# Kungfu Hustle Movie Reviews
# Download ten pages each of positive (filter=love) and negative (filter=hate)
# reviews, collecting the review paragraphs and the rating attached to each.
counts <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90)  # values for the `start=` query parameter
# Paragraphs inside the review container that are page furniture, not reviews.
skip_text <- c("*** This review may contain spoilers ***", "Add another review")
# One slot per page; filled in the loop and flattened afterwards so the result
# vectors are not re-copied on every iteration.
ratings_by_page <- vector("list", length(counts))
reviews_by_page <- vector("list", length(counts))
for (k in seq_along(counts)) {
  j <- counts[k]
  url1 <- paste0("http://www.imdb.com/title/tt0373074/reviews?filter=love;filter=love;start=", j) # positive reviews
  url2 <- paste0("http://www.imdb.com/title/tt0373074/reviews?filter=hate;filter=hate;start=", j) # negative reviews
  page1 <- read_html(url1)
  page2 <- read_html(url2)
  reviews1 <- html_text(html_nodes(page1, "#tn15content p"))
  reviews2 <- html_text(html_nodes(page2, "#tn15content p"))
  # Keep only the leading run of digits of the rating image's alt text.
  # NOTE(review): assumes the alt text looks like "9/10" or "10/10" -- confirm.
  # Taking a fixed number of characters instead turns "9/10" into "9/", which
  # is not a number.
  movie.nodes <- html_nodes(page1, "h2 + img")
  rating1 <- sub("^\\s*([0-9]+).*$", "\\1", html_attr(movie.nodes, name = "alt"))
  movie.nodes <- html_nodes(page2, "h2 + img")
  rating2 <- sub("^\\s*([0-9]+).*$", "\\1", html_attr(movie.nodes, name = "alt"))
  # setdiff() also drops duplicated paragraphs within a page.
  reviews.positive <- setdiff(reviews1, skip_text)
  reviews.negative <- setdiff(reviews2, skip_text)
  ratings_by_page[[k]] <- c(rating1, rating2)
  reviews_by_page[[k]] <- c(reviews.positive, reviews.negative)
}
# Same ordering as appending page by page: positive then negative, per page.
ratings.df <- unlist(ratings_by_page)
reviews <- unlist(reviews_by_page)
Create a text file that contains all 200 reviews: 100 positive and 100 negative.
# Flatten embedded line breaks so each review occupies exactly one line,
# then write one review per line to the working directory.
reviews <- gsub("\n", " ", reviews, fixed = TRUE)
writeLines(text = reviews, con = "Kungfu_Hustle_Reviews.txt")
Create a function to clean the text
# Normalise raw text for tokenisation.
#
# Args:
#   x: character vector of raw text; may contain HTML tags.
# Returns:
#   A character vector of the same length as `x` in which HTML tags,
#   non-ASCII bytes, punctuation and digits have been removed, letters are
#   lower-cased, runs of whitespace are collapsed to one space, and leading
#   and trailing whitespace is trimmed.
#
# Only base R is used, so the function does not depend on another package
# having been attached successfully.
text.clean <- function(x) {
  x <- gsub("<.*?>", " ", x)                   # HTML tags -> space
  x <- iconv(x, "latin1", "ASCII", sub = "")   # drop anything without an ASCII form
  x <- gsub("[^[:alnum:]]", " ", x)            # keep only letters and digits
  x <- tolower(x)
  x <- gsub("[[:digit:]]+", "", x)             # remove numbers
  x <- gsub("[[:space:]]+", " ", x)            # collapse runs of whitespace
  x <- gsub("^\\s+|\\s+$", "", x)              # trim both ends
  x
}
#--------------------------------------------------------#
# Step 1 - Reading text data #
#--------------------------------------------------------#
# Read the reviews back, one review per line. The file is addressed relative
# to the working directory -- the same location 'Kungfu_Hustle_Reviews.txt'
# is written to earlier in this script -- instead of through a
# machine-specific absolute path.
temp.text <- readLines("Kungfu_Hustle_Reviews.txt")
head(temp.text, 1)
## [1] " Goodness me, what a fantastic movie. Caught the world premiere at the Toronto International Film Festival and the entire theater laughed until they cried. Amazingly directed, HILARIOUSLY funny, it blends a 1930s gangster stylishness into a Hong Kong kung fu movie to astonishing results. Who would've thought you could top Shaolin Soccer? Not me, until I saw this movie. Stephen Chow pulled it off. Chow's comedic timing gets better and better with every movie he makes, and while his films are depending more and more on CGI these days, and makes this movie much more a fantasy kung fu film than a traditional one, it hardly detracts from the enjoyable experience. Make it your mission to see this film - it will be one of the most entertaining you ever see. I can't remember the last film I enjoyed myself in more. My eyes still hurt from wiping away tears of laughter. Seriously. "
# One row per review: a running id plus the raw text.
# seq_along() gives a zero-length id for an empty file, where 1:length()
# would produce c(1, 0) and make data.frame() fail on mismatched lengths.
data <- data.frame(
  id = seq_along(temp.text),
  text = temp.text,
  stringsAsFactors = FALSE
)
dim(data)
## [1] 174 2
Check the dimensions of data
dim(data)
## [1] 174 2
Add all the unnecessary words to the stop list. Because this movie's plot is about kung fu and is set in Hong Kong, I created the stop list below.
# Stop words come from three places:
#   1. a stop-word file on disk,
#   2. tm's English list (called as tm::stopwords because the tokenizers
#      package exports a function of the same name),
#   3. terms specific to this movie.
stpw1 <- readLines("G:\\Software_Installation_Paths\\RStudio\\R_Projects\\Working_Directory_All_Projects\\stopwords.txt")
stpw2 <- tm::stopwords("english")
stpw3 <- c(
  "kung_fu", "hustle", "plot", "film", "movie", "hong kong", "kung fu",
  "chow", "stephen chow", "cast", "films", "movies", "story"
)
# Union of the three lists.
comn <- unique(c(stpw1, stpw2, stpw3))
# Final stop-word list: apostrophes become spaces, mirroring what
# text.clean() does to the corpus.
stopwords <- unique(gsub("'", " ", comn))
# Clean the corpus, drop the stop words, then squeeze the gaps they leave.
x <- stripWhitespace(removeWords(text.clean(data$text), stopwords))
# x = stemDocument(x)
Create DTM using text2vec package
#--------------------------------------------------------#
## Step 2: Create DTM using text2vec package #
#--------------------------------------------------------#
t1 <- Sys.time()  # start of the timed section; reported after the DTM is built
tok_fun <- word_tokenizer  # word tokenizer rather than the space tokenizer
# Iterator over the cleaned documents (no preprocessor: x is already clean).
it_0 <- itoken(x, tokenizer = tok_fun, ids = data$id, progressbar = FALSE)
# Collect bigrams only (ngram_min = ngram_max = 2) and their statistics.
# Stop words were already removed from x, so none are passed here.
vocab <- create_vocabulary(it_0, ngram = c(2L, 2L))
# Keep bigrams that occur at least 10 times in the corpus.
pruned_vocab <- prune_vocabulary(vocab, term_count_min = 10)
length(pruned_vocab)
str(pruned_vocab)
## [1] 5
## List of 5
## $ vocab :Classes 'data.table' and 'data.frame': 22 obs. of 3 variables:
## ..$ terms : chr [1:22] "looney_tunes" "gong_fu" "sty_alley" "long_time" ...
## ..$ terms_counts: int [1:22] 13 12 12 13 22 61 16 19 16 32 ...
## ..$ doc_counts : int [1:22] 12 6 10 12 19 40 13 16 12 30 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
## $ ngram : Named int [1:2] 2 2
## ..- attr(*, "names")= chr [1:2] "ngram_min" "ngram_max"
## $ document_count: int 174
## $ stopwords : chr(0)
## $ sep_ngram : chr "_"
## - attr(*, "class")= chr "text2vec_vocabulary"
# Vectorizer over the pruned bigram vocabulary, then the bigram DTM.
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm_0 <- create_dtm(it_0, vectorizer)
# Total frequency of every bigram across documents, most frequent first.
bigram_sums <- rollup(dtm_0, 1, na.rm = TRUE, FUN = sum)
tsum <- as.matrix(t(bigram_sums))
tsum <- tsum[order(tsum, decreasing = TRUE), ]
head(tsum)
## axe_gang shaolin_soccer martial_arts special_effects
## 76 67 61 32
## pig_sty kill_bill
## 25 23
tail(tsum)
## sty_alley wo_ping bad_guys yuen_wo flying_daggers
## 12 12 12 11 10
## action_scenes
## 10
#-------------------------------------------------------
# Code bi-grams as unigrams in the clean text corpus
# Pad every document with a space on each side so that a bigram at the very
# start or end of a document is still surrounded by spaces.
text2 <- paste("", x, "")
# Walk the frequent bigrams in the order of `tsum` and join each occurrence
# with "_" so the tokenizer later treats it as one token.
for (term in names(tsum)) {
  focal.term <- gsub("_", " ", term, fixed = TRUE)  # "axe_gang" -> "axe gang"
  replacement.term <- term
  # The delimiting spaces are matched with lookarounds instead of being part
  # of the match. Consuming them makes the second of two back-to-back
  # occurrences ("axe gang axe gang") unmatchable, because the space they
  # share has already been used up by the first match.
  text2 <- gsub(
    paste0("(?<= )", focal.term, "(?= )"),
    replacement.term,
    text2,
    perl = TRUE
  )
}
# Iterator over the corpus in which frequent bigrams are now single tokens.
it_m <- itoken(text2, tokenizer = tok_fun, ids = data$id, progressbar = FALSE)
# Default vocabulary settings here: no ngram range and no stop-word list.
vocab <- create_vocabulary(it_m)
# term_count_min = 1 keeps every term that occurs at all.
pruned_vocab <- prune_vocabulary(vocab, term_count_min = 1)
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm_m <- create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1] 174 4391
# Convert to a DocumentTermMatrix with term-frequency weighting.
dtm <- as.DocumentTermMatrix(dtm_m, weighting = weightTf)
# TRUE for every document that still contains at least one term.
a0 <- apply(dtm, 1, sum) > 0
dtm <- dtm[a0, ]  # drop empty documents
print(difftime(Sys.time(), t1, units = "sec"))
## Time difference of 1.064927 secs
# Reorder columns so the most frequent terms come first, then peek at a corner.
term_order <- order(apply(dtm, 2, sum), decreasing = TRUE)
dtm <- dtm[, term_order]
inspect(dtm[1:5, 1:5])
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 9/16
## Sparsity : 64%
## Maximal term length: 6
## Weighting : term frequency (tf)
##
## Terms
## Docs good comedy action funny great
## 1 0 0 0 1 0
## 2 1 0 0 0 0
## 3 3 2 3 1 0
## 4 0 1 0 2 0
## 5 1 0 0 0 0
#--------------------------------------------------------#
## Step 2a: # Build word cloud #
#--------------------------------------------------------#
# 1- Using Term frequency(tf)
# Total frequency of every term. slam (attached at the top of this script)
# sums the columns of the sparse DTM directly, so the matrix never has to be
# cut into dense slices. Cutting it into round(ncol / 100)-wide slices breaks
# for small vocabularies: the width rounds to 0 when there are fewer than 50
# terms, and 99 slices can overshoot the last column when the width rounds up.
ss.col <- slam::col_sums(dtm)
tsum <- ss.col
tsum <- tsum[order(tsum, decreasing = TRUE)]  # terms in decreasing order of freq
head(tsum)
## good comedy action funny great axe_gang
## 146 114 101 92 84 74
tail(tsum)
## buddhism er maturing calling spoofed adequate
## 1 1 1 1 1 1
windows()  # open a new plot window
# Word cloud of the 100 most frequent terms; size encodes term frequency.
wordcloud(
  words = names(tsum),
  freq = tsum,
  scale = c(4, 0.5),   # largest and smallest word size
  min.freq = 1,        # every term qualifies
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
title(sub = "Term Frequency - Wordcloud")  # caption under the cloud
# plot barchart for top tokens
# head() returns at most 15 terms and never pads with NA when fewer exist.
top_terms <- head(tsum, 15)
# Explicit `term` and `freq` columns give the aesthetics real variables to
# map. Mapping the data frame object itself is what triggered ggplot2's
# "Don't know how to automatically pick scale for object of type data.frame"
# message.
test <- data.frame(
  term = names(top_terms),
  freq = round(as.numeric(top_terms), 0),
  stringsAsFactors = FALSE
)
windows() # New plot window
ggplot(test, aes(x = term, y = freq)) +
  geom_bar(stat = "identity", fill = "Blue") +
  geom_text(aes(label = freq), vjust = -0.20) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# -------------------------------------------------------------- #
# step 2b - Using Term frequency inverse document frequency (tfidf)
# -------------------------------------------------------------- #
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:qdap':
##
## %&%
## Loading required package: gamlr
## Loading required package: parallel
# Re-weight the DTM by TF-IDF (no document-length normalisation).
dtm.tfidf <- tfidf(dtm, normalize = FALSE)
# Column totals computed on the sparse matrix in one call. Slicing the matrix
# into round(ncol / 100)-wide dense pieces breaks for small vocabularies: the
# width rounds to 0 below 50 terms, and 99 slices can overshoot the last
# column when the width rounds up.
ss.col <- Matrix::colSums(dtm.tfidf)
tsum <- ss.col
tsum <- tsum[order(tsum, decreasing = TRUE)]  # terms in decreasing order of score
head(tsum)
## sing axe_gang good action great chinese
## 135.81256 110.66653 104.59499 101.01739 95.23110 94.43638
tail(tsum)
## buddhism er maturing calling spoofed adequate
## 4.465908 4.465908 4.465908 4.465908 4.465908 4.465908
windows()
# Word cloud of the 100 terms with the largest summed TF-IDF score.
wordcloud(
  words = names(tsum),
  freq = tsum,
  scale = c(4, 0.5),
  min.freq = 1,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")
# Top 20 terms and their summed TF-IDF scores, shown as a one-column matrix.
as.matrix(tsum[1:20])
## [,1]
## sing 135.81256
## axe_gang 110.66653
## good 104.59499
## action 101.01739
## great 95.23110
## chinese 94.43638
## funny 92.01584
## people 91.15090
## comedy 90.01525
## watch 88.23004
## martial_arts 88.17448
## shaolin_soccer 86.55314
## scenes 82.47346
## time 80.49296
## matrix 78.56303
## bad 78.55458
## made 78.36734
## fun 77.20128
## lot 77.20128
## humor 76.61061
(dtm.tfidf)[1:10, 1:10] # view first 10x10 cells in the DTM under TF IDF.
## 10 x 10 sparse Matrix of class "dgCMatrix"
## [[ suppressing 10 column names 'good', 'comedy', 'action' ... ]]
##
## 1 . . . 1.000172 . . .
## 2 0.716404 . . . . . 1.133704
## 3 2.149212 1.5792149 3.000517 1.000172 . 4.486481 .
## 4 . 0.7896074 . 2.000344 . . .
## 5 0.716404 . . . . . 1.133704
## 6 3.582020 . . . . 1.495494 2.267407
## 7 . . 2.000344 . 2.267407 . 2.267407
## 8 0.716404 . . . . . .
## 9 1.432808 . 1.000172 . . . .
## 10 . 0.7896074 . . . . .
##
## 1 . 1.352393 .
## 2 . . .
## 3 3.880359 1.352393 1.247032
## 4 . . .
## 5 . . .
## 6 . 1.352393 1.247032
## 7 . . .
## 8 9.700897 . .
## 9 . . .
## 10 . 1.352393 .
# plot barchart for top tokens
# head() returns at most 15 terms and never pads with NA when fewer exist.
top_terms <- head(tsum, 15)
# Explicit `term` and `freq` columns give the aesthetics real variables to
# map. Mapping the data frame object itself is what triggered ggplot2's
# "Don't know how to automatically pick scale for object of type data.frame"
# message.
test <- data.frame(
  term = names(top_terms),
  freq = round(as.numeric(top_terms), 0),
  stringsAsFactors = FALSE
)
windows() # New plot window
ggplot(test, aes(x = term, y = freq)) +
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = freq), vjust = -0.20) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
#------------------------------------------------------#
# step 2c - Term Co-occurance Matrix (TCM) #
#------------------------------------------------------#
# Vectorizer configured for co-occurrence counting with a 5-token window.
vectorizer <- vocab_vectorizer(
  pruned_vocab,
  grow_dtm = FALSE,
  skip_grams_window = 5L
)
tcm <- create_tcm(it_m, vectorizer)  # term co-occurrence matrix
tcm.mat <- as.matrix(tcm)            # dense copy; tcm.mat[1:5, 1:5] to view
# Adding the transpose makes the matrix symmetric, as an adjacency matrix of
# an undirected graph must be.
adj.mat <- tcm.mat + t(tcm.mat)
# Order terms by total co-occurrence, largest first.
z <- order(colSums(adj.mat), decreasing = TRUE)
adj.mat <- adj.mat[z, z]
# Input for the simple term co-occurrence graph: the first 30 terms.
adj <- adj.mat[1:30, 1:30]
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:qdap':
##
## %>%, diversity
## The following object is masked from 'package:stringr':
##
## %>%
## The following objects are masked from 'package:text2vec':
##
## %>%, normalize
## The following object is masked from 'package:rvest':
##
## %>%
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# Undirected graph from the 30-term adjacency matrix; simplify() is applied
# with its default arguments.
cog <- simplify(graph.adjacency(adj, mode = "undirected"))
# Remove vertices that are left without any edge.
cog <- delete.vertices(cog, V(cog)[degree(cog) == 0])
windows()
plot(cog)
#-----------------------------------------------------------#
# Step 2d - a cleaned up or 'distilled' COG PLot #
#-----------------------------------------------------------#
# Plot a thinned-out ("distilled") co-occurrence graph.
#
# Terms are ordered by decreasing column sum. For each of the first `s` rows
# only the `k1` largest entries are kept (as 0/1 flags); the columns a row
# keeps are cleared in all later rows. The graph is built from the terms kept
# this way and drawn with the Kamada-Kawai layout.
#
# Args:
#   mat1:  square numeric matrix with column names (TCM / adjacency matrix).
#   title: main title of the plot.
#   s:     number of leading rows to process; also the number of leading
#          vertices coloured green (the rest are pink).
#   k1:    maximum number of connections kept per processed row.
# Returns:
#   The value of plot(); called for its side effect of drawing the graph.
distill.cog <- function(mat1, # input TCM ADJ MAT
                        title, # title for the graph
                        s, # no. of central nodes
                        k1) { # max no. of connections
  library(igraph)
  if (is.null(colnames(mat1))) {
    stop("`mat1` must have column names", call. = FALSE)
  }
  a <- colSums(mat1) # column totals
  b <- order(-a) # indices in decreasing order of total
  mat2 <- mat1[b, b] # order both rows and columns along b
  diag(mat2) <- 0
  # Clamp so that neither the row loop nor the k1-th largest lookup can run
  # past the matrix.
  s <- min(s, nrow(mat2))
  k1 <- min(k1, ncol(mat2))
  ## +++ go row by row and find top k adjacencies +++ ##
  picked <- vector("list", s)
  for (i1 in seq_len(s)) {
    row_vals <- mat2[i1, ]
    thresh1 <- row_vals[order(-row_vals)[k1]] # k1-th largest value in the row
    mat2[i1, mat2[i1, ] < thresh1] <- 0
    mat2[i1, mat2[i1, ] > 0] <- 1
    # Look the names up through colnames(): subsetting the row itself drops
    # the name when exactly one entry is positive.
    word <- colnames(mat2)[which(mat2[i1, ] > 0)]
    # Clear the chosen columns in the rows below; skipped on the last row,
    # where (i1 + 1):nrow(mat2) would count backwards.
    if (i1 < nrow(mat2)) {
      mat2[(i1 + 1):nrow(mat2), match(word, colnames(mat2))] <- 0
    }
    picked[[i1]] <- word
  } # i1 loop ends
  wc <- unlist(picked)
  keep <- match(wc, colnames(mat2))
  mat3 <- mat2[keep, keep, drop = FALSE]
  # Kept terms, restored to the column order of mat2.
  ord <- colnames(mat2)[colnames(mat2) %in% colnames(mat3)]
  pos <- match(ord, colnames(mat3))
  mat4 <- mat3[pos, pos, drop = FALSE]
  graph <- graph.adjacency(mat4, mode = "undirected", weighted = TRUE) # Create Network object
  graph <- simplify(graph)
  # Build the complete colour vector first and assign it in one step, so the
  # attribute always has exactly one value per vertex, even when the graph
  # has `s` or fewer vertices.
  n_vertices <- vcount(graph)
  if (n_vertices > 0) {
    vertex_colors <- rep("pink", n_vertices)
    vertex_colors[seq_len(min(s, n_vertices))] <- "green"
    V(graph)$color <- vertex_colors
  }
  graph <- delete.vertices(graph, V(graph)[degree(graph) == 0])
  plot(graph,
    layout = layout.kamada.kawai,
    main = title
  )
} # func ends
windows()
# Distilled graph from the term co-occurrence matrix: 10 leading terms, at
# most 5 connections each.
# NOTE(review): tcm.mat is passed here rather than the symmetrised adj.mat
# derived from it -- confirm this is intended.
distill.cog(mat1 = tcm.mat, title = "Distilled COG", s = 10, k1 = 5)
## adj.mat and distilled cog for tfidf DTMs ##
# Term-by-term matrix from the TF-IDF weighted DTM; self-pairs are zeroed.
adj.mat <- t(dtm.tfidf) %*% dtm.tfidf
diag(adj.mat) <- 0
# Keep the 50 terms with the largest column totals.
a1 <- order(apply(adj.mat, 2, sum), decreasing = TRUE)
top_terms_idx <- a1[1:50]
adj.mat <- as.matrix(adj.mat[top_terms_idx, top_terms_idx])
windows()
distill.cog(mat1 = adj.mat, title = "Distilled COG", s = 10, k1 = 10)
#--------------------------------------------------------#
# Step 3 correlation between polarity and rating #
#--------------------------------------------------------#
# stringsAsFactors = FALSE keeps the ratings as text. If they were turned into
# a factor, as.numeric() would return the factor's level codes rather than the
# ratings themselves.
reviews_df <- data.frame(reviews, ratings.df, stringsAsFactors = FALSE)
polarity <- counts(polarity(reviews_df$reviews))[, "polarity"]
reviews_df$polarity <- polarity
# reviews_df --> first column contains the review, second the rating and
# third the polarity of the review
# Parse only the leading digits of each rating so that a value such as "9/"
# is read as 9 instead of becoming NA.
rating_num <- as.numeric(
  sub("^\\s*([0-9]+).*$", "\\1", as.character(reviews_df$ratings.df))
)
# Reviews without a usable rating or polarity score are left out of the
# correlation instead of turning the whole result into NA.
cor(rating_num, polarity, use = "pairwise.complete.obs")
## [1] -0.09161505
# NOTE(review): the value above was recorded before the ratings were parsed
# as numbers; re-run to refresh it.
#--------------------------------------------------------#
# Sentiment Analysis #
#--------------------------------------------------------#
library(qdap)
x1 <- x[a0]       # cleaned corpus without the empty documents
t1 <- Sys.time()  # start timer
pol <- polarity(x1)  # polarity scored against qdap's dictionary
# Per-document results, picked out of pol$all by column name.
wc <- pol$all[, "wc"]         # word count of each document
val <- pol$all[, "polarity"]  # polarity score of each document
p <- pol$all[, "pos.words"]   # positive words found
n <- pol$all[, "neg.words"]   # negative words found
dim(pol)
## NULL
Sys.time() - t1  # elapsed time of the scoring above
## Time difference of 10.9854 secs
head(pol$all)
## all wc polarity
## 1 all 49 0.5714286
## 2 all 41 0.1561738
## 3 all 209 -1.4249318
## 4 all 37 0.3287980
## 5 all 90 1.0540926
## 6 all 211 0.3029085
## pos.words
## 1 goodness, fantastic, amazingly, astonishing, top, enjoyable, entertaining, enjoyed
## 2 good, hero, sincerely, recommend
## 3 imaginative, masters, jaw dropping, famed, good, good, masters, good, works, sophisticated, authentic
## 4 classic, hilarious, proud, success
## 5 promise, enthusiastic, promise, promise, sweeping, good, integrated, smoothly, stellar, perfect, believable, excellent, work, steady
## 6 good, award, winning, humour, good, good, attraction, successful, gratifying, fun, hallmark, skilled, skill, masters, engrossing, good, fun, good
## neg.words
## 1 funny, gangster, detracts, hurt
## 2 weak, dirty, regret
## 3 wildly, petty, ruthless, gangster, extort, notorious, relentless, explosive, outrageous, kill, blow, rascal, glibly, feeble, stuck, poverty, unknown, notorious, cruelest, pig, extortion, clash, skinny, wimpy, evil, torture, funny, suffering, enemy, poisonous
## 4 funny, funny
## 5 strike, warning, evil, jaded
## 6 stumbled, abyss, weak, sarcasm, petty, lecher, beggar, naive, madness, mad, crazy, knife
## text.var
## 1 goodness fantastic caught world premiere toronto international festival entire theater laughed cried amazingly directed hilariously funny blends gangster stylishness astonishing results ve thought top shaolin soccer pulled comedic timing makes depending cgi days makes fantasy traditional detracts enjoyable experience make mission entertaining remember enjoyed eyes hurt wiping tears laughter
## 2 general puts tribute argue stephen lots jokes routines makes bit weak script make dirty jokes usual learn philosophy hours good laugh time watch carefully understands opinion thing crazies common crouching tiger hidden dragon hero express sincerely recommend slightest hesitation watch regret
## 3 wildly imaginative action packed petty thief sing aspires ruthless gangster stumbling gang controlled apartment extort money locals masters disguise sing actions attracts notorious axe gang set relentless chain events brings clans explosive battle showdowns dance sequences featuring tuxedoed mobsters martial arts action outrageous jaw dropping fight sequences yeun wo ping famed action choreographer kill bill vol matrix blow set guangdong province china plays ah xing street rascal fools constantly good talking glibly core feeble minded stuck poverty unknown future good xing beginning discover xing goal member notorious axe gang fiercest cruelest widespread gang city day xing slum pig cage town usual extortion witnesses real clash gangs axe gang clearing local gang shown allegiance axe xing realizes slum residents including plump landlady skinny wimpy husband turned masters fight large group gangs xing realizes battle good evil called choose side continues unique comedy style twisting slapstick jokes reinterpreting composed face intensifies torture sequences creates funny points suffering shows characters practising faced middle aged woman slum quickly kicked groin stabbed flying daggers supposedly aimed enemy bitten face poisonous snakes trials manages survive day creativity demonstrated higher quality comedy early works fight back school king beggars cinematography sophisticated authentic aura shaolin soccer heavily adopts computer graphics present special effects snake daggers flying axes
## 4 yesterday made history simple storyline funny comedy similar harry potter case wizard animations hayao miyazaki world imagination created characters funny classic appeared context result hilarious make people proud hope box office success countries buy dvd personal collection
## 5 decades avoiding copy pressed hands dear friend extracted promise time night sit watch enthusiastic tossed drive athlon promise promise minutes nuclear strike warning town torn screen watched dozen times cheesiness cinema rare occasions writer director directly tap nerve weave directly piece simply standard romance sweeping portrait good versus evil moral lesson power comedic bits integrated flows smoothly tone fight choreography stellar special care make character piece human perfect people places makes wire fighting appears parts believable acting excellent camera work rock steady feeling jaded burdened person make difference settle copy
## 6 good things latest part unlike previous award winning action comedy shaolin soccer continued martial arts theme expanded massively repertoire sfx fan beginning times relied deadpan expression toilet humour spoofs books good ending heart warming small town village boy makes good lines elements noticeably absent kfh ng man tat missing customary female attraction main female character screen time ss back excepting ng goalkeeper roaring time leader axe gang kfh talk sfx creatively world part stumbled cinematic abyss caught hollywood directors dabble black arts sfx thankfully answer true kfh unlike ss weak characters characteristic recent western yawns van helsing blade trinity stepped formula made ss sfx extravaganza successful contrast kfh bundle action sequences thrown showcase sfx wit cheekiness remains gratifying fun sarcasm hallmark acting career translated produced directed kfh spoofing clichs chinese martial arts stories retired martial arts couple petty scrooge lecher heroes save neighbourhood gay end dispatched con beggar sells wu ling mi ji naive boy spoofs sfx spectacles drive madness upward highly skilled sitar players strum blades ghouls battle pugilist toad skill croaks blows lower mouth ascends skies masters buddha palm eagle distance engrossing mass battle black clad axe gangsters lifted matrix mad irreverent times affecting yearn good sfx fun crazy surely funniest bit knife throwing sequence good mind bring year
head(pol$group)
## all total.sentences total.words ave.polarity sd.polarity
## 1 all 174 14895 0.3302639 0.7289411
## stan.mean.polarity
## 1 0.4530735
# Distinct positive / negative words across all documents. "-" entries are
# dropped.
pos_all <- unlist(p)
neg_all <- unlist(n)
positive_words <- unique(pos_all[!(pos_all %in% "-")])  # Positive words list
negative_words <- unique(neg_all[!(neg_all %in% "-")])  # Negative words list
print(positive_words)  # every positive word found in the corpus
## [1] "goodness" "fantastic" "amazingly"
## [4] "astonishing" "top" "enjoyable"
## [7] "entertaining" "enjoyed" "good"
## [10] "hero" "sincerely" "recommend"
## [13] "imaginative" "masters" "jaw dropping"
## [16] "famed" "works" "sophisticated"
## [19] "authentic" "classic" "hilarious"
## [22] "proud" "success" "promise"
## [25] "enthusiastic" "sweeping" "integrated"
## [28] "smoothly" "stellar" "perfect"
## [31] "believable" "excellent" "work"
## [34] "steady" "award" "winning"
## [37] "humour" "attraction" "successful"
## [40] "gratifying" "fun" "hallmark"
## [43] "skilled" "skill" "engrossing"
## [46] "worth" "important" "spectacular"
## [49] "great" "succeeds" "talented"
## [52] "rightful" "worked" "incredibly"
## [55] "likable" "amazing" "powerful"
## [58] "legendary" "fans" "wonderful"
## [61] "fairly" "applaud" "smile"
## [64] "pure" "entertain" "humor"
## [67] "impressive" "phenomenal" "joy"
## [70] "decent" "neatly" "master"
## [73] "enjoy" "cool" "neat"
## [76] "amusing" "suitable" "deserving"
## [79] "thriving" "assure" "interesting"
## [82] "wonders" "gain" "acclaim"
## [85] "free" "positive" "fast"
## [88] "ready" "redeeming" "favor"
## [91] "pride" "survival" "incredible"
## [94] "inventive" "genuine" "properly"
## [97] "superior" "optimistic" "beautiful"
## [100] "proper" "greatest" "marvelous"
## [103] "coolest" "sweet" "love"
## [106] "kindness" "honest" "meaningful"
## [109] "reasonable" "recommended" "prefer"
## [112] "righteous" "worthwhile" "genius"
## [115] "humorous" "adorable" "super"
## [118] "awesome" "creative" "stylized"
## [121] "innovative" "modern" "homage"
## [124] "beauty" "satisfy" "memorable"
## [127] "strong" "masterpiece" "daring"
## [130] "fresh" "improved" "nicely"
## [133] "brilliant" "talent" "sharp"
## [136] "cute" "prowess" "qualify"
## [139] "acclaimed" "solid" "promising"
## [142] "unreal" "adore" "fame"
## [145] "wowed" "pretty" "fine"
## [148] "correct" "enjoys" "enjoyment"
## [151] "inspiration" "happier" "impressed"
## [154] "wonderfully" "pleasant" "loved"
## [157] "veritable" "loving" "effortlessly"
## [160] "gem" "magnificent" "amazed"
## [163] "awe" "honesty" "perfectly"
## [166] "fair" "proves" "fortunately"
## [169] "refreshing" "miracle" "happily"
## [172] "realistic" "smiling" "benefit"
## [175] "superb" "winner" "supporting"
## [178] "awards" "merit" "clever"
## [181] "nice" "appeal" "aspire"
## [184] "progress" "likes" "masterful"
## [187] "intelligible" "fast paced" "woo"
## [190] "top notch" "breathtaking" "coherent"
## [193] "preferring" "appealing" "dazzling"
## [196] "satisfied" "witty" "happiness"
## [199] "defeat" "favorite" "extraordinary"
## [202] "beautifully" "accomplished" "excited"
## [205] "blockbuster" "satisfying" "shine"
## [208] "easy" "enhanced" "idol"
## [211] "brilliantly" "delight" "peaceful"
## [214] "colorful" "romantic" "significant"
## [217] "admire" "exciting" "respect"
## [220] "outdo" "originality" "notably"
## [223] "stronger" "effective" "mighty"
## [226] "heaven" "win" "support"
## [229] "clear" "achievement" "intelligent"
## [232] "delightful" "achievements" "praise"
## [235] "silent" "impress" "magical"
## [238] "visionary" "attractive" "lush"
## [241] "favour" "promised" "stylish"
## [244] "comfortable" "brave" "greatness"
## [247] "lead" "ample" "golden"
## [250] "outstanding" "inspiring" "vibrant"
## [253] "charm" "endearing" "talents"
## [256] "cherished" "mature" "cherish"
## [259] "slick" "perfection" "renowned"
## [262] "famous" "modesty" "righteousness"
## [265] "entertains" "tough" "terrific"
## [268] "thrill" "pleasure" "surreal"
## [271] "dedicated" "dazzle" "rich"
## [274] "spiritual" "enlightenment" "thankful"
## [277] "remarkably" "playful" "magnificence"
## [280] "clean" "reliable" "seasoned"
## [283] "inestimable" "leads" "promises"
## [286] "exuberant" "prize" "stunning"
## [289] "led" "sensational" "amuse"
## [292] "gentle" "champion" "soulful"
## [295] "poignant" "spirited" "worthy"
## [298] "flawless" "lover" "masterpieces"
## [301] "thrilling" "humble" "keen"
## [304] "finest" "unaffected" "razor sharp"
## [307] "passionate" "majestic" "amaze"
## [310] "whoa" "abound" "lucky"
## [313] "cheer" "vivid" "magic"
## [316] "gorgeous" "mind blowing" "fascinating"
## [319] "handsome" "succeeded" "wisely"
## [322] "instantly" "destiny" "protect"
## [325] "sensitive" "enjoying" "supreme"
## [328] "galore" "redemption" "engaging"
## [331] "defeated" "protection" "reputation"
## [334] "easier" "happy" "successfully"
## [337] "mastery" "unselfish" "wow"
## [340] "grand" "sincere" "faith"
## [343] "secure" "exceeded" "shiny"
## [346] "beloved" "loyalty" "regard"
## [349] "abundant" "adequate" "guarantee"
## [352] "prominent" "thrive" "lovable"
## [355] "accessible" "pleased" "variety"
## [358] "glamorous" "excellently" "wins"
## [361] "ease" "entice" "delectable"
## [364] "thoughtful" "delightfully" "loves"
## [367] "hottest" "hot" "earnest"
## [370] "revel" "seamless" "fantastically"
## [373] "superbly" "praising" "stunningly"
## [376] "fascinate" "popular" "bliss"
## [379] "sweetness" "fabulous" "feat"
## [382] "straightforward" "polished" "flashy"
## [385] "smart" "conveniently" "recommendation"
## [388] "exceptional" "cleverly" "splendidly"
## [391] "passion" "glad" "wisdom"
## [394] "friendly" "appreciated" "enthusiasm"
## [397] "glory" "holy" "skillfully"
## [400] "heroine" "extraordinarily" "heroic"
## [403] "fertile" "exceptionally" "brilliance"
## [406] "fancy" "effectively"
print(negative_words) # Print all neg words
## [1] "funny" "gangster" "detracts"
## [4] "hurt" "weak" "dirty"
## [7] "regret" "wildly" "petty"
## [10] "ruthless" "extort" "notorious"
## [13] "relentless" "explosive" "outrageous"
## [16] "kill" "blow" "rascal"
## [19] "glibly" "feeble" "stuck"
## [22] "poverty" "unknown" "cruelest"
## [25] "pig" "extortion" "clash"
## [28] "skinny" "wimpy" "evil"
## [31] "torture" "suffering" "enemy"
## [34] "poisonous" "strike" "warning"
## [37] "jaded" "stumbled" "abyss"
## [40] "sarcasm" "lecher" "beggar"
## [43] "naive" "madness" "mad"
## [46] "crazy" "knife" "disgustingly"
## [49] "overpriced" "overblown" "fail"
## [52] "disappointed" "fell" "tired"
## [55] "bum" "deaf" "weep"
## [58] "fat" "lazy" "frail"
## [61] "domineering" "confusing" "comical"
## [64] "stunt" "waste" "slap"
## [67] "unwatchable" "struggle" "warped"
## [70] "dreadful" "worst" "shock"
## [73] "drag" "stupid" "unfortunate"
## [76] "burn" "misfortune" "suck"
## [79] "bad" "awful" "ripped"
## [82] "ax" "lame" "poorly"
## [85] "wasted" "sucks" "terrible"
## [88] "painful" "contradict" "annoying"
## [91] "poor" "lost" "lose"
## [94] "unbelievably" "lack" "lacks"
## [97] "stinks" "bewildering" "retarded"
## [100] "ridiculous" "stupidity" "drunk"
## [103] "hard" "worse" "hate"
## [106] "cheats" "miscellaneous" "uneasy"
## [109] "broken" "conflict" "break"
## [112] "falls" "dreadfully" "joke"
## [115] "inept" "failing" "sadly"
## [118] "lacking" "incoherent" "absurdity"
## [121] "blame" "morons" "utterly"
## [124] "dumping" "smash" "gross"
## [127] "parody" "strange" "doubt"
## [130] "unpredictable" "violently" "hell"
## [133] "disturbing" "absurd" "death"
## [136] "wild" "bloody" "regretted"
## [139] "miss" "killer" "devil"
## [142] "blah" "struggling" "wrong"
## [145] "darkness" "dilemma" "badly"
## [148] "cry" "suffers" "bull"
## [151] "intense" "sick" "hated"
## [154] "goofy" "sty" "fear"
## [157] "long time" "subversive" "distracting"
## [160] "bother" "randomly" "scratch"
## [163] "disorder" "messing" "horrid"
## [166] "horrible" "twist" "wreck"
## [169] "mess" "stupidest" "overrated"
## [172] "hung" "convoluted" "disaster"
## [175] "excuse" "dumped" "killing"
## [178] "refuse" "stumble" "trick"
## [181] "diabolic" "haphazard" "silly"
## [184] "cruel" "sin" "dumb"
## [187] "warned" "addict" "stereotypical"
## [190] "problem" "childish" "scary"
## [193] "superficial" "embarrassment" "mockery"
## [196] "undercut" "repetitive" "messy"
## [199] "struck" "revenge" "underdog"
## [202] "pointless" "superfluous" "cold"
## [205] "unable" "disbelief" "disappointment"
## [208] "fault" "tragedy" "bizarre"
## [211] "bored" "blunt" "irritating"
## [214] "loud" "faults" "slow"
## [217] "shortness" "mistaken" "complaining"
## [220] "denying" "uneven" "hapless"
## [223] "embroiled" "notoriously" "violent"
## [226] "ramshackle" "dastardly" "attacks"
## [229] "prison" "broke" "showdown"
## [232] "malevolent" "fever" "cartoonish"
## [235] "disliked" "insane" "issues"
## [238] "critical" "squabble" "ridiculously"
## [241] "impossible" "complex" "critics"
## [244] "sloppy" "unbelievable" "onslaught"
## [247] "rife" "inferior" "ruin"
## [250] "fuss" "racist" "nonsense"
## [253] "difficult" "frustrated" "offensive"
## [256] "upsetting" "nervous" "excessive"
## [259] "abuse" "bashing" "drags"
## [262] "sad" "contrived" "boring"
## [265] "clueless" "lacked" "fall"
## [268] "knock" "die" "dangerous"
## [271] "gruesome" "shallow" "alienate"
## [274] "suspect" "dragged" "twists"
## [277] "stolen" "killed" "mistakes"
## [280] "fret" "disappointments" "weird"
## [283] "trouble" "chaos" "lure"
## [286] "blind" "deadly" "infamous"
## [289] "bitter" "spoil" "stab"
## [292] "tainted" "stooges" "fails"
## [295] "miserably" "disgust" "ignore"
## [298] "marginally" "crappy" "miserable"
## [301] "failure" "monotonous" "criticism"
## [304] "dying" "brash" "impoverished"
## [307] "mistake" "confusion" "stale"
## [310] "failed" "disappoint" "concerned"
## [313] "moody" "shortcomings" "unfamiliar"
## [316] "harsh" "betrayal" "hardship"
## [319] "villains" "melancholy" "sucked"
## [322] "tiring" "unsuccessful" "dull"
## [325] "touchy" "susceptible" "rebuff"
## [328] "laughable" "wicked" "flaws"
## [331] "cheesy" "pale" "fist"
## [334] "fury" "ironically" "disgusting"
## [337] "attack" "cloud" "overwhelmed"
## [340] "crime" "unrealistic" "confused"
## [343] "anemic" "strictly" "pratfall"
## [346] "criminal" "garbage" "splitting"
## [349] "bugs" "dark" "fried"
## [352] "hysterical" "diametrically" "perplexed"
## [355] "cheap" "powerless" "vicious"
## [358] "overweight" "panicking" "lie"
## [361] "assassinate" "confuse" "annoy"
## [364] "stagnant" "heinous" "swindle"
## [367] "obnoxious" "assassin" "noisy"
## [370] "struggles" "overwhelming" "complaints"
## [373] "strangely" "brutal" "beware"
## [376] "gasp" "die hard" "problems"
## [379] "irony" "unexpected" "desperate"
## [382] "blackmail" "dead" "stealing"
## [385] "suffer" "dust" "freeze"
## [388] "expensive" "shocked" "offend"
## [391] "frightening" "dump" "annoys"
## [394] "wary" "shame" "split"
## [397] "blurred" "hefty" "contortions"
## [400] "ridicules" "fierce" "overwhelmingly"
## [403] "limited" "beg" "steal"
## [406] "ranting" "useless" "object"
## [409] "oppression" "brutality" "negative"
## [412] "destroy" "unnecessary" "incongruous"
## [415] "disappointing" "simplistic" "complicated"
## [418] "weaker" "refuses" "unappealing"
## [421] "selfish" "afraid" "doldrums"
## [424] "dislike" "rift" "stricken"
## [427] "threaten" "rogue" "loses"
## [430] "harm" "wounds" "suspiciously"
## [433] "exaggerate" "punch" "thug"
## [436] "lackluster" "damn" "disproportionate"
## [439] "tiresome" "denied" "vice"
## [442] "hardships" "insult" "pretentious"
## [445] "crap" "lethal" "tyranny"
## [448] "fleeing" "sly" "bullies"
## [451] "disregard" "defiance" "seriousness"
## [454] "bothered" "cringed" "bashed"
## [457] "guilty" "discomfort" "fatal"
## [460] "flaw" "manic" "confrontation"
## [463] "inability" "tension" "overdone"
## [466] "misses" "deter" "detestable"
## [469] "ironic" "awkward" "falling"
## [472] "imaginary" "worries" "hang"
## [475] "unclear" "anxious" "infernal"
## [478] "satirical" "critic" "murder"
## [481] "loose" "worn" "innuendo"
## [484] "scarce" "troubled" "drunken"
## [487] "startling" "cruelty" "infuriatingly"
## [490] "erratic" "smack" "assault"
## [493] "smoldering" "rage" "offender"
## [496] "villainous" "pan" "idiotic"
## [499] "mocking" "objections" "pain"
## [502] "heck" "obscure" "stumbles"
## [505] "extravagant" "imposing" "exploit"
## [508] "fatty" "abrupt" "risky"
## [511] "missed" "inane" "misread"
## [514] "phony" "limit" "overwhelm"
## [517] "stereotype" "needless" "odd"
## [520] "worried" "quarrels" "flare"
## [523] "joker" "crash" "massacre"
## [526] "shrivel" "fake" "dissuade"
## [529] "pessimistic" "wasting" "idiots"
## [532] "raving" "scum"
#--------------------------------------------------------#
# Create Positive Words wordcloud #
#--------------------------------------------------------#
# Keep only the columns of dtm whose term is listed in positive_words.
pos.tdm = dtm[,which(colnames(dtm) %in% positive_words)]
m = as.matrix(pos.tdm)
# Total count of each retained term across all rows, largest first.
v = sort(colSums(m), decreasing = TRUE)
windows() # opens new image window
# min.freq used to be passed as a bare positional `1`; naming it keeps the
# call independent of wordcloud()'s argument order.
wordcloud(names(v), v, scale = c(4, 1), min.freq = 1, max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Positive Words - Wordcloud")
# plot barchart for top tokens
# Build explicit word/freq columns: mapping the data frame itself inside
# aes() is what produced the "Don't know how to automatically pick scale for
# object of type data.frame" warnings. head() (rather than v[1:15]) avoids
# NA rows when fewer than 15 terms matched.
test = head(data.frame(word = as.character(names(v)),
                       freq = as.numeric(v),
                       stringsAsFactors = FALSE), 15)
windows() # opens new image window
ggplot(test, aes(x = word, y = freq)) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = freq), vjust = -0.20) +
  labs(x = "Word", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
#--------------------------------------------------------#
# Create Negative Words wordcloud #
#--------------------------------------------------------#
# Keep only the columns of dtm whose term is listed in negative_words.
neg.tdm = dtm[,which(colnames(dtm) %in% negative_words) ]
m = as.matrix(neg.tdm)
# Total count of each retained term across all rows, largest first.
v = sort(colSums(m), decreasing = TRUE)
windows()
# min.freq used to be passed as a bare positional `1`; naming it keeps the
# call independent of wordcloud()'s argument order.
wordcloud(names(v), v, scale = c(4, 1), min.freq = 1, max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Negative Words - Wordcloud")
# plot barchart for top tokens
# Build explicit word/freq columns: mapping the data frame itself inside
# aes() is what produced the "Don't know how to automatically pick scale for
# object of type data.frame" warnings. head() (rather than v[1:15]) avoids
# NA rows when fewer than 15 terms matched.
test = head(data.frame(word = as.character(names(v)),
                       freq = as.numeric(v),
                       stringsAsFactors = FALSE), 15)
windows()
ggplot(test, aes(x = word, y = freq)) +
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = freq), vjust = -0.20) +
  labs(x = "Word", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
#--------------------------------------------------------#
# Positive words vs Negative Words plot #
#--------------------------------------------------------#
# Count the words recorded for one document.
#
# x: a vector or list of words, or the single string "-"
#    (presumably the "no words matched" placeholder -- confirm against the
#    code that builds `p` and `n`).
# Returns 0 for the lone "-" placeholder, otherwise the number of elements
# in x after flattening with unlist().
len = function(x){
  # Check the length first: `&&` needs a length-one condition, so comparing a
  # multi-word vector with "-" before testing its length errors on R >= 4.3,
  # and an empty vector never yielded a usable condition at all.
  if (length(x) == 1 && isTRUE(x == "-")) {
    return(0)
  }
  length(unlist(x))
}
# Number of positive / negative words per document; vapply() guarantees one
# numeric value per element instead of relying on unlist() to flatten.
pcount = vapply(p, len, numeric(1))
ncount = vapply(n, len, numeric(1))
# seq_along() yields an empty index for empty input, whereas
# seq(1:length(wc)) would yield 1:2.
doc_id = seq_along(wc)
windows()
# Positive counts in green, negative counts overlaid in red.
plot(doc_id,pcount,type="l",col="green",xlab = "Document ID", ylab= "Word Count")
lines(doc_id,ncount,type= "l", col="red")
title(main = "Positive words vs Negative Words" )
legend("topright", inset=.05, c("Positive Words","Negative Words"), fill=c("green","red"), horiz=TRUE)
# Document Sentiment Running plot
windows()
plot(pol$all$polarity, type = "l", ylab = "Polarity Score",xlab = "Document Number")
abline(h=0) # reference line at polarity 0
title(main = "Polarity Plot" )
### COG for sentiment-laden words ? ###
# Place the positive-term and negative-term columns side by side.
senti.dtm = cbind(pos.tdm, neg.tdm)
dim(senti.dtm)
## [1] 174 932
# Term-by-term cross-product t(X) %*% X of the combined matrix, used below
# as the adjacency matrix.
senti.adj.mat = crossprod(as.matrix(senti.dtm))
# Zero the diagonal so no term is linked to itself.
diag(senti.adj.mat) = 0
windows()
distill.cog(
  senti.adj.mat,                  # adjacency matrix object
  'Distilled COG of senti words', # plot title
  5,                              # max number of central nodes
  5                               # max number of connections
)
I would recommend proceeding with a sequel, as the overall sentiment is positive.
Also, the positive words that speak FOR this film are very strong, and they occur far more often than the negative words that go against it.
The martial arts are very good and humour plays a strong role in this movie; both are highlighted in most of the positive reviews.
On the other hand, there are also people who talked about the movie’s narration and its plot similarity with Shaolin Soccer and other Chinese movies.
Sing Lady and the Landlord are the characters people talked about a lot in this movie.
There are very beautiful elements that can be carried forward into the sequel, such as love, narration, comedy and the story woven together with the martial arts.
The two factors considered (Ratings & Generated Polarity Score) are slightly negatively correlated.
TOP 3 Movie Recommendations:
* Comedy usage all through the movie
* Use of characters in the narration (especially the Singing Lady / Lord)
* Action / Chinese martial arts