R Markdown

By Bharat Kumar Bathula - 71620014

Include libraries

rm(list=ls())
library("rvest")
## Loading required package: xml2
library(text2vec)
library(data.table)
library(stringr)
library(tm)
## Loading required package: NLP
library(RWeka)
library(tokenizers)
## 
## Attaching package: 'tokenizers'
## The following object is masked from 'package:tm':
## 
##     stopwords
library(slam)
library(wordcloud)
## Loading required package: RColorBrewer
library(NLP)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## 
## Attaching package: 'qdapTools'
## The following object is masked from 'package:data.table':
## 
##     shift
## 
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following object is masked from 'package:text2vec':
## 
##     %>%
## The following object is masked from 'package:rvest':
## 
##     %>%
## The following object is masked from 'package:base':
## 
##     Filter
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:qdapRegex':
## 
##     %+%
## The following object is masked from 'package:NLP':
## 
##     annotate

Kungfu Hustle Movie Reviews

#####################################
# Kungfu Hustle Movie Reviews
# Walk the paginated IMDb review listings for the "love" (positive) and "hate"
# (negative) filters and collect, page by page, the review paragraphs and the
# rating text attached to each review header.
counts <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90)   # values for the `start=` URL parameter

# Paragraphs inside the review container that are not review text.
boilerplate <- c("*** This review may contain spoilers ***", "Add another review")

# One slot per page; flattened after the loop so the result vectors are not
# re-copied on every iteration.
page_ratings <- vector("list", length(counts))
page_reviews <- vector("list", length(counts))

for (k in seq_along(counts)) {
    j <- counts[k]
    url1 <- paste0("http://www.imdb.com/title/tt0373074/reviews?filter=love;filter=love;start=", j) # positive reviews
    url2 <- paste0("http://www.imdb.com/title/tt0373074/reviews?filter=hate;filter=hate;start=", j) # negative reviews
    page1 <- read_html(url1)
    page2 <- read_html(url2)

    reviews1 <- html_text(html_nodes(page1, '#tn15content p'))
    reviews2 <- html_text(html_nodes(page2, '#tn15content p'))

    # The rating is read from the `alt` text of the image following each <h2>.
    # Keep everything before the first "/" instead of a fixed-width substr():
    # the fixed width turned a single-digit positive rating into "9/", which
    # cannot be converted to a number later.
    # NOTE(review): assumes the alt text looks like "N/10" -- confirm against the page.
    rating1 <- sub("/.*$", "", html_attr(html_nodes(page1, 'h2 + img'), name = 'alt'))
    rating2 <- sub("/.*$", "", html_attr(html_nodes(page2, 'h2 + img'), name = 'alt'))

    # setdiff() drops the boilerplate paragraphs (and any exact duplicates).
    reviews.positive <- setdiff(reviews1, boilerplate)
    reviews.negative <- setdiff(reviews2, boilerplate)

    # Same interleaving as before: positives of a page, then its negatives.
    page_ratings[[k]] <- c(rating1, rating2)
    page_reviews[[k]] <- c(reviews.positive, reviews.negative)
}

ratings.df <- unlist(page_ratings)
reviews    <- unlist(page_reviews)
# NOTE(review): nothing here guarantees one rating per review; if a review has
# no rating image the two vectors will differ in length -- verify before pairing them.

Create a text file which includes all the 200 reviews. 100 Positive and 100 Negative

# Replace embedded newlines with spaces so every review occupies exactly one
# line of the output file, then write one review per line.
reviews <- gsub(pattern = "\n", replacement = ' ', x = reviews)
writeLines(text = reviews, con = 'Kungfu_Hustle_Reviews.txt')

Create a function to clean the text

# Normalise raw text for tokenisation.
#
# Args:
#   x: character vector of documents.
# Returns:
#   Character vector of the same length: HTML-like tags removed, non-ASCII
#   bytes dropped, every non-alphanumeric character turned into a space,
#   lower-cased, digits removed, whitespace collapsed and trimmed.
#
# Uses base-R regex only. The previous version relied on require("tm"), which
# returns FALSE instead of failing when the package is missing, so the error
# only surfaced later at the first tm helper call.
text.clean = function(x)                    # text data
{
    x  =  gsub("<.*?>", " ", x)               # regex for removing HTML tags
    x  =  iconv(x, "latin1", "ASCII", sub="") # keep only ASCII characters
    x  =  gsub("[^[:alnum:]]", " ", x)        # keep only alphanumeric characters
    x  =  tolower(x)                          # convert to lower case
    x  =  gsub("[[:digit:]]+", "", x)         # remove numbers
    x  =  gsub("[[:space:]]+", " ", x)        # collapse runs of whitespace to one space
    x  =  gsub("^\\s+|\\s+$", "", x)          # remove leading and trailing white space
    x
}
#--------------------------------------------------------#
# Step 1 - Reading text data                             #
#--------------------------------------------------------#

# Load the saved reviews, one review per line.
# NOTE(review): this is an absolute, machine-specific path, while the file is
# written with a relative name earlier -- confirm both point to the same file.
temp.text <- readLines(con = 'G:\\Software_Installation_Paths\\RStudio\\R_Projects\\Working_Directory_All_Projects\\Kungfu_Hustle_Reviews.txt') #Kungfu_Hustle_Reviews.txt

# Show the first review as a sanity check
head(temp.text, n = 1)
## [1] "  Goodness me, what a fantastic movie. Caught the world premiere at the Toronto International Film Festival and the entire theater laughed until they cried. Amazingly directed, HILARIOUSLY funny, it blends a 1930s gangster stylishness into a Hong Kong kung fu movie to astonishing results. Who would've thought you could top Shaolin Soccer? Not me, until I saw this movie. Stephen Chow pulled it off. Chow's comedic timing gets better and better with every movie he makes, and while his films are depending more and more on CGI these days, and makes this movie much more a fantasy kung fu film than a traditional one, it hardly detracts from the enjoyable experience. Make it your mission to see this film - it will be one of the most entertaining you ever see. I can't remember the last film I enjoyed myself in more. My eyes still hurt from wiping away tears of laughter. Seriously.  "
# One row per review line: a running integer id plus the raw text.
# seq_along() yields an empty id column for empty input (1:length() would
# give c(1, 0)); FALSE is spelled out because F can be reassigned.
data <- data.frame(id = seq_along(temp.text), text = temp.text, stringsAsFactors = FALSE)
dim(data)
## [1] 174   2

Check the dimensions of data

dim(data)
## [1] 174   2

Add all the unnecessary words to the stop list. Because this movie's plot is about kung fu and it takes place in Hong Kong, I created the custom stop list below.

# Stop-word sources:
#   stpw1 - stop-word file read from disk
#   stpw2 - tm's English list (tm:: is explicit because the tokenizers package
#           exports a function with the same name)
#   stpw3 - terms specific to this movie / domain
stpw1 <- readLines(con = 'G:\\Software_Installation_Paths\\RStudio\\R_Projects\\Working_Directory_All_Projects\\stopwords.txt')
stpw2 <- tm::stopwords('english')
stpw3 <- c("kung_fu", "hustle", "plot", "film", "movie", "hong kong", "kung fu",
           "chow", "stephen chow", "cast", "films", "movies", "story")

# Union of the three lists, duplicates removed
comn <- unique(c(stpw1, stpw2, stpw3))

# Final stop-word list: apostrophes become spaces, then de-duplicate again
stopwords <- unique(gsub(pattern = "'", replacement = " ", x = comn))

# Pre-process the corpus, remove the stop words built above, and collapse the
# extra white space their removal leaves behind.
x <- stripWhitespace(removeWords(text.clean(data$text), stopwords))
# x  =  stemDocument(x)

Create DTM using text2vec package

#--------------------------------------------------------#
## Step 2: Create DTM using text2vec package             #
#--------------------------------------------------------#

# Start of the timed section
t1 <- Sys.time()

# Word tokenizer (not the space tokenizer)
tok_fun <- word_tokenizer

# Iterator over the cleaned corpus; document ids come from data$id
it_0 <- itoken(x, tokenizer = tok_fun, ids = data$id, progressbar = FALSE)

# Collect bi-grams only (ngram_min = ngram_max = 2) with their statistics
vocab <- create_vocabulary(it_0, ngram = c(2L, 2L))

# Keep only bi-grams that occur at least 10 times in the corpus
pruned_vocab <- prune_vocabulary(vocab, term_count_min = 10)

length(pruned_vocab)
str(pruned_vocab)
## [1] 5
## List of 5
##  $ vocab         :Classes 'data.table' and 'data.frame': 22 obs. of  3 variables:
##   ..$ terms       : chr [1:22] "looney_tunes" "gong_fu" "sty_alley" "long_time" ...
##   ..$ terms_counts: int [1:22] 13 12 12 13 22 61 16 19 16 32 ...
##   ..$ doc_counts  : int [1:22] 12 6 10 12 19 40 13 16 12 30 ...
##   ..- attr(*, ".internal.selfref")=<externalptr> 
##  $ ngram         : Named int [1:2] 2 2
##   ..- attr(*, "names")= chr [1:2] "ngram_min" "ngram_max"
##  $ document_count: int 174
##  $ stopwords     : chr(0) 
##  $ sep_ngram     : chr "_"
##  - attr(*, "class")= chr "text2vec_vocabulary"
# Vectorizer built from the pruned bi-gram vocabulary, and the bi-gram DTM
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm_0 <- create_dtm(it_0, vectorizer)

# Sort bi-grams in decreasing order of frequency:
# sum the DTM over documents to get one total per term ...
term_totals <- rollup(dtm_0, 1, na.rm = TRUE, FUN = sum)
tsum <- as.matrix(t(term_totals))
# ... then order the totals from most to least frequent
tsum <- tsum[order(tsum, decreasing = TRUE), ]
head(tsum)
##        axe_gang  shaolin_soccer    martial_arts special_effects 
##              76              67              61              32 
##         pig_sty       kill_bill 
##              25              23
tail(tsum)
##      sty_alley        wo_ping       bad_guys        yuen_wo flying_daggers 
##             12             12             12             11             10 
##  action_scenes 
##             10
#-------------------------------------------------------
# Code bi-grams as unigram in clean text corpus
#
# Every frequent bi-gram "a_b" found above is rewritten in the text from "a b"
# to "a_b", so the next tokenisation pass treats it as a single token.

# Pad each document with a space on both sides so a bi-gram at the very start
# or end still has a delimiter on each side.
text2 <- paste("", x, "")

# The progress bar and counter were dropped: the bar was created but never
# advanced or closed, and txtProgressBar() fails when there is a single term
# (it requires max > min).
for (term in names(tsum)) {
    focal.term <- gsub("_", " ", term, fixed = TRUE)   # "a_b" -> "a b"
    # fixed = TRUE: terms are matched literally, never interpreted as regex
    text2 <- gsub(paste("", focal.term, ""), paste("", term, ""), text2, fixed = TRUE)
}
# NOTE(review): the match consumes the trailing space, so in "a b a b" only the
# first occurrence is joined in a single pass -- confirm whether that matters.


# Iterator over the corpus in which frequent bi-grams are now "_"-joined tokens
it_m <- itoken(text2, tokenizer = tok_fun, ids = data$id, progressbar = FALSE)

# Vocabulary with default n-gram settings (no ngram / stopwords arguments here)
vocab <- create_vocabulary(it_m)

# Pruning with a minimum term count of 1
pruned_vocab <- prune_vocabulary(vocab, term_count_min = 1)
vectorizer <- vocab_vectorizer(pruned_vocab)

# Document-term matrix over the recoded corpus
dtm_m <- create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1]  174 4391
# Convert to a tm DocumentTermMatrix with plain term-frequency weighting
dtm <- as.DocumentTermMatrix(dtm_m, weighting = weightTf)

# Flag non-empty documents. slam::row_sums() works on the sparse triplet
# representation directly; apply(dtm, 1, sum) converts the whole matrix to a
# dense one first.
a0 <- slam::row_sums(dtm) > 0
dtm <- dtm[a0, ]                # drop empty docs

# Elapsed time since t1 was set
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 1.064927 secs
# view a sample of the DTM, sorted from most to least frequent tokens
# Column totals are taken with slam::col_sums(), which stays sparse;
# apply(dtm, 2, sum) would build a dense copy of the matrix first.
dtm <- dtm[, order(slam::col_sums(dtm), decreasing = TRUE)]
inspect(dtm[1:5, 1:5])     # inspect() shows a corner of the DTM
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 9/16
## Sparsity           : 64%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs good comedy action funny great
##    1    0      0      0     1     0
##    2    1      0      0     0     0
##    3    3      2      3     1     0
##    4    0      1      0     2     0
##    5    1      0      0     0     0
#--------------------------------------------------------#
## Step 2a:     # Build word cloud                       #
#--------------------------------------------------------#

#   1- Using Term frequency(tf)

# Total frequency of every term across all documents.
# This replaces a hand-rolled split of the columns into 100 chunks, which
# misbehaved for small vocabularies: with fewer than 50 columns the chunk width
# rounded to 0 and the index `1:0` re-selected column 1, and for some sizes
# 99 chunks overshot ncol(dtm). slam::col_sums() needs no dense copy at all.
ss.col <- slam::col_sums(dtm)

tsum <- ss.col[order(ss.col, decreasing = TRUE)]   # terms in decreasing order of freq
head(tsum)
##     good   comedy   action    funny    great axe_gang 
##      146      114      101       92       84       74
tail(tsum)
## buddhism       er maturing  calling  spoofed adequate 
##        1        1        1        1        1        1
windows()  # New plot window
wordcloud(names(tsum), tsum,     # words, their freqs
          scale = c(4, 0.5),     # range of word sizes
          min.freq = 1,          # min.freq of words to consider
          max.words = 100,       # max #words
          colors = brewer.pal(8, "Dark2"))    # Plot results in a word cloud
title(sub = "Term Frequency - Wordcloud")     # title for the wordcloud display

# plot barchart for top tokens
# head() returns at most 15 entries; tsum[1:15] pads with NA when fewer exist.
top_terms <- head(tsum, 15)

# Explicit term / freq columns: mapping the data frame itself to an aesthetic
# (aes(y = test)) is what triggered the "Don't know how to automatically pick
# scale for object of type data.frame" warnings.
test <- data.frame(term = names(top_terms),
                   freq = round(top_terms, 0),
                   stringsAsFactors = FALSE)

windows()  # New plot window
ggplot(test, aes(x = term, y = freq)) +
    geom_bar(stat = "identity", fill = "Blue") +
    geom_text(aes(label = freq), vjust = -0.20) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

# -------------------------------------------------------------- #
# step 2b - Using Term frequency inverse document frequency (tfidf)             
# -------------------------------------------------------------- #

library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:qdap':
## 
##     %&%
## Loading required package: gamlr
## Loading required package: parallel
# TF-IDF weighted version of the DTM (no normalisation)
dtm.tfidf <- tfidf(dtm, normalize = FALSE)

# Total TF-IDF weight per term.
# Replaces the 100-chunk column loop, which broke for small vocabularies
# (chunk width 0 gave the index `1:0`, and rounding up could overshoot the
# column count). Matrix::colSums() sums a sparse matrix without densifying it.
ss.col <- Matrix::colSums(dtm.tfidf)

tsum <- ss.col[order(ss.col, decreasing = TRUE)]   # terms in decreasing order of weight
head(tsum)
##      sing  axe_gang      good    action     great   chinese 
## 135.81256 110.66653 104.59499 101.01739  95.23110  94.43638
tail(tsum)
## buddhism       er maturing  calling  spoofed adequate 
## 4.465908 4.465908 4.465908 4.465908 4.465908 4.465908
windows()
# Word cloud of the TF-IDF totals: at most 100 words, minimum weight 1
wordcloud(words = names(tsum),
          freq = tsum,
          scale = c(4, 0.5),
          min.freq = 1,
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")

# Top 20 tokens with their summed TF-IDF scores, shown as a one-column matrix
as.matrix(tsum[1:20])
##                     [,1]
## sing           135.81256
## axe_gang       110.66653
## good           104.59499
## action         101.01739
## great           95.23110
## chinese         94.43638
## funny           92.01584
## people          91.15090
## comedy          90.01525
## watch           88.23004
## martial_arts    88.17448
## shaolin_soccer  86.55314
## scenes          82.47346
## time            80.49296
## matrix          78.56303
## bad             78.55458
## made            78.36734
## fun             77.20128
## lot             77.20128
## humor           76.61061
(dtm.tfidf)[1:10, 1:10]   # view first 10x10 cells in the DTM under TF IDF.
## 10 x 10 sparse Matrix of class "dgCMatrix"
##    [[ suppressing 10 column names 'good', 'comedy', 'action' ... ]]
##                                                                   
## 1  .        .         .        1.000172 .        .        .       
## 2  0.716404 .         .        .        .        .        1.133704
## 3  2.149212 1.5792149 3.000517 1.000172 .        4.486481 .       
## 4  .        0.7896074 .        2.000344 .        .        .       
## 5  0.716404 .         .        .        .        .        1.133704
## 6  3.582020 .         .        .        .        1.495494 2.267407
## 7  .        .         2.000344 .        2.267407 .        2.267407
## 8  0.716404 .         .        .        .        .        .       
## 9  1.432808 .         1.000172 .        .        .        .       
## 10 .        0.7896074 .        .        .        .        .       
##                              
## 1  .        1.352393 .       
## 2  .        .        .       
## 3  3.880359 1.352393 1.247032
## 4  .        .        .       
## 5  .        .        .       
## 6  .        1.352393 1.247032
## 7  .        .        .       
## 8  9.700897 .        .       
## 9  .        .        .       
## 10 .        1.352393 .
# plot barchart for top tokens
# head() returns at most 15 entries; tsum[1:15] pads with NA when fewer exist.
top_terms <- head(tsum, 15)

# Explicit term / freq columns: mapping the data frame itself to an aesthetic
# (aes(y = test)) is what triggered the "Don't know how to automatically pick
# scale for object of type data.frame" warnings.
test <- data.frame(term = names(top_terms),
                   freq = round(top_terms, 0),
                   stringsAsFactors = FALSE)
windows()  # New plot window
ggplot(test, aes(x = term, y = freq)) +
    geom_bar(stat = "identity", fill = "red") +
    geom_text(aes(label = freq), vjust = -0.20) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

#------------------------------------------------------#
# step 2c - Term Co-occurance Matrix (TCM)             #
#------------------------------------------------------#

# Vectorizer configured for co-occurrence counting with a skip-gram window of 5
vectorizer <- vocab_vectorizer(pruned_vocab,
                               grow_dtm = FALSE,
                               skip_grams_window = 5L)

# Term co-occurrence matrix over the recoded corpus
tcm <- create_tcm(it_m, vectorizer)

# Dense copy (use tcm.mat[1:5, 1:5] to view), then add the transpose so the
# adjacency matrix is symmetric
tcm.mat <- as.matrix(tcm)
adj.mat <- tcm.mat + t(tcm.mat)

# Reorder rows and columns by decreasing column total
z <- order(colSums(adj.mat), decreasing = TRUE)
adj.mat <- adj.mat[z, z]

# Plot Simple Term Co-occurance graph: keep the leading 30 x 30 corner
top_idx <- 1:30
adj <- adj.mat[top_idx, top_idx]

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:qdap':
## 
##     %>%, diversity
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following objects are masked from 'package:text2vec':
## 
##     %>%, normalize
## The following object is masked from 'package:rvest':
## 
##     %>%
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# Undirected graph from the 30 x 30 adjacency block, simplified
cog <- simplify(graph.adjacency(adj, mode = 'undirected'))

# Drop vertices that have no edges left
isolated <- V(cog)[degree(cog) == 0]
cog <- delete.vertices(cog, isolated)

windows()
plot(cog)

#-----------------------------------------------------------#
# Step 2d - a cleaned up or 'distilled' COG PLot            #
#-----------------------------------------------------------#

# Plot a "distilled" co-occurrence graph.
#
# Rows/columns are sorted by decreasing column sum; for each of the first `s`
# rows only the strongest links are kept (binarised to 1), and every column
# claimed by a row is zeroed in all rows below it.
#
# Args:
#   mat1:  square numeric matrix with column names (term adjacency or
#          co-occurrence weights).
#   title: main title for the plot.
#   s:     number of leading rows processed as central nodes.
#   k1:    the k1-th largest value of a row is its cut-off; smaller entries are
#          dropped (entries tied with the cut-off are kept).
# Side effect: draws the graph on the current graphics device.
#
# Fixes over the previous version: `s` and `k1` are clamped to the matrix size
# (an oversized `k1` gave an NA threshold, `s == nrow` indexed past the last
# row); linked terms are read from colnames() so a single match keeps its name;
# sub-matrices keep their dimensions when one term is selected; vertex colours
# are assigned in one vectorised step so fewer than `s` vertices cannot produce
# a descending index.
distill.cog = function(mat1, # input TCM ADJ MAT
                       title, # title for the graph
                       s,    # no. of central nodes
                       k1){  # max no. of connections
    library(igraph)
    a = colSums(mat1) # column totals
    b = order(-a)     # indices in decreasing order of total

    mat2 = mat1[b, b]     # order both rows and columns along vector b

    diag(mat2) =  0

    s  = min(s, nrow(mat2))    # cannot have more central rows than rows
    k1 = min(k1, ncol(mat2))   # cannot rank past the last column

    ## +++ go row by row and find top k adjacencies +++ ##

    wc = NULL

    for (i1 in seq_len(s)){
        thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]   # k1-th largest value in this row
        mat2[i1, mat2[i1,] < thresh1] = 0             # drop weaker links
        mat2[i1, mat2[i1,] > 0 ] = 1                  # binarise what remains
        word = colnames(mat2)[mat2[i1,] > 0]          # terms linked to this row
        if (i1 < nrow(mat2)) {
            # a claimed term cannot be claimed again by a later row
            mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
        }
        wc = c(wc,word)
    } # i1 loop ends

    keep = match(wc, colnames(mat2))
    mat3 = mat2[keep, keep, drop = FALSE]
    ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # selected terms in mat2's column order
    mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3)), drop = FALSE]
    graph <- graph.adjacency(mat4, mode = "undirected", weighted=TRUE)    # Create Network object
    graph = simplify(graph)

    # first s vertices green, any remaining ones pink
    V(graph)$color = ifelse(seq_along(V(graph)) <= s, "green", "pink")

    graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ])

    plot(graph,
         layout = layout.kamada.kawai,
         main = title)

} # func ends

# Distilled COG from the raw term co-occurrence matrix
windows()
distill.cog(mat1 = tcm.mat, title = 'Distilled COG', s = 10, k1 = 5)

## adj.mat and distilled cog for tfidf DTMs ##

# Term-by-term matrix from the TF-IDF DTM, with the diagonal cleared
adj.mat <- t(dtm.tfidf) %*% dtm.tfidf
diag(adj.mat) <- 0

# Keep the 50 terms with the largest column totals
a1 <- order(apply(adj.mat, 2, sum), decreasing = TRUE)
top50 <- a1[1:50]
adj.mat <- as.matrix(adj.mat[top50, top50])

windows()
distill.cog(mat1 = adj.mat, title = 'Distilled COG', s = 10, k1 = 10)

# Pair each scraped review with its rating string
reviews_df <- data.frame(reviews, ratings.df)

Compare each review’s polarity score with its star rating.

#--------------------------------------------------------#
#  Step 3 correlation between polarity and rating       #
#--------------------------------------------------------#

reviews_df <- data.frame(reviews,ratings.df)

# Per-review polarity score from qdap
polarity <- counts(polarity(reviews_df$reviews))[, "polarity"]

reviews_df$polarity <-polarity

# reviews_df --> first column contains review, second column has rating and third has polarity of review

# Convert the rating text to numbers before correlating. as.character() is
# needed because as.numeric() on a factor column returns the internal level
# codes rather than the ratings; sub() drops anything from the first "/" on so
# a value such as "9/" still parses.
rating_num <- as.numeric(sub("/.*$", "", as.character(reviews_df$ratings.df)))

# complete.obs: skip pairs where the rating or the polarity is missing
cor(rating_num, polarity, use = "complete.obs")
## [1] -0.09161505
#--------------------------------------------------------#
#             Sentiment Analysis                         #
#--------------------------------------------------------#

library(qdap)

# Keep only the documents that were retained in the DTM (non-empty ones)
x1 <- x[a0]

# Restart the timer for this section
t1 <- Sys.time()

# Polarity of every document using the qdap dictionary
pol <- polarity(x1)

# Per-document columns of pol$all, picked by position
pol_all <- pol$all
wc  <- pol_all[, 2]     # word count in each doc
val <- pol_all[, 3]     # average polarity score
p   <- pol_all[, 4]     # positive words info
n   <- pol_all[, 5]     # negative words info

dim(pol)
## NULL
Sys.time() - t1  # how much time did the above take?
## Time difference of 10.9854 secs
head(pol$all)
##   all  wc   polarity
## 1 all  49  0.5714286
## 2 all  41  0.1561738
## 3 all 209 -1.4249318
## 4 all  37  0.3287980
## 5 all  90  1.0540926
## 6 all 211  0.3029085
##                                                                                                                                           pos.words
## 1                                                                goodness, fantastic, amazingly, astonishing, top, enjoyable, entertaining, enjoyed
## 2                                                                                                                  good, hero, sincerely, recommend
## 3                                             imaginative, masters, jaw dropping, famed, good, good, masters, good, works, sophisticated, authentic
## 4                                                                                                                classic, hilarious, proud, success
## 5              promise, enthusiastic, promise, promise, sweeping, good, integrated, smoothly, stellar, perfect, believable, excellent, work, steady
## 6 good, award, winning, humour, good, good, attraction, successful, gratifying, fun, hallmark, skilled, skill, masters, engrossing, good, fun, good
##                                                                                                                                                                                                                                                            neg.words
## 1                                                                                                                                                                                                                                    funny, gangster, detracts, hurt
## 2                                                                                                                                                                                                                                                weak, dirty, regret
## 3 wildly, petty, ruthless, gangster, extort, notorious, relentless, explosive, outrageous, kill, blow, rascal, glibly, feeble, stuck, poverty, unknown, notorious, cruelest, pig, extortion, clash, skinny, wimpy, evil, torture, funny, suffering, enemy, poisonous
## 4                                                                                                                                                                                                                                                       funny, funny
## 5                                                                                                                                                                                                                                       strike, warning, evil, jaded
## 6                                                                                                                                                                           stumbled, abyss, weak, sarcasm, petty, lecher, beggar, naive, madness, mad, crazy, knife
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      text.var
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  goodness fantastic caught world premiere toronto international festival entire theater laughed cried amazingly directed hilariously funny blends gangster stylishness astonishing results ve thought top shaolin soccer pulled comedic timing makes depending cgi days makes fantasy traditional detracts enjoyable experience make mission entertaining remember enjoyed eyes hurt wiping tears laughter 
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       general puts tribute argue stephen lots jokes routines makes bit weak script make dirty jokes usual learn philosophy hours good laugh time watch carefully understands opinion thing crazies common crouching tiger hidden dragon hero express sincerely recommend slightest hesitation watch regret 
## 3  wildly imaginative action packed petty thief sing aspires ruthless gangster stumbling gang controlled apartment extort money locals masters disguise sing actions attracts notorious axe gang set relentless chain events brings clans explosive battle showdowns dance sequences featuring tuxedoed mobsters martial arts action outrageous jaw dropping fight sequences yeun wo ping famed action choreographer kill bill vol matrix blow set guangdong province china plays ah xing street rascal fools constantly good talking glibly core feeble minded stuck poverty unknown future good xing beginning discover xing goal member notorious axe gang fiercest cruelest widespread gang city day xing slum pig cage town usual extortion witnesses real clash gangs axe gang clearing local gang shown allegiance axe xing realizes slum residents including plump landlady skinny wimpy husband turned masters fight large group gangs xing realizes battle good evil called choose side continues unique comedy style twisting slapstick jokes reinterpreting composed face intensifies torture sequences creates funny points suffering shows characters practising faced middle aged woman slum quickly kicked groin stabbed flying daggers supposedly aimed enemy bitten face poisonous snakes trials manages survive day creativity demonstrated higher quality comedy early works fight back school king beggars cinematography sophisticated authentic aura shaolin soccer heavily adopts computer graphics present special effects snake daggers flying axes
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        yesterday made history simple storyline funny comedy similar harry potter case wizard animations hayao miyazaki world imagination created characters funny classic appeared context result hilarious make people proud hope box office success countries buy dvd personal collection
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    decades avoiding copy pressed hands dear friend extracted promise time night sit watch enthusiastic tossed drive athlon promise promise minutes nuclear strike warning town torn screen watched dozen times cheesiness cinema rare occasions writer director directly tap nerve weave directly piece simply standard romance sweeping portrait good versus evil moral lesson power comedic bits integrated flows smoothly tone fight choreography stellar special care make character piece human perfect people places makes wire fighting appears parts believable acting excellent camera work rock steady feeling jaded burdened person make difference settle copy 
## 6                                                                    good things latest part unlike previous award winning action comedy shaolin soccer continued martial arts theme expanded massively repertoire sfx fan beginning times relied deadpan expression toilet humour spoofs books good ending heart warming small town village boy makes good lines elements noticeably absent kfh ng man tat missing customary female attraction main female character screen time ss back excepting ng goalkeeper roaring time leader axe gang kfh talk sfx creatively world part stumbled cinematic abyss caught hollywood directors dabble black arts sfx thankfully answer true kfh unlike ss weak characters characteristic recent western yawns van helsing blade trinity stepped formula made ss sfx extravaganza successful contrast kfh bundle action sequences thrown showcase sfx wit cheekiness remains gratifying fun sarcasm hallmark acting career translated produced directed kfh spoofing clichs chinese martial arts stories retired martial arts couple petty scrooge lecher heroes save neighbourhood gay end dispatched con beggar sells wu ling mi ji naive boy spoofs sfx spectacles drive madness upward highly skilled sitar players strum blades ghouls battle pugilist toad skill croaks blows lower mouth ascends skies masters buddha palm eagle distance engrossing mass battle black clad axe gangsters lifted matrix mad irreverent times affecting yearn good sfx fun crazy surely funniest bit knife throwing sequence good mind bring year
# Show the corpus-level polarity summary held in the `group` element of `pol`
head(pol$group)
##   all total.sentences total.words ave.polarity sd.polarity
## 1 all             174       14895    0.3302639   0.7289411
##   stan.mean.polarity
## 1          0.4530735
# Flatten the per-document word lists, discard the "-" entries and keep each
# remaining term exactly once.
positive_words <- local({
  hits <- unlist(p)
  unique(hits[!(hits %in% "-")])
})
negative_words <- local({
  hits <- unlist(n)
  unique(hits[!(hits %in% "-")])
})

# Show every distinct positive term found in the corpus
print(positive_words)
##   [1] "goodness"        "fantastic"       "amazingly"      
##   [4] "astonishing"     "top"             "enjoyable"      
##   [7] "entertaining"    "enjoyed"         "good"           
##  [10] "hero"            "sincerely"       "recommend"      
##  [13] "imaginative"     "masters"         "jaw dropping"   
##  [16] "famed"           "works"           "sophisticated"  
##  [19] "authentic"       "classic"         "hilarious"      
##  [22] "proud"           "success"         "promise"        
##  [25] "enthusiastic"    "sweeping"        "integrated"     
##  [28] "smoothly"        "stellar"         "perfect"        
##  [31] "believable"      "excellent"       "work"           
##  [34] "steady"          "award"           "winning"        
##  [37] "humour"          "attraction"      "successful"     
##  [40] "gratifying"      "fun"             "hallmark"       
##  [43] "skilled"         "skill"           "engrossing"     
##  [46] "worth"           "important"       "spectacular"    
##  [49] "great"           "succeeds"        "talented"       
##  [52] "rightful"        "worked"          "incredibly"     
##  [55] "likable"         "amazing"         "powerful"       
##  [58] "legendary"       "fans"            "wonderful"      
##  [61] "fairly"          "applaud"         "smile"          
##  [64] "pure"            "entertain"       "humor"          
##  [67] "impressive"      "phenomenal"      "joy"            
##  [70] "decent"          "neatly"          "master"         
##  [73] "enjoy"           "cool"            "neat"           
##  [76] "amusing"         "suitable"        "deserving"      
##  [79] "thriving"        "assure"          "interesting"    
##  [82] "wonders"         "gain"            "acclaim"        
##  [85] "free"            "positive"        "fast"           
##  [88] "ready"           "redeeming"       "favor"          
##  [91] "pride"           "survival"        "incredible"     
##  [94] "inventive"       "genuine"         "properly"       
##  [97] "superior"        "optimistic"      "beautiful"      
## [100] "proper"          "greatest"        "marvelous"      
## [103] "coolest"         "sweet"           "love"           
## [106] "kindness"        "honest"          "meaningful"     
## [109] "reasonable"      "recommended"     "prefer"         
## [112] "righteous"       "worthwhile"      "genius"         
## [115] "humorous"        "adorable"        "super"          
## [118] "awesome"         "creative"        "stylized"       
## [121] "innovative"      "modern"          "homage"         
## [124] "beauty"          "satisfy"         "memorable"      
## [127] "strong"          "masterpiece"     "daring"         
## [130] "fresh"           "improved"        "nicely"         
## [133] "brilliant"       "talent"          "sharp"          
## [136] "cute"            "prowess"         "qualify"        
## [139] "acclaimed"       "solid"           "promising"      
## [142] "unreal"          "adore"           "fame"           
## [145] "wowed"           "pretty"          "fine"           
## [148] "correct"         "enjoys"          "enjoyment"      
## [151] "inspiration"     "happier"         "impressed"      
## [154] "wonderfully"     "pleasant"        "loved"          
## [157] "veritable"       "loving"          "effortlessly"   
## [160] "gem"             "magnificent"     "amazed"         
## [163] "awe"             "honesty"         "perfectly"      
## [166] "fair"            "proves"          "fortunately"    
## [169] "refreshing"      "miracle"         "happily"        
## [172] "realistic"       "smiling"         "benefit"        
## [175] "superb"          "winner"          "supporting"     
## [178] "awards"          "merit"           "clever"         
## [181] "nice"            "appeal"          "aspire"         
## [184] "progress"        "likes"           "masterful"      
## [187] "intelligible"    "fast paced"      "woo"            
## [190] "top notch"       "breathtaking"    "coherent"       
## [193] "preferring"      "appealing"       "dazzling"       
## [196] "satisfied"       "witty"           "happiness"      
## [199] "defeat"          "favorite"        "extraordinary"  
## [202] "beautifully"     "accomplished"    "excited"        
## [205] "blockbuster"     "satisfying"      "shine"          
## [208] "easy"            "enhanced"        "idol"           
## [211] "brilliantly"     "delight"         "peaceful"       
## [214] "colorful"        "romantic"        "significant"    
## [217] "admire"          "exciting"        "respect"        
## [220] "outdo"           "originality"     "notably"        
## [223] "stronger"        "effective"       "mighty"         
## [226] "heaven"          "win"             "support"        
## [229] "clear"           "achievement"     "intelligent"    
## [232] "delightful"      "achievements"    "praise"         
## [235] "silent"          "impress"         "magical"        
## [238] "visionary"       "attractive"      "lush"           
## [241] "favour"          "promised"        "stylish"        
## [244] "comfortable"     "brave"           "greatness"      
## [247] "lead"            "ample"           "golden"         
## [250] "outstanding"     "inspiring"       "vibrant"        
## [253] "charm"           "endearing"       "talents"        
## [256] "cherished"       "mature"          "cherish"        
## [259] "slick"           "perfection"      "renowned"       
## [262] "famous"          "modesty"         "righteousness"  
## [265] "entertains"      "tough"           "terrific"       
## [268] "thrill"          "pleasure"        "surreal"        
## [271] "dedicated"       "dazzle"          "rich"           
## [274] "spiritual"       "enlightenment"   "thankful"       
## [277] "remarkably"      "playful"         "magnificence"   
## [280] "clean"           "reliable"        "seasoned"       
## [283] "inestimable"     "leads"           "promises"       
## [286] "exuberant"       "prize"           "stunning"       
## [289] "led"             "sensational"     "amuse"          
## [292] "gentle"          "champion"        "soulful"        
## [295] "poignant"        "spirited"        "worthy"         
## [298] "flawless"        "lover"           "masterpieces"   
## [301] "thrilling"       "humble"          "keen"           
## [304] "finest"          "unaffected"      "razor sharp"    
## [307] "passionate"      "majestic"        "amaze"          
## [310] "whoa"            "abound"          "lucky"          
## [313] "cheer"           "vivid"           "magic"          
## [316] "gorgeous"        "mind blowing"    "fascinating"    
## [319] "handsome"        "succeeded"       "wisely"         
## [322] "instantly"       "destiny"         "protect"        
## [325] "sensitive"       "enjoying"        "supreme"        
## [328] "galore"          "redemption"      "engaging"       
## [331] "defeated"        "protection"      "reputation"     
## [334] "easier"          "happy"           "successfully"   
## [337] "mastery"         "unselfish"       "wow"            
## [340] "grand"           "sincere"         "faith"          
## [343] "secure"          "exceeded"        "shiny"          
## [346] "beloved"         "loyalty"         "regard"         
## [349] "abundant"        "adequate"        "guarantee"      
## [352] "prominent"       "thrive"          "lovable"        
## [355] "accessible"      "pleased"         "variety"        
## [358] "glamorous"       "excellently"     "wins"           
## [361] "ease"            "entice"          "delectable"     
## [364] "thoughtful"      "delightfully"    "loves"          
## [367] "hottest"         "hot"             "earnest"        
## [370] "revel"           "seamless"        "fantastically"  
## [373] "superbly"        "praising"        "stunningly"     
## [376] "fascinate"       "popular"         "bliss"          
## [379] "sweetness"       "fabulous"        "feat"           
## [382] "straightforward" "polished"        "flashy"         
## [385] "smart"           "conveniently"    "recommendation" 
## [388] "exceptional"     "cleverly"        "splendidly"     
## [391] "passion"         "glad"            "wisdom"         
## [394] "friendly"        "appreciated"     "enthusiasm"     
## [397] "glory"           "holy"            "skillfully"     
## [400] "heroine"         "extraordinarily" "heroic"         
## [403] "fertile"         "exceptionally"   "brilliance"     
## [406] "fancy"           "effectively"
print(negative_words)       # Print every distinct negative word found in the corpus
##   [1] "funny"            "gangster"         "detracts"        
##   [4] "hurt"             "weak"             "dirty"           
##   [7] "regret"           "wildly"           "petty"           
##  [10] "ruthless"         "extort"           "notorious"       
##  [13] "relentless"       "explosive"        "outrageous"      
##  [16] "kill"             "blow"             "rascal"          
##  [19] "glibly"           "feeble"           "stuck"           
##  [22] "poverty"          "unknown"          "cruelest"        
##  [25] "pig"              "extortion"        "clash"           
##  [28] "skinny"           "wimpy"            "evil"            
##  [31] "torture"          "suffering"        "enemy"           
##  [34] "poisonous"        "strike"           "warning"         
##  [37] "jaded"            "stumbled"         "abyss"           
##  [40] "sarcasm"          "lecher"           "beggar"          
##  [43] "naive"            "madness"          "mad"             
##  [46] "crazy"            "knife"            "disgustingly"    
##  [49] "overpriced"       "overblown"        "fail"            
##  [52] "disappointed"     "fell"             "tired"           
##  [55] "bum"              "deaf"             "weep"            
##  [58] "fat"              "lazy"             "frail"           
##  [61] "domineering"      "confusing"        "comical"         
##  [64] "stunt"            "waste"            "slap"            
##  [67] "unwatchable"      "struggle"         "warped"          
##  [70] "dreadful"         "worst"            "shock"           
##  [73] "drag"             "stupid"           "unfortunate"     
##  [76] "burn"             "misfortune"       "suck"            
##  [79] "bad"              "awful"            "ripped"          
##  [82] "ax"               "lame"             "poorly"          
##  [85] "wasted"           "sucks"            "terrible"        
##  [88] "painful"          "contradict"       "annoying"        
##  [91] "poor"             "lost"             "lose"            
##  [94] "unbelievably"     "lack"             "lacks"           
##  [97] "stinks"           "bewildering"      "retarded"        
## [100] "ridiculous"       "stupidity"        "drunk"           
## [103] "hard"             "worse"            "hate"            
## [106] "cheats"           "miscellaneous"    "uneasy"          
## [109] "broken"           "conflict"         "break"           
## [112] "falls"            "dreadfully"       "joke"            
## [115] "inept"            "failing"          "sadly"           
## [118] "lacking"          "incoherent"       "absurdity"       
## [121] "blame"            "morons"           "utterly"         
## [124] "dumping"          "smash"            "gross"           
## [127] "parody"           "strange"          "doubt"           
## [130] "unpredictable"    "violently"        "hell"            
## [133] "disturbing"       "absurd"           "death"           
## [136] "wild"             "bloody"           "regretted"       
## [139] "miss"             "killer"           "devil"           
## [142] "blah"             "struggling"       "wrong"           
## [145] "darkness"         "dilemma"          "badly"           
## [148] "cry"              "suffers"          "bull"            
## [151] "intense"          "sick"             "hated"           
## [154] "goofy"            "sty"              "fear"            
## [157] "long time"        "subversive"       "distracting"     
## [160] "bother"           "randomly"         "scratch"         
## [163] "disorder"         "messing"          "horrid"          
## [166] "horrible"         "twist"            "wreck"           
## [169] "mess"             "stupidest"        "overrated"       
## [172] "hung"             "convoluted"       "disaster"        
## [175] "excuse"           "dumped"           "killing"         
## [178] "refuse"           "stumble"          "trick"           
## [181] "diabolic"         "haphazard"        "silly"           
## [184] "cruel"            "sin"              "dumb"            
## [187] "warned"           "addict"           "stereotypical"   
## [190] "problem"          "childish"         "scary"           
## [193] "superficial"      "embarrassment"    "mockery"         
## [196] "undercut"         "repetitive"       "messy"           
## [199] "struck"           "revenge"          "underdog"        
## [202] "pointless"        "superfluous"      "cold"            
## [205] "unable"           "disbelief"        "disappointment"  
## [208] "fault"            "tragedy"          "bizarre"         
## [211] "bored"            "blunt"            "irritating"      
## [214] "loud"             "faults"           "slow"            
## [217] "shortness"        "mistaken"         "complaining"     
## [220] "denying"          "uneven"           "hapless"         
## [223] "embroiled"        "notoriously"      "violent"         
## [226] "ramshackle"       "dastardly"        "attacks"         
## [229] "prison"           "broke"            "showdown"        
## [232] "malevolent"       "fever"            "cartoonish"      
## [235] "disliked"         "insane"           "issues"          
## [238] "critical"         "squabble"         "ridiculously"    
## [241] "impossible"       "complex"          "critics"         
## [244] "sloppy"           "unbelievable"     "onslaught"       
## [247] "rife"             "inferior"         "ruin"            
## [250] "fuss"             "racist"           "nonsense"        
## [253] "difficult"        "frustrated"       "offensive"       
## [256] "upsetting"        "nervous"          "excessive"       
## [259] "abuse"            "bashing"          "drags"           
## [262] "sad"              "contrived"        "boring"          
## [265] "clueless"         "lacked"           "fall"            
## [268] "knock"            "die"              "dangerous"       
## [271] "gruesome"         "shallow"          "alienate"        
## [274] "suspect"          "dragged"          "twists"          
## [277] "stolen"           "killed"           "mistakes"        
## [280] "fret"             "disappointments"  "weird"           
## [283] "trouble"          "chaos"            "lure"            
## [286] "blind"            "deadly"           "infamous"        
## [289] "bitter"           "spoil"            "stab"            
## [292] "tainted"          "stooges"          "fails"           
## [295] "miserably"        "disgust"          "ignore"          
## [298] "marginally"       "crappy"           "miserable"       
## [301] "failure"          "monotonous"       "criticism"       
## [304] "dying"            "brash"            "impoverished"    
## [307] "mistake"          "confusion"        "stale"           
## [310] "failed"           "disappoint"       "concerned"       
## [313] "moody"            "shortcomings"     "unfamiliar"      
## [316] "harsh"            "betrayal"         "hardship"        
## [319] "villains"         "melancholy"       "sucked"          
## [322] "tiring"           "unsuccessful"     "dull"            
## [325] "touchy"           "susceptible"      "rebuff"          
## [328] "laughable"        "wicked"           "flaws"           
## [331] "cheesy"           "pale"             "fist"            
## [334] "fury"             "ironically"       "disgusting"      
## [337] "attack"           "cloud"            "overwhelmed"     
## [340] "crime"            "unrealistic"      "confused"        
## [343] "anemic"           "strictly"         "pratfall"        
## [346] "criminal"         "garbage"          "splitting"       
## [349] "bugs"             "dark"             "fried"           
## [352] "hysterical"       "diametrically"    "perplexed"       
## [355] "cheap"            "powerless"        "vicious"         
## [358] "overweight"       "panicking"        "lie"             
## [361] "assassinate"      "confuse"          "annoy"           
## [364] "stagnant"         "heinous"          "swindle"         
## [367] "obnoxious"        "assassin"         "noisy"           
## [370] "struggles"        "overwhelming"     "complaints"      
## [373] "strangely"        "brutal"           "beware"          
## [376] "gasp"             "die hard"         "problems"        
## [379] "irony"            "unexpected"       "desperate"       
## [382] "blackmail"        "dead"             "stealing"        
## [385] "suffer"           "dust"             "freeze"          
## [388] "expensive"        "shocked"          "offend"          
## [391] "frightening"      "dump"             "annoys"          
## [394] "wary"             "shame"            "split"           
## [397] "blurred"          "hefty"            "contortions"     
## [400] "ridicules"        "fierce"           "overwhelmingly"  
## [403] "limited"          "beg"              "steal"           
## [406] "ranting"          "useless"          "object"          
## [409] "oppression"       "brutality"        "negative"        
## [412] "destroy"          "unnecessary"      "incongruous"     
## [415] "disappointing"    "simplistic"       "complicated"     
## [418] "weaker"           "refuses"          "unappealing"     
## [421] "selfish"          "afraid"           "doldrums"        
## [424] "dislike"          "rift"             "stricken"        
## [427] "threaten"         "rogue"            "loses"           
## [430] "harm"             "wounds"           "suspiciously"    
## [433] "exaggerate"       "punch"            "thug"            
## [436] "lackluster"       "damn"             "disproportionate"
## [439] "tiresome"         "denied"           "vice"            
## [442] "hardships"        "insult"           "pretentious"     
## [445] "crap"             "lethal"           "tyranny"         
## [448] "fleeing"          "sly"              "bullies"         
## [451] "disregard"        "defiance"         "seriousness"     
## [454] "bothered"         "cringed"          "bashed"          
## [457] "guilty"           "discomfort"       "fatal"           
## [460] "flaw"             "manic"            "confrontation"   
## [463] "inability"        "tension"          "overdone"        
## [466] "misses"           "deter"            "detestable"      
## [469] "ironic"           "awkward"          "falling"         
## [472] "imaginary"        "worries"          "hang"            
## [475] "unclear"          "anxious"          "infernal"        
## [478] "satirical"        "critic"           "murder"          
## [481] "loose"            "worn"             "innuendo"        
## [484] "scarce"           "troubled"         "drunken"         
## [487] "startling"        "cruelty"          "infuriatingly"   
## [490] "erratic"          "smack"            "assault"         
## [493] "smoldering"       "rage"             "offender"        
## [496] "villainous"       "pan"              "idiotic"         
## [499] "mocking"          "objections"       "pain"            
## [502] "heck"             "obscure"          "stumbles"        
## [505] "extravagant"      "imposing"         "exploit"         
## [508] "fatty"            "abrupt"           "risky"           
## [511] "missed"           "inane"            "misread"         
## [514] "phony"            "limit"            "overwhelm"       
## [517] "stereotype"       "needless"         "odd"             
## [520] "worried"          "quarrels"         "flare"           
## [523] "joker"            "crash"            "massacre"        
## [526] "shrivel"          "fake"             "dissuade"        
## [529] "pessimistic"      "wasting"          "idiots"          
## [532] "raving"           "scum"
#--------------------------------------------------------#
#   Create Positive Words wordcloud                      #
#--------------------------------------------------------#

# Keep only the DTM columns whose term appears in the positive word list
pos.tdm <- dtm[, which(colnames(dtm) %in% positive_words)]
m <- as.matrix(pos.tdm)
# Corpus-wide frequency of every positive term, most frequent first
v <- sort(colSums(m), decreasing = TRUE)

# dev.new() opens a fresh graphics device on every platform; windows() only
# exists on Windows builds of R.
dev.new()
wordcloud(names(v), v,
          scale = c(4, 1),
          min.freq = 1,        # previously passed as an unnamed positional 1
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Positive Words - Wordcloud")

# Bar chart of the (up to) 15 most frequent positive terms.
# head() avoids the NA entries that v[1:15] yields when fewer than 15 terms exist.
top_terms <- head(v, 15)
# Explicit word/freq columns so each aesthetic maps to a real column instead of
# to the data frame object itself (which ggplot2 cannot pick a scale for).
test <- data.frame(word = as.character(names(top_terms)),
                   freq = as.numeric(top_terms))
dev.new()  # portable replacement for the Windows-only windows()
ggplot(test, aes(x = word, y = freq)) +
    geom_bar(stat = "identity", fill = "blue") +
    geom_text(aes(label = freq), vjust = -0.20) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

#--------------------------------------------------------#
#  Create Negative Words wordcloud                       #
#--------------------------------------------------------#

# Keep only the DTM columns whose term appears in the negative word list
neg.tdm <- dtm[, which(colnames(dtm) %in% negative_words)]
m <- as.matrix(neg.tdm)
# Corpus-wide frequency of every negative term, most frequent first
v <- sort(colSums(m), decreasing = TRUE)

# dev.new() opens a fresh graphics device on every platform; windows() only
# exists on Windows builds of R.
dev.new()
wordcloud(names(v), v,
          scale = c(4, 1),
          min.freq = 1,        # previously passed as an unnamed positional 1
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Negative Words - Wordcloud")

# Bar chart of the (up to) 15 most frequent negative terms.
# head() avoids the NA entries that v[1:15] yields when fewer than 15 terms exist.
top_terms <- head(v, 15)
# Explicit word/freq columns so each aesthetic maps to a real column instead of
# to the data frame object itself (which ggplot2 cannot pick a scale for).
test <- data.frame(word = as.character(names(top_terms)),
                   freq = as.numeric(top_terms))
dev.new()  # portable replacement for the Windows-only windows()
ggplot(test, aes(x = word, y = freq)) +
    geom_bar(stat = "identity", fill = "red") +
    geom_text(aes(label = freq), vjust = -0.20) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

#--------------------------------------------------------#
#  Positive words vs Negative Words plot                 #
#--------------------------------------------------------#

# Count the sentiment words recorded for one document.
#
# x: the matched words for a single document; a lone "-" stands for
#    "no words matched".
# Returns 0 for the lone "-" placeholder, otherwise the number of elements in
# unlist(x).
len <- function(x) {
    # Test the length first: `&&` needs a scalar on its left-hand side, and
    # `x == "-"` is a vector whenever a document has more than one word
    # (an error in R >= 4.3). This ordering also makes empty input safe.
    if (length(x) == 1L && identical(as.character(x[[1L]]), "-")) {
        return(0)
    }
    length(unlist(x))
}

# Number of positive / negative words matched per element of p and n
pcount <- vapply(p, len, numeric(1))
ncount <- vapply(n, len, numeric(1))
# seq_along() stays correct for an empty wc, unlike seq(1:length(wc))
doc_id <- seq_along(wc)

dev.new()  # portable replacement for the Windows-only windows()
# Size the y-axis from both series so the negative line is never clipped
plot(doc_id, pcount, type = "l", col = "green",
     ylim = range(c(pcount, ncount)),
     xlab = "Document ID", ylab = "Word Count")
lines(doc_id, ncount, type = "l", col = "red")
title(main = "Positive words vs Negative Words")
legend("topright", inset = .05, c("Positive Words", "Negative Words"),
       fill = c("green", "red"), horiz = TRUE)

# Document sentiment running plot: polarity score in document order
dev.new()  # portable replacement for the Windows-only windows()
plot(pol$all$polarity, type = "l",
     xlab = "Document Number", ylab = "Polarity Score")
abline(h = 0)  # reference line at zero polarity
title(main = "Polarity Plot")

### COG for sentiment-laden words ? ###

# Put the positive-term and negative-term columns side by side, then report
# the dimensions of the combined matrix.
senti.dtm <- cbind(pos.tdm, neg.tdm)
dim(senti.dtm)
## [1] 174 932
# Term-by-term co-occurrence matrix t(X) %*% X. crossprod() computes the same
# product from a single dense conversion instead of two plus a transpose.
senti.adj.mat <- crossprod(as.matrix(senti.dtm))
diag(senti.adj.mat) <- 0   # zero the diagonal so a term is not linked to itself

dev.new()  # portable replacement for the Windows-only windows()
distill.cog(senti.adj.mat,                    # adjacency matrix object
            'Distilled COG of senti words',   # plot title
            5,                                # max number of central nodes
            5)                                # max number of connections

I would recommend proceeding with a sequel, as the overall sentiment is positive.

Also, the positive words that speak FOR this film are strong, and their counts are much higher than those of the negative words that go against this film.

The martial arts are very good and humour plays a strong role in this movie, as highlighted in most of the positive reviews.

On the other side, there are also people who talked about the similarity of the movie’s narration and plot to Shaolin Soccer and other Chinese movies.

Sing Lady and the Landlord are the characters people talked about a lot in this movie.

There are very beautiful elements that can be carried into the next sequel, such as love, narration, comedy and the story woven together with the martial arts.

The two factors considered (Ratings & Generated Polarity Score) are slightly negatively correlated.

TOP 3 Movie Recommendations:

* Comedy usage all through the movie
* Use of Characters in the narration (Especially the Singing Lady / Lord)
* Action / Chinese Martial Arts