Submitted By: Neeraj Khattar, CBA (Batch-7), Roll #: 71620042

Problem Statement…

Imagine you’re a Data Scientist / consultant for a movie studio. Your brief is to recommend the top 2-3 movie aspects or attributes the studio should focus on when making a sequel.

The aim is to explore, by trial and error, different analysis configurations (e.g., which stop-words give the most meaning? TF or TF-IDF weighting? etc.) in the text analysis of a simple corpus.
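For example, the TF vs. TF-IDF choice can be previewed on a tiny, made-up corpus before settling on a scheme for the full review set. Below is a minimal sketch using the tm package (loaded in the Prerequisite step further down); the three toy documents are invented purely for illustration.

# Sketch: contrast TF and TF-IDF weighting on a made-up 3-document corpus
library(tm)
toy  = c("good movie good story",
         "weak movie weak story",
         "good acting in this movie")       # invented documents, illustration only
corp = VCorpus(VectorSource(toy))
dtm_tf    = DocumentTermMatrix(corp)                                           # raw term frequency (TF)
dtm_tfidf = DocumentTermMatrix(corp, control = list(weighting = weightTfIdf))  # TF-IDF weighting
inspect(dtm_tf)     # 'movie' gets a high count in every document
inspect(dtm_tfidf)  # 'movie' (present in all documents) drops to zero; distinctive terms stand out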

Prerequisites

Load the required libraries, installing any that are not already present. Two helper functions used in the code below (text.clean and find.polarity) are also defined here.

require(rvest) || install.packages('rvest')
require(RSelenium) || install.packages('RSelenium')
require(text2vec) || install.packages('text2vec')
require(data.table) || install.packages('data.table')
require(stringr) || install.packages('stringr')
require(tm) || install.packages('tm')
require(RWeka) || install.packages('RWeka')
require(tokenizers) || install.packages('tokenizers')
require(slam) || install.packages('slam')
require(wordcloud) || install.packages('wordcloud')
require(ggplot2) || install.packages('ggplot2')
require(XML) || install.packages('XML')
require(qdap) || install.packages("qdap") # ensure java is up to date!
require(textir) || install.packages("textir")
require(igraph) || install.packages("igraph")

library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(rvest)
library(ggplot2)
library(XML)
library(qdap)
library(textir)
library(igraph)
    
rm(list=ls())

text.clean = function(x)                    # text data
{ require("tm")
  x  =  gsub("<.*?>", " ", x)               # regex for removing HTML tags
  x  =  iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
  x  =  gsub("[^[:alnum:]]", " ", x)        # keep only alpha numeric 
  x  =  tolower(x)                          # convert to lower case characters
  x  =  removeNumbers(x)                    # removing numbers
  x  =  stripWhitespace(x)                  # removing white space
  x  =  gsub("^\\s+|\\s+$", "", x)          # remove leading and trailing white space
  return(x)
}
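A quick sanity check of text.clean() on a made-up snippet (illustrative only; the expected result is shown as a comment, not as actual run output):

text.clean("<b>Great Movie!!</b> Rated 9/10 ... a MUST watch")
# should return roughly: "great movie rated a must watch"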

find.polarity = function(review.nodes, rating, df)    # compute qdap polarity per review; append (Rating, Polarity) rows to df
{   
  for (a in 1:length(rating))
  {
    #print (tmp1[a])
    data = review.nodes[a]
    
    
    x  = text.clean(data)             # pre-process text corpus
    x  =  removeWords(x,stopwords)            # removing stopwords created above
    x  =  stripWhitespace(x)                  # removing white space
    # x  =  stemDocument(x)
    
    #--------------------------------------------------------#
    ######  Create DTM using text2vec package                #
    #--------------------------------------------------------#
    
    
    tok_fun = word_tokenizer
    
    it_m = itoken(x,
                  # preprocessor = text.clean,
                  tokenizer = tok_fun,
                  ids = data$id)
    
    vocab = create_vocabulary(it_m
                              # ngram = c(2L, 2L),
                              #stopwords = stopwords
    )
    
    pruned_vocab = prune_vocabulary(vocab,
                                    term_count_min = 1)
    
    vectorizer = vocab_vectorizer(pruned_vocab)
    
    dtm_m  = create_dtm(it_m, vectorizer)
    dim(dtm_m)
    
    dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
    a0 = (apply(dtm, 1, sum) > 0)   # build vector to identify non-empty docs
    dtm = dtm[a0,]                  # drop empty docs
    
    
    
    #--------------------------------------------------------#
    #             Sentiment Analysis                         #
    #--------------------------------------------------------#
    
    x1 = x[a0]    # remove empty docs from corpus
    
    pol = polarity(x1)         # Calculate the polarity from qdap dictionary
    wc = pol$all[,2]                  # Word Count in each doc
    val = pol$all[,3]                 # average polarity score
    p  = pol$all[,4]                  # Positive words info
    n  = pol$all[,5]                  # Negative Words info  
    
    new.row <- data.frame(Rating = c(as.numeric(rating[a])), Polarity = c(pol$group$ave.polarity))
    
    df <- rbind(df, new.row)
  }
  return (df)
}

STEP 1 (Taare Zameen Par)

Go to IMDB and extract 100 reviews (50 positive and 50 negative) for your favourite movie.

The data is scraped for the Bollywood movie Taare Zameen Par, and the result is saved on the C: drive as “Taare Zameen Par.txt”.

#--------------------------------------------------------#
# Go to IMDB and scrape 50 positive and 50 negative reviews of the movie Taare Zameen Par #
#--------------------------------------------------------#

counts = c(0,10,20,30,40,50)
reviews = NULL
for (j in counts)
{
  url1 = paste0("http://www.imdb.com/title/tt0986264/reviews?filter=love;filter=love;start=",j)
  url2 = paste0("http://www.imdb.com/title/tt0986264/reviews?filter=hate;filter=hate;start=",j)
  
  page1 = read_html(url1)
  page2 = read_html(url2)
  reviews1 = html_text(html_nodes(page1,'#tn15content p'))
  reviews2 = html_text(html_nodes(page2,'#tn15content p'))
  
  reviews.positive = setdiff(reviews1, c("*** This review may contain spoilers ***","Add another review"))
  reviews.negative = setdiff(reviews2, c("*** This review may contain spoilers ***","Add another review"))
  
  reviews = c(reviews,reviews.positive,reviews.negative)
  
}

reviews = gsub("\n",' ',reviews)
writeLines(reviews,'C:/Taare Zameen Par.txt')

STEP 2 (DTM, TF, TFIDF, word-clouds and COGs under both TF and TFIDF weighting schemes)

Pre-process the data and create a Document Term Matrix. Check word-clouds and COGs under both TF and TFIDF weighting schemes to see which configurations appear most meaningful / informative. Iterate by updating the stop-words list, etc.

The stop-word list has been updated by adding words that carry little information for this corpus (words like ishaan, aamir, etc.). In this step, we have also performed sentiment analysis and extracted the positive and negative words along with polarity scores. An illustrative sketch of appending such corpus-specific words is shown just below.
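A minimal sketch of that update (the actual run below reads the consolidated list from stopwords.txt; the terms here are only the corpus-specific additions mentioned above):

# Sketch only: append corpus-specific terms to the generic English stop-word list;
# the code below instead reads the final list from stopwords.txt
custom.words    = c("movie", "film", "ishaan", "aamir", "khan", "amir")
stopwords.extra = unique(c(tm::stopwords('english'), custom.words))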

#--------------------------------------------------------#
# Step 1 - Reading text data                             #
#--------------------------------------------------------#

temp.text = readLines(file.choose())  #  load Taare Zameen Par
data = data.frame(id = 1:length(temp.text), text = temp.text, stringsAsFactors = F)

# Read Stopwords list
stpw1 = readLines(file.choose())      # read-in stopwords.txt
stpw2 = tm::stopwords('english')                   # tm package stop-word list (the tokenizers package has a function of the same name)
comn  = unique(c(stpw1, stpw2))                 # union of the two lists
stopwords = unique(gsub("'"," ",comn))  # final stop-word list after replacing apostrophes with spaces
print ('List of stopwords')
## [1] "List of stopwords"
print (stopwords)
##   [1] "a"             "a s"           "able"          "about"        
##   [5] "above"         "according"     "accordingly"   "across"       
##   [9] "actually"      "after"         "afterwards"    "again"        
##  [13] "against"       "ain t"         "all"           "allow"        
##  [17] "allows"        "almost"        "alone"         "along"        
##  [21] "already"       "also"          "although"      "always"       
##  [25] "am"            "among"         "amongst"       "an"           
##  [29] "and"           "another"       "any"           "anybody"      
##  [33] "anyhow"        "anyone"        "anything"      "anyway"       
##  [37] "anyways"       "anywhere"      "apart"         "appear"       
##  [41] "appreciate"    "appropriate"   "are"           "aren t"       
##  [45] "around"        "as"            "aside"         "ask"          
##  [49] "asking"        "associated"    "at"            "available"    
##  [53] "away"          "awfully"       "b"             "be"           
##  [57] "became"        "because"       "become"        "becomes"      
##  [61] "becoming"      "been"          "before"        "beforehand"   
##  [65] "behind"        "being"         "believe"       "below"        
##  [69] "beside"        "besides"       "best"          "better"       
##  [73] "between"       "beyond"        "both"          "brief"        
##  [77] "but"           "by"            "c"             "c mon"        
##  [81] "c s"           "came"          "can"           "can t"        
##  [85] "cannot"        "cant"          "cause"         "causes"       
##  [89] "certain"       "certainly"     "changes"       "clearly"      
##  [93] "co"            "com"           "come"          "comes"        
##  [97] "concerning"    "consequently"  "consider"      "considering"  
## [101] "contain"       "containing"    "contains"      "corresponding"
## [105] "could"         "couldn t"      "course"        "currently"    
## [109] "d"             "definitely"    "described"     "despite"      
## [113] "did"           "didn t"        "different"     "do"           
## [117] "does"          "doesn t"       "doing"         "don t"        
## [121] "done"          "down"          "downwards"     "during"       
## [125] "e"             "each"          "edu"           "eg"           
## [129] "eight"         "either"        "else"          "elsewhere"    
## [133] "enough"        "entirely"      "especially"    "et"           
## [137] "etc"           "even"          "ever"          "every"        
## [141] "everybody"     "everyone"      "everything"    "everywhere"   
## [145] "ex"            "exactly"       "example"       "except"       
## [149] "f"             "far"           "few"           "fifth"        
## [153] "first"         "five"          "followed"      "following"    
## [157] "follows"       "for"           "former"        "formerly"     
## [161] "forth"         "four"          "from"          "further"      
## [165] "furthermore"   "g"             "get"           "gets"         
## [169] "getting"       "given"         "gives"         "go"           
## [173] "goes"          "going"         "gone"          "got"          
## [177] "gotten"        "greetings"     "h"             "had"          
## [181] "hadn t"        "happens"       "hardly"        "has"          
## [185] "hasn t"        "have"          "haven t"       "having"       
## [189] "he"            "he s"          "hello"         "help"         
## [193] "hence"         "her"           "here"          "here s"       
## [197] "hereafter"     "hereby"        "herein"        "hereupon"     
## [201] "hers"          "herself"       "hi"            "him"          
## [205] "himself"       "his"           "hither"        "hopefully"    
## [209] "how"           "howbeit"       "however"       "i"            
## [213] "i d"           "i ll"          "i m"           "i ve"         
## [217] "ie"            "if"            "ignored"       "immediate"    
## [221] "in"            "inasmuch"      "inc"           "indeed"       
## [225] "indicate"      "indicated"     "indicates"     "inner"        
## [229] "insofar"       "instead"       "into"          "inward"       
## [233] "is"            "isn t"         "it"            "it d"         
## [237] "it ll"         "it s"          "its"           "itself"       
## [241] "j"             "just"          "k"             "keep"         
## [245] "keeps"         "kept"          "know"          "knows"        
## [249] "known"         "l"             "last"          "lately"       
## [253] "later"         "latter"        "latterly"      "least"        
## [257] "less"          "lest"          "let"           "let s"        
## [261] "like"          "liked"         "likely"        "little"       
## [265] "look"          "looking"       "looks"         "ltd"          
## [269] "m"             "mainly"        "many"          "may"          
## [273] "maybe"         "me"            "mean"          "meanwhile"    
## [277] "merely"        "might"         "more"          "moreover"     
## [281] "most"          "mostly"        "much"          "must"         
## [285] "my"            "myself"        "n"             "name"         
## [289] "namely"        "nd"            "near"          "nearly"       
## [293] "necessary"     "need"          "needs"         "neither"      
## [297] "never"         "nevertheless"  "new"           "next"         
## [301] "nine"          "no"            "nobody"        "non"          
## [305] "none"          "noone"         "nor"           "normally"     
## [309] "not"           "nothing"       "novel"         "now"          
## [313] "nowhere"       "o"             "obviously"     "of"           
## [317] "off"           "often"         "oh"            "ok"           
## [321] "okay"          "old"           "on"            "once"         
## [325] "one"           "ones"          "only"          "onto"         
## [329] "or"            "other"         "others"        "otherwise"    
## [333] "ought"         "our"           "ours"          "ourselves"    
## [337] "out"           "outside"       "over"          "overall"      
## [341] "own"           "p"             "particular"    "particularly" 
## [345] "per"           "perhaps"       "placed"        "please"       
## [349] "plus"          "possible"      "presumably"    "probably"     
## [353] "provides"      "q"             "que"           "quite"        
## [357] "qv"            "r"             "rather"        "rd"           
## [361] "re"            "really"        "reasonably"    "regarding"    
## [365] "regardless"    "regards"       "relatively"    "respectively" 
## [369] "right"         "s"             "said"          "same"         
## [373] "saw"           "say"           "saying"        "says"         
## [377] "second"        "secondly"      "see"           "seeing"       
## [381] "seem"          "seemed"        "seeming"       "seems"        
## [385] "seen"          "self"          "selves"        "sensible"     
## [389] "sent"          "serious"       "seriously"     "seven"        
## [393] "several"       "shall"         "she"           "should"       
## [397] "shouldn t"     "since"         "six"           "so"           
## [401] "some"          "somebody"      "somehow"       "someone"      
## [405] "something"     "sometime"      "sometimes"     "somewhat"     
## [409] "somewhere"     "soon"          "sorry"         "specified"    
## [413] "specify"       "specifying"    "still"         "sub"          
## [417] "such"          "sup"           "sure"          "t"            
## [421] "t s"           "take"          "taken"         "tell"         
## [425] "tends"         "th"            "than"          "thank"        
## [429] "thanks"        "thanx"         "that"          "that s"       
## [433] "thats"         "the"           "their"         "theirs"       
## [437] "them"          "themselves"    "then"          "thence"       
## [441] "there"         "there s"       "thereafter"    "thereby"      
## [445] "therefore"     "therein"       "theres"        "thereupon"    
## [449] "these"         "they"          "they d"        "they ll"      
## [453] "they re"       "they ve"       "think"         "third"        
## [457] "this"          "thorough"      "thoroughly"    "those"        
## [461] "though"        "three"         "through"       "throughout"   
## [465] "thru"          "thus"          "to"            "together"     
## [469] "too"           "took"          "toward"        "towards"      
## [473] "tried"         "tries"         "truly"         "try"          
## [477] "trying"        "twice"         "two"           "u"            
## [481] "un"            "under"         "unfortunately" "unless"       
## [485] "unlikely"      "until"         "unto"          "up"           
## [489] "upon"          "us"            "use"           "used"         
## [493] "useful"        "uses"          "using"         "usually"      
## [497] "uucp"          "v"             "value"         "various"      
## [501] "very"          "via"           "viz"           "vs"           
## [505] "w"             "want"          "wants"         "was"          
## [509] "wasn t"        "way"           "we"            "we d"         
## [513] "we ll"         "we re"         "we ve"         "welcome"      
## [517] "well"          "went"          "were"          "weren t"      
## [521] "what"          "what s"        "whatever"      "when"         
## [525] "whence"        "whenever"      "where"         "where s"      
## [529] "whereafter"    "whereas"       "whereby"       "wherein"      
## [533] "whereupon"     "wherever"      "whether"       "which"        
## [537] "while"         "whither"       "who"           "who s"        
## [541] "whoever"       "whole"         "whom"          "whose"        
## [545] "why"           "will"          "willing"       "wish"         
## [549] "with"          "within"        "without"       "won t"        
## [553] "wonder"        "would"         "wouldn t"      "x"            
## [557] "y"             "yes"           "yet"           "you"          
## [561] "you d"         "you ll"        "you re"        "you ve"       
## [565] "your"          "yours"         "yourself"      "yourselves"   
## [569] "z"             "zero"          "movie"         "film"         
## [573] "ishaan"        "aamir"         "khan"          "amir"         
## [577] "she s"         "he d"          "she d"         "he ll"        
## [581] "she ll"        "shan t"        "mustn t"       "when s"       
## [585] "why s"         "how s"
x  = text.clean(data$text)             # pre-process text corpus
x  =  removeWords(x,stopwords)            # removing stopwords created above
x  =  stripWhitespace(x)                  # removing white space

#--------------------------------------------------------#
######  Create DTM using text2vec package                #
#--------------------------------------------------------#

t1 = Sys.time()

tok_fun = word_tokenizer

it_m = itoken(x,
              tokenizer = tok_fun,
              ids = data$id)

vocab = create_vocabulary(it_m)

pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 1)

vectorizer = vocab_vectorizer(pruned_vocab)

dtm_m  = create_dtm(it_m, vectorizer)

dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
a0 = (apply(dtm, 1, sum) > 0)   # build vector to identify non-empty docs
dtm = dtm[a0,]                  # drop empty docs

# view a sample of the DTM, sorted from most to least frequent tokens 
dtm = dtm[,order(apply(dtm, 2, sum), decreasing = T)]     # sorting dtm's columns in decreasing order of column sums
inspect(dtm[1:5, 1:5])     # inspect() func used to view parts of a DTM object           
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 8/17
## Sparsity           : 68%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs child parents good school teacher
##    1     0       0    0      0       0
##    2     0       0    0      0       0
##    3     5       3    3      3       2
##    4     1       0    1      2       0
##    5     0       0    0      0       0
#--------------------------------------------------------#
## Step 2a:     # Build word cloud                       #
#--------------------------------------------------------#

#   1- Using Term frequency(tf)             

tst = round(ncol(dtm)/100)  # divide DTM's cols into 100 manageable parts
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))

ss.col = c(NULL)
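# accumulate column sums chunk by chunk, so the full sparse DTM is never
# converted into one large dense matrix at once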
for (i in 1:(length(b)-1)) {
  tempdtm = dtm[,(b[i]+1):(b[i+1])]
  s = colSums(as.matrix(tempdtm))
  ss.col = c(ss.col,s)
}

tsum = ss.col
tsum = tsum[order(tsum, decreasing = T)]       #terms in decreasing order of freq
head(tsum)
##   child parents    good  school teacher   story 
##     123     101     100      95      79      79
tail(tsum)
##   rangeela    actions      booth sicknesses       ruin    century 
##          1          1          1          1          1          1
windows()  # New plot window
wordcloud(names(tsum), tsum,     # words, their freqs 
          scale = c(4, 0.5),     # range of word sizes
          1,                     # min.freq of words to consider
          max.words = 200,       # max #words
          colors = brewer.pal(8, "Dark2"))    # Plot results in a word cloud 
title(sub = "Term Frequency - Wordcloud")     # title for the wordcloud display

# plot barchart for top tokens
test = as.data.frame(round(tsum[1:15],0))

windows()  # New plot window
ggplot(test, aes(x = rownames(test), y = test)) + 
  geom_bar(stat = "identity", fill = "Blue") +
  geom_text(aes(label = test), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

dev.off() # [graphical] device off / close it down
## png 
##   2
# -------------------------------------------------------------- #
# step 2b - Using Term frequency inverse document frequency (tfidf)             
# -------------------------------------------------------------- #

dtm.tfidf = tfidf(dtm, normalize=TRUE)

tst = round(ncol(dtm.tfidf)/100)
a = rep(tst, 99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm.tfidf))

ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
  tempdtm = dtm.tfidf[,(b[i]+1):(b[i+1])]
  s = colSums(as.matrix(tempdtm))
  ss.col = c(ss.col,s)
}

tsum = ss.col

tsum = tsum[order(tsum, decreasing = T)]       #terms in decreasing order of freq
head(tsum)
##      good     child    movies   parents       kid    school 
## 0.9444900 0.9094940 0.8167724 0.8007779 0.7878499 0.7635032
tail(tsum)
##        wisest distinguishes          sore         visit     misspells 
##   0.009941542   0.009941542   0.009941542   0.009941542   0.009941542 
##      suggests 
##   0.009941542
windows()  # New plot window
wordcloud(names(tsum), tsum, scale=c(4,0.5),1, max.words=200,colors=brewer.pal(8, "Dark2")) # Plot results in a word cloud 
title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")

# plot barchart for top tokens
test = as.data.frame(round(tsum[1:15],2))   # keep 2 decimals; tfidf scores are fractional, so rounding to 0 would flatten them
windows()  # New plot window
ggplot(test, aes(x = rownames(test), y = test)) + 
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = test), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

dev.off()
## png 
##   2
#------------------------------------------------------#
# step 2c - Term Co-occurance Matrix (TCM)             #
#------------------------------------------------------#

vectorizer = vocab_vectorizer(pruned_vocab, 
                              grow_dtm = FALSE, 
                              skip_grams_window = 5L)

tcm = create_tcm(it_m, vectorizer) # func to build a TCM

tcm.mat = as.matrix(tcm)         # use tcm.mat[1:5, 1:5] to view
adj.mat = tcm.mat + t(tcm.mat)   # since adjacency matrices are symmetric

z = order(colSums(adj.mat), decreasing = T)
adj.mat = adj.mat[z,z]

# Plot Simple Term Co-occurance graph
adj = adj.mat[1:30,1:30]
cog = graph.adjacency(adj, mode = 'undirected')
cog =  simplify(cog)  

cog = delete.vertices(cog, V(cog)[ degree(cog) == 0 ])

#-----------------------------------------------------------#
# Step 2d - a cleaned up or 'distilled' COG plot            #
#-----------------------------------------------------------#
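# distill.cog() keeps the s terms with the largest column sums as 'central' nodes,
# drops links weaker than each central term's k1-th strongest co-occurrence, and
# plots the resulting graph (central nodes in green, their neighbours in pink)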

distill.cog = function(mat1, # input TCM ADJ MAT
                       title, # title for the graph
                       s,    # no. of central nodes
                       k1){  # max no. of connections  
  a = colSums(mat1) # collect colsums into a vector obj a
  b = order(-a)     # nice syntax for ordering vector in decr order  
  
  mat2 = mat1[b, b]     # order both rows and columns along vector b
  
  diag(mat2) =  0
  
  wc = NULL
  
  for (i1 in 1:s){ 
    thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
    mat2[i1, mat2[i1,] < thresh1] = 0   # zero out links weaker than the k1-th largest in this row
    mat2[i1, mat2[i1,] > 0 ] = 1
    word = names(mat2[i1, mat2[i1,] > 0])
    mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
    wc = c(wc,word)
  } # i1 loop ends
  
  
  mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
  ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # removed any NAs from the list
  mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
  graph <- graph.adjacency(mat4, mode = "undirected", weighted=T)    # Create Network object
  graph = simplify(graph) 
  V(graph)$color[1:s] = "green"
  V(graph)$color[(s+1):length(V(graph))] = "pink"
  
  graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ]) # delete singletons?
  
  plot(graph, 
       layout = layout.kamada.kawai, 
       main = title)
  
} # func ends

windows()
distill.cog(tcm.mat, 'Distilled COG',  10,  5)

## adj.mat and distilled cog for tfidf DTMs ##

adj.mat = t(dtm.tfidf) %*% dtm.tfidf
diag(adj.mat) = 0
a01 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a01[1:50], a01[1:50]])

windows()
distill.cog(adj.mat, 'Distilled COG for tfidf DTMs',  10,  10)

#--------------------------------------------------------#
#             Sentiment Analysis                         #
#--------------------------------------------------------#

x1 = x[a0]    # remove empty docs from corpus

pol = polarity(x1)         # Calculate the polarity from qdap dictionary
wc = pol$all[,2]                  # Word Count in each doc
val = pol$all[,3]                 # average polarity score
p  = pol$all[,4]                  # Positive words info
n  = pol$all[,5]                  # Negative Words info  

positive_words = unique(setdiff(unlist(p),"-"))  # Positive words list
negative_words = unique(setdiff(unlist(n),"-"))  # Negative words list


#--------------------------------------------------------#
#   Create Postive Words wordcloud                       #
#--------------------------------------------------------#

pos.tdm = dtm[,which(colnames(dtm) %in% positive_words)]
m = as.matrix(pos.tdm)
v = sort(colSums(m), decreasing = TRUE)
windows() # opens new image window
wordcloud(names(v), v, scale=c(4,1),1, max.words=100,colors=brewer.pal(8, "Dark2"))
title(sub = "Positive Words - Wordcloud")

# plot barchart for top tokens
test = as.data.frame(v[1:15])
windows() # opens new image window
ggplot(test, aes(x = rownames(test), y = test)) + 
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = test), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

#--------------------------------------------------------#
#  Create Negative Words wordcloud                       #
#--------------------------------------------------------#

neg.tdm = dtm[,which(colnames(dtm) %in% negative_words) ]
m = as.matrix(neg.tdm)
v = sort(colSums(m), decreasing = TRUE)
windows()
wordcloud(names(v), v, scale=c(4,1),1, max.words=100,colors=brewer.pal(8, "Dark2"))         
title(sub = "Negative Words - Wordcloud")

# plot barchart for top tokens
test = as.data.frame(v[1:15])
windows()
ggplot(test, aes(x = rownames(test), y = test)) + 
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = test), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

#--------------------------------------------------------#
#  Positive words vs Negative Words plot                 #
#--------------------------------------------------------#

len = function(x){
  if (length(x) == 1 && x == "-")  {return (0)}   # "-" marks docs with no matched words
  else {return(length(unlist(x)))}
}

pcount = unlist(lapply(p, len))
ncount = unlist(lapply(n, len))
doc_id = seq(1:length(wc))

windows()
plot(doc_id,pcount,type="l",col="green",xlab = "Document ID", ylab= "Word Count")
lines(doc_id,ncount,type= "l", col="red")
title(main = "Positive words vs Negative Words" )
legend("topright", inset=.05, c("Positive Words","Negative Words"), fill=c("green","red"), horiz=TRUE)

# Document sentiment running plot
windows()
plot(pol$all$polarity, type = "l", ylab = "Polarity Score",xlab = "Document Number")
abline(h=0)
title(main = "Polarity Plot" )

### COG for sentiment-laden words ? ###

senti.dtm = cbind(pos.tdm, neg.tdm); 
senti.adj.mat = as.matrix(t(senti.dtm)) %*% as.matrix(senti.dtm)
diag(senti.adj.mat) = 0

windows()
distill.cog(senti.adj.mat,   # ad mat obj 
            'Distilled COG of senti words',       # plot title
            5,       # max #central nodes
            5)        # max #connexns

STEP 3 (Polarity Vs Star Rating and Correlation between them)

Compare each review’s polarity score with its star rating. You can choose to use a simple cor() function to check correlation between the two data columns.

For this, capture the star rating along with each of the 50 positive and 50 negative reviews of Taare Zameen Par. Then compute the polarity score of every review and the correlation between polarity and rating. Finally, plot rating vs. polarity. (A rank-correlation check is also sketched after the code below.)

#--------------------------------------------------------#
#   Polarity Vs Rating and Correlation between them      #
#--------------------------------------------------------#
df <- data.frame("Rating"=numeric(),"Polarity"=numeric())
for (j in counts)
{
  url1 = paste0("http://www.imdb.com/title/tt0986264/reviews?filter=love;filter=love;start=",j)
  page1 = read_html(url1)

  movie.nodes = html_nodes(page1, 'h2 + img')
  tmp = html_attr(movie.nodes, name='alt')
  rating = substr(tmp, 0, regexpr('/', tmp)-1)
  review.nodes = html_nodes(page1, '#tn15content div+ p')

  df <- find.polarity(review.nodes, rating, df)

    
  url2 = paste0("http://www.imdb.com/title/tt0986264/reviews?filter=hate;filter=hate;start=",j)
  page2 = read_html(url2)
  
  movie.nodes = html_nodes(page2, 'h2 + img')
  tmp = html_attr(movie.nodes, name='alt')
  rating = substr(tmp, 0, regexpr('/', tmp)-1)
  review.nodes = html_nodes(page2, '#tn15content div+ p')
  
  df <- find.polarity(review.nodes, rating, df)
  
}

df_data <- df[, sapply(df, is.numeric)]
cor(df_data, use = "complete.obs", method = "pearson")
##             Rating  Polarity
## Rating   1.0000000 0.2073642
## Polarity 0.2073642 1.0000000
plot(df)
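Since star ratings are ordinal (a 1-10 scale), a rank correlation is a reasonable robustness check alongside the Pearson value above; a small sketch on the same df (not part of the original run):

# optional check: Spearman rank correlation between rating and polarity
cor(df$Rating, df$Polarity, use = "complete.obs", method = "spearman")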

STEP 4 (Recommendation)

Now, make a recommendation. What movie attributes or aspects (e.g., plot? star cast? length? etc.) worked well, which the studio should retain? Which ones didn’t work well and which the studio should change?

The star cast worked well, because cast-related terms rank among the highest under both TF and TFIDF weighting. The name of the actor (Aamir Khan) and of the kid (Ishaan) appear multiple times. The reviews talk about parents, the kid, school, society, etc., and emotional keywords also show up. In the Polarity vs. Rating comparison, the correlation is positive (about 0.21).

The recommendation would be to make a movie that talks about a kid, or about some kind of illness or condition, since the data above shows that audiences respond emotionally to these themes. Such movies may also help in getting support from the government.