Install the packages

rm(list=ls())

Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_66') # for 64-bit
install.packages("rJava",repos="http://cran.rstudio.com/")
## Installing package into 'C:/Users/sundeep/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'rJava' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Public\Documents\Wondershare\CreatorTemp\Rtmp8st1Wk\downloaded_packages
require(text2vec) || install.packages("text2vec",repos="http://cran.rstudio.com/")
## Loading required package: text2vec
## [1] TRUE
require(data.table) || install.packages("data.table",repos="http://cran.rstudio.com/")
## Loading required package: data.table
## [1] TRUE
require(stringr) || install.packages("stringr",repos="http://cran.rstudio.com/")
## Loading required package: stringr
## [1] TRUE
require(tm) || install.packages("tm",repos="http://cran.rstudio.com/")
## Loading required package: tm
## Loading required package: NLP
## [1] TRUE
require(RWeka) || install.packages("RWeka",repos="http://cran.rstudio.com/")
## Loading required package: RWeka
## [1] TRUE
require(tokenizers) || install.packages("tokenizers",repos="http://cran.rstudio.com/")
## Loading required package: tokenizers
## 
## Attaching package: 'tokenizers'
## The following object is masked from 'package:tm':
## 
##     stopwords
## [1] TRUE
require(slam) || install.packages("slam")
## Loading required package: slam
## [1] TRUE
require(wordcloud) || install.packages("wordcloud",repos="http://cran.rstudio.com/")
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## [1] TRUE
require(ggplot2) || install.packages("ggplot2",repos="http://cran.rstudio.com/")
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## [1] TRUE

Load the required packages

library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(ggplot2)
library("rvest")
## Loading required package: xml2

Process the data

text.clean = function(x)                    # text data
{ require("tm")
  x  =  gsub("<.*?>", " ", x)               # regex for removing HTML tags
  x  =  iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
  x  =  gsub("[^[:alnum:]]", " ", x)        # keep only alpha numeric 
  x  =  tolower(x)                          # convert to lower case characters
  x  =  removeNumbers(x)                    # removing numbers
  x  =  stripWhitespace(x)                  # removing white space
  x  =  gsub("^\\s+|\\s+$", "", x)          # remove leading and trailing white space
  return(x)
}
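
As a quick sanity check, the cleaner can be applied to a made-up snippet (the string below is purely illustrative) to see what it strips out:

sample.raw = "<p>Inception (2010) is Nolan's 7th film -- a MUST watch!</p>"   # hypothetical raw text
text.clean(sample.raw)
## expected (approximately): "inception is nolan s th film a must watch"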

Retrieve the ratings and corresponding reviews

counts = c(0,10,20,30,40,50)
reviews = NULL
ratings = NULL
ddr = NULL
for (j in counts){
  url1 = paste0("http://www.imdb.com/title/tt1375666/reviews?filter=love;filter=love;start=",j)
  url2 = paste0("http://www.imdb.com/title/tt1375666/reviews?filter=hate;filter=hate;start=",j)
  
  page1 = read_html(url1)
  page2 = read_html(url2)
  
  reviews1 = html_text(html_nodes(page1,'#tn15content p'))
  reviews.positive = setdiff(reviews1, c("*** This review may contain spoilers ***","Add another review"))
  #reviews.positive
  
  reviews2 = html_text(html_nodes(page2,'#tn15content p'))
  reviews.negative = setdiff(reviews2, c("*** This review may contain spoilers ***","Add another review"))
  
  
  
  movie.nodes.positive = html_nodes(page1,'h2 + img') 
  rr.positive  = html_attr(movie.nodes.positive,name='alt')
  
  movie.nodes.negative = html_nodes(page2,'h2 + img')
  rr.negative  = html_attr(movie.nodes.negative,name='alt')
  
  #ddr= c(ddr,rr.positive,rr.negative)
  #rat =substr(html_attr(movie.nodes.positive,name='alt'),0,2)
  #rat

 
  ratings = c(ratings,rr.positive,rr.negative)
  reviews = c(reviews,reviews.positive,reviews.negative)
  
}

reviews = gsub("\n",' ',reviews)
#reviews
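
Before assembling the data frame, a quick check (a sketch only; output omitted) confirms that the scrape returned one rating per review, so data.frame() below lines them up correctly:

length(reviews); length(ratings)    # the two should match, one rating string per review
head(ratings, 3)                    # alt-text strings such as "10/10", parsed into numbers later on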

Create the Data frame

df = data.frame(reviews,ratings)
#df

Load the Stopwords

#stpw1 = readLines(file.choose())      # read-in stopwords.txt
stpw1 = readLines('https://raw.githubusercontent.com/sudhir-voleti/basic-text-analysis-shinyapp/master/data/stopwords.txt')# stopwords list from git
stpw2 = tm::stopwords('english')      # tm package stop word list; tokenizer package has the same name function, hence 'tm::'
comn  = unique(c(stpw1, stpw2))         # union of the two lists
stopwords = unique(gsub("'"," ",comn))  # drop apostrophes so entries match the cleaned text (e.g. "don't" becomes "don t")
stopwords
##   [1] "a"             "a s"           "able"          "about"        
##   [5] "above"         "according"     "accordingly"   "across"       
##   [9] "actually"      "after"         "afterwards"    "again"        
##  [13] "against"       "ain t"         "all"           "allow"        
##  [17] "allows"        "almost"        "alone"         "along"        
##  [21] "already"       "also"          "although"      "always"       
##  [25] "am"            "among"         "amongst"       "an"           
##  [29] "and"           "another"       "any"           "anybody"      
##  [33] "anyhow"        "anyone"        "anything"      "anyway"       
##  [37] "anyways"       "anywhere"      "apart"         "appear"       
##  [41] "appreciate"    "appropriate"   "are"           "aren t"       
##  [45] "around"        "as"            "aside"         "ask"          
##  [49] "asking"        "associated"    "at"            "available"    
##  [53] "away"          "awfully"       "b"             "be"           
##  [57] "became"        "because"       "become"        "becomes"      
##  [61] "becoming"      "been"          "before"        "beforehand"   
##  [65] "behind"        "being"         "believe"       "below"        
##  [69] "beside"        "besides"       "best"          "better"       
##  [73] "between"       "beyond"        "both"          "brief"        
##  [77] "but"           "by"            "c"             "c mon"        
##  [81] "c s"           "came"          "can"           "can t"        
##  [85] "cannot"        "cant"          "cause"         "causes"       
##  [89] "certain"       "certainly"     "changes"       "clearly"      
##  [93] "co"            "com"           "come"          "comes"        
##  [97] "concerning"    "consequently"  "consider"      "considering"  
## [101] "contain"       "containing"    "contains"      "corresponding"
## [105] "could"         "couldn t"      "course"        "currently"    
## [109] "d"             "definitely"    "described"     "despite"      
## [113] "did"           "didn t"        "different"     "do"           
## [117] "does"          "doesn t"       "doing"         "don t"        
## [121] "done"          "down"          "downwards"     "during"       
## [125] "e"             "each"          "edu"           "eg"           
## [129] "eight"         "either"        "else"          "elsewhere"    
## [133] "enough"        "entirely"      "especially"    "et"           
## [137] "etc"           "even"          "ever"          "every"        
## [141] "everybody"     "everyone"      "everything"    "everywhere"   
## [145] "ex"            "exactly"       "example"       "except"       
## [149] "f"             "far"           "few"           "fifth"        
## [153] "first"         "five"          "followed"      "following"    
## [157] "follows"       "for"           "former"        "formerly"     
## [161] "forth"         "four"          "from"          "further"      
## [165] "furthermore"   "g"             "get"           "gets"         
## [169] "getting"       "given"         "gives"         "go"           
## [173] "goes"          "going"         "gone"          "got"          
## [177] "gotten"        "greetings"     "h"             "had"          
## [181] "hadn t"        "happens"       "hardly"        "has"          
## [185] "hasn t"        "have"          "haven t"       "having"       
## [189] "he"            "he s"          "hello"         "help"         
## [193] "hence"         "her"           "here"          "here s"       
## [197] "hereafter"     "hereby"        "herein"        "hereupon"     
## [201] "hers"          "herself"       "hi"            "him"          
## [205] "himself"       "his"           "hither"        "hopefully"    
## [209] "how"           "howbeit"       "however"       "i"            
## [213] "i d"           "i ll"          "i m"           "i ve"         
## [217] "ie"            "if"            "ignored"       "immediate"    
## [221] "in"            "inasmuch"      "inc"           "indeed"       
## [225] "indicate"      "indicated"     "indicates"     "inner"        
## [229] "insofar"       "instead"       "into"          "inward"       
## [233] "is"            "isn t"         "it"            "it d"         
## [237] "it ll"         "it s"          "its"           "itself"       
## [241] "j"             "just"          "k"             "keep"         
## [245] "keeps"         "kept"          "know"          "knows"        
## [249] "known"         "l"             "last"          "lately"       
## [253] "later"         "latter"        "latterly"      "least"        
## [257] "less"          "lest"          "let"           "let s"        
## [261] "like"          "liked"         "likely"        "little"       
## [265] "look"          "looking"       "looks"         "ltd"          
## [269] "m"             "mainly"        "many"          "may"          
## [273] "maybe"         "me"            "mean"          "meanwhile"    
## [277] "merely"        "might"         "more"          "moreover"     
## [281] "most"          "mostly"        "much"          "must"         
## [285] "my"            "myself"        "n"             "name"         
## [289] "namely"        "nd"            "near"          "nearly"       
## [293] "necessary"     "need"          "needs"         "neither"      
## [297] "never"         "nevertheless"  "new"           "next"         
## [301] "nine"          "no"            "nobody"        "non"          
## [305] "none"          "noone"         "nor"           "normally"     
## [309] "not"           "nothing"       "novel"         "now"          
## [313] "nowhere"       "o"             "obviously"     "of"           
## [317] "off"           "often"         "oh"            "ok"           
## [321] "okay"          "old"           "on"            "once"         
## [325] "one"           "ones"          "only"          "onto"         
## [329] "or"            "other"         "others"        "otherwise"    
## [333] "ought"         "our"           "ours"          "ourselves"    
## [337] "out"           "outside"       "over"          "overall"      
## [341] "own"           "p"             "particular"    "particularly" 
## [345] "per"           "perhaps"       "placed"        "please"       
## [349] "plus"          "possible"      "presumably"    "probably"     
## [353] "provides"      "q"             "que"           "quite"        
## [357] "qv"            "r"             "rather"        "rd"           
## [361] "re"            "really"        "reasonably"    "regarding"    
## [365] "regardless"    "regards"       "relatively"    "respectively" 
## [369] "right"         "s"             "said"          "same"         
## [373] "saw"           "say"           "saying"        "says"         
## [377] "second"        "secondly"      "see"           "seeing"       
## [381] "seem"          "seemed"        "seeming"       "seems"        
## [385] "seen"          "self"          "selves"        "sensible"     
## [389] "sent"          "serious"       "seriously"     "seven"        
## [393] "several"       "shall"         "she"           "should"       
## [397] "shouldn t"     "since"         "six"           "so"           
## [401] "some"          "somebody"      "somehow"       "someone"      
## [405] "something"     "sometime"      "sometimes"     "somewhat"     
## [409] "somewhere"     "soon"          "sorry"         "specified"    
## [413] "specify"       "specifying"    "still"         "sub"          
## [417] "such"          "sup"           "sure"          "t"            
## [421] "t s"           "take"          "taken"         "tell"         
## [425] "tends"         "th"            "than"          "thank"        
## [429] "thanks"        "thanx"         "that"          "that s"       
## [433] "thats"         "the"           "their"         "theirs"       
## [437] "them"          "themselves"    "then"          "thence"       
## [441] "there"         "there s"       "thereafter"    "thereby"      
## [445] "therefore"     "therein"       "theres"        "thereupon"    
## [449] "these"         "they"          "they d"        "they ll"      
## [453] "they re"       "they ve"       "think"         "third"        
## [457] "this"          "thorough"      "thoroughly"    "those"        
## [461] "though"        "three"         "through"       "throughout"   
## [465] "thru"          "thus"          "to"            "together"     
## [469] "too"           "took"          "toward"        "towards"      
## [473] "tried"         "tries"         "truly"         "try"          
## [477] "trying"        "twice"         "two"           "u"            
## [481] "un"            "under"         "unfortunately" "unless"       
## [485] "unlikely"      "until"         "unto"          "up"           
## [489] "upon"          "us"            "use"           "used"         
## [493] "useful"        "uses"          "using"         "usually"      
## [497] "uucp"          "v"             "value"         "various"      
## [501] "very"          "via"           "viz"           "vs"           
## [505] "w"             "want"          "wants"         "was"          
## [509] "wasn t"        "way"           "we"            "we d"         
## [513] "we ll"         "we re"         "we ve"         "welcome"      
## [517] "well"          "went"          "were"          "weren t"      
## [521] "what"          "what s"        "whatever"      "when"         
## [525] "whence"        "whenever"      "where"         "where s"      
## [529] "whereafter"    "whereas"       "whereby"       "wherein"      
## [533] "whereupon"     "wherever"      "whether"       "which"        
## [537] "while"         "whither"       "who"           "who s"        
## [541] "whoever"       "whole"         "whom"          "whose"        
## [545] "why"           "will"          "willing"       "wish"         
## [549] "with"          "within"        "without"       "won t"        
## [553] "wonder"        "would"         "wouldn t"      "x"            
## [557] "y"             "yes"           "yet"           "you"          
## [561] "you d"         "you ll"        "you re"        "you ve"       
## [565] "your"          "yours"         "yourself"      "yourselves"   
## [569] "z"             "zero"          "she s"         "he d"         
## [573] "she d"         "he ll"         "she ll"        "shan t"       
## [577] "mustn t"       "when s"        "why s"         "how s"

Pre-processing

temp.text = reviews
#temp.text
#head(temp.text, 5)

data = data.frame(id = 1:length(temp.text),  # creating doc IDs if name is not given
                  text = temp.text, 
                  stringsAsFactors = F)
dim(data)
## [1] 120   2
x  = text.clean(data$text)                # applying func defined above to pre-process text corpus
x  =  removeWords(x,stopwords)            # removing stopwords created above
x  =  stripWhitespace(x)                  # removing white space

Step 2: Create DTM using text2vec package

tok_fun = word_tokenizer  # using word & not space tokenizers

it_0 = itoken( x,
               #preprocessor = text.clean,
               tokenizer = tok_fun,
               ids = data$id,
               progressbar = T)

vocab = create_vocabulary(it_0,    #  func collects unique terms & corresponding statistics
                          ngram = c(2L, 2L) #,
                          #stopwords = stopwords
)
# length(vocab); str(vocab)     # view what vocab obj is like

pruned_vocab = prune_vocabulary(vocab,  # filters input vocab & throws out v frequent & v infrequent terms
                                term_count_min = 10)


vectorizer = vocab_vectorizer(pruned_vocab) #  creates a text vectorizer func used in constructing a dtm/tcm/corpus

dtm_0  = create_dtm(it_0, vectorizer) # high-level function for creating a document-term matrix
# Sort bi-grams in decreasing order of frequency
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),]


text2 = x
text2 = paste("",text2,"")              # pad with spaces so whole-bigram matches work at string boundaries

pb <- txtProgressBar(min = 1, max = (length(tsum)), style = 3) ; i = 0

for (term in names(tsum)){
  i = i + 1
  focal.term = gsub("_", " ",term)        # underscore is the word separator in the bi-gram tokens
  replacement.term = term
  text2 = gsub(paste("",focal.term,""),paste("",replacement.term,""), text2)   # fuse the bi-gram into a single token
  setTxtProgressBar(pb, i)
}
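
The loop replaces each frequent bigram's space-separated form with its underscore-joined token, so the next tokenization pass treats it as one term. A toy illustration with a made-up phrase:

gsub(" leonardo dicaprio ", " leonardo_dicaprio ", " leonardo dicaprio leads the cast ")
## [1] " leonardo_dicaprio leads the cast "
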
it_m = itoken(text2,     # function creates iterators over input objects to vocabularies, corpora, DTM & TCM matrices
              # preprocessor = text.clean,
              tokenizer = tok_fun,
              ids = data$id,
              progressbar = T)

vocab = create_vocabulary(it_m     # vocab func collects unique terms and corresponding statistics
                          # ngram = c(2L, 2L),
                          #stopwords = stopwords
)
# length(vocab); str(vocab)     # view what vocab obj is like

pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 1)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)

vectorizer = vocab_vectorizer(pruned_vocab)

dtm_m  = create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1]  120 4425
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
a0 = (apply(dtm, 1, sum) > 0)   # build vector to identify non-empty docs
dtm = dtm[a0,] 

# view a sample of the DTM, sorted from most to least frequent tokens

dtm = dtm[,order(apply(dtm, 2, sum), decreasing = T)]     # sorting dtm's columns in decreasing order of column sums
inspect(dtm[1:5, 1:5]) 
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 18/7
## Sparsity           : 28%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs movie film inception dream nolan
##    1     5    1         2     1     3
##    2     0   15         1     0     1
##    3     5    9         2     0     2
##    4     5    0         3     0     5
##    5     0    6         4     0     1

Step 2a: Build the word cloud

tst = round(ncol(dtm)/100)  # divide the DTM's columns into 100 manageable chunks
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))

ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
  tempdtm = dtm[,(b[i]+1):(b[i+1])]
  s = colSums(as.matrix(tempdtm))
  ss.col = c(ss.col,s)
  #print(i)
}

tsum = ss.col
tsum = tsum[order(tsum, decreasing = T)]       #terms in decreasing order of freq
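
As an aside, since the slam package is already loaded, the same term-frequency totals can be computed in one sparse-aware call instead of the chunked loop above; this is an equivalent alternative, not what the run above used:

tsum.alt = slam::col_sums(dtm)                        # column sums without densifying the DTM
tsum.alt = tsum.alt[order(tsum.alt, decreasing = T)]  # same ordering as tsum above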

Show the word cloud and bar plot for term frequency

#windows()  # New plot window
wordcloud(names(tsum), tsum, scale=c(1,0.5),1, max.words=200,colors=brewer.pal(8, "Dark2")) # Plot results in a word cloud
title(sub = "Term Frequency - Wordcloud")

## show bar plot of the top tokens
test = as.data.frame(round(tsum[1:15],0))   # top 15 terms by frequency

#windows()  # New plot window
ggplot(test, aes(x = rownames(test), y = test)) + 
  geom_bar(stat = "identity", fill = "Blue") +
  geom_text(aes(label = test), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

#dev.off() # [graphical] device off / close it down

# Step 2b - Using term frequency-inverse document frequency (TF-IDF)

require(textir) || install.packages("textir")
## Loading required package: textir
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
## [1] TRUE
library(textir)
dtm.tfidf = tfidf(dtm, normalize=FALSE)

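For intuition, the weighting can be sketched by hand. This is a minimal illustration of the usual tf * log(N/df) idea, not necessarily identical to textir's internal formula:

tf.mat   = as.matrix(dtm)                    # raw term frequencies (dense copy, fine at this size)
doc.freq = colSums(tf.mat > 0)               # number of documents containing each term
idf      = log(nrow(tf.mat) / doc.freq)      # inverse document frequency
tfidf.manual = t(t(tf.mat) * idf)            # scale each term's tf by its idf (illustration only)
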
tst = round(ncol(dtm.tfidf)/100)
a = rep(tst, 99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm.tfidf))

ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
  tempdtm = dtm.tfidf[,(b[i]+1):(b[i+1])]
  s = colSums(as.matrix(tempdtm))
  ss.col = c(ss.col,s)
  #print(i)
}

tsum = ss.col

tsum = tsum[order(tsum, decreasing = T)] 

Show word cloud and bar plot for TF-IDF

#windows()  # New plot window
wordcloud(names(tsum), tsum, scale=c(1,0.5),1, max.words=200,colors=brewer.pal(8, "Dark2")) # Plot results in a word cloud 
title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")

#as.matrix(tsum[1:20])     #  to see the top few tokens & their IDF scores
#(dtm.tfidf)[1:10, 1:10]   # view first 10x10 cells in the DTM under TF IDF.

# plot barchart for top tokens
test = as.data.frame(round(tsum[1:15],0))
#windows()  # New plot window
ggplot(test, aes(x = rownames(test), y = test)) + 
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = test), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

#dev.off()

# Step 2c - Term Co-occurrence Matrix (TCM)

vectorizer = vocab_vectorizer(pruned_vocab, 
                              grow_dtm = FALSE, 
                              skip_grams_window = 5L)

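Here skip_grams_window = 5L means two terms co-occur whenever they appear within five tokens of each other. A tiny base-R sketch of that windowed counting on a made-up token vector (text2vec's actual weighting scheme may differ, e.g. by discounting distant pairs):

toks   = c("dream", "within", "a", "dream", "nolan")   # made-up token vector
window = 5
terms  = unique(toks)
co     = matrix(0, length(terms), length(terms), dimnames = list(terms, terms))
for (i in seq_along(toks)) {
  right = which(seq_along(toks) > i & seq_along(toks) <= i + window)  # neighbours within the window to the right
  for (j in right) co[toks[i], toks[j]] = co[toks[i], toks[j]] + 1
}
co   # asymmetric counts; adding the transpose symmetrises, as done for tcm.mat below
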
tcm = create_tcm(it_m, vectorizer) # func to build a TCM
tcm.mat = as.matrix(tcm)         # use tcm.mat[1:5, 1:5] to view
adj.mat = tcm.mat + t(tcm.mat)   # since adjacency matrices are symmetric

z = order(colSums(adj.mat), decreasing = T)
adj.mat = adj.mat[z,z]

# Plot a simple term co-occurrence graph
adj = adj.mat[1:30,1:30]

library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:rvest':
## 
##     %>%
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following objects are masked from 'package:text2vec':
## 
##     %>%, normalize
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
cog = graph.adjacency(adj, mode = 'undirected')
cog =  simplify(cog)  

cog = delete.vertices(cog, V(cog)[ degree(cog) == 0 ])

Plot the COG

#windows()
plot(cog)

## Step 2d - a cleaned up or ‘distilled’ COG plot

distill.cog = function(mat1, # input TCM ADJ MAT
                       title, # title for the graph
                       s,    # no. of central nodes
                       k1){  # max no. of connections  
  library(igraph)
  a = colSums(mat1) # collect colsums into a vector obj a
  b = order(-a)     # nice syntax for ordering vector in decr order  
  
  mat2 = mat1[b, b]     # order both rows and columns along vector b
  
  diag(mat2) =  0
  
  ## +++ go row by row and find top k adjacencies +++ ##
  
  wc = NULL
  
  for (i1 in 1:s){ 
    thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]   # k1-th largest co-occurrence value in this row
    mat2[i1, mat2[i1,] < thresh1] = 0             # zero out everything below that threshold
    mat2[i1, mat2[i1,] > 0 ] = 1
    word = names(mat2[i1, mat2[i1,] > 0])
    mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
    wc = c(wc,word)
  } # i1 loop ends
  
  
  mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
  ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # removed any NAs from the list
  mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
  graph <- graph.adjacency(mat4, mode = "undirected", weighted=T)    # Create Network object
  graph = simplify(graph) 
  V(graph)$color[1:s] = "green"
  V(graph)$color[(s+1):length(V(graph))] = "pink"
  
  graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ]) # drop isolated (degree-0) nodes
  
  plot(graph, 
       layout = layout.kamada.kawai, 
       main = title)
  
} # func ends

#windows()
distill.cog(tcm.mat, 'Distilled COG',  10,  5)
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length

## adj.mat and distilled cog for tfidf DTMs ##

adj.mat = t(dtm.tfidf) %*% dtm.tfidf
diag(adj.mat) = 0
a0 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a0[1:50], a0[1:50]])

#windows()
distill.cog(adj.mat, 'Distilled COG',  10,  10)

Calculate polarity and the correlation between polarity and rating

Polarity=NULL
RatingCol=NULL

for(i in 1:nrow(df)) {
  row <- df[i,]
  # do stuff with row
  #print(row$reviews)
  #print(length(row$reviews))
  text = row$reviews
  #print(text)
  data = data.frame(id = 1:length(row$reviews),  # creating doc IDs if name is not given
                    text = row$reviews, 
                    stringsAsFactors = F)
  #dim(data)
  #print(data)
  
  x  = text.clean(data$text)                # applying func defined above to pre-process text corpus
  x  =  removeWords(x,stopwords)            # removing stopwords created above
  x  =  stripWhitespace(x)
  #print(x)
  
  #dtm = as.DocumentTermMatrix(x, weighting = weightTf)
  #dtm
  #a0 = (apply(dtm, 1, sum) > 0)   # build vector to identify non-empty docs
  #dtm = dtm[a0,] 
  #dtm
  require(qdap) || install.packages("qdap") # ensure java is up to date!
  library(qdap)
  
  #x1 = x[a0]    # remove empty docs from corpus
  
  
  pol = polarity(x)         # Calculate the polarity from qdap dictionary
  pol
  wc = pol$all[,2]                  # Word Count in each doc
  wc
  val = pol$all[,3]                 # average polarity score
  
  p  = pol$all[,4]                  # Positive words info
  n  = pol$all[,5]  
  # dd = as.numeric(row$ratings)   # not used; coercing "x/10" directly would give NA
  
  Polarity = c(Polarity,val)
  pos = regexpr(pattern="/", row$ratings)[1]-1
  pos
  rat.rev = as.numeric(substr(row$ratings,0,pos))
  rat.rev
  RatingCol = c(RatingCol,rat.rev)
  
}
## Loading required package: qdap
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## Loading required package: qdapTools
## 
## Attaching package: 'qdapTools'
## The following object is masked from 'package:data.table':
## 
##     shift
## 
## Attaching package: 'qdap'
## The following objects are masked from 'package:igraph':
## 
##     %>%, diversity
## The following object is masked from 'package:Matrix':
## 
##     %&%
## The following object is masked from 'package:rvest':
## 
##     %>%
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following object is masked from 'package:text2vec':
## 
##     %>%
## The following object is masked from 'package:base':
## 
##     Filter
f = cor(Polarity, RatingCol, use = "everything", method = "pearson")  # cor() takes a single method; "pearson" is what the original call effectively used
print(f)
## [1] 0.6851005
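
To see what qdap's polarity output looks like on its own, here is a standalone call on a made-up sentence (the exact score depends on the qdap dictionary version installed):

pol.demo = polarity("nolan delivers a brilliant and gripping film")   # hypothetical review fragment
pol.demo$all[, c("wc", "polarity", "pos.words", "neg.words")]         # same columns indexed numerically in the loop above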

Recommendations

From the statistics above, the correlation between rating and review polarity is about 0.68, which indicates a strong positive relationship.

Based on the above COG:

Well-received aspects of the movie are DiCaprio's performance, the story, and the making of the movie.

The next sequel should concentrate more on characters, action and story. The star cast also plays an important role. Lengthy scenes should be avoided, since viewers may get bored during them. Action and visual effects along the lines of The Dark Knight should be included. The focus should remain on the actors' performances and on the making of the movie. Leonardo DiCaprio should continue as the lead actor in the sequel, as that would add great value to the movie.

Addressing these points in the next sequel would help the movie connect more closely with the audience, thereby resulting in good profits for the producers. It would also enhance the director's reputation.

Lengthy scenes and too many characters had a negative impact on the movie.