Introduction

An analysis of what a sequel to the movie The Shawshank Redemption should focus on. The crux of the analysis lies in the reviews and ratings, based on which inferences have been made.

Load the libraries required for this Markdown.

library("rvest")
library("XML")
library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(ggplot2)
library(igraph)
library(textir)
library(qdap)

Extracting the first 50 positive and first 50 negative reviews of the movie The Shawshank Redemption from the IMDB website.

library(rvest)

counts = c(0,10,20,30,40)
reviews = NULL
for (j in counts){
  url1 = paste0("http://www.imdb.com/title/tt0111161/reviews?filter=love;filter=love;start=",j)
  url2 = paste0("http://www.imdb.com/title/tt0111161/reviews?filter=hate;filter=hate;start=",j)
  
  
  page1 = read_html(url1)
  page2 = read_html(url2)
  reviews1 = html_text(html_nodes(page1,'#tn15content div+ p'))
  reviews2 = html_text(html_nodes(page2,'#tn15content div+ p'))
  
  reviews.positive = setdiff(reviews1, c("*** This review may contain spoilers ***","Add another review"))
  reviews.negative = setdiff(reviews2, c("*** This review may contain spoilers ***","Add another review"))
  
  reviews = c(reviews,reviews.positive,reviews.negative)
  
}

reviews = gsub("\n",' ',reviews)
writeLines(reviews,'Shawshank Redemption IMDB reviews.txt')
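
As a quick sanity check (assuming the CSS selectors above still match IMDB's old reviews layout), the collected vector should hold roughly 100 reviews, matching the 100 documents seen later in dim(dtm_m):

length(reviews)   # expected to be about 100 (50 positive + 50 negative)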

Text-cleaning function used to clean the text by removing HTML tags, numbers, non-alphanumeric characters, and extra whitespace.

text.clean = function(x)                    # text data
{ require("tm")
  x  =  gsub("<.*?>", " ", x)               # regex for removing HTML tags
  x  =  iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
  x  =  gsub("[^[:alnum:]]", " ", x)        # keep only alpha numeric 
  x  =  tolower(x)                          # convert to lower case characters
  x  =  removeNumbers(x)                    # removing numbers
  x  =  stripWhitespace(x)                  # removing white space
  x  =  gsub("^\\s+|\\s+$", "", x)          # remove leading and trailing white space
  return(x)
}
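
A quick illustrative check of text.clean() on a made-up snippet (not part of the scraped reviews):

text.clean("<b>Andy Dufresne</b> escaped in 1966!")   # expected: "andy dufresne escaped in"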

Read the reviews data, apply the cleaning function, create a data frame, and set up stop words.

temp.text = readLines(file.choose())  # Reading the Shawshank Redemption IMDB reviews text file

Create a data frame of documents (the DTM is built from this below).

data = data.frame(id = 1:length(temp.text),  # creating doc IDs if name is not given
                  text = temp.text, 
                  stringsAsFactors = F)

Used my own stop-word list, read in below.

# Read Stopwords list
stpw1 = readLines(file.choose())      # read-in stopwords.txt
## Warning in readLines(file.choose()): incomplete final line found on 'M:\ISB
## \Term1\Text Analytics\Assignment1\stopwords.txt'

Combine the custom list with the tm package's English stop-word list.

stpw2 = tm::stopwords('english')      # tm package stop word list; tokenizer package has the same name function, hence 'tm::'
comn  = unique(c(stpw1, stpw2))         # Union of two list
stopwords = unique(gsub("'"," ",comn))  # final stop-word list after replacing apostrophes

x  = text.clean(data$text)                # applying func defined above to pre-process text corpus
x  =  removeWords(x,stopwords)            # removing stopwords created above
x  =  stripWhitespace(x)                  # removing white space
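
An illustrative check of the stop-word removal (assuming common words such as "the" and "a" are in the combined list):

stripWhitespace(removeWords("the prison gives a man hope", stopwords))   # common stop words dropped, the gaps they leave collapsed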

Create DTM using text2vec package.

t1 = Sys.time()
tok_fun = word_tokenizer  # using word & not space tokenizers

it_0 = itoken( x,
               preprocessor = text.clean,
               tokenizer = tok_fun,
               ids = data$id )
vocab = create_vocabulary(it_0,    # func collects unique terms & corresponding statistics
                          ngram = c(2L, 2L)   # bi-grams only
                          #stopwords = stopwords
)


pruned_vocab = prune_vocabulary(vocab,  # filters input vocab & throws out v frequent & v infrequent terms
                                term_count_min = 4)


vectorizer = vocab_vectorizer(pruned_vocab) #  creates a text vectorizer func used in constructing a dtm/tcm/corpus

dtm_0  = create_dtm(it_0, vectorizer) # high-level function for creating a document-term matrix
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.3432431 secs

Execution time for the code above. Next, sort the bi-grams by frequency; bi-grams give a better analysis than unigrams.

# Sort bi-gram with decreasing order of freq
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),]       # terms in decreasing order of freq

Some of the top bi-grams:

head(tsum)
## morgan_freeman    tim_robbins   stephen_king  andy_dufresne frank_darabont 
##             40             37             20             15             14 
## robbins_morgan 
##              9

Recode bi-grams as unigrams in the cleaned text corpus, create a DTM, and find the most frequent terms.

For example, "andy dufresne" is replaced with "andy_dufresne" in the text so that the bi-gram is treated as a single token.
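
A toy illustration of the recoding performed by the loop below (hypothetical sentence, not from the corpus):

gsub("andy dufresne", "andy_dufresne", "in prison andy dufresne never loses hope")   # bi-gram becomes one token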

t1 = Sys.time()
text2 = x
text2 = paste("",text2,"")

 for (term in names(tsum)){
   focal.term = gsub("_", " ",term)        
   replacement.term = term
   text2 = gsub(paste("",focal.term,""),paste("",replacement.term,""), text2)
 }
 
 
 it_m = itoken(text2,     # function creates iterators over input objects to vocabularies, corpora, DTM & TCM matrices
               tokenizer = tok_fun,
               ids = data$id )
 
 vocab = create_vocabulary(it_m)    # vocab func collects unique terms and corresponding statistics
 
 
 pruned_vocab = prune_vocabulary(vocab,
                                 term_count_min = 1)
 
 
 vectorizer = vocab_vectorizer(pruned_vocab)
 
 dtm_m  = create_dtm(it_m, vectorizer)
 dim(dtm_m)
## [1]  100 2974
 dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
 a0 = (apply(dtm, 1, sum) > 0)   # build vector to identify non-empty docs
 dtm = dtm[a0,]                  # drop empty docs
 
 
 # view a sample of the DTM, sorted from most to least frequent tokens 
 dtm = dtm[,order(apply(dtm, 2, sum), decreasing = T)]     # sorting dtm's columns in decreasing order of column sums
      # inspect() func used to view parts of a DTM object   
 print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.478338 secs

Execution time elapsed for the chunk above. Next, construct term-frequency (TF) sums from the DTM for the word cloud.

tst = round(ncol(dtm)/100)  # divide DTM's cols into 100 manageable parts
 a = rep(tst,99)
 b = cumsum(a);rm(a)
 b = c(0,b,ncol(dtm))
 
 ss.col = c(NULL)
 for (i in 1:(length(b)-1)) {
   tempdtm = dtm[,(b[i]+1):(b[i+1])]
   s = colSums(as.matrix(tempdtm))
   ss.col = c(ss.col,s)
 }
 
 tsum = ss.col
 tsum = tsum[order(tsum, decreasing = T)]       #terms in decreasing order of freq
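
For reference, the chunked loop above exists to avoid densifying the whole DTM at once; slam::col_sums (the slam package is already loaded) computes the same column sums while keeping the matrix sparse. A minimal equivalent sketch:

tsum_alt = slam::col_sums(dtm)                # sparse column sums, no chunking needed
tsum_alt = sort(tsum_alt, decreasing = TRUE)  # same decreasing-frequency order as tsum above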

Top terms by term frequency (TF)

head(tsum)
##   time prison  story   andy   hope people 
##     80     78     65     63     59     59

Least frequent terms by TF

tail(tsum)
##      fair  sexually    babies elongated    player   lingers 
##         1         1         1         1         1         1

Word cloud using term frequency (TF)

 windows()  # New plot window
 wordcloud(names(tsum), tsum,     # words, their freqs 
           scale = c(4, 0.5),     # range of word sizes
           min.freq = 1,          # min. freq of words to consider
           max.words = 200,       # max #words
           colors = brewer.pal(8, "Dark2"))    # Plot results in a word cloud 
 title(sub = "Term Frequency - Wordcloud")     # title for the wordcloud display

Plot a bar chart of the top tokens (term frequency).

test = as.data.frame(round(tsum[1:15],0))
 colnames(test) = "freq"       # name the frequency column so ggplot can map it
 test$token = rownames(test)
 
 windows()  # New plot window
 ggplot(test, aes(x = token, y = freq)) + 
   geom_bar(stat = "identity", fill = "Blue") +
   geom_text(aes(label = freq), vjust= -0.20) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1))

 dev.off() # [graphical] device off / close it down
## png 
##   2

Using term frequency-inverse document frequency (TF-IDF) for the word cloud and bar chart.
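
For intuition only (the exact weighting used by textir's tfidf() may differ), a TF-IDF weight scales a term's in-document frequency by the log of its inverse document frequency, so terms concentrated in a few reviews score higher:

tf = 3             # hypothetical: term appears 3 times in one review
n  = 100           # total number of reviews
df = 20            # hypothetical: number of reviews containing the term
tf * log(n / df)   # classic tf * idf weight; rarer terms get larger weights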

dtm.tfidf = tfidf(dtm, normalize=FALSE)
 
 tst = round(ncol(dtm.tfidf)/100)
 a = rep(tst, 99)
 b = cumsum(a);rm(a)
 b = c(0,b,ncol(dtm.tfidf))
 
 ss.col = c(NULL)
 for (i in 1:(length(b)-1)) {
   tempdtm = dtm.tfidf[,(b[i]+1):(b[i+1])]
   s = colSums(as.matrix(tempdtm))
   ss.col = c(ss.col,s)
 }
 
 tsum = ss.col
 
 tsum = tsum[order(tsum, decreasing = T)]       #terms in decreasing order of freq

Top terms by TF-IDF

head(tsum)
##     andy    story   prison     time     life     hope 
## 82.48800 62.89296 60.56925 60.40181 60.39002 60.27742

Least-weighted terms by TF-IDF

tail(tsum)
##      fair  sexually    babies elongated    player   lingers 
##  3.912023  3.912023  3.912023  3.912023  3.912023  3.912023

Word cloud using TF-IDF

windows()  # New plot window
 wordcloud(names(tsum), tsum, scale=c(4,0.5), min.freq = 1, max.words=200, colors=brewer.pal(8, "Dark2")) # Plot results in a word cloud 
 title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")

 as.matrix(tsum[1:20])     #  to see the top few tokens & their TF-IDF scores
##                [,1]
## andy       82.48800
## story      62.89296
## prison     60.56925
## time       60.40181
## life       60.39002
## hope       60.27742
## red        59.93889
## good       59.73033
## people     57.08746
## watch      52.37333
## films      49.84172
## book       47.86668
## great      47.67249
## years      47.09484
## music      45.81454
## feel       44.45343
## top        43.69814
## made       42.97513
## world      42.52696
## characters 41.51828
 (dtm.tfidf)[1:10, 1:10]   # view first 10x10 cells in the DTM under TF IDF.
## 10 x 10 sparse Matrix of class "dgCMatrix"
##    [[ suppressing 10 column names 'time', 'prison', 'story' ... ]]
##                                                                    
## 1  0.7550226 1.5530576 .        .        .        0.967584 .       
## 2  .         3.1061152 2.902752 5.237333 4.086605 0.967584 3.418303
## 3  4.5301355 3.1061152 0.967584 6.546667 4.086605 0.967584 2.278869
## 4  0.7550226 0.7765288 0.967584 .        1.021651 .        .       
## 5  0.7550226 .         2.902752 .        2.043302 1.935168 1.139434
## 6  .         0.7765288 .        .        2.043302 .        .       
## 7  0.7550226 .         0.967584 3.928000 .        .        .       
## 8  .         0.7765288 .        2.618667 3.064954 0.967584 1.139434
## 9  3.0200903 .         .        .        1.021651 .        2.278869
## 10 1.5100452 0.7765288 0.967584 1.309333 1.021651 0.967584 2.278869
##                              
## 1  .        .        .       
## 2  .        2.217325 4.281349
## 3  1.171183 .        2.854233
## 4  .        1.108663 .       
## 5  .        1.108663 .       
## 6  1.171183 1.108663 .       
## 7  .        2.217325 1.427116
## 8  .        4.434650 4.281349
## 9  2.342366 2.217325 .       
## 10 .        .        .

Plot a bar chart of the top tokens by TF-IDF.

test = as.data.frame(round(tsum[1:15],0))
 colnames(test) = "freq"       # name the frequency column so ggplot can map it
 test$token = rownames(test)
 windows()  # New plot window
 ggplot(test, aes(x = token, y = freq)) + 
   geom_bar(stat = "identity", fill = "red") +
   geom_text(aes(label = freq), vjust= -0.20) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1))

 dev.off()
## png 
##   2

Term Co-occurrence Matrix (TCM)

vectorizer = vocab_vectorizer(pruned_vocab, 
                               grow_dtm = FALSE, 
                               skip_grams_window = 5L)
 
 tcm = create_tcm(it_m, vectorizer) # func to build a TCM
 
 tcm.mat = as.matrix(tcm)         # use tcm.mat[1:5, 1:5] to view
 adj.mat = tcm.mat + t(tcm.mat)   # since adjacency matrices are symmetric
 
 z = order(colSums(adj.mat), decreasing = T)
 adj.mat = adj.mat[z,z]
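
As an illustrative lookup (assuming both terms are present in the pruned vocabulary), an entry of the symmetrized matrix counts how often two terms co-occur within the five-token skip-gram window:

adj.mat["andy", "red"]   # co-occurrence count of "andy" and "red" within the window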

Create distilled COG for TF

A simple COG did not yield any useful insight, so a distilled COG is used instead.

 distill.cog = function(mat1, # input TCM ADJ MAT
                        title, # title for the graph
                        s,    # no. of central nodes
                        k1){  # max no. of connections  
   library(igraph)
   a = colSums(mat1) # collect colsums into a vector obj a
   b = order(-a)     # nice syntax for ordering vector in decr order  
   
   mat2 = mat1[b, b]     # order both rows and columns along vector b
   
   diag(mat2) =  0
   
   ## +++ go row by row and find top k adjacencies +++ ##
   
   wc = NULL
   
   for (i1 in 1:s){ 
     thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
     mat2[i1, mat2[i1,] < thresh1] = 0   # zero out entries below the top-k1 threshold
     mat2[i1, mat2[i1,] > 0 ] = 1
     word = names(mat2[i1, mat2[i1,] > 0])
     mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
     wc = c(wc,word)
   } # i1 loop ends
   
   
   mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
   ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # removed any NAs from the list
   mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
   graph <- graph.adjacency(mat4, mode = "undirected", weighted=T)    # Create Network object
   graph = simplify(graph) 
   V(graph)$color[1:s] = "green"
   V(graph)$color[(s+1):length(V(graph))] = "pink"
   
   graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ]) # delete singletons?
   
   plot(graph, 
        layout = layout.kamada.kawai, 
        main = title)
   
 } # func ends
 
 windows()
 distill.cog(tcm.mat, 'Distilled COG using TF',  10,  5)
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length

adj.mat and distilled COG for the TF-IDF DTM

adj.mat = t(dtm.tfidf) %*% dtm.tfidf
 diag(adj.mat) = 0
 a0 = order(apply(adj.mat, 2, sum), decreasing = T)
 adj.mat = as.matrix(adj.mat[a0[1:50], a0[1:50]])
 
 windows()
 distill.cog(adj.mat, 'Distilled COG using TFIDF',  10,  10)

Find the polarity of the reviews from their positive and negative words.
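
Before scoring the full corpus below, a toy example of what qdap's polarity() returns on a made-up sentence (not from the reviews):

polarity("hope is a good thing maybe the best of things")$all   # one row per sentence: word count, polarity score, positive/negative words found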

x1 = x[apply(dtm_m, 1, sum) > 0]    # keep only non-empty docs (the earlier a0 mask was overwritten above)

t1 = Sys.time()   # set timer

pol = polarity(x1)         # Calculate the polarity from qdap dictionary
wc = pol$all[,2]                  # Word Count in each doc
val = pol$all[,3]                 # average polarity score
p  = pol$all[,4]                  # Positive words info
n  = pol$all[,5]                  # Negative Words info  

print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 15.32086 secs

Execution time elapsed for the polarity calculation above.

Overall polarity of the reviews

pol
##   all total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 all            2974        7739        0.131       0.785              0.167

Extracting the rating values of the movie Shawshank Redemption from IMDB

counts = c(0,10,20,30,40)
ratings = NULL
for (j in counts){
  url1 = paste0("http://www.imdb.com/title/tt0111161/reviews?filter=love;filter=love;start=",j)
  url2 = paste0("http://www.imdb.com/title/tt0111161/reviews?filter=hate;filter=hate;start=",j)
  
  page1 = read_html(url1)
  page2 = read_html(url2)
  ratings1 = html_nodes(page1,'h2+ img')
  ratings2 = html_nodes(page2,'h2+ img')
  
  ratings.positive = substr(html_attr(ratings1,name='alt'),0,2) 
  ratings.negative = substr(html_attr(ratings2,name='alt'),0,2)
  
  ratings = c(ratings,ratings.positive,ratings.negative)
  ratings=gsub("/",' ',ratings)
  
}

length(ratings)
## [1] 100

Find the correlation between ratings and polarity values.

cor(val, as.integer(ratings))
## [1] 0.4362098

Recommendations on attributes and aspects for a sequel

From the TF word cloud, prison, hope, Andy, and story are the most prominent terms; from the TF-IDF word cloud, friendship, good, feel, and Andy stand out. From the distilled COG (TF): story-realistic, story-belief, Andy-escapes, Morgan Freeman's acting, bromance, life-understanding, and life-sentence.

From the distilled COG (TF-IDF): novella-friendship, strong, andy-red-human, hollywood-dialogue, newman-world, newman-story, and newman-hope.

Based on the above inputs, my recommendations for the sequel are as follows:

Tim Robbins and Morgan Freeman should be cast again, as their characters Andy and Red are talked about a lot in the reviews.

Their friendship should be a central asset, since the "bromance" is mentioned frequently.

The plot worked really well, as the story is talked about a lot. The sequel's story should again be built around hope and life, with dialogue as strong as the original's.