An analysis of what a sequel to the movie The Shawshank Redemption should focus on. The crux of the analysis lies in the reviews and ratings, from which the inferences below are drawn.
library("rvest")
library("XML")
library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(ggplot2)
library(igraph)
library(textir)
library(qdap)
counts = c(0,10,20,30,40)
reviews = NULL
for (j in counts){
url1 = paste0("http://www.imdb.com/title/tt0111161/reviews?filter=love;filter=love;start=",j)
url2 = paste0("http://www.imdb.com/title/tt0111161/reviews?filter=hate;filter=hate;start=",j)
page1 = read_html(url1)
page2 = read_html(url2)
reviews1 = html_text(html_nodes(page1,'#tn15content div+ p'))
reviews2 = html_text(html_nodes(page2,'#tn15content div+ p'))
reviews.positive = setdiff(reviews1, c("*** This review may contain spoilers ***","Add another review"))
reviews.negative = setdiff(reviews2, c("*** This review may contain spoilers ***","Add another review"))
reviews = c(reviews,reviews.positive,reviews.negative)
}
reviews = gsub("\n",' ',reviews)
writeLines(reviews,'Shawshank Redemption IMDB reviews.txt')
temp.text = readLines(file.choose()) # read back the Shawshank Redemption IMDB reviews file saved above
data = data.frame(id = 1:length(temp.text), # creating doc IDs if name is not given
text = temp.text,
stringsAsFactors = F)
# Read Stopwords list
stpw1 = readLines(file.choose()) # read-in stopwords.txt
## Warning in readLines(file.choose()): incomplete final line found on 'M:\ISB
## \Term1\Text Analytics\Assignment1\stopwords.txt'
stpw2 = tm::stopwords('english') # tm package stop word list; tokenizer package has the same name function, hence 'tm::'
comn = unique(c(stpw1, stpw2)) # union of the two lists
stopwords = unique(gsub("'"," ",comn)) # final stop word list after replacing apostrophes
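The text.clean() function applied in the next line is not defined anywhere in this excerpt. The sketch below is an assumption about what such a pre-processing helper typically does (dropping HTML tags, keeping alphanumerics, lower-casing, removing numbers and extra whitespace); the exact definition used in the original analysis may differ.
text.clean = function(x){ # assumed sketch, not the original definition
x = gsub("<.*?>", " ", x) # drop HTML tags
x = iconv(x, "latin1", "ASCII", sub = "") # drop non-ASCII characters
x = gsub("[^[:alnum:]]", " ", x) # keep only alphanumeric characters
x = tolower(x) # convert to lower case
x = removeNumbers(x) # tm: drop numbers
x = stripWhitespace(x) # tm: collapse runs of whitespace
x = gsub("^\\s+|\\s+$", "", x) # trim leading/trailing whitespace
return(x)
}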
x = text.clean(data$text) # applying func defined above to pre-process text corpus
x = removeWords(x,stopwords) # removing stopwords created above
x = stripWhitespace(x) # removing white space
t1 = Sys.time()
tok_fun = word_tokenizer # using word & not space tokenizers
it_0 = itoken( x,
preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id)
vocab = create_vocabulary(it_0, # func collects unique terms & corresponding statistics
ngram = c(2L, 2L) # restrict the vocabulary to bi-grams
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab, # filters input vocab & throws out v frequent & v infrequent terms
term_count_min = 4)
vectorizer = vocab_vectorizer(pruned_vocab) # creates a text vectorizer func used in constructing a dtm/tcm/corpus
dtm_0 = create_dtm(it_0, vectorizer) # high-level function for creating a document-term matrix
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.3432431 secs
Execution time for the above code. The bi-grams are sorted next, since bi-grams support a richer analysis than unigrams.
# Sort bi-gram with decreasing order of freq
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),] # terms in decreasing order of freq
head(tsum)
## morgan_freeman tim_robbins stephen_king andy_dufresne frank_darabont
## 40 37 20 15 14
## robbins_morgan
## 9
For example, the phrase "Andy Dufresne" is replaced with "andy_dufresne" in the text so that the bi-gram is treated as a single unigram token.
t1 = Sys.time()
text2 = x
text2 = paste("",text2,"") # pad with spaces so whole-phrase matches work at the string boundaries
for (term in names(tsum)){
focal.term = gsub("_", " ",term) # e.g. "andy_dufresne" -> "andy dufresne"
replacement.term = term
text2 = gsub(paste("",focal.term,""),paste("",replacement.term,""), text2) # replace each frequent bi-gram phrase with its underscored form
}
it_m = itoken(text2, # function creates iterators over input objects to vocabularies, corpora, DTM & TCM matrices
tokenizer = tok_fun,
ids = data$id)
vocab = create_vocabulary(it_m # vocab func collects unique terms and corresponding statistics
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 1)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_m = create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1] 100 2974
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
a0 = (apply(dtm, 1, sum) > 0) # build vector to identify non-empty docs
dtm = dtm[a0,] # drop empty docs
# view a sample of the DTM, sorted from most to least frequent tokens
dtm = dtm[,order(apply(dtm, 2, sum), decreasing = T)] # sorting dtm's columns in decreasing order of column sums
# inspect() func used to view parts of a DTM object
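# For example (illustrative only, not executed in the original run), a corner of the
# sorted DTM could be viewed with: inspect(dtm[1:5, 1:5])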
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.478338 secs
Execution time elapsed for the above chunk. Next, the DTM with term-frequency (TF) weighting is used to construct a word cloud.
tst = round(ncol(dtm)/100) # divide the DTM's columns into roughly 100 manageable chunks
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
tempdtm = dtm[,(b[i]+1):(b[i+1])]
s = colSums(as.matrix(tempdtm))
ss.col = c(ss.col,s)
}
tsum = ss.col
tsum = tsum[order(tsum, decreasing = T)] #terms in decreasing order of freq
head(tsum)
## time prison story andy hope people
## 80 78 65 63 59 59
tail(tsum)
## fair sexually babies elongated player lingers
## 1 1 1 1 1 1
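As an aside, the chunk-wise column-sum loop above is repeated verbatim for the TFIDF matrix further below; it could be wrapped in a small helper. The sketch below is illustrative only, and the name chunked_colsums and its arguments are not part of the original script.
chunked_colsums = function(m, n.chunks = 100){ # illustrative helper, not in the original script
step = max(1, round(ncol(m)/n.chunks)) # target chunk width (~1% of the columns)
breaks = unique(c(seq(0, ncol(m), by = step), ncol(m)))
out = NULL
for (i in 1:(length(breaks)-1)) { # densify only one chunk of columns at a time
out = c(out, colSums(as.matrix(m[, (breaks[i]+1):breaks[i+1]])))
}
out
}
# usage, equivalent to the loop above: tsum = sort(chunked_colsums(dtm), decreasing = TRUE)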
windows() # New plot window
wordcloud(names(tsum), tsum, # words, their freqs
scale = c(4, 0.5), # range of word sizes
1, # min.freq of words to consider
max.words = 200, # max #words
colors = brewer.pal(8, "Dark2")) # Plot results in a word cloud
title(sub = "Term Frequency - Wordcloud") # title for the wordcloud display
test = as.data.frame(round(tsum[1:15],0))
colnames(test) = "freq" # name the frequency column so ggplot can map it
windows() # New plot window
ggplot(test, aes(x = rownames(test), y = freq)) +
geom_bar(stat = "identity", fill = "Blue") +
geom_text(aes(label = freq), vjust= -0.20) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
dev.off() # [graphical] device off / close it down
## png
## 2
dtm.tfidf = tfidf(dtm, normalize=FALSE)
tst = round(ncol(dtm.tfidf)/100)
a = rep(tst, 99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm.tfidf))
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
tempdtm = dtm.tfidf[,(b[i]+1):(b[i+1])]
s = colSums(as.matrix(tempdtm))
ss.col = c(ss.col,s)
}
tsum = ss.col
tsum = tsum[order(tsum, decreasing = T)] #terms in decreasing order of freq
head(tsum)
## andy story prison time life hope
## 82.48800 62.89296 60.56925 60.40181 60.39002 60.27742
tail(tsum)
## fair sexually babies elongated player lingers
## 3.912023 3.912023 3.912023 3.912023 3.912023 3.912023
windows() # New plot window
wordcloud(names(tsum), tsum, scale=c(4,0.5),1, max.words=200,colors=brewer.pal(8, "Dark2")) # Plot results in a word cloud
title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")
as.matrix(tsum[1:20]) # to see the top few tokens & their TFIDF scores
## [,1]
## andy 82.48800
## story 62.89296
## prison 60.56925
## time 60.40181
## life 60.39002
## hope 60.27742
## red 59.93889
## good 59.73033
## people 57.08746
## watch 52.37333
## films 49.84172
## book 47.86668
## great 47.67249
## years 47.09484
## music 45.81454
## feel 44.45343
## top 43.69814
## made 42.97513
## world 42.52696
## characters 41.51828
(dtm.tfidf)[1:10, 1:10] # view first 10x10 cells in the DTM under TF IDF.
## 10 x 10 sparse Matrix of class "dgCMatrix"
## [[ suppressing 10 column names 'time', 'prison', 'story' ... ]]
##
## 1 0.7550226 1.5530576 . . . 0.967584 .
## 2 . 3.1061152 2.902752 5.237333 4.086605 0.967584 3.418303
## 3 4.5301355 3.1061152 0.967584 6.546667 4.086605 0.967584 2.278869
## 4 0.7550226 0.7765288 0.967584 . 1.021651 . .
## 5 0.7550226 . 2.902752 . 2.043302 1.935168 1.139434
## 6 . 0.7765288 . . 2.043302 . .
## 7 0.7550226 . 0.967584 3.928000 . . .
## 8 . 0.7765288 . 2.618667 3.064954 0.967584 1.139434
## 9 3.0200903 . . . 1.021651 . 2.278869
## 10 1.5100452 0.7765288 0.967584 1.309333 1.021651 0.967584 2.278869
##
## 1 . . .
## 2 . 2.217325 4.281349
## 3 1.171183 . 2.854233
## 4 . 1.108663 .
## 5 . 1.108663 .
## 6 1.171183 1.108663 .
## 7 . 2.217325 1.427116
## 8 . 4.434650 4.281349
## 9 2.342366 2.217325 .
## 10 . . .
test = as.data.frame(round(tsum[1:15],0))
colnames(test) = "freq" # name the frequency column so ggplot can map it
windows() # New plot window
ggplot(test, aes(x = rownames(test), y = freq)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = freq), vjust= -0.20) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
dev.off()
## png
## 2
vectorizer = vocab_vectorizer(pruned_vocab,
grow_dtm = FALSE,
skip_grams_window = 5L)
tcm = create_tcm(it_m, vectorizer) # func to build a TCM
tcm.mat = as.matrix(tcm) # use tcm.mat[1:5, 1:5] to view
adj.mat = tcm.mat + t(tcm.mat) # since adjacency matrices are symmetric
z = order(colSums(adj.mat), decreasing = T)
adj.mat = adj.mat[z,z]
A simple co-occurrence graph (COG) did not yield any useful insight, so a distilled COG is used instead.
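For reference, this is roughly how a plain (un-distilled) COG could be drawn directly from the adjacency matrix built above; it is an illustrative sketch rather than part of the original run, shown only to contrast with the distilled version defined next.
# illustrative sketch: plain COG over the 30 most connected terms (adj.mat is already sorted)
top = 1:30
g.simple = graph.adjacency(adj.mat[top, top], mode = "undirected", weighted = TRUE)
g.simple = simplify(g.simple) # drop self-loops and multiple edges
windows()
plot(g.simple, layout = layout.kamada.kawai, main = "Simple COG (for comparison)")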
distill.cog = function(mat1, # input TCM ADJ MAT
title, # title for the graph
s, # no. of central nodes
k1){ # max no. of connections
library(igraph)
a = colSums(mat1) # collect colsums into a vector obj a
b = order(-a) # order terms in decreasing order of column sums
mat2 = mat1[b, b] # order both rows and columns along vector b
diag(mat2) = 0
## +++ go row by row and find top k adjacencies +++ ##
wc = NULL
for (i1 in 1:s){
thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
mat2[i1, mat2[i1,] < thresh1] = 0 # zero out entries below the row's k1-th largest value
mat2[i1, mat2[i1,] > 0 ] = 1
word = names(mat2[i1, mat2[i1,] > 0])
mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
wc = c(wc,word)
} # i1 loop ends
mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))] # removed any NAs from the list
mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
graph <- graph.adjacency(mat4, mode = "undirected", weighted=T) # Create Network object
graph = simplify(graph)
V(graph)$color[1:s] = "green"
V(graph)$color[(s+1):length(V(graph))] = "pink"
graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ]) # delete singleton (degree-0) vertices
plot(graph,
layout = layout.kamada.kawai,
main = title)
} # func ends
windows()
distill.cog(tcm.mat, 'Distilled COG using TF', 10, 5)
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length
### adj.mat and distilled cog for tfidf DTMs ##
adj.mat = t(dtm.tfidf) %*% dtm.tfidf
diag(adj.mat) = 0
a0 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a0[1:50], a0[1:50]])
windows()
distill.cog(adj.mat, 'Distilled COG using TFIDF', 10, 10)
a0 = (apply(as.matrix(dtm_m), 1, sum) > 0) # recompute the non-empty-doc indicator (a0 was overwritten above when ordering adj.mat)
x1 = x[a0] # remove empty docs from corpus
t1 = Sys.time() # set timer
pol = polarity(x1) # Calculate the polarity from qdap dictionary
wc = pol$all[,2] # Word Count in each doc
val = pol$all[,3] # average polarity score
p = pol$all[,4] # Positive words info
n = pol$all[,5] # Negative Words info
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 15.32086 secs
Execution time elapsed for the above statement.
pol
## all total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
## 1 all 2974 7739 0.131 0.785 0.167
counts = c(0,10,20,30,40)
ratings = NULL
for (j in counts){
url1 = paste0("http://www.imdb.com/title/tt0111161/reviews?filter=love;filter=love;start=",j)
url2 = paste0("http://www.imdb.com/title/tt0111161/reviews?filter=hate;filter=hate;start=",j)
page1 = read_html(url1)
page2 = read_html(url2)
ratings1 = html_nodes(page1,'h2+ img')
ratings2 = html_nodes(page2,'h2+ img')
ratings.positive = substr(html_attr(ratings1,name='alt'),0,2)
ratings.negative = substr(html_attr(ratings2,name='alt'),0,2)
ratings = c(ratings,ratings.positive,ratings.negative)
ratings=gsub("/",' ',ratings)
}
length(ratings)
## [1] 100
cor(val,as.integer(ratings))
## [1] 0.4362098
From the TF word cloud, prison, hope, andy and story are the most prominent terms; in the TFIDF word cloud, friendship, good, feel and andy stand out. The distilled COG on TF highlights story-realistic, story-belief, andy-escapes, Morgan Freeman's acting, bromance, life-understanding and life-sentence.
The distilled COG on TFIDF highlights novella-friendship, strong, andy-red-human, hollywood-dialogue, newman-world, newman-story and newman-hope.
Tim Robbins and Morgan Freeman should be cast again, as their characters Andy and Red are talked about a great deal.
Their friendship should be an asset, since the "bromance" is one of the most discussed themes.
The plot clearly worked well: the story is discussed extensively, so the sequel's story should again be built around hope and life, with dialogue as good as the original's.