rm(list=ls())
Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_66') # for 64-bit
install.packages("rJava",repos="http://cran.rstudio.com/")
## Installing package into 'C:/Users/sundeep/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'rJava' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Public\Documents\Wondershare\CreatorTemp\Rtmp8st1Wk\downloaded_packages
require(text2vec) || install.packages("text2vec",repos="http://cran.rstudio.com/")
## Loading required package: text2vec
## [1] TRUE
require(data.table) || install.packages("data.table",repos="http://cran.rstudio.com/")
## Loading required package: data.table
## [1] TRUE
require(stringr) || install.packages("stringr",repos="http://cran.rstudio.com/")
## Loading required package: stringr
## [1] TRUE
require(tm) || install.packages("tm",repos="http://cran.rstudio.com/")
## Loading required package: tm
## Loading required package: NLP
## [1] TRUE
require(RWeka) || install.packages("RWeka",repos="http://cran.rstudio.com/")
## Loading required package: RWeka
## [1] TRUE
require(tokenizers) || install.packages("tokenizers",repos="http://cran.rstudio.com/")
## Loading required package: tokenizers
##
## Attaching package: 'tokenizers'
## The following object is masked from 'package:tm':
##
## stopwords
## [1] TRUE
require(slam) || install.packages("slam")
## Loading required package: slam
## [1] TRUE
require(wordcloud) || install.packages("wordcloud",repos="http://cran.rstudio.com/")
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## [1] TRUE
require(ggplot2) || install.packages("ggplot2",repos="http://cran.rstudio.com/")
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## [1] TRUE
library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(ggplot2)
library("rvest")
## Loading required package: xml2
text.clean = function(x){                   # basic text pre-processing
  require("tm")
  x = gsub("<.*?>", " ", x)                 # remove HTML tags
  x = iconv(x, "latin1", "ASCII", sub="")   # keep only ASCII characters
  x = gsub("[^[:alnum:]]", " ", x)          # keep only alphanumeric characters
  x = tolower(x)                            # convert to lower case
  x = removeNumbers(x)                      # remove numbers
  x = stripWhitespace(x)                    # collapse extra white space
  x = gsub("^\\s+|\\s+$", "", x)            # remove leading and trailing white space
  return(x)
}
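As a quick sanity check, the cleaning function can be applied to a small made-up snippet (the string below is illustrative, not taken from the scraped data):
# illustrative only: a hypothetical snippet with HTML tags, numbers and punctuation
text.clean("<b>Inception (2010)</b> is mind-bending!!! 10/10")
# returns "inception is mind bending"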
counts = c(0,10,20,30,40,50)
reviews = NULL
ratings = NULL
ddr = NULL
for (j in counts){
  url1 = paste0("http://www.imdb.com/title/tt1375666/reviews?filter=love;filter=love;start=", j)
  url2 = paste0("http://www.imdb.com/title/tt1375666/reviews?filter=hate;filter=hate;start=", j)
  page1 = read_html(url1)
  page2 = read_html(url2)
  reviews1 = html_text(html_nodes(page1, '#tn15content p'))
  reviews.positive = setdiff(reviews1, c("*** This review may contain spoilers ***", "Add another review"))
  #reviews.positive
  reviews2 = html_text(html_nodes(page2, '#tn15content p'))
  reviews.negative = setdiff(reviews2, c("*** This review may contain spoilers ***", "Add another review"))
  movie.nodes.positive = html_nodes(page1, 'h2 + img')
  rr.positive = html_attr(movie.nodes.positive, name = 'alt')   # rating strings such as "10/10"
  movie.nodes.negative = html_nodes(page2, 'h2 + img')
  rr.negative = html_attr(movie.nodes.negative, name = 'alt')
  #ddr = c(ddr, rr.positive, rr.negative)
  #rat = substr(html_attr(movie.nodes.positive, name = 'alt'), 0, 2)
  #rat
  ratings = c(ratings, rr.positive, rr.negative)
  reviews = c(reviews, reviews.positive, reviews.negative)
}
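Before building the corpus, it is worth checking that the scrape returned matched vectors of reviews and ratings (a minimal sketch; the exact counts depend on what IMDb serves for these pages):
length(reviews); length(ratings)   # should be equal, one entry per scraped review
head(ratings, 3)                   # e.g. strings of the form "10/10"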
reviews = gsub("\n",' ',reviews)
#reviews
df = data.frame(reviews,ratings)
#df
#stpw1 = readLines(file.choose()) # read-in stopwords.txt
stpw1 = readLines('https://raw.githubusercontent.com/sudhir-voleti/basic-text-analysis-shinyapp/master/data/stopwords.txt')# stopwords list from git
stpw2 = tm::stopwords('english') # tm package stop word list; tokenizer package has the same name function, hence 'tm::'
comn = unique(c(stpw1, stpw2))             # union of the two stopword lists
stopwords = unique(gsub("'", " ", comn))   # replace apostrophes so forms like "don't" match the cleaned text
stopwords
## [1] "a" "a s" "able" "about"
## [5] "above" "according" "accordingly" "across"
## [9] "actually" "after" "afterwards" "again"
## [13] "against" "ain t" "all" "allow"
## [17] "allows" "almost" "alone" "along"
## [21] "already" "also" "although" "always"
## [25] "am" "among" "amongst" "an"
## [29] "and" "another" "any" "anybody"
## [33] "anyhow" "anyone" "anything" "anyway"
## [37] "anyways" "anywhere" "apart" "appear"
## [41] "appreciate" "appropriate" "are" "aren t"
## [45] "around" "as" "aside" "ask"
## [49] "asking" "associated" "at" "available"
## [53] "away" "awfully" "b" "be"
## [57] "became" "because" "become" "becomes"
## [61] "becoming" "been" "before" "beforehand"
## [65] "behind" "being" "believe" "below"
## [69] "beside" "besides" "best" "better"
## [73] "between" "beyond" "both" "brief"
## [77] "but" "by" "c" "c mon"
## [81] "c s" "came" "can" "can t"
## [85] "cannot" "cant" "cause" "causes"
## [89] "certain" "certainly" "changes" "clearly"
## [93] "co" "com" "come" "comes"
## [97] "concerning" "consequently" "consider" "considering"
## [101] "contain" "containing" "contains" "corresponding"
## [105] "could" "couldn t" "course" "currently"
## [109] "d" "definitely" "described" "despite"
## [113] "did" "didn t" "different" "do"
## [117] "does" "doesn t" "doing" "don t"
## [121] "done" "down" "downwards" "during"
## [125] "e" "each" "edu" "eg"
## [129] "eight" "either" "else" "elsewhere"
## [133] "enough" "entirely" "especially" "et"
## [137] "etc" "even" "ever" "every"
## [141] "everybody" "everyone" "everything" "everywhere"
## [145] "ex" "exactly" "example" "except"
## [149] "f" "far" "few" "fifth"
## [153] "first" "five" "followed" "following"
## [157] "follows" "for" "former" "formerly"
## [161] "forth" "four" "from" "further"
## [165] "furthermore" "g" "get" "gets"
## [169] "getting" "given" "gives" "go"
## [173] "goes" "going" "gone" "got"
## [177] "gotten" "greetings" "h" "had"
## [181] "hadn t" "happens" "hardly" "has"
## [185] "hasn t" "have" "haven t" "having"
## [189] "he" "he s" "hello" "help"
## [193] "hence" "her" "here" "here s"
## [197] "hereafter" "hereby" "herein" "hereupon"
## [201] "hers" "herself" "hi" "him"
## [205] "himself" "his" "hither" "hopefully"
## [209] "how" "howbeit" "however" "i"
## [213] "i d" "i ll" "i m" "i ve"
## [217] "ie" "if" "ignored" "immediate"
## [221] "in" "inasmuch" "inc" "indeed"
## [225] "indicate" "indicated" "indicates" "inner"
## [229] "insofar" "instead" "into" "inward"
## [233] "is" "isn t" "it" "it d"
## [237] "it ll" "it s" "its" "itself"
## [241] "j" "just" "k" "keep"
## [245] "keeps" "kept" "know" "knows"
## [249] "known" "l" "last" "lately"
## [253] "later" "latter" "latterly" "least"
## [257] "less" "lest" "let" "let s"
## [261] "like" "liked" "likely" "little"
## [265] "look" "looking" "looks" "ltd"
## [269] "m" "mainly" "many" "may"
## [273] "maybe" "me" "mean" "meanwhile"
## [277] "merely" "might" "more" "moreover"
## [281] "most" "mostly" "much" "must"
## [285] "my" "myself" "n" "name"
## [289] "namely" "nd" "near" "nearly"
## [293] "necessary" "need" "needs" "neither"
## [297] "never" "nevertheless" "new" "next"
## [301] "nine" "no" "nobody" "non"
## [305] "none" "noone" "nor" "normally"
## [309] "not" "nothing" "novel" "now"
## [313] "nowhere" "o" "obviously" "of"
## [317] "off" "often" "oh" "ok"
## [321] "okay" "old" "on" "once"
## [325] "one" "ones" "only" "onto"
## [329] "or" "other" "others" "otherwise"
## [333] "ought" "our" "ours" "ourselves"
## [337] "out" "outside" "over" "overall"
## [341] "own" "p" "particular" "particularly"
## [345] "per" "perhaps" "placed" "please"
## [349] "plus" "possible" "presumably" "probably"
## [353] "provides" "q" "que" "quite"
## [357] "qv" "r" "rather" "rd"
## [361] "re" "really" "reasonably" "regarding"
## [365] "regardless" "regards" "relatively" "respectively"
## [369] "right" "s" "said" "same"
## [373] "saw" "say" "saying" "says"
## [377] "second" "secondly" "see" "seeing"
## [381] "seem" "seemed" "seeming" "seems"
## [385] "seen" "self" "selves" "sensible"
## [389] "sent" "serious" "seriously" "seven"
## [393] "several" "shall" "she" "should"
## [397] "shouldn t" "since" "six" "so"
## [401] "some" "somebody" "somehow" "someone"
## [405] "something" "sometime" "sometimes" "somewhat"
## [409] "somewhere" "soon" "sorry" "specified"
## [413] "specify" "specifying" "still" "sub"
## [417] "such" "sup" "sure" "t"
## [421] "t s" "take" "taken" "tell"
## [425] "tends" "th" "than" "thank"
## [429] "thanks" "thanx" "that" "that s"
## [433] "thats" "the" "their" "theirs"
## [437] "them" "themselves" "then" "thence"
## [441] "there" "there s" "thereafter" "thereby"
## [445] "therefore" "therein" "theres" "thereupon"
## [449] "these" "they" "they d" "they ll"
## [453] "they re" "they ve" "think" "third"
## [457] "this" "thorough" "thoroughly" "those"
## [461] "though" "three" "through" "throughout"
## [465] "thru" "thus" "to" "together"
## [469] "too" "took" "toward" "towards"
## [473] "tried" "tries" "truly" "try"
## [477] "trying" "twice" "two" "u"
## [481] "un" "under" "unfortunately" "unless"
## [485] "unlikely" "until" "unto" "up"
## [489] "upon" "us" "use" "used"
## [493] "useful" "uses" "using" "usually"
## [497] "uucp" "v" "value" "various"
## [501] "very" "via" "viz" "vs"
## [505] "w" "want" "wants" "was"
## [509] "wasn t" "way" "we" "we d"
## [513] "we ll" "we re" "we ve" "welcome"
## [517] "well" "went" "were" "weren t"
## [521] "what" "what s" "whatever" "when"
## [525] "whence" "whenever" "where" "where s"
## [529] "whereafter" "whereas" "whereby" "wherein"
## [533] "whereupon" "wherever" "whether" "which"
## [537] "while" "whither" "who" "who s"
## [541] "whoever" "whole" "whom" "whose"
## [545] "why" "will" "willing" "wish"
## [549] "with" "within" "without" "won t"
## [553] "wonder" "would" "wouldn t" "x"
## [557] "y" "yes" "yet" "you"
## [561] "you d" "you ll" "you re" "you ve"
## [565] "your" "yours" "yourself" "yourselves"
## [569] "z" "zero" "she s" "he d"
## [573] "she d" "he ll" "she ll" "shan t"
## [577] "mustn t" "when s" "why s" "how s"
temp.text = reviews
#temp.text
#head(temp.text, 5)
data = data.frame(id = 1:length(temp.text), # creating doc IDs if name is not given
text = temp.text,
stringsAsFactors = F)
dim(data)
## [1] 120 2
x = text.clean(data$text) # applying func defined above to pre-process text corpus
x = removeWords(x,stopwords) # removing stopwords created above
x = stripWhitespace(x) # removing white space
tok_fun = word_tokenizer # using word & not space tokenizers
it_0 = itoken( x,
#preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = T)
vocab = create_vocabulary(it_0, # func collects unique terms & corresponding statistics
ngram = c(2L, 2L) #,
#stopwords = stopwords
)
# length(vocab); str(vocab) # view what vocab obj is like
pruned_vocab = prune_vocabulary(vocab, # filters input vocab & throws out v frequent & v infrequent terms
term_count_min = 10)
vectorizer = vocab_vectorizer(pruned_vocab) # creates a text vectorizer func used in constructing a dtm/tcm/corpus
dtm_0 = create_dtm(it_0, vectorizer) # high-level function for creating a document-term matrix
# Sort bi-grams in decreasing order of frequency
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum)))   # sum of frequencies for each bi-gram term
tsum = tsum[order(tsum, decreasing = T),]
text2 = x
text2 = paste("", text2, "")   # pad with spaces so whole-phrase matching below also works at string boundaries
pb <- txtProgressBar(min = 1, max = (length(tsum)), style = 3) ; i = 0
for (term in names(tsum)){
  i = i + 1
  focal.term = gsub("_", " ", term)   # the vocabulary stores bi-grams with "_"; recover the space-separated phrase
  replacement.term = term
  text2 = gsub(paste("", focal.term, ""), paste("", replacement.term, ""), text2)   # fuse the phrase into a single token
  setTxtProgressBar(pb, i)
}
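To see what the loop above does: each frequent bi-gram replaces its space-separated form in the text, so it is later treated as a single token. A tiny illustration on a made-up phrase (not run on the actual corpus):
gsub(" leonardo dicaprio ", " leonardo_dicaprio ",
     " leonardo dicaprio delivers a strong performance ")
# returns " leonardo_dicaprio delivers a strong performance "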
it_m = itoken(text2, # function creates iterators over input objects to vocabularies, corpora, DTM & TCM matrices
# preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = T)
vocab = create_vocabulary(it_m # vocab func collects unique terms and corresponding statistics
# ngram = c(2L, 2L),
#stopwords = stopwords
)
# length(vocab); str(vocab) # view what vocab obj is like
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 1)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_m = create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1] 120 4425
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
a0 = (apply(dtm, 1, sum) > 0) # build vector to identify non-empty docs
dtm = dtm[a0,]
dtm = dtm[,order(apply(dtm, 2, sum), decreasing = T)] # sorting dtm's columns in decreasing order of column sums
inspect(dtm[1:5, 1:5])
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 18/7
## Sparsity : 28%
## Maximal term length: 9
## Weighting : term frequency (tf)
##
## Terms
## Docs movie film inception dream nolan
## 1 5 1 2 1 3
## 2 0 15 1 0 1
## 3 5 9 2 0 2
## 4 5 0 3 0 5
## 5 0 6 4 0 1
tst = round(ncol(dtm)/100)   # divide the DTM's columns into ~100 manageable chunks
a = rep(tst, 99)
b = cumsum(a); rm(a)
b = c(0, b, ncol(dtm))
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
  tempdtm = dtm[, (b[i]+1):(b[i+1])]
  s = colSums(as.matrix(tempdtm))   # dense conversion only chunk by chunk
  ss.col = c(ss.col, s)
  #print(i)
}
tsum = ss.col
tsum = tsum[order(tsum, decreasing = T)] #terms in decreasing order of freq
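The chunked loop above avoids converting the entire sparse DTM to a dense matrix at once. Since the slam package is already loaded, the same column sums can also be computed directly on the sparse representation, a simpler alternative assuming dtm is a tm DocumentTermMatrix (which is backed by a simple_triplet_matrix):
# memory-friendly column sums on the sparse DTM
tsum.alt = sort(slam::col_sums(dtm), decreasing = TRUE)
# head(tsum.alt, 10)   # top 10 tokens by raw frequency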
## show bar plot for the top tokens by raw term frequency
#windows() # New plot window
test = as.data.frame(round(tsum[1:15], 0))   # top 15 tokens and their counts (needed before plotting)
ggplot(test, aes(x = rownames(test), y = test)) +
  geom_bar(stat = "identity", fill = "Blue") +
  geom_text(aes(label = test), vjust = -0.20) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
#dev.off() # [graphical] device off / close it down
require(textir) || install.packages("textir")
## Loading required package: textir
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
## [1] TRUE
library(textir)
dtm.tfidf = tfidf(dtm, normalize=FALSE)
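For intuition, TF-IDF down-weights terms that appear in many documents and up-weights terms concentrated in a few. A minimal hand-rolled sketch of the standard weighting follows; the toy matrix and its values are made up, and the exact scaling used by textir::tfidf may differ:
# toy term-frequency matrix: 3 docs x 2 terms (illustrative values only)
tf = matrix(c(5, 0, 2,
              1, 1, 1), nrow = 3,
            dimnames = list(paste0("doc", 1:3), c("inception", "movie")))
idf = log(nrow(tf) / colSums(tf > 0))   # inverse document frequency
tf %*% diag(idf)                        # tf-idf: "movie", present in every doc, gets idf = 0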
tst = round(ncol(dtm.tfidf)/100)
a = rep(tst, 99)
b = cumsum(a); rm(a)
b = c(0, b, ncol(dtm.tfidf))
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
  tempdtm = dtm.tfidf[, (b[i]+1):(b[i+1])]
  s = colSums(as.matrix(tempdtm))
  ss.col = c(ss.col, s)
  #print(i)
}
tsum = ss.col
tsum = tsum[order(tsum, decreasing = T)]
#windows() # New plot window
wordcloud(names(tsum), tsum, scale=c(1,0.5),1, max.words=200,colors=brewer.pal(8, "Dark2")) # Plot results in a word cloud
title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")
#as.matrix(tsum[1:20]) # to see the top few tokens & their tf-idf scores
#(dtm.tfidf)[1:10, 1:10] # view first 10x10 cells in the DTM under TF IDF.
# plot barchart for top tokens
test = as.data.frame(round(tsum[1:15],0))
#windows() # New plot window
ggplot(test, aes(x = rownames(test), y = test)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = test), vjust= -0.20) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
#dev.off()
vectorizer = vocab_vectorizer(pruned_vocab,
grow_dtm = FALSE,
skip_grams_window = 5L)
tcm = create_tcm(it_m, vectorizer) # func to build a TCM
tcm.mat = as.matrix(tcm) # use tcm.mat[1:5, 1:5] to view
adj.mat = tcm.mat + t(tcm.mat) # since adjacency matrices are symmetric
z = order(colSums(adj.mat), decreasing = T)
adj.mat = adj.mat[z,z]
# Plot a simple term co-occurrence graph
adj = adj.mat[1:30,1:30]
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:rvest':
##
## %>%
## The following object is masked from 'package:stringr':
##
## %>%
## The following objects are masked from 'package:text2vec':
##
## %>%, normalize
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
cog = graph.adjacency(adj, mode = 'undirected')
cog = simplify(cog)
cog = delete.vertices(cog, V(cog)[ degree(cog) == 0 ])
#windows()
plot(cog)
## Step 2d - a cleaned up or ‘distilled’ COG plot
distill.cog = function(mat1,  # input TCM / adjacency matrix
                       title, # title for the graph
                       s,     # no. of central nodes
                       k1){   # max no. of connections per central node
  library(igraph)
  a = colSums(mat1)   # collect column sums into a vector
  b = order(-a)       # order the vector in decreasing order
  mat2 = mat1[b, b]   # reorder both rows and columns along b
  diag(mat2) = 0
  ## go row by row and keep only the top k1 adjacencies ##
  wc = NULL
  for (i1 in 1:s){
    thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
    mat2[i1, mat2[i1,] < thresh1] = 0   # zero out connections below the k1-th strongest
    mat2[i1, mat2[i1,] > 0 ] = 1
    word = names(mat2[i1, mat2[i1,] > 0])
    mat2[(i1+1):nrow(mat2), match(word, colnames(mat2))] = 0
    wc = c(wc, word)
  } # i1 loop ends
  mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
  ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]   # drop any NAs from the list
  mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
  graph <- graph.adjacency(mat4, mode = "undirected", weighted = T)   # create the network object
  graph = simplify(graph)
  V(graph)$color[1:s] = "green"
  V(graph)$color[(s+1):length(V(graph))] = "pink"
  graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ])   # drop singleton nodes
  plot(graph,
       layout = layout.kamada.kawai,
       main = title)
} # func ends
#windows()
distill.cog(tcm.mat, 'Distilled COG', 10, 5)
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length
## adj.mat and distilled cog for tfidf DTMs ##
adj.mat = t(dtm.tfidf) %*% dtm.tfidf
diag(adj.mat) = 0
a0 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a0[1:50], a0[1:50]])
#windows()
distill.cog(adj.mat, 'Distilled COG', 10, 10)
Polarity = NULL
RatingCol = NULL
for(i in 1:nrow(df)) {
  row <- df[i,]
  # process one review at a time
  #print(row$reviews)
  #print(length(row$reviews))
  text = row$reviews
  #print(text)
  data = data.frame(id = 1:length(row$reviews),   # creating doc IDs if a name is not given
                    text = row$reviews,
                    stringsAsFactors = F)
  #dim(data)
  #print(data)
  x = text.clean(data$text)       # apply the pre-processing function defined above
  x = removeWords(x, stopwords)   # remove the stopwords created above
  x = stripWhitespace(x)
  #print(x)
  #dtm = as.DocumentTermMatrix(x, weighting = weightTf)
  #dtm
  #a0 = (apply(dtm, 1, sum) > 0)   # build vector to identify non-empty docs
  #dtm = dtm[a0,]
  #dtm
  require(qdap) || install.packages("qdap")   # ensure Java is up to date!
  library(qdap)
  #x1 = x[a0]                      # remove empty docs from corpus
  pol = polarity(x)                # calculate polarity from the qdap dictionary
  pol
  wc = pol$all[,2]                 # word count in each doc
  wc
  val = pol$all[,3]                # polarity score
  p = pol$all[,4]                  # positive words info
  n = pol$all[,5]                  # negative words info
  dd = as.numeric(row$ratings)     # (unused)
  Polarity = c(Polarity, val)
  pos = regexpr(pattern="/", row$ratings)[1] - 1      # position just before the "/" in e.g. "8/10"
  pos
  rat.rev = as.numeric(substr(row$ratings, 0, pos))   # numeric part of the rating
  rat.rev
  RatingCol = c(RatingCol, rat.rev)
}
## Loading required package: qdap
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
## The following object is masked from 'package:data.table':
##
## shift
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:igraph':
##
## %>%, diversity
## The following object is masked from 'package:Matrix':
##
## %&%
## The following object is masked from 'package:rvest':
##
## %>%
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:stringr':
##
## %>%
## The following object is masked from 'package:text2vec':
##
## %>%
## The following object is masked from 'package:base':
##
## Filter
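The numeric rating can also be pulled out of strings such as "8/10" in one vectorized step, which avoids the per-row regexpr/substr work inside the loop. A sketch, assuming df$ratings holds "x/10"-style strings as built above:
# vectorized alternative to the regexpr/substr logic in the loop
RatingCol.alt = as.numeric(sub("/.*$", "", as.character(df$ratings)))
# all.equal(RatingCol, RatingCol.alt)   # should be TRUE if both approaches agree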
f = cor(Polarity, RatingCol, use = "everything", method = "pearson")   # Pearson correlation between review polarity and numeric rating
print(f)
## [1] 0.6851005
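To visualise this relationship, a quick sketch (using the already-loaded ggplot2) plots each review's polarity score against its numeric rating, with a fitted regression line:
#windows()
ggplot(data.frame(rating = RatingCol, polarity = Polarity),
       aes(x = rating, y = polarity)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "IMDb rating (out of 10)", y = "qdap polarity score")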
From these statistics, the correlation between the user rating and the polarity of the reviews is about 0.68, which indicates a strong positive relationship.
Based on the co-occurrence graphs above:
The good aspects of the movie are Leonardo DiCaprio's performance, the story, and the making of the movie.
A sequel should concentrate more on the characters, the action and the story; the star cast also plays an important role. Lengthy scenes should be avoided, as viewers may get bored during them, and in this movie lengthy scenes and too many characters had a negative impact. The sequel should include action and visual effects along the lines of The Dark Knight, continue to emphasise the actors' performances and the craft of film-making, and retain Leonardo DiCaprio as the lead actor, which would add great value to the movie.
Addressing these points in the next sequel would help the movie connect closely with its audience, resulting in good profits for the producers and a stronger reputation for the director.