Imagine you’re a Data Scientist / consultant for a movie studio. Your brief is to recommend the top 2-3 movie aspects or attributes the studio should focus on in making a sequel.
The aim is to explore, through trial and error, different configurations of possibilities (e.g., which stop-words yield the most meaning? TF or TF-IDF weighting? etc.) in the text analysis of a simple corpus.
Load the required libraries (installing any that are not already present), and define some common helper functions that will be used in the code below.
require(rvest) || install.packages('rvest')
require(RSelenium) || install.packages('RSelenium')
require(text2vec) || install.packages('text2vec')
require(data.table) || install.packages('data.table')
require(stringr) || install.packages('stringr')
require(tm) || install.packages('tm')
require(RWeka) || install.packages('RWeka')
require(tokenizers) || install.packages('tokenizers')
require(slam) || install.packages('slam')
require(wordcloud) || install.packages('wordcloud')
require(ggplot2) || install.packages('ggplot2')
require(XML) || install.packages('XML')
require(qdap) || install.packages("qdap") # ensure java is up to date!
require(textir) || install.packages("textir")
require(igraph) || install.packages("igraph")
library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(rvest)
library(ggplot2)
library(rvest)
library(XML)
library(qdap)
library(textir)
library(igraph)
rm(list=ls())
text.clean = function(x) # text data
{ require("tm")
x = gsub("<.*?>", " ", x) # regex for removing HTML tags
x = iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
x = gsub("[^[:alnum:]]", " ", x) # keep only alpha numeric
x = tolower(x) # convert to lower case characters
x = removeNumbers(x) # removing numbers
x = stripWhitespace(x) # removing white space
x = gsub("^\\s+|\\s+$", "", x) # remove leading and trailing white space
return(x)
}
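A quick check of what text.clean does to a raw snippet (a minimal example; the input string is made up for illustration):
text.clean("<p>The movie was GREAT!!! 10/10</p>") # tags, punctuation and numbers stripped, case lowered -> "the movie was great"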
find.polarity = function(review.nodes, rating, df) # compute polarity per review and append (Rating, Polarity) rows to df
{
for (a in 1:length(rating))
{
data = html_text(review.nodes[a]) # extract the review text for this document
x = text.clean(data) # pre-process text corpus
x = removeWords(x,stopwords) # removing stopwords created above
x = stripWhitespace(x) # removing white space
# x = stemDocument(x)
#--------------------------------------------------------#
###### Create DTM using text2vec package #
#--------------------------------------------------------#
tok_fun = word_tokenizer
it_m = itoken(x,
# preprocessor = text.clean,
tokenizer = tok_fun,
ids = as.character(a)) # one review per iteration, so a single id
vocab = create_vocabulary(it_m
# ngram = c(2L, 2L),
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 1)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_m = create_dtm(it_m, vectorizer)
dim(dtm_m)
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
a0 = (apply(dtm, 1, sum) > 0) # build vector to identify non-empty docs
dtm = dtm[a0,] # drop empty docs
#--------------------------------------------------------#
# Sentiment Analysis #
#--------------------------------------------------------#
x1 = x[a0] # remove empty docs from corpus
pol = polarity(x1) # Calculate the polarity from qdap dictionary
wc = pol$all[,2] # Word Count in each doc
val = pol$all[,3] # average polarity score
p = pol$all[,4] # Positive words info
n = pol$all[,5] # Negative Words info
new.row <- data.frame(Rating = c(as.numeric(rating[a])), Polarity = c(pol$group$ave.polarity))
df <- rbind(df, new.row)
}
return (df)
}
Go to IMDB and extract 100 reviews (50 positive and 50 negative) for your favourite movie.
The data is scraped for the Bollywood movie Taare Zameen Par, and the result is saved to the C: drive as "Taare Zameen Par.txt".
#--------------------------------------------------------#
# Go to IMDB and scrape 50 positive and 50 negative reviews of the movie Taare Zameen Par #
#--------------------------------------------------------#
counts = c(0,10,20,30,40) # page offsets: 10 reviews per page x 5 pages = 50 reviews per filter
reviews = NULL
for (j in counts)
{
url1 = paste0("http://www.imdb.com/title/tt0986264/reviews?filter=love;filter=love;start=",j)
url2 = paste0("http://www.imdb.com/title/tt0986264/reviews?filter=hate;filter=hate;start=",j)
page1 = read_html(url1)
page2 = read_html(url2)
reviews1 = html_text(html_nodes(page1,'#tn15content p'))
reviews2 = html_text(html_nodes(page2,'#tn15content p'))
reviews.positive = setdiff(reviews1, c("*** This review may contain spoilers ***","Add another review"))
reviews.negative = setdiff(reviews2, c("*** This review may contain spoilers ***","Add another review"))
reviews = c(reviews,reviews.positive,reviews.negative)
}
reviews = gsub("\n",' ',reviews)
writeLines(reviews,'C:/Taare Zameen Par.txt')
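A quick sanity check on how many review paragraphs were captured (a small sketch; the exact count may vary with IMDB's page layout):
length(reviews) # expect roughly 50 positive + 50 negative review paragraphs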
Pre-process the data and create a Document-Term Matrix (DTM). Check word clouds and COGs under both TF and TF-IDF weighting schemes to see which configurations appear most meaningful / informative. Iterate by updating the stop-words list, etc.
The stop-word list has been updated with words that carry little meaning in this context (e.g., ishaan, aamir). In this step we also perform sentiment analysis and extract the positive and negative words, along with each document's polarity score.
#--------------------------------------------------------#
# Step 1 - Reading text data #
#--------------------------------------------------------#
temp.text = readLines(file.choose()) # load Taare Zameen Par
data = data.frame(id = 1:length(temp.text), text = temp.text, stringsAsFactors = F)
# Read Stopwords list
stpw1 = readLines(file.choose()) # read-in stopwords.txt
stpw2 = tm::stopwords('english') # tm package stop word list; tokenizer package has the same name function
comn = unique(c(stpw1, stpw2)) # Union of two list
stopwords = unique(gsub("'"," ",comn)) # final stop-word list, with apostrophes replaced by spaces
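The domain-specific words visible at the end of the printed list below (movie, film, ishaan, aamir, etc.) came from the stopwords.txt file; if your copy of the file lacks them, they can be appended in code (a minimal sketch):
stopwords = unique(c(stopwords, c("movie", "film", "ishaan", "aamir", "khan", "amir"))) # append domain-specific stop-words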
print ('List of stopwords')
## [1] "List of stopwords"
print (stopwords)
## [1] "a" "a s" "able" "about"
## [5] "above" "according" "accordingly" "across"
## [9] "actually" "after" "afterwards" "again"
## [13] "against" "ain t" "all" "allow"
## [17] "allows" "almost" "alone" "along"
## [21] "already" "also" "although" "always"
## [25] "am" "among" "amongst" "an"
## [29] "and" "another" "any" "anybody"
## [33] "anyhow" "anyone" "anything" "anyway"
## [37] "anyways" "anywhere" "apart" "appear"
## [41] "appreciate" "appropriate" "are" "aren t"
## [45] "around" "as" "aside" "ask"
## [49] "asking" "associated" "at" "available"
## [53] "away" "awfully" "b" "be"
## [57] "became" "because" "become" "becomes"
## [61] "becoming" "been" "before" "beforehand"
## [65] "behind" "being" "believe" "below"
## [69] "beside" "besides" "best" "better"
## [73] "between" "beyond" "both" "brief"
## [77] "but" "by" "c" "c mon"
## [81] "c s" "came" "can" "can t"
## [85] "cannot" "cant" "cause" "causes"
## [89] "certain" "certainly" "changes" "clearly"
## [93] "co" "com" "come" "comes"
## [97] "concerning" "consequently" "consider" "considering"
## [101] "contain" "containing" "contains" "corresponding"
## [105] "could" "couldn t" "course" "currently"
## [109] "d" "definitely" "described" "despite"
## [113] "did" "didn t" "different" "do"
## [117] "does" "doesn t" "doing" "don t"
## [121] "done" "down" "downwards" "during"
## [125] "e" "each" "edu" "eg"
## [129] "eight" "either" "else" "elsewhere"
## [133] "enough" "entirely" "especially" "et"
## [137] "etc" "even" "ever" "every"
## [141] "everybody" "everyone" "everything" "everywhere"
## [145] "ex" "exactly" "example" "except"
## [149] "f" "far" "few" "fifth"
## [153] "first" "five" "followed" "following"
## [157] "follows" "for" "former" "formerly"
## [161] "forth" "four" "from" "further"
## [165] "furthermore" "g" "get" "gets"
## [169] "getting" "given" "gives" "go"
## [173] "goes" "going" "gone" "got"
## [177] "gotten" "greetings" "h" "had"
## [181] "hadn t" "happens" "hardly" "has"
## [185] "hasn t" "have" "haven t" "having"
## [189] "he" "he s" "hello" "help"
## [193] "hence" "her" "here" "here s"
## [197] "hereafter" "hereby" "herein" "hereupon"
## [201] "hers" "herself" "hi" "him"
## [205] "himself" "his" "hither" "hopefully"
## [209] "how" "howbeit" "however" "i"
## [213] "i d" "i ll" "i m" "i ve"
## [217] "ie" "if" "ignored" "immediate"
## [221] "in" "inasmuch" "inc" "indeed"
## [225] "indicate" "indicated" "indicates" "inner"
## [229] "insofar" "instead" "into" "inward"
## [233] "is" "isn t" "it" "it d"
## [237] "it ll" "it s" "its" "itself"
## [241] "j" "just" "k" "keep"
## [245] "keeps" "kept" "know" "knows"
## [249] "known" "l" "last" "lately"
## [253] "later" "latter" "latterly" "least"
## [257] "less" "lest" "let" "let s"
## [261] "like" "liked" "likely" "little"
## [265] "look" "looking" "looks" "ltd"
## [269] "m" "mainly" "many" "may"
## [273] "maybe" "me" "mean" "meanwhile"
## [277] "merely" "might" "more" "moreover"
## [281] "most" "mostly" "much" "must"
## [285] "my" "myself" "n" "name"
## [289] "namely" "nd" "near" "nearly"
## [293] "necessary" "need" "needs" "neither"
## [297] "never" "nevertheless" "new" "next"
## [301] "nine" "no" "nobody" "non"
## [305] "none" "noone" "nor" "normally"
## [309] "not" "nothing" "novel" "now"
## [313] "nowhere" "o" "obviously" "of"
## [317] "off" "often" "oh" "ok"
## [321] "okay" "old" "on" "once"
## [325] "one" "ones" "only" "onto"
## [329] "or" "other" "others" "otherwise"
## [333] "ought" "our" "ours" "ourselves"
## [337] "out" "outside" "over" "overall"
## [341] "own" "p" "particular" "particularly"
## [345] "per" "perhaps" "placed" "please"
## [349] "plus" "possible" "presumably" "probably"
## [353] "provides" "q" "que" "quite"
## [357] "qv" "r" "rather" "rd"
## [361] "re" "really" "reasonably" "regarding"
## [365] "regardless" "regards" "relatively" "respectively"
## [369] "right" "s" "said" "same"
## [373] "saw" "say" "saying" "says"
## [377] "second" "secondly" "see" "seeing"
## [381] "seem" "seemed" "seeming" "seems"
## [385] "seen" "self" "selves" "sensible"
## [389] "sent" "serious" "seriously" "seven"
## [393] "several" "shall" "she" "should"
## [397] "shouldn t" "since" "six" "so"
## [401] "some" "somebody" "somehow" "someone"
## [405] "something" "sometime" "sometimes" "somewhat"
## [409] "somewhere" "soon" "sorry" "specified"
## [413] "specify" "specifying" "still" "sub"
## [417] "such" "sup" "sure" "t"
## [421] "t s" "take" "taken" "tell"
## [425] "tends" "th" "than" "thank"
## [429] "thanks" "thanx" "that" "that s"
## [433] "thats" "the" "their" "theirs"
## [437] "them" "themselves" "then" "thence"
## [441] "there" "there s" "thereafter" "thereby"
## [445] "therefore" "therein" "theres" "thereupon"
## [449] "these" "they" "they d" "they ll"
## [453] "they re" "they ve" "think" "third"
## [457] "this" "thorough" "thoroughly" "those"
## [461] "though" "three" "through" "throughout"
## [465] "thru" "thus" "to" "together"
## [469] "too" "took" "toward" "towards"
## [473] "tried" "tries" "truly" "try"
## [477] "trying" "twice" "two" "u"
## [481] "un" "under" "unfortunately" "unless"
## [485] "unlikely" "until" "unto" "up"
## [489] "upon" "us" "use" "used"
## [493] "useful" "uses" "using" "usually"
## [497] "uucp" "v" "value" "various"
## [501] "very" "via" "viz" "vs"
## [505] "w" "want" "wants" "was"
## [509] "wasn t" "way" "we" "we d"
## [513] "we ll" "we re" "we ve" "welcome"
## [517] "well" "went" "were" "weren t"
## [521] "what" "what s" "whatever" "when"
## [525] "whence" "whenever" "where" "where s"
## [529] "whereafter" "whereas" "whereby" "wherein"
## [533] "whereupon" "wherever" "whether" "which"
## [537] "while" "whither" "who" "who s"
## [541] "whoever" "whole" "whom" "whose"
## [545] "why" "will" "willing" "wish"
## [549] "with" "within" "without" "won t"
## [553] "wonder" "would" "wouldn t" "x"
## [557] "y" "yes" "yet" "you"
## [561] "you d" "you ll" "you re" "you ve"
## [565] "your" "yours" "yourself" "yourselves"
## [569] "z" "zero" "movie" "film"
## [573] "ishaan" "aamir" "khan" "amir"
## [577] "she s" "he d" "she d" "he ll"
## [581] "she ll" "shan t" "mustn t" "when s"
## [585] "why s" "how s"
x = text.clean(data$text) # pre-process text corpus
x = removeWords(x,stopwords) # removing stopwords created above
x = stripWhitespace(x) # removing white space
#--------------------------------------------------------#
###### Create DTM using text2vec package #
#--------------------------------------------------------#
t1 = Sys.time()
tok_fun = word_tokenizer
it_m = itoken(x,
tokenizer = tok_fun,
ids = data$id)
vocab = create_vocabulary(it_m)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 1)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_m = create_dtm(it_m, vectorizer)
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
a0 = (apply(dtm, 1, sum) > 0) # build vector to identify non-empty docs
dtm = dtm[a0,] # drop empty docs
# view a sample of the DTM, sorted from most to least frequent tokens
dtm = dtm[,order(apply(dtm, 2, sum), decreasing = T)] # sorting dtm's columns in decreasing order of column sums
inspect(dtm[1:5, 1:5]) # inspect() func used to view parts of a DTM object
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 8/17
## Sparsity : 68%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs child parents good school teacher
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 5 3 3 3 2
## 4 1 0 1 2 0
## 5 0 0 0 0 0
#--------------------------------------------------------#
## Step 2a: # Build word cloud #
#--------------------------------------------------------#
# 1- Using Term frequency(tf)
tst = round(ncol(dtm)/100) # divide DTM's cols into ~100 manageable chunks
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
tempdtm = dtm[,(b[i]+1):(b[i+1])]
s = colSums(as.matrix(tempdtm))
ss.col = c(ss.col,s)
}
tsum = ss.col
tsum = tsum[order(tsum, decreasing = T)] #terms in decreasing order of freq
head(tsum)
## child parents good school teacher story
## 123 101 100 95 79 79
tail(tsum)
## rangeela actions booth sicknesses ruin century
## 1 1 1 1 1 1
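The chunked loop above avoids coercing the whole sparse DTM to a dense matrix in one go; since the slam package is already loaded, slam::col_sums() is a one-line alternative (a sketch that should give the same totals; tsum_alt is an illustrative name):
tsum_alt = sort(slam::col_sums(dtm), decreasing = TRUE) # column sums computed directly on the sparse DTM
head(tsum_alt)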
windows() # New plot window
wordcloud(names(tsum), tsum, # words, their freqs
scale = c(4, 0.5), # range of word sizes
1, # min.freq of words to consider
max.words = 200, # max #words
colors = brewer.pal(8, "Dark2")) # Plot results in a word cloud
title(sub = "Term Frequency - Wordcloud") # title for the wordcloud display
# plot barchart for top tokens
test = data.frame(term = names(tsum)[1:15], freq = round(tsum[1:15], 0)) # top 15 terms by frequency
windows() # New plot window
ggplot(test, aes(x = term, y = freq)) +
geom_bar(stat = "identity", fill = "blue") +
geom_text(aes(label = freq), vjust= -0.20) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
dev.off() # [graphical] device off / close it down
## png
## 2
# -------------------------------------------------------------- #
# step 2b - Using Term frequency inverse document frequency (tfidf)
# -------------------------------------------------------------- #
dtm.tfidf = tfidf(dtm, normalize=TRUE)
tst = round(ncol(dtm.tfidf)/100)
a = rep(tst, 99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm.tfidf))
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
tempdtm = dtm.tfidf[,(b[i]+1):(b[i+1])]
s = colSums(as.matrix(tempdtm))
ss.col = c(ss.col,s)
}
tsum = ss.col
tsum = tsum[order(tsum, decreasing = T)] #terms in decreasing order of freq
head(tsum)
## good child movies parents kid school
## 0.9444900 0.9094940 0.8167724 0.8007779 0.7878499 0.7635032
tail(tsum)
## wisest distinguishes sore visit misspells
## 0.009941542 0.009941542 0.009941542 0.009941542 0.009941542
## suggests
## 0.009941542
windows() # New plot window
wordcloud(names(tsum), tsum, scale=c(4,0.5),1, max.words=200,colors=brewer.pal(8, "Dark2")) # Plot results in a word cloud
title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")
# plot barchart for top tokens
test = data.frame(term = names(tsum)[1:15], score = round(tsum[1:15], 2)) # TF-IDF scores are < 1, so keep 2 decimals
windows() # New plot window
ggplot(test, aes(x = term, y = score)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = score), vjust= -0.20) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
dev.off()
## png
## 2
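The tfidf() call above comes from the textir package; if it is unavailable in your installed version, text2vec's TfIdf class is an alternative that produces a normalized TF-IDF matrix (a sketch, reusing the dtm_m object created earlier with create_dtm; tfidf_model and dtm_m_tfidf are illustrative names):
tfidf_model = TfIdf$new() # text2vec TF-IDF model
dtm_m_tfidf = tfidf_model$fit_transform(dtm_m) # normalization details may differ slightly from textir's tfidf()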
#------------------------------------------------------#
# step 2c - Term Co-occurance Matrix (TCM) #
#------------------------------------------------------#
vectorizer = vocab_vectorizer(pruned_vocab,
grow_dtm = FALSE,
skip_grams_window = 5L)
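# note: in newer text2vec versions, skip_grams_window is passed to create_tcm() rather than vocab_vectorizer()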
tcm = create_tcm(it_m, vectorizer) # func to build a TCM
tcm.mat = as.matrix(tcm) # use tcm.mat[1:5, 1:5] to view
adj.mat = tcm.mat + t(tcm.mat) # since adjacency matrices are symmetric
z = order(colSums(adj.mat), decreasing = T)
adj.mat = adj.mat[z,z]
# Plot Simple Term Co-occurance graph
adj = adj.mat[1:30,1:30]
cog = graph.adjacency(adj, mode = 'undirected')
cog = simplify(cog)
cog = delete.vertices(cog, V(cog)[ degree(cog) == 0 ])
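The simple COG object built above is never actually drawn in this script; a minimal plot call for it (a sketch):
windows()
plot(cog, layout = layout.kamada.kawai, main = "Simple Term Co-occurrence Graph")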
#-----------------------------------------------------------#
# Step 2d - a cleaned up or 'distilled' COG PLot #
#-----------------------------------------------------------#
distill.cog = function(mat1, # input TCM ADJ MAT
title, # title for the graph
s, # no. of central nodes
k1){ # max no. of connections
a = colSums(mat1) # collect colsums into a vector obj a
b = order(-a) # nice syntax for ordering vector in decr order
mat2 = mat1[b, b] # order both rows and columns along vector b
diag(mat2) = 0
wc = NULL
for (i1 in 1:s){
thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
mat2[i1, mat2[i1,] < thresh1] = 0 # zero out entries below this row's threshold
mat2[i1, mat2[i1,] > 0 ] = 1
word = names(mat2[i1, mat2[i1,] > 0])
mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
wc = c(wc,word)
} # i1 loop ends
mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))] # removed any NAs from the list
mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
graph <- graph.adjacency(mat4, mode = "undirected", weighted=T) # Create Network object
graph = simplify(graph)
V(graph)$color[1:s] = "green"
V(graph)$color[(s+1):length(V(graph))] = "pink"
graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ]) # drop isolated nodes (degree 0)
plot(graph,
layout = layout.kamada.kawai,
main = title)
} # func ends
windows()
distill.cog(tcm.mat, 'Distilled COG', 10, 5)
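# alternatively, pass the symmetric adj.mat built above (tcm.mat + t(tcm.mat)) instead of the raw, upper-triangular tcm.mat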
## adj.mat and distilled cog for tfidf DTMs ##
adj.mat = t(dtm.tfidf) %*% dtm.tfidf
diag(adj.mat) = 0
a01 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a01[1:50], a01[1:50]])
windows()
distill.cog(adj.mat, 'Distilled COG for tfidf DTMs', 10, 10)
#--------------------------------------------------------#
# Sentiment Analysis #
#--------------------------------------------------------#
x1 = x[a0] # remove empty docs from corpus
pol = polarity(x1) # Calculate the polarity from qdap dictionary
wc = pol$all[,2] # Word Count in each doc
val = pol$all[,3] # average polarity score
p = pol$all[,4] # Positive words info
n = pol$all[,5] # Negative Words info
positive_words = unique(setdiff(unlist(p),"-")) # Positive words list
negative_words = unique(setdiff(unlist(n),"-")) # Negative words list
#--------------------------------------------------------#
# Create Postive Words wordcloud #
#--------------------------------------------------------#
pos.tdm = dtm[,which(colnames(dtm) %in% positive_words)]
m = as.matrix(pos.tdm)
v = sort(colSums(m), decreasing = TRUE)
windows() # opens new image window
wordcloud(names(v), v, scale=c(4,1),1, max.words=100,colors=brewer.pal(8, "Dark2"))
title(sub = "Positive Words - Wordcloud")
# plot barchart for top tokens
test = data.frame(term = names(v)[1:15], freq = v[1:15]) # top 15 positive terms
windows() # opens new image window
ggplot(test, aes(x = term, y = freq)) +
geom_bar(stat = "identity", fill = "blue") +
geom_text(aes(label = freq), vjust= -0.20) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
#--------------------------------------------------------#
# Create Negative Words wordcloud #
#--------------------------------------------------------#
neg.tdm = dtm[,which(colnames(dtm) %in% negative_words) ]
m = as.matrix(neg.tdm)
v = sort(colSums(m), decreasing = TRUE)
windows()
wordcloud(names(v), v, scale=c(4,1),1, max.words=100,colors=brewer.pal(8, "Dark2"))
title(sub = "Negative Words - Wordcloud")
# plot barchart for top tokens
test = data.frame(term = names(v)[1:15], freq = v[1:15]) # top 15 negative terms
windows()
ggplot(test, aes(x = term, y = freq)) +
geom_bar(stat = "identity", fill = "red") +
geom_text(aes(label = freq), vjust= -0.20) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
#--------------------------------------------------------#
# Positive words vs Negative Words plot #
#--------------------------------------------------------#
len = function(x){ # count sentiment words in a doc; "-" marks none found
if (length(x) == 1 && x == "-") {return(0)}
else {return(length(unlist(x)))}
}
pcount = unlist(lapply(p, len)) # positive word count per doc
ncount = unlist(lapply(n, len)) # negative word count per doc
doc_id = seq_along(wc)
windows()
plot(doc_id,pcount,type="l",col="green",xlab = "Document ID", ylab= "Word Count")
lines(doc_id,ncount,type= "l", col="red")
title(main = "Positive words vs Negative Words" )
legend("topright", inset=.05, c("Positive Words","Negative Words"), fill=c("green","red"), horiz=TRUE)
# Document sentiment running plot
windows()
plot(pol$all$polarity, type = "l", ylab = "Polarity Score",xlab = "Document Number")
abline(h=0)
title(main = "Polarity Plot" )
### COG for sentiment-laden words ? ###
senti.dtm = cbind(pos.tdm, neg.tdm);
senti.adj.mat = as.matrix(t(senti.dtm)) %*% as.matrix(senti.dtm)
diag(senti.adj.mat) = 0
windows()
distill.cog(senti.adj.mat, # ad mat obj
'Distilled COG of senti words', # plot title
5, # max #central nodes
5) # max #connexns
Compare each review’s polarity score with its star rating. You can choose to use a simple cor() function to check correlation between the two data columns.
For this, we capture the star rating and review text for 50 positive and 50 negative reviews of Taare Zameen Par, compute the polarity score of each review, compute the correlation between rating and polarity, and finally plot rating versus polarity.
#--------------------------------------------------------#
# Polarity Vs Rating and Correlation between them #
#--------------------------------------------------------#
df <- data.frame("Rating"=numeric(),"Polarity"=numeric())
for (j in counts)
{
url1 = paste0("http://www.imdb.com/title/tt0986264/reviews?filter=love;filter=love;start=",j)
page1 = read_html(url1)
movie.nodes = html_nodes(page1, 'h2 + img')
tmp = html_attr(movie.nodes, name='alt')
rating = substr(tmp, 0, regexpr('/', tmp)-1)
review.nodes = html_nodes(page1, '#tn15content div+ p')
df <- find.polarity(review.nodes, rating, df)
url2 = paste0("http://www.imdb.com/title/tt0986264/reviews?filter=hate;filter=hate;start=",j)
page2 = read_html(url2)
movie.nodes = html_nodes(page2, 'h2 + img')
tmp = html_attr(movie.nodes, name='alt')
rating = substr(tmp, 0, regexpr('/', tmp)-1)
review.nodes = html_nodes(page2, '#tn15content div+ p')
df <- find.polarity(review.nodes, rating, df)
}
df_data <- df[, sapply(df, is.numeric)]
cor(df_data, use = "complete.obs", method = "pearson")
## Rating Polarity
## Rating 1.0000000 0.2073642
## Polarity 0.2073642 1.0000000
plot(df)
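Beyond the default pairs plot, a simple scatter of Rating against Polarity with a fitted line makes the weak positive relationship easier to see (a sketch using base graphics):
windows()
plot(df$Rating, df$Polarity, xlab = "Star Rating", ylab = "Polarity Score", main = "Polarity vs Rating")
abline(lm(Polarity ~ Rating, data = df), col = "blue") # linear fit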
Now, make a recommendation. What movie attributes or aspects (e.g., plot? star cast? length? etc.) worked well, which the studio should retain? Which ones didn’t work well and which the studio should change?
The star cast worked well: the lead actor's name (Aamir Khan) and the child character's name (Ishaan) appear many times and rank among the highest terms under both TF and TF-IDF. The reviews talk about parents, the kid, school, teachers and society, and emotional keywords also show up. In the Polarity vs Rating comparison, the correlation is positive (about 0.21).
The recommendation would be to make a movie that again centres on a child, or on some kind of condition or disability, since the data above suggests audiences respond emotionally to these themes. Such films may also find it easier to get support from the government.