R Markdown

Text Analysis of a Simple Set of Documents

Sree Kashyap Addanki

Extract reviews of the movie 12 Angry Men (1957) from IMDb

#####################################
# Extract 12 Angry Men (1957) reviews from IMDb
counts = c(0,10,20,30,40,50,60,70,80,90)
ratings.df = NULL
reviews = NULL
for (j in counts){
    url1 = paste0("http://www.imdb.com/title/tt0050083/reviews?filter=love;filter=love;start=",j) # positive reviews
    url2 = paste0("http://www.imdb.com/title/tt0050083/reviews?filter=hate;filter=hate;start=",j) # negative reviews 
    page1 = read_html(url1)
    page2 = read_html(url2)
    

    
    reviews1 = html_text( html_nodes( page1,'#tn15content p') )
    reviews2 = html_text( html_nodes( page2,'#tn15content p') )
    
    
    movie.nodes = html_nodes(page1,'h2 + img')
    rating1 = substr(html_attr(movie.nodes, name = 'alt'), 1, 2)   # positive ratings can be two digits (e.g. "10")
    
    movie.nodes = html_nodes(page2,'h2 + img')
    rating2 = substr(html_attr(movie.nodes, name = 'alt'), 1, 1)   # negative ratings are single digits
    
    ratings.df <- c(ratings.df, rating1, rating2)   # accumulate star ratings (a character vector, despite the .df name)
    
    reviews.positive = setdiff(reviews1, c("*** This review may contain spoilers ***","Add another review"))
    reviews.negative = setdiff(reviews2, c("*** This review may contain spoilers ***","Add another review"))
    
    reviews = c(reviews,reviews.positive,reviews.negative)
 
       
}
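
As a quick sanity check (a minimal sketch, not part of the original run), the collected vectors should each hold roughly 200 entries, one per scraped review:

length(reviews)      # expect roughly 200 reviews (about 100 positive + 100 negative)
length(ratings.df)   # expect one scraped star rating per review
table(ratings.df)    # distribution of the scraped star ratings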

Create a text file containing all 200 reviews: 100 positive and 100 negative.

reviews = gsub("\n",' ',reviews)
writeLines(reviews,' 12 Angry Men (1957) .txt')         

Create a function to clean the text.

text.clean = function(x)                    # text data
{ require("tm")
    x  =  gsub("<.*?>", " ", x)               # regex for removing HTML tags
    x  =  iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
    x  =  gsub("[^[:alnum:]]", " ", x)        # keep only alpha numeric 
    x  =  tolower(x)                          # convert to lower case characters
    x  =  removeNumbers(x)                    # removing numbers
    x  =  stripWhitespace(x)                  # removing white space
    x  =  gsub("^\\s+|\\s+$", "", x)          # remove leading and trailing white space
    return(x)
}
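
To illustrate what the cleaner does, here is a small example on a made-up snippet (hypothetical input, not part of the original script):

sample.raw = "<b>12 Angry Men</b> (1957) is a GREAT film!!!   10/10"
text.clean(sample.raw)
# expected result (approximately): "angry men is a great film"
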
#--------------------------------------------------------#
# Step 1 - Reading text data                             #
#--------------------------------------------------------#


temp.text = readLines('C:\\Users\\sreek\\Desktop\\Term1\\TA\\Individual Assignment\\ 12 Angry Men (1957) .txt') #12 Angry Men reviews text file
head(temp.text,1)
## [1] "  An excellent courtroom drama with a unique twist. Instead of following the trial itself, the viewer has a unique chance to observe the events behind the closed doors of a jury room. The film begins with the end of the trial. The jurors retire to deliberate the case. A preliminary vote is taken and the result is 11:1 in favour of the guilty verdict. Eleven jurors have raised their hands to convict a young man of killing his father. Only Juror #8 has doubts. At first even he does not truly believe the young man to be innocent but notes (rightfully) that the case for the defence might have been presented in a more convincing manner and that the boy might be given the benefit of a doubt. Since the boy is to be executed if found guilty his life is now in the hands of the jury and juror #8 reasons that the least they could do is talk about the case a bit. As time goes on some of the jurors change their minds and find that there is perhaps enough reasonable doubt not to convict the young man after all. But not everyone is easy to convince.Although the plot of the film is excellent and it is fascinating to see what little things can influence which way a verdict goes, where this film really succeeds is in presenting the characters of the 12 jurors. The character of each of the jurors emerges through a wonderful mix of perfect casting, excellent dialogue and near-flawless acting.Juror #1 - a simple man who clearly does not understand the full complexity of the task that lies before him but is trying to do everything not to let anyone else find this out. He appears at ease only once during the film - when he talks about football. He has the misfortune to be selected foreman of the jury - a task he clearly does not relish.Juror #2 - a small, quite man, clearly unaccustomed to giving his own opinion much less to expecting his views to be of any importance. Apparently he finds solace in his job - he is an accountant.Juror #3 - probably the most complex personality in the film. Starts off like a pleasant self-made successful businessman, he analyses the case impartially, explains his arguments well and is reasonably self assured. As time goes on he becomes more and more passionate and seems to be somehow personally involved with the case.  He also starts to show some signs of slight mental instability. Wonderfully played by Lee J. Cobb - this is the character you remember after the film is over.Juror #4 - self assured, slightly arrogant stockbroker. Obviously considers himself more intelligent than anyone else in the room, he approaches the case with cool heartless logic but (as one of the jurors says - \"this is not an exact science\") he does not take into account the feelings, the passions, the characters of the people involved in the case. He is conspicuous by the fact that he is the only juror that does not take his jacket off (it is a very hot day).Juror #5 - here is a man under great emotional stress. He comes from the same social background as the accused boy - with who he almost unwillingly seems to identify with. Paradoxically this appears one of the main reasons for him voting guilty - he does not want compassion to influence him - so ironically it does.Juror #6 - a simple man, quite readily admitting that everyone in the room is better qualified than he is to make decisions and offer explanations. But he really wants to see justice done and it worries him that he might make a mistake.Juror #7 - the only one that really has no opinion on this case. 
Literally throughout the film his thoughts are never on the case - he talks of baseball, of the heat, of fixing the fan but the only reason he has for voting this way or that is to speed things up a bit so he might be out of the jury room as soon as possible. Not an evil man he just has no sense of morality whatsoever - he can tell right from wrong but does not seem to think it's worth the bother.Juror #8- a caring man, has put more thought into the case than any of the other jurors. He tries to do his best even in the face of seemingly impossible odds.Juror #9 - a wise old man with his great life experience has quite a unique way of looking at the case.Juror #10 - the most horrifying character in the film. Votes guilty and does not even try to hide the fact that he does so only because of the boy's social background. The tragedy comes from the fact that his own social position is only a cut above the boy's - which makes him all the more eager to accentuate the difference.Juror #11 - an immigrant watchmaker, careful methodical man, well mannered and soft spoken. respects the right of people to have different opinion to his - and is willing to look at both sides of the problem. Loses his temper only once - horrified by the complete indifference of juror #7.Juror #12 - a young business type - perhaps he has his own opinions - but is careful to hide them. What he has learnt out of life seems to be that intelligence is equal with agreeing with what the majority of people think.The film succeeds in doing something very rare today - developing an intelligent plot while also developing 12 believable, memorable and distinct characters.  "
data = data.frame(id = 1:length(temp.text), text = temp.text, stringsAsFactors = F)

Check the dimensions of the data.

dim(data)
## [1] 200   2

Add uninformative, domain-specific words to the stop list. Since the movie is about a 12-member jury deciding the verdict on a young man, words such as "jury", "juror", and "man" are added to the stop list, as shown below.

# Read Stopwords list

stpw1 = readLines('C:\\Users\\sreek\\Desktop\\Term1\\TA\\Individual Assignment\\stopwords.txt')      # read in stopwords file

stpw2 = tm::stopwords('english')                   # tm package stop word list; the tokenizers package has a function of the same name
stpw3 = c("angry","man","men","film","movie","twelve","movies", "played","actors", "cast", "films","spoilers","juror","jurors","jury")
comn  = unique(c(stpw1, stpw2, stpw3))             # union of the three lists
stopwords = unique(gsub("'", " ", comn))           # final stop word list after replacing apostrophes with spaces

x  = text.clean(data$text )             # pre-process text corpus
x  =  removeWords(x,stopwords )            # removing stopwords created above
x  =  stripWhitespace(x )                  # removing white space
# x  =  stemDocument(x)

Create the DTM using the text2vec package.

#--------------------------------------------------------#
## Step 2: Create DTM using text2vec package             #
#--------------------------------------------------------#

t1 = Sys.time()

tok_fun = word_tokenizer  # using word & not space tokenizers

it_0 = itoken( x,
               #preprocessor = text.clean,
               tokenizer = tok_fun,
               ids = data$id,
               progressbar = T)

vocab = create_vocabulary(it_0,    #  func collects unique terms & corresponding statistics
                          ngram = c(2L, 2L) #,
                          #stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,  # filters input vocab & throws out v frequent & v infrequent terms
                                term_count_min = 10)

length(pruned_vocab);  str(pruned_vocab)
## [1] 5
## List of 5
##  $ vocab         :Classes 'data.table' and 'data.frame': 35 obs. of  3 variables:
##   ..$ terms       : chr [1:35] "jack_warden" "young_boy" "shut_case" "guilty_vote" ...
##   ..$ terms_counts: int [1:35] 26 12 10 10 26 10 17 10 59 50 ...
##   ..$ doc_counts  : int [1:35] 25 11 8 7 25 8 16 10 54 43 ...
##   ..- attr(*, ".internal.selfref")=<externalptr> 
##  $ ngram         : Named int [1:2] 2 2
##   ..- attr(*, "names")= chr [1:2] "ngram_min" "ngram_max"
##  $ document_count: int 200
##  $ stopwords     : chr(0) 
##  $ sep_ngram     : chr "_"
##  - attr(*, "class")= chr "text2vec_vocabulary"
vectorizer = vocab_vectorizer(pruned_vocab) #  creates a text vectorizer func used in constructing a dtm/tcm/corpus

dtm_0  = create_dtm(it_0, vectorizer) # high-level function for creating a document-term matrix
# Sort bi-gram with decreasing order of freq
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),]       # terms in decreasing order of freq
head(tsum)
##      henry_fonda         lee_cobb     sidney_lumet reasonable_doubt 
##              106               59               50               43 
##        ed_begley      black_white 
##               30               30
tail(tsum)
##   accused_guilty defendant_guilty   joseph_sweeney     john_fiedler 
##               10               10               10               10 
##    robert_webber      boy_accused 
##               10               10
#-------------------------------------------------------
# Recode bi-grams as unigrams in the cleaned text corpus

text2 = x
text2 = paste("", text2, "")          # pad each document with spaces so boundary bi-grams also match

pb <- txtProgressBar(min = 1, max = (length(tsum)), style = 3) ; i = 0

for (term in names(tsum)){
    i = i + 1
    focal.term = gsub("_", " ", term)        # replace the underscore separator with a space to recover the original bi-gram
    replacement.term = term
    text2 = gsub(paste("",focal.term,""),paste("",replacement.term,""), text2)
    setTxtProgressBar(pb, i)
}
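
A small spot-check (a hypothetical example, assuming the bi-gram henry_fonda survived pruning, which the frequency table above suggests) confirms the recoding:

sum(grepl(" henry_fonda ", text2))   # documents that now contain the fused bi-gram token
sum(grepl(" henry fonda ", text2))   # should drop to 0 after recoding
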
it_m = itoken(text2,     # function creates iterators over input objects to vocabularies, corpora, DTM & TCM matrices
              # preprocessor = text.clean,
              tokenizer = tok_fun,
              ids = data$id,
              progressbar = T)

vocab = create_vocabulary(it_m     # vocab func collects unique terms and corresponding statistics
                          # ngram = c(2L, 2L),
                          #stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 1)
vectorizer = vocab_vectorizer(pruned_vocab)

dtm_m  = create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1]  200 5229
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
a0 = (apply(dtm, 1, sum) > 0)   # build vector to identify non-empty docs
dtm = dtm[a0,]                  # drop empty docs

print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 2.785411 secs
# view a sample of the DTM, sorted from most to least frequent tokens 
dtm = dtm[,order(apply(dtm, 2, sum), decreasing = T)]     # sorting dtm's columns in decreasing order of column sums
inspect(dtm[1:5, 1:5])     # inspect() func used to view parts of a DTM object           
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 13/12
## Sparsity           : 48%
## Maximal term length: 5
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs room time case good great
##    1    4    2   11    0     2
##    2    0    0    2    0     0
##    3    2    3    0    0     0
##    4    0    2    1    0     1
##    5    0    0    2    1     2
#--------------------------------------------------------#
## Step 2a:     # Build word cloud                       #
#--------------------------------------------------------#

#   1 - Using term frequency (tf)             

tst = round(ncol(dtm)/100)  # divide DTM's cols into 100 manageable parts
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))

ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
    tempdtm = dtm[,(b[i]+1):(b[i+1])]
    s = colSums(as.matrix(tempdtm))
    ss.col = c(ss.col,s)
}

tsum = ss.col
tsum = tsum[order(tsum, decreasing = T)]       #terms in decreasing order of freq
head(tsum)
##        room        time        case        good       great henry_fonda 
##         190         124         119         117         113         106
tail(tsum)
##     steven    reflect  balancing    warming oftentimes     spouts 
##          1          1          1          1          1          1
windows()  # New plot window
wordcloud(names(tsum), tsum,     # words, their freqs 
          scale = c(4, 0.5),     # range of word sizes
          1,                     # min.freq of words to consider
          max.words = 200,       # max #words
          colors = brewer.pal(8, "Dark2"))    # Plot results in a word cloud 
title(sub = "Term Frequency - Wordcloud")     # title for the wordcloud display

# plot barchart for top tokens
test = as.data.frame(round(tsum[1:15],0))
colnames(test) = "freq"        # name the frequency column so ggplot can map it directly

windows()  # New plot window
ggplot(test, aes(x = rownames(test), y = freq)) + 
    geom_bar(stat = "identity", fill = "Blue") +
    geom_text(aes(label = freq), vjust= -0.20) + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

# -------------------------------------------------------------- #
# step 2b - Using Term frequency inverse document frequency (tfidf)             
# -------------------------------------------------------------- #

library(textir)
## Warning: package 'textir' was built under R version 3.3.2
## Loading required package: distrom
## Warning: package 'distrom' was built under R version 3.3.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:qdap':
## 
##     %&%
## Loading required package: gamlr
## Warning: package 'gamlr' was built under R version 3.3.2
## Loading required package: parallel
dtm.tfidf = tfidf(dtm, normalize= FALSE)

tst = round(ncol(dtm.tfidf)/100)
a = rep(tst, 99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm.tfidf))

ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
    tempdtm = dtm.tfidf[,(b[i]+1):(b[i+1])]
    s = colSums(as.matrix(tempdtm))
    ss.col = c(ss.col,s)

}

tsum = ss.col

tsum = tsum[order(tsum, decreasing = T)]       #terms in decreasing order of freq
head(tsum)
##     case    great    fonda     good      boy   people 
## 139.3708 123.5796 120.1981 117.9194 114.8574 113.6047
tail(tsum)
##     steven    reflect  balancing    warming oftentimes     spouts 
##    4.60517    4.60517    4.60517    4.60517    4.60517    4.60517
windows()  
wordcloud(names(tsum), tsum, scale=c(4,0.5),1, max.words=200,colors=brewer.pal(8, "Dark2")) # Plot results in a word cloud 
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words = 200, :
##   many terms (e.g. sidney_lumet, henry_fonda, reasonable_doubt, black_white, room, doubt)
##   could not be fit on page and were not plotted. (Repeated warnings condensed.)
title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")

as.matrix(tsum[1:20])     #  view the top few tokens & their TF-IDF scores
##                 [,1]
## case       139.37077
## great      123.57960
## fonda      120.19809
## good       117.91938
## boy        114.85743
## people     113.60475
## guilty     112.31175
## time       112.07966
## room       110.16551
## evidence   109.34743
## number     108.79122
## characters 106.43161
## made       104.56972
## watch      104.49503
## story      103.72467
## make        97.52180
## murder      96.83839
## character   95.61680
## end         94.29516
## acting      91.35227
(dtm.tfidf)[1:10, 1:10]   # view first 10x10 cells in the DTM under TF IDF.
## 10 x 10 sparse Matrix of class "dgCMatrix"
##    [[ suppressing 10 column names 'room', 'time', 'case' ... ]]
##                                                                      
## 1  2.3192740 1.8077364 12.883013 .        2.187249 .         3.513549
## 2  .         .          2.342366 .        .        0.8439701 1.171183
## 3  1.1596370 2.7116046  .        .        .        3.3758803 .       
## 4  .         1.8077364  1.171183 .        1.093625 0.8439701 1.171183
## 5  .         .          2.342366 1.007858 2.187249 0.8439701 .       
## 6  .         1.8077364  .        .        .        .         1.171183
## 7  0.5798185 .          .        1.007858 1.093625 0.8439701 .       
## 8  .         0.9038682  1.171183 .        .        1.6879401 .       
## 9  1.7394555 0.9038682  2.342366 2.015716 2.187249 1.6879401 1.171183
## 10 2.8990925 .          5.855915 .        2.187249 0.8439701 1.171183
##                             
## 1  3.325988 3.66234 .       
## 2  5.543313 3.66234 .       
## 3  .        1.22078 1.187444
## 4  4.434650 .       .       
## 5  .        1.22078 .       
## 6  .        .       .       
## 7  .        .       1.187444
## 8  .        .       .       
## 9  1.108663 .       .       
## 10 .        1.22078 2.374887
# plot barchart for top tokens
test = as.data.frame(round(tsum[1:15],0))
colnames(test) = "freq"        # name the frequency column so ggplot can map it directly
windows()  # New plot window
ggplot(test, aes(x = rownames(test), y = freq)) + 
    geom_bar(stat = "identity", fill = "red") +
    geom_text(aes(label = freq), vjust= -0.20) + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

#------------------------------------------------------#
# step 2c - Term Co-occurrence Matrix (TCM)             #
#------------------------------------------------------#

vectorizer = vocab_vectorizer(pruned_vocab, 
                              grow_dtm = FALSE, 
                              skip_grams_window = 5L)

tcm = create_tcm(it_m, vectorizer) # func to build a TCM
tcm.mat = as.matrix(tcm)         # use tcm.mat[1:5, 1:5] to view
adj.mat = tcm.mat + t(tcm.mat)   # since adjacency matrices are symmetric

z = order(colSums(adj.mat), decreasing = T)
adj.mat = adj.mat[z,z]

# Plot simple term co-occurrence graph
adj = adj.mat[1:30,1:30]

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:qdap':
## 
##     %>%, diversity
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following objects are masked from 'package:text2vec':
## 
##     %>%, normalize
## The following object is masked from 'package:rvest':
## 
##     %>%
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
cog = graph.adjacency(adj, mode = 'undirected')
cog =  simplify(cog)  

cog = delete.vertices(cog, V(cog)[ degree(cog) == 0 ])

windows()
plot(cog)

#-----------------------------------------------------------#
# Step 2d - a cleaned up or 'distilled' COG plot            #
#-----------------------------------------------------------#

distill.cog = function(mat1, # input TCM ADJ MAT
                       title, # title for the graph
                       s,    # no. of central nodes
                       k1){  # max no. of connections  
    library(igraph)
    a = colSums(mat1) # collect colsums into a vector obj a
    b = order(-a)     # nice syntax for ordering vector in decr order  
    
    mat2 = mat1[b, b]     # order both rows and columns along vector b
    
    diag(mat2) =  0
    
    ## +++ go row by row and find top k adjacencies +++ ##
    
    wc = NULL
    
    for (i1 in 1:s){ 
        thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
        mat2[i1, mat2[i1,] < thresh1] = 0  
        mat2[i1, mat2[i1,] > 0 ] = 1
        word = names(mat2[i1, mat2[i1,] > 0])
        mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
        wc = c(wc,word)
    } # i1 loop ends
    
    
    mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
    ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # removed any NAs from the list
    mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
    graph <- graph.adjacency(mat4, mode = "undirected", weighted=T)    # Create Network object
    graph = simplify(graph) 
    V(graph)$color[1:s] = "green"
    V(graph)$color[(s+1):length(V(graph))] = "pink"
    
    graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ]) 
    
    plot(graph, 
         layout = layout.kamada.kawai, 
         main = title)
    
} # func ends

windows()
distill.cog(tcm.mat, 'Distilled COG',  10,  5)
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length

## adj.mat and distilled cog for tfidf DTMs ##

adj.mat = t(dtm.tfidf) %*% dtm.tfidf
diag(adj.mat) = 0
a0 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a0[1:50], a0[1:50]])

windows()
distill.cog(adj.mat, 'Distilled COG',  10,  10)

reviews_df <- data.frame(reviews,ratings.df)

Compare each review’s polarity score with its star rating.

#--------------------------------------------------------#
#  Step 3 correlation between polarity and rating       #
#--------------------------------------------------------#

reviews_df <- data.frame(reviews,ratings.df)

polarity <- counts(polarity(reviews_df$reviews))[, "polarity"]
## Warning in polarity(reviews_df$reviews): 
##   Some rows contain double punctuation.  Suggested use of `sentSplit` function.
## Warning in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends,
## nomatch, : A known encoding (latin1 or UTF-8) was detected in a join
## column. data.table compares the bytes currently, so doesn't support *mixed*
## encodings well; i.e., using both latin1 and UTF-8, or if any unknown
## encodings are non-ascii and some of those are marked known and others
## not. But if either latin1 or UTF-8 is used exclusively, and all unknown
## encodings are ascii, then the result should be ok. In future we will check
## for you and avoid this warning if everything is ok. The tricky part is
## doing this without impacting performance for ascii-only cases.

## Warning in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends,
## nomatch, : A known encoding (latin1 or UTF-8) was detected in a join
## column. data.table compares the bytes currently, so doesn't support *mixed*
## encodings well; i.e., using both latin1 and UTF-8, or if any unknown
## encodings are non-ascii and some of those are marked known and others
## not. But if either latin1 or UTF-8 is used exclusively, and all unknown
## encodings are ascii, then the result should be ok. In future we will check
## for you and avoid this warning if everything is ok. The tricky part is
## doing this without impacting performance for ascii-only cases.

## Warning in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends,
## nomatch, : A known encoding (latin1 or UTF-8) was detected in a join
## column. data.table compares the bytes currently, so doesn't support *mixed*
## encodings well; i.e., using both latin1 and UTF-8, or if any unknown
## encodings are non-ascii and some of those are marked known and others
## not. But if either latin1 or UTF-8 is used exclusively, and all unknown
## encodings are ascii, then the result should be ok. In future we will check
## for you and avoid this warning if everything is ok. The tricky part is
## doing this without impacting performance for ascii-only cases.

## Warning in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends,
## nomatch, : A known encoding (latin1 or UTF-8) was detected in a join
## column. data.table compares the bytes currently, so doesn't support *mixed*
## encodings well; i.e., using both latin1 and UTF-8, or if any unknown
## encodings are non-ascii and some of those are marked known and others
## not. But if either latin1 or UTF-8 is used exclusively, and all unknown
## encodings are ascii, then the result should be ok. In future we will check
## for you and avoid this warning if everything is ok. The tricky part is
## doing this without impacting performance for ascii-only cases.

## Warning in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends,
## nomatch, : A known encoding (latin1 or UTF-8) was detected in a join
## column. data.table compares the bytes currently, so doesn't support *mixed*
## encodings well; i.e., using both latin1 and UTF-8, or if any unknown
## encodings are non-ascii and some of those are marked known and others
## not. But if either latin1 or UTF-8 is used exclusively, and all unknown
## encodings are ascii, then the result should be ok. In future we will check
## for you and avoid this warning if everything is ok. The tricky part is
## doing this without impacting performance for ascii-only cases.

## Warning in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends,
## nomatch, : A known encoding (latin1 or UTF-8) was detected in a join
## column. data.table compares the bytes currently, so doesn't support *mixed*
## encodings well; i.e., using both latin1 and UTF-8, or if any unknown
## encodings are non-ascii and some of those are marked known and others
## not. But if either latin1 or UTF-8 is used exclusively, and all unknown
## encodings are ascii, then the result should be ok. In future we will check
## for you and avoid this warning if everything is ok. The tricky part is
## doing this without impacting performance for ascii-only cases.

## Warning in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends,
## nomatch, : A known encoding (latin1 or UTF-8) was detected in a join
## column. data.table compares the bytes currently, so doesn't support *mixed*
## encodings well; i.e., using both latin1 and UTF-8, or if any unknown
## encodings are non-ascii and some of those are marked known and others
## not. But if either latin1 or UTF-8 is used exclusively, and all unknown
## encodings are ascii, then the result should be ok. In future we will check
## for you and avoid this warning if everything is ok. The tricky part is
## doing this without impacting performance for ascii-only cases.

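This data.table warning concerns mixed character encodings in a join column; its own text notes the merge should still be fine when a single encoding is used. Since the reviews were already squeezed to ASCII in text.clean(), one way to keep the warning from firing is to apply the same conversion to whichever column feeds the join. A minimal sketch; the table and column names are assumptions, so only the iconv() call is shown live:

# hypothetical column name; mirrors the iconv() step already used in text.clean()
# dt[, word := iconv(word, from = "latin1", to = "ASCII", sub = "")]
iconv(c("na\u00efve", "jury"), from = "UTF-8", to = "ASCII", sub = "")   # -> "nave" "jury"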
reviews_df$polarity <- polarity   # attach each review's polarity score

# reviews_df --> first column contains the review text, second the scraped rating, third the polarity of the review
cor(as.numeric(reviews_df$ratings.df), polarity)   # correlation between rating and polarity
## [1] -0.1525462
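The Pearson correlation between the scraped rating and the polarity score comes out weakly negative. Because the ratings are ordinal (1-10 stars read in as short strings), a rank-based correlation is a reasonable cross-check; a minimal sketch reusing the same objects:

cor(as.numeric(reviews_df$ratings.df), polarity,
    method = "spearman", use = "complete.obs")   # rank correlation; rows with NA ratings dropped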
#--------------------------------------------------------#
#             Sentiment Analysis                         #
#--------------------------------------------------------#

library(qdap)

x1 = x[a0]    # keep only the non-empty documents (a0 was built earlier to index them)

t1 = Sys.time()   # start the timer

pol = polarity(x1)      # polarity scores from the qdap dictionary
wc  = pol$all[,2]       # word count in each doc
val = pol$all[,3]       # polarity score of each doc
p   = pol$all[,4]       # positive words found in each doc
n   = pol$all[,5]       # negative words found in each doc

dim(pol)   # pol is a list, so dim() returns NULL
## NULL
Sys.time() - t1  # how much time did the above take?
## Time difference of 1.197062 mins
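As a quick sanity check on how polarity() scores an individual piece of text, it can be run on a single sentence; a minimal sketch (the exact numbers depend on the qdap sentiment dictionary in use):

polarity("an excellent gripping drama with a slightly weak ending")$all
# the polarity column is, roughly, the context-weighted positive-minus-negative
# word tally divided by the square root of the word count; see ?polarity for details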
head(pol$all)
##   all  wc   polarity
## 1 all  52 -0.5547002
## 2 all 168  0.9721111
## 3 all 257 -1.4347006
## 4 all  71  1.1867817
## 5 all  68 -0.2425356
## 6 all  53 -0.3846096
##                                                                                                                                                                                                                                                           pos.words
## 1                                                                                                                                                                                                                                      impressive, led, magnificent
## 2 uplifting, brilliant, flawless, grace, masterfully, greatest, contribution, important, legendary, incredibly, realistic, genius, astoundingly, brilliant, impressed, powerful, inexpensive, awards, wonderful, loves, love, superb, greatest, achievements, honor
## 3                                                                                                                  adored, genius, lead, revelation, logical, defeat, wealthy, easier, righteous, boundless, protect, logical, win, trust, strong, rich, rich, rich
## 4                                                                                                                     advantage, expertly, accomplished, talented, leads, important, genuine, admire, fantastic, incredibly, fantastic, fantastic, appreciated, top
## 5                                                                                                                                                                                                                   won, awards, exciting, good, great, masterpiece
## 6                                                                                                                                                                                                                      good, interesting, spectacular, good, pretty
##                                                                                                                                                                                                                                                                                                                                       neg.words
## 1                                                                                                                                                                                                                                                                                       lone, murder, unsure, guilty, slowly, suffer, miserable
## 2                                                                                                                                                                                                                                                           plot, simplistic, drain, mystery, killing, guilty, guilty, doubts, tension, complex
## 3 death, nemesis, guilty, revenge, estranged, fallacy, slander, guilt, dangerous, poor, kill, poor, suffering, suffering, guilt, punish, fallacy, overwhelming, fall, fall, guilty, guilty, irrational, antithetical, erroneous, helpless, bias, heartbreaking, sad, bad, greed, poor, danger, bias, childish, crap, die, poor, poor, die, poor
## 4                                                                                                                                                                                                                                                                                                                plot, murder, inability, broke
## 5                                                                                                                                                                                                                                                                   dark, tiresome, conflicts, death, unexpectedly, broken, pity, unpredictable
## 6                                                                                                                                                                                                                                                                                    cramped, bored, damn, mediocre, boring, murderer, mediocre
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            text.var
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             plays confining space set choice black white element action impressive led magnificent henry fonda epitomized average american jimmy stewart henry fonda lone murder trial unsure defendant guilty slowly logically begins change minds swelter suffer miserable room exception minutes entire action takes place confining space masks begin show true hands powerhouse order drama 
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        rarely uplifting brilliant time intrigued flawless plot dialogue acting simplistic story set room surprising sidney lumet drain emotions leave edge seat suspense mystery acting bound grace silver screen boy day trial killing father heat domestic arguments forced present verdict guilty ticket electric chair boy decide quickly end discussion raise hands find thinks boy guilty henry fonda put hand trial character revelations doubts possibilities follow masterfully crafted time watch includes character development sidney lumet expert field greatest contribution hollywood history important contributions world cinema henry fonda lee cobb made legendary incredibly realistic performances casting genius dialogue astoundingly riveting brilliant finale impressed personally camera angles movements made suspenseful black white made powerful music minimal gave atmospheric experience room feel tension built proceeds inexpensive simple setting world talking academy awards nominations rolling henry fonda complete form rarely hypnotized lawrence arabia wonderful life mind definitive viewing loves sums love technical point view superb acting simple complex character driven story platinum greatest cinematic achievements time bar statue erected sydney lumet honor henry fonda
## 3  ahead adored young deification system inside hidden genius reasoning years philosophy voted back follow credo speak truth lead death scene tells revelation cobb fonda nemesis voting guilty wishes revenge estranged son reason arguments executioner friends philosophy logical fallacy called ad hominem person defeat argument call names slander character inside including author predilections convict acquit ergo turn argument equal facility fonda liberal guilt wealth release dangerous poor murderers kill people henry feel wealthy midst millions poor suffering people easier righteous thing giving wealth relieve boundless suffering beholds boy sacrificial lamb atonement altar guilt implies protect innocent fonda words punish seek save blood innocent stand front answer god ad hominem logical fallacy trained understand refuge win argument premise matter overwhelming mountain evidence requisite time fall fall leaves ground trust case evidence remotely approaches level ratiocinate end sun guilty synthesis liberals fonda lumet vote guilty filled personal demons irrational investigate antithetical predilections acquit evidence piled alpha centauri mind time find erroneous thought decades thought living ascetic philosopher ruminating finally liberal mind control review called duty remember inclinations acquit strong convict helpless stand count dispassionate objective eliminated bias system god answer actions hear heartbreaking music lumet fonda play accused teenager sits sad remember blood innocent victims hands fonda feels bad rich simple solution greed give poor put danger brainwashing people believing convict private demons bias objective childish turn argument equal adroitness vote back crap wrote innocents die white rich liberals give money poor atones releasing token poor person expiation cares die jesus zacharias thing give poor follow zacharius turned wept rich 
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      simple plot deciding fate young murder set exception minutes hours writer director advantage screenplay reginald rose features dialogue expertly delivered accomplished performances talented henry fonda leads skilfully selected important personalities makes genuine inability decision making quirks story air conditioning broke waiting sporting event raining things make time pass quickly make admire fantastic car chases explosions gunfights aged incredibly offers fantastic performances fantastic dialogue watched appreciated time holds spot top time list
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           life hands dealth minds tag line nominations oscars won awards started dark white atmosphere gave tiresome impression sitting room quarreling murmuring muttering providing human acting talking teenagers exciting sitting dully room engrossed quarreling interested affairs conflicts tag line shows life death boy hands audience left addition henry fonda good actor ends asked hope unexpectedly broken ending pity happen great masterpiece people slight interest comments changed debate unpredictable result
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          talking small room good set makes viewer feel cramped claustrophobic shown places bored minute damn room places interesting mediocre make characters random person street spectacular character larger life boring script good talk back fort wanted half time talk ending closure boy murderer real point find pretty mediocre benefited acting times black white give
head(pol$group)
##   all total.sentences total.words ave.polarity sd.polarity
## 1 all            5229       20046   0.03607942   0.7797011
##   stan.mean.polarity
## 1         0.04627339
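The per-document results in pol$all can also be collected into a small summary table, which makes it easy to pull out the most positive and most negative reviews by score; a minimal sketch using the wc and val vectors created above:

senti.summary = data.frame(doc_id = seq_along(wc), words = wc, polarity = val)
head(senti.summary[order(-senti.summary$polarity), ], 3)   # highest-polarity reviews
head(senti.summary[order( senti.summary$polarity), ], 3)   # lowest-polarity reviews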
positive_words = unique(setdiff(unlist(p),"-"))  # Positive words list
negative_words = unique(setdiff(unlist(n),"-"))  # Negative words list

print(positive_words)       # Print all the positive words found in the corpus
##   [1] "impressive"      "led"             "magnificent"    
##   [4] "uplifting"       "brilliant"       "flawless"       
##   [7] "grace"           "masterfully"     "greatest"       
##  [10] "contribution"    "important"       "legendary"      
##  [13] "incredibly"      "realistic"       "genius"         
##  [16] "astoundingly"    "impressed"       "powerful"       
##  [19] "inexpensive"     "awards"          "wonderful"      
##  [22] "loves"           "love"            "superb"         
##  [25] "achievements"    "honor"           "adored"         
##  [28] "lead"            "revelation"      "logical"        
##  [31] "defeat"          "wealthy"         "easier"         
##  [34] "righteous"       "boundless"       "protect"        
##  [37] "win"             "trust"           "strong"         
##  [40] "rich"            "advantage"       "expertly"       
##  [43] "accomplished"    "talented"        "leads"          
##  [46] "genuine"         "admire"          "fantastic"      
##  [49] "appreciated"     "top"             "won"            
##  [52] "exciting"        "good"            "great"          
##  [55] "masterpiece"     "interesting"     "spectacular"    
##  [58] "pretty"          "wow"             "phenomenal"     
##  [61] "tough"           "works"           "intelligence"   
##  [64] "work"            "wisely"          "nifty"          
##  [67] "deservedly"      "tidy"            "convenient"     
##  [70] "excellent"       "formidable"      "cool"           
##  [73] "favorite"        "likes"           "impress"        
##  [76] "treasure"        "worth"           "incredible"     
##  [79] "reasonable"      "heaven"          "terrific"       
##  [82] "enjoy"           "fortunately"     "perfect"        
##  [85] "pleasant"        "amazing"         "enjoyed"        
##  [88] "respect"         "inspiration"     "miraculously"   
##  [91] "fantastically"   "pure"            "classic"        
##  [94] "thoughtful"      "amazingly"       "top notch"      
##  [97] "finely"          "effortlessly"    "staunchly"      
## [100] "enthusiasm"      "correct"         "wonderfully"    
## [103] "relaxed"         "modern"          "believable"     
## [106] "intriguing"      "clear"           "openly"         
## [109] "victory"         "positive"        "favor"          
## [112] "successful"      "favour"          "rightfully"     
## [115] "convincing"      "benefit"         "easy"           
## [118] "fascinating"     "succeeds"        "ease"           
## [121] "relish"          "solace"          "impartially"    
## [124] "passionate"      "intelligent"     "hot"            
## [127] "compassion"      "readily"         "qualified"      
## [130] "morality"        "wise"            "eager"          
## [133] "soft"            "memorable"       "excited"        
## [136] "bright"          "appeal"          "masterpieces"   
## [139] "outstanding"     "awarded"         "satisfying"     
## [142] "meaningful"      "recommended"     "blockbuster"    
## [145] "engrossing"      "astonishing"     "brilliance"     
## [148] "marvelous"       "nicely"          "beautifully"    
## [151] "cheapest"        "liking"          "profound"       
## [154] "solid"           "educated"        "recommend"      
## [157] "thrills"         "hero"            "enrich"         
## [160] "engaging"        "respectful"      "clever"         
## [163] "noble"           "inspiring"       "privilege"      
## [166] "flashy"          "sexy"            "honest"         
## [169] "lover"           "entertaining"    "fun"            
## [172] "worked"          "faith"           "silent"         
## [175] "courage"         "constructive"    "polite"         
## [178] "adore"           "beauty"          "valuable"       
## [181] "quiet"           "gems"            "progressive"    
## [184] "brilliantly"     "master"          "ready"          
## [187] "authentic"       "understandable"  "brave"          
## [190] "succeeded"       "handy"           "hopeful"        
## [193] "calm"            "supremely"       "sane"           
## [196] "free"            "reasoned"        "loved"          
## [199] "effectively"     "proves"          "beautiful"      
## [202] "achievement"     "sufficient"      "permissible"    
## [205] "acclaimed"       "nice"            "coolest"        
## [208] "awesome"         "glee"            "modest"         
## [211] "reward"          "conveniently"    "poetic"         
## [214] "rational"        "piety"           "privileged"     
## [217] "transparent"     "popular"         "solidarity"     
## [220] "cheer"           "fabulous"        "improvements"   
## [223] "charm"           "prefer"          "exceptional"    
## [226] "supporting"      "originality"     "decent"         
## [229] "helpful"         "masterful"       "fair"           
## [232] "effective"       "talents"         "mesmerized"     
## [235] "success"         "conscientious"   "applaud"        
## [238] "support"         "finer"           "proving"        
## [241] "freed"           "supports"        "supported"      
## [244] "golden"          "promising"       "righteousness"  
## [247] "endorsed"        "virtue"          "glowing"        
## [250] "prefers"         "immaculate"      "supreme"        
## [253] "renaissance"     "unparalleled"    "notably"        
## [256] "splendidly"      "prudent"         "leading"        
## [259] "illuminating"    "sharp"           "swift"          
## [262] "greatness"       "accomplish"      "properly"       
## [265] "reputation"      "powerfully"      "polished"       
## [268] "sincere"         "beloved"         "congratulations"
## [271] "wisdom"          "sufficiently"    "clean"          
## [274] "intrigue"        "excitement"      "super"          
## [277] "comfort"         "grateful"        "simplest"       
## [280] "fairly"          "accomplishment"  "famous"         
## [283] "magnificently"   "efficient"       "happy"          
## [286] "vivid"           "dazzling"        "precise"        
## [289] "rapt"            "instantly"       "stunning"       
## [292] "thrilled"        "nobly"           "remarkable"     
## [295] "qualify"         "steadfast"       "advocate"       
## [298] "significant"     "steadfastly"     "sharper"        
## [301] "smart"           "captivating"     "rightly"        
## [304] "economical"      "superior"        "thrilling"      
## [307] "praise"          "fans"            "perfection"     
## [310] "faster"          "stunned"         "gratitude"      
## [313] "kindness"        "helped"          "shine"          
## [316] "marvel"          "reliable"        "glad"           
## [319] "talent"          "skillful"        "striking"       
## [322] "amazed"          "exemplary"       "hottest"        
## [325] "entrancing"      "pride"           "insightful"     
## [328] "stimulating"     "variety"         "regard"         
## [331] "decisive"        "intricate"       "commendable"    
## [334] "appealing"       "outstandingly"   "fairness"       
## [337] "enthralled"      "paradise"        "distinctive"    
## [340] "satisfactory"    "foremost"        "fortunate"      
## [343] "rewarding"       "cleverly"        "simpler"        
## [346] "clear cut"       "lavish"          "finest"         
## [349] "clearer"         "faultless"       "fine"           
## [352] "mastery"         "snappy"          "enjoyable"      
## [355] "encouraging"     "fancy"           "seamless"       
## [358] "freedom"         "consistently"    "dynamic"        
## [361] "merciful"        "ideal"           "earnest"        
## [364] "honesty"         "sensitive"       "wholesome"      
## [367] "exceedingly"     "delightfully"    "versatile"      
## [370] "decency"         "confidence"      "noteworthy"     
## [373] "magic"           "sublime"         "strongest"      
## [376] "fascination"     "judicious"       "astounding"     
## [379] "worthwhile"      "heroic"          "fond"           
## [382] "celebrated"      "extraordinary"   "wins"           
## [385] "fast"            "creative"        "delicate"       
## [388] "diligence"       "influential"     "principled"     
## [391] "amusing"         "endearing"       "passion"        
## [394] "darling"         "advanced"        "gratifying"     
## [397] "inspirational"   "courageous"      "impeccable"     
## [400] "kindly"          "dominate"        "warm"           
## [403] "tremendously"    "splendid"        "gains"          
## [406] "infallibility"   "spellbound"      "peace"          
## [409] "reforming"       "meticulous"      "awe"            
## [412] "smooth"          "perfectly"       "impartial"      
## [415] "guarantee"       "successfully"    "appreciable"    
## [418] "comprehensive"   "enhance"         "winning"        
## [421] "accolades"       "revolutionary"   "enthrall"       
## [424] "excellence"      "abundance"       "stellar"        
## [427] "instructive"     "correctly"       "adequate"       
## [430] "relief"          "accurate"        "elevate"        
## [433] "straighten"      "exceptionally"   "distinguished"  
## [436] "humble"          "honorable"       "loving"         
## [439] "hard working"    "quieter"         "suitable"       
## [442] "adventurous"     "trusting"        "empathy"        
## [445] "gained"          "unforgettable"   "innovative"     
## [448] "refreshing"      "vibrant"         "fascinate"      
## [451] "suffice"         "admirable"       "affirmation"    
## [454] "safe"            "champion"        "foolproof"      
## [457] "goodness"        "smile"           "improved"       
## [460] "neatly"          "capable"         "admiration"     
## [463] "pluses"          "improve"         "improvement"    
## [466] "pinnacle"        "superbly"        "joy"            
## [469] "positives"       "proper"          "triumph"        
## [472] "faithful"        "patience"        "ingenious"      
## [475] "straightforward" "thankful"        "attractive"     
## [478] "sweeping"        "veritable"       "impassioned"    
## [481] "lean"            "richly"          "assuredly"      
## [484] "destiny"         "entertain"       "genial"         
## [487] "superiority"     "preferring"      "happily"        
## [490] "famed"
print(negative_words)       # Print all neg words
##   [1] "lone"            "murder"          "unsure"         
##   [4] "guilty"          "slowly"          "suffer"         
##   [7] "miserable"       "plot"            "simplistic"     
##  [10] "drain"           "mystery"         "killing"        
##  [13] "doubts"          "tension"         "complex"        
##  [16] "death"           "nemesis"         "revenge"        
##  [19] "estranged"       "fallacy"         "slander"        
##  [22] "guilt"           "dangerous"       "poor"           
##  [25] "kill"            "suffering"       "punish"         
##  [28] "overwhelming"    "fall"            "irrational"     
##  [31] "antithetical"    "erroneous"       "helpless"       
##  [34] "bias"            "heartbreaking"   "sad"            
##  [37] "bad"             "greed"           "danger"         
##  [40] "childish"        "crap"            "die"            
##  [43] "inability"       "broke"           "dark"           
##  [46] "tiresome"        "conflicts"       "unexpectedly"   
##  [49] "broken"          "pity"            "unpredictable"  
##  [52] "cramped"         "bored"           "damn"           
##  [55] "mediocre"        "boring"          "murderer"       
##  [58] "unnecessary"     "waste"           "blow"           
##  [61] "difficult"       "fictional"       "bore"           
##  [64] "hurt"            "disputed"        "ignorance"      
##  [67] "hate"            "crime"           "defy"           
##  [70] "quibble"         "wrath"           "opponent"       
##  [73] "poorly"          "revolting"       "sweaty"         
##  [76] "stifling"        "oppressive"      "static"         
##  [79] "overrated"       "awful"           "hell"           
##  [82] "refuses"         "stubborn"        "badly"          
##  [85] "wrong"           "shocked"         "miss"           
##  [88] "mysterious"      "lose"            "disliked"       
##  [91] "horrible"        "pathetic"        "weak"           
##  [94] "inevitable"      "doubt"           "cheesy"         
##  [97] "dispute"         "confined"        "prejudices"     
## [100] "violently"       "clash"           "stuffy"         
## [103] "stormy"          "uncomfortably"   "darker"         
## [106] "manipulative"    "sly"             "motley"         
## [109] "discomfort"      "belittle"        "mock"           
## [112] "refute"          "shame"           "misconceptions" 
## [115] "tense"           "meaningless"     "slow"           
## [118] "dumb"            "disappointment"  "desperate"      
## [121] "disappointed"    "din"             "irritating"     
## [124] "blame"           "distracting"     "twist"          
## [127] "lies"            "misfortune"      "unaccustomed"   
## [130] "instability"     "arrogant"        "heartless"      
## [133] "conspicuous"     "stress"          "unwillingly"    
## [136] "paradoxically"   "ironically"      "worries"        
## [139] "mistake"         "evil"            "bother"         
## [142] "impossible"      "horrifying"      "tragedy"        
## [145] "problem"         "loses"           "temper"         
## [148] "horrified"       "indifference"    "loud"           
## [151] "abused"          "trouble"         "long time"      
## [154] "bloody"          "stupid"          "apocalypse"     
## [157] "creeping"        "disgusted"       "reactionary"    
## [160] "diatribe"        "bleeding"        "knife"          
## [163] "utterly"         "mocked"          "excruciatingly" 
## [166] "falling"         "inadequate"      "flaw"           
## [169] "insufferably"    "sanctimonious"   "absurd"         
## [172] "weaker"          "provoke"         "pretentious"    
## [175] "inconsistencies" "strain"          "issue"          
## [178] "accusation"      "dull"            "repetitive"     
## [181] "cheap"           "frustrating"     "unpopular"      
## [184] "lacking"         "indiscernible"   "prejudice"      
## [187] "stresses"        "stiff"           "choleric"       
## [190] "violent"         "risky"           "explosive"      
## [193] "strange"         "vindictive"      "poison"         
## [196] "anxiety"         "worthless"       "flaws"          
## [199] "negative"        "dissenter"       "destroy"        
## [202] "manipulation"    "missed"          "fooled"         
## [205] "vengeance"       "hated"           "undermined"     
## [208] "breaks"          "confusing"       "reluctance"     
## [211] "ignorant"        "issues"          "concerns"       
## [214] "despair"         "scars"           "struck"         
## [217] "tragically"      "worse"           "ironic"         
## [220] "grumpy"          "suspect"         "twisted"        
## [223] "criminal"        "insecure"        "loose"          
## [226] "propaganda"      "sadly"           "excuses"        
## [229] "penalty"         "deluded"         "fell"           
## [232] "anger"           "abusive"         "undesirable"    
## [235] "fallen"          "moronic"         "illegally"      
## [238] "silly"           "fears"           "suspicious"     
## [241] "implausible"     "deterrent"       "futile"         
## [244] "refuse"          "muddy"           "insane"         
## [247] "thug"            "bores"           "twists"         
## [250] "drab"            "impropriety"     "ruins"          
## [253] "improper"        "disturbing"      "fiction"        
## [256] "distortion"      "stern"           "rebuke"         
## [259] "interfere"       "unbelievably"    "intense"        
## [262] "upsets"          "arbitrary"       "unknown"        
## [265] "denying"         "blah"            "unfounded"      
## [268] "sin"             "tragic"          "conservative"   
## [271] "bigotry"         "racist"          "racism"         
## [274] "skepticism"      "pretend"         "intimidation"   
## [277] "excuse"          "brute"           "coercion"       
## [280] "falls"           "discredit"       "protest"        
## [283] "opposition"      "discouraging"    "critical"       
## [286] "pandering"       "murky"           "terrible"       
## [289] "hopeless"        "sham"            "lacks"          
## [292] "anxiously"       "fragile"         "strangest"      
## [295] "premeditated"    "oddly"           "odd"            
## [298] "faults"          "disgraceful"     "disgrace"       
## [301] "troubling"       "ruined"          "difficulty"     
## [304] "stab"            "damaged"         "ridiculous"     
## [307] "refuted"         "crash"           "fleeing"        
## [310] "shabby"          "stuck"           "frustrated"     
## [313] "intolerance"     "biases"          "turmoil"        
## [316] "dislike"         "tired"           "uncomfortable"  
## [319] "problematic"     "failed"          "confused"       
## [322] "toxic"           "disregard"       "bizarre"        
## [325] "dead"            "unreasonable"    "crazy"          
## [328] "hollow"          "flimsy"          "hang"           
## [331] "shocking"        "trick"           "nefarious"      
## [334] "undermine"       "creepy"          "troubled"       
## [337] "vague"           "limits"          "adversary"      
## [340] "neglect"         "ominous"         "annoying"       
## [343] "volatile"        "offensive"       "problems"       
## [346] "despised"        "disbelief"       "bullying"       
## [349] "bland"           "stereotype"      "scared"         
## [352] "smugly"          "destruction"     "puzzled"        
## [355] "absent minded"   "timid"           "mad"            
## [358] "aggressive"      "disrespectful"   "stereotypical"  
## [361] "cold"            "lack"            "mindless"       
## [364] "excessive"       "upset"           "unbelievable"   
## [367] "overacted"       "spoil"           "warning"        
## [370] "misses"          "dense"           "burden"         
## [373] "extremists"      "injustice"       "haste"          
## [376] "extraneous"      "smoke"           "ambiguous"      
## [379] "trapped"         "villains"        "deceptively"    
## [382] "devil"           "reluctantly"     "perversely"     
## [385] "useless"         "ulterior"        "ambivalence"    
## [388] "cave"            "criticism"       "conflict"       
## [391] "harassed"        "sarcastic"       "vile"           
## [394] "derision"        "terribly"        "outsider"       
## [397] "killed"          "tumble"          "melancholy"     
## [400] "annoyed"         "smell"           "stupidity"      
## [403] "incompetent"     "lost"            "worst"          
## [406] "debt"            "false"           "expensive"      
## [409] "freaking"        "afraid"          "hard"           
## [412] "ashamed"         "die hard"        "shortcomings"   
## [415] "heck"            "critics"         "ignore"         
## [418] "draining"        "gruesome"        "sloppy"         
## [421] "limited"         "excruciating"    "tedious"        
## [424] "unravel"         "insurmountable"  "prejudicial"    
## [427] "complication"    "succumb"         "desperately"    
## [430] "arrogance"       "ridicule"        "prison"         
## [433] "shake"           "spite"           "bothered"       
## [436] "omission"        "critic"          "hatred"         
## [439] "funny"           "strained"        "struggle"       
## [442] "bitter"          "biased"          "annoyance"      
## [445] "insecurity"      "hype"            "wasted"         
## [448] "object"          "error"           "scream"         
## [451] "disaster"        "criticisms"      "complicated"    
## [454] "overblown"       "overbearing"     "solemn"         
## [457] "adamant"         "taut"            "fake"           
## [460] "lazy"            "worried"         "irritated"      
## [463] "rage"            "chatter"         "aspersions"     
## [466] "unfortunate"     "awkward"         "ugly"           
## [469] "unconvincing"    "dubious"         "killer"         
## [472] "impatient"       "rude"            "confrontation"  
## [475] "defensive"       "catastrophic"    "hasty"          
## [478] "adversity"       "precarious"      "uncertain"      
## [481] "ambiguity"       "insatiable"      "desperation"    
## [484] "scarcely"        "warped"          "discrimination" 
## [487] "lurking"         "excessively"     "heavy handed"   
## [490] "unrealistic"     "unexpected"      "hefty"          
## [493] "disappoint"      "unsatisfactory"  "abrasive"       
## [496] "fleeting"        "tortured"        "appalling"      
## [499] "failing"         "punch"           "drawback"       
## [502] "riled"           "rife"            "anti"           
## [505] "struggling"      "condemn"         "dissent"        
## [508] "scary"           "hardened"        "breaking"       
## [511] "ruin"            "preoccupy"       "pointless"      
## [514] "grate"           "petty"           "intrusive"      
## [517] "hesitant"        "concerned"       "isolated"       
## [520] "loneliness"      "unable"          "rift"           
## [523] "unwilling"       "pig"             "unnoticed"      
## [526] "bickering"       "exhaustion"      "spoiled"        
## [529] "incapable"       "convoluted"      "flawed"         
## [532] "sceptical"       "ordeal"          "seriousness"    
## [535] "idiots"          "nightmare"       "steals"         
## [538] "hateful"         "alienated"       "spews"          
## [541] "poverty"         "break"           "accuse"         
## [544] "boil"            "collapse"        "noise"          
## [547] "vibrate"         "mistaken"        "naive"          
## [550] "died"            "cloud"           "interferes"     
## [553] "passive"         "insult"          "weary"          
## [556] "mistrust"        "anxious"         "drags"          
## [559] "provocative"     "limitations"     "dust"           
## [562] "swelling"        "lying"           "rough"          
## [565] "refused"         "raped"           "erosion"        
## [568] "poorest"         "dirty"           "wildly"         
## [571] "mistakes"        "fear"            "strangely"      
## [574] "vulgar"          "obscures"        "creep"          
## [577] "chaotic"         "messed"          "imperfections"  
## [580] "ranting"         "raving"          "plea"           
## [583] "pervasive"       "controversial"   "controversy"    
## [586] "ploy"            "absence"         "angrily"        
## [589] "fail"            "criticized"      "obnoxious"      
## [592] "inevitably"      "unreliable"      "inconsistent"   
## [595] "misgivings"      "hobble"          "aggravation"    
## [598] "egregious"       "nonsense"        "horrific"       
## [601] "hung"            "wrongly"         "sketchy"        
## [604] "crack"           "louder"          "laughable"      
## [607] "antagonist"      "picket"          "regret"         
## [610] "idiot"           "jaded"           "cynical"        
## [613] "bashed"          "unconfirmed"     "distorted"      
## [616] "glare"           "ludicrous"       "commonplace"    
## [619] "failure"         "warned"          "threaten"       
## [622] "overwhelm"       "outbursts"       "mundane"        
## [625] "fails"           "subversive"      "deprived"       
## [628] "isolation"       "deceive"         "absurdity"      
## [631] "frail"           "undetermined"    "friction"       
## [634] "disagree"        "lengthy"         "drawbacks"      
## [637] "conflicting"     "shallow"         "repulsive"      
## [640] "weaknesses"      "crowded"         "unresolved"     
## [643] "bully"           "uncaring"        "knock"          
## [646] "vain"            "objection"       "pains"          
## [649] "shaky"           "overdone"        "drag"           
## [652] "cocky"           "stark"           "attack"         
## [655] "abuse"           "brash"           "unusual"        
## [658] "temptation"      "startling"       "disadvantaged"  
## [661] "resistance"      "spiteful"        "errors"         
## [664] "extravagant"     "exploitation"    "unsuccessful"   
## [667] "cumbersome"      "unwillingness"   "partiality"     
## [670] "manic"           "brutally"        "deplorable"     
## [673] "outburst"        "illogical"       "losing"         
## [676] "darkness"        "crept"           "hostile"        
## [679] "blinding"        "denies"          "implication"    
## [682] "frustration"     "exhausts"        "criticize"      
## [685] "subjugate"       "detriment"       "vengeful"       
## [688] "enraged"         "fatally"         "imprisonment"   
## [691] "flaunt"          "gross"           "questionable"   
## [694] "grievous"        "despicable"      "antagonistic"   
## [697] "harbors"         "insignificant"   "reluctant"
#--------------------------------------------------------#
#   Create Positive Words wordcloud                      #
#--------------------------------------------------------#

pos.tdm = dtm[,which(colnames(dtm) %in% positive_words)]
m = as.matrix(pos.tdm)
v = sort(colSums(m), decreasing = TRUE)
windows() # opens new image window
wordcloud(names(v), v, scale = c(4, 1), min.freq = 1, max.words = 100, colors = brewer.pal(8, "Dark2"))
title(sub = "Positive Words - Wordcloud")

# plot barchart for top tokens
test = data.frame(token = names(v)[1:15], freq = v[1:15])   # top 15 positive tokens
windows() # opens new image window
ggplot(test, aes(x = token, y = freq)) + 
    geom_bar(stat = "identity", fill = "blue") +
    geom_text(aes(label = freq), vjust = -0.20) + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
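windows() opens an on-screen device and only works on Windows; if the document is knitted elsewhere, or the plots need to be written to disk, a file device such as png() can stand in (a minimal sketch; the file name is arbitrary):

png("positive_words_wordcloud.png", width = 800, height = 800)
wordcloud(names(v), v, scale = c(4, 1), min.freq = 1, max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Positive Words - Wordcloud")
dev.off()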

#--------------------------------------------------------#
#  Create Negative Words wordcloud                       #
#--------------------------------------------------------#

neg.tdm = dtm[,which(colnames(dtm) %in% negative_words) ]
m = as.matrix(neg.tdm)
v = sort(colSums(m), decreasing = TRUE)
windows()
wordcloud(names(v), v, scale = c(4, 1), min.freq = 1, max.words = 100, colors = brewer.pal(8, "Dark2"))
title(sub = "Negative Words - Wordcloud")

# plot barchart for top tokens
test = data.frame(token = names(v)[1:15], freq = v[1:15])   # top 15 negative tokens
windows()
ggplot(test, aes(x = token, y = freq)) + 
    geom_bar(stat = "identity", fill = "red") +
    geom_text(aes(label = freq), vjust = -0.20) + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

#--------------------------------------------------------#
#  Positive words vs Negative Words plot                 #
#--------------------------------------------------------#

len = function(x){
    if (length(x) == 1 && x == "-") {return(0)}   # "-" marks docs with no matched words
    else {return(length(unlist(x)))}
}

pcount = unlist(lapply(p, len))
ncount = unlist(lapply(n, len))
doc_id = seq_along(wc)   # one index per document

windows()
plot(doc_id,pcount,type="l",col="green",xlab = "Document ID", ylab= "Word Count")
lines(doc_id,ncount,type= "l", col="red")
title(main = "Positive words vs Negative Words" )
legend("topright", inset=.05, c("Positive Words","Negative Words"), fill=c("green","red"), horiz=TRUE)

# Document sentiment running plot
windows()
plot(pol$all$polarity, type = "l", ylab = "Polarity Score",xlab = "Document Number")
abline(h=0)
title(main = "Polarity Plot" )

### COG for sentiment-laden words ? ###

senti.dtm = cbind(pos.tdm, neg.tdm); dim(senti.dtm)
## [1]  200 1182
senti.adj.mat = as.matrix(t(senti.dtm)) %*% as.matrix(senti.dtm)   # term-by-term co-occurrence counts
diag(senti.adj.mat) = 0                                            # drop self-co-occurrence
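Before distilling the graph, it can help to look at the strongest co-occurring sentiment-word pairs directly; a minimal sketch that reads off the largest upper-triangle entries of the adjacency matrix (this assumes senti.adj.mat kept the term names as its dimnames):

pairs = which(upper.tri(senti.adj.mat), arr.ind = TRUE)    # each word pair counted once
top   = order(senti.adj.mat[pairs], decreasing = TRUE)[1:10]
data.frame(word1 = rownames(senti.adj.mat)[pairs[top, 1]],
           word2 = colnames(senti.adj.mat)[pairs[top, 2]],
           count = senti.adj.mat[pairs[top, , drop = FALSE]])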

windows()
distill.cog(senti.adj.mat,                    # adjacency matrix
            'Distilled COG of senti words',   # plot title
            5,                                # max no. of central nodes
            5)                                # max no. of connections