R Markdown

This is an R Markdown document for the IMDB review extraction and text analysis assignment.

rm(list=ls())
library(rvest)
## Loading required package: xml2
library(RSelenium)
## Warning: package 'RSelenium' was built under R version 3.3.2
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:rvest':
## 
##     %>%
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(qdap)
## Warning: package 'qdap' was built under R version 3.3.2
## Loading required package: qdapDictionaries
## Warning: package 'qdapDictionaries' was built under R version 3.3.2
## Loading required package: qdapRegex
## Warning: package 'qdapRegex' was built under R version 3.3.2
## Loading required package: qdapTools
## Warning: package 'qdapTools' was built under R version 3.3.2
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.3.2
## 
## Attaching package: 'qdap'
## The following objects are masked from 'package:igraph':
## 
##     %>%, diversity
## The following object is masked from 'package:rvest':
## 
##     %>%
## The following object is masked from 'package:base':
## 
##     Filter
library(text2vec)
## Warning: package 'text2vec' was built under R version 3.3.2
## 
## Attaching package: 'text2vec'
## The following object is masked from 'package:qdap':
## 
##     %>%
## The following objects are masked from 'package:igraph':
## 
##     %>%, normalize
library(data.table)
## Warning: package 'data.table' was built under R version 3.3.2
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:qdapTools':
## 
##     shift
library(stringr)
## 
## Attaching package: 'stringr'
## The following object is masked from 'package:qdap':
## 
##     %>%
## The following object is masked from 'package:igraph':
## 
##     %>%
library(tm)
## Warning: package 'tm' was built under R version 3.3.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:qdap':
## 
##     ngrams
## 
## Attaching package: 'tm'
## The following objects are masked from 'package:qdap':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.3.2
library(tokenizers)
## Warning: package 'tokenizers' was built under R version 3.3.2
## 
## Attaching package: 'tokenizers'
## The following object is masked from 'package:tm':
## 
##     stopwords
library(slam)
## Warning: package 'slam' was built under R version 3.3.2
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## The following object is masked from 'package:qdapRegex':
## 
##     %+%
library(scam)
## Warning: package 'scam' was built under R version 3.3.2
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
## This is scam 1.2-0.
library(NLP)
library(openNLP)
## Warning: package 'openNLP' was built under R version 3.3.2
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.3.2
library(rworldmap)
## Warning: package 'rworldmap' was built under R version 3.3.2
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.3.2
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
library(rworldxtra)
## Warning: package 'rworldxtra' was built under R version 3.3.2
#' Normalise raw review text before tokenisation.
#'
#' Strips HTML tags, drops non-ASCII characters, turns every remaining
#' non-alphanumeric character into a space, lower-cases the text, removes
#' digits and collapses runs of whitespace. Only base R is used, so the
#' function no longer calls `require("tm")`, which attached a package as a
#' side effect and merely returned FALSE when tm was not installed.
#'
#' @param x Character vector of raw documents.
#' @return Character vector the same length as `x`: lower-case words
#'   separated by single spaces, with no leading or trailing blanks.
text.clean <- function(x) {
  x <- gsub("<.*?>", " ", x)                  # regex for removing HTML tags
  x <- iconv(x, "latin1", "ASCII", sub = "")  # keep only ASCII characters
  x <- gsub("[^[:alnum:]]", " ", x)           # keep only alphanumerics
  x <- tolower(x)                             # convert to lower case
  x <- gsub("[[:digit:]]+", "", x)            # drop digits (was removeNumbers)
  x <- gsub("[[:space:]]+", " ", x)           # collapse whitespace (was stripWhitespace)
  gsub("^\\s+|\\s+$", "", x)                  # trim leading/trailing blanks
}

Read the IMDB website to get 50 like and dislike reviews, and read the stop words.

Clean up the data using the text.clean function created above.

# Page offsets to request; each value is appended to the "start=" query
# parameter of the review URLs below.
counts <- seq(from = 0, to = 50, by = 10)
reviews <- NULL
setwd('C:///All_Data//ISB//Term_1//07_TM//04_Assignmnts')
getwd()
## [1] "C:/All_Data/ISB/Term_1/07_TM/04_Assignmnts"

# Paragraphs that are removed from every scraped page before the remaining
# text is kept as reviews.
non_review_text <- c("*** This review may contain spoilers ***", "Add another review")

for (j in counts) {
  # Page fetched with filter=love for this offset.
  url1 <- paste0("http://www.imdb.com/title/tt0910970/reviews?filter=love;filter=love;start=", j)
  page1 <- read_html(url1)

  # Page fetched with filter=hate for the same offset.
  url2 <- paste0("http://www.imdb.com/title/tt0910970/reviews?filter=hate;filter=hate;start=", j)
  page2 <- read_html(url2)

  # Text of every <p> under #tn15content, minus the boilerplate paragraphs.
  reviews1 <- html_text(html_nodes(page1, '#tn15content p'))
  reviews2 <- html_text(html_nodes(page2, '#tn15content p'))
  reviews.positive <- setdiff(reviews1, non_review_text)
  reviews.negative <- setdiff(reviews2, non_review_text)

  # Append this offset's love reviews, then its hate reviews.
  reviews <- c(reviews, reviews.positive, reviews.negative)
}

# Replace embedded newlines with spaces, then save one review per element.
reviews <- gsub("\n", ' ', reviews)
writeLines(reviews, 'Wall-e reviews.txt')

# Interactively choose the saved reviews file (Wall-e reviews) and read it
# line by line, then preview the first four entries.
temp.text <- readLines(con = file.choose())
head(x = temp.text, n = 4)
## [1] " WALL-E, Pixar's latest film, is about a robot named WALL-E (or 'Waste Allocation Load Lifter, Earth-Class'), who is the only thing left on earth with some sort of emotion. He meets another robot named EVE, and the trip begins.It's hard to describe in words how incredible I personally find this film.The animation is flawless. Absolutely flawless. Especially on earth and the robots. It looks real. Much of the time it's impossible to tell whether or not it is real. The few slightly-shaky styled shots that appear a few times in the film only makes the animation that much more amazing and realistic. The humans are really good, too, while not realistic in the sense of you seeing it right now in real life, but they do have a realistic feel to them. The thing with the humans, I believe, is that they were purposefully meant to have this slightly rounded, slightly unrealistic feeling. I believe the reason is to take a satirical look at humans, and what our goals for a future, perfect utopia, is. It questions what we want, and shows you what is a very, very likely outcome of our desires for a 'better' world, showing both positive and negative effects. The animation for the humans, I believe, was made rounded and slightly more cartoonish to emphasize that that is how we will become. Fat, lazy, yet so perfect. At times, especially with that perspective on the humans, they actually do look very real.The story is brilliant. There are many little things in the film that have so much meaning to them. There are things that will be nostalgic to older viewers, and things that younger viewers will love to look at. However, it does steer for the cheesy, cliché aspects of a child's film, yet still remaining a completely G-rated film.I don't call this a child's film at all. Not because it has adult material, because it doesn't. I say this because the film is perfect for everyone. Literally, everyone. There are things in it every person can enjoy, no matter who they are. 
It has obvious homages to Stanley Kubrick's \"2001: A Space Odyssey\", and any adult who remembers seeing that film will notice this.WALL-E is such a lovable character. I've never felt so much emotion for one character. He will definitely go down in history as iconic as Darth Vader, or Indiana Jones. I was so close to crying at pivotal parts in the film, and although I didn't fully break out and cry, I have never felt so much emotion in my heart with any other film as I did with this one.EVE is very fun and interesting. One scene in particular, with her, was so beautiful, that my eyes got teary. Her chemistry with WALL-E is so oddly perfect. They are so different in appearance and personality, yet they work so well together.The other robots are all lovable, except for the \"enemy\" robots, who still add much depth to the film. In particular, M-O was the cutest, obviously not counting WALL-E.Pixar has always made great animation films. But this, without a doubt, tops all of their own films, and most other films. It restores faith in the animation films. It captures the magic and wonder as past Disney films, which is something I have not seen in most modern animation films.I would not be surprised at all if this won for best picture of the year. It deserves it more than anything.This is one of greatest achievements in cinematic history, and I encourage everyone to see this.10/10 "
## [2] " I just returned from an advanced benefit screening of WALL*E, and I want to be careful not to spill too much regarding the movie. I had the added privilege of watching the film at Pixar, which in and of itself, was amazing.This picture is not a cartoon; it is a film. In fact, it even has the LOOK of film. One of my complaints of more recent 3-D/CG animated films (not from Pixar) is that they all seem to look the same... clean lines, crisp colors, and very \"virtual\", for lack of a better term. WALL*E transcends the typical look of CG animation, and has a true to life \"grit.\" The creators at Pixar are true artists, and are indeed masters of their craft. Not only are they masters of the technology, they are masters of telling a story. WALL*E is no exception.The best way to describe the film is as a science fiction, comedy, dramatic love story. WALL*E, as a character, has dimension, personality, and heart... pretty impressive given that he is essentially a trash compactor. It is true that there is little dialogue in this feature, but I personally did not feel it detracted from the story at all.WALL*E is very much a different Pixar film from it's previous features. I will be curious to see how it is received by others, but in my opinion, I think Pixar has stayed true to itself, demonstrating a commitment to telling great stories and pushing the edge of technology to leave your jaw dropping! My most sincere compliments to Andrew Stanton, Jim Morris, John Lasseter, Ben Burtt, and all the creative forces at Pixar. Can't wait to see what the future brings... 
"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
## [3] " We went to the San Francisco Film Institute's first public screening at their campus in Emeryville. Everyone's sworn to secrecy, but for a film with little dialog, it carries more of an emotional punch and has a richer story than any live-action movie this year. The tone and style of the film is completely different for Pixar, and Disney haven't tried to override the darker thematic elements at all, making the story surprisingly three-dimensional.This will end up being the animated film of the year and I had the same 'wow' feeling as after seeing Ratatouille. Considering that animated films have always played second-fiddle to live-action, and have been aimed at kids, it's ironic that once again Pixar produces a film that rivals any live action on every level. Bravo! "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
## [4] " I can't say enough about how good this movie, that you probably haven't read, so I'm going to keep this short.This is the best thing out there in theater's right now, and might just be the best animated film of all time, whether you believe that or not, is your own opinion, but what Pixar has done here, can put companies like Dreamworks, Sony, and Blue sky to shame.Wall E also may go down as the most lovable character ever to grace the movie screen, I praise Ben Burtt and Andrew Stanton, and the people at Pixar for what they did, and will continue to do.This is why Pixar is the top studio in the world.10/10 "
# One row per line of review text. seq_along() keeps the ids the same length
# as the text even when the file is empty, where 1:length() would give
# c(1, 0) and make data.frame() fail on mismatched lengths.
data <- data.frame(id = seq_along(temp.text), text = temp.text,
                   stringsAsFactors = FALSE)
dim(data)
## [1] 143   2
# Build the stop-word list from an interactively chosen file plus tm's
# English list. tm:: is written out because the tokenizers package exports a
# function with the same name.
stpw1 <- readLines(con = file.choose())
stpw2 <- tm::stopwords('english')
comn <- unique(c(stpw1, stpw2))           # both lists combined, de-duplicated

# Apostrophes become spaces (text.clean() also turns them into spaces), and
# the result is de-duplicated again to give the final stop-word list.
stopwords <- unique(gsub("'", " ", comn))

# Pre-process the corpus, remove the stop words built above, then collapse
# the whitespace left behind.
x <- stripWhitespace(removeWords(text.clean(data[["text"]]), stopwords))
# x  =  stemDocument(x)
Create DTM using text2vec package
# Start the timer that is reported once the DTM has been built.
t1 <- Sys.time()

tok_fun <- word_tokenizer

# Iterator over the cleaned documents, tokenised with word_tokenizer and
# labelled with the review ids.
it_m <- itoken(x,
               # preprocessor = text.clean,
               tokenizer = tok_fun,
               ids = data$id,
               progressbar = TRUE)  # TRUE rather than T: T is an ordinary, reassignable variable

# Vocabulary over all tokens; the n-gram and stop-word options stay disabled.
vocab <- create_vocabulary(it_m
                           #ngram = c(2L, 2L),
                           #stopwords = stopwords
)
## 
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
# term_count_min = 1 is the only active pruning rule; the document-proportion
# limits below are left commented out.
pruned_vocab <- prune_vocabulary(vocab, term_count_min = 1)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)

# Vectorizer built from the pruned vocabulary, then the document-term matrix
# created from the token iterator.
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm_m <- create_dtm(it_m, vectorizer)
## 
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
dim(dtm_m)
## [1]  143 3936
# Convert to a tm DocumentTermMatrix with term-frequency weighting.
dtm <- as.DocumentTermMatrix(dtm_m, weighting = weightTf)
# Flag the non-empty documents. slam::row_sums() sums the sparse matrix
# directly, whereas apply() first coerces it to a dense matrix.
a0 <- slam::row_sums(dtm) > 0
dtm <- dtm[a0, ]                # drop empty docs


print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.5080001 secs

Sentiment Analysis

# install.packages() returns NULL, so `require(qdap) || install.packages("qdap")`
# raises an error from `||` exactly when the package is missing. Check for the
# package explicitly and install it only if needed.
if (!requireNamespace("qdap", quietly = TRUE)) {
  install.packages("qdap") # ensure java is up to date!
}
library(qdap)

# Keep only the documents that were retained in the DTM (non-empty docs).
x1 <- x[a0]

t1 <- Sys.time()   # restart the timer

# Calculate polarity from the qdap dictionary, then extract the per-document
# columns by name.
pol <- polarity(x1)
wc <- pol$all[["wc"]]            # word count in each doc
val <- pol$all[["polarity"]]     # polarity score of each doc
p <- pol$all[["pos.words"]]      # positive words found
n <- pol$all[["neg.words"]]      # negative words found

Sys.time() - t1  # how long did the scoring take?
## Time difference of 9.508 secs
head(pol$all)
##   all  wc  polarity
## 1 all 222 1.3960046
## 2 all 116 1.4112846
## 3 all  59 0.0000000
## 4 all  36 0.6666667
## 5 all 405 0.8546749
## 6 all  55 1.6989837
##                                                                                                                                                                                                                                                                                                                                                                                                                               pos.words
## 1                                                                                                                                                  incredible, flawless, flawless, amazing, realistic, good, realistic, realistic, perfect, positive, perfect, brilliant, love, perfect, enjoy, lovable, fun, interesting, beautiful, perfect, work, lovable, great, tops, faith, magic, modern, won, greatest, achievements, encourage
## 2                                                                                                                                                                                                                                                                          advanced, benefit, privilege, amazing, clean, crisp, masters, masters, masters, love, pretty, impressive, commitment, great, jaw dropping, sincere, creative
## 3                                                                                                                                                                                                                                                                                                                                                                                                                    richer, wow, bravo
## 4                                                                                                                                                                                                                                                                                                                                                                                                     good, lovable, grace, praise, top
## 5 richly, entertaining, thrilling, satisfying, finest, pure, beautifully, magical, breathtakingly, gorgeous, fairly, realistic, cute, love, love, love, favorite, love, love, succeeds, uncomplicated, powerful, love, beauty, delightful, genius, free, top, work, great, important, clarity, powerful, survival, top, humor, work, stimulating, brilliantly, satisfying, great, easy, modern, classic, brilliance, awesome, entertain
## 6                                                                                                                                                                                                                                                                                                                        amazed, famous, loved, great, fun, humor, memorable, fun, revolutionary, credible, cute, adorable, love, loved
##                                                                                                                                                                                                                                                                                          neg.words
## 1                                                                                                                                                                 waste, hard, impossible, shaky, unrealistic, satirical, negative, cartoonish, fat, lazy, cheesy, break, cry, oddly, enemy, doubt
## 2                                                                                                                                                                                                                                                      complaints, lack, fiction, trash, detracted
## 3                                                                                                                                                                                                                                                                            punch, darker, ironic
## 4                                                                                                                                                                                                                                                                                            shame
## 5 unnecessary, complication, disgusting, trash, fell, fell, fell, falling, frightening, sadness, frustrating, difficulty, impossible, villains, peril, oddly, killed, damage, worried, killer, ambiguity, mystery, helplessness, alienation, audaciously, unresolved, unusual, unyielding, violent
## 6                                                                                                                                                                                                                                                                             fiction, hated, fall
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                text.var
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  wall pixar latest film robot named wall waste allocation load lifter earth class thing left earth sort emotion meets robot named eve trip begins hard describe words incredible personally find film animation flawless absolutely flawless earth robots real time impossible real slightly shaky styled shots times film makes animation amazing realistic humans good realistic sense real life realistic feel thing humans purposefully meant slightly rounded slightly unrealistic feeling reason satirical humans goals future 
perfect utopia questions shows outcome desires world showing positive negative effects animation humans made rounded slightly cartoonish emphasize fat lazy perfect times perspective humans real story brilliant things film meaning things nostalgic older viewers things younger viewers love steer cheesy clich aspects child film remaining completely rated film call child film adult material film perfect literally things person enjoy matter obvious homages stanley kubrick space odyssey adult remembers film notice wall lovable character felt emotion character history iconic darth vader indiana jones close crying pivotal parts film fully break cry felt emotion heart film eve fun interesting scene beautiful eyes teary chemistry wall oddly perfect appearance personality work robots lovable enemy robots add depth film cutest counting wall pixar made great animation films doubt tops films films restores faith animation films captures magic past disney films modern animation films surprised won picture year deserves greatest achievements cinematic history encourage 
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                        returned advanced benefit screening wall careful spill movie added privilege watching film pixar amazing picture cartoon film fact film complaints recent cg animated films pixar clean lines crisp colors virtual lack term wall transcends typical cg animation true life grit creators pixar true artists masters craft masters technology masters telling story wall exception describe film science fiction comedy dramatic love story wall character dimension personality heart pretty impressive essentially trash compactor true dialogue feature personally feel detracted story wall pixar film previous features curious received opinion pixar stayed true demonstrating commitment telling great stories pushing edge technology leave jaw dropping sincere compliments andrew stanton jim morris john lasseter ben burtt creative forces pixar wait future brings
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      san francisco film institute public screening campus emeryville sworn secrecy film dialog carries emotional punch richer story live action movie year tone style film completely pixar disney override darker thematic elements making story surprisingly dimensional end animated film year wow feeling ratatouille animated films played fiddle live action aimed kids ironic pixar produces film rivals live action level bravo
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                good movie read short thing theater animated film time opinion pixar put companies dreamworks sony blue sky shame wall lovable character grace movie screen praise ben burtt andrew stanton people pixar continue pixar top studio world
## 5 wall movie experience film richly entertaining thrilling touching satisfying spider man finest pixar animated cgi film date discuss spoilers easily films space odyssey exists pure experience heart senses collection events supposed track intellectually wall rises kind unnecessary complication kind space occupied dreams imagination film beautifully animated magical pixar point piles disgusting trash breathtakingly gorgeous fairly realistic roaches cute importantly heart emotion movie unlike experienced cinema forrest gump tear ducts welled watching movie fell character wall trailer watching movie fell love minutes shortly fell love idea wall falling love previous favorite movie romance superman lois lane original superman films love story love experience wall eve operates succeeds level couples create uncomplicated innocent simple deep powerful bond capture experience love sight writ large possess instant chemistry tells belong time makes root relationship film wall eve share moments real cinematic beauty true hilarity frightening sadness frustrating difficulty delightful satisfaction testament level genius pixar storytellers operating feel beat relationship resonate step fact characters robots modeled humans speak handful words movie animated movie refreshingly free obvious guest star voices top stand comedians upstage movie superman films care characters individuals care relationship impossible rest movie work hooked wall adds expected complications lovers time great mcguffin heroes villains busy item question outwardly simple ends holding key important world pacing adventures breakneck star wars films action staged crystal clarity scenes peril wall reminiscent oddly powerful sequence short circuit johnny killed filmmakers pull absolutely punches running heart ringer characters care helps lot physical damage robot character human character keeping rating audience dramatically worried survival top action emotion visuals humor wall extra mile thought provoking 
thematic territory film hits head preachy outright opinions subjects raises explicitly lay explanations exists wall world talking killer scenes verbal exposition bits ambiguity work add sense mystery helplessness alienation characters movie feel degree human characters movie humans shown robot world purpose robots designed serve curiosity earlier cgi movie robots humans wall developed robots exist represent humanity individuals asked ponder consequences choices make society moving direction person wall eve heart movie humans add intellectual gravity audience chew choices made movie leave room debate integration live action footage film movie audaciously stimulating brilliantly satisfying left unresolved unusual things question great emotional visual roller coaster experience wall serves audience movie experience easy watch modern day classic earn place cinema history cgi animated films movies indisputable brilliance unyielding imagination unending entertainment footnote pre movie short awesome violent looney tunes roger rabbit esquire toon entertain 
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          screener theater amazed kind person science fiction movies famous movies genre star wars matrix space odyssey hated theater couple sci fi lovers loved wall belief wall great story packed fun humor built memorable characters fun ages revolutionary animation techniques world pixar creates credible forget animation thumbs wall cute adorable character fall love immediately loved eve
# Preview the leading rows of the group-level polarity summary
head(pol$group, n = 6L)
##   all total.sentences total.words ave.polarity sd.polarity
## 1 all             139       12759    0.1363164   0.8865236
##   stan.mean.polarity
## 1          0.1537651
# Flatten the per-sentence word lists and discard the "-" entries
# (presumably the placeholder for "no matching word" -- confirm against how
# `p` and `n` are built). setdiff() already returns distinct values, so the
# result contains each word once, in order of first appearance.
positive_words <- setdiff(unlist(p), "-")  # Positive words list
negative_words <- setdiff(unlist(n), "-")  # Negative words list

# Show every distinct positive word
print(positive_words)
##   [1] "incredible"      "flawless"        "amazing"        
##   [4] "realistic"       "good"            "perfect"        
##   [7] "positive"        "brilliant"       "love"           
##  [10] "enjoy"           "lovable"         "fun"            
##  [13] "interesting"     "beautiful"       "work"           
##  [16] "great"           "tops"            "faith"          
##  [19] "magic"           "modern"          "won"            
##  [22] "greatest"        "achievements"    "encourage"      
##  [25] "advanced"        "benefit"         "privilege"      
##  [28] "clean"           "crisp"           "masters"        
##  [31] "pretty"          "impressive"      "commitment"     
##  [34] "jaw dropping"    "sincere"         "creative"       
##  [37] "richer"          "wow"             "bravo"          
##  [40] "grace"           "praise"          "top"            
##  [43] "richly"          "entertaining"    "thrilling"      
##  [46] "satisfying"      "finest"          "pure"           
##  [49] "beautifully"     "magical"         "breathtakingly" 
##  [52] "gorgeous"        "fairly"          "cute"           
##  [55] "favorite"        "succeeds"        "uncomplicated"  
##  [58] "powerful"        "beauty"          "delightful"     
##  [61] "genius"          "free"            "important"      
##  [64] "clarity"         "survival"        "humor"          
##  [67] "stimulating"     "brilliantly"     "easy"           
##  [70] "classic"         "brilliance"      "awesome"        
##  [73] "entertain"       "amazed"          "famous"         
##  [76] "loved"           "memorable"       "revolutionary"  
##  [79] "credible"        "adorable"        "gold"           
##  [82] "worth"           "charming"        "safe"           
##  [85] "adored"          "recommend"       "wonderful"      
##  [88] "imaginative"     "excellent"       "succeeded"      
##  [91] "hilarious"       "humble"          "novelty"        
##  [94] "stimulates"      "enjoys"          "sleek"          
##  [97] "darling"         "intelligence"    "delight"        
## [100] "relief"          "win"             "silent"         
## [103] "playful"         "joy"             "strong"         
## [106] "wonderfully"     "vivid"           "mesmerizing"    
## [109] "terrific"        "effectively"     "rich"           
## [112] "humour"          "fabulous"        "support"        
## [115] "magnificent"     "joyous"          "winner"         
## [118] "promising"       "ideal"           "pride"          
## [121] "ingenious"       "abundance"       "correct"        
## [124] "happy"           "comfortable"     "engaging"       
## [127] "intelligent"     "sensitive"       "courageous"     
## [130] "precious"        "delicate"        "restored"       
## [133] "veritable"       "bright"          "skillfully"     
## [136] "humorous"        "likable"         "reputation"     
## [139] "spectacular"     "loving"          "acclaimed"      
## [142] "masterpiece"     "wholeheartedly"  "originality"    
## [145] "generous"        "fine"            "ready"          
## [148] "worthy"          "endearing"       "pleasant"       
## [151] "appealing"       "consistently"    "merit"          
## [154] "popular"         "accurate"        "compact"        
## [157] "inspiration"     "dedicated"       "affection"      
## [160] "exciting"        "dynamic"         "romantic"       
## [163] "effective"       "supports"        "helping"        
## [166] "fantastic"       "top notch"       "powerfully"     
## [169] "meaningful"      "simplified"      "timely"         
## [172] "clever"          "nice"            "notably"        
## [175] "courage"         "wonders"         "enhance"        
## [178] "innovative"      "talented"        "marvel"         
## [181] "heartfelt"       "masterful"       "sweet"          
## [184] "gained"          "colorful"        "famed"          
## [187] "neat"            "keenly"          "smitten"        
## [190] "amusing"         "leading"         "lovely"         
## [193] "convincing"      "marvelous"       "leads"          
## [196] "dazzle"          "revel"           "vibrant"        
## [199] "maturity"        "humorously"      "entertains"     
## [202] "distinction"     "assuredly"       "suitable"       
## [205] "prefer"          "beloved"         "happily"        
## [208] "gentle"          "classy"          "uplifting"      
## [211] "warm"            "defeat"          "success"        
## [214] "clear"           "rightful"        "smooth"         
## [217] "incredibly"      "enjoyed"         "fast"           
## [220] "perfectly"       "triumph"         "perfection"     
## [223] "outdone"         "worked"          "futuristic"     
## [226] "refreshing"      "hero"            "encouraging"    
## [229] "cool"            "capable"         "grand"          
## [232] "fair"            "works"           "awards"         
## [235] "enchanted"       "appeal"          "obsession"      
## [238] "enjoyment"       "gladly"          "stellar"        
## [241] "defeats"         "honest"          "breathtaking"   
## [244] "genuine"         "majesty"         "humane"         
## [247] "friendly"        "brave"           "likes"          
## [250] "accolades"       "manageable"      "superbly"       
## [253] "soulful"         "homage"          "fiery"          
## [256] "proud"           "tougher"         "worthwhile"     
## [259] "soft"            "acclaim"         "recommended"    
## [262] "passion"         "nicely"          "exceptional"    
## [265] "charm"           "inventive"       "guarantee"      
## [268] "understandable"  "reliable"        "bravery"        
## [271] "glorious"        "confident"       "marvellous"     
## [274] "magnificently"   "enthralled"      "flawlessly"     
## [277] "masterfully"     "legendary"       "superb"         
## [280] "lead"            "satisfied"       "enjoyable"      
## [283] "pleasure"        "heartwarming"    "captivating"    
## [286] "significant"     "masterpieces"    "glad"           
## [289] "convenient"      "smile"           "talent"         
## [292] "entranced"       "approval"        "super"          
## [295] "gem"             "excited"         "excitement"     
## [298] "solid"           "successes"       "outdo"          
## [301] "effortlessly"    "captivate"       "influential"    
## [304] "warmth"          "remarkable"      "excitedly"      
## [307] "eagerly"         "believable"      "trust"          
## [310] "stunning"        "stunningly"      "achievement"    
## [313] "luxury"          "passionately"    "happiness"      
## [316] "monumental"      "admiration"      "tickle"         
## [319] "successful"      "brighter"        "empathy"        
## [322] "amazingly"       "exceeded"        "effortless"     
## [325] "comfort"         "variety"         "heaven"         
## [328] "innocuous"       "fascination"     "kindness"       
## [331] "loyal"           "rapt"            "loyalty"        
## [334] "spirited"        "smiling"         "effusively"     
## [337] "faithful"        "smart"           "superior"       
## [340] "shine"           "accomplish"      "amazes"         
## [343] "thankful"        "praising"        "correctly"      
## [346] "impeccable"      "impressed"       "amuse"          
## [349] "intricate"       "fairness"        "smarter"        
## [352] "enchanting"      "tidy"            "noble"          
## [355] "clears"          "protect"         "thoughtfully"   
## [358] "stupendous"      "smartly"         "surpass"        
## [361] "feat"            "groundbreaking"  "pinnacle"       
## [364] "satisfy"         "wholesome"       "cheer"          
## [367] "splendidly"      "poised"          "awe"            
## [370] "inspiring"       "suffice"         "kindly"         
## [373] "respect"         "tough"           "award"          
## [376] "indebted"        "righteous"       "righteousness"  
## [379] "trusted"         "gain"            "outstanding"    
## [382] "decent"          "loves"           "calm"           
## [385] "cuteness"        "quiet"           "angel"          
## [388] "eye catching"    "unforgettable"   "recommendations"
## [391] "destiny"         "excellence"      "cozy"           
## [394] "meticulous"      "fascinating"     "poignant"       
## [397] "led"             "renowned"        "proves"         
## [400] "instantly"       "strongest"       "mind blowing"   
## [403] "wise"            "liking"          "sufficient"     
## [406] "stable"          "defeated"        "ease"           
## [409] "continuity"      "ethical"
print(negative_words)       # Print the de-duplicated negative words list built above
##   [1] "waste"          "hard"           "impossible"     "shaky"         
##   [5] "unrealistic"    "satirical"      "negative"       "cartoonish"    
##   [9] "fat"            "lazy"           "cheesy"         "break"         
##  [13] "cry"            "oddly"          "enemy"          "doubt"         
##  [17] "complaints"     "lack"           "fiction"        "trash"         
##  [21] "detracted"      "punch"          "darker"         "ironic"        
##  [25] "shame"          "unnecessary"    "complication"   "disgusting"    
##  [29] "fell"           "falling"        "frightening"    "sadness"       
##  [33] "frustrating"    "difficulty"     "villains"       "peril"         
##  [37] "killed"         "damage"         "worried"        "killer"        
##  [41] "ambiguity"      "mystery"        "helplessness"   "alienation"    
##  [45] "audaciously"    "unresolved"     "unusual"        "unyielding"    
##  [49] "violent"        "hated"          "fall"           "bad"           
##  [53] "garbage"        "worry"          "overweight"     "wrong"         
##  [57] "slap"           "sad"            "stress"         "lifeless"      
##  [61] "pointless"      "vice"           "defensive"      "sadly"         
##  [65] "jaded"          "blind"          "boring"         "sucker"        
##  [69] "loneliness"     "funny"          "silly"          "horrid"        
##  [73] "wasted"         "plot"           "cheated"        "bloated"       
##  [77] "maddening"      "mess"           "unbearable"     "dragging"      
##  [81] "tired"          "frantic"        "dud"            "criticized"    
##  [85] "dreary"         "limp"           "problems"       "vain"          
##  [89] "ignorance"      "unfamiliar"     "blow"           "craven"        
##  [93] "helpless"       "hopelessly"     "stuck"          "tiring"        
##  [97] "hideously"      "insufferable"   "dumb"           "stupid"        
## [101] "whine"          "moody"          "mediocre"       "unprepared"    
## [105] "dismayed"       "banal"          "concerned"      "mindlessly"    
## [109] "disaster"       "flawed"         "obese"          "imbecile"      
## [113] "manipulative"   "evil"           "worst"          "sour"          
## [117] "depressing"     "excuse"         "degenerate"     "morons"        
## [121] "rusty"          "wanton"         "destruction"    "subjected"     
## [125] "loss"           "shamelessly"    "stolen"         "idiots"        
## [129] "propaganda"     "overrated"      "fail"           "rubbish"       
## [133] "annoying"       "monotonous"     "drags"          "torture"       
## [137] "warned"         "clumsy"         "disagree"       "lacked"        
## [141] "joke"           "lacks"          "lost"           "ridiculous"    
## [145] "shockingly"     "kill"           "harsh"          "blah"          
## [149] "disdain"        "uncomfortable"  "unpleasant"     "apocalyptic"   
## [153] "mad"            "aghast"         "inferior"       "dreadful"      
## [157] "scathing"       "refuse"         "dead"           "naive"         
## [161] "slowly"         "spoil"          "dark"           "compulsion"    
## [165] "fails"          "utterly"        "dirty"          "disarray"      
## [169] "debt"           "plunder"        "pathetic"       "cautionary"    
## [173] "limits"         "sickly"         "ruined"         "monstrous"     
## [177] "ominous"        "cheap"          "broken"         "lonely"        
## [181] "junk"           "heartbreaking"  "unknown"        "rogue"         
## [185] "fool"           "gloom"          "rampant"        "overblown"     
## [189] "unnerving"      "scary"          "falls"          "hostile"       
## [193] "dangerous"      "desperate"      "crowded"        "unable"        
## [197] "concern"        "spite"          "disdainful"     "dies"          
## [201] "loses"          "stumbled"       "bastard"        "rotten"        
## [205] "sarcastic"      "senseless"      "pigs"           "ruins"         
## [209] "dull"           "hopeless"       "insane"         "trashed"       
## [213] "errors"         "defiantly"      "problem"        "failed"        
## [217] "long time"      "warning"        "missed"         "loud"          
## [221] "strictly"       "contrived"      "irritating"     "unnatural"     
## [225] "awkward"        "unexpected"     "headache"       "incessant"     
## [229] "terrible"       "horrible"       "awful"          "hate"          
## [233] "sick"           "crap"           "bull"           "mistakes"      
## [237] "gross"          "pretentious"    "bored"          "weak"          
## [241] "badly"          "disliked"       "aversion"       "bewildered"    
## [245] "aggravating"    "crazy"          "revolting"      "indoctrination"
## [249] "dislike"        "pricey"         "irrational"     "desperately"   
## [253] "heartless"      "foolish"        "haunting"       "died"          
## [257] "cold"           "crushed"        "dump"           "miss"          
## [261] "strange"        "twist"          "noisy"          "dusty"         
## [265] "spurn"          "object"         "difficult"      "breaking"      
## [269] "temper"         "poor"           "adversary"      "mistake"       
## [273] "bum"            "frenzy"         "slow"           "lose"          
## [277] "excessively"    "risk"           "barren"         "sucked"        
## [281] "attack"         "falter"         "conflict"       "droop"         
## [285] "outburst"       "sneak"          "embarrassing"   "dust"          
## [289] "fictional"      "nightmare"      "lame"           "uncomfortably" 
## [293] "morbidly"       "squeals"        "slower"         "wasting"       
## [297] "simplistic"     "cynical"        "flaw"           "sorrow"        
## [301] "impaired"       "mundane"        "lags"           "critics"       
## [305] "tortured"       "critic"         "drones"         "pity"          
## [309] "retards"        "confused"       "frustrated"     "guilt"         
## [313] "fuss"           "overrun"        "irony"          "slow moving"   
## [317] "devoid"         "aggressive"     "destructive"    "bothered"      
## [321] "abysmal"        "ugly"           "filthy"         "idiotic"       
## [325] "trashy"         "negate"         "denying"        "disappointment"
## [329] "yawn"           "bug"            "downhill"       "drain"         
## [333] "moron"          "boredom"        "mysterious"     "betrayal"      
## [337] "losing"         "cramped"        "apocalypse"     "dirt"          
## [341] "disappointed"   "trouble"        "worn"           "limit"         
## [345] "oblivious"      "restricted"     "limited"        "hindrance"     
## [349] "sneaky"         "innuendo"       "explosive"      "torturous"     
## [353] "exhausted"      "imperfect"      "weakness"       "decrepit"      
## [357] "conflicts"      "stark"          "appallingly"    "criticism"     
## [361] "accusation"     "complicated"    "antagonism"     "intimidating"  
## [365] "problematic"    "bores"          "lacking"        "blame"         
## [369] "infested"       "poverty"        "appalling"      "isolate"       
## [373] "uneventful"     "adversity"      "hurt"           "pain"          
## [377] "desolate"       "ignore"         "apathy"         "uproariously"  
## [381] "tramp"          "grating"        "hell"           "mockery"       
## [385] "discredit"      "ripped"         "bash"           "tarnished"     
## [389] "cynicism"       "stall"          "sucks"          "crappy"        
## [393] "immature"       "worse"          "monster"        "junkyard"      
## [397] "crushing"       "embarrass"      "crashes"        "miserably"     
## [401] "selfish"        "issues"         "catastrophe"    "guilty"        
## [405] "broke"          "faltered"       "deny"           "damn"          
## [409] "submissive"     "bleak"          "mishap"         "corrupt"       
## [413] "wary"           "fret"           "alarming"       "bizarre"       
## [417] "malevolent"     "careless"       "passive"        "stern"         
## [421] "sedentary"      "devastating"    "begging"        "struggle"      
## [425] "abuse"          "gravely"        "confusing"      "twists"        
## [429] "worthless"      "comical"        "damned"         "twisted"       
## [433] "dying"          "disturbing"     "offensive"      "callous"       
## [437] "inaccurate"     "disturbed"      "insensitive"    "angry"         
## [441] "slaves"         "hater"          "scared"         "pollute"       
## [445] "frighten"       "suffer"         "fear"           "afraid"        
## [449] "suffered"       "phobia"         "suffering"      "issue"         
## [453] "subversive"     "contradict"     "ambiguous"      "dilapidated"   
## [457] "inattentive"    "obscure"        "unpredictable"  "regret"        
## [461] "static"         "breaks"         "vague"          "complex"       
## [465] "ordeal"         "dismay"         "retarded"       "villainous"    
## [469] "reluctantly"    "death"          "flaws"          "sloth"         
## [473] "meaningless"    "indoctrinate"   "blatantly"      "puzzled"       
## [477] "challenging"    "crush"          "weird"
# Restrict the DTM to the columns whose term appears in the positive-word list
pos.tdm = dtm[,which(colnames(dtm) %in% positive_words)]
m = as.matrix(pos.tdm)
v = sort(colSums(m), decreasing = TRUE)   # total count per positive term, largest first
windows() # opens new image window
# the unnamed 1 is the minimum frequency a word needs to be drawn
wordcloud(names(v), v, scale=c(4,1),1, max.words=100,colors=brewer.pal(8, "Dark2"))
title(sub = "Positive Words - Wordcloud")

# plot barchart for top tokens
# head() returns fewer rows instead of NA padding when under 15 terms exist
top_terms = head(v, 15)
# explicit term/freq columns so aes() maps vectors, not the whole data frame
test = data.frame(term = as.character(names(top_terms)),
                  freq = as.numeric(top_terms),
                  stringsAsFactors = FALSE)
windows() # opens new image window
ggplot(test, aes(x = term, y = freq)) + 
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = freq), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

#--------------------------------------------------------#
#  Create Negative Words wordcloud                       #
#--------------------------------------------------------#

# Restrict the DTM to the columns whose term appears in the negative-word list
neg.tdm = dtm[,which(colnames(dtm) %in% negative_words) ]
m = as.matrix(neg.tdm)
v = sort(colSums(m), decreasing = TRUE)   # total count per negative term, largest first
windows()
# the unnamed 1 is the minimum frequency a word needs to be drawn
wordcloud(names(v), v, scale=c(4,1),1, max.words=100,colors=brewer.pal(8, "Dark2"))
title(sub = "Negative Words - Wordcloud")

# plot barchart for top tokens
# head() returns fewer rows instead of NA padding when under 15 terms exist
top_terms = head(v, 15)
# explicit term/freq columns so aes() maps vectors, not the whole data frame
test = data.frame(term = as.character(names(top_terms)),
                  freq = as.numeric(top_terms),
                  stringsAsFactors = FALSE)
windows()
ggplot(test, aes(x = term, y = freq)) + 
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = freq), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

#--------------------------------------------------------#
#  Positive words vs Negative Words plot                 #
#--------------------------------------------------------#

# Count the words in one entry of a per-document word list.
#   x : a character vector (or list wrapping one) of matched words;
#       a lone "-" is treated as the "no words" placeholder.
# Returns 0 for the placeholder or empty input, otherwise the number of words.
len = function(x){
  words = unlist(x)
  # Check the length first: the if() condition must be a single TRUE/FALSE,
  # so comparing a multi-word vector with "-" directly is not safe.
  if (length(words) == 1 && identical(as.character(words), "-")) {return (0)}
  length(words)
}

# Per-document counts of positive (p) and negative (n) words
pcount = unlist(lapply(p, len))
ncount = unlist(lapply(n, len))
# seq_along() yields an empty index for empty input, where seq(1:length(wc)) gives c(1, 0)
# NOTE(review): assumes wc has one entry per document, matching p and n -- confirm
doc_id = seq_along(wc)

windows()
# ylim covers both series so the negative-word line is not clipped to the
# range of the positive counts
plot(doc_id,pcount,type="l",col="green",xlab = "Document ID", ylab= "Word Count",
     ylim = range(c(pcount, ncount)))
lines(doc_id,ncount,type= "l", col="red")
title(main = "Positive words vs Negative Words" )
legend("topright", inset=.05, c("Positive Words","Negative Words"), fill=c("green","red"), horiz=TRUE)

# Document Sentiment Running plot
windows()
plot(pol$all$polarity, type = "l", ylab = "Polarity Score",xlab = "Document Number")
abline(h=0)   # zero line separates positive from negative polarity scores
title(main = "Polarity Plot" )

#############################################
# Start the timer that is reported once the final DTM has been built
t1 <- Sys.time()

# Word-level (not whitespace) tokenizer
tok_fun <- word_tokenizer

# Token iterator over the text in x, one document per data$id
# (no preprocessor is applied)
it_0 <- itoken(x, tokenizer = tok_fun, ids = data$id, progressbar = TRUE)

# Vocabulary of bigrams only (ngram min = max = 2); no stopword filtering
vocab <- create_vocabulary(it_0, ngram = c(2L, 2L))
## 
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
# length(vocab); str(vocab)     # view what vocab obj is like

# Keep only bigrams that occur at least 10 times in the corpus
# (the document-proportion filters are left disabled)
pruned_vocab <- prune_vocabulary(vocab, term_count_min = 10)

# length(pruned_vocab);  str(pruned_vocab)

# Vectorizer built from the pruned bigram vocabulary
vectorizer <- vocab_vectorizer(pruned_vocab)

# Document-by-bigram count matrix
dtm_0 <- create_dtm(it_0, vectorizer)
## 
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
# Total frequency of each bigram across all documents, most frequent first
tsum <- as.matrix(
  t(rollup(dtm_0, 1, na.rm = TRUE, FUN = sum))   # collapse the document dimension by summing
)
tsum <- tsum[order(tsum, decreasing = TRUE), ]   # becomes a named vector, descending
head(tsum)
##       wall_eve   finding_nemo     love_story andrew_stanton      toy_story 
##             30             20             16             15             14 
##    movie_movie 
##             14
tail(tsum)
##   disney_pixar animated_films     pixar_film      ben_burtt    pixar_movie 
##             12             12             12             11             10 
##  animated_film 
##             10
# # select Top 1000 bigrams to unigram
# if (length(tsum) > 1000) {n = 1000} else {n = length(tsum)}
# tsum = tsum[1:n]

#-------------------------------------------------------
# Code bi-grams as unigram in clean text corpus

text2 = x
text2 = paste("",text2,"")   # pad with spaces so the first and last word also have a space on both sides

n_bigrams = length(tsum)
if (n_bigrams > 0) {
  # min = 0 so the bar is also valid when there is a single bigram
  pb <- txtProgressBar(min = 0, max = n_bigrams, style = 3) ; i = 0

  for (term in names(tsum)){
    i = i + 1
    focal.term = gsub("_", " ", term, fixed = TRUE)   # "wall_eve" -> "wall eve"
    replacement.term = term
    # Lookarounds require a space on each side without consuming it, so
    # back-to-back occurrences ("a b a b") are all recoded; \Q...\E makes the
    # term match literally instead of being read as a regular expression.
    focal.pattern = paste0("(?<= )\\Q", focal.term, "\\E(?= )")
    text2 = gsub(focal.pattern, replacement.term, text2, perl = TRUE)
    setTxtProgressBar(pb, i)
  }
  close(pb)
}
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=================================================================| 100%
# Token iterator over the bigram-coded text (no preprocessor applied)
it_m <- itoken(text2, tokenizer = tok_fun, ids = data$id, progressbar = TRUE)

# Plain unigram vocabulary: default ngram setting, no stopword filtering
vocab <- create_vocabulary(it_m)
## 
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
# length(vocab); str(vocab)     # view what vocab obj is like

# A minimum count of 1 keeps every term; the document-proportion filters stay disabled
pruned_vocab <- prune_vocabulary(vocab, term_count_min = 1)

# Vectorizer for the full vocabulary
vectorizer <- vocab_vectorizer(pruned_vocab)

# Document-term matrix over the bigram-coded text
dtm_m <- create_dtm(it_m, vectorizer)
## 
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
dim(dtm_m)
## [1]  143 3948
# Convert to a tm DocumentTermMatrix with plain term-frequency weighting
dtm <- as.DocumentTermMatrix(dtm_m, weighting = weightTf)

# Keep only documents that contain at least one term
a0 <- apply(dtm, 1, sum) > 0
dtm <- dtm[a0, ]

# Report the time elapsed since t1 was set
elapsed <- difftime(Sys.time(), t1, units = 'sec')
print(elapsed)
## Time difference of 0.8889999 secs
# Reorder the DTM columns from most to least frequent term, then show a sample
term_totals <- apply(dtm, 2, sum)
dtm <- dtm[, order(term_totals, decreasing = TRUE)]
inspect(dtm[1:5, 1:5])     # top-left 5 x 5 corner of the sorted DTM
## <<DocumentTermMatrix (documents: 5, terms: 5)>>
## Non-/sparse entries: 19/6
## Sparsity           : 24%
## Maximal term length: 5
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs movie wall film pixar robot
##    1     0    5   12     2     2
##    2     1    5    4     5     0
##    3     1    0    4     2     0
##    4     2    1    0     3     0
##    5    16   10    6     3     2
#   1- Using Term frequency(tf)             

# Column totals are computed slice by slice (at most 100 slices) so that only
# one slice of the DTM is converted to a dense matrix at a time.
# Boundaries are strictly increasing: unique() drops repeats when there are
# fewer than 100 columns, so a slice is never empty or reversed.
b = unique(floor(seq(0, ncol(dtm), length.out = 101)))

n_chunks = length(b) - 1
chunk_sums = vector("list", n_chunks)   # preallocated; filled one slice per iteration
for (i in seq_len(n_chunks)) {
  tempdtm = dtm[,(b[i]+1):(b[i+1])]
  chunk_sums[[i]] = colSums(as.matrix(tempdtm))
  print(i)   # progress indicator
}
ss.col = unlist(chunk_sums)   # named vector: total frequency per term
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
## [1] 69
## [1] 70
## [1] 71
## [1] 72
## [1] 73
## [1] 74
## [1] 75
## [1] 76
## [1] 77
## [1] 78
## [1] 79
## [1] 80
## [1] 81
## [1] 82
## [1] 83
## [1] 84
## [1] 85
## [1] 86
## [1] 87
## [1] 88
## [1] 89
## [1] 90
## [1] 91
## [1] 92
## [1] 93
## [1] 94
## [1] 95
## [1] 96
## [1] 97
## [1] 98
## [1] 99
## [1] 100
# Term totals ordered from most to least frequent
tsum <- ss.col[order(ss.col, decreasing = TRUE)]
head(tsum)
## movie  wall  film pixar robot earth 
##   324   323   220   131   109   106
tail(tsum)
##  wechat destiny  symbol    king aladdin mermaid 
##       1       1       1       1       1       1
windows()  # New plot window
wordcloud(names(tsum), tsum,     # words, their freqs 
          scale = c(4, 0.5),     # range of word sizes
          1,                     # min.freq of words to consider
          max.words = 200,       # max #words
          colors = brewer.pal(8, "Dark2"))    # Plot results in a word cloud 
title(sub = "Term Frequency - Wordcloud")     # title for the wordcloud display

# plot barchart for top tokens
# head() returns fewer rows instead of NA padding when under 15 terms exist
top_terms = head(tsum, 15)
# explicit term/freq columns so aes() maps vectors, not the whole data frame
test = data.frame(term = as.character(names(top_terms)),
                  freq = round(as.numeric(top_terms), 0),
                  stringsAsFactors = FALSE)

windows()  # New plot window
ggplot(test, aes(x = term, y = freq)) + 
  geom_bar(stat = "identity", fill = "Blue") +
  geom_text(aes(label = freq), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

dev.off() # [graphical] device off / close it down
## png 
##   2
# -------------------------------------------------------------- #
# step 2b - Using Term frequency inverse document frequency (tfidf)             
# -------------------------------------------------------------- #

# Attach textir; install.packages() only runs when require() returns FALSE,
# because || short-circuits on a TRUE left-hand side.
require(textir) || install.packages("textir")
## Loading required package: textir
## Warning: package 'textir' was built under R version 3.3.2
## Loading required package: distrom
## Warning: package 'distrom' was built under R version 3.3.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:qdap':
## 
##     %&%
## Loading required package: gamlr
## Warning: package 'gamlr' was built under R version 3.3.2
## Loading required package: parallel
## 
## Attaching package: 'distrom'
## The following object is masked from 'package:nlme':
## 
##     collapse
## [1] TRUE
library(textir)
# Re-weight the term-frequency DTM by TF-IDF (normalize = FALSE)
dtm.tfidf = tfidf(dtm, normalize=FALSE)

# Column totals are computed slice by slice (at most 100 slices) so that only
# one slice is converted to a dense matrix at a time.
# Boundaries are strictly increasing: unique() drops repeats when there are
# fewer than 100 columns, so a slice is never empty or reversed.
b = unique(floor(seq(0, ncol(dtm.tfidf), length.out = 101)))

n_chunks = length(b) - 1
chunk_sums = vector("list", n_chunks)   # preallocated; filled one slice per iteration
for (i in seq_len(n_chunks)) {
  # drop = FALSE keeps a one-column slice as a matrix so its term name survives
  tempdtm = dtm.tfidf[,(b[i]+1):(b[i+1]), drop = FALSE]
  chunk_sums[[i]] = colSums(as.matrix(tempdtm))
  print(i)   # progress indicator
}
ss.col = unlist(chunk_sums)   # named vector: summed TF-IDF weight per term
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
## [1] 69
## [1] 70
## [1] 71
## [1] 72
## [1] 73
## [1] 74
## [1] 75
## [1] 76
## [1] 77
## [1] 78
## [1] 79
## [1] 80
## [1] 81
## [1] 82
## [1] 83
## [1] 84
## [1] 85
## [1] 86
## [1] 87
## [1] 88
## [1] 89
## [1] 90
## [1] 91
## [1] 92
## [1] 93
## [1] 94
## [1] 95
## [1] 96
## [1] 97
## [1] 98
## [1] 99
## [1] 100
# Summed TF-IDF weight per term, largest first
tsum <- ss.col[order(ss.col, decreasing = TRUE)]
head(tsum)
##     wall     film    earth    robot      eve    movie 
## 151.3468 144.7177 121.9301 109.2887 105.7491 100.2784
tail(tsum)
##   wechat  destiny   symbol     king  aladdin  mermaid 
## 4.241327 4.241327 4.241327 4.241327 4.241327 4.241327
windows()  # New plot window
# Words are sized by their summed TF-IDF weight (tsum); the unnamed 1 is the
# minimum weight a word needs to be drawn, and at most 200 words are plotted.
wordcloud(names(tsum), tsum, scale=c(4,0.5),1, max.words=200,colors=brewer.pal(8, "Dark2")) # Plot results in a word cloud 
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : andrew_stanton could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : quality could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : spaceship could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : disney_pixar could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : favorite could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : love_story could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : comedy could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : ben_burtt could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : robot could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : interesting could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : boring could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : end could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : finding_nemo could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : space could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : ratatouille could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : scene could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : experience could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : piece could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : simple could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : robots could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : friend could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : dialog could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : film could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : movie_movie could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : stars could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(tsum), tsum, scale = c(4, 0.5), 1, max.words =
## 200, : year could not be fit on page. It will not be plotted.
title(sub = "Term Frequency Inverse Document Frequency - Wordcloud")

as.matrix(tsum[1:20])     #  to see the top few tokens & their summed TF-IDF scores
##                 [,1]
## wall       151.34676
## film       144.71772
## earth      121.93014
## robot      109.28866
## eve        105.74914
## movie      100.27836
## animation   93.99577
## humans      88.96386
## robots      83.30244
## walle       81.60813
## human       80.11347
## pixar       79.09002
## movies      78.79656
## good        77.79228
## love        75.66137
## films       73.59727
## space       71.81378
## characters  71.31763
## story       70.48121
## great       69.59141
(dtm.tfidf)[1:10, 1:10]   # view first 10x10 cells in the DTM under TF IDF.
## 10 x 10 sparse Matrix of class "dgCMatrix"
##    [[ suppressing 10 column names 'movie', 'wall', 'film' ... ]]
##                                                                     
## 1  .         2.3428291 7.893694 1.2074812 2.005297 3.450853 6.634995
## 2  0.3095011 2.3428291 2.631231 3.0187030 .        .        1.105833
## 3  0.3095011 .         2.631231 1.2074812 .        .        .       
## 4  0.6190022 0.4685658 .        1.8112218 .        .        .       
## 5  4.9520179 4.6856581 3.946847 1.8112218 2.005297 .        .       
## 6  .         1.4056974 .        0.6037406 .        .        2.211665
## 7  2.4760090 .         1.973423 .         3.007945 4.601137 3.317498
## 8  0.3095011 1.8742633 1.315616 1.2074812 .        .        .       
## 9  0.6190022 2.3428291 2.631231 .         2.005297 1.150284 .       
## 10 .         1.4056974 .        1.2074812 1.002648 1.150284 2.211665
##                              
## 1  1.022451 2.937476 3.519821
## 2  .        .        .       
## 3  .        .        .       
## 4  .        .        .       
## 5  5.112255 .        4.693095
## 6  1.022451 1.468738 .       
## 7  2.044902 5.874952 1.173274
## 8  .        .        .       
## 9  .        4.406214 1.173274
## 10 2.044902 1.468738 .
# plot barchart for top tokens
# head() returns fewer rows instead of NA padding when under 15 terms exist
top_terms = head(tsum, 15)
# explicit term/freq columns so aes() maps vectors, not the whole data frame
test = data.frame(term = as.character(names(top_terms)),
                  freq = round(as.numeric(top_terms), 0),
                  stringsAsFactors = FALSE)
windows()  # New plot window
ggplot(test, aes(x = term, y = freq)) + 
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = freq), vjust= -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

dev.off()
## png 
##   2
#------------------------------------------------------#
# step 2c - Term Co-occurance Matrix (TCM)             #
#------------------------------------------------------#

# Vectorizer configured for co-occurrence counting: no DTM growth,
# skip-gram window of 5
vectorizer <- vocab_vectorizer(
  pruned_vocab,
  grow_dtm = FALSE,
  skip_grams_window = 5L
)

# Term co-occurrence matrix over the bigram-coded text
tcm <- create_tcm(it_m, vectorizer)
## 
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
tcm.mat = as.matrix(tcm)         # use tcm.mat[1:5, 1:5] to view
adj.mat = tcm.mat + t(tcm.mat)   # since adjacency matrices are symmetric

# Order terms by total co-occurrence, largest first
z = order(colSums(adj.mat), decreasing = TRUE)
adj.mat = adj.mat[z,z]

# Plot Simple Term Co-occurrence graph
# Use the top 30 terms, or all of them when the vocabulary is smaller,
# so the subset never indexes past the matrix.
top_n = seq_len(min(30, nrow(adj.mat)))
adj = adj.mat[top_n, top_n, drop = FALSE]

library(igraph)
cog = graph.adjacency(adj, mode = 'undirected')
cog =  simplify(cog)   # collapse multi-edges and remove self-loops

cog = delete.vertices(cog, V(cog)[ degree(cog) == 0 ])   # drop unconnected terms

windows()
plot(cog)

#-----------------------------------------------------------#
# Step 2d - a cleaned up or 'distilled' COG PLot            #
#-----------------------------------------------------------#

# Plot a 'distilled' co-occurrence graph: for each of the s highest-ranked
# terms, keep only its strongest k1 links and draw the resulting network.
#   mat1  : square term-by-term matrix with matching row and column names
#   title : main title of the plot
#   s     : number of central terms (top rows) to process
#   k1    : maximum number of links kept per central term (ties at the
#           cut-off value may keep more)
# Called for its side effect (a plot on the current device).
distill.cog = function(mat1, # input TCM ADJ MAT
                       title, # title for the graph
                       s,    # no. of central nodes
                       k1){  # max no. of connections  
  library(igraph)
  a = colSums(mat1) # collect colsums into a vector obj a
  b = order(-a)     # ordering vector in decreasing order of column total

  mat2 = mat1[b, b, drop = FALSE]     # order both rows and columns along vector b

  diag(mat2) =  0   # ignore self co-occurrence

  # Never request more central terms or links than the matrix provides
  s = min(s, nrow(mat2))
  k1 = min(k1, ncol(mat2))

  ## +++ go row by row and find top k adjacencies +++ ##

  wc = NULL   # names of all terms kept as a neighbour of some central term

  for (i1 in seq_len(s)){ 
    row.vals = mat2[i1, ]
    thresh1 = row.vals[order(-row.vals)[k1]]   # k1-th largest value in this row
    mat2[i1, row.vals < thresh1] = 0           # drop links weaker than the cut-off
    mat2[i1, mat2[i1,] > 0 ] = 1               # binarise the links that remain
    # Take names from colnames(): subsetting a single cell of a matrix that
    # has both row and column names returns an unnamed value.
    word = colnames(mat2)[mat2[i1,] > 0]
    # A term already claimed here is cleared in all later rows; skipped on the
    # last row, where (i1+1):nrow(mat2) would run backwards.
    if (i1 < nrow(mat2)) {
      mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
    }
    wc = c(wc,word)
  } # i1 loop ends

  # Restrict to the kept terms, then restore the original (ranked) term order
  keep = match(wc, colnames(mat2))
  mat3 = mat2[keep, keep, drop = FALSE]
  ord = colnames(mat2)[colnames(mat2) %in% colnames(mat3)]
  pos = match(ord, colnames(mat3))
  mat4 = mat3[pos, pos, drop = FALSE]
  graph <- graph.adjacency(mat4, mode = "undirected", weighted = TRUE)    # Create Network object
  graph = simplify(graph) 

  # First s vertices green, the rest pink. Built as one full-length vector so
  # the assignment is valid for any vertex count, including fewer than s.
  n.vertices = length(V(graph))
  vertex.colors = rep("pink", n.vertices)
  vertex.colors[seq_len(min(s, n.vertices))] = "green"
  V(graph)$color = vertex.colors

  graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ]) # remove vertices with no links

  plot(graph, 
       layout = layout.kamada.kawai, 
       main = title)

} # func ends

windows()
# Distilled graph from the raw co-occurrence matrix: s = 10 central terms, k1 = 5 links each
distill.cog(tcm.mat, 'Distilled COG',  10,  5)
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length

## adj.mat and distilled cog for tfidf DTMs ##

# Term-by-term matrix of TF-IDF cross-products; the diagonal (a term with itself) is zeroed
adj.mat = t(dtm.tfidf) %*% dtm.tfidf
diag(adj.mat) = 0
# colSums() works on the sparse matrix directly; apply() would first convert
# the whole term-by-term matrix to dense form
a0 = order(colSums(adj.mat), decreasing = TRUE)
# head() keeps the top 50 terms, or all of them when fewer exist, avoiding NA indices
top.terms = head(a0, 50)
adj.mat = as.matrix(adj.mat[top.terms, top.terms, drop = FALSE])

windows()
distill.cog(adj.mat, 'Distilled COG',  10,  10)

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.