# Install topicmodels only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("topicmodels", quietly = TRUE)) {
  install.packages("topicmodels")
}

library(topicmodels)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)  

Manually create a stopword list: the standard English stopwords plus some additional words

# Stopword list used when cleaning the corpus: tm's English stopwords followed
# by extra words listed by hand. unique() drops any word that appears more than
# once (the hand-written list previously repeated "will").
my_stopwords <- unique(c(
  stopwords("english"),
  "will", "also", "etc", "else", "can", "even", "within", "without",
  "well", "say", "year", "must", "need", "never", "now", "want", "still",
  "time", "therefore", "send", "today", "may", "many", "make", "whose",
  "however", "get", "have", "just", "him"
))

======create document term matrix=====

read in file

# NOTE(review): this working directory is specific to one machine; point it at
# the folder that holds MovieStories.utf8.txt before running.
setwd("C:/Users/ngsook/Documents")

# Read the tab-separated file (with a header row) into a data frame.
# quote = "" disables quote handling, so apostrophes and quotation marks inside
# the stories are read literally. encoding = "UTF-8" marks the strings as
# UTF-8; without it, non-ASCII characters can be misread on platforms whose
# native encoding is not UTF-8 (e.g. "émigré" ending up as "ã©migrã©").
textdata <- read.delim("MovieStories.utf8.txt", header = TRUE, sep = "\t",
                       quote = "", stringsAsFactors = FALSE,
                       encoding = "UTF-8")

# Preview the first three rows.
head(textdata, 3)
##   MovieID CategoryGross
## 1       1             3
## 2       2             4
## 3       3             1
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Storyline
## 1 An action-packed drama about a Christian high school football coach who uses his undying faith to battle the giants of fear and failure. In six years of coaching, Grant Taylor has never led his Shiloh Eagles to a winning season. After learning that he and his wife Brooke face infertility, Grant discovers that a group of fathers are secretly organizing to have him dismissed as head coach. Devastated by his circumstances, he cries out to God in desperation. When Grant receives a message from an unexpected visitor, he searches for a stronger purpose for his football team. He dares to challenge his players to believe God for the impossible on and off the field. When faced with unbelievable odds, the Eagles must step up to their greatest test of strength and courage. What transpires is a dynamic story of the fight between faith and fear. 
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        This holiday comedy is centered around two neighbors in a small New England town who go to war when one of them decides to decorate his house with a so many Christmas lights that they are visible from space. The neighborhood is turned upside down as the families try to discover the true meaning of Christmas. 
## 3                                                                                                                                                                                         The day after they get the word they'll go home in two weeks, a group of soldiers from Spokane are ambushed in an Iraqi city. Back stateside we follow four of them - a surgeon who saw too much, a teacher who's a single mom and who lost a hand in the ambush, an infantry man whose best friend died that day, and a soldier who keeps reliving the moment he killed a civilian woman. Each of the four has come home changed, each feels dislocation. Group therapy, V.A. services, halting gestures from family and colleagues, and regular flashbacks keep the war front and center in their minds. They're angry, touchy, and explosive: can a warrior find peace back home?
# Number of rows and columns in the loaded data frame.
dim(textdata)
## [1] 1053    3

get the movie stories

# Pull out the third column (the storylines) as a plain vector.
text <- textdata[[3]]
# Keep the first storyline on its own as a single example document.
atext <- text[1]

# Open the help page for sapply() (interactive aid; its result is not used).
help(sapply)

a little data exploration - how long are the stories?

split each story into words on spaces

# Word count per story: split each story on single spaces and count the pieces.
# vapply() guarantees an integer vector even when `text` is empty, where
# sapply() would return an empty list. The story text is still used as the
# element names, exactly as with sapply().
doclen <- vapply(text, function(x) length(strsplit(x, " ")[[1]]), integer(1))
# Compact summary of the resulting vector.
str(doclen)
##  Named int [1:1053] 145 56 120 110 22 31 120 156 119 139 ...
##  - attr(*, "names")= chr [1:1053] "An action-packed drama about a Christian high school football coach who uses his undying faith to battle the gi"| __truncated__ "This holiday comedy is centered around two neighbors in a small New England town who go to war when one of them"| __truncated__ "The day after they get the word they'll go home in two weeks, a group of soldiers from Spokane are ambushed in "| __truncated__ "This is the story of Doogal, an adorable candy-loving mutt who goes on a mission to save the world. Doogal must"| __truncated__ ...

Explore the doc length

# Count how many stories have each word count.
table(doclen)
## doclen
##   3   7  11  13  14  15  17  18  19  21  22  23  24  25  26  27  28  29 
##   1   1   1   2   1   2   5   2   2   4   1   4   4   5   2   1   4   3 
##  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47 
##   1   7   3   2   4   3   3   4   1   6   5   7   6   7  11   7   8   3 
##  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65 
##   8  13  10   7   7  13  11   7   9  10   6   9   7   4  10   6   8   8 
##  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83 
##  13   4  12   8   9  10   9  10   7  12  11  10   8   5   7  11   9   4 
##  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 
##   7   6   6   9   4   7   7  13   3  10  11   3   8   8  12   8   7   7 
## 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 
##  10  11   8   7   6   6  10  11  10   9  12  14   8  14   5  12  11   7 
## 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 
##   7   6   8  11   7   5   1   5   4   6   6   3   4   4   7   4   5   9 
## 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 
##   6   4   4   5   4   4   3   5   6   5   4   7   5  11   5  10   9   3 
## 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 
##  11   7   7   7   9   6   6   7   3   3   9   7   3   1   2   1   1   3 
## 178 181 182 183 187 
##   2   1   1   1   1
# Histogram of the story lengths.
hist(doclen)

# First five word counts; each value is labelled with its full story text
# because doclen carries the stories as names.
head(doclen,5)
## An action-packed drama about a Christian high school football coach who uses his undying faith to battle the giants of fear and failure. In six years of coaching, Grant Taylor has never led his Shiloh Eagles to a winning season. After learning that he and his wife Brooke face infertility, Grant discovers that a group of fathers are secretly organizing to have him dismissed as head coach. Devastated by his circumstances, he cries out to God in desperation. When Grant receives a message from an unexpected visitor, he searches for a stronger purpose for his football team. He dares to challenge his players to believe God for the impossible on and off the field. When faced with unbelievable odds, the Eagles must step up to their greatest test of strength and courage. What transpires is a dynamic story of the fight between faith and fear.  
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           145 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        This holiday comedy is centered around two neighbors in a small New England town who go to war when one of them decides to decorate his house with a so many Christmas lights that they are visible from space. The neighborhood is turned upside down as the families try to discover the true meaning of Christmas.  
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            56 
##                                                                                                                                                                                         The day after they get the word they'll go home in two weeks, a group of soldiers from Spokane are ambushed in an Iraqi city. Back stateside we follow four of them - a surgeon who saw too much, a teacher who's a single mom and who lost a hand in the ambush, an infantry man whose best friend died that day, and a soldier who keeps reliving the moment he killed a civilian woman. Each of the four has come home changed, each feels dislocation. Group therapy, V.A. services, halting gestures from family and colleagues, and regular flashbacks keep the war front and center in their minds. They're angry, touchy, and explosive: can a warrior find peace back home?  
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           120 
##                                                                                                                                                                                           This is the story of Doogal, an adorable candy-loving mutt who goes on a mission to save the world. Doogal must prevent the evil sorcerer Zeebad from freezing the earth forever with the power of the three mysterious legendary diamonds. Joining Doogal on his big quest are pals Dylan, a guitar-playing rabbit, Ermintrude, an opera-singing cow, and Brian, a bashful snail. Hopping on a magic train, they travel over ice-capped mountains, navigate fiery pits of lava, and sail across vast oceans on the perilous journey of a lifetime. Along the way, they learn that the most powerful weapon of all is their friendship - which even Zeebad's magic cannot destroy!  
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           110 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        A drama that focuses on the period in Mary and Joseph's life where they journeyed to Bethlehem for the birth of Jesus. 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            22
# Show the first two stories.
head(text,2)
## [1] "An action-packed drama about a Christian high school football coach who uses his undying faith to battle the giants of fear and failure. In six years of coaching, Grant Taylor has never led his Shiloh Eagles to a winning season. After learning that he and his wife Brooke face infertility, Grant discovers that a group of fathers are secretly organizing to have him dismissed as head coach. Devastated by his circumstances, he cries out to God in desperation. When Grant receives a message from an unexpected visitor, he searches for a stronger purpose for his football team. He dares to challenge his players to believe God for the impossible on and off the field. When faced with unbelievable odds, the Eagles must step up to their greatest test of strength and courage. What transpires is a dynamic story of the fight between faith and fear. "
## [2] "This holiday comedy is centered around two neighbors in a small New England town who go to war when one of them decides to decorate his house with a so many Christmas lights that they are visible from space. The neighborhood is turned upside down as the families try to discover the true meaning of Christmas. "

For MacOS

# Convert the stories from UTF-8 to the session's native encoding (iconv()'s
# default target when `to` is not given). Straight quotes are required here;
# typographic quotes are a syntax error in R.
text <- iconv(text, from = "UTF-8")

create dtm using TF indexing

# Build a corpus with one document per story, then fold the cleaning steps over
# it. The steps run strictly in the order listed.
corpus <- Reduce(
  function(docs, step) tm_map(docs, step),
  list(
    content_transformer(tolower),                  # convert to lower case
    removeNumbers,                                 # remove digits
    function(doc) removeWords(doc, my_stopwords),  # stopwords, before stemming
    removePunctuation,                             # strip punctuation
    stemDocument,                                  # word stemming
    function(doc) removeWords(doc, my_stopwords),  # stopwords, after stemming
    stripWhitespace                                # collapse "a  b" -> "a b"
  ),
  VCorpus(VectorSource(text))
)

# Document-term matrix of the cleaned corpus.
dtm <- DocumentTermMatrix(corpus)

word cloud to check roughly what’s in the data

# Total count of every term over all documents, most frequent first.
tf <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
# Six-colour "Dark2" palette from RColorBrewer.
dark2 <- brewer.pal(n = 6, name = "Dark2")
# Word cloud limited to 20 words, sized by total count.
wordcloud(
  words = names(tf),
  freq = tf,
  scale = c(2, .04),
  max.words = 20,
  colors = dark2
)

====topic modeling=========

LDA is a Bayesian mixture model.

Two estimation methods are available for LDA: VEM and Gibbs.

let’s use Gibbs method

# Fit a 5-topic LDA model by Gibbs sampling. The sampler is random, so a fixed
# seed is supplied to make the fitted topics repeatable from run to run.
lda_5_g <- LDA(dtm, k = 5, method = "Gibbs", control = list(seed = 1234))

====what is in the topics?====

Look at the most frequent terms for each topic

# The 10 top-ranked terms of each topic, one column per topic.
terms(lda_5_g, 10)
##       Topic 1    Topic 2        Topic 3    Topic 4  Topic 5
##  [1,] "find"     "life"         "world"    "friend" "man"  
##  [2,] "famili"   "new"          "name"     "becom"  "one"  
##  [3,] "father"   "love"         "forc"     "way"    "work" 
##  [4,] "tri"      "live"         "back"     "school" "two"  
##  [5,] "old"      "stori"        "discov"   "girl"   "wife" 
##  [6,] "young"    "meet"         "power"    "turn"   "death"
##  [7,] "home"     "day"          "american" "take"   "kill" 
##  [8,] "daughter" "end"          "set"      "decid"  "agent"
##  [9,] "son"      "two"          "brother"  "team"   "job"  
## [10,] "mother"   "relationship" "one"      "help"   "look"

Function logLik() gives us the log-likelihood of the model,

which is the sum over the log-likelihoods of all documents,

maximized during maximum likelihood estimation of the model

# Log-likelihood of the fitted 5-topic model.
logLik(lda_5_g)
## 'log Lik.' -395079.6 (df=47505)

use “@terms” to find out the terms in columns

beta, logarithmized parameters of the word distribution for topic N

# First ten entries of the model's term vector.
lda_5_g@terms[1:10]
##  [1] "ã©migrã©" "aaa"      "aaron"    "abandon"  "abbi"     "abbott"  
##  [7] "abdic"    "abduct"   "abigail"  "abil"
# Row 3 of beta (topic 3), first ten columns: logarithmized word parameters.
lda_5_g@beta[3, 1:10]
##  [1] -11.645453 -11.645453 -11.645453 -11.645453  -8.211465 -11.645453
##  [7] -11.645453 -11.645453 -11.645453 -11.645453

We can produce wordcloud for more intuitive visualisation of the topics.

Let’s create a helper function which takes in a topic model and the index

of a topic and generates a wordcloud for this topic.

get the matrix of probabilities of words over topics - the beta

name the columns of the matrix with the corresponding terms

get the ith topic (a vector of word probabilities) and sort them in decreasing order

display the top 20 most frequent words in wordcloud

# Draw a word cloud of the highest-probability terms of one topic.
#
# m      : fitted topic model exposing @beta (logarithmized word parameters,
#          one row per topic) and @terms (the term for each column of @beta).
# i      : index of the topic (row of @beta) to plot.
# n      : number of top terms to show (default 20).
# colors : colours passed to wordcloud(); defaults to the 6-colour "Dark2"
#          palette.
#
# Returns whatever wordcloud() returns; called for the plot it draws.
showcloud <- function(m, i, n = 20, colors = brewer.pal(6, "Dark2")) {
  stopifnot(length(i) == 1, i >= 1, i <= nrow(m@beta))
  log_probs <- m@beta[i, ]
  names(log_probs) <- m@terms
  # Highest log-probability first; head() avoids NA padding when the model has
  # fewer than n terms.
  top <- head(sort(log_probs, decreasing = TRUE), n)
  # beta is on the natural-log scale, so exp() (not 2^x) recovers the word
  # probabilities that size the words.
  wordcloud(names(top), exp(top), scale = c(2, .04), rot.per = 0.3,
            colors = colors)
}

showcloud(lda_5_g, 5) # word cloud for topic 5 of the 5-topic Gibbs model

Now how do we know which document belongs to which topic?

Let us get the ranking of all 5 topics for the first ten documents.

E.g. the 1st document ranks topic 4 first and topic 3 second.

# For the first ten documents, list the 5 topics in ranked order
# (one row per document, best-ranked topic in the first column).
t(topics(lda_5_g, 5))[1:10,]
##    [,1] [,2] [,3] [,4] [,5]
## 1     4    3    1    5    2
## 2     1    3    4    2    5
## 3     2    1    3    5    4
## 4     3    4    1    2    5
## 5     1    2    5    3    4
## 6     5    2    4    1    3
## 7     4    1    3    2    5
## 8     4    5    2    1    3
## 9     3    1    5    2    4
## 10    3    4    1    2    5

Which topic has the largest number of documents?

# Index of the topic that is the top-ranked topic for the most documents.
which.max(tabulate(topics(lda_5_g)))
## [1] 2
# Number of documents whose top-ranked topic is 1, 2, ... (positional counts).
tabulate(topics(lda_5_g))
## [1] 226 238 208 186 195
# The same counts, labelled with the topic numbers.
table(topics(lda_5_g))
## 
##   1   2   3   4   5 
## 226 238 208 186 195

gamma, posterior topic distribution for each document, gives the actual probabilities

let’s look at the probabilities of the first document belonging to each of the topics

# Row 1 of gamma: the first document's value for each of the 5 topics.
lda_5_g@gamma[1,]
## [1] 0.16935484 0.09677419 0.27419355 0.33064516 0.12903226
# The same values as a bar chart, bars labelled 1 to 5.
barplot(lda_5_g@gamma[1,], names.arg=1:5, main="Topic distribution of Story 1")

We may also build topic models using a matrix with its sparsity reduced

# Drop the sparsest terms (sparsity threshold 0.998), then print a summary of
# the reduced matrix.
dtm_slim <- removeSparseTerms(dtm, 0.998)
dtm_slim
## <<DocumentTermMatrix (documents: 1053, terms: 3057)>>
## Non-/sparse entries: 38157/3180864
## Sparsity           : 99%
## Maximal term length: 15
## Weighting          : term frequency (tf)
# Fit a 5-topic Gibbs LDA model on the reduced matrix. A fixed seed makes the
# fitted topics repeatable from run to run.
lda_5_g_s <- LDA(dtm_slim, k = 5, method = "Gibbs", control = list(seed = 1234))

Are you still getting the same topics?

# The 10 top-ranked terms of each topic for the model fitted on dtm_slim.
terms(lda_5_g_s, 10)
##       Topic 1 Topic 2  Topic 3    Topic 4   Topic 5   
##  [1,] "new"   "friend" "world"    "man"     "find"    
##  [2,] "life"  "take"   "one"      "work"    "famili"  
##  [3,] "live"  "school" "back"     "one"     "father"  
##  [4,] "love"  "decid"  "forc"     "wife"    "old"     
##  [5,] "day"   "becom"  "discov"   "kill"    "home"    
##  [6,] "stori" "girl"   "power"    "call"    "daughter"
##  [7,] "meet"  "way"    "american" "look"    "son"     
##  [8,] "young" "help"   "use"      "murder"  "mother"  
##  [9,] "fall"  "team"   "secret"   "mysteri" "come"    
## [10,] "begin" "boy"    "name"     "death"   "hous"

However, with more sparse terms removed (e.g. at 0.995), some documents with few words may end

up with no non-zero values in their row, which makes LDA() unable to run

# Apply a stricter sparsity threshold (0.995), then print a summary of the
# reduced matrix.
dtm_skinny <- removeSparseTerms(dtm, 0.995)
dtm_skinny
## <<DocumentTermMatrix (documents: 1053, terms: 1659)>>
## Non-/sparse entries: 32913/1714014
## Sparsity           : 98%
## Maximal term length: 15
## Weighting          : term frequency (tf)
### LDA(dtm_skinny, 5, method="Gibbs") fails because some rows of the input matrix have no non-zero entry

The following error may be observed:

#### Error in LDA(dtm_skinny, 5, method = "Gibbs") :
#### Each row of the input matrix needs to contain at least one non-zero entry
#### Therefore, we need to remove those 'empty' rows from the matrix first.
#### This is what you can do:

Find the total count of words in each Document

# Total term count of each document (row sums of the document-term matrix).
rowTotals <- rowSums(as.matrix(dtm_skinny))

remove all documents left with 0 words after removeSparseTerms()

# Keep only the documents (rows) whose total term count is above zero.
dtm_skinny   <- dtm_skinny[rowTotals> 0, ] 

Now try

1. build the topic models again using Gibbs method. This should be much faster than just now.

Since it’s much faster, let’s try getting more topics.

# Fit Gibbs LDA models with 5, 7 and 10 topics on the reduced matrix. A fixed
# seed makes each fit repeatable from run to run.
lda_5_g_sk <- LDA(dtm_skinny, k = 5, method = "Gibbs", control = list(seed = 1234))
lda_7_g_sk <- LDA(dtm_skinny, k = 7, method = "Gibbs", control = list(seed = 1234))
lda_10_g_sk <- LDA(dtm_skinny, k = 10, method = "Gibbs", control = list(seed = 1234))

2. check out the topics. Do they still make sense?

# The 10 top-ranked terms of each topic in the 5-topic model.
terms(lda_5_g_sk, 10)
##       Topic 1    Topic 2 Topic 3  Topic 4  Topic 5
##  [1,] "world"    "two"   "life"   "friend" "find" 
##  [2,] "back"     "man"   "famili" "becom"  "new"  
##  [3,] "name"     "work"  "love"   "take"   "one"  
##  [4,] "forc"     "tri"   "young"  "way"    "old"  
##  [5,] "american" "woman" "father" "decid"  "come" 
##  [6,] "power"    "death" "stori"  "school" "citi" 
##  [7,] "discov"   "call"  "meet"   "help"   "york" 
##  [8,] "use"      "kill"  "day"    "girl"   "town" 
##  [9,] "war"      "job"   "home"   "team"   "soon" 
## [10,] "secret"   "agent" "mother" "high"   "three"
# The 10 top-ranked terms of each topic in the 7-topic model.
terms(lda_7_g_sk, 10)
##       Topic 1    Topic 2  Topic 3        Topic 4    Topic 5   Topic 6
##  [1,] "famili"   "one"    "new"          "world"    "work"    "take" 
##  [2,] "find"     "day"    "life"         "forc"     "man"     "begin"
##  [3,] "father"   "back"   "love"         "power"    "kill"    "town" 
##  [4,] "old"      "two"    "stori"        "set"      "becom"   "start"
##  [5,] "home"     "decid"  "live"         "american" "call"    "job"  
##  [6,] "meet"     "come"   "relationship" "one"      "death"   "plan" 
##  [7,] "daughter" "live"   "citi"         "save"     "agent"   "leav" 
##  [8,] "mother"   "discov" "woman"        "war"      "secret"  "local"
##  [9,] "son"      "wife"   "york"         "find"     "murder"  "men"  
## [10,] "hous"     "three"  "young"        "group"    "mysteri" "first"
##       Topic 7  
##  [1,] "friend" 
##  [2,] "school" 
##  [3,] "way"    
##  [4,] "help"   
##  [5,] "team"   
##  [6,] "soon"   
##  [7,] "girl"   
##  [8,] "high"   
##  [9,] "game"   
## [10,] "student"
# The 10 top-ranked terms of each topic in the 10-topic model.
terms(lda_10_g_sk, 10)
##       Topic 1  Topic 2 Topic 3    Topic 4  Topic 5    Topic 6 Topic 7  
##  [1,] "find"   "one"   "world"    "friend" "man"      "way"   "life"   
##  [2,] "tri"    "live"  "power"    "school" "two"      "team"  "becom"  
##  [3,] "name"   "plan"  "forc"     "decid"  "kill"     "help"  "stori"  
##  [4,] "town"   "three" "war"      "turn"   "call"     "back"  "take"   
##  [5,] "discov" "know"  "american" "boy"    "agent"    "game"  "end"    
##  [6,] "death"  "job"   "face"     "see"    "mysteri"  "die"   "film"   
##  [7,] "leav"   "money" "human"    "old"    "investig" "wife"  "chang"  
##  [8,] "local"  "anoth" "fight"    "high"   "drug"     "put"   "like"   
##  [9,] "soon"   "place" "train"    "girl"   "murder"   "win"   "struggl"
## [10,] "car"    "stop"  "battl"    "best"   "dead"     "men"   "togeth" 
##       Topic 8        Topic 9 Topic 10  
##  [1,] "new"          "love"  "famili"  
##  [2,] "work"         "day"   "father"  
##  [3,] "citi"         "meet"  "son"     
##  [4,] "york"         "young" "home"    
##  [5,] "relationship" "fall"  "daughter"
##  [6,] "life"         "woman" "mother"  
##  [7,] "set"          "two"   "hous"    
##  [8,] "realiz"       "come"  "move"    
##  [9,] "girlfriend"   "john"  "night"   
## [10,] "seem"         "peopl" "old"

Explore the topic distribution of document 1

# Bar chart of the first document's topic values (row 1 of gamma) for the
# 10-, 5- and 7-topic models, in that order; bars are labelled 1..k.
invisible(lapply(
  list(lda_10_g_sk, lda_5_g_sk, lda_7_g_sk),
  function(model) {
    topic_share <- model@gamma[1, ]
    barplot(
      topic_share,
      names.arg = seq_along(topic_share),
      main = "Topic distribution of Story 1"
    )
  }
))