library(topicmodels)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
# Stop-word list used when cleaning the corpus: tm's built-in English list plus
# corpus-specific filler words. unique() drops repeated entries (the
# hand-written list used to name "will" twice) and any overlap between the
# hand-written words and the built-in list.
my_stopwords <- unique(c(
  stopwords("english"),
  "will", "also", "etc", "else", "can", "even", "within", "without",
  "well", "say", "year", "must", "need", "never", "now", "want", "still",
  "time", "therefore", "send", "today", "may", "many", "make", "whose",
  "however", "get", "have", "just", "him"
))
# Load the movie storylines: a tab-separated file with a header row and no
# quoting.
# NOTE(review): the hard-coded setwd() ties the script to one machine. It is
# kept here because other relative paths may depend on it -- confirm and
# replace with a project-relative path when possible.
setwd("C:/Users/ngsook/Documents")
# The file is UTF-8 (per its name). Declaring the encoding keeps accented words
# from being decoded as mojibake (e.g. the vocabulary entry "ã©migrã©" in the
# recorded output further down) when the session locale is not UTF-8.
textdata <- read.delim(
  "MovieStories.utf8.txt",
  header = TRUE,
  sep = "\t",
  quote = "",
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
)
# Preview the first three rows.
head(textdata, 3)
## MovieID CategoryGross
## 1 1 3
## 2 2 4
## 3 3 1
## Storyline
## 1 An action-packed drama about a Christian high school football coach who uses his undying faith to battle the giants of fear and failure. In six years of coaching, Grant Taylor has never led his Shiloh Eagles to a winning season. After learning that he and his wife Brooke face infertility, Grant discovers that a group of fathers are secretly organizing to have him dismissed as head coach. Devastated by his circumstances, he cries out to God in desperation. When Grant receives a message from an unexpected visitor, he searches for a stronger purpose for his football team. He dares to challenge his players to believe God for the impossible on and off the field. When faced with unbelievable odds, the Eagles must step up to their greatest test of strength and courage. What transpires is a dynamic story of the fight between faith and fear.
## 2 This holiday comedy is centered around two neighbors in a small New England town who go to war when one of them decides to decorate his house with a so many Christmas lights that they are visible from space. The neighborhood is turned upside down as the families try to discover the true meaning of Christmas.
## 3 The day after they get the word they'll go home in two weeks, a group of soldiers from Spokane are ambushed in an Iraqi city. Back stateside we follow four of them - a surgeon who saw too much, a teacher who's a single mom and who lost a hand in the ambush, an infantry man whose best friend died that day, and a soldier who keeps reliving the moment he killed a civilian woman. Each of the four has come home changed, each feels dislocation. Group therapy, V.A. services, halting gestures from family and colleagues, and regular flashbacks keep the war front and center in their minds. They're angry, touchy, and explosive: can a warrior find peace back home?
# Number of rows and columns in the loaded data frame.
dim(textdata)
## [1] 1053 3
# The storyline text is the third column; the first story is also kept on its
# own in `atext`.
text <- textdata[, 3]
atext <- textdata[1, 3]
# Words per story, counted by splitting on single spaces. lengths() returns a
# plain integer vector; sapply() over a character vector would attach every
# full storyline as an element name and return a list for empty input.
doclen <- lengths(strsplit(text, " "))
str(doclen)
## Named int [1:1053] 145 56 120 110 22 31 120 156 119 139 ...
## - attr(*, "names")= chr [1:1053] "An action-packed drama about a Christian high school football coach who uses his undying faith to battle the gi"| __truncated__ "This holiday comedy is centered around two neighbors in a small New England town who go to war when one of them"| __truncated__ "The day after they get the word they'll go home in two weeks, a group of soldiers from Spokane are ambushed in "| __truncated__ "This is the story of Doogal, an adorable candy-loving mutt who goes on a mission to save the world. Doogal must"| __truncated__ ...
# Frequency of each story length (number of stories per word count).
table(doclen)
## doclen
## 3 7 11 13 14 15 17 18 19 21 22 23 24 25 26 27 28 29
## 1 1 1 2 1 2 5 2 2 4 1 4 4 5 2 1 4 3
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
## 1 7 3 2 4 3 3 4 1 6 5 7 6 7 11 7 8 3
## 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
## 8 13 10 7 7 13 11 7 9 10 6 9 7 4 10 6 8 8
## 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
## 13 4 12 8 9 10 9 10 7 12 11 10 8 5 7 11 9 4
## 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
## 7 6 6 9 4 7 7 13 3 10 11 3 8 8 12 8 7 7
## 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## 10 11 8 7 6 6 10 11 10 9 12 14 8 14 5 12 11 7
## 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
## 7 6 8 11 7 5 1 5 4 6 6 3 4 4 7 4 5 9
## 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
## 6 4 4 5 4 4 3 5 6 5 4 7 5 11 5 10 9 3
## 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
## 11 7 7 7 9 6 6 7 3 3 9 7 3 1 2 1 1 3
## 178 181 182 183 187
## 2 1 1 1 1
# Histogram of story lengths, followed by the lengths of the first five stories.
hist(doclen)
head(doclen,5)
## An action-packed drama about a Christian high school football coach who uses his undying faith to battle the giants of fear and failure. In six years of coaching, Grant Taylor has never led his Shiloh Eagles to a winning season. After learning that he and his wife Brooke face infertility, Grant discovers that a group of fathers are secretly organizing to have him dismissed as head coach. Devastated by his circumstances, he cries out to God in desperation. When Grant receives a message from an unexpected visitor, he searches for a stronger purpose for his football team. He dares to challenge his players to believe God for the impossible on and off the field. When faced with unbelievable odds, the Eagles must step up to their greatest test of strength and courage. What transpires is a dynamic story of the fight between faith and fear.
## 145
## This holiday comedy is centered around two neighbors in a small New England town who go to war when one of them decides to decorate his house with a so many Christmas lights that they are visible from space. The neighborhood is turned upside down as the families try to discover the true meaning of Christmas.
## 56
## The day after they get the word they'll go home in two weeks, a group of soldiers from Spokane are ambushed in an Iraqi city. Back stateside we follow four of them - a surgeon who saw too much, a teacher who's a single mom and who lost a hand in the ambush, an infantry man whose best friend died that day, and a soldier who keeps reliving the moment he killed a civilian woman. Each of the four has come home changed, each feels dislocation. Group therapy, V.A. services, halting gestures from family and colleagues, and regular flashbacks keep the war front and center in their minds. They're angry, touchy, and explosive: can a warrior find peace back home?
## 120
## This is the story of Doogal, an adorable candy-loving mutt who goes on a mission to save the world. Doogal must prevent the evil sorcerer Zeebad from freezing the earth forever with the power of the three mysterious legendary diamonds. Joining Doogal on his big quest are pals Dylan, a guitar-playing rabbit, Ermintrude, an opera-singing cow, and Brian, a bashful snail. Hopping on a magic train, they travel over ice-capped mountains, navigate fiery pits of lava, and sail across vast oceans on the perilous journey of a lifetime. Along the way, they learn that the most powerful weapon of all is their friendship - which even Zeebad's magic cannot destroy!
## 110
## A drama that focuses on the period in Mary and Joseph's life where they journeyed to Bethlehem for the birth of Jesus.
## 22
# Show the raw text of the first two storylines.
head(text,2)
## [1] "An action-packed drama about a Christian high school football coach who uses his undying faith to battle the giants of fear and failure. In six years of coaching, Grant Taylor has never led his Shiloh Eagles to a winning season. After learning that he and his wife Brooke face infertility, Grant discovers that a group of fathers are secretly organizing to have him dismissed as head coach. Devastated by his circumstances, he cries out to God in desperation. When Grant receives a message from an unexpected visitor, he searches for a stronger purpose for his football team. He dares to challenge his players to believe God for the impossible on and off the field. When faced with unbelievable odds, the Eagles must step up to their greatest test of strength and courage. What transpires is a dynamic story of the fight between faith and fear. "
## [2] "This holiday comedy is centered around two neighbors in a small New England town who go to war when one of them decides to decorate his house with a so many Christmas lights that they are visible from space. The neighborhood is turned upside down as the families try to discover the true meaning of Christmas. "
# Build a corpus with one document per storyline and normalise the text.
corpus <- VCorpus(VectorSource(text))
corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lower case
corpus <- tm_map(corpus, removeNumbers) # remove digits
# First stop-word pass runs before punctuation is stripped -- presumably so
# that stop words containing apostrophes still match.
corpus <- tm_map(corpus, removeWords, my_stopwords)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument) # word stemming
# Second stop-word pass, applied to the stemmed text.
corpus <- tm_map(corpus, removeWords, my_stopwords)
# Collapse redundant whitespace: "a   b" -> "a b"
corpus <- tm_map(corpus, stripWhitespace)

# Document-term matrix and corpus-wide term frequencies, most frequent first.
dtm <- DocumentTermMatrix(corpus)
tf <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)

# Word cloud limited to 20 words, coloured with a 6-colour Dark2 palette.
dark2 <- brewer.pal(6, "Dark2")
wordcloud(names(tf), tf, max.words = 20, scale = c(2, .04), colors = dark2)

# Five-topic LDA fitted by Gibbs sampling. The sampler is stochastic, so the
# seed is fixed to make the fitted topics reproducible between runs.
lda_5_g <- LDA(dtm, 5, method = "Gibbs", control = list(seed = 1234L))
# Ten highest-ranked terms of each topic.
terms(lda_5_g, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "find" "life" "world" "friend" "man"
## [2,] "famili" "new" "name" "becom" "one"
## [3,] "father" "love" "forc" "way" "work"
## [4,] "tri" "live" "back" "school" "two"
## [5,] "old" "stori" "discov" "girl" "wife"
## [6,] "young" "meet" "power" "turn" "death"
## [7,] "home" "day" "american" "take" "kill"
## [8,] "daughter" "end" "set" "decid" "agent"
## [9,] "son" "two" "brother" "team" "job"
## [10,] "mother" "relationship" "one" "help" "look"
# Log-likelihood of the fitted five-topic model.
logLik(lda_5_g)
## 'log Lik.' -395079.6 (df=47505)
# First ten entries of the model's vocabulary.
lda_5_g@terms[1:10]
## [1] "ã©migrã©" "aaa" "aaron" "abandon" "abbi" "abbott"
## [7] "abdic" "abduct" "abigail" "abil"
# Row 3 of beta, restricted to the first ten vocabulary terms.
# NOTE(review): presumably log word weights for topic 3 (the values are
# negative) -- confirm against the topicmodels documentation.
lda_5_g@beta[3, 1:10]
## [1] -11.645453 -11.645453 -11.645453 -11.645453 -8.211465 -11.645453
## [7] -11.645453 -11.645453 -11.645453 -11.645453
# Draw a word cloud of the highest-weighted terms of one topic.
#
# Arguments:
#   m        fitted topic model exposing @beta (one row per topic, one column
#            per term) and @terms (the vocabulary).
#   i        index of the topic (row of m@beta) to display.
#   n_words  number of top terms to draw; defaults to 20, the value that was
#            previously hard-coded.
#   colors   palette handed to wordcloud(); defaults to the script-level
#            `dark2` palette the function has always used.
#
# Called for its plotting side effect; the value is whatever wordcloud()
# returns.
showcloud <- function(m, i, n_words = 20, colors = dark2) {
  stopifnot(length(i) == 1, i >= 1, i <= nrow(m@beta))
  topic_weights <- m@beta[i, ]
  names(topic_weights) <- m@terms
  # head() copes with vocabularies smaller than n_words; indexing with 1:20
  # would pad the result with NA entries.
  top <- head(sort(topic_weights, decreasing = TRUE), n_words)
  # NOTE(review): beta looks like log weights (recorded values are negative).
  # 2^ is kept from the original as a monotone rescaling -- confirm whether
  # exp() was intended.
  wordcloud(names(top), 2^top, scale = c(2, .04), rot.per = 0.3,
            colors = colors)
}
showcloud(lda_5_g, 5) # show the word cloud for the selected topic (topic 5)
# Topics returned by topics(lda_5_g, 5), transposed so that each row is a story;
# only the first ten stories are shown.
# NOTE(review): columns are presumably ordered from most to least likely topic
# -- confirm against the topicmodels documentation.
t(topics(lda_5_g, 5))[1:10,]
## [,1] [,2] [,3] [,4] [,5]
## 1 4 3 1 5 2
## 2 1 3 4 2 5
## 3 2 1 3 5 4
## 4 3 4 1 2 5
## 5 1 2 5 3 4
## 6 5 2 4 1 3
## 7 4 1 3 2 5
## 8 4 5 2 1 3
## 9 3 1 5 2 4
## 10 3 4 1 2 5
# Number of stories assigned to each topic by topics(), computed once and
# reused for the two displays below.
stories_per_topic <- tabulate(topics(lda_5_g))
which.max(stories_per_topic) # topic holding the most stories
## [1] 2
stories_per_topic
## [1] 226 238 208 186 195
# Same counts, labelled by topic number.
table(topics(lda_5_g))
##
## 1 2 3 4 5
## 226 238 208 186 195
# Row 1 of gamma (the topic mixture of the first story), printed then plotted.
story1_mix <- lda_5_g@gamma[1, ]
story1_mix
## [1] 0.16935484 0.09677419 0.27419355 0.33064516 0.12903226
barplot(story1_mix, names.arg = seq_len(5), main = "Topic distribution of Story 1")
# Prune the vocabulary: drop terms that are absent from more than 99.8% of the
# stories, then print a summary of the reduced matrix.
dtm_slim <- removeSparseTerms(dtm, 0.998)
dtm_slim
## <<DocumentTermMatrix (documents: 1053, terms: 3057)>>
## Non-/sparse entries: 38157/3180864
## Sparsity : 99%
## Maximal term length: 15
## Weighting : term frequency (tf)
# Refit the five-topic model on the pruned matrix. Gibbs sampling is
# stochastic, so the seed is fixed to make the result reproducible.
lda_5_g_s <- LDA(dtm_slim, 5, method = "Gibbs", control = list(seed = 1234L))
# Ten highest-ranked terms of each topic.
terms(lda_5_g_s, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "new" "friend" "world" "man" "find"
## [2,] "life" "take" "one" "work" "famili"
## [3,] "live" "school" "back" "one" "father"
## [4,] "love" "decid" "forc" "wife" "old"
## [5,] "day" "becom" "discov" "kill" "home"
## [6,] "stori" "girl" "power" "call" "daughter"
## [7,] "meet" "way" "american" "look" "son"
## [8,] "young" "help" "use" "murder" "mother"
## [9,] "fall" "team" "secret" "mysteri" "come"
## [10,] "begin" "boy" "name" "death" "hous"
# More aggressive pruning: drop terms that are absent from more than 99.5% of
# the stories, then print a summary of the reduced matrix.
dtm_skinny <- removeSparseTerms(dtm, 0.995)
dtm_skinny
## <<DocumentTermMatrix (documents: 1053, terms: 1659)>>
## Non-/sparse entries: 32913/1714014
## Sparsity : 98%
## Maximal term length: 15
## Weighting : term frequency (tf)
# Calling LDA(dtm_skinny, 5, method = "Gibbs") on this matrix fails with:
#   Error in LDA(dtm_skinny, 5, method = "Gibbs"):
#   Each row of the input matrix needs to contain at least one non-zero entry
# Some rows of the pruned matrix are all zero, so those "empty" rows are
# removed before fitting.
# NOTE: once rows are dropped, row positions in the fitted models (e.g. in
# @gamma) may no longer line up with the original story numbers.
rowTotals <- rowSums(as.matrix(dtm_skinny))
dtm_skinny <- dtm_skinny[rowTotals > 0, ]
# Fit 5-, 7- and 10-topic models. Gibbs sampling is stochastic, so the seed is
# fixed to make each fit reproducible.
lda_5_g_sk <- LDA(dtm_skinny, 5, method = "Gibbs", control = list(seed = 1234L))
lda_7_g_sk <- LDA(dtm_skinny, 7, method = "Gibbs", control = list(seed = 1234L))
lda_10_g_sk <- LDA(dtm_skinny, 10, method = "Gibbs", control = list(seed = 1234L))
# Ten highest-ranked terms of each topic in the five-topic model.
terms(lda_5_g_sk, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "world" "two" "life" "friend" "find"
## [2,] "back" "man" "famili" "becom" "new"
## [3,] "name" "work" "love" "take" "one"
## [4,] "forc" "tri" "young" "way" "old"
## [5,] "american" "woman" "father" "decid" "come"
## [6,] "power" "death" "stori" "school" "citi"
## [7,] "discov" "call" "meet" "help" "york"
## [8,] "use" "kill" "day" "girl" "town"
## [9,] "war" "job" "home" "team" "soon"
## [10,] "secret" "agent" "mother" "high" "three"
# Ten highest-ranked terms of each topic in the seven-topic model.
terms(lda_7_g_sk, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## [1,] "famili" "one" "new" "world" "work" "take"
## [2,] "find" "day" "life" "forc" "man" "begin"
## [3,] "father" "back" "love" "power" "kill" "town"
## [4,] "old" "two" "stori" "set" "becom" "start"
## [5,] "home" "decid" "live" "american" "call" "job"
## [6,] "meet" "come" "relationship" "one" "death" "plan"
## [7,] "daughter" "live" "citi" "save" "agent" "leav"
## [8,] "mother" "discov" "woman" "war" "secret" "local"
## [9,] "son" "wife" "york" "find" "murder" "men"
## [10,] "hous" "three" "young" "group" "mysteri" "first"
## Topic 7
## [1,] "friend"
## [2,] "school"
## [3,] "way"
## [4,] "help"
## [5,] "team"
## [6,] "soon"
## [7,] "girl"
## [8,] "high"
## [9,] "game"
## [10,] "student"
# Ten highest-ranked terms of each topic in the ten-topic model.
terms(lda_10_g_sk, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7
## [1,] "find" "one" "world" "friend" "man" "way" "life"
## [2,] "tri" "live" "power" "school" "two" "team" "becom"
## [3,] "name" "plan" "forc" "decid" "kill" "help" "stori"
## [4,] "town" "three" "war" "turn" "call" "back" "take"
## [5,] "discov" "know" "american" "boy" "agent" "game" "end"
## [6,] "death" "job" "face" "see" "mysteri" "die" "film"
## [7,] "leav" "money" "human" "old" "investig" "wife" "chang"
## [8,] "local" "anoth" "fight" "high" "drug" "put" "like"
## [9,] "soon" "place" "train" "girl" "murder" "win" "struggl"
## [10,] "car" "stop" "battl" "best" "dead" "men" "togeth"
## Topic 8 Topic 9 Topic 10
## [1,] "new" "love" "famili"
## [2,] "work" "day" "father"
## [3,] "citi" "meet" "son"
## [4,] "york" "young" "home"
## [5,] "relationship" "fall" "daughter"
## [6,] "life" "woman" "mother"
## [7,] "set" "two" "hous"
## [8,] "realiz" "come" "move"
## [9,] "girlfriend" "john" "night"
## [10,] "seem" "peopl" "old"
# Topic mixture of story 1 under the 10-, 5- and 7-topic models.
# These models were fitted after all-zero rows were dropped from the matrix, so
# row 1 of gamma is story 1 only if that story survived the filter. The story
# is therefore looked up by document id instead of by position.
# NOTE(review): assumes document ids are "1", "2", ... as in the recorded
# topics() output for the unpruned model -- confirm.
for (fit in list(lda_10_g_sk, lda_5_g_sk, lda_7_g_sk)) {
  story_row <- match("1", fit@documents)
  if (is.na(story_row)) {
    warning("Story 1 is not part of this fitted model; skipping its plot",
            call. = FALSE)
    next
  }
  barplot(fit@gamma[story_row, ], names.arg = seq_len(fit@k),
          main = "Topic distribution of Story 1")
}