Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.
library(gutenbergr)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(tm)
## Loading required package: NLP
library(topicmodels)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v readr 1.3.1
## v tibble 2.1.3 v purrr 0.3.2
## v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(slam)
You will want to use some books from Project Gutenberg to fit either a topic model or an LSA model. The code to pick the books has been provided for you, so all you need to do is swap out the titles.
##pick some titles from project gutenberg
titles = c("Twenty Thousand Leagues under the Sea", "The War of the Worlds",
"Pride and Prejudice", "Great Expectations")
##read in those books
books = gutenberg_works(title %in% titles) %>%
gutenberg_download(meta_fields = "title") %>%
mutate(document = row_number())
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#split each book into chapters based on lines that start with "chapter"
create_chapters = books %>%
group_by(title) %>%
mutate(chapter = cumsum(str_detect(text, regex("^chapter", ignore_case = TRUE)))) %>%
ungroup() %>%
filter(chapter > 0) %>%
unite(document, title, chapter)
#collapse each chapter into one row of text
by_chapter = create_chapters %>%
group_by(document) %>%
summarise(text=paste(text,collapse=' '))
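Before building the corpus, it is worth a quick sanity check that the chapter split worked; each row of by_chapter should be one chapter with a document name like title_chapternumber. The check below is optional and not part of the modeling itself.
#optional sanity check: one row per chapter, named title_chapter
nrow(by_chapter)
head(by_chapter$document)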
The by_chapter data.frame can then be used to create a corpus with VectorSource() on the text column.
import_corpus = Corpus(VectorSource(by_chapter$text))
import_mat =
DocumentTermMatrix(import_corpus,
control = list(stemming = FALSE, #do not stem words to their roots
stopwords = TRUE, #remove stop words
minWordLength = 4, #cut out small words
removeNumbers = TRUE, #take out the numbers
removePunctuation = TRUE)) #take out the punctuation
#calculate a tf-idf weight for each term
import_weight = tapply(import_mat$v/row_sums(import_mat)[import_mat$i],
import_mat$j,
mean) *
log2(nDocs(import_mat)/col_sums(import_mat > 0))
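Before applying a cutoff, you can glance at the distribution of these weights; the summary below is only a diagnostic, and the 0.03 cutoff used next is the value given in the assignment.
#optional: look at the spread of the tf-idf weights before picking a cutoff
summary(import_weight)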
#drop very frequent, low tf-idf terms, then drop any documents left empty
import_mat = import_mat[ , import_weight >= 0.03]
import_mat = import_mat[row_sums(import_mat) > 0, ]
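A quick dimension check confirms how many chapters (rows) and terms (columns) are left after the filtering; this is just for inspection.
#optional: how many documents and terms remain after filtering
dim(import_mat)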
#set the number of topics
k = 3
#set a random number for seed
SEED = 12345
LDA_fit = LDA(import_mat, k = k,
control = list(seed = SEED))
LDA_fixed = LDA(import_mat, k = k,
control = list(estimate.alpha = FALSE, seed = SEED))
LDA_gibbs = LDA(import_mat, k = k, method = "Gibbs",
control = list(seed = SEED, burnin = 1000,
thin = 100, iter = 1000))
CTM_fit = CTM(import_mat, k = k,
control = list(seed = SEED,
var = list(tol = 10^-4),
em = list(tol = 10^-3)))
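If you want another way to compare the four fits beyond the alpha values and entropies below, the logLik method that topicmodels provides for these objects gives each model's log-likelihood; this extra check is optional.
#optional: compare the fitted models by log-likelihood
sapply(list(LDA_fit, LDA_fixed, LDA_gibbs, CTM_fit), logLik)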
LDA_fit@alpha
## [1] 0.03544873
LDA_fixed@alpha
## [1] 16.66667
LDA_gibbs@alpha
## [1] 16.66667
#average entropy of each model's per-document topic distribution (higher = documents mixed over more topics)
sapply(list(LDA_fit, LDA_fixed, LDA_gibbs, CTM_fit),
function (x)
mean(apply(posterior(x)$topics, 1, function(z) - sum(z * log(z)))))
## [1] 0.1355341 1.0509520 1.0417784 0.2350925
#Deep dive into the LDA fit model
#Topic Matrix
topics(LDA_fit, k)
## 1 2 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 22 23 24 25 26 28 29
## [1,] 3 3 3 3 3 3 3 3 3 3 3 2 2 3 2 2 2 2 3 3 3 3 3 2 3 3
## [2,] 2 2 2 2 2 2 2 2 2 2 1 1 1 2 3 1 3 1 2 2 2 2 1 1 2 2
## [3,] 1 1 1 1 1 1 1 1 1 1 2 3 3 1 1 3 1 3 1 1 1 1 2 3 1 1
## 30 31 32 33 34 35 37 38 39 40 41 42 43 45 47 48 49 50 51 52 53 54 55
## [1,] 2 2 2 3 3 2 1 2 2 2 2 2 2 1 2 3 3 3 2 3 3 3 3
## [2,] 1 1 3 2 2 1 2 3 3 1 1 1 3 3 1 2 2 2 1 1 2 1 2
## [3,] 3 3 1 1 1 3 3 1 1 3 3 3 1 2 3 1 1 1 3 2 1 2 1
## 56 57 58 59 69 104 114 124 125 130 132 134 135 136 137 138 139 141
## [1,] 3 3 3 3 1 2 1 1 1 1 2 1 2 1 1 2 3 3
## [2,] 2 2 2 1 3 1 3 3 3 3 1 3 1 2 2 1 1 1
## [3,] 1 1 1 2 2 3 2 2 2 2 3 2 3 3 3 3 2 2
## 142 145 151 154 158 159 166 167 176 182 183 184 186 189 193
## [1,] 1 2 2 1 2 2 2 2 3 1 1 1 1 1 1
## [2,] 2 1 1 2 1 1 1 1 1 2 2 2 3 2 3
## [3,] 3 3 3 3 3 3 3 3 2 3 3 3 2 3 2
#Most Frequent Terms
terms(LDA_fit,10)
## Topic 1 Topic 2 Topic 3
## [1,] "sergeant" "wemmick" "joe"
## [2,] "cuttlefish" "drummle" "ulla"
## [3,] "wandered" "vanikoro" "galley"
## [4,] "adventures" "perouse" "waldengarver"
## [5,] "arthur" "pearl" "rul"
## [6,] "experiences" "cellar" "twoandthirty"
## [7,] "roehampton" "jaggerth" "venus"
## [8,] "hopped" "turnkey" "sargasso"
## [9,] "hilton" "coiler" "danish"
## [10,] "ecclesiastes" "sausage" "experiences"
#Beta Values
#use tidytext to tidy the fit into a term-topic (beta) table
LDA_fit_topics = tidy(LDA_fit, matrix = "beta")
#create top terms
top_terms = LDA_fit_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
#clean up ggplot2 defaults
cleanup = theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line.x = element_line(color = "black"),
axis.line.y = element_line(color = "black"),
legend.key = element_rect(fill = "white"),
text = element_text(size = 10))
#make the plot
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_bar(stat = "identity", show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
cleanup +
coord_flip()
#Gamma Values
LDA_gamma = tidy(LDA_fit, matrix = "gamma")
LDA_gamma %>%
ggplot(aes(factor(topic), gamma)) +
geom_point() +
cleanup
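If you also want each chapter's single most likely topic from the gamma values, a short sketch like the one below will pull it out; this step is optional and uses the same dplyr verbs as above.
#optional: the highest-gamma topic for each chapter
LDA_gamma %>%
group_by(document) %>%
top_n(1, gamma) %>%
ungroup() %>%
arrange(document)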