Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report so that others know what they need for the report to compile correctly.
library(gutenbergr)
library(stringr)
library(dplyr)
library(tidyr)
library(tm)
library(topicmodels)
library(tidyverse)
library(tidytext)
library(slam)
You will want to use some books from Project Gutenberg to fit either a topic model or an LSA model. The code to pick the books has been provided for you, so all you need to do is change out the titles.
##pick some titles from project gutenberg
titles = c("Twenty Thousand Leagues under the Sea", "The War of the Worlds",
"Pride and Prejudice", "Great Expectations")
##read in those books
books = gutenberg_works(title %in% titles) %>%
gutenberg_download(meta_fields = "title") %>%
mutate(document = row_number())
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
##split each book into chapters by flagging lines that start with "chapter",
##drop everything before the first chapter heading, and label rows as "title_chapter"
create_chapters = books %>%
  group_by(title) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)
##collapse the line-by-line text into one row of text per chapter
by_chapter = create_chapters %>%
  group_by(document) %>%
  summarise(text = paste(text, collapse = ' '))
#by_chapter
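Before building the corpus, you can quickly confirm that by_chapter has one row per chapter with the full chapter text collapsed into a single string (the exact counts will depend on the books you chose):
nrow(by_chapter)          #number of chapter-level documents
head(by_chapter$document) #document labels of the form "Title_chapter"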
The by_chapter data.frame can be used to create a corpus with VectorSource by using the text column.
Include these steps with a brief explanation of what you are doing. For example, you could write "I am doing LSA; this part creates the corpus and the space.", and then include an r-chunk underneath it that runs the analysis. You should end up with a report that explains the space: you can assume the person reading it knows the terminology, but explain why and what you are doing at each step. Therefore, you do not have to explain what alpha is, only that you are calculating it and what the interpretation of your results might be.
##build the corpus from the chapter text and convert it to a document-term matrix,
##dropping stopwords, numbers, punctuation, and very short words
import_corpus = Corpus(VectorSource(by_chapter$text))
import_mat =
  DocumentTermMatrix(import_corpus,
                     control = list(stemming = FALSE,
                                    stopwords = TRUE,
                                    minWordLength = 4,
                                    removeNumbers = TRUE,
                                    removePunctuation = TRUE))
##weight each term by its mean term frequency times its inverse document frequency,
##then keep terms with a mean tf-idf of at least 0.03 and drop any empty documents
import_weight = tapply(import_mat$v / row_sums(import_mat)[import_mat$i],
                       import_mat$j,
                       mean) *
  log2(nDocs(import_mat) / col_sums(import_mat > 0))
import_mat = import_mat[ , import_weight >= 0.03]
import_mat = import_mat[row_sums(import_mat) > 0, ]
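To see how aggressive that tf-idf cutoff was, you can look at the spread of the weights and the size of the pruned matrix (the exact numbers will vary with the downloaded texts):
summary(import_weight) #distribution of mean tf-idf weights across all terms
dim(import_mat)        #documents x terms remaining after the cutoff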
##fit four models with k = 3 topics: LDA by VEM with alpha estimated, LDA by VEM
##with alpha held fixed, LDA by Gibbs sampling, and a correlated topic model (CTM)
k = 3
SEED = 12345
LDA_fit = LDA(import_mat, k = k,
              control = list(seed = SEED))
LDA_fixed = LDA(import_mat, k = k,
                control = list(estimate.alpha = FALSE, seed = SEED))
LDA_gibbs = LDA(import_mat, k = k, method = "Gibbs",
                control = list(seed = SEED, burnin = 1000,
                               thin = 100, iter = 1000))
CTM_fit = CTM(import_mat, k = k,
              control = list(seed = SEED,
                             var = list(tol = 10^-4),
                             em = list(tol = 10^-3)))
LDA_fit@alpha
## [1] 0.03544873
LDA_fixed@alpha
## [1] 16.66667
LDA_gibbs@alpha
## [1] 16.66667
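The estimated alpha from the default VEM fit is well below 1, while the other two fits keep the package default of 50/k, which suggests each chapter's topic distribution is concentrated on a single topic. One quick way to check this (a small sketch, not part of the required output) is to count how many chapters put at least 90% of their posterior weight on one topic:
##chapters whose posterior weight on their top topic exceeds 0.90
sum(apply(posterior(LDA_fit)$topics, 1, max) > 0.90)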
##mean entropy of the posterior topic distributions for each model:
##lower values mean chapters load more heavily on a single topic
sapply(list(LDA_fit, LDA_fixed, LDA_gibbs, CTM_fit),
       function (x)
         mean(apply(posterior(x)$topics, 1, function(z) - sum(z * log(z)))))
## [1] 0.1355341 1.0509520 1.0417784 0.2350925
topics(LDA_fit, k)
## 1 2 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 22 23 24 25 26 28 29
## [1,] 3 3 3 3 3 3 3 3 3 3 3 2 2 3 2 2 2 2 3 3 3 3 3 2 3 3
## [2,] 2 2 2 2 2 2 2 2 2 2 1 1 1 2 3 1 3 1 2 2 2 2 1 1 2 2
## [3,] 1 1 1 1 1 1 1 1 1 1 2 3 3 1 1 3 1 3 1 1 1 1 2 3 1 1
## 30 31 32 33 34 35 37 38 39 40 41 42 43 45 47 48 49 50 51 52 53 54 55
## [1,] 2 2 2 3 3 2 1 2 2 2 2 2 2 1 2 3 3 3 2 3 3 3 3
## [2,] 1 1 3 2 2 1 2 3 3 1 1 1 3 3 1 2 2 2 1 1 2 1 2
## [3,] 3 3 1 1 1 3 3 1 1 3 3 3 1 2 3 1 1 1 3 2 1 2 1
## 56 57 58 59 69 104 114 124 125 130 132 134 135 136 137 138 139 141
## [1,] 3 3 3 3 1 2 1 1 1 1 2 1 2 1 1 2 3 3
## [2,] 2 2 2 1 3 1 3 3 3 3 1 3 1 2 2 1 1 1
## [3,] 1 1 1 2 2 3 2 2 2 2 3 2 3 3 3 3 2 2
## 142 145 151 154 158 159 166 167 176 182 183 184 186 189 193
## [1,] 1 2 2 1 2 2 2 2 3 1 1 1 1 1 1
## [2,] 2 1 1 2 1 1 1 1 1 2 2 2 3 2 3
## [3,] 3 3 3 3 3 3 3 3 2 3 3 3 2 3 2
terms(LDA_fit,10)
## Topic 1 Topic 2 Topic 3
## [1,] "sergeant" "wemmick" "joe"
## [2,] "cuttlefish" "drummle" "ulla"
## [3,] "wandered" "vanikoro" "galley"
## [4,] "adventures" "perouse" "waldengarver"
## [5,] "arthur" "pearl" "rul"
## [6,] "experiences" "cellar" "twoandthirty"
## [7,] "roehampton" "jaggerth" "venus"
## [8,] "hopped" "turnkey" "sargasso"
## [9,] "hilton" "coiler" "danish"
## [10,] "ecclesiastes" "sausage" "experiences"
##pull the per-topic word probabilities (beta) and keep the ten highest-beta terms per topic
LDA_fit_topics = tidy(LDA_fit, matrix = "beta")
top_terms = LDA_fit_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
##minimal ggplot2 theme reused for all plots
cleanup = theme(panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.background = element_blank(),
                axis.line.x = element_line(color = "black"),
                axis.line.y = element_line(color = "black"),
                legend.key = element_rect(fill = "white"),
                text = element_text(size = 10))
##plot the top terms within each topic
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  cleanup +
  coord_flip()
##pull the per-document topic probabilities (gamma) and plot them by topic
LDA_gamma = tidy(LDA_fit, matrix = "gamma")
LDA_gamma %>%
  ggplot(aes(factor(topic), gamma)) +
  geom_point() +
  cleanup
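To go with the gamma plot, you could also pull out the single most likely topic for each chapter; this is a short sketch using dplyr's top_n on the gamma table created above:
##most likely topic for each chapter document
LDA_gamma %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup() %>%
  arrange(document)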