Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report so that others know what they need for the report to compile correctly.
library(gutenbergr)
library(stringr)
library(dplyr)
library(tidyr)
library(tm)
library(topicmodels)
library(tidyverse)
library(tidytext)
library(slam)
You will want to use some books from Project Gutenberg to fit either a topic model or an LSA model. The code to pick the books has been provided for you, so all you need to do is change out the titles.
##pick some titles from project gutenberg
titles = c("Twenty Thousand Leagues under the Sea", "The War of the Worlds",
"Pride and Prejudice", "Great Expectations")
##read in those books
books = gutenberg_works(title %in% titles) %>%
gutenberg_download(meta_fields = "title") %>%
mutate(document = row_number())
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
##split each book into chapters by flagging lines that start with "chapter",
##drop everything before the first chapter heading, and label rows as "title_chapter"
create_chapters = books %>%
  group_by(title) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)
##collapse the line-by-line text into one row of text per chapter
by_chapter = create_chapters %>%
  group_by(document) %>%
  summarise(text = paste(text, collapse = ' '))
#by_chapter
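Before building the corpus, you can quickly confirm that by_chapter has one row per chapter with the full chapter text collapsed into a single string (the exact counts will depend on the books you chose):
nrow(by_chapter)          #number of chapter-level documents
head(by_chapter$document) #document labels of the form "Title_chapter"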
The by_chapter data.frame can be used to create a corpus with VectorSource by using the text column.
Include these steps with a brief explanation of what you are doing. For example, you could write "I am doing LSA; this part creates the corpus and the space.", and then include an r-chunk underneath it that runs the analysis. You should end up with a report that explains the space: you can assume the person reading it knows the terminology, but explain why and what you are doing at each step. Therefore, you do not have to explain what alpha is, only that you are calculating it and what the interpretation of your results might be.
##build the corpus from the chapter text and convert it to a document-term matrix,
##dropping stopwords, numbers, punctuation, and very short words
import_corpus = Corpus(VectorSource(by_chapter$text))
import_mat =
  DocumentTermMatrix(import_corpus,
                     control = list(stemming = FALSE,
                                    stopwords = TRUE,
                                    minWordLength = 4,
                                    removeNumbers = TRUE,
                                    removePunctuation = TRUE))
##weight each term by its mean term frequency times its inverse document frequency,
##then keep terms with a mean tf-idf of at least 0.03 and drop any empty documents
import_weight = tapply(import_mat$v / row_sums(import_mat)[import_mat$i],
                       import_mat$j,
                       mean) *
  log2(nDocs(import_mat) / col_sums(import_mat > 0))
import_mat = import_mat[ , import_weight >= 0.03]
import_mat = import_mat[row_sums(import_mat) > 0, ]
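To see how aggressive that tf-idf cutoff was, you can look at the spread of the weights and the size of the pruned matrix (the exact numbers will vary with the downloaded texts):
summary(import_weight) #distribution of mean tf-idf weights across all terms
dim(import_mat)        #documents x terms remaining after the cutoff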
##fit four models with k = 3 topics: LDA by VEM with alpha estimated, LDA by VEM
##with alpha held fixed, LDA by Gibbs sampling, and a correlated topic model (CTM)
k = 3
SEED = 12345
LDA_fit = LDA(import_mat, k = k,
              control = list(seed = SEED))
LDA_fixed = LDA(import_mat, k = k,
                control = list(estimate.alpha = FALSE, seed = SEED))
LDA_gibbs = LDA(import_mat, k = k, method = "Gibbs",
                control = list(seed = SEED, burnin = 1000,
                               thin = 100, iter = 1000))
CTM_fit = CTM(import_mat, k = k,
              control = list(seed = SEED,
                             var = list(tol = 10^-4),
                             em = list(tol = 10^-3)))
LDA_fit@alpha
## [1] 0.03544873
LDA_fixed@alpha
## [1] 16.66667
LDA_gibbs@alpha
## [1] 16.66667
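The estimated alpha from the default VEM fit is well below 1, while the other two fits keep the package default of 50/k, which suggests each chapter's topic distribution is concentrated on a single topic. One quick way to check this (a small sketch, not part of the required output) is to count how many chapters put at least 90% of their posterior weight on one topic:
##chapters whose posterior weight on their top topic exceeds 0.90
sum(apply(posterior(LDA_fit)$topics, 1, max) > 0.90)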
##mean entropy of the posterior topic distributions for each model:
##lower values mean chapters load more heavily on a single topic
sapply(list(LDA_fit, LDA_fixed, LDA_gibbs, CTM_fit),
       function (x)
         mean(apply(posterior(x)$topics, 1, function(z) - sum(z * log(z)))))
## [1] 0.1355341 1.0509520 1.0417784 0.2350925
topics(LDA_fit, k)
## 1 2 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 22 23 24 25 26 28 29
## [1,] 3 3 3 3 3 3 3 3 3 3 3 2 2 3 2 2 2 2 3 3 3 3 3 2 3 3
## [2,] 2 2 2 2 2 2 2 2 2 2 1 1 1 2 3 1 3 1 2 2 2 2 1 1 2 2
## [3,] 1 1 1 1 1 1 1 1 1 1 2 3 3 1 1 3 1 3 1 1 1 1 2 3 1 1
## 30 31 32 33 34 35 37 38 39 40 41 42 43 45 47 48 49 50 51 52 53 54 55
## [1,] 2 2 2 3 3 2 1 2 2 2 2 2 2 1 2 3 3 3 2 3 3 3 3
## [2,] 1 1 3 2 2 1 2 3 3 1 1 1 3 3 1 2 2 2 1 1 2 1 2
## [3,] 3 3 1 1 1 3 3 1 1 3 3 3 1 2 3 1 1 1 3 2 1 2 1
## 56 57 58 59 69 104 114 124 125 130 132 134 135 136 137 138 139 141
## [1,] 3 3 3 3 1 2 1 1 1 1 2 1 2 1 1 2 3 3
## [2,] 2 2 2 1 3 1 3 3 3 3 1 3 1 2 2 1 1 1
## [3,] 1 1 1 2 2 3 2 2 2 2 3 2 3 3 3 3 2 2
## 142 145 151 154 158 159 166 167 176 182 183 184 186 189 193
## [1,] 1 2 2 1 2 2 2 2 3 1 1 1 1 1 1
## [2,] 2 1 1 2 1 1 1 1 1 2 2 2 3 2 3
## [3,] 3 3 3 3 3 3 3 3 2 3 3 3 2 3 2
terms(LDA_fit,10)
## Topic 1 Topic 2 Topic 3
## [1,] "sergeant" "wemmick" "joe"
## [2,] "cuttlefish" "drummle" "ulla"
## [3,] "wandered" "vanikoro" "galley"
## [4,] "adventures" "perouse" "waldengarver"
## [5,] "arthur" "pearl" "rul"
## [6,] "experiences" "cellar" "twoandthirty"
## [7,] "roehampton" "jaggerth" "venus"
## [8,] "hopped" "turnkey" "sargasso"
## [9,] "hilton" "coiler" "danish"
## [10,] "ecclesiastes" "sausage" "experiences"
##pull the per-topic word probabilities (beta) and keep the ten highest-beta terms per topic
LDA_fit_topics = tidy(LDA_fit, matrix = "beta")
top_terms = LDA_fit_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
##minimal ggplot2 theme reused for all plots
cleanup = theme(panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.background = element_blank(),
                axis.line.x = element_line(color = "black"),
                axis.line.y = element_line(color = "black"),
                legend.key = element_rect(fill = "white"),
                text = element_text(size = 10))
##plot the top terms within each topic
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  cleanup +
  coord_flip()
##pull the per-document topic probabilities (gamma) and plot them by topic
LDA_gamma = tidy(LDA_fit, matrix = "gamma")
LDA_gamma %>%
  ggplot(aes(factor(topic), gamma)) +
  geom_point() +
  cleanup
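To go with the gamma plot, you could also pull out the single most likely topic for each chapter; this is a short sketch using dplyr's top_n on the gamma table created above:
##most likely topic for each chapter document
LDA_gamma %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup() %>%
  arrange(document)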