Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.
library(gutenbergr)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(tm)
## Loading required package: NLP
library(topicmodels)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v readr 1.3.1
## v tibble 2.1.3 v purrr 0.3.2
## v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(slam)
You will want to use some books from Project Gutenberg to fit either a topic model or an LSA model. The code to pick the books has been provided for you, so all you need to do is swap out the titles.
##pick some titles from project gutenberg
titles = c("Twenty Thousand Leagues under the Sea", "The War of the Worlds",
"Pride and Prejudice", "Great Expectations")
##read in those books
books = gutenberg_works(title %in% titles) %>%
gutenberg_download(meta_fields = "title") %>%
mutate(document = row_number())
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#split each book into chapters based on lines that start with "chapter"
create_chapters = books %>%
group_by(title) %>%
mutate(chapter = cumsum(str_detect(text, regex("^chapter", ignore_case = TRUE)))) %>%
ungroup() %>%
filter(chapter > 0) %>%
unite(document, title, chapter)
#collapse each chapter into one row of text
by_chapter = create_chapters %>%
group_by(document) %>%
summarise(text=paste(text,collapse=' '))
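Before building the corpus, it is worth a quick sanity check that the chapter split worked; each row of by_chapter should be one chapter with a document name like title_chapternumber. The check below is optional and not part of the modeling itself.
#optional sanity check: one row per chapter, named title_chapter
nrow(by_chapter)
head(by_chapter$document)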
The by_chapter data.frame can then be used to create a corpus with VectorSource() on the text column.
import_corpus = Corpus(VectorSource(by_chapter$text))
import_mat =
DocumentTermMatrix(import_corpus,
control = list(stemming = FALSE, #do not stem words to their roots
stopwords = TRUE, #remove stop words
minWordLength = 4, #cut out small words
removeNumbers = TRUE, #take out the numbers
removePunctuation = TRUE)) #take out the punctuation
#calculate a tf-idf weight for each term
import_weight = tapply(import_mat$v/row_sums(import_mat)[import_mat$i],
import_mat$j,
mean) *
log2(nDocs(import_mat)/col_sums(import_mat > 0))
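Before applying a cutoff, you can glance at the distribution of these weights; the summary below is only a diagnostic, and the 0.03 cutoff used next is the value given in the assignment.
#optional: look at the spread of the tf-idf weights before picking a cutoff
summary(import_weight)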
#drop very frequent, low tf-idf terms, then drop any documents left empty
import_mat = import_mat[ , import_weight >= 0.03]
import_mat = import_mat[row_sums(import_mat) > 0, ]
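A quick dimension check confirms how many chapters (rows) and terms (columns) are left after the filtering; this is just for inspection.
#optional: how many documents and terms remain after filtering
dim(import_mat)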
#set the number of topics
k = 3
#set a random number for seed
SEED = 12345
LDA_fit = LDA(import_mat, k = k,
control = list(seed = SEED))
LDA_fixed = LDA(import_mat, k = k,
control = list(estimate.alpha = FALSE, seed = SEED))
LDA_gibbs = LDA(import_mat, k = k, method = "Gibbs",
control = list(seed = SEED, burnin = 1000,
thin = 100, iter = 1000))
CTM_fit = CTM(import_mat, k = k,
control = list(seed = SEED,
var = list(tol = 10^-4),
em = list(tol = 10^-3)))
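If you want another way to compare the four fits beyond the alpha values and entropies below, the logLik method that topicmodels provides for these objects gives each model's log-likelihood; this extra check is optional.
#optional: compare the fitted models by log-likelihood
sapply(list(LDA_fit, LDA_fixed, LDA_gibbs, CTM_fit), logLik)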
LDA_fit@alpha
## [1] 0.03544873
LDA_fixed@alpha
## [1] 16.66667
LDA_gibbs@alpha
## [1] 16.66667
#average entropy of each model's per-document topic distribution (higher = documents mixed over more topics)
sapply(list(LDA_fit, LDA_fixed, LDA_gibbs, CTM_fit),
function (x)
mean(apply(posterior(x)$topics, 1, function(z) - sum(z * log(z)))))
## [1] 0.1355341 1.0509520 1.0417784 0.2350925
#Deep dive into the LDA fit model
#Topic Matrix
topics(LDA_fit, k)
## 1 2 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 22 23 24 25 26 28 29
## [1,] 3 3 3 3 3 3 3 3 3 3 3 2 2 3 2 2 2 2 3 3 3 3 3 2 3 3
## [2,] 2 2 2 2 2 2 2 2 2 2 1 1 1 2 3 1 3 1 2 2 2 2 1 1 2 2
## [3,] 1 1 1 1 1 1 1 1 1 1 2 3 3 1 1 3 1 3 1 1 1 1 2 3 1 1
## 30 31 32 33 34 35 37 38 39 40 41 42 43 45 47 48 49 50 51 52 53 54 55
## [1,] 2 2 2 3 3 2 1 2 2 2 2 2 2 1 2 3 3 3 2 3 3 3 3
## [2,] 1 1 3 2 2 1 2 3 3 1 1 1 3 3 1 2 2 2 1 1 2 1 2
## [3,] 3 3 1 1 1 3 3 1 1 3 3 3 1 2 3 1 1 1 3 2 1 2 1
## 56 57 58 59 69 104 114 124 125 130 132 134 135 136 137 138 139 141
## [1,] 3 3 3 3 1 2 1 1 1 1 2 1 2 1 1 2 3 3
## [2,] 2 2 2 1 3 1 3 3 3 3 1 3 1 2 2 1 1 1
## [3,] 1 1 1 2 2 3 2 2 2 2 3 2 3 3 3 3 2 2
## 142 145 151 154 158 159 166 167 176 182 183 184 186 189 193
## [1,] 1 2 2 1 2 2 2 2 3 1 1 1 1 1 1
## [2,] 2 1 1 2 1 1 1 1 1 2 2 2 3 2 3
## [3,] 3 3 3 3 3 3 3 3 2 3 3 3 2 3 2
#Most Frequent Terms
terms(LDA_fit,10)
## Topic 1 Topic 2 Topic 3
## [1,] "sergeant" "wemmick" "joe"
## [2,] "cuttlefish" "drummle" "ulla"
## [3,] "wandered" "vanikoro" "galley"
## [4,] "adventures" "perouse" "waldengarver"
## [5,] "arthur" "pearl" "rul"
## [6,] "experiences" "cellar" "twoandthirty"
## [7,] "roehampton" "jaggerth" "venus"
## [8,] "hopped" "turnkey" "sargasso"
## [9,] "hilton" "coiler" "danish"
## [10,] "ecclesiastes" "sausage" "experiences"
#Beta Values
#use tidytext to tidy the fit into a term-topic (beta) table
LDA_fit_topics = tidy(LDA_fit, matrix = "beta")
#create top terms
top_terms = LDA_fit_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
#clean up ggplot2 defaults
cleanup = theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line.x = element_line(color = "black"),
axis.line.y = element_line(color = "black"),
legend.key = element_rect(fill = "white"),
text = element_text(size = 10))
#make the plot
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_bar(stat = "identity", show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
cleanup +
coord_flip()
#Gamma Values
LDA_gamma = tidy(LDA_fit, matrix = "gamma")
LDA_gamma %>%
ggplot(aes(factor(topic), gamma)) +
geom_point() +
cleanup
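If you also want each chapter's single most likely topic from the gamma values, a short sketch like the one below will pull it out; this step is optional and uses the same dplyr verbs as above.
#optional: the highest-gamma topic for each chapter
LDA_gamma %>%
group_by(document) %>%
top_n(1, gamma) %>%
ungroup() %>%
arrange(document)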