rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 362872 19.4 592000 31.7 460000 24.6
## Vcells 555239 4.3 1023718 7.9 845439 6.5
library(tm)
## Loading required package: NLP
library(FactoMineR)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Matrix)
d <- data.table(raw_text = c("Can't say for sure what they used but there is an R library that includes the text of all Jane Austen's works if you ever wanted to try some yourself.",
"They probably got the idea from http://tidytextmining.com/index.html, which uses Jane Austen's work extensively as an ongoing example when showcasing the tidytext library.
The specific library of Jane Austen's work is library(janeaustenr).
I'm guessing they used tidytext and janeaustenr in R, then used a slightly different analysis (using the results from R) in Tableau or another data-viz heavy program to make it palatable for readers.",
"They took a page from the book of Julia Silge (pun intended -- sorry), who has already done a lot of really cool analysis like this.",
"They claimed in the text that they used PCA. But what they don't say is how they created the matrix that was fed into the PCA.
My guess would be just a document-term matrix. You create a list of the unique terms across all of the books, then you count the number of times that each word appears in each book. Take this matrix, feed it into the PCA, take the top two components. Those two components form the two dimensions in the plot shown in the article. Analyze the loadings of the terms on the components to describe them, as they've done.",
"I wonder if they used Word2Vec on the texts then used PCA on that.",
"What I don't understand is how they could attach labels to the features if they used PCA. Doesn't PCA turn features into something not understandable by humans?",
"Not only a fun read..but also a pretty infographic."))
### I don't want to use tm's corpus junk, so I'm wrapping some of its methods so I can use them with dplyr and data.table
stemCharacter <- Vectorize(function(z) {
  z <- PlainTextDocument(z)
  z <- stemDocument(z)
  return(as.character(z))
})
# function to make a list of tokens
makeTokens <- function(z) {
  z <- PlainTextDocument(z)
  tk <- as.character(scan_tokenizer(z))
  return(tk)
}
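# quick sanity check of the two wrappers on a toy sentence (illustrative only;
# the exact stems depend on the Snowball stemmer that tm calls under the hood)
stemCharacter("they probably used several libraries for the analysis")
makeTokens("they probably used several libraries for the analysis")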
# clean the text
d <- d %>%
  mutate(clean_text = tolower(raw_text)) %>%
  mutate(clean_text = gsub("[^a-z'-]", ' ', clean_text)) %>%
  mutate(text_stemmed = stemCharacter(clean_text)) %>%
  mutate(id = 1:nrow(.))
# create a key-value table of tokens
token_table <- rbindlist(lapply(1:nrow(d), function(i) {
  data.table(id = d$id[i], terms = makeTokens(d$text_stemmed[i]) %>% as.character)
}))
# aggregate the frequencies of the terms
term_freq_table <- token_table %>%
  filter(!(terms %in% stopwords('en'))) %>% # remove stop words
  group_by(id, terms) %>%
  summarize(freq = n()) %>%
  collect()
# put it all in a sparse matrix
u_terms <- sort(unique(term_freq_table$terms))
ids <- sort(unique(term_freq_table$id))
row.ix <- match(term_freq_table$id, ids)
col.ix <- match(term_freq_table$terms, u_terms)
m <- sparseMatrix(i = row.ix, j = col.ix, x = term_freq_table$freq)
rownames(m) <- ids
colnames(m) <- u_terms
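# sanity check (illustrative): one row per comment, one column per unique stemmed term;
# the row sums should match the number of non-stop-word tokens in each comment
dim(m)
rowSums(m)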
# reduce the data: some documents and terms carry no relevance.
# A row sum of at least 1 means a document stays if it still has at least one term;
# a term has to occur at least twice (across all documents) to be kept.
reduce.tdm <- function(m, rs = 1, cs = 2) {
  repeat {
    i <- sum(dim(m))
    cat("Min row sum\t", min(rowSums(m)), "\tMin col sum\t", min(colSums(m)), "\n")
    # drop = FALSE keeps the result a matrix even if only one row or column survives
    m <- m[rowSums(m) >= rs, colSums(m) >= cs, drop = FALSE]
    if (sum(dim(m)) == i) break
  }
  return(m)
}
m <- reduce.tdm(m)
## Min row sum 6 Min col sum 1
## Min row sum 0 Min col sum 2
## Min row sum 3 Min col sum 2
# here are the terms that survive the reduction
colnames(m)
## [1] "analysi" "austen" "book" "compon" "creat"
## [6] "done" "featur" "guess" "jane" "janeaustenr"
## [11] "librari" "matrix" "pca" "r" "say"
## [16] "take" "term" "text" "tidytext" "two"
## [21] "understand" "use" "word" "work"
### PCA #############
m.pca <- PCA(as.matrix(m))
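# note: PCA() centers and scales each term column by default (scale.unit = TRUE) and
# draws the individuals and variables maps as a side effect; graph = FALSE would
# suppress the plots if you only want the numerical results
# m.pca <- PCA(as.matrix(m), scale.unit = TRUE, graph = FALSE)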


# you can see that the first dimension captures 48% of the variance and the second 28%.
# In two dimensions we can explain 76% of the total variance.
summary(m.pca)
##
## Call:
## PCA(X = as.matrix(m))
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Variance 11.419 6.833 2.365 2.214 1.170
## % of var. 47.578 28.471 9.854 9.224 4.873
## Cumulative % of var. 47.578 76.049 85.903 95.127 100.000
##
## Individuals
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2
## 1 | 3.185 | -1.230 2.209 0.149 | -0.281 0.193 0.008 |
## 2 | 6.566 | -5.279 40.674 0.646 | 3.781 34.867 0.332 |
## 3 | 3.630 | 0.038 0.002 0.000 | -1.766 7.603 0.237 |
## 4 | 7.028 | 6.247 56.954 0.790 | 3.187 24.769 0.206 |
## 5 | 3.109 | 0.318 0.148 0.010 | -1.671 6.809 0.289 |
## 6 | 4.303 | -0.094 0.013 0.000 | -3.250 25.760 0.570 |
## Dim.3 ctr cos2
## 1 -1.978 27.579 0.386 |
## 2 0.863 5.254 0.017 |
## 3 -0.701 3.467 0.037 |
## 4 0.451 1.434 0.004 |
## 5 -1.305 12.010 0.176 |
## 6 2.670 50.256 0.385 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## analysi | -0.548 2.633 0.301 | 0.273 1.087 0.074 | 0.037 0.059
## austen | -0.761 5.075 0.579 | 0.608 5.406 0.369 | -0.036 0.054
## book | 0.809 5.735 0.655 | 0.385 2.165 0.148 | 0.029 0.034
## compon | 0.827 5.985 0.683 | 0.545 4.350 0.297 | 0.131 0.728
## creat | 0.827 5.985 0.683 | 0.545 4.350 0.297 | 0.131 0.728
## done | 0.658 3.787 0.432 | 0.192 0.541 0.037 | -0.058 0.140
## featur | -0.012 0.001 0.000 | -0.556 4.524 0.309 | 0.777 25.501
## guess | 0.101 0.090 0.010 | 0.942 12.997 0.888 | 0.302 3.862
## jane | -0.761 5.075 0.579 | 0.608 5.406 0.369 | -0.036 0.054
## janeaustenr | -0.699 4.274 0.488 | 0.647 6.123 0.418 | 0.251 2.666
## cos2
## analysi 0.001 |
## austen 0.001 |
## book 0.001 |
## compon 0.017 |
## creat 0.017 |
## done 0.003 |
## featur 0.603 |
## guess 0.091 |
## jane 0.001 |
## janeaustenr 0.063 |
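# the variance breakdown quoted above is also stored directly in the $eig slot of the
# PCA result, if you only want the eigenvalue table
round(m.pca$eig, 2)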
# plot of the variables
# just from this I can see that
# Dim 1 is high when people talk a lot about the text, PCA, terms, and the matrix,
# and say "use", "jane austen", and "analysis" very little
# Dim 2 is very positive for "guess", and negative for "understand" and "feature"
plot(m.pca, choix = 'var')
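# the same picture in numbers: variable coordinates on the first two dimensions,
# sorted by the loading on Dim 1 (a quick supplement to the variables plot)
var_coord <- as.data.frame(m.pca$var$coord[, 1:2])
var_coord[order(var_coord$Dim.1), ]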

# we can use dimdesc to see with more accuracy which terms weigh heavily, and in which
# direction, for each dimension
dimdesc(m.pca, axes = 1:2, proba = 0.1)
## $Dim.1
## $Dim.1$quanti
## correlation p.value
## term 0.8267107 0.04244191
## take 0.8267107 0.04244191
## creat 0.8267107 0.04244191
## two 0.8267107 0.04244191
## matrix 0.8267107 0.04244191
## compon 0.8267107 0.04244191
## book 0.8092454 0.05111044
## pca 0.8060206 0.05279250
## work -0.7612349 0.07870732
## r -0.7612349 0.07870732
## jane -0.7612349 0.07870732
## austen -0.7612349 0.07870732
## librari -0.7614083 0.07859799
##
##
## $Dim.2
## $Dim.2$quanti
## correlation p.value
## guess 0.9423781 0.00488476