library(tidyverse)
library(tidytext)
library(irlba)
library(quanteda)
library(plotly)
data(stop_words)
papers <- read_csv("full_biorxiv_data.csv") %>%
select(titles, abstracts)
word_counts <- papers %>%
unnest_tokens(word, abstracts) %>%
count(titles, word, sort = TRUE) %>%
ungroup()
word_freqs <- word_counts %>%
anti_join(stop_words) %>%
bind_tf_idf(word, titles, n)
Joining, by = "word"
term_mat <- word_freqs %>%
cast_dfm(titles, word, tf) %>%
as.matrix()
# term_pca <- prcomp(term_mat,center = TRUE, scale. = TRUE)
term_pca <- term_mat %*% irlba(term_mat, nv=5, nu=0, center=colMeans(term_mat), right_only=TRUE)$v
term_pca_df <- as_data_frame(term_pca) %>%
rename_(.dots = setNames(names(.), paste0("PC", 1:5))) %>%
mutate(title = rownames(term_pca))
ggplot(term_pca_df, aes(x = PC1, y = PC2)) +
geom_point(alpha = 0.2)

packageVersion('plotly')
[1] ‘4.5.6’
plot_ly(term_pca_df, x = ~PC1, y = ~PC2, z = ~PC3, opacity = 0.2,
text = ~paste('Title:', title)) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')))
LS0tCnRpdGxlOiAicGFwciBQQ0EgTm90ZWJvb2siCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyLCBtZXNzYWdlPSBGQUxTRSwgd2FybmluZyA9IEZBTFNFfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeShpcmxiYSkKbGlicmFyeShxdWFudGVkYSkKbGlicmFyeShwbG90bHkpCmBgYAoKYGBge3J9CmRhdGEoc3RvcF93b3JkcykKCnBhcGVycyA8LSByZWFkX2NzdigiZnVsbF9iaW9yeGl2X2RhdGEuY3N2IikgJT4lIAogIHNlbGVjdCh0aXRsZXMsIGFic3RyYWN0cykKCndvcmRfY291bnRzIDwtIHBhcGVycyAlPiUgCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCBhYnN0cmFjdHMpICU+JSAKICBjb3VudCh0aXRsZXMsIHdvcmQsIHNvcnQgPSBUUlVFKSAlPiUgCiAgdW5ncm91cCgpCgp3b3JkX2ZyZXFzIDwtIHdvcmRfY291bnRzICU+JSAKICBhbnRpX2pvaW4oc3RvcF93b3JkcykgJT4lIAogIGJpbmRfdGZfaWRmKHdvcmQsIHRpdGxlcywgbikgCgp0ZXJtX21hdCA8LSB3b3JkX2ZyZXFzICU+JSAKICBjYXN0X2RmbSh0aXRsZXMsIHdvcmQsIHRmKSAlPiUgCiAgYXMubWF0cml4KCkKCiMgdGVybV9wY2EgPC0gcHJjb21wKHRlcm1fbWF0LGNlbnRlciA9IFRSVUUsIHNjYWxlLiA9IFRSVUUpIAoKdGVybV9wY2EgPC0gdGVybV9tYXQgJSolIGlybGJhKHRlcm1fbWF0LCBudj01LCBudT0wLCBjZW50ZXI9Y29sTWVhbnModGVybV9tYXQpLCByaWdodF9vbmx5PVRSVUUpJHYKCnRlcm1fcGNhX2RmIDwtIGFzX2RhdGFfZnJhbWUodGVybV9wY2EpICU+JSAKICByZW5hbWVfKC5kb3RzID0gc2V0TmFtZXMobmFtZXMoLiksIHBhc3RlMCgiUEMiLCAxOjUpKSkgJT4lIAogIG11dGF0ZSh0aXRsZSA9IHJvd25hbWVzKHRlcm1fcGNhKSkKCmdncGxvdCh0ZXJtX3BjYV9kZiwgYWVzKHggPSBQQzEsIHkgPSBQQzIpKSArIAogIGdlb21fcG9pbnQoYWxwaGEgPSAwLjIpCgpgYGAKCmBgYHtyfQpwYWNrYWdlVmVyc2lvbigncGxvdGx5JykKCnBsb3RfbHkodGVybV9wY2FfZGYsIHggPSB+UEMxLCB5ID0gflBDMiwgeiA9IH5QQzMsIG9wYWNpdHkgPSAwLjIsCiAgICAgICAgdGV4dCA9IH5wYXN0ZSgnVGl0bGU6JywgdGl0bGUpKSAlPiUKICBhZGRfbWFya2VycygpICU+JQogIGxheW91dChzY2VuZSA9IGxpc3QoeGF4aXMgPSBsaXN0KHRpdGxlID0gJ1BDMScpLAogICAgICAgICAgICAgICAgICAgICAgeWF4aXMgPSBsaXN0KHRpdGxlID0gJ1BDMicpLAogICAgICAgICAgICAgICAgICAgICAgemF4aXMgPSBsaXN0KHRpdGxlID0gJ1BDMycpKSkKYGBgCgo=