Document Embeddings

Code
library(tidyverse)
library(reticulate)
library(plotly)

# Load paragraph embeddings (a NumPy .npy file read through reticulate) and the
# matching paragraph text, and combine them into one table `df` holding
# doc_name, paragraph and one column per embedding dimension (V1, V2, ...).

# Tell reticulate which Python interpreter to use. This must be set before
# Python is initialised by the first import() call below.
# NOTE(review): the path is specific to one machine; adjust when running
# elsewhere.
Sys.setenv("RETICULATE_PYTHON" = "/Users/peerchristensen/Library/r-miniconda/envs/r-reticulate/bin/python")

#conda_install("r-reticulate", "numpy")
#use_condaenv("contracts_pca")
np <- import("numpy")

# Assumes embeddings.npy holds a 2-D array (rows = paragraphs), which
# reticulate converts to an R matrix -- TODO confirm.
data <- np$load("embeddings.npy")

# as.data.frame() names the columns of an unnamed matrix V1, V2, ...; this
# avoids tibble's deprecated automatic naming of unnamed matrices while keeping
# the "V" prefix that later column selections rely on.
embeddings <- data %>%
  as.data.frame() %>%
  as_tibble()

paragraphs <- read_csv("parsed_paragraphs.csv") %>%
  select(doc_name, paragraph)

# Rows are matched purely by position, so the two inputs must line up exactly.
# Fail loudly instead of letting a row-count mismatch be recycled silently.
stopifnot(nrow(paragraphs) == nrow(embeddings))

# Drop exact duplicate rows and every document whose name contains "ENG".
df <- bind_cols(paragraphs, embeddings) %>%
  distinct() %>%
  filter(!str_detect(doc_name, "ENG"))

PCA

Code
# Principal component analysis of the embedding columns, followed by an
# interactive scatter plot of the first two components for a 10% sample of
# paragraphs. Hovering over a point shows the document name and paragraph.

# Select the embedding columns by name (V1, V2, ...), the same way the
# clustering section does, so that no other numeric column can slip into the
# PCA.
df_pca <- df %>%
  select(starts_with("V")) %>%
  prcomp(center = TRUE, scale. = TRUE)

# Seed the sampling so the plot shows the same points on every render.
set.seed(1387923)

p <- df %>%
  select(doc_name, paragraph) %>%
  # df_pca$x holds one row of component scores per row of df, in the same order.
  bind_cols(as_tibble(df_pca$x)) %>%
  sample_frac(.1) %>%
  mutate(txt = glue::glue("{doc_name}\n{paragraph}")) %>%
  ggplot(aes(PC1, PC2)) +
  # `text` is not a ggplot2 aesthetic (ggplot2 warns about it); it is picked up
  # by ggplotly() below and used as the hover tooltip.
  geom_point(aes(text = txt), alpha = .4) +
  theme_minimal()

ggplotly(p, tooltip = "text")

K-means + t-SNE

Code
# Cluster a 10% sample of paragraphs with k-means (k = 5, run in H2O) on the
# embedding columns, project the same sample to two dimensions with t-SNE, and
# plot the projection coloured by cluster with document/paragraph tooltips.

# One seed for everything: set.seed() covers the row sample and Rtsne, but H2O
# has its own random number generator and must be given the seed explicitly.
cluster_seed <- 1387923
set.seed(cluster_seed)

d <- df %>%
  sample_frac(.1) %>%
  distinct()

embedding_cols <- d %>%
  select(starts_with("V"))

# The H2O calls below need a running cluster. Reuse an existing connection if
# there is one; h2o.clusterIsUp() errors when no connection exists, which is
# treated here as "not running".
h2o_running <- tryCatch(h2o::h2o.clusterIsUp(), error = function(e) FALSE)
if (!isTRUE(h2o_running)) {
  h2o::h2o.init()
}

hf <- h2o::as.h2o(embedding_cols)

km <- h2o::h2o.kmeans(hf, k = 5, estimate_k = FALSE, seed = cluster_seed)
preds <- h2o::h2o.predict(km, hf) %>% as_tibble()

# Rtsne refuses to run unless 3 * perplexity <= nrow - 1, so cap the requested
# perplexity of 100 for small samples.
tsne_perplexity <- min(100, floor((nrow(d) - 1) / 3))
stopifnot(tsne_perplexity >= 1)

tSNE_fit <- Rtsne::Rtsne(
  embedding_cols,
  check_duplicates = FALSE,
  num_threads = 0,
  momentum = .7,
  final_momentum = .9,
  perplexity = tsne_perplexity
)

# Name the two t-SNE dimensions before converting, instead of relying on
# tibble's deprecated automatic naming of unnamed matrices.
tsne_coords <- tSNE_fit$Y
colnames(tsne_coords) <- c("tSNE1", "tSNE2")

# Rows of the t-SNE output and of the predictions follow the row order of `d`.
tSNE_df <- tsne_coords %>%
  as_tibble() %>%
  mutate(cluster = factor(preds$predict))

p <- tSNE_df %>%
  mutate(
    doc_name = d$doc_name,
    paragraph = d$paragraph
  ) %>%
  mutate(txt = glue::glue("{doc_name}\n{paragraph}")) %>%
  ggplot(aes(x = tSNE1, y = tSNE2, color = cluster)) +
  # `text` is not a ggplot2 aesthetic (ggplot2 warns about it); it is picked up
  # by ggplotly() below and used as the hover tooltip.
  geom_point(aes(text = txt), alpha = .4) +
  scale_color_viridis_d(option = "C") +
  theme_minimal()

ggplotly(p, tooltip = "text")