# Cluster a 10% sample of `df` on its "V*" columns with H2O k-means, embed the
# same columns in 2-D with t-SNE, and show the embedding as an interactive
# scatter plot coloured by cluster, with document name and paragraph on hover.

# One seed for both R's RNG (sampling, t-SNE) and H2O. `set.seed()` does not
# reach H2O's own RNG, so the seed is also passed to `h2o.kmeans()` below.
rng_seed <- 1387923
set.seed(rng_seed)

# Random 10% sample of the rows, with exact duplicate rows removed.
d <- df %>%
  sample_frac(.1) %>%
  distinct()

# Columns whose names start with "V"; used for both clustering and t-SNE so
# that the two always operate on the same columns, in the same row order as `d`.
v_features <- d %>%
  select(starts_with("V"))

hf <- as.h2o(v_features)

# Fixed k = 5; `estimate_k = FALSE` disables H2O's automatic choice of k.
km <- h2o.kmeans(hf, k = 5, estimate_k = FALSE, seed = rng_seed)
preds <- h2o.predict(km, hf) %>%
  as_tibble()

# NOTE(review): `distinct()` above de-duplicates on all columns of `d`, so rows
# may still be identical on the "V*" columns alone; `check_duplicates = FALSE`
# skips Rtsne's own duplicate check -- confirm that is intended.
# NOTE(review): Rtsne requires 3 * perplexity < nrow - 1, i.e. more than 301
# sampled rows for perplexity = 100 -- confirm the sample is always that large.
tSNE_fit <- Rtsne(
  v_features,
  check_duplicates = FALSE,
  num_threads = 0, # 0 = use all available cores
  momentum = .7,
  final_momentum = .9,
  perplexity = 100
)

# Name the embedding columns on the matrix itself so that `as_tibble()` does
# not have to fall back on its deprecated automatic naming of unnamed matrices.
tsne_coords <- tSNE_fit$Y
colnames(tsne_coords) <- c("tSNE1", "tSNE2")

tSNE_df <- tsne_coords %>%
  as_tibble() %>%
  mutate(cluster = factor(preds$predict))

# `text` is not a ggplot2 aesthetic (ggplot2 warns that it is ignored); it is
# carried through so that `ggplotly()` can use it as the hover tooltip.
p <- tSNE_df %>%
  mutate(
    doc_name = d$doc_name,
    paragraph = d$paragraph
  ) %>%
  mutate(txt = glue::glue("{doc_name}\n{paragraph}")) %>%
  ggplot(aes(x = tSNE1, y = tSNE2, color = cluster)) +
  geom_point(aes(text = txt), alpha = .4) +
  scale_color_viridis_d(option = "C") +
  theme_minimal()

ggplotly(p, tooltip = "text")