In-workshop materials

Introduction

# By default, do not evaluate code chunks when this document is knitted.
knitr::opts_chunk$set(eval = FALSE)

library(wordVectors)
library(magrittr)

# Load the word-vector model from the local .bin file; every query in the
# first half of this document runs against `news`.
news <- read.vectors("~/word2vec_models/short_google_news.bin")


# The same option is set a second time here; the value is unchanged.
knitr::opts_chunk$set(eval = FALSE)

Closeness

# The 40 entries of `news` nearest to the vector for "America".
nearest_to(news, news[["America"]], 40)

# The same query for "Canada".
nearest_to(news, news[["Canada"]], 40)

# Names of the 100 entries nearest to the vector looked up for France and
# Germany together.
europe_words <- names(nearest_to(news, news[[c("France", "Germany")]], 100))

# Look the Europe words up with average = FALSE so that the result keeps one
# row per word (its rownames are used as plot labels further down).
just_europe <- news[[europe_words, average = FALSE]]

Plotting Europe words by their similarity.

This takes a bit of R code, but the comments in it show what to change if you want to compare other words.

library(ggplot2)

# One row per Europe word, with its cosine similarity to "France" and to
# "Germany".
country_similarities <- data.frame(
  # To compare against a different word, change *both* the column name
  # (e.g. 'france_similarity') and the vector it is compared with
  # (e.g. 'news[["France"]]'). The plot below refers to the column names too.
  france_similarity = cosineSimilarity(just_europe, news[["France"]]),
  germany_similarity = cosineSimilarity(just_europe, news[["Germany"]]),
  word = rownames(just_europe)
)

# Draw each word as a text label, positioned by its similarity to France (x)
# and to Germany (y).
ggplot(country_similarities) +
  geom_text(
    mapping = aes(
      x = france_similarity,
      y = germany_similarity,
      label = word
    )
  )

Some analogies.

# Install abind only when it is missing, so that re-running this chunk does
# not download and reinstall the package every time.
if (!requireNamespace("abind", quietly = TRUE)) {
  install.packages("abind")
}
library(abind)

# 50 nearest entries to the negated (woman - man) difference vector.
nearest_to(news, -(news[["woman"]] - news[["man"]]), 50)

# father - man + woman
nearest_to(news, news[["father"]] - news[["man"]] + news[["woman"]])

# Berlin + Spain - Germany
nearest_to(news, news[["Berlin"]] + news[["Spain"]] - news[["Germany"]])

# NOTE(review): plot_words() is not defined in the visible code; presumably it
# comes from the alignment.R script sourced further down -- confirm.
plot_words(
  c(
    "father", "mother", "uncle", "aunt", "grandfather", "grandmother",
    "sister", "brother", "son", "daughter"
  ),
  news
)

Alignment

Alignment of multiple models presents fascinating, if poorly understood, opportunities.

library(shapes)
library(wordVectors)
library(magrittr)
library(abind)

# Run the workshop's alignment script.
# NOTE(review): presumably this defines read_group(), align_models() and
# stanford_plot(), none of which are defined in the visible code -- confirm.
# NOTE(review): this executes remote code fetched over plain HTTP; consider
# sourcing a reviewed local copy of alignment.R instead.
source("http://benschmidt.org/word2vec_workshop/alignment.R")

# If you downloaded these files yourself, they may be in a different folder;
# adjust the path below to match.

# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
model_names <- sort(
  list.files("~/word2vec_models/good_hansard/", full.names = TRUE)
)
models <- read_group(model_names)


# Nearest neighbours of "america" within the third model.
# The word vector is looked up in that same model: `models` is indexed by
# position here, so `models[["america"]]` would address the collection of
# models rather than a word inside one of them.
# NOTE(review): assumes read_group() returns a list of models -- confirm.
models[[3]] %>% nearest_to(models[[3]][["america"]])

Align the models

# Align the group of models, restricted to their shared vocabulary.
aligned <- align_models(models, shared_vocab_only = TRUE)

# One plot per word, all with the "pca" transform. The positional 15 is
# passed straight through to stanford_plot().
# NOTE(review): presumably 15 is the number of neighbouring words to show --
# confirm against stanford_plot()'s definition.
stanford_plot(aligned, word = "communism", 15, transform_type = "pca")

stanford_plot(aligned, word = "english", 15, transform_type = "pca")

stanford_plot(aligned, word = "empire", 15, transform_type = "pca")

stanford_plot(aligned, word = "market", 15, transform_type = "pca")

# Row-wise Euclidean distance between two equally shaped matrices: for each
# row, the square root of the summed squared differences.
distance <- function(mx, my) {
  delta <- mx - my
  sqrt(rowSums(delta^2))
}
# For every aligned model, the row-wise distance between that model and the
# third model, which serves as the reference. sapply() binds the per-model
# results together as columns.
# seq_along() is used instead of 1:length() so that an empty `aligned` does
# not produce the indices c(1, 0).
distmat <- sapply(seq_along(aligned), function(i) {
  distance(aligned[[i]], aligned[[3]])
})

# Rows with the largest mean distance come first: the means are negated so
# that sort()'s ascending order puts the biggest ones at the top (they are
# therefore shown as negative numbers).
# head() returns at most 100 entries and, unlike `[1:100]`, does not pad the
# result with NA when fewer than 100 rows exist.
head(sort(-apply(distmat, 1, mean)), 100)

# Plot "liberty" with stanford_plot()'s default settings.
stanford_plot(aligned, "liberty")

# Same kind of plot for "tyranny", using the "mds" transform instead.
stanford_plot(aligned, word = "tyranny", 15, transform_type = "mds")

# Communism becomes taken *seriously*.
stanford_plot(aligned, word = "communism", 15, transform_type = "pca")

stanford_plot(aligned, word = "english", 15, transform_type = "pca")

stanford_plot(aligned, word = "empire", 15, transform_type = "pca")

stanford_plot(aligned, word = "market", 15, transform_type = "pca")

stanford_plot(aligned, word = "economy", 15, transform_type = "pca")


stanford_plot(aligned, word = "empire", 15, transform_type = "pca")

# Two words passed together in a single call.
stanford_plot(aligned, word = c("king", "queen"), 15, transform_type = "pca")