# along the lines of:
# https://gist.github.com/nassimhaddad/58933f4a5d34b84f4099

library(readr)
library(stringr)
library(knitr)
library(gplots)
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
# Choose one pre-trained GloVe archive. `dims` must equal the vector length
# encoded in the file name, because it determines how many columns are read.
# path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.50d.txt.zip"; dims <- 50
path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.100d.txt.zip"; dims <- 100
# path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.200d.txt.zip"; dims <- 200
# path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.300d.txt.zip"; dims <- 300



# Column layout: the term first, then one numeric column per dimension (d1..dN).
col_names <- c("term", str_c("d", seq_len(dims)))

# Build `dat` as a numeric matrix with one row per term, named by that term.
dat <- local({
        raw <- read_delim(file = path,
                          delim = " ",
                          quote = "",
                          # no token is treated as missing, so terms such as
                          # "NA" stay ordinary vocabulary entries
                          na = character(),
                          col_types = cols(term = col_character(),
                                           .default = col_double()),
                          col_names = col_names)
        # Convert to a matrix first and attach the row names afterwards:
        # tibbles discard row names when subset with `[`, which would leave
        # the matrix unnamed and break lookups such as dat["cat", ].
        vectors <- as.matrix(raw[, -1])
        rownames(vectors) <- raw$term
        vectors
})
## Multiple files in zip: reading 'glove.6B.100d.txt'



# head(dat)

# Vocabulary in matrix row order; used to turn neighbour indices into words.
terms <- rownames(dat)



library(lsa)
## Loading required package: SnowballC
# Show the first ten dimensions of a few example terms, one column per term,
# rounded to two decimals.
kable(
        head(t(dat[c("cat", "dog", "rabbit", "elvis", "beatles", "dylan"), ]), 10),
        digits = 2
)
cat dog rabbit elvis beatles dylan
d1 0.23 0.31 0.03 0.38 -0.11 0.09
d2 0.28 0.31 0.04 -0.38 0.20 0.26
d3 0.63 0.53 0.59 0.53 0.33 0.10
d4 -0.59 -0.93 -0.38 -0.45 -0.03 -1.02
d5 -0.59 -0.74 -0.47 0.15 0.43 0.43
d6 0.63 0.63 0.21 0.83 0.51 0.21
d7 0.24 0.44 0.15 0.27 0.61 0.16
d8 -0.14 0.10 -0.07 -0.36 -0.20 -0.01
d9 0.06 -0.09 0.48 0.28 -0.11 -0.07
d10 -0.79 -0.57 -1.19 -1.23 -0.51 -0.83
# Pairwise cosine similarity between the embedding vectors of the given terms.
#
# x: character vector of terms; each must be a row name of the global `dat`.
#
# Returns a square matrix with one row and one column per element of `x`
# (labelled by term), with 1 on the diagonal.
get_cosine_matrix <- function(x) {
        # drop = FALSE keeps a one-row matrix when a single term is requested,
        # so the result is always term-by-term rather than dimension-by-dimension
        vecs <- dat[x, , drop = FALSE]
        # scale every row to unit length; the cross-product of unit vectors
        # is their cosine similarity
        unit <- vecs / sqrt(rowSums(vecs^2))
        sims <- tcrossprod(unit)
        # a vector's similarity with itself is 1 by definition; set it exactly
        # to avoid floating-point noise on the diagonal
        diag(sims) <- 1
        sims
}

# Render the cosine-similarity matrix of the terms in `x` as a kable table
# rounded to two decimals.
print_cosine_matrix <- function(x) {
        similarities <- get_cosine_matrix(x)
        kable(similarities, digits = 2)
}

# Draw the cosine-similarity matrix of the terms in `x` with gplots'
# heatmap.2(), using the cm.colors palette.
cosine_matrix_heatmap <- function(x) {
        similarities <- get_cosine_matrix(x)
        heatmap.2(similarities, col = cm.colors)
}

print_cosine_matrix(c("cat", "elvis", "dog", "beatles", "rabbit", "dylan"))
cat elvis dog beatles rabbit dylan
cat 1.00 0.36 0.88 0.21 0.74 0.19
elvis 0.36 1.00 0.33 0.67 0.27 0.59
dog 0.88 0.33 1.00 0.17 0.65 0.17
beatles 0.21 0.67 0.17 1.00 0.15 0.70
rabbit 0.74 0.27 0.65 0.15 1.00 0.14
dylan 0.19 0.59 0.17 0.70 0.14 1.00
cosine_matrix_heatmap(c("cat", "elvis", "dog", "beatles", "rabbit", "dylan"))

cosine_matrix_heatmap(c("paris", "berlin", "france", "germany"))

cosine_matrix_heatmap(c("feminist", "activist", "bank", "teller", "money", "finance"))

# quick helper functions for nearest-neighbour word lookup
library(FNN)
## 
## Attaching package: 'FNN'
## 
## The following object is masked from 'package:lsa':
## 
##     entropy
# Find the k rows of the global `dat` matrix nearest to the vector `x`.
#
# x: numeric vector, passed to the search as a single query row.
# k: number of neighbours to return (default 10).
#
# Returns a data frame with columns `words` (looked up in the global `terms`)
# and `dist` (the distances reported by FNN's get.knnx()).
get_closest <- function(x, k = 10) {
        # t() turns the plain vector into the one-row query matrix
        query <- t(x)
        neighbours <- get.knnx(data = dat, query = query, k = k)
        data.frame(
                words = terms[neighbours$nn.index],
                dist = as.vector(neighbours$nn.dist)
        )
}


# Print a table of the nearest neighbours of a single vocabulary term.
#
# x:   a term that is a row name of the global `dat` matrix.
# ...: further arguments passed on to get_closest(), e.g. `k` to change the
#      number of neighbours listed.
#
# Returns the kable table, captioned "<term>: Most similar words".
print_closest <- function(x, ...) {
        # forward `...` so options such as `k` actually reach get_closest()
        neighbours <- get_closest(dat[x, ], ...)
        kable(neighbours, caption = str_c(x, ": Most similar words"))
}



# find closest words
"wine" %>% print_closest
wine: Most similar words
words dist
wine 0.000000
wines 3.587189
tasting 4.220061
beer 4.541482
grape 4.647468
champagne 4.674164
coffee 4.723772
drink 4.849089
dessert 4.989297
drinks 5.064714
"paris" %>% print_closest
paris: Most similar words
words dist
paris 0.000000
london 4.619154
france 4.627724
brussels 4.646891
rome 4.714683
amsterdam 4.772124
vienna 4.922808
berlin 4.958270
french 4.959027
prohertrib 5.083570
"elvis" %>% print_closest
elvis: Most similar words
words dist
elvis 0.000000
presley 2.726861
sinatra 3.712933
impersonators 4.395840
beatles 4.417561
hendrix 4.583122
dylan 4.645549
impersonator 4.696696
springsteen 4.704137
marilyn 4.704166
"feminist" %>% print_closest
feminist: Most similar words
words dist
feminist 0.000000
feminism 3.794703
feminists 3.941639
activism 4.282637
postmodern 4.384990
anti-pornography 4.525025
postmodernist 4.573411
modernist 4.582374
left-wing 4.631260
humanist 4.653668
"social" %>% print_closest
social: Most similar words
words dist
social 0.000000
education 4.452117
political 4.480227
welfare 4.552694
cultural 4.628394
educational 4.660680
reform 4.766635
environment 4.818840
public 4.833534
organizational 4.856950
# arithmetic based on words
(dat["king",] - dat["son",] + dat["daughter",]) %>% get_closest %>% kable
words dist
queen 3.054499
king 3.166580
elizabeth 4.110683
princess 4.344591
daughter 4.702879
monarch 4.717587
anne 4.756577
sister 4.786452
margaret 4.787183
lady 4.805601
(dat["king",] - dat["man",] + dat["woman",]) %>% get_closest %>% kable
words dist
king 3.364068
queen 4.081079
monarch 4.642907
throne 4.905501
elizabeth 4.921559
prince 4.981147
daughter 4.985715
mother 5.064087
cousin 5.077497
princess 5.078685
(dat["new",] + dat["york",]) %>% get_closest %>% kable
words dist
york 6.007761
new 6.068016
boston 7.904500
the 8.179040
chicago 8.272642
angeles 8.277406
on 8.289830
washington 8.322091
manhattan 8.328979
for 8.339557
(dat["new",] + dat["york",] + dat["city",]) %>% get_closest %>% kable
words dist
york 11.21928
new 11.22376
city 11.48779
the 12.50416
in 12.56927
at 12.57566
angeles 12.59023
an 12.69666
boston 12.72993
on 12.74887
(dat["new",] + dat["york",] + dat["finance",]) %>% get_closest %>% kable
words dist
new 10.06800
york 10.40987
finance 11.48779
business 11.57991
for 11.59704
office 11.61339
financial 11.62303
the 11.71041
on 11.73747
as 11.83019
(dat["rolling",] + dat["stones",]) %>% get_closest %>% kable
words dist
stones 4.944118
rolling 5.658773
stone 6.584476
rock 6.670684
rocks 6.685455
metal 6.947805
dirt 7.195856
tires 7.251429
sand 7.252148
roll 7.256610
(dat["rolling",] + dat["stones",] + dat["band",]) %>% get_closest %>% kable
words dist
band 9.192113
rock 9.435211
stones 9.829665
album 10.209143
bands 10.278002
rolling 10.444018
metal 10.508084
songs 10.661002
song 10.734908
albums 10.786799
(dat["rolling",] * dat["stones",]) %>% get_closest %>% kable
words dist
oly-2004-cycling 3.841481
indnsia 3.929485
oly-2004-gymnastics 3.943509
lucenttech 3.960635
www.slarmy.org 3.961460
gph04bb 3.971755
canyonres 3.981488
greg.wilcoxdailynews.com 3.985083
kd97 3.987408
em96 3.987853
(dat["rolling",] * dat["stones",] + dat["band",]) %>% get_closest %>% kable
words dist
band 4.002357
bands 4.303591
rock 5.191870
musicians 5.244460
r.e.m. 5.272875
ac/dc 5.308618
trio 5.346968
punk 5.375955
beatles 5.408529
rockers 5.500042
(dat["rolling",] * dat["stones",] + (dat["rolling",] + dat["stones",])) %>% get_closest %>% kable
words dist
stones 6.505402
rolling 7.204792
tires 7.928066
rocks 8.079176
dirt 8.315046
barricades 8.318810
boulders 8.339338
metal 8.339379
bricks 8.414550
stone 8.419676
(dat["rolling",] * dat["stones",] + (dat["rolling",] + dat["stones",]) + dat["band",]) %>% get_closest %>% kable
words dist
stones 10.12676
rock 10.29294
band 10.30725
bands 10.81319
rolling 10.81748
metal 10.94286
album 11.38065
songs 11.39306
rocks 11.45329
albums 11.58872
"beatles" %>% print_closest
beatles: Most similar words
words dist
beatles 0.000000
lennon 3.773517
mccartney 4.033355
dylan 4.147005
r.e.m. 4.388818
elvis 4.417561
hendrix 4.440084
presley 4.500262
motown 4.558921
sinatra 4.582189
(dat["the",] + dat["beatles",]) %>% get_closest %>% kable
words dist
the 5.605455
beatles 5.821154
original 6.050194
first 6.205197
band 6.225141
one 6.246675
part 6.262356
time 6.302615
all 6.306094
album 6.335962
(dat["the",] + dat["beatles",] + dat["band",]) %>% get_closest %>% kable
words dist
band 8.905652
album 9.588286
song 10.025520
rock 10.164441
songs 10.254784
music 10.347172
beatles 10.516223
albums 10.578234
recording 10.720413
bands 10.734764
"doors" %>% print_closest
doors: Most similar words
words dist
doors 0.000000
door 2.892251
window 3.827778
locked 4.234783
opened 4.339789
room 4.356725
garage 4.436788
inside 4.475200
entrance 4.506697
floor 4.512173
(dat["the",] + dat["doors",]) %>% get_closest %>% kable
words dist
the 5.372134
doors 5.821154
on 6.093308
door 6.154467
their 6.348921
into 6.356098
before 6.424828
all 6.426935
up 6.493618
two 6.503468
(dat["the",] + dat["doors",] + dat["band",]) %>% get_closest %>% kable
words dist
the 9.515325
band 9.575906
on 9.971374
their 9.978845
they 10.262334
rock 10.270579
into 10.287472
back 10.333579
its 10.341715
it 10.353337
"queen" %>% print_closest
queen: Most similar words
words dist
queen 0.000000
princess 3.853247
elizabeth 4.159615
king 4.281252
lady 4.467098
victoria 4.487890
monarch 4.615716
royal 4.673854
majesty 4.694913
crown 4.749913
(dat["queen",] + dat["royals",]) %>% get_closest %>% kable
words dist
queen 5.276516
royals 6.006717
princess 6.749851
king 6.773056
crown 6.917491
victoria 7.021070
royal 7.182192
prince 7.206046
kings 7.309651
lady 7.312353
(dat["queen",] + dat["band",]) %>% get_closest %>% kable
words dist
band 6.006717
queen 6.401372
song 7.215384
rock 7.266964
album 7.285269
bands 7.527365
music 7.532709
singer 7.716667
king 7.747794
concert 7.829989
(dat["queen",] + dat["band",] - dat["singer",]) %>% get_closest %>% kable
words dist
queen 5.250784
royal 5.975947
king 6.340909
band 6.405538
windsor 6.429560
knights 6.465327
majesty 6.484171
kingdom 6.492589
6.507686
upon 6.546279
"jimi" %>% print_closest
jimi: Most similar words
words dist
jimi 0.000000
hendrix 2.027038
clapton 3.981021
joplin 4.041720
wilco 4.145041
janis 4.394502
dylan 4.450860
zeppelin 4.469578
janie 4.624310
cline 4.629106
"janis" %>% print_closest
janis: Most similar words
words dist
janis 0.000000
joplin 3.980992
jimi 4.394502
hendrix 4.408821
britt 4.530569
cline 4.552597
reeder 4.684725
doucette 4.744579
carmichael 4.779216
loring 4.782758
"lennon" %>% print_closest
lennon: Most similar words
words dist
lennon 0.000000
mccartney 2.982369
dylan 3.690259
beatles 3.773517
ringo 4.124350
ono 4.135684
morrison 4.372467
sinatra 4.412618
harrison 4.443267
keane 4.531996
"paul" %>% print_closest
paul: Most similar words
words dist
paul 0.000000
john 3.620322
peter 3.705694
patrick 3.864196
michael 3.958338
andrew 4.046297
gregory 4.106373
martin 4.114295
james 4.144627
robinson 4.171885
(dat["paul",] + dat["beatles",]) %>% get_closest %>% kable
words dist
paul 5.605455
beatles 5.686023
mccartney 6.211818
lennon 6.303713
john 6.694298
dylan 6.779703
jackson 6.915449
peter 6.954399
neil 7.068818
james 7.081242
"george" %>% print_closest
george: Most similar words
words dist
george 0.000000
w. 3.196505
john 3.484793
howard 3.546275
charles 3.728872
wilson 3.866285
henry 3.878181
donald 3.984903
james 4.035362
william 4.036626
(dat["george",] + dat["beatles",]) %>% get_closest %>% kable
words dist
beatles 5.603918
george 5.605455
john 6.041804
harrison 6.435167
james 6.473246
jackson 6.512313
mccartney 6.514504
lennon 6.607794
jimmy 6.651387
paul 6.652483
"ringo" %>% print_closest
ringo: Most similar words
words dist
ringo 0.000000
shiina 3.968166
lennon 4.124350
mccartney 4.185193
bandmate 4.369752
starkey 4.445920
bandmates 4.460172
karn 4.540136
r.e.m. 4.545028
voormann 4.579271
(dat["ringo",] + dat["beatles",]) %>% get_closest %>% kable
words dist
beatles 5.012027
ringo 5.605455
lennon 6.384024
mccartney 6.461124
dylan 7.098850
beatle 7.105667
presley 7.294093
sinatra 7.354986
r.e.m. 7.388486
hendrix 7.390636
"faith" %>% print_closest
faith: Most similar words
words dist
faith 0.000000
belief 3.324202
spirit 4.068222
devotion 4.154548
beliefs 4.222546
wisdom 4.264750
passion 4.345821
true 4.359775
desire 4.379118
spirituality 4.404841
"no" %>% print_closest
no: Most similar words
words dist
no 0.000000
there 2.717778
any 2.829521
not 2.983645
only 3.093342
without 3.156777
even 3.169766
nothing 3.197312
because 3.335527
but 3.345817
"more" %>% print_closest
more: Most similar words
words dist
more 0.000000
than 2.307780
some 2.643331
less 2.647678
most 3.019516
much 3.022626
so 3.135619
even 3.265349
least 3.362266
few 3.402258
(dat["faith",] + dat["no",] + dat["more",]) %>% get_closest %>% kable
words dist
more 9.672123
no 9.811803
not 9.916727
all 9.979179
we 9.993107
it 10.010660
some 10.011909
that 10.029277
any 10.033411
there 10.064119