glove.R

# along the lines of:
# https://gist.github.com/nassimhaddad/58933f4a5d34b84f4099

library(readr)
library(stringr)
library(knitr)
library(gplots)

## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

# path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.50d.txt.zip"; dims <- 50
path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.100d.txt.zip"; dims <- 100
# path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.200d.txt.zip"; dims <- 200
# path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.300d.txt.zip"; dims <- 300



# Column layout of the vector file: the token first, then one column per
# embedding dimension (d1 .. d<dims>).
col_names <- c("term", paste0("d", 1:dims))

# Read the space-delimited vectors. Quoting is switched off (quote = "") so
# that quote characters inside the file are read literally.
dat <- read_delim(path,
                  col_names = col_names,
                  delim = " ",
                  quote = "")

## Multiple files in zip: reading 'glove.6B.100d.txt'

# Turn the parsed table into a numeric matrix whose row names are the terms.
#
# The terms are captured *before* the term column is dropped and are attached
# to the matrix afterwards. Setting row names on the tibble returned by readr
# and then subsetting with `[` is unreliable: current tibble versions warn
# when row names are set and discard them on subsetting, which would leave the
# matrix without term labels and break every dat["word", ] lookup below.
terms <- dat$term
dat <- as.matrix(dat[, -1])
rownames(dat) <- terms



# head(dat)



library(lsa)

## Loading required package: SnowballC

# Preview: first 10 dimensions of a handful of terms, one column per term,
# rounded to two digits.
kable(
        head(t(dat[c("cat", "dog", "rabbit", "elvis", "beatles", "dylan"),]), 10),
        digits = 2
)

	cat	dog	rabbit	elvis	beatles	dylan
d1	0.23	0.31	0.03	0.38	-0.11	0.09
d2	0.28	0.31	0.04	-0.38	0.20	0.26
d3	0.63	0.53	0.59	0.53	0.33	0.10
d4	-0.59	-0.93	-0.38	-0.45	-0.03	-1.02
d5	-0.59	-0.74	-0.47	0.15	0.43	0.43
d6	0.63	0.63	0.21	0.83	0.51	0.21
d7	0.24	0.44	0.15	0.27	0.61	0.16
d8	-0.14	0.10	-0.07	-0.36	-0.20	-0.01
d9	0.06	-0.09	0.48	0.28	-0.11	-0.07
d10	-0.79	-0.57	-1.19	-1.23	-0.51	-0.83

# Cosine similarities between the embedding vectors of the given terms.
#
# x : vector of terms used to select rows of the global matrix `dat`.
#
# The selected rows are transposed so that each term becomes a column, and
# the result is passed to cosine(); its return value is returned as is.
get_cosine_matrix <- function(x) {
        term_columns <- t(dat[x,])
        cosine(term_columns)
}

# Render the cosine matrix of the given terms as a kable table with values
# rounded to two digits.
#
# x : vector of terms, passed on to get_cosine_matrix().
print_cosine_matrix <- function(x) {
        similarities <- get_cosine_matrix(x)
        kable(similarities, digits = 2)
}

# Draw the cosine matrix of the given terms as a heatmap (gplots::heatmap.2)
# using the cm.colors palette.
#
# x : vector of terms, passed on to get_cosine_matrix().
cosine_matrix_heatmap <- function(x) {
        similarities <- get_cosine_matrix(x)
        heatmap.2(similarities, col = cm.colors)
}

c("cat", "elvis", "dog", "beatles", "rabbit", "dylan") %>% print_cosine_matrix

	cat	elvis	dog	beatles	rabbit	dylan
cat	1.00	0.36	0.88	0.21	0.74	0.19
elvis	0.36	1.00	0.33	0.67	0.27	0.59
dog	0.88	0.33	1.00	0.17	0.65	0.17
beatles	0.21	0.67	0.17	1.00	0.15	0.70
rabbit	0.74	0.27	0.65	0.15	1.00	0.14
dylan	0.19	0.59	0.17	0.70	0.14	1.00

c("cat", "elvis", "dog", "beatles", "rabbit", "dylan") %>% cosine_matrix_heatmap

c("paris", "berlin", "france", "germany") %>% cosine_matrix_heatmap

c("feminist", "activist", "bank", "teller", "money", "finance") %>% cosine_matrix_heatmap

# quick nearest-neighbour lookup helpers (k-nearest-neighbour search via FNN)
library(FNN)

## 
## Attaching package: 'FNN'
## 
## The following object is masked from 'package:lsa':
## 
##     entropy

# Find the k rows of the global matrix `dat` that are nearest to a query
# vector, using FNN's get.knnx().
#
# x : numeric vector in the same space as the rows of `dat`; it is transposed
#     into a single-row query.
# k : number of neighbours to return (default 10).
#
# Returns a data.frame with the neighbouring terms (looked up in the global
# `terms` vector) in `words` and the distances reported by get.knnx() in
# `dist`.
get_closest <- function(x, k = 10){
        query <- t(x)
        neighbours <- get.knnx(data = dat, query = query, k = k)
        nearest_words <- terms[neighbours[["nn.index"]]]
        nearest_dist <- as.vector(neighbours[["nn.dist"]])
        data.frame(words = nearest_words,
                   dist = nearest_dist)
}


# Print the nearest neighbours of a single term as a captioned kable table.
#
# x   : a term used to select a row of the global matrix `dat`; it is also
#       used in the table caption.
# ... : further arguments forwarded to get_closest(), e.g. `k` to change the
#       number of neighbours. Previously `...` was accepted but silently
#       ignored, so the neighbour count could not be changed from here.
print_closest <- function(x, ...) {
        get_closest(dat[x,], ...) %>%
                kable(caption = str_c(x, ": Most similar words"))
}



# find closest words
"wine" %>% print_closest

wine: Most similar words
words	dist
wine	0.000000
wines	3.587189
tasting	4.220061
beer	4.541482
grape	4.647468
champagne	4.674164
coffee	4.723772
drink	4.849089
dessert	4.989297
drinks	5.064714

"paris" %>% print_closest

paris: Most similar words
words	dist
paris	0.000000
london	4.619154
france	4.627724
brussels	4.646891
rome	4.714683
amsterdam	4.772124
vienna	4.922808
berlin	4.958270
french	4.959027
prohertrib	5.083570

"elvis" %>% print_closest

elvis: Most similar words
words	dist
elvis	0.000000
presley	2.726861
sinatra	3.712933
impersonators	4.395840
beatles	4.417561
hendrix	4.583122
dylan	4.645549
impersonator	4.696696
springsteen	4.704137
marilyn	4.704166

"feminist" %>% print_closest

feminist: Most similar words
words	dist
feminist	0.000000
feminism	3.794703
feminists	3.941639
activism	4.282637
postmodern	4.384990
anti-pornography	4.525025
postmodernist	4.573411
modernist	4.582374
left-wing	4.631260
humanist	4.653668

"social" %>% print_closest

social: Most similar words
words	dist
social	0.000000
education	4.452117
political	4.480227
welfare	4.552694
cultural	4.628394
educational	4.660680
reform	4.766635
environment	4.818840
public	4.833534
organizational	4.856950

# arithmetic based on words
(dat["king",] - dat["son",] + dat["daughter",]) %>% get_closest %>% kable

words	dist
queen	3.054499
king	3.166580
elizabeth	4.110683
princess	4.344591
daughter	4.702879
monarch	4.717587
anne	4.756577
sister	4.786452
margaret	4.787183
lady	4.805601

(dat["king",] - dat["man",] + dat["woman",]) %>% get_closest %>% kable

words	dist
king	3.364068
queen	4.081079
monarch	4.642907
throne	4.905501
elizabeth	4.921559
prince	4.981147
daughter	4.985715
mother	5.064087
cousin	5.077497
princess	5.078685

(dat["new",] + dat["york",]) %>% get_closest %>% kable

words	dist
york	6.007761
new	6.068016
boston	7.904500
the	8.179040
chicago	8.272642
angeles	8.277406
on	8.289830
washington	8.322091
manhattan	8.328979
for	8.339557

(dat["new",] + dat["york",] + dat["city",]) %>% get_closest %>% kable

words	dist
york	11.21928
new	11.22376
city	11.48779
the	12.50416
in	12.56927
at	12.57566
angeles	12.59023
an	12.69666
boston	12.72993
on	12.74887

(dat["new",] + dat["york",] + dat["finance",]) %>% get_closest %>% kable

words	dist
new	10.06800
york	10.40987
finance	11.48779
business	11.57991
for	11.59704
office	11.61339
financial	11.62303
the	11.71041
on	11.73747
as	11.83019

(dat["rolling",] + dat["stones",]) %>% get_closest %>% kable

words	dist
stones	4.944118
rolling	5.658773
stone	6.584476
rock	6.670684
rocks	6.685455
metal	6.947805
dirt	7.195856
tires	7.251429
sand	7.252148
roll	7.256610

(dat["rolling",] + dat["stones",] + dat["band",]) %>% get_closest %>% kable

words	dist
band	9.192113
rock	9.435211
stones	9.829665
album	10.209143
bands	10.278002
rolling	10.444018
metal	10.508084
songs	10.661002
song	10.734908
albums	10.786799

(dat["rolling",] * dat["stones",]) %>% get_closest %>% kable

words	dist
oly-2004-cycling	3.841481
indnsia	3.929485
oly-2004-gymnastics	3.943509
lucenttech	3.960635
www.slarmy.org	3.961460
gph04bb	3.971755
canyonres	3.981488
greg.wilcoxdailynews.com	3.985083
kd97	3.987408
em96	3.987853

(dat["rolling",] * dat["stones",] + dat["band",]) %>% get_closest %>% kable

words	dist
band	4.002357
bands	4.303591
rock	5.191870
musicians	5.244460
r.e.m.	5.272875
ac/dc	5.308618
trio	5.346968
punk	5.375955
beatles	5.408529
rockers	5.500042

(dat["rolling",] * dat["stones",] + (dat["rolling",] + dat["stones",])) %>% get_closest %>% kable

words	dist
stones	6.505402
rolling	7.204792
tires	7.928066
rocks	8.079176
dirt	8.315046
barricades	8.318810
boulders	8.339338
metal	8.339379
bricks	8.414550
stone	8.419676

(dat["rolling",] * dat["stones",] + (dat["rolling",] + dat["stones",]) + dat["band",]) %>% get_closest %>% kable

words	dist
stones	10.12676
rock	10.29294
band	10.30725
bands	10.81319
rolling	10.81748
metal	10.94286
album	11.38065
songs	11.39306
rocks	11.45329
albums	11.58872

"beatles" %>% print_closest

beatles: Most similar words
words	dist
beatles	0.000000
lennon	3.773517
mccartney	4.033355
dylan	4.147005
r.e.m.	4.388818
elvis	4.417561
hendrix	4.440084
presley	4.500262
motown	4.558921
sinatra	4.582189

(dat["the",] + dat["beatles",]) %>% get_closest %>% kable

words	dist
the	5.605455
beatles	5.821154
original	6.050194
first	6.205197
band	6.225141
one	6.246675
part	6.262356
time	6.302615
all	6.306094
album	6.335962

(dat["the",] + dat["beatles",] + dat["band",]) %>% get_closest %>% kable

words	dist
band	8.905652
album	9.588286
song	10.025520
rock	10.164441
songs	10.254784
music	10.347172
beatles	10.516223
albums	10.578234
recording	10.720413
bands	10.734764

"doors" %>% print_closest

doors: Most similar words
words	dist
doors	0.000000
door	2.892251
window	3.827778
locked	4.234783
opened	4.339789
room	4.356725
garage	4.436788
inside	4.475200
entrance	4.506697
floor	4.512173

(dat["the",] + dat["doors",]) %>% get_closest %>% kable

words	dist
the	5.372134
doors	5.821154
on	6.093308
door	6.154467
their	6.348921
into	6.356098
before	6.424828
all	6.426935
up	6.493618
two	6.503468

(dat["the",] + dat["doors",] + dat["band",]) %>% get_closest %>% kable

words	dist
the	9.515325
band	9.575906
on	9.971374
their	9.978845
they	10.262334
rock	10.270579
into	10.287472
back	10.333579
its	10.341715
it	10.353337

"queen" %>% print_closest

queen: Most similar words
words	dist
queen	0.000000
princess	3.853247
elizabeth	4.159615
king	4.281252
lady	4.467098
victoria	4.487890
monarch	4.615716
royal	4.673854
majesty	4.694913
crown	4.749913

(dat["queen",] + dat["royals",]) %>% get_closest %>% kable

words	dist
queen	5.276516
royals	6.006717
princess	6.749851
king	6.773056
crown	6.917491
victoria	7.021070
royal	7.182192
prince	7.206046
kings	7.309651
lady	7.312353

(dat["queen",] + dat["band",]) %>% get_closest %>% kable

words	dist
band	6.006717
queen	6.401372
song	7.215384
rock	7.266964
album	7.285269
bands	7.527365
music	7.532709
singer	7.716667
king	7.747794
concert	7.829989

(dat["queen",] + dat["band",] - dat["singer",]) %>% get_closest %>% kable

words	dist
queen	5.250784
royal	5.975947
king	6.340909
band	6.405538
windsor	6.429560
knights	6.465327
majesty	6.484171
kingdom	6.492589
—	6.507686
upon	6.546279

"jimi" %>% print_closest

jimi: Most similar words
words	dist
jimi	0.000000
hendrix	2.027038
clapton	3.981021
joplin	4.041720
wilco	4.145041
janis	4.394502
dylan	4.450860
zeppelin	4.469578
janie	4.624310
cline	4.629106

"janis" %>% print_closest

janis: Most similar words
words	dist
janis	0.000000
joplin	3.980992
jimi	4.394502
hendrix	4.408821
britt	4.530569
cline	4.552597
reeder	4.684725
doucette	4.744579
carmichael	4.779216
loring	4.782758

"lennon" %>% print_closest

lennon: Most similar words
words	dist
lennon	0.000000
mccartney	2.982369
dylan	3.690259
beatles	3.773517
ringo	4.124350
ono	4.135684
morrison	4.372467
sinatra	4.412618
harrison	4.443267
keane	4.531996

"paul" %>% print_closest

paul: Most similar words
words	dist
paul	0.000000
john	3.620322
peter	3.705694
patrick	3.864196
michael	3.958338
andrew	4.046297
gregory	4.106373
martin	4.114295
james	4.144627
robinson	4.171885

(dat["paul",] + dat["beatles",]) %>% get_closest %>% kable

words	dist
paul	5.605455
beatles	5.686023
mccartney	6.211818
lennon	6.303713
john	6.694298
dylan	6.779703
jackson	6.915449
peter	6.954399
neil	7.068818
james	7.081242

"george" %>% print_closest

george: Most similar words
words	dist
george	0.000000
w.	3.196505
john	3.484793
howard	3.546275
charles	3.728872
wilson	3.866285
henry	3.878181
donald	3.984903
james	4.035362
william	4.036626

(dat["george",] + dat["beatles",]) %>% get_closest %>% kable

words	dist
beatles	5.603918
george	5.605455
john	6.041804
harrison	6.435167
james	6.473246
jackson	6.512313
mccartney	6.514504
lennon	6.607794
jimmy	6.651387
paul	6.652483

"ringo" %>% print_closest

ringo: Most similar words
words	dist
ringo	0.000000
shiina	3.968166
lennon	4.124350
mccartney	4.185193
bandmate	4.369752
starkey	4.445920
bandmates	4.460172
karn	4.540136
r.e.m.	4.545028
voormann	4.579271

(dat["ringo",] + dat["beatles",]) %>% get_closest %>% kable

words	dist
beatles	5.012027
ringo	5.605455
lennon	6.384024
mccartney	6.461124
dylan	7.098850
beatle	7.105667
presley	7.294093
sinatra	7.354986
r.e.m.	7.388486
hendrix	7.390636

"faith" %>% print_closest

faith: Most similar words
words	dist
faith	0.000000
belief	3.324202
spirit	4.068222
devotion	4.154548
beliefs	4.222546
wisdom	4.264750
passion	4.345821
true	4.359775
desire	4.379118
spirituality	4.404841

"no" %>% print_closest

no: Most similar words
words	dist
no	0.000000
there	2.717778
any	2.829521
not	2.983645
only	3.093342
without	3.156777
even	3.169766
nothing	3.197312
because	3.335527
but	3.345817

"more" %>% print_closest

more: Most similar words
words	dist
more	0.000000
than	2.307780
some	2.643331
less	2.647678
most	3.019516
much	3.022626
so	3.135619
even	3.265349
least	3.362266
few	3.402258

(dat["faith",] + dat["no",] + dat["more",]) %>% get_closest %>% kable

words	dist
more	9.672123
no	9.811803
not	9.916727
all	9.979179
we	9.993107
it	10.010660
some	10.011909
that	10.029277
any	10.033411
there	10.064119

glove.R

herzog

Wed Jan 13 17:57:31 2016