# along the lines of:
# https://gist.github.com/nassimhaddad/58933f4a5d34b84f4099
library(readr)
library(stringr)
library(knitr)
library(gplots)
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.50d.txt.zip"; dims <- 50
path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.100d.txt.zip"; dims <- 100
# path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.200d.txt.zip"; dims <- 200
# path <- "pre-trained-vectors/wikipedia2014_gigaword5/glove.6B.300d.txt.zip"; dims <- 300
# Column layout of a GloVe text file: the term, then `dims` vector components.
col_names <- c("term", str_c("d", seq_len(dims)))
# Read the zipped, space-delimited embedding file. `quote = ""` switches off
# quote handling so quote characters in the file are read literally, and the
# explicit column types stop type guessing from mis-typing any column.
dat <- read_delim(file = path,
                  delim = " ",
                  quote = "",
                  col_names = col_names,
                  col_types = cols(term = col_character(),
                                   .default = col_double()))
## Multiple files in zip: reading 'glove.6B.100d.txt'
# Pull the vocabulary out before building the matrix. Row names assigned to a
# tibble are deprecated and are discarded when the tibble is subset, so they
# are attached to the numeric matrix instead; later lookups such as
# dat["cat", ] rely on them.
terms <- dat$term
dat <- as.matrix(dat[, -1])
rownames(dat) <- terms
# head(dat)
library(lsa)
## Loading required package: SnowballC
# Preview the first ten embedding dimensions for a few example terms;
# after the transpose each term is a column and each dimension a row.
kable(
  head(
    t(dat[c("cat", "dog", "rabbit", "elvis", "beatles", "dylan"), ]),
    10
  ),
  digits = 2
)
| d1 |
0.23 |
0.31 |
0.03 |
0.38 |
-0.11 |
0.09 |
| d2 |
0.28 |
0.31 |
0.04 |
-0.38 |
0.20 |
0.26 |
| d3 |
0.63 |
0.53 |
0.59 |
0.53 |
0.33 |
0.10 |
| d4 |
-0.59 |
-0.93 |
-0.38 |
-0.45 |
-0.03 |
-1.02 |
| d5 |
-0.59 |
-0.74 |
-0.47 |
0.15 |
0.43 |
0.43 |
| d6 |
0.63 |
0.63 |
0.21 |
0.83 |
0.51 |
0.21 |
| d7 |
0.24 |
0.44 |
0.15 |
0.27 |
0.61 |
0.16 |
| d8 |
-0.14 |
0.10 |
-0.07 |
-0.36 |
-0.20 |
-0.01 |
| d9 |
0.06 |
-0.09 |
0.48 |
0.28 |
-0.11 |
-0.07 |
| d10 |
-0.79 |
-0.57 |
-1.19 |
-1.23 |
-0.51 |
-0.83 |
# Pairwise cosine similarity between the embedding vectors of the given terms.
#
# x: character vector of terms; each must be a row name of the global `dat`.
# Returns whatever lsa's cosine() yields for the transposed selection
# (one column per term).
get_cosine_matrix <- function(x) {
  selected <- dat[x, ]
  # cosine() compares columns, so put one term in each column first.
  cosine(t(selected))
}
# Render the cosine-similarity matrix of the given terms as a table,
# rounded to two decimal places.
#
# x: character vector of terms, passed straight to get_cosine_matrix().
print_cosine_matrix <- function(x) {
  similarities <- get_cosine_matrix(x)
  kable(similarities, digits = 2)
}
# Draw the cosine-similarity matrix of the given terms as a gplots heatmap
# using the cm.colors palette.
#
# x: character vector of terms, passed straight to get_cosine_matrix().
cosine_matrix_heatmap <- function(x) {
  similarities <- get_cosine_matrix(x)
  heatmap.2(similarities, col = cm.colors)
}
c("cat", "elvis", "dog", "beatles", "rabbit", "dylan") %>% print_cosine_matrix
| cat |
1.00 |
0.36 |
0.88 |
0.21 |
0.74 |
0.19 |
| elvis |
0.36 |
1.00 |
0.33 |
0.67 |
0.27 |
0.59 |
| dog |
0.88 |
0.33 |
1.00 |
0.17 |
0.65 |
0.17 |
| beatles |
0.21 |
0.67 |
0.17 |
1.00 |
0.15 |
0.70 |
| rabbit |
0.74 |
0.27 |
0.65 |
0.15 |
1.00 |
0.14 |
| dylan |
0.19 |
0.59 |
0.17 |
0.70 |
0.14 |
1.00 |
c("cat", "elvis", "dog", "beatles", "rabbit", "dylan") %>% cosine_matrix_heatmap

c("paris", "berlin", "france", "germany") %>% cosine_matrix_heatmap

c("feminist", "activist", "bank", "teller", "money", "finance") %>% cosine_matrix_heatmap

# helper functions: nearest-neighbour lookup in the embedding matrix (FNN)
library(FNN)
##
## Attaching package: 'FNN'
##
## The following object is masked from 'package:lsa':
##
## entropy
# Find the k rows of the global matrix `dat` that lie closest to a query
# vector, using FNN's get.knnx() nearest-neighbour search.
#
# x: numeric query vector (e.g. a row of `dat` or a combination of rows);
#    it is transposed into the single query row handed to get.knnx().
# k: number of neighbours to return (default 10).
# Returns a data.frame with columns `words` (the matching entries of the
# global `terms`) and `dist` (the distances reported by get.knnx()).
get_closest <- function(x, k = 10) {
  query <- t(x)
  neighbours <- get.knnx(dat, query, k = k)
  hits <- terms[neighbours$nn.index]
  data.frame(words = hits,
             dist = as.vector(neighbours$nn.dist))
}
# Print the nearest neighbours of a single term as a captioned table.
#
# x:   term to look up; used as a row name of the global `dat` and in the
#      table caption.
# ...: further arguments forwarded to get_closest(), e.g. `k` to change the
#      number of neighbours. (Previously these were accepted but ignored.)
print_closest <- function(x, ...) {
  get_closest(dat[x, ], ...) %>%
    kable(caption = str_c(x, ": Most similar words"))
}
# find the ten closest words (smallest distance first) for individual terms
"wine" %>% print_closest
wine: Most similar words
| wine |
0.000000 |
| wines |
3.587189 |
| tasting |
4.220061 |
| beer |
4.541482 |
| grape |
4.647468 |
| champagne |
4.674164 |
| coffee |
4.723772 |
| drink |
4.849089 |
| dessert |
4.989297 |
| drinks |
5.064714 |
"paris" %>% print_closest
paris: Most similar words
| paris |
0.000000 |
| london |
4.619154 |
| france |
4.627724 |
| brussels |
4.646891 |
| rome |
4.714683 |
| amsterdam |
4.772124 |
| vienna |
4.922808 |
| berlin |
4.958270 |
| french |
4.959027 |
| prohertrib |
5.083570 |
"elvis" %>% print_closest
elvis: Most similar words
| elvis |
0.000000 |
| presley |
2.726861 |
| sinatra |
3.712933 |
| impersonators |
4.395840 |
| beatles |
4.417561 |
| hendrix |
4.583122 |
| dylan |
4.645549 |
| impersonator |
4.696696 |
| springsteen |
4.704137 |
| marilyn |
4.704166 |
"feminist" %>% print_closest
feminist: Most similar words
| feminist |
0.000000 |
| feminism |
3.794703 |
| feminists |
3.941639 |
| activism |
4.282637 |
| postmodern |
4.384990 |
| anti-pornography |
4.525025 |
| postmodernist |
4.573411 |
| modernist |
4.582374 |
| left-wing |
4.631260 |
| humanist |
4.653668 |
"social" %>% print_closest
social: Most similar words
| social |
0.000000 |
| education |
4.452117 |
| political |
4.480227 |
| welfare |
4.552694 |
| cultural |
4.628394 |
| educational |
4.660680 |
| reform |
4.766635 |
| environment |
4.818840 |
| public |
4.833534 |
| organizational |
4.856950 |
# vector arithmetic on word embeddings: combine word vectors, then list the terms closest to the result
(dat["king",] - dat["son",] + dat["daughter",]) %>% get_closest %>% kable
| queen |
3.054499 |
| king |
3.166580 |
| elizabeth |
4.110683 |
| princess |
4.344591 |
| daughter |
4.702879 |
| monarch |
4.717587 |
| anne |
4.756577 |
| sister |
4.786452 |
| margaret |
4.787183 |
| lady |
4.805601 |
(dat["king",] - dat["man",] + dat["woman",]) %>% get_closest %>% kable
| king |
3.364068 |
| queen |
4.081079 |
| monarch |
4.642907 |
| throne |
4.905501 |
| elizabeth |
4.921559 |
| prince |
4.981147 |
| daughter |
4.985715 |
| mother |
5.064087 |
| cousin |
5.077497 |
| princess |
5.078685 |
(dat["new",] + dat["york",]) %>% get_closest %>% kable
| york |
6.007761 |
| new |
6.068016 |
| boston |
7.904500 |
| the |
8.179040 |
| chicago |
8.272642 |
| angeles |
8.277406 |
| on |
8.289830 |
| washington |
8.322091 |
| manhattan |
8.328979 |
| for |
8.339557 |
(dat["new",] + dat["york",] + dat["city",]) %>% get_closest %>% kable
| york |
11.21928 |
| new |
11.22376 |
| city |
11.48779 |
| the |
12.50416 |
| in |
12.56927 |
| at |
12.57566 |
| angeles |
12.59023 |
| an |
12.69666 |
| boston |
12.72993 |
| on |
12.74887 |
(dat["new",] + dat["york",] + dat["finance",]) %>% get_closest %>% kable
| new |
10.06800 |
| york |
10.40987 |
| finance |
11.48779 |
| business |
11.57991 |
| for |
11.59704 |
| office |
11.61339 |
| financial |
11.62303 |
| the |
11.71041 |
| on |
11.73747 |
| as |
11.83019 |
(dat["rolling",] + dat["stones",]) %>% get_closest %>% kable
| stones |
4.944118 |
| rolling |
5.658773 |
| stone |
6.584476 |
| rock |
6.670684 |
| rocks |
6.685455 |
| metal |
6.947805 |
| dirt |
7.195856 |
| tires |
7.251429 |
| sand |
7.252148 |
| roll |
7.256610 |
(dat["rolling",] + dat["stones",] + dat["band",]) %>% get_closest %>% kable
| band |
9.192113 |
| rock |
9.435211 |
| stones |
9.829665 |
| album |
10.209143 |
| bands |
10.278002 |
| rolling |
10.444018 |
| metal |
10.508084 |
| songs |
10.661002 |
| song |
10.734908 |
| albums |
10.786799 |
(dat["rolling",] * dat["stones",]) %>% get_closest %>% kable
| oly-2004-cycling |
3.841481 |
| indnsia |
3.929485 |
| oly-2004-gymnastics |
3.943509 |
| lucenttech |
3.960635 |
| www.slarmy.org |
3.961460 |
| gph04bb |
3.971755 |
| canyonres |
3.981488 |
| greg.wilcoxdailynews.com |
3.985083 |
| kd97 |
3.987408 |
| em96 |
3.987853 |
(dat["rolling",] * dat["stones",] + dat["band",]) %>% get_closest %>% kable
| band |
4.002357 |
| bands |
4.303591 |
| rock |
5.191870 |
| musicians |
5.244460 |
| r.e.m. |
5.272875 |
| ac/dc |
5.308618 |
| trio |
5.346968 |
| punk |
5.375955 |
| beatles |
5.408529 |
| rockers |
5.500042 |
(dat["rolling",] * dat["stones",] + (dat["rolling",] + dat["stones",])) %>% get_closest %>% kable
| stones |
6.505402 |
| rolling |
7.204792 |
| tires |
7.928066 |
| rocks |
8.079176 |
| dirt |
8.315046 |
| barricades |
8.318810 |
| boulders |
8.339338 |
| metal |
8.339379 |
| bricks |
8.414550 |
| stone |
8.419676 |
(dat["rolling",] * dat["stones",] + (dat["rolling",] + dat["stones",]) + dat["band",]) %>% get_closest %>% kable
| stones |
10.12676 |
| rock |
10.29294 |
| band |
10.30725 |
| bands |
10.81319 |
| rolling |
10.81748 |
| metal |
10.94286 |
| album |
11.38065 |
| songs |
11.39306 |
| rocks |
11.45329 |
| albums |
11.58872 |
"beatles" %>% print_closest
beatles: Most similar words
| beatles |
0.000000 |
| lennon |
3.773517 |
| mccartney |
4.033355 |
| dylan |
4.147005 |
| r.e.m. |
4.388818 |
| elvis |
4.417561 |
| hendrix |
4.440084 |
| presley |
4.500262 |
| motown |
4.558921 |
| sinatra |
4.582189 |
(dat["the",] + dat["beatles",]) %>% get_closest %>% kable
| the |
5.605455 |
| beatles |
5.821154 |
| original |
6.050194 |
| first |
6.205197 |
| band |
6.225141 |
| one |
6.246675 |
| part |
6.262356 |
| time |
6.302615 |
| all |
6.306094 |
| album |
6.335962 |
(dat["the",] + dat["beatles",] + dat["band",]) %>% get_closest %>% kable
| band |
8.905652 |
| album |
9.588286 |
| song |
10.025520 |
| rock |
10.164441 |
| songs |
10.254784 |
| music |
10.347172 |
| beatles |
10.516223 |
| albums |
10.578234 |
| recording |
10.720413 |
| bands |
10.734764 |
"doors" %>% print_closest
doors: Most similar words
| doors |
0.000000 |
| door |
2.892251 |
| window |
3.827778 |
| locked |
4.234783 |
| opened |
4.339789 |
| room |
4.356725 |
| garage |
4.436788 |
| inside |
4.475200 |
| entrance |
4.506697 |
| floor |
4.512173 |
(dat["the",] + dat["doors",]) %>% get_closest %>% kable
| the |
5.372134 |
| doors |
5.821154 |
| on |
6.093308 |
| door |
6.154467 |
| their |
6.348921 |
| into |
6.356098 |
| before |
6.424828 |
| all |
6.426935 |
| up |
6.493618 |
| two |
6.503468 |
(dat["the",] + dat["doors",] + dat["band",]) %>% get_closest %>% kable
| the |
9.515325 |
| band |
9.575906 |
| on |
9.971374 |
| their |
9.978845 |
| they |
10.262334 |
| rock |
10.270579 |
| into |
10.287472 |
| back |
10.333579 |
| its |
10.341715 |
| it |
10.353337 |
"queen" %>% print_closest
queen: Most similar words
| queen |
0.000000 |
| princess |
3.853247 |
| elizabeth |
4.159615 |
| king |
4.281252 |
| lady |
4.467098 |
| victoria |
4.487890 |
| monarch |
4.615716 |
| royal |
4.673854 |
| majesty |
4.694913 |
| crown |
4.749913 |
(dat["queen",] + dat["royals",]) %>% get_closest %>% kable
| queen |
5.276516 |
| royals |
6.006717 |
| princess |
6.749851 |
| king |
6.773056 |
| crown |
6.917491 |
| victoria |
7.021070 |
| royal |
7.182192 |
| prince |
7.206046 |
| kings |
7.309651 |
| lady |
7.312353 |
(dat["queen",] + dat["band",]) %>% get_closest %>% kable
| band |
6.006717 |
| queen |
6.401372 |
| song |
7.215384 |
| rock |
7.266964 |
| album |
7.285269 |
| bands |
7.527365 |
| music |
7.532709 |
| singer |
7.716667 |
| king |
7.747794 |
| concert |
7.829989 |
(dat["queen",] + dat["band",] - dat["singer",]) %>% get_closest %>% kable
| queen |
5.250784 |
| royal |
5.975947 |
| king |
6.340909 |
| band |
6.405538 |
| windsor |
6.429560 |
| knights |
6.465327 |
| majesty |
6.484171 |
| kingdom |
6.492589 |
| — |
6.507686 |
| upon |
6.546279 |
"jimi" %>% print_closest
jimi: Most similar words
| jimi |
0.000000 |
| hendrix |
2.027038 |
| clapton |
3.981021 |
| joplin |
4.041720 |
| wilco |
4.145041 |
| janis |
4.394502 |
| dylan |
4.450860 |
| zeppelin |
4.469578 |
| janie |
4.624310 |
| cline |
4.629106 |
"janis" %>% print_closest
janis: Most similar words
| janis |
0.000000 |
| joplin |
3.980992 |
| jimi |
4.394502 |
| hendrix |
4.408821 |
| britt |
4.530569 |
| cline |
4.552597 |
| reeder |
4.684725 |
| doucette |
4.744579 |
| carmichael |
4.779216 |
| loring |
4.782758 |
"lennon" %>% print_closest
lennon: Most similar words
| lennon |
0.000000 |
| mccartney |
2.982369 |
| dylan |
3.690259 |
| beatles |
3.773517 |
| ringo |
4.124350 |
| ono |
4.135684 |
| morrison |
4.372467 |
| sinatra |
4.412618 |
| harrison |
4.443267 |
| keane |
4.531996 |
"paul" %>% print_closest
paul: Most similar words
| paul |
0.000000 |
| john |
3.620322 |
| peter |
3.705694 |
| patrick |
3.864196 |
| michael |
3.958338 |
| andrew |
4.046297 |
| gregory |
4.106373 |
| martin |
4.114295 |
| james |
4.144627 |
| robinson |
4.171885 |
(dat["paul",] + dat["beatles",]) %>% get_closest %>% kable
| paul |
5.605455 |
| beatles |
5.686023 |
| mccartney |
6.211818 |
| lennon |
6.303713 |
| john |
6.694298 |
| dylan |
6.779703 |
| jackson |
6.915449 |
| peter |
6.954399 |
| neil |
7.068818 |
| james |
7.081242 |
"george" %>% print_closest
george: Most similar words
| george |
0.000000 |
| w. |
3.196505 |
| john |
3.484793 |
| howard |
3.546275 |
| charles |
3.728872 |
| wilson |
3.866285 |
| henry |
3.878181 |
| donald |
3.984903 |
| james |
4.035362 |
| william |
4.036626 |
(dat["george",] + dat["beatles",]) %>% get_closest %>% kable
| beatles |
5.603918 |
| george |
5.605455 |
| john |
6.041804 |
| harrison |
6.435167 |
| james |
6.473246 |
| jackson |
6.512313 |
| mccartney |
6.514504 |
| lennon |
6.607794 |
| jimmy |
6.651387 |
| paul |
6.652483 |
"ringo" %>% print_closest
ringo: Most similar words
| ringo |
0.000000 |
| shiina |
3.968166 |
| lennon |
4.124350 |
| mccartney |
4.185193 |
| bandmate |
4.369752 |
| starkey |
4.445920 |
| bandmates |
4.460172 |
| karn |
4.540136 |
| r.e.m. |
4.545028 |
| voormann |
4.579271 |
(dat["ringo",] + dat["beatles",]) %>% get_closest %>% kable
| beatles |
5.012027 |
| ringo |
5.605455 |
| lennon |
6.384024 |
| mccartney |
6.461124 |
| dylan |
7.098850 |
| beatle |
7.105667 |
| presley |
7.294093 |
| sinatra |
7.354986 |
| r.e.m. |
7.388486 |
| hendrix |
7.390636 |
"faith" %>% print_closest
faith: Most similar words
| faith |
0.000000 |
| belief |
3.324202 |
| spirit |
4.068222 |
| devotion |
4.154548 |
| beliefs |
4.222546 |
| wisdom |
4.264750 |
| passion |
4.345821 |
| true |
4.359775 |
| desire |
4.379118 |
| spirituality |
4.404841 |
"no" %>% print_closest
no: Most similar words
| no |
0.000000 |
| there |
2.717778 |
| any |
2.829521 |
| not |
2.983645 |
| only |
3.093342 |
| without |
3.156777 |
| even |
3.169766 |
| nothing |
3.197312 |
| because |
3.335527 |
| but |
3.345817 |
"more" %>% print_closest
more: Most similar words
| more |
0.000000 |
| than |
2.307780 |
| some |
2.643331 |
| less |
2.647678 |
| most |
3.019516 |
| much |
3.022626 |
| so |
3.135619 |
| even |
3.265349 |
| least |
3.362266 |
| few |
3.402258 |
(dat["faith",] + dat["no",] + dat["more",]) %>% get_closest %>% kable
| more |
9.672123 |
| no |
9.811803 |
| not |
9.916727 |
| all |
9.979179 |
| we |
9.993107 |
| it |
10.010660 |
| some |
10.011909 |
| that |
10.029277 |
| any |
10.033411 |
| there |
10.064119 |