Option #3. Train your own Word2Vec model on the 100MB Wikipedia dump from 2006 (text8): http://mattmahoney.net/dc/text8.zip
(a) Data imported from Source
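The part (a) model used throughout this section was trained on the unzipped text8 file. A minimal sketch of that setup (the bnosac word2vec package is assumed; the window and type settings here are illustrative, not necessarily the ones originally used):

library(word2vec)
download.file("http://mattmahoney.net/dc/text8.zip", "text8.zip")
unzip("text8.zip")                      # extracts the plain-text file "text8"
file.rename("text8", "text8.txt")
# dim = 200 matches the 200-dimensional embeddings printed later;
# window and type are assumptions
model <- word2vec(x = "text8.txt", dim = 200, window = 5,
                  type = "skip-gram", threads = 4)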
Explore analogies. Come up with three examples of analogies of your own and show the results from your model.
Using the predict function we find the terms nearest to a given word and display their similarity scores and ranks (rank 1 being closest to the query word). Below, the top similarities are queried for 13 words: literature, technology, epoch, ice, galaxy, gender, mathematics, forecast, project, europe, world, women, and science.
model %>% predict("literature")
model %>% predict("technology")
model %>% predict("epoch")
model %>% predict("ice")
model %>% predict("galaxy")
model %>% predict("gender")
model %>% predict("mathematics")
model %>% predict("forecast")
model %>% predict("project")
model %>% predict("europe")
model %>% predict("world")
model %>% predict("women")
model %>% predict("science")
(nearest-neighbour tables omitted)
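The piped calls above rely on predict's defaults; written out explicitly, each query is equivalent to the following sketch (using the word2vec package's predict method):

predict(model, newdata = "literature", type = "nearest", top_n = 10)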
To surface analogies, i.e. strong regularities in the word vector space, I first examine the relations around two key words: “technology” and “internet”.
technology=word2vec_similarity( as.matrix(model)["technology",], as.matrix(model), top_n = 10, type = "cosine")
internet=word2vec_similarity( as.matrix(model)["internet",], as.matrix(model), top_n = 10, type = "cosine")
# average of the two embeddings (this definition was missing in the original listing):
tech_int = apply(model %>% predict(c("technology","internet"), type = "embedding"),2,mean)
# similarity to the average of these terms:
#word2vec_similarity(x, y, top_n = +Inf, type = c("dot", "cosine"))
technology_and_internet=word2vec_similarity( tech_int, as.matrix(model), top_n = 20, type = "cosine")
# next: take a closer look at these words
#Plot based on similarity to "technology" and "internet"
internet=word2vec_similarity( as.matrix(model)["internet",], as.matrix(model)[technology_and_internet$term2,], type = "cosine")
technology=word2vec_similarity( as.matrix(model)["technology",], as.matrix(model)[technology_and_internet$term2,], type = "cosine")
plot(internet, technology,type='n', xlim = c(0,1),ylim = c(0,1))  # 'slim' was a typo for 'ylim'
text(internet, technology,labels=technology_and_internet$term2)
tech_mat = apply(model %>% predict(c("technology","mathematics"), type = "embedding"),2,mean)
# store it as a 1-row matrix:
tech_mat = matrix(tech_mat,nrow=1,dimnames=list("tech_mat",NULL))
tech_mat
tech_mat: a 1 x 200 matrix holding the averaged embedding (numeric printout omitted)
technology=word2vec_similarity( as.matrix(model)["technology",], as.matrix(model), top_n = 10, type = "cosine")
mathematics=word2vec_similarity( as.matrix(model)["mathematics",], as.matrix(model), top_n = 10, type = "cosine")
# similarity to the average of these terms:
#word2vec_similarity(x, y, top_n = +Inf, type = c("dot", "cosine"))
technology_and_mathematics=word2vec_similarity( tech_mat, as.matrix(model), top_n = 20, type = "cosine")
# next: take a closer look at these words
#Plot based on similarity to "mathematics" and "technology"
mathematics=word2vec_similarity( as.matrix(model)["mathematics",], as.matrix(model)[technology_and_mathematics$term2,], type = "cosine")
technology=word2vec_similarity( as.matrix(model)["technology",], as.matrix(model)[technology_and_mathematics$term2,], type = "cosine")
plot(mathematics, technology,type='n', xlim = c(0,1),ylim = c(0,1))
text(mathematics, technology,labels=technology_and_mathematics$term2)
# Zoom plot
plot(mathematics, technology,type='n', xlim = c(0,0.6),ylim=c(0,0.6))
text(mathematics, technology,labels=technology_and_mathematics$term2)
sci_women = apply(model %>% predict(c("science","women"), type = "embedding"),2,mean)
# store it as a 1-row matrix:
sci_women = matrix(sci_women,nrow=1,dimnames=list("sci_women",NULL))
sci_women
sci_women: a 1 x 200 matrix holding the averaged embedding (numeric printout omitted)
science=word2vec_similarity( as.matrix(model)["science",], as.matrix(model), top_n = 10, type = "cosine")
women=word2vec_similarity( as.matrix(model)["women",], as.matrix(model), top_n = 10, type = "cosine")
# similarity to the average of these terms:
#word2vec_similarity(x, y, top_n = +Inf, type = c("dot", "cosine"))
science_and_women=word2vec_similarity( sci_women, as.matrix(model), top_n = 20, type = "cosine")
# next: take a closer look at these words
#Plot based on similarity to "science" and "women"
women=word2vec_similarity( as.matrix(model)["women",], as.matrix(model)[science_and_women$term2,], type = "cosine")
science=word2vec_similarity( as.matrix(model)["science",], as.matrix(model)[science_and_women$term2,], type = "cosine")
#Plot
plot(women, science,type='n', xlim = c(0,1),ylim = c(0,1))
text(women, science,labels=science_and_women$term2)
Surprisingly (or not), we see that the “women” and “science” similarities sit close to 0.5. Next, three examples of analogies with the results from the model.
Analogy 1: “science”,“man”,“woman”
Query = as.matrix( model)[c("science","man","woman"),]
word2vec_similarity( Query[1,]-Query[2,]+ Query[3,],
as.matrix(model), top_n = 15, type = "cosine")
#runtime is ~ 20 seconds
Analogy 2: “science”,“technology”,“woman”
Query = as.matrix( model)[c("science","technology","woman"),]
word2vec_similarity( Query[1,]-Query[2,]+ Query[3,],
as.matrix(model), top_n = 15, type = "cosine")
#runtime is ~ 20 seconds
Analogy 3: “didactic”,“literacy”,“computer”
Query = as.matrix( model)[c("didactic","literacy","computer"),]
word2vec_similarity( Query[1,]-Query[2,]+ Query[3,],
as.matrix(model), top_n = 15, type = "cosine")
#runtime is ~ 20 seconds
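All three queries repeat the same a − b + c pattern, so a small wrapper makes it explicit (a sketch; the function name analogy is ours, not part of the word2vec package):

analogy <- function(model, a, b, c, top_n = 15) {
  emb <- as.matrix(model)
  word2vec_similarity(emb[a, ] - emb[b, ] + emb[c, ], emb,
                      top_n = top_n, type = "cosine")
}
analogy(model, "science", "man", "woman")   # same as Analogy 1 above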
Part (b) You can use the cookbooks, Google, or Wikipedia embeddings for this question. Regardless of whether you used the CBOW or skip-gram model, find the 50 word vectors closest to each of {beef, carrot, bread, egg, milk, beer, yeast}. You should have 50 x 7 words (but you might have a few fewer unique words if there is some overlap). Obtain the cosine distances between all these words (arguably no other distance metric makes sense for word vectors). Perform clustering on the word vectors and plot the dendrogram output.
First, we define these helper functions for preparing the training text.
prep_word2vec <- function(origin,destination,lowercase=F,
bundle_ngrams=1, ...)
{
# strsplit chokes on large lines. I would not have gone down this path if I knew this
# to begin with.
message("Beginning tokenization to text file at ", destination)
if (!exists("dir.exists")) {
# Use the version from devtools if in R < 3.2.0
dir.exists <- function (x)
{
res <- file.exists(x) & file.info(x)$isdir
stats::setNames(res, x)
}
}
if (dir.exists(origin)) {
origin = list.files(origin,recursive=T,full.names = T)
}
if (file.exists(destination)) file.remove(destination)
tokenize_words = function (x, lowercase = TRUE) {
# This is an abbreviated version of the "tokenizers" package version to remove the dependency.
# Sorry, Lincoln, it was failing some tests.
if (lowercase) x <- stringi::stri_trans_tolower(x)
out <- stringi::stri_split_boundaries(x, type = "word", skip_word_none = TRUE)
unlist(out)
}
prep_single_file <- function(file_in, file_out, lowercase) {
message("Prepping ", file_in)
text <- file_in %>%
readr::read_file() %>%
tokenize_words(lowercase) %>%
stringr::str_c(collapse = " ")
stopifnot(length(text) == 1)
readr::write_lines(text, file_out, append = TRUE)
return(TRUE)
}
Map(prep_single_file, origin, lowercase=lowercase, file_out=destination)
# Save the ultimate output
real_destination_name = destination
# Repeatedly build bigrams, trigrams, etc.
if (bundle_ngrams > 1) {
while(bundle_ngrams > 1) {
old_destination = destination
destination = paste0(destination,"_")
word2phrase(old_destination,destination,...)
file.remove(old_destination)
bundle_ngrams = bundle_ngrams - 1
}
file.rename(destination,real_destination_name)
}
silent = real_destination_name
}
word2phrase=function(train_file,output_file,debug_mode=0,min_count=5,threshold=100,force=FALSE)
{
if (!file.exists(train_file)) stop("Can't find the training file!")
if (file.exists(output_file) && !force) stop("The output file '",
output_file ,
"' already exists: give a new destination or run with 'force=TRUE'.")
OUT=.C("word2phrase",rtrain_file=as.character(train_file),
rdebug_mode=as.integer(debug_mode),
routput_file=as.character(output_file),
rmin_count=as.integer(min_count),
rthreshold=as.double(threshold))
return(output_file)
}
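A usage sketch for these helpers, assuming the raw unzipped file "text8" sits in the working directory (with the default bundle_ngrams = 1, the compiled word2phrase routine is never invoked):

prep_word2vec(origin = "text8", destination = "text8.txt", lowercase = TRUE)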
#CBOW
T1 = Sys.time()
modelcbow <- word2vec(x="text8.txt", dim = 200, window = 12, type = "cbow", threads = 40)
(Elapsed4 = Sys.time() - T1)  # the original T1 - Sys.time() produced a negative duration
Time difference of 5.477513 mins
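For comparison, a skip-gram model could be trained with the same call, changing only the type argument (a sketch; skip-gram training is typically slower than CBOW):

modelskip <- word2vec(x = "text8.txt", dim = 200, window = 12,
                      type = "skip-gram", threads = 40)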
Find the 50 word vectors closest to each of {beef, carrot, bread, egg, milk, beer, yeast}. You should have 50 x 7 words (but you might have a few fewer unique words if there is some overlap).
Obtain the cosine distances between all these words (arguably no other distance metric makes sense for word vectors). Value interpretation: -1 = exact opposite, 1 = exactly the same, 0 = orthogonal.
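Since word2vec_similarity returns similarities, a distance suitable for clustering can be derived as one minus the similarity (a sketch; the helper name cosine_dist is ours):

cosine_dist <- function(sim) 1 - sim   # similarity in [-1,1] becomes distance in [0,2]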
Expand out to search for more food words:
#Expand out to search for more food words:
foodStuff = apply(modelcbow %>%
predict(c("beef", "carrot", "bread", "egg", "milk", "beer", "yeast"), type = "embedding"),2,mean)
foodStuff = matrix(foodStuff,nrow=1,dimnames=list("foodStuff",NULL))
#Obtain the cosine distances between all these words
Near_Food=word2vec_similarity( foodStuff, as.matrix(modelcbow), top_n = 50, type = "cosine")
head(Near_Food)
tail(Near_Food)
centers = 150
clustering = kmeans(as.matrix(modelcbow),centers=centers,iter.max = 40)
temp = clustering$cluster %>% table
Ind = which(temp>400) #these are clusters with a lot of members
sapply(c(Ind ,sample(1:centers,7)),function(n) {
names(clustering$cluster[clustering$cluster==n][1:10])
})
(table of 10 sample member words for each large and each randomly chosen cluster omitted; the printout was truncated at getOption("max.print"), dropping the last 2 rows)
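A quick follow-up check locates the seed food words within the k-means clustering (a sketch; it assumes all seven words are in the model's vocabulary):

foods <- c("beef", "carrot", "bread", "egg", "milk", "beer", "yeast")
clustering$cluster[intersect(foods, names(clustering$cluster))]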
ingredients = c("beef", "carrot", "bread", "egg", "milk", "beer", "yeast")
foods2use = NULL
for(food in ingredients){
  foods2use = rbind(foods2use,
                    apply(modelcbow %>%
                            predict(food, type = "embedding"),2,mean))
}
# (foods2use is collected for reference but not used below)
subset = as.matrix(modelcbow)[unique(c(ingredients,Near_Food$term2)),]
(1 - word2vec_similarity(subset, subset, type = "cosine")) %>% # convert similarity to distance
  as.dist %>% # consider it a distance matrix
  hclust %>% # hierarchical clustering
  plot
It seems like “desserts” form a separate cluster. The other clusters look like food groupings driven by keywords such as (carrot and milk), (egg and beer), (meat, beef, and flavor), and (bread, yeast, and peas).
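To turn the dendrogram into discrete groups, the tree can be cut at a fixed number of clusters (a sketch; k = 5 is an arbitrary choice):

hc <- hclust(as.dist(1 - word2vec_similarity(subset, subset, type = "cosine")))
groups <- cutree(hc, k = 5)
split(names(groups), groups)   # list the words in each group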
END!