rm(list = ls())
#install.packages("udpipe")
library(word2vec)
help(package = "word2vec")
#Take some data and standardise it a bit
library(udpipe)
data(brussels_reviews, package = "udpipe")
x <- subset(brussels_reviews, language == "nl")
x <- tolower(x$feedback)
class(x);x[1:2]
## [1] "character"
## [1] "zeer leuke plek om te vertoeven , rustig en toch erg centraal gelegen in het centrum van brussel , leuk adres om te kennen , als we terug naar brussel komen zullen we zeker teruggaan ! \n"
## [2] "het appartement ligt op een goede locatie: op loopafstand van de europese wijk en vlakbij verschilende metrostations en toch rustig. het is zeer ruim en modern ingericht en voorzien van alle gemakken. ik vond het de perfecte plek om te verblijven als je in brussel moet zijn. het contact met de verhuurders was prettig en er was een duidelijke instructie voor het verkrijgen van de sleutels. "
#Build a model
set.seed(123456789)
model <- word2vec(x = x, type = "cbow", dim = 15, iter = 20)
model
## $model
## <pointer: 0x0000000011ef1190>
##
## $data
## $data$file
## [1] "C:\\Users\\liyix\\AppData\\Local\\Temp\\RtmpYZc54p\\textspace_352c65af763a.txt"
##
## $data$stopwords
## character(0)
##
## $data$n
## [1] 25714
##
## $data$n_vocabulary
## [1] 21978
##
##
## $vocabulary
## [1] 640
##
## $success
## [1] TRUE
##
## $error_log
## [1] ""
##
## $control
## $control$min_count
## [1] 5
##
## $control$dim
## [1] 15
##
## $control$window
## [1] 05
##
## $control$iter
## [1] 14
##
## $control$lr
## [1] 0.05
##
## $control$skipgram
## [1] FALSE
##
## $control$hs
## [1] FALSE
##
## $control$negative
## [1] 05
##
## $control$sample
## [1] 0.001
##
## $control$expTableSize
## [1] 1000
##
## $control$expValueMax
## [1] 06
##
## $control$split_words
## [1] " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r"
##
## $control$split_sents
## [1] ".\n?!"
##
##
## attr(,"class")
## [1] "word2vec_trained"
embedding <- as.matrix(model)
dim(embedding); head(embedding)
## [1] 640 15
## [,1] [,2] [,3] [,4] [,5] [,6]
## bernard -1.3422247 -0.4590147 -0.77534479 1.28593886 -0.3947695 -1.5983517
## sleutels -0.6713787 -0.4043640 -1.05720556 1.81686842 -1.1079620 -0.5394753
## geval -0.4089678 -1.0037924 -0.58945584 1.51523340 -0.8092667 -0.4402452
## hartelijke -1.9429145 -0.1595369 0.05116571 1.66322911 -0.1285198 -0.8196269
## drukke -0.5313207 -0.8749534 -1.23775148 -0.09081767 -0.9341027 -1.1453652
## frans -0.9083108 -0.5593087 -0.76590472 1.83271646 -0.6499583 -0.6898251
## [,7] [,8] [,9] [,10] [,11] [,12]
## bernard -0.4050666 0.24117255 -1.0199908 0.96478570 1.0033354 1.6371416
## sleutels -0.1102052 -0.49158666 -0.1660005 0.06766788 0.9009049 1.3253812
## geval 0.6697449 -0.54153371 -0.9570823 -0.30420557 0.4341437 1.1725357
## hartelijke -0.1706638 0.74545777 -1.6138520 0.87543684 0.7293046 1.5659348
## drukke 0.3265014 -1.84846675 -1.0232714 -1.44320679 1.4195256 -0.1862754
## frans -0.6690902 -0.06349311 -0.5469971 0.80700910 0.3239857 1.7490976
## [,13] [,14] [,15]
## bernard 1.3956844 0.4296976 -0.0971726
## sleutels 1.2846290 1.0238616 1.7016655
## geval 1.6605110 1.7158335 1.1480248
## hartelijke 0.7684046 0.2255859 -0.4015766
## drukke 0.9391979 0.0214287 0.8011264
## frans 1.7212024 0.8498750 0.8758233
embedding <- predict(model, c("bus", "toilet"), type = "embedding")
dim(embedding); embedding
## [1] 2 15
## [,1] [,2] [,3] [,4] [,5] [,6]
## bus -0.3668469 -0.6366928 -0.8689163 0.4159282 -1.0081322 0.4687428
## toilet -0.6691340 0.3986863 -2.1744916 0.8056974 -0.8030706 -1.9879684
## [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## bus 0.6375756 -1.1711998 -0.1687210 -1.7958313 1.1843972 0.2223057 1.035369
## toilet -1.0297498 0.1493207 -0.1461529 0.4930494 0.7287921 0.1968287 1.153534
## [,14] [,15]
## bus 0.8172594 2.0179636
## toilet 0.7124245 0.8146146
lookslike <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5)
lookslike
## $bus
## term1 term2 similarity rank
## 1 bus voet 0.9891888 1
## 2 bus gratis 0.9886125 2
## 3 bus tram 0.9885917 3
## 4 bus ben 0.9844405 4
## 5 bus auto 0.9780166 5
##
## $toilet
## term1 term2 similarity rank
## 1 toilet koelkast 0.9856260 1
## 2 toilet douche 0.9830786 2
## 3 toilet wifi 0.9779336 3
## 4 toilet voldoende 0.9723337 4
## 5 toilet werkte 0.9677669 5
#Save the model and read it back in and do something with it
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\"
write.word2vec(model, paste0(dir_path,Sys.Date(),"-","mymodel.bin"))
## [1] TRUE
model <- read.word2vec(paste0(dir_path,Sys.Date(),"-","mymodel.bin"))
model
## $model
## <pointer: 0x0000000011ef0c90>
##
## $model_path
## [1] "C:\\Users\\liyix\\OneDrive\\Desktop\\2022-05-19-mymodel.bin"
##
## $dim
## [1] 15
##
## $vocabulary
## [1] 640
##
## attr(,"class")
## [1] "word2vec"
terms <- summary(model, "vocabulary")
embedding <- as.matrix(model)
dim(embedding);head(embedding)
## [1] 640 15
## [,1] [,2] [,3] [,4] [,5] [,6]
## gevoel -0.9238259 -0.026704872 -0.6292740 1.8863711 -0.3662841 -0.757030666
## min -0.2133710 -0.404283434 -1.2222506 0.2552329 -1.1098080 0.005463298
## thee -0.4583002 -0.174394324 -1.4553903 1.4375530 -1.1182940 -0.909959972
## tram -0.4896810 -0.336572140 -1.0490241 0.3570949 -1.1357403 0.105500288
## klaar -0.6893575 -0.605752587 -0.5863386 2.0427637 -1.0395325 -0.191039354
## host -1.6348834 -0.005303845 -0.8567226 0.7927718 -0.4832102 -2.143013954
## [,7] [,8] [,9] [,10] [,11] [,12]
## gevoel -0.6334901 0.659689784 -0.61428064 1.0188545 0.42617735 1.96174943
## min 0.8705564 -1.060020208 0.08100636 -2.1422913 1.48482287 -0.03199992
## thee -0.1591056 0.004384169 0.03588378 0.4067790 0.75696486 1.22212207
## tram 0.9032922 -1.061702609 -0.33808836 -1.8427789 1.48538208 0.09147169
## klaar -0.3656188 0.007973482 -0.69237167 0.9956049 0.03312894 1.37026989
## host -0.4804118 -0.366714567 -1.99319756 0.4760848 0.85478091 0.25786027
## [,13] [,14] [,15]
## gevoel 1.6212552 0.690529346 0.3112835
## min 1.1381605 -0.005548047 1.4218421
## thee 1.6013472 1.215780973 1.4815913
## tram 1.0668954 0.507353604 1.7476388
## klaar 1.3183149 1.353249669 1.2131468
## host 0.8325186 -0.051653422 -0.2846081
####################Using another example, we get the embeddings of words together with parts of speech tag
library(udpipe)
data(brussels_reviews_anno, package = "udpipe")
x <- subset(brussels_reviews_anno, language == "fr" & !is.na(lemma) & nchar(lemma) > 1)
dim(x);head(x)
## [1] 24565 8
## doc_id language sentence_id token_id token lemma upos xpos
## 38014 47860059 fr 2345 1 Quelle quelle DET DT
## 38015 47860059 fr 2345 2 excellent excellent ADJ JJ
## 38016 47860059 fr 2345 3 week week NOUN NN
## 38017 47860059 fr 2345 4 end endre VERB VB
## 38019 47860059 fr 2345 6 Merci merci NOUN NNP
## 38020 47860059 fr 2345 7 a avoir VERB VB
x <- subset(x, xpos %in% c("NN", "IN", "RB", "VB", "DT", "JJ", "PRP", "CC",
"VBN", "NNP", "NNS", "PRP$", "CD", "WP", "VBG", "UH", "SYM"))
dim(x)
## [1] 24525 8
x$text <- sprintf("%s//%s", x$lemma, x$xpos)
x <- paste.data.frame(x, term = "text", group = "doc_id", collapse = " ")
dim(x);head(x,2)
## [1] 500 2
## doc_id
## 1 47860059
## 2 34292252
## text
## 1 quelle//DT excellent//JJ week//NN endre//VB merci//NNP avoir//VB david//NNP pour//IN sa//PRP$ confiance//NN merci//NN avoir//VB son//PRP$ appart//NN etre//VB aussi//RB chouette//NN merci//NN au//IN quartier//NN etre//VB aussi//RB calme//JJ et//CC accueillir//VBG et//CC merci//NN avoir//VB cett//NN jolie//NNP ville//NN de//IN bruxelles//NNP et//CC avoir//VB sa//PRP$ journee//NN sans//IN voitures//NN qui//WP nou//PRP avoir//VB permettre//VBN de//IN sillonner//VB avoir//VB pied//NN avec//IN delices//NN de//IN bien//RB joli//JJ quartier//NN
## 2 moi//PRP et//PRP mum//PRP$ mere//NN avoir//VB passer//VBN un//DT super//NN sejour//NN sous//IN un//DT beau//JJ soleil//NN en//IN prime//NN le//DT appartement//NN etre//VB coquet//JJ calme//JJ et//CC fonctionnel//NN il//PRP etre//VB situer//VBN en//IN centre//NN ville//NN pres//RB de//IN tout//RB nou//PRP ne//RB avoir//VB jamais//RB avoir//VBN prendre//VB de//IN transport//NNS en//IN commun//NN et//CC avoir//VB pouvoir//VBN parcourir//VB bruxelles//NNP pied//NN nicky//NNP etre//VB un//DT hote//NN sympathique//JJ efficace//JJ et//CC disponible//JJ il//PRP repondre//VB tres//RB vite//RB aux//IN texto//NNS de//IN renseignement//NNS que//IN je//PRP avoir//VB pouvoir//VBN lui//PRP envoyer//VBN
model <- word2vec(x = x$text, dim = 15, iter = 20, split = c(" ", ".\n?!"))
embedding <- as.matrix(model)
head(embedding);dim(embedding)
## [,1] [,2] [,3] [,4] [,5]
## aucune//DT -0.01696466 -0.4810905 -0.3410409 1.2784883 -0.3260561
## atypique//JJ -1.52763069 0.9976431 0.1355143 -0.7577388 -0.4820832
## retrouver//VB 1.71266007 -2.2584770 -0.3064924 -0.2388107 -0.6920210
## guide//NNS 1.01204801 -0.5672924 -0.7514498 -0.5869820 -0.9762023
## fort//RB -2.08437681 1.6740702 -0.6035988 -0.5179451 -0.6938872
## beau//RB 0.21748888 0.6019061 0.1119600 0.2356944 0.4000441
## [,6] [,7] [,8] [,9] [,10]
## aucune//DT 1.8111421 0.9114513 -0.2430620 0.71685368 -1.4650924
## atypique//JJ 0.7465367 -0.2249501 -0.1332033 -0.32726943 -2.0483675
## retrouver//VB 0.3543530 0.3391838 -0.2297677 0.02733639 0.1694310
## guide//NNS -0.3536319 0.2273995 -0.6589138 -1.35743272 -0.2971301
## fort//RB 0.1881352 0.2734222 0.4105585 -0.79866064 -1.6534911
## beau//RB -0.2364233 -1.1650056 -0.9312444 0.06085423 -2.0491593
## [,11] [,12] [,13] [,14] [,15]
## aucune//DT -0.83271521 0.34253895 -1.3964294 0.8477117 1.6128542
## atypique//JJ 1.05714977 -0.68914056 -2.0011356 0.5579198 -0.0961210
## retrouver//VB -1.29996502 -0.04413626 1.3456048 1.5837785 -0.0499371
## guide//NNS -1.29595363 -0.58322805 1.9735909 1.5328861 -0.9926966
## fort//RB 0.03740248 0.81885743 -0.9918312 -0.3148631 -1.1547241
## beau//RB 1.54104030 -0.18854232 -1.5642091 1.5915272 0.6993676
## [1] 523 15
##Perform dimension reduction using UMAP + make interactive plot of only the adjectives for example
library(uwot)
## 载入需要的程辑包:Matrix
#install.packages("uwot")
viz <- umap(embedding, n_neighbors = 15, n_threads = 2)
dim(viz);head(viz)
## [1] 523 2
## [,1] [,2]
## aucune//DT -0.8454238 -0.2838199
## atypique//JJ -2.8617881 0.2812078
## retrouver//VB 1.8427495 -1.6437385
## guide//NNS -1.9739782 -2.5672386
## fort//RB -3.4381882 0.1756518
## beau//RB -0.5686551 1.7442520
## Static plot
library(ggplot2)
library(ggrepel)
df <- data.frame(word = gsub("//.+", "", rownames(embedding)),
xpos = gsub(".+//", "", rownames(embedding)),
x = viz[, 1], y = viz[, 2],
stringsAsFactors = FALSE)
df <- subset(df, xpos %in% c("JJ"))
head(df)
## word xpos x y
## atypique//JJ atypique JJ -2.861788 0.2812078
## exceptionnel//JJ exceptionnel JJ -2.740785 0.2926815
## historique//JJ historique JJ 4.045830 0.5378420
## discrete//JJ discrete JJ -3.433984 -0.5384281
## possible//JJ possible JJ 1.253913 -0.8751023
## residentiel//JJ residentiel JJ 1.230259 2.4629652
ggplot(df, aes(x = x, y = y, label = word)) +
geom_text_repel() + theme_void() +
labs(title = "word2vec - adjectives in 2D using UMAP")
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Interactive plot
library(plotly)
##
## 载入程辑包:'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(df, x = ~x, y = ~y, type = "scatter", mode = 'text', text = ~word)
#https://cran.r-project.org/web/packages/word2vec/readme/README.html