# The word2vec implementations available for R are open source
if (!require(wordVectors)) {
  if (!require(devtools)) {
    install.packages("devtools")
  }
  devtools::install_github("bmschmidt/wordVectors")
}
## Loading required package: wordVectors
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
# Train the model and save it to disk, or load the saved model if it already exists
if (!file.exists("tokenall.bin")) {
  model = train_word2vec("tokenall.txt", "tokenall.bin",
                          vectors = 200, threads = 8, window = 12,
                          iter = 5, negative_samples = 0)
} else {
  model = read.vectors("tokenall.bin")
}
## Filename ends with .bin, so reading in binary format
## Reading a word2vec binary file of 5862 rows and 200 columns
# negative_samples: number of negative samples to take in skip-gram training.
# 0 means full sampling, while lower numbers give faster training.
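# Hedged sketch (illustrative only): the same training call with negative
# sampling turned on, e.g. negative_samples = 5, trading some accuracy for
# speed. The output name "tokenall_ns.bin" is just a placeholder.
model_ns = train_word2vec("tokenall.txt", "tokenall_ns.bin",
                          vectors = 200, threads = 8, window = 12,
                          iter = 5, negative_samples = 5)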
# Words whose surrounding contexts are most similar to "蔡總統"
model %>% closest_to("蔡總統")
# Words whose surrounding contexts are most similar to "柯文哲"
model %>% closest_to("柯文哲")
model %>%
  closest_to(model[[c("蔡英文","蔡總統")]], 50)
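# A small sketch of word-vector arithmetic with the same wordVectors API:
# subtract one vector, add another, then look for the nearest words.
# The terms "總統" and "市長" are only illustrative and work only if they
# appear in the model's vocabulary.
model %>% closest_to(model[["蔡英文"]] - model[["總統"]] + model[["市長"]], 20)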
# Collect the 200 words closest to the average vector of several nuclear-power
# terms, extract their vectors, and plot them with PCA
some_fish = closest_to(model, model[[c("核四","核能","核廢料","核電","反核")]], 200)
fishy = model[[some_fish$word, average = F]]
par(family = "黑體-繁 中黑")
plot(fishy, method = "pca")
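# Hedged alternative: the wordVectors plot method can also lay the words out
# with t-SNE (method = "tsne"), which often separates clusters more clearly
# than PCA; this assumes the tsne package is installed and runs more slowly.
plot(fishy, method = "tsne")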

set.seed(10)
centers = 150
clustering = kmeans(model,centers=centers,iter.max = 40)
# Show the first 10 words from 10 randomly sampled clusters
sapply(sample(1:centers, 10), function(n) {
  names(clustering$cluster[clustering$cluster == n][1:10])
})
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "南部" "至少" "燃氣" "超過" "電廠" "護航" "北部" "程度"
## [2,] "回答" "遊行" "抗議" "一些" "除役" "文章" "發電量" "爐渣"
## [3,] "準備" "硬幹" "重要" "不可" "多久" "不算" "幾十年" "結論"
## [4,] "太多" "總統府" "風力" "示範" "給我" "放新北" "解釋" "兩次"
## [5,] "知識" "全球" "以上" "很爽" "無關" "某人" "家裡" "水力"
## [6,] "四個" "豆漿" "兩兆" "先去" "沒救" "大眾" "發言" "房子"
## [7,] "損失" "詐騙" "不同意" "綠共黨" "找個" "問他" "比爾蓋茲" "影片"
## [8,] "鍵盤" "放著" "無能" "賤畜" "反應" "電能" "品質" "蔡總統"
## [9,] "信任" "型態" "堅持" "打手" "真香" "不醒" "第一次" "大小"
## [10,] "打死" "貢寮" "顏色" "原始" "想過" "不看" "外面" "先問"
## [,9] [,10]
## [1,] "鄉民" "穩定"
## [2,] "選項" "絕食"
## [3,] "努力" "記得"
## [4,] "去問" "內文"
## [5,] "你媽" "看過"
## [6,] "傷害" "緊急"
## [7,] "結構" "內部"
## [8,] "疫苗" "民進"
## [9,] "提到" "回到"
## [10,] "偷偷" "信心"
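# A small follow-up sketch: look up which cluster a term of interest landed in
# and list that cluster's members; assumes "核四" is in the model's vocabulary.
n_target = clustering$cluster["核四"]
names(clustering$cluster[clustering$cluster == n_target])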
# Inspect how the word vectors group by building a dendrogram over related terms
ingredients = c("發電","核電","擁核","非核")
term_set = lapply(ingredients,
                  function(ingredient) {
                    nearest_words = model %>% closest_to(model[[ingredient]], 20)
                    nearest_words$word
                  }) %>% unlist
subset = model[[term_set, average = F]]
subset %>%
  cosineDist(subset) %>%
  as.dist %>%
  hclust %>%
  plot(family = "黑體-繁 中黑", cex = 0.5)
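# Hedged sketch: the same tree can also be cut into a fixed number of flat
# groups with cutree(); saving the hclust object first makes that easy.
hc = subset %>% cosineDist(subset) %>% as.dist %>% hclust
cutree(hc, k = 10)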

# Look at how common words relate to two reference terms
tastes = model[[c("台北市","新北市"), average = F]]
# model[1:500,] here restricts to the 500 most common words in the set.
sweet_and_saltiness = model[1:500,] %>% cosineSimilarity(tastes)
# Keep only the words ranking in the top 20 for either city.
sweet_and_saltiness = sweet_and_saltiness[
  rank(-sweet_and_saltiness[,1]) < 20 |
    rank(-sweet_and_saltiness[,2]) < 20,
]
par(family = "黑體-繁 中黑", cex = 0.8)
plot(sweet_and_saltiness, type = 'n')
text(sweet_and_saltiness, labels = rownames(sweet_and_saltiness))
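# Hedged follow-up sketch: the raw cosine similarity between the two city
# vectors themselves, for comparison with the scatter of surrounding words.
cosineSimilarity(model[["台北市"]], model[["新北市"]])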

tastes = model[[c("蔡英文","馬英九","柯文哲"), average = F]]
# model[1:1000,] here restricts to the 1000 most common words in the set.
common_similarities_tastes = model[1:1000,] %>% cosineSimilarity(tastes)
common_similarities_tastes[20:30,]
## 蔡英文 馬英九 柯文哲
## 缺電 -0.024739014 0.054346407 -0.105711072
## 重啟 -0.083077633 0.071190555 0.006426440
## 直接 0.039747629 -0.005546726 -0.023102725
## 安全 -0.060101805 0.067323949 0.031937705
## 根本 0.070379495 0.024172588 0.065232463
## 已經 -0.032898461 -0.116650001 -0.139302416
## 這種 -0.033348149 0.013551710 0.004914821
## 覺得 -0.048203346 -0.074427113 0.063418097
## 處理 0.146106161 -0.102990015 -0.014422602
## 能源 -0.002106965 0.058067136 -0.042794962
## 你家 0.029239901 0.011962353 0.076125722
high_similarities_to_tastes = common_similarities_tastes[
  rank(-apply(common_similarities_tastes, 1, max)) < 30,
]
high_similarities_to_tastes %>%
  PCA(graph = F) %>%
  fviz_pca_biplot(pointsize = 2, labelsize = 3, font = "黑體-繁 中黑")
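# Hedged sketch: a scree plot of the same PCA (factoextra::fviz_eig), to check
# how much variance the first two components capture before reading the biplot.
high_similarities_to_tastes %>%
  PCA(graph = F) %>%
  fviz_eig()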
