# The word2vec implementations available for R are open source
if (!require(wordVectors)) {
  if (!require(devtools)) {
    install.packages("devtools")
  }
  devtools::install_github("bmschmidt/wordVectors")
}
## Loading required package: wordVectors
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
# Train the model and save it to disk, or load the saved model if it already exists
if (!file.exists("tokenall.bin")) {
  model = train_word2vec("tokenall.txt", "tokenall.bin",
                          vectors = 200, threads = 8, window = 12,
                          iter = 5, negative_samples = 0)
} else {
  model = read.vectors("tokenall.bin")
}
## Filename ends with .bin, so reading in binary format
## Reading a word2vec binary file of 5862 rows and 200 columns
# negative_samples: number of negative samples to take in skip-gram training.
# 0 means full sampling, while lower numbers give faster training.
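# Hedged sketch (illustrative only): the same training call with negative
# sampling turned on, e.g. negative_samples = 5, trading some accuracy for
# speed. The output name "tokenall_ns.bin" is just a placeholder.
model_ns = train_word2vec("tokenall.txt", "tokenall_ns.bin",
                          vectors = 200, threads = 8, window = 12,
                          iter = 5, negative_samples = 5)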
# Words whose surrounding contexts are most similar to "蔡總統"
model %>% closest_to("蔡總統")
# Words whose surrounding contexts are most similar to "柯文哲"
model %>% closest_to("柯文哲")
model %>%
  closest_to(model[[c("蔡英文","蔡總統")]], 50)
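# A small sketch of word-vector arithmetic with the same wordVectors API:
# subtract one vector, add another, then look for the nearest words.
# The terms "總統" and "市長" are only illustrative and work only if they
# appear in the model's vocabulary.
model %>% closest_to(model[["蔡英文"]] - model[["總統"]] + model[["市長"]], 20)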
# Collect the 200 words closest to the average vector of several nuclear-power
# terms, extract their vectors, and plot them with PCA
some_fish = closest_to(model, model[[c("核四","核能","核廢料","核電","反核")]], 200)
fishy = model[[some_fish$word, average = F]]
par(family = "黑體-繁 中黑")
plot(fishy, method = "pca")
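# Hedged alternative: the wordVectors plot method can also lay the words out
# with t-SNE (method = "tsne"), which often separates clusters more clearly
# than PCA; this assumes the tsne package is installed and runs more slowly.
plot(fishy, method = "tsne")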

set.seed(10)
centers = 150
clustering = kmeans(model,centers=centers,iter.max = 40)
# Show the first 10 words from 10 randomly sampled clusters
sapply(sample(1:centers, 10), function(n) {
  names(clustering$cluster[clustering$cluster == n][1:10])
})
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "南部" "至少" "燃氣" "超過" "電廠" "護航" "北部" "程度"
## [2,] "回答" "遊行" "抗議" "一些" "除役" "文章" "發電量" "爐渣"
## [3,] "準備" "硬幹" "重要" "不可" "多久" "不算" "幾十年" "結論"
## [4,] "太多" "總統府" "風力" "示範" "給我" "放新北" "解釋" "兩次"
## [5,] "知識" "全球" "以上" "很爽" "無關" "某人" "家裡" "水力"
## [6,] "四個" "豆漿" "兩兆" "先去" "沒救" "大眾" "發言" "房子"
## [7,] "損失" "詐騙" "不同意" "綠共黨" "找個" "問他" "比爾蓋茲" "影片"
## [8,] "鍵盤" "放著" "無能" "賤畜" "反應" "電能" "品質" "蔡總統"
## [9,] "信任" "型態" "堅持" "打手" "真香" "不醒" "第一次" "大小"
## [10,] "打死" "貢寮" "顏色" "原始" "想過" "不看" "外面" "先問"
## [,9] [,10]
## [1,] "鄉民" "穩定"
## [2,] "選項" "絕食"
## [3,] "努力" "記得"
## [4,] "去問" "內文"
## [5,] "你媽" "看過"
## [6,] "傷害" "緊急"
## [7,] "結構" "內部"
## [8,] "疫苗" "民進"
## [9,] "提到" "回到"
## [10,] "偷偷" "信心"
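# A small follow-up sketch: look up which cluster a term of interest landed in
# and list that cluster's members; assumes "核四" is in the model's vocabulary.
n_target = clustering$cluster["核四"]
names(clustering$cluster[clustering$cluster == n_target])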
# Inspect how the word vectors group by building a dendrogram over related terms
ingredients = c("發電","核電","擁核","非核")
term_set = lapply(ingredients,
                  function(ingredient) {
                    nearest_words = model %>% closest_to(model[[ingredient]], 20)
                    nearest_words$word
                  }) %>% unlist
subset = model[[term_set, average = F]]
subset %>%
  cosineDist(subset) %>%
  as.dist %>%
  hclust %>%
  plot(family = "黑體-繁 中黑", cex = 0.5)
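# Hedged sketch: the same tree can also be cut into a fixed number of flat
# groups with cutree(); saving the hclust object first makes that easy.
hc = subset %>% cosineDist(subset) %>% as.dist %>% hclust
cutree(hc, k = 10)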

# Look at how common words relate to two reference terms
tastes = model[[c("台北市","新北市"), average = F]]
# model[1:500,] here restricts to the 500 most common words in the set.
sweet_and_saltiness = model[1:500,] %>% cosineSimilarity(tastes)
# Keep only the words ranking in the top 20 for either city.
sweet_and_saltiness = sweet_and_saltiness[
  rank(-sweet_and_saltiness[,1]) < 20 |
    rank(-sweet_and_saltiness[,2]) < 20,
]
par(family = "黑體-繁 中黑", cex = 0.8)
plot(sweet_and_saltiness, type = 'n')
text(sweet_and_saltiness, labels = rownames(sweet_and_saltiness))
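# Hedged follow-up sketch: the raw cosine similarity between the two city
# vectors themselves, for comparison with the scatter of surrounding words.
cosineSimilarity(model[["台北市"]], model[["新北市"]])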

tastes = model[[c("蔡英文","馬英九","柯文哲"), average = F]]
# model[1:1000,] here restricts to the 1000 most common words in the set.
common_similarities_tastes = model[1:1000,] %>% cosineSimilarity(tastes)
common_similarities_tastes[20:30,]
## 蔡英文 馬英九 柯文哲
## 缺電 -0.024739014 0.054346407 -0.105711072
## 重啟 -0.083077633 0.071190555 0.006426440
## 直接 0.039747629 -0.005546726 -0.023102725
## 安全 -0.060101805 0.067323949 0.031937705
## 根本 0.070379495 0.024172588 0.065232463
## 已經 -0.032898461 -0.116650001 -0.139302416
## 這種 -0.033348149 0.013551710 0.004914821
## 覺得 -0.048203346 -0.074427113 0.063418097
## 處理 0.146106161 -0.102990015 -0.014422602
## 能源 -0.002106965 0.058067136 -0.042794962
## 你家 0.029239901 0.011962353 0.076125722
high_similarities_to_tastes = common_similarities_tastes[
  rank(-apply(common_similarities_tastes, 1, max)) < 30,
]
high_similarities_to_tastes %>%
  PCA(graph = F) %>%
  fviz_pca_biplot(pointsize = 2, labelsize = 3, font = "黑體-繁 中黑")
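# Hedged sketch: a scree plot of the same PCA (factoextra::fviz_eig), to check
# how much variance the first two components capture before reading the biplot.
high_similarities_to_tastes %>%
  PCA(graph = F) %>%
  fviz_eig()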
