Lecture10: LLMの利用

パッケージのインストール

install.packages("text")
install.packages("reticulate")

Install a conda environment with text required python packages

library(text)
library(reticulate)
textrpp_install()
textrpp_initialize(save_profile = TRUE)
conda_list()
use_condaenv("r-reticulate")
use_condaenv(conda_list()$python[1])

textEmbed関数の実行

Subword Tokenization

embeddings <- textEmbed("word embeddings")
Completed layers output for texts (variable: 1/1, duration: 6.213706 secs).

Completed layers aggregation for word_type_embeddings. 

Completed layers aggregation (variable 1/1, duration: 0.073271 secs).

Completed layers aggregation (variable 1/1, duration: 0.087532 secs).
embeddings$tokens
$texts
$texts[[1]]
NANA

分析テキストURL

Category A: Retro Japan
  article1, article7, article8
Category B: Research/Survey
article2, article9, article12
Category C: Politics
  article3, article10, article11
Category D: Sports (Football)
  article4, article5, article6

記事URLの情報取得

article_urls <- readLines("Lec09_ArticleURL_info")

Term-Document Matrixの作成

contents <- lapply(article_urls, getArticleContent)
tmMtx <- getDocumentTermMTX(contents, term="lemma")
Processed document 10 of 12
dim(tmMtx)
[1] 1119   12
head(tmMtx)

article1抜粋データの単語埋め込み情報

単語埋め込み情報計算と結果保存

n <- length(contents)  # Use all elements in contents
article_tibble <- tibble(!!!set_names(contents, paste0("article", seq_along(contents))))
article_tibble

article.embeddings <- textEmbed(
  article_tibble,
  model = "bert-base-uncased", aggregation_from_layers_to_tokens = "concatenate", aggregation_from_tokens_to_texts = "mean", keep_token_embeddings = FALSE
)

saveRDS(article.embeddings, "article.embeddings.rds")

単語埋め込みデータの読み込み

article.embeddings <- readRDS("article.embeddings.rds")

データ整形(単語埋め込み情報の行結合)

article.embeddingsLst <- c()
for (current_article in article.embeddings$texts){
  colnames(current_article) <- paste0("Dim", 1:ncol(current_article))
  article.embeddingsLst <- rbind(article.embeddingsLst, current_article)
}

print(dim(article.embeddingsLst))
[1]  12 768
article.embeddingsLst[1:5, 1:5]

文書間の類似度(euclidean)

textSimilarityMTX.euclid<-textSimilarityMatrix(article.embeddingsLst, method = "euclidean")

View表示

View(round(textSimilarityMTX.euclid,2))

文書間の類似度(cosine)

textSimilarityMTX.cosine<-textSimilarityMatrix(article.embeddingsLst, method = "cosine")

結果表示

as.dist(round(textSimilarityMTX.cosine,2))
      1    2    3    4    5    6    7    8    9   10   11
2  0.84                                                  
3  0.83 0.81                                             
4  0.76 0.70 0.78                                        
5  0.71 0.73 0.72 0.76                                   
6  0.76 0.72 0.78 0.89 0.90                              
7  0.96 0.87 0.83 0.75 0.72 0.75                         
8  0.97 0.84 0.85 0.79 0.72 0.78 0.96                    
9  0.82 0.90 0.80 0.68 0.71 0.70 0.83 0.81               
10 0.82 0.83 0.86 0.74 0.72 0.75 0.82 0.82 0.85          
11 0.82 0.85 0.88 0.74 0.73 0.75 0.82 0.82 0.85 0.96     
12 0.78 0.92 0.76 0.67 0.69 0.68 0.80 0.78 0.85 0.78 0.79

View表示

View(round(textSimilarityMTX.cosine,2))

比較:出現頻度による文書間のcosine類似度

library(proxy)
library(cleanNLP)

cnlp_init_udpipe()
tmMtx <- getDocumentTermMTX(contents, term="lemma", punct=TRUE)
Processed document 10 of 12

結果表示

round(simil(t(tmMtx), method="cosine"),2)
      1    2    3    4    5    6    7    8    9   10   11
2  0.61                                                  
3  0.61 0.55                                             
4  0.60 0.55 0.51                                        
5  0.66 0.65 0.55 0.60                                   
6  0.72 0.58 0.55 0.66 0.73                              
7  0.84 0.67 0.62 0.56 0.69 0.70                         
8  0.84 0.65 0.69 0.56 0.66 0.69 0.84                    
9  0.65 0.71 0.62 0.54 0.60 0.62 0.67 0.70               
10 0.72 0.67 0.62 0.60 0.63 0.66 0.70 0.72 0.73          
11 0.72 0.67 0.63 0.59 0.69 0.68 0.72 0.71 0.69 0.81     
12 0.73 0.63 0.59 0.58 0.64 0.71 0.69 0.72 0.69 0.74 0.71

比較:出現頻度による文書間のcosine距離

dist.cos<-proxy::dist(t(tmMtx), method="cosine", diag=T)
round(dist.cos,2)
      1    2    3    4    5    6    7    8    9   10   11   12
1  0.00                                                       
2  0.39 0.00                                                  
3  0.39 0.45 0.00                                             
4  0.40 0.45 0.49 0.00                                        
5  0.34 0.35 0.45 0.40 0.00                                   
6  0.28 0.42 0.45 0.34 0.27 0.00                              
7  0.16 0.33 0.38 0.44 0.31 0.30 0.00                         
8  0.16 0.35 0.31 0.44 0.34 0.31 0.16 0.00                    
9  0.35 0.29 0.38 0.46 0.40 0.38 0.33 0.30 0.00               
10 0.28 0.33 0.38 0.40 0.37 0.34 0.30 0.28 0.27 0.00          
11 0.28 0.33 0.37 0.41 0.31 0.32 0.28 0.29 0.31 0.19 0.00     
12 0.27 0.37 0.41 0.42 0.36 0.29 0.31 0.28 0.31 0.26 0.29 0.00

階層的クラスター分析

階層的クラスター分析

  • Distance measure: cosine
  • Agglomeration method: ward

出現単語頻度データ

hc.freq.cosine <- hclust(proxy::dist(t(tmMtx), method="cosine"), method = "ward.D2")
plot(hc.freq.cosine)

単語埋め込みデータ

distMtx <- as.dist(textDistanceMatrix(article.embeddingsLst, center = TRUE, method="cosine"))
hc.embeddings.cosine <- hclust(distMtx, method = "ward.D2")
plot(hc.embeddings.cosine)

LS0tCnRpdGxlOiAiTGVjMTA6IExMTeOBruWIqeeUqCAoRmFsbCAyMDI0KSIKb3V0cHV0OiBodG1sX25vdGVib29rCmVkaXRvcl9vcHRpb25zOiAKICBjaHVua19vdXRwdXRfdHlwZTogaW5saW5lCi0tLQoKIyBMZWN0dXJlMTA6IExMTeOBruWIqeeUqAoqIDxhIGhyZWY9Imh0dHBzOi8vd3d3LnItdGV4dC5vcmcvaW5kZXguaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPiJ0ZXh0InBhY2thZ2U8L2E+CiogPGEgaHJlZj0iaHR0cHM6Ly9naXRodWIuY29tL09zY2FyS2plbGwvdGV4dCIgdGFyZ2V0PSJfYmxhbmsiPkdpdGh1YjogInRleHQiIHBhY2thZ2U8L2E+CiogPGEgaHJlZj0iaHR0cHM6Ly9vc2YuaW8vcHJlcHJpbnRzL3BzeWFyeGl2LzI5M2t0IiB0YXJnZXQ9Il9ibGFuayI+VGhlIHRleHQtcGFja2FnZTogQW4gUi1wYWNrYWdlIGZvciBBbmFseXppbmcgYW5kIFZpc3VhbGl6aW5nIEh1bWFuIExhbmd1YWdlIFVzaW5nIE5hdHVyYWwgTGFuZ3VhZ2UgUHJvY2Vzc2luZyBhbmQgRGVlcCBMZWFybmluZzwvYT4KCiMjIOODkeODg+OCseODvOOCuOOBruOCpOODs+OCueODiOODvOODqwpgYGB7ciwgZXZhbD1GQUxTRX0KaW5zdGFsbC5wYWNrYWdlcygidGV4dCIpCmluc3RhbGwucGFja2FnZXMoInJldGljdWxhdGUiKQpgYGAKCiMjIDxhIGhyZWY9Imh0dHBzOi8vd3d3LnItdGV4dC5vcmcvYXJ0aWNsZXMvaHVnZ2luZ2ZhY2VfaW5fcl9leHRlbmRlZF9pbnN0YWxsYXRpb25fZ3VpZGUuaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPkluc3RhbGwgYSBjb25kYSBlbnZpcm9ubWVudCB3aXRoIHRleHQgcmVxdWlyZWQgcHl0aG9uIHBhY2thZ2VzPC9hPgpgYGB7cn0KbGlicmFyeSh0ZXh0KQpsaWJyYXJ5KHJldGljdWxhdGUpCmBgYAoKYGBge3IsIGV2YWw9RkFMU0V9CnRleHRycHBfaW5zdGFsbCgpCnRleHRycHBfaW5pdGlhbGl6ZShzYXZlX3Byb2ZpbGUgPSBUUlVFKQpgYGAKCmBgYHtyLCBldmFsPUZBTFNFfQpjb25kYV9saXN0KCkKdXNlX2NvbmRhZW52KCJyLXJldGljdWxhdGUiKQp1c2VfY29uZGFlbnYoY29uZGFfbGlzdCgpJHB5dGhvblsxXSkKYGBgCgojIyB0ZXh0RW1iZWTplqLmlbDjga7lrp/ooYwKIyMjIFN1YndvcmQgVG9rZW5pemF0aW9uCmBgYHtyfQplbWJlZGRpbmdzIDwtIHRleHRFbWJlZCgid29yZCBlbWJlZGRpbmdzIikKZW1iZWRkaW5ncyR0b2tlbnMKYGBgCiMjIOWIhuaekOODhuOCreOCueODiFVSTApgYGAKQ2F0ZWdvcnkgQTogUmV0cm8gSmFwYW4KICBhcnRpY2xlMSwgYXJ0aWNsZTcsIGFydGljbGU4CkNhdGVnb3J5IEI6IFJlc2VhcmNoL1N1cnZleQphcnRpY2xlMiwgYXJ0aWNsZTksIGFydGljbGUxMgpDYXRlZ29yeSBDOiBQb2xpdGljcwogIGFydGljbGUzLCBhcnRpY2xlMTAsIGFydGljbGUxMQpDYXRlZ29yeSBEOiBTcG9ydHMgKEZvb3RiYWxsKQogIGFydGljbGU0LCBhcnRpY2xlNSwgYXJ0aWNsZTYKYGBgCiMjIOiomOS6i1VSTOOBruaDheWgseWPluW+lyAKYGBge3J9CmFydGljbGVfdXJscyA8LSByZWFkTGluZXMoIkxlYzA5X0FydGljbGVVUkxfaW5mbyIpCmBgYAoKIyMgVGVybS1Eb2N1bWVudCBNYXRyaXjjga7kvZzmiJAKYGBge3J9CnNvdXJjZSgiZnVuYzRsZWMwOS5SIikKCmNvbnRlbnRzIDwtIGxhcHBseShhcnRpY2xlX3VybHMsIGdldEFydGljbGVDb250ZW50KQp0bU10eCA8LSBnZXREb2N1bWVudFRlcm1NVFgoY29udGVudHMsIHRlcm09ImxlbW1hIikKZGltKHRtTXR4KQpoZWFkKHRtTXR4KQpgYGAKCiMjIGFydGljbGUx5oqc57KL44OH44O844K/44Gu5Y2Y6Kqe5Z+L44KB6L6844G/5oOF5aCxCmBgYHtyfQphcnRpY2xlMSA8LSBzdHJzcGxpdChjb250ZW50c1tbMV1dLCAiXFwuIikKI3VubGlzdChhcnRpY2xlMSlbMToyXQphcnRpY2xlMS5lbWJlZGRpbmdzIDwtIHRleHRFbWJlZCh1bmxpc3QoYXJ0aWNsZTEpWzE6Ml0pCmhlYWQoYXJ0aWNsZTEuZW1iZWRkaW5ncyR0b2tlbnMkdGV4dHNbWzJdXSkKYGBgCiMjIOWNmOiqnuWfi+OCgei+vOOBv+aDheWgseioiOeul+OBqOe1kOaenOS/neWtmAoqIDxhIGhyZWY9Imh0dHBzOi8vY3Jhbi5yLXByb2plY3Qub3JnL3dlYi9wYWNrYWdlcy90ZXh0L3ZpZ25ldHRlcy90ZXh0Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj7lj4LogIPjgrPjg7zjg4k6IHRoZSBUZXh0IFR1dG9yaWFsPC9hPgpgYGB7ciwgZXZhbD1GQUxTRX0KbiA8LSBsZW5ndGgoY29udGVudHMpICAjIFVzZSBhbGwgZWxlbWVudHMgaW4gY29udGVudHMKYXJ0aWNsZV90aWJibGUgPC0gdGliYmxlKCEhIXNldF9uYW1lcyhjb250ZW50cywgcGFzdGUwKCJhcnRpY2xlIiwgc2VxX2Fsb25nKGNvbnRlbnRzKSkpKQphcnRpY2xlX3RpYmJsZQoKYXJ0aWNsZS5lbWJlZGRpbmdzIDwtIHRleHRFbWJlZCgKICBhcnRpY2xlX3RpYmJsZSwKICBtb2RlbCA9ICJiZXJ0LWJhc2UtdW5jYXNlZCIsIGFnZ3JlZ2F0aW9uX2Zyb21fbGF5ZXJzX3RvX3Rva2VucyA9ICJjb25jYXRlbmF0ZSIsIGFnZ3JlZ2F0aW9uX2Zyb21fdG9rZW5zX3RvX3RleHRzID0gIm1lYW4iLCBrZWVwX3Rva2VuX2VtYmVkZGluZ3MgPSBGQUxTRQopCgpzYXZlUkRTKGFydGljbGUuZW1iZWRkaW5ncywgImFydGljbGUuZW1iZWRkaW5ncy5yZHMiKQpgYGAKCgojIyDljZjoqp7ln4vjgoHovrzjgb/jg4fjg7zjgr/jga7oqq3jgb/ovrzjgb8gCmBgYHtyfQphcnRpY2xlLmVtYmVkZGluZ3MgPC0gcmVhZFJEUygiYXJ0aWNsZS5lbWJlZGRpbmdzLnJkcyIpCmBgYAoKIyMg44OH44O844K/5pW05b2i77yI5Y2Y6Kqe5Z+L44KB6L6844G/5oOF5aCx44Gu6KGM57WQ5ZCI77yJCmBgYHtyfQphcnRpY2xlLmVtYmVkZGluZ3NMc3QgPC0gYygpCmZvciAoY3VycmVudF9hcnRpY2xlIGluIGFydGljbGUuZW1iZWRkaW5ncyR0ZXh0cyl7CiAgY29sbmFtZXMoY3VycmVudF9hcnRpY2xlKSA8LSBwYXN0ZTAoIkRpbSIsIDE6bmNvbChjdXJyZW50X2FydGljbGUpKQogIGFydGljbGUuZW1iZWRkaW5nc0xzdCA8LSByYmluZChhcnRpY2xlLmVtYmVkZGluZ3NMc3QsIGN1cnJlbnRfYXJ0aWNsZSkKfQoKcHJpbnQoZGltKGFydGljbGUuZW1iZWRkaW5nc0xzdCkpCmFydGljbGUuZW1iZWRkaW5nc0xzdFsxOjUsIDE6NV0KYGBgCgojIyMg5paH5pu46ZaT44Gu6aGe5Ly85bqm77yIZXVjbGlkZWFu77yJCmBgYHtyfQp0ZXh0U2ltaWxhcml0eU1UWC5ldWNsaWQ8LXRleHRTaW1pbGFyaXR5TWF0cml4KGFydGljbGUuZW1iZWRkaW5nc0xzdCwgbWV0aG9kID0gImV1Y2xpZGVhbiIpCmBgYAoKIyMjIFZpZXfooajnpLoKYGBge3IsIGV2YWw9RkFMU0V9ClZpZXcocm91bmQodGV4dFNpbWlsYXJpdHlNVFguZXVjbGlkLDIpKQpgYGAKCiMjIyDmlofmm7jplpPjga7poZ7kvLzluqbvvIhjb3NpbmXvvIkKYGBge3J9CnRleHRTaW1pbGFyaXR5TVRYLmNvc2luZTwtdGV4dFNpbWlsYXJpdHlNYXRyaXgoYXJ0aWNsZS5lbWJlZGRpbmdzTHN0LCBtZXRob2QgPSAiY29zaW5lIikKYGBgCgojIyMg57WQ5p6c6KGo56S6CmBgYHtyfQphcy5kaXN0KHJvdW5kKHRleHRTaW1pbGFyaXR5TVRYLmNvc2luZSwyKSkKYGBgCgojIyMgVmlld+ihqOekugpgYGB7ciwgZXZhbD1GQUxTRX0KVmlldyhyb3VuZCh0ZXh0U2ltaWxhcml0eU1UWC5jb3NpbmUsMikpCmBgYAoKIyMjIOavlOi8g++8muWHuuePvumgu+W6puOBq+OCiOOCi+aWh+abuOmWk+OBrmNvc2luZemhnuS8vOW6pgpgYGB7cn0KbGlicmFyeShwcm94eSkKbGlicmFyeShjbGVhbk5MUCkKCmNubHBfaW5pdF91ZHBpcGUoKQp0bU10eCA8LSBnZXREb2N1bWVudFRlcm1NVFgoY29udGVudHMsIHRlcm09ImxlbW1hIiwgcHVuY3Q9VFJVRSkKYGBgCiMjIyDntZDmnpzooajnpLoKYGBge3J9CnJvdW5kKHNpbWlsKHQodG1NdHgpLCBtZXRob2Q9ImNvc2luZSIpLDIpCmBgYAoKIyMjIOavlOi8g++8muWHuuePvumgu+W6puOBq+OCiOOCi+aWh+abuOmWk+OBrmNvc2luZei3nembogpgYGB7cn0KZGlzdC5jb3M8LXByb3h5OjpkaXN0KHQodG1NdHgpLCBtZXRob2Q9ImNvc2luZSIsIGRpYWc9VCkKcm91bmQoZGlzdC5jb3MsMikKYGBgCgojIyDpmo7lsaTnmoTjgq/jg6njgrnjgr/jg7zliIbmnpAKKiA8YSBocmVmPSJodHRwczovL2JlbGxjdXJ2ZS5qcC9zdGF0aXN0aWNzL2NvdXJzZS8yNzE1NS5odG1sIiB0YXJnZXQ9Il9ibGFuayI+5Y+C6ICD44K144Kk44OIMTog6ZqO5bGk55qE44Kv44Op44K544K/44O85YiG5p6QPC9hPgoqIDxhIGhyZWY9Imh0dHBzOi8vbnVtZXJpY3MubWF0aGRvdG5ldC5jb20vRGlzdGFuY2UiIHRhcmdldD0iX2JsYW5rIj7lj4LogIPjgrXjgqTjg4gyOiDot53pm6LooYzliJc8L2E+CgojIyMg6ZqO5bGk55qE44Kv44Op44K544K/44O85YiG5p6QCiogRGlzdGFuY2UgbWVhc3VyZTogY29zaW5lCiogQWdnbG9tZXJhdGlvbiBtZXRob2Q6IHdhcmQKCiMjIyMg5Ye654++5Y2Y6Kqe6aC75bqm44OH44O844K/CmBgYHtyfQpoYy5mcmVxLmNvc2luZSA8LSBoY2x1c3QocHJveHk6OmRpc3QodCh0bU10eCksIG1ldGhvZD0iY29zaW5lIiksIG1ldGhvZCA9ICJ3YXJkLkQyIikKcGxvdChoYy5mcmVxLmNvc2luZSkKYGBgCiMjIyMg5Y2Y6Kqe5Z+L44KB6L6844G/44OH44O844K/CmBgYHtyfQpkaXN0TXR4IDwtIGFzLmRpc3QodGV4dERpc3RhbmNlTWF0cml4KGFydGljbGUuZW1iZWRkaW5nc0xzdCwgY2VudGVyID0gVFJVRSwgbWV0aG9kPSJjb3NpbmUiKSkKaGMuZW1iZWRkaW5ncy5jb3NpbmUgPC0gaGNsdXN0KGRpc3RNdHgsIG1ldGhvZCA9ICJ3YXJkLkQyIikKcGxvdChoYy5lbWJlZGRpbmdzLmNvc2luZSkKYGBgCgoKCgoKCg==