# Attach the two packages this section calls directly: text (textEmbed,
# textrpp_*) and reticulate (conda_list, use_condaenv).
library(text)
library(reticulate)
# NOTE(review): textrpp_install() is called unconditionally, so it runs again
# every time this script is executed — presumably it only needs to run once
# per machine; confirm and guard or remove for routine runs.
textrpp_install()
# Initialise the text package's Python backend; save_profile = TRUE is passed
# so the setting is stored (exact persistence behaviour: see text docs).
textrpp_initialize(save_profile = TRUE)
# Show the conda environments known to reticulate.
conda_list()
# NOTE(review): two environments are requested back to back — first the one
# named "r-reticulate", then whatever is first in conda_list()$python. These
# look like alternatives rather than steps; confirm which one is intended.
use_condaenv("r-reticulate")
use_condaenv(conda_list()$python[1])
textEmbed関数の実行
Subword Tokenization
# Embed one short example string; no options are passed, so textEmbed() runs
# with its default settings.
embeddings <- textEmbed("word embeddings")
Completed layers output for texts (variable: 1/1, duration: 6.213706 secs).
Completed layers aggregation for word_type_embeddings.
Completed layers aggregation (variable 1/1, duration: 0.073271 secs).
Completed layers aggregation (variable 1/1, duration: 0.087532 secs).
# Print the `tokens` element of the embedding result.
embeddings$tokens
$texts
$texts[[1]]
NANA
分析テキストURL
Category A: Retro Japan
article1, article7, article8
Category B: Research/Survey
article2, article9, article12
Category C: Politics
article3, article10, article11
Category D: Sports (Football)
article4, article5, article6
Term-Document Matrixの作成
# Run getArticleContent over every entry of article_urls (both are defined
# outside this section); the result is a list with one element per URL.
contents <- lapply(X = article_urls, FUN = getArticleContent)
# Build the term-document matrix from the article contents, with
# term = "lemma".
tmMtx <- getDocumentTermMTX(contents, term = "lemma")
Processed document 10 of 12
# Show the dimensions of the term-document matrix.
dim(tmMtx)
[1] 1119 12
# Preview the first rows of the term-document matrix.
head(tmMtx)
単語埋め込み情報計算と結果保存
n <- length(contents) # Use all elements in contents
# Put every article into its own column, named article1, article2, ...
article_tibble <- tibble(
  !!!set_names(contents, paste0("article", seq_along(contents)))
)
article_tibble
# Embed each column with the "bert-base-uncased" model. Layers are
# concatenated into token embeddings, tokens are averaged into one embedding
# per text, and the per-token embeddings are not kept.
article.embeddings <- textEmbed(
  article_tibble,
  model = "bert-base-uncased",
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean",
  keep_token_embeddings = FALSE
)
# Save the embeddings to disk so they can be reloaded with readRDS().
saveRDS(article.embeddings, file = "article.embeddings.rds")
データ整形(単語埋め込み情報の行結合)
# Stack the per-article embeddings into a single table with one row per
# article and columns named Dim1, Dim2, ...
#
# The columns of every element are renamed first so that all elements share
# the same column names and can be row-bound. The rows are bound in a single
# rbind() call rather than by growing the result inside a loop, which would
# copy the accumulated table on every iteration.
embedding_rows <- lapply(article.embeddings$texts, function(current_article) {
  # seq_len() stays empty for a zero-column input, unlike 1:ncol().
  colnames(current_article) <- paste0("Dim", seq_len(ncol(current_article)))
  current_article
})
# unname() keeps the list names (article1, ...) from being turned into row
# names by rbind(); an empty list yields NULL, as the original loop did.
article.embeddingsLst <- do.call(rbind, unname(embedding_rows))
print(dim(article.embeddingsLst))
[1] 12 768
# Preview the first five rows and first five columns of the stacked embeddings.
article.embeddingsLst[1:5, 1:5]
文書間の類似度(euclidean)
# Pairwise similarity between the article embeddings, method "euclidean".
textSimilarityMTX.euclid <- textSimilarityMatrix(
  article.embeddingsLst,
  method = "euclidean"
)
View表示
# Open the euclidean similarity matrix, rounded to two decimals, in the viewer.
View(round(textSimilarityMTX.euclid, digits = 2))
文書間の類似度(cosine)
# Pairwise similarity between the article embeddings, method "cosine".
textSimilarityMTX.cosine <- textSimilarityMatrix(
  article.embeddingsLst,
  method = "cosine"
)
結果表示
# Print the cosine similarities rounded to two decimals; as.dist() is used
# only to get the compact lower-triangle layout.
as.dist(round(textSimilarityMTX.cosine, digits = 2))
1 2 3 4 5 6 7 8 9 10 11
2 0.84
3 0.83 0.81
4 0.76 0.70 0.78
5 0.71 0.73 0.72 0.76
6 0.76 0.72 0.78 0.89 0.90
7 0.96 0.87 0.83 0.75 0.72 0.75
8 0.97 0.84 0.85 0.79 0.72 0.78 0.96
9 0.82 0.90 0.80 0.68 0.71 0.70 0.83 0.81
10 0.82 0.83 0.86 0.74 0.72 0.75 0.82 0.82 0.85
11 0.82 0.85 0.88 0.74 0.73 0.75 0.82 0.82 0.85 0.96
12 0.78 0.92 0.76 0.67 0.69 0.68 0.80 0.78 0.85 0.78 0.79
View表示
# Open the cosine similarity matrix, rounded to two decimals, in the viewer.
View(round(textSimilarityMTX.cosine, digits = 2))
比較:出現頻度による文書間のcosine類似度
# Packages for the frequency-based comparison: proxy (simil, dist) and
# cleanNLP (cnlp_init_udpipe).
library(proxy)
library(cleanNLP)
# Initialise cleanNLP's udpipe backend.
cnlp_init_udpipe()
# Rebuild the term-document matrix, this time with punct = TRUE.
tmMtx <- getDocumentTermMTX(contents, term = "lemma", punct = TRUE)
Processed document 10 of 12
結果表示
# Cosine similarity between documents based on term frequencies. tmMtx is
# transposed so that simil() compares its columns with one another.
round(simil(t(tmMtx), method = "cosine"), digits = 2)
1 2 3 4 5 6 7 8 9 10 11
2 0.61
3 0.61 0.55
4 0.60 0.55 0.51
5 0.66 0.65 0.55 0.60
6 0.72 0.58 0.55 0.66 0.73
7 0.84 0.67 0.62 0.56 0.69 0.70
8 0.84 0.65 0.69 0.56 0.66 0.69 0.84
9 0.65 0.71 0.62 0.54 0.60 0.62 0.67 0.70
10 0.72 0.67 0.62 0.60 0.63 0.66 0.70 0.72 0.73
11 0.72 0.67 0.63 0.59 0.69 0.68 0.72 0.71 0.69 0.81
12 0.73 0.63 0.59 0.58 0.64 0.71 0.69 0.72 0.69 0.74 0.71
比較:出現頻度による文書間のcosine距離
# Cosine distance between documents based on term frequencies. tmMtx is
# transposed so that the columns of tmMtx are compared with one another, and
# diag = TRUE includes the diagonal in the printed result.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dist.cos <- proxy::dist(t(tmMtx), method = "cosine", diag = TRUE)
round(dist.cos, 2)
1 2 3 4 5 6 7 8 9 10 11 12
1 0.00
2 0.39 0.00
3 0.39 0.45 0.00
4 0.40 0.45 0.49 0.00
5 0.34 0.35 0.45 0.40 0.00
6 0.28 0.42 0.45 0.34 0.27 0.00
7 0.16 0.33 0.38 0.44 0.31 0.30 0.00
8 0.16 0.35 0.31 0.44 0.34 0.31 0.16 0.00
9 0.35 0.29 0.38 0.46 0.40 0.38 0.33 0.30 0.00
10 0.28 0.33 0.38 0.40 0.37 0.34 0.30 0.28 0.27 0.00
11 0.28 0.33 0.37 0.41 0.31 0.32 0.28 0.29 0.31 0.19 0.00
12 0.27 0.37 0.41 0.42 0.36 0.29 0.31 0.28 0.31 0.26 0.29 0.00