An Example of LDA in R

ver1.0 2014-05-07 m.ooki

0. Overview

LDA is apparently fairly easy to run in R, so I gave it a try.

References:

Reference 1: Topic analysis in R

Reference 2: Topic models

Reference 3: The lda package

1. Loading packages

library(knitr)  # provides opts_chunk; this document is knitr output
opts_chunk$set(warning = FALSE, comment = "", fig.width = 11, fig.height = 6)
library(lda)       # collapsed Gibbs sampler for LDA, plus the cora dataset
library(reshape2)  # melt(), used later to reshape data for ggplot2
library(ggplot2)   # plotting

2. An example of data the lda package can analyze

# Any data stored as a character vector like this will do. For a csv file, just
# paste the text fields together into such a vector.
sentence <- c("I am the very model of a modern major general", "I have a major headache")
# lexicalize() converts the text into the structure LDA needs. The result is a
# list with two components: $documents (a list of matrices) and $vocab (a character vector).
test <- lexicalize(sentence, lower = TRUE)
# Inside test: each $documents matrix records word indices and counts per sentence; $vocab holds the unique words.
test
$documents
$documents[[1]]
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]    0    1    2    3    4    5    6    7    8     9
[2,]    1    1    1    1    1    1    1    1    1     1

$documents[[2]]
     [,1] [,2] [,3] [,4] [,5]
[1,]    0   10    6    8   11
[2,]    1    1    1    1    1


$vocab
 [1] "i"        "am"       "the"      "very"     "model"    "of"      
 [7] "a"        "modern"   "major"    "general"  "have"     "headache"
# List elements can be accessed like this.
test$documents[[1]][2, 4]
[1] 1
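
Row 1 of each $documents matrix holds 0-based indices into $vocab, so adding 1
recovers the original words. A quick round-trip check on the second sentence
(a minimal sketch using the test object above):

# Map the 0-based indices of sentence 2 back to words.
test$vocab[test$documents[[2]][1, ] + 1]
[1] "i"        "have"     "a"        "major"    "headache"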

3. Loading the analysis data

# Load the cora data (a dataset of 2410 scientific articles; a list with 2410 components)
data(cora.documents)
head(cora.documents, n = 2)
[[1]]
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
[1,]    0    1    2    3    4    5    6    7    8     9    10    11    12
[2,]    1    4    2    2    1    2    1    2    1     1     3     4     1
     [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
[1,]    13    14    15    16    17    18    19    20    21    22    23
[2,]     1     1     1     1     1     3     1     1     1     1     1
     [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
[1,]    24    25    26    27    28    29    30    31    32    33    34
[2,]     1     1     1     4     2     1     1     1     1     1     3
     [,36] [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46]
[1,]    35    36    37    38    39    40    41    42    43    44    45
[2,]     2     1     4     1     2     1     1     1     1     1     1
     [,47] [,48] [,49] [,50] [,51] [,52] [,53] [,54] [,55] [,56] [,57]
[1,]    46    47    48    49    50    51    52    53    54    55    56
[2,]     1     1     2     2     1     1     1     1     2     1     1
     [,58] [,59] [,60] [,61] [,62] [,63] [,64]
[1,]    57    58    59    60    61    62    63
[2,]     1     1     1     1     1     1     1

[[2]]
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
[1,]   31   34   64   65   66   67   68   69   70    71    72    73    74
[2,]    1    1    1    1    1    1    1    3    1     2     1     1     2
     [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
[1,]    75    76    77    78    79    80    81    82    83    84    85
[2,]     1     1     2     2     2     1     1     1     1     1     1
     [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
[1,]    86    87    88    89    90    91    92    93    94    95    96
[2,]     1     1     1     1     1     1     1     1     1     1     1
     [,36] [,37] [,38] [,39]
[1,]    97    98    99   100
[2,]     1     1     1     1
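Since each document is a 2-row matrix of indices and counts, per-document
statistics fall out directly; for instance, the token count of a document is
just the sum of row 2 (a minimal sketch):

# Total number of tokens in each of the first few documents.
head(sapply(cora.documents, function(d) sum(d[2, ])))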
# A vector of the unique words (2961 of them) used in the articles
data(cora.vocab)
head(cora.vocab)
[1] "computer"    "algorithms"  "discovering" "patterns"    "groups"     
[6] "protein"    
# A vector of the article titles (2410 of them)
data(cora.titles)
head(cora.titles)
[1] "The megaprior heuristic for discovering protein sequence patterns."                                                                                                                                                                                
[2] "Skyt The GANNsystem. Genetic Algorithms generating feedforward neural networks. http://www.daimi.aau.dk/~damgaard/NNSG/ 25 Du Junping,K.L.Rasmussen,J.Aagaard,B.Mayoh, T.Srensen,etc. Applications of Machine Learning: a Medical Follow Up Study."
[3] "http:##www-anw.cs.umass.edu#~mharmon#rltutorial#docs#singh.ps"                                                                                                                                                                                     
[4] "Planning and acting in partially observable stochastic domains."                                                                                                                                                                                   
[5] "(1997) Variational methods for inference and estimation in graphical models. Unpublished doctoral dissertation,"                                                                                                                                   
[6] "ftp:##ftp.engr.orst.edu#pub#dambrosi#uai-96.ps.Z"                                                                                                                                                                                                  
# Build a data frame for inspection (note the slightly tricky indexing).
# Column 1 is the words used in the first component of cora.documents, column 2 their counts;
# the "+ 1" shifts the 0-based indices to R's 1-based indexing.
data_cora <- as.data.frame(cbind(cora.vocab[cora.documents[[1]][1, ] + 1],
                                 cora.documents[[1]][2, ]))
# So the first cora article is made up of these words with these counts.
head(data_cora)
           V1 V2
1    computer  1
2  algorithms  4
3 discovering  2
4    patterns  2
5      groups  1
6     protein  2
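
One caveat with the cbind() above: it coerces the counts to character (stored
as a factor in the data frame), so they need converting back before any
arithmetic. A minimal sketch that sorts article 1's words by frequency:

# Convert counts back to numeric, then list the most frequent words.
data_cora$V2 <- as.numeric(as.character(data_cora$V2))
head(data_cora[order(-data_cora$V2), ])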

4. LDA

# Set the number of topics to estimate
k <- 10

# The lda package fits the model by Gibbs sampling; there seem to be a few
# sampler variants to choose from.
result <- lda.collapsed.gibbs.sampler(cora.documents, 
                                       k,
                                       cora.vocab,
                                       25,  # number of iterations
                                       0.1, # Dirichlet hyperparameter alpha (document-topic prior)
                                       0.1, # Dirichlet hyperparameter eta (topic-word prior)
                                       compute.log.likelihood=TRUE)
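
Note that Gibbs sampling is stochastic, so the topics below will differ
slightly from run to run; to reproduce a run exactly, fix the RNG seed before
the sampler call:

set.seed(123)  # any fixed value; 123 is an arbitrary choice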

# summary() shows the result is a list of 10 components. The main ones:
# assignments: a list as long as the corpus; each element gives the topic number assigned to each word of a document.
# topics: a k x v matrix; entry [i, j] counts how often word j is assigned to topic i.
# topic_sums: the total number of words assigned to each topic.
# document_sums: a k x D matrix; entry [i, j] counts the words of document j assigned to topic i.
summary(result)
                Length Class  Mode   
assignments      2410  -none- list   
topics          29610  -none- numeric
topic_sums         10  -none- numeric
document_sums   24100  -none- numeric
<NA>                0  -none- NULL   
<NA>                0  -none- NULL   
<NA>                0  -none- NULL   
<NA>                0  -none- NULL   
<NA>                0  -none- NULL   
log.likelihoods    50  -none- numeric
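
Because compute.log.likelihood=TRUE was set, result$log.likelihoods is a
2 x 25 matrix, one column per iteration (hence length 50 above); the first row
is the full log likelihood, the second the log likelihood of the observations
given the assignments. Plotting the trace gives a rough convergence check
(a minimal sketch):

# Trace of the full log likelihood over the 25 Gibbs iterations.
plot(result$log.likelihoods[1, ], type = "l",
     xlab = "iteration", ylab = "full log likelihood")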
# Extract the top keywords for each topic;
# here, a matrix of the top 3 words per topic.
top.words <- top.topic.words(result$topics, 3, by.score=TRUE)
top.words
     [,1]        [,2]       [,3]      [,4]            [,5]       
[1,] "design"    "bayesian" "visual"  "genetic"       "learning" 
[2,] "knowledge" "belief"   "network" "reinforcement" "concept"  
[3,] "reasoning" "markov"   "neural"  "search"        "algorithm"
     [,6]       [,7]       [,8]        [,9]       [,10]        
[1,] "tree"     "learning" "learning"  "learning" "genetic"    
[2,] "trees"    "feature"  "models"    "networks" "performance"
[3,] "decision" "network"  "algorithm" "network"  "parallel"   
# Extract the topic proportions for just the first 3 articles
N <- 3
topic.proportions <- t(result$document_sums) / colSums(result$document_sums)
topic.proportions <- topic.proportions[1:N, ]
topic.proportions[is.na(topic.proportions)] <-  1 / k

# Use the top 3 words as column names so each topic gets an interpretable label.
colnames(topic.proportions) <- apply(top.words, 2, paste, collapse=" ")
par(mar=c(5, 14, 2, 2))
barplot(topic.proportions, beside=TRUE, horiz=TRUE, las=1, xlab="proportion")

[Figure: base-R barplot of per-document topic proportions]

# Build the data for ggplot2 using melt() (slightly tricky, so take care)
topic.proportions.df <- melt(cbind(data.frame(topic.proportions),
                                   document=factor(1:N)),
                             variable.name="topic",
                             id.vars = "document")

# Visualize with ggplot2. stat="identity" makes geom_bar plot the value column
# as-is rather than bin counts (avoiding the old stat="bin" deprecation warnings).
ggplot(topic.proportions.df, aes(x=topic, y=value, fill=document)) + geom_bar(stat="identity") + facet_wrap(~ document, ncol=N) + coord_flip()

[Figure: ggplot2 bar chart of topic proportions, one facet per document]

# Prediction: predictive.distribution() returns each document's predictive
# distribution over words (here for the first two documents, with the same
# hyperparameters 0.1, 0.1).
predictions <- predictive.distribution(result$document_sums[,1:2], result$topics, 0.1, 0.1)
top.topic.words(t(predictions), 5)
     [,1]        [,2]      
[1,] "learning"  "learning"
[2,] "model"     "networks"
[3,] "algorithm" "network" 
[4,] "paper"     "neural"  
[5,] "problem"   "paper"