Topic Discovery

## load the two required packages
library(tm)
library(SnowballC)
## Loading required package: NLP
## load the raw corpus
corpus.raw <- VCorpus(DirSource(directory = "federalist", pattern = "fp"))

## make lower case
corpus.prep <- tm_map(corpus.raw, content_transformer(tolower)) 

## strip extra white space
corpus.prep <- tm_map(corpus.prep, stripWhitespace) 

## remove punctuation
corpus.prep <- tm_map(corpus.prep, removePunctuation)

## remove numbers
corpus.prep <- tm_map(corpus.prep, removeNumbers) 

## remove stop words
corpus <- tm_map(corpus.prep, removeWords, stopwords("english")) 

## stem the remaining words
corpus <- tm_map(corpus, stemDocument) 

## document-term matrix
dtm <- DocumentTermMatrix(corpus)

## convert to a matrix object
dtm.mat <- as.matrix(dtm)
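
As a quick sanity check, the dimensions of the resulting matrix and a few of its column names can be inspected. A minimal sketch (the exact values depend on the local copy of the federalist corpus):

dim(dtm.mat)            # number of documents and number of terms
head(colnames(dtm.mat)) # a few of the stemmed terms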


## install the required package (only needs to be done once)
## install.packages("wordcloud")

## load the package
library(wordcloud)
## Loading required package: RColorBrewer
## create word clouds
wordcloud(colnames(dtm.mat), dtm.mat[12, ], scale = c(3,0.5), max.words = 20)  # Paper No. 12

wordcloud(colnames(dtm.mat), dtm.mat[24, ], scale = c(3,0.5), max.words = 20)  # Paper No. 24

## complete the stems back to full words, using corpus.prep as the dictionary
stemCompletion(c("revenu", "commerc", "peac", "army"), corpus.prep)
##     revenu    commerc       peac       army 
##  "revenue" "commerce"    "peace"     "army"


\[ \mbox{tf-idf}(w,d) = \mbox{tf}(w,d) \times \mbox{idf}(w) \]

\[ \mbox{idf}(w) = \log \frac{N}{\mbox{df}(w)} \]

Here tf(w, d) is the frequency of term w in document d, N is the total number of documents, and df(w) is the number of documents that contain term w.


dtm.tfidf <- weightTfIdf(dtm) # compute tf-idf
inspect(dtm.tfidf[1:5, 1:8])
## <<DocumentTermMatrix (documents: 5, terms: 8)>>
## Non-/sparse entries: 4/36
## Sparsity           : 90%
## Maximal term length: 7
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##           Terms
## Docs       abandon abat abb abet abhorr        abil abject         abl
##   fp01.txt       0    0   0    0      0 0.000000000      0 0.001367034
##   fp02.txt       0    0   0    0      0 0.003588854      0 0.000000000
##   fp03.txt       0    0   0    0      0 0.000000000      0 0.002833074
##   fp04.txt       0    0   0    0      0 0.000000000      0 0.001336496
##   fp05.txt       0    0   0    0      0 0.000000000      0 0.000000000
dtm.tfidf.raw <- weightTfIdf(dtm, normalize = FALSE) # without normalization
inspect(dtm.tfidf.raw[1:5, 1:8])
## <<DocumentTermMatrix (documents: 5, terms: 8)>>
## Non-/sparse entries: 4/36
## Sparsity           : 90%
## Maximal term length: 7
## Weighting          : term frequency - inverse document frequency (tf-idf)
## Sample             :
##           Terms
## Docs       abandon abat abb abet abhorr     abil abject      abl
##   fp01.txt       0    0   0    0      0 0.000000      0 1.017074
##   fp02.txt       0    0   0    0      0 2.824428      0 0.000000
##   fp03.txt       0    0   0    0      0 0.000000      0 2.034147
##   fp04.txt       0    0   0    0      0 0.000000      0 1.017074
##   fp05.txt       0    0   0    0      0 0.000000      0 0.000000
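
The weights above can be reproduced by hand from the formulas. A minimal sketch: note that, according to the tm documentation, weightTfIdf() uses base-2 logarithms, so log2() is used here.

N <- nrow(dtm.mat)          # total number of documents
df <- colSums(dtm.mat > 0)  # document frequency of each term
idf <- log2(N / df)         # inverse document frequency
tfidf.manual <- sweep(dtm.mat, 2, idf, "*")  # tf(w, d) x idf(w)
tfidf.manual[1:5, "abl"]    # should match the unnormalized weights shown above
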
dtm.tfidf.mat <- as.matrix(dtm.tfidf)  # convert to a matrix

## 10 most important words for Paper No. 12
head(sort(dtm.tfidf.mat[12, ], decreasing = TRUE), n = 10)
##     revenu contraband     patrol      excis      coast      trade        per 
## 0.01905877 0.01886965 0.01886965 0.01876560 0.01592559 0.01473504 0.01420342 
##        tax       cent     gallon 
## 0.01295466 0.01257977 0.01257977
## 10 most important words for Paper No. 24
head(sort(dtm.tfidf.mat[24, ], decreasing = TRUE), n = 10)
##   garrison   dockyard settlement      spain       armi   frontier    arsenal 
## 0.02965511 0.01962294 0.01962294 0.01649040 0.01544256 0.01482756 0.01308196 
##    western       post     nearer 
## 0.01306664 0.01236780 0.01166730


k <- 4  # number of clusters

## subset The Federalist Papers written by Hamilton
hamilton <- c(1, 6:9, 11:13, 15:17, 21:36, 59:61, 65:85)
dtm.tfidf.hamilton <- dtm.tfidf.mat[hamilton, ]

## run the k-means algorithm
km.out <- kmeans(dtm.tfidf.hamilton, centers = k)
km.out$iter # check convergence; the number of iterations may vary
## [1] 3
## label the centroid columns with the corresponding terms
colnames(km.out$centers) <- colnames(dtm.tfidf.hamilton)

for (i in 1:k) { # loop over each cluster
    cat("CLUSTER", i, "\n")
    cat("Top 10 words:\n") # 各中心点で最も重要な10語
    print(head(sort(km.out$centers[i, ], decreasing = TRUE), n = 10))
    cat("\n")
    cat("Federalist Papers classified: \n") # 分類された論文を抽出
    print(rownames(dtm.tfidf.hamilton)[km.out$cluster == i])
    cat("\n")
}
## CLUSTER 1 
## Top 10 words:
##       court        juri       appel   jurisdict       trial      tribun 
## 0.045553185 0.031679369 0.014610450 0.014398568 0.013826016 0.012963605 
##      suprem     impeach      cogniz    inferior 
## 0.012739302 0.010427215 0.010077710 0.008663789 
## 
## Federalist Papers classified: 
## [1] "fp65.txt" "fp81.txt" "fp82.txt" "fp83.txt"
## 
## CLUSTER 2 
## Top 10 words:
##     vacanc     recess      claus      senat    session       fill    appoint 
## 0.06953047 0.04437713 0.04082617 0.03408008 0.03313305 0.03101140 0.02211662 
##     presid      expir    unfound 
## 0.01852025 0.01738262 0.01684465 
## 
## Federalist Papers classified: 
## [1] "fp67.txt"
## 
## CLUSTER 3 
## Top 10 words:
##     militia        armi    militari      pardon    governor     treason 
## 0.014639288 0.014493183 0.010024167 0.007643664 0.007114050 0.005833538 
##   disciplin        peac      presid    garrison 
## 0.005567509 0.005053636 0.004940856 0.004717510 
## 
## Federalist Papers classified: 
## [1] "fp08.txt" "fp24.txt" "fp25.txt" "fp26.txt" "fp28.txt" "fp29.txt" "fp69.txt"
## [8] "fp74.txt"
## 
## CLUSTER 4 
## Top 10 words:
##       senat        upon         tax      revenu       claus        land 
## 0.004103736 0.003886908 0.003028980 0.002766801 0.002750835 0.002726462 
##       taxat      presid       offic       court 
## 0.002711544 0.002695284 0.002404103 0.002184164 
## 
## Federalist Papers classified: 
##  [1] "fp01.txt" "fp06.txt" "fp07.txt" "fp09.txt" "fp11.txt" "fp12.txt"
##  [7] "fp13.txt" "fp15.txt" "fp16.txt" "fp17.txt" "fp21.txt" "fp22.txt"
## [13] "fp23.txt" "fp27.txt" "fp30.txt" "fp31.txt" "fp32.txt" "fp33.txt"
## [19] "fp34.txt" "fp35.txt" "fp36.txt" "fp59.txt" "fp60.txt" "fp61.txt"
## [25] "fp66.txt" "fp68.txt" "fp70.txt" "fp71.txt" "fp72.txt" "fp73.txt"
## [31] "fp75.txt" "fp76.txt" "fp77.txt" "fp78.txt" "fp79.txt" "fp80.txt"
## [37] "fp84.txt" "fp85.txt"
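
Because kmeans() starts from randomly chosen centroids, the cluster numbering and memberships can differ from run to run. A minimal sketch for making the result reproducible (the seed 1234 and nstart = 5 are arbitrary illustrative choices):

set.seed(1234)  # any fixed seed makes the run reproducible
km.out <- kmeans(dtm.tfidf.hamilton, centers = k, nstart = 5) # keep the best of 5 random starts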


Predicting Authorship

## convert the document-term matrix to a matrix for manipulation
dtm1 <- as.matrix(DocumentTermMatrix(corpus.prep))
tfm <- dtm1 / rowSums(dtm1) * 1000 # term frequency per 1000 words

## words of interest
words <- c("although", "always", "commonly", "consequently",
           "considerable", "enough", "there", "upon", "while", "whilst")

## select only these words
tfm <- tfm[, words]
tfm[1:5, ]
##           Terms
## Docs        although    always  commonly consequently considerable    enough
##   fp01.txt 0.0000000 0.8410429 0.8410429     0.000000            0 0.8410429
##   fp02.txt 0.0000000 0.7593014 0.0000000     0.000000            0 0.0000000
##   fp03.txt 0.8952551 3.5810206 0.8952551     2.685765            0 0.0000000
##   fp04.txt 0.7763975 0.0000000 0.0000000     0.000000            0 0.0000000
##   fp05.txt 0.9191176 0.9191176 0.0000000     0.000000            0 0.0000000
##           Terms
## Docs           there      upon     while whilst
##   fp01.txt 1.6820858 5.0462574 0.0000000      0
##   fp02.txt 0.0000000 0.7593014 0.7593014      0
##   fp03.txt 0.8952551 0.0000000 0.0000000      0
##   fp04.txt 2.3291925 0.0000000 0.0000000      0
##   fp05.txt 0.0000000 0.0000000 0.0000000      0
## papers written by Hamilton (repeated from above)
hamilton <- c(1, 6:9, 11:13, 15:17, 21:36, 59:61, 65:85)

## papers written by Madison
madison <- c(10, 14, 37:48, 58)

## average term frequency in Hamilton's and Madison's papers (row 1: Hamilton, row 2: Madison)
tfm.ave <- rbind(colSums(tfm[hamilton, ]) / length(hamilton), 
                 colSums(tfm[madison, ]) / length(madison))
tfm.ave
##        although    always  commonly consequently considerable    enough
## [1,] 0.01756975 0.7527744 0.2630876   0.02600857    0.5435127 0.3955031
## [2,] 0.27058809 0.2006710 0.0000000   0.44878468    0.1601669 0.0000000
##         there      upon     while      whilst
## [1,] 4.417750 4.3986828 0.3700484 0.007055719
## [2,] 1.113252 0.2000269 0.0000000 0.380113114
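
For readability, the two rows of tfm.ave can be given explicit labels (an optional step, not part of the original output):

rownames(tfm.ave) <- c("hamilton", "madison")  # name the rows instead of [1,] and [2,]
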
author <- rep(NA, nrow(dtm1)) # a vector with missing values
author[hamilton] <- 1  # 1 if Hamilton
author[madison] <- -1  # -1 if Madison

## data frame for the regression analysis
author.data <- data.frame(author = author[c(hamilton, madison)], 
                          tfm[c(hamilton, madison), ])
hm.fit <- lm(author ~ upon + there + consequently + whilst, 
             data = author.data)
hm.fit
## 
## Call:
## lm(formula = author ~ upon + there + consequently + whilst, data = author.data)
## 
## Coefficients:
##  (Intercept)          upon         there  consequently        whilst  
##     -0.26288       0.16678       0.09494      -0.44012      -0.65875
hm.fitted <- fitted(hm.fit) # fitted values
sd(hm.fitted)  # standard deviation
## [1] 0.7180769

Cross-Validation

## proportion of Hamilton's papers correctly classified
mean(hm.fitted[author.data$author == 1] > 0)
## [1] 1
## proportion of Madison's papers correctly classified
mean(hm.fitted[author.data$author == -1] < 0)
## [1] 1


n <- nrow(author.data)
hm.classify <- rep(NA, n) # a vector with missing values (a container to be filled in the loop)

for (i in 1:n) {
    ## fit the model to the data excluding the ith observation
    sub.fit <- lm(author ~ upon + there + consequently + whilst, 
                  data = author.data[-i, ]) # exclude the ith row
    ## predict the author of the ith observation
    hm.classify[i] <- predict(sub.fit, newdata = author.data[i, ])
}
## proportion of Hamilton's papers correctly classified
mean(hm.classify[author.data$author == 1] > 0)
## [1] 1
## proportion of Madison's papers correctly classified
mean(hm.classify[author.data$author == -1] < 0)
## [1] 1
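
The two proportions can also be combined into a single leave-one-out classification accuracy. A minimal sketch:

## overall proportion of papers whose author is correctly predicted
mean(sign(hm.classify) == author.data$author)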


disputed <- c(49, 50:57, 62, 63) # the 11 papers with disputed authorship
tf.disputed <- as.data.frame(tfm[disputed, ])

## predict the authorship of the disputed papers
pred <- predict(hm.fit, newdata = tf.disputed)
pred # predicted values
##    fp49.txt    fp50.txt    fp51.txt    fp52.txt    fp53.txt    fp54.txt 
## -0.99831799 -0.06759254 -1.53243206 -0.26288400 -0.54584900 -0.56566555 
##    fp55.txt    fp56.txt    fp57.txt    fp62.txt    fp63.txt 
##  0.04376632 -0.57115610 -1.22289415 -1.00675456 -0.21939646
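
Because positive values were coded as Hamilton and negative values as Madison, the continuous predictions translate directly into author labels. A minimal sketch:

ifelse(pred > 0, "Hamilton", "Madison")  # classify each disputed paper by the sign of its prediction
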
## fitted values for the papers written by Hamilton: red squares
plot(hamilton, hm.fitted[author.data$author == 1], pch = 15, 
     xlim = c(1, 85), ylim  = c(-2, 2), col = "red", 
     xlab = "Federalist Papers", ylab = "Predicted values")
abline(h = 0, lty = "dashed")

## fitted values for the papers written by Madison: blue circles
points(madison, hm.fitted[author.data$author == -1], 
       pch = 16, col = "blue")

## predicted values for the disputed papers: black triangles
points(disputed, pred, pch = 17) 


References

  1. 社会科学のためのデータ分析入門(上), 岩波書店.

  2. 社会科学のためのデータ分析入門(下), 岩波書店.

  3. F. Mosteller and D. L. Wallace (1963) “Inference in an authorship problem.” Journal of the American Statistical Association, vol. 58, no. 302, pp. 275-309.