Load data

# Read dailykos.csv (path is relative to the working directory) into a
# data frame.
dailykos <- read.csv(file = "dailykos.csv")

Compute hierarchical clustering and plot the dendrogram

# Pairwise Euclidean distances between the rows of the data
# (Euclidean is dist()'s default; spelled out here for clarity).
distances <- dist(dailykos, method = "euclidean")

# Agglomerative clustering on those distances.
# NOTE(review): per the hclust() documentation, "ward.D" only corresponds to
# Ward's criterion when given squared distances; "ward.D2" may be what is
# intended. Left unchanged so the recorded results below stay valid.
hierClust <- hclust(distances, method = "ward.D")

# Draw the dendrogram.
plot(hierClust)

Create 7 clusters

# Cut the tree into 7 groups; the result is one cluster label per row.
docClust <- cutree(hierClust, k = 7)

# Number of rows assigned to each cluster.
table(docClust)
## docClust
##    1    2    3    4    5    6    7 
## 1266  321  374  139  407  714  209
# Partition the rows by cluster label: a list of data frames, one per
# cluster, named by the label.
docClusters <- split(x = dailykos, f = docClust)

Get the top 6 words of cluster 1:

# Column means for cluster 1, sorted ascending; the last 6 entries are the
# columns with the highest means.
tail(
  sort(colMeans(docClusters[[1]])),
  n = 6
)
##      state republican       poll   democrat      kerry       bush 
##  0.7575039  0.7590837  0.9036335  0.9194313  1.0624013  1.7053712

Get top 6 words of all clusters

# For every hierarchical cluster, keep the 6 columns with the highest mean
# (ascending order, so the largest is last), then display the result.
topWords <- lapply(docClusters, function(cluster) {
  tail(sort(colMeans(cluster)), n = 6)
})
topWords
## $`1`
##      state republican       poll   democrat      kerry       bush 
##  0.7575039  0.7590837  0.9036335  0.9194313  1.0624013  1.7053712 
## 
## $`2`
##      bush  democrat challenge      vote      poll  november 
##  2.847352  2.850467  4.096573  4.398754  4.847352 10.339564 
## 
## $`3`
##      elect    parties      state republican   democrat       bush 
##   1.647059   1.665775   2.320856   2.524064   3.823529   4.406417 
## 
## $`4`
## campaign    voter presided     poll     bush    kerry 
## 1.431655 1.539568 1.625899 3.589928 7.834532 8.438849 
## 
## $`5`
##       american       presided administration            war           iraq 
##       1.090909       1.120393       1.230958       1.776413       2.427518 
##           bush 
##       3.941032 
## 
## $`6`
##      race      bush     kerry     elect  democrat      poll 
## 0.4579832 0.4887955 0.5168067 0.5350140 0.5644258 0.5812325 
## 
## $`7`
## democrat    clark   edward     poll    kerry     dean 
## 2.148325 2.497608 2.607656 2.765550 3.952153 5.803828

K-means clustering

# Fix the RNG seed immediately before kmeans() so its random choice of
# starting centres is reproducible.
set.seed(1000)
kmClust <- kmeans(dailykos, centers = 7, iter.max = 1000)

# Partition the rows by their k-means cluster assignment.
kmClusters <- split(x = dailykos, f = kmClust[["cluster"]])

Properties of K-means clusters

# Number of rows in k-means cluster 3.
nrow(kmClusters[[3]])
## [1] 277
# Number of rows in every k-means cluster, as a named integer vector.
# vapply() is used instead of sapply() so the result type is fixed
# (one integer per cluster) regardless of the input.
vapply(kmClusters, nrow, integer(1))
##    1    2    3    4    5    6    7 
##  146  144  277 2063  163  329  308

6 most frequent words from each k-means cluster:

# For every k-means cluster, keep the 6 columns with the highest mean
# (ascending order, so the largest is last), then display the result.
kmTopWords <- lapply(kmClusters, function(cluster) {
  tail(sort(colMeans(cluster)), n = 6)
})
kmTopWords
## $`1`
##          state           iraq          kerry administration       presided 
##       1.609589       1.616438       1.636986       2.664384       2.767123 
##           bush 
##      11.431507 
## 
## $`2`
## primaries  democrat    edward     clark     kerry      dean 
##  2.319444  2.694444  2.798611  3.090278  4.979167  8.277778 
## 
## $`3`
## administration          iraqi       american           bush            war 
##       1.389892       1.610108       1.685921       2.610108       3.025271 
##           iraq 
##       4.093863 
## 
## $`4`
##      elect republican      kerry       poll   democrat       bush 
##  0.6010664  0.6175473  0.6495395  0.7474552  0.7891420  1.1473582 
## 
## $`5`
##       race     senate      state    parties republican   democrat 
##   2.484663   2.650307   3.521472   3.619632   4.638037   6.993865 
## 
## $`6`
##  democrat      bush challenge      vote      poll  november 
##  2.899696  2.960486  4.121581  4.446809  4.872340 10.370821 
## 
## $`7`
## presided    voter campaign     poll     bush    kerry 
## 1.324675 1.334416 1.383117 2.788961 5.970779 6.480519