Load data
# Read the data set from a path relative to the working directory. Every
# column is later passed to dist() and colMeans(), so the file is expected
# to be entirely numeric.
dailykos <- read.csv(file = "dailykos.csv")
Compute hierarchical clustering and plot the dendrogram
# Pairwise distances between the rows of dailykos. No `method` is given to
# dist(), so its default metric applies.
distances <- dist(x = dailykos)
# Agglomerative clustering on those distances with the "ward.D" linkage.
hierClust <- hclust(d = distances, method = "ward.D")
# Draw the resulting tree.
plot(x = hierClust)
Create 7 clusters
# Cut the tree into seven groups; docClust is later used to split the rows.
docClust <- cutree(tree = hierClust, k = 7)
# Tabulate how many rows landed in each group.
print(table(docClust))
## docClust
## 1 2 3 4 5 6 7
## 1266 321 374 139 407 714 209
# Partition the rows of dailykos by cluster label: one data frame per cluster.
docClusters <- split(x = dailykos, f = docClust)
Get the top 6 words of cluster 1:
# Column means of cluster 1, sorted ascending; the last six entries are the
# six largest means.
tail(
  sort(colMeans(docClusters[[1]])),
  n = 6
)
## state republican poll democrat kerry bush
## 0.7575039 0.7590837 0.9036335 0.9194313 1.0624013 1.7053712
Get top 6 words of all clusters
# For each hierarchical cluster: average every column, sort ascending, and
# keep the six largest means. The argument is deliberately not named `c`, so
# base::c() is not masked inside the function.
topWords <- lapply(docClusters, function(cluster) {
  tail(sort(colMeans(cluster)), n = 6)
})
print(topWords)
## $`1`
## state republican poll democrat kerry bush
## 0.7575039 0.7590837 0.9036335 0.9194313 1.0624013 1.7053712
##
## $`2`
## bush democrat challenge vote poll november
## 2.847352 2.850467 4.096573 4.398754 4.847352 10.339564
##
## $`3`
## elect parties state republican democrat bush
## 1.647059 1.665775 2.320856 2.524064 3.823529 4.406417
##
## $`4`
## campaign voter presided poll bush kerry
## 1.431655 1.539568 1.625899 3.589928 7.834532 8.438849
##
## $`5`
## american presided administration war iraq
## 1.090909 1.120393 1.230958 1.776413 2.427518
## bush
## 3.941032
##
## $`6`
## race bush kerry elect democrat poll
## 0.4579832 0.4887955 0.5168067 0.5350140 0.5644258 0.5812325
##
## $`7`
## democrat clark edward poll kerry dean
## 2.148325 2.497608 2.607656 2.765550 3.952153 5.803828
K-means clustering
# Seed the RNG immediately before kmeans() so the clustering is reproducible.
set.seed(1000)
# Seven clusters, with up to 1000 iterations allowed.
kmClust <- kmeans(x = dailykos, centers = 7, iter.max = 1000)
# One data frame per k-means cluster, keyed by the assigned cluster label.
kmClusters <- split(x = dailykos, f = kmClust$cluster)
Properties of K-means clusters
# Number of rows assigned to k-means cluster 3.
print(nrow(kmClusters[[3]]))
## [1] 277
# Row count of every k-means cluster. vapply() pins the result to exactly one
# integer per cluster, whereas sapply() silently changes its return type on
# empty or irregular input.
print(vapply(kmClusters, nrow, integer(1)))
## 1 2 3 4 5 6 7
## 146 144 277 2063 163 329 308
6 most frequent words from each k-means cluster:
# For each k-means cluster: average every column, sort ascending, and keep
# the six largest means. The argument is deliberately not named `c`, so
# base::c() is not masked inside the function.
kmTopWords <- lapply(kmClusters, function(cluster) {
  tail(sort(colMeans(cluster)), n = 6)
})
print(kmTopWords)
## $`1`
## state iraq kerry administration presided
## 1.609589 1.616438 1.636986 2.664384 2.767123
## bush
## 11.431507
##
## $`2`
## primaries democrat edward clark kerry dean
## 2.319444 2.694444 2.798611 3.090278 4.979167 8.277778
##
## $`3`
## administration iraqi american bush war
## 1.389892 1.610108 1.685921 2.610108 3.025271
## iraq
## 4.093863
##
## $`4`
## elect republican kerry poll democrat bush
## 0.6010664 0.6175473 0.6495395 0.7474552 0.7891420 1.1473582
##
## $`5`
## race senate state parties republican democrat
## 2.484663 2.650307 3.521472 3.619632 4.638037 6.993865
##
## $`6`
## democrat bush challenge vote poll november
## 2.899696 2.960486 4.121581 4.446809 4.872340 10.370821
##
## $`7`
## presided voter campaign poll bush kerry
## 1.324675 1.334416 1.383117 2.788961 5.970779 6.480519