<< Data Analysis >> Note part 2

Created on 23 June 2013
Revised on Sun Jun 23 14:50:29 2013

12 Clustering example

library(RCurl)  ## complicated, download data from https
## Loading required package: bitops
library(XML)
## Download the Samsung activity data over https and load it into a data frame.
## getURL() returns the whole response body as a single string; it is staged in
## a temporary file so that read.csv() can parse it from disk.
fileUrl <- "https://dl.dropboxusercontent.com/u/8272421/samsungData.csv"
## NOTE(review): ssl.verifypeer = FALSE turns off TLS certificate verification;
## drop it wherever a CA bundle is available to RCurl.
myCsv <- getURL(fileUrl, ssl.verifypeer = FALSE)
temporaryFile <- tempfile()
## Given a path, cat() opens and closes the file itself, so no connection is
## left open if the write fails part-way.
cat(myCsv, file = temporaryFile)
samsungData <- read.csv(temporaryFile)
unlink(temporaryFile)  ## the staged copy is not needed once it has been parsed
table(samsungData$activity)  ## number of observations per activity
## 
##   laying  sitting standing     walk walkdown   walkup 
##     1407     1286     1374     1226      986     1073
## Plot the first two feature columns for subject 1 in side-by-side panels,
## colouring every observation by its activity.
par(mfrow = c(1, 2))  ## Plotting average acceleration for first subject
## Integer code of each activity (its position among the sorted factor levels),
## kept only for the rows that belong to subject 1.
numericActivity <- as.numeric(as.factor(samsungData$activity))[samsungData$subject == 
    1]
plot(samsungData[samsungData$subject == 1, 1], pch = 19, col = numericActivity, 
    ylab = names(samsungData)[1])
plot(samsungData[samsungData$subject == 1, 2], pch = 19, col = numericActivity, 
    ylab = names(samsungData)[2])
## Labels and colours are both derived from the same sorted activity codes, so
## each legend entry is paired with the colour actually used for that activity
## (two independent unique() calls only agree when their first-appearance
## orders happen to match).
legend(150, -0.1, legend = levels(as.factor(samsungData$activity))[sort(unique(numericActivity))], 
    col = sort(unique(numericActivity)), pch = 19)

plot of chunk unnamed-chunk-3

## Hierarchical clustering of subject 1 on feature columns 1:3, drawn with
## myplclust() so the leaves are coloured by activity.
par(mfrow = c(1, 1))
## source() could not open this https URL in the recorded run (unsupported URL
## scheme), which left myplclust() undefined. Fetch the script with getURL()
## instead -- the route that already works for the CSV download -- and source
## a local copy.
## NOTE(review): ssl.verifypeer = FALSE disables certificate checks for code
## that is then executed; remove it wherever a CA bundle is available.
myplclustFile <- tempfile(fileext = ".R")
cat(getURL("https://dl.dropboxusercontent.com/u/8272421/myplclust.R", ssl.verifypeer = FALSE), 
    file = myplclustFile)
source(myplclustFile)
unlink(myplclustFile)  ## the local copy is only needed while sourcing
distanceMatrix <- dist(samsungData[samsungData$subject == 1, 1:3])
hclustering <- hclust(distanceMatrix)  ## Clustering based just on average acceleration
myplclust(hclustering, lab.col = numericActivity)
## Plotting max acceleration for the first subject
par(mfrow = c(1, 2))
## One panel per feature column (10, then 11): subject 1's values in row order,
## coloured by activity and labelled with the column's own name.
invisible(lapply(10:11, function(featureColumn) {
    plot(samsungData[samsungData$subject == 1, featureColumn], pch = 19, 
        col = numericActivity, ylab = names(samsungData)[featureColumn])
}))

plot of chunk unnamed-chunk-5

## Hierarchical clustering of subject 1 on feature columns 10:12, drawn with
## myplclust() so the leaves are coloured by activity.
par(mfrow = c(1, 1))
## source() could not open this https URL in the recorded run (unsupported URL
## scheme), so myplclust() was never defined. If it is still missing, fetch the
## script with getURL() -- the route that already works for the CSV download --
## and source a local copy.
## NOTE(review): ssl.verifypeer = FALSE disables certificate checks for code
## that is then executed; remove it wherever a CA bundle is available.
if (!exists("myplclust", mode = "function")) {
    myplclustFile <- tempfile(fileext = ".R")
    cat(getURL("https://dl.dropboxusercontent.com/u/8272421/myplclust.R", ssl.verifypeer = FALSE), 
        file = myplclustFile)
    source(myplclustFile)
    unlink(myplclustFile)  ## the local copy is only needed while sourcing
}
distanceMatrix <- dist(samsungData[samsungData$subject == 1, 10:12])
hclustering <- hclust(distanceMatrix)  ## Clustering based on maximum acceleration
myplclust(hclustering, lab.col = numericActivity)
## Singular value decomposition
## Two side-by-side panels, one for each of the first two columns of svd1$u.
par(mfrow = c(1, 2))
## Decompose subject 1's rows after dropping columns 562:564 and passing the
## remainder through scale().
svd1 <- svd(
    scale(
        samsungData[samsungData$subject == 1, -c(562, 563, 564)]
    )
)
## The plotted expressions are kept verbatim because plot() uses the deparsed
## argument as the default y-axis label.
plot(svd1$u[, 1], pch = 19, col = numericActivity)
plot(svd1$u[, 2], pch = 19, col = numericActivity)

plot of chunk unnamed-chunk-7

## Back to a single panel, then plot every entry of the second column of
## svd1$v so the largest one can be spotted by eye.
par(mfrow = c(1, 1))
plot(svd1$v[, 2], pch = 19)  ## Find maximum contributor

plot of chunk unnamed-chunk-8

## Position of the largest entry in the second column of svd1$v.
## NOTE(review): which.max() picks the largest signed value, not the largest
## magnitude -- confirm that is the intended notion of "maximum contributor".
maxContrib <- which.max(svd1$v[, 2])
## Distances between subject 1's rows over columns 10:12 plus the column
## selected above.
distanceMatrix <- dist(samsungData[samsungData$subject == 1, c(10:12, maxContrib)])
hclustering <- hclust(distanceMatrix)  ## New clustering with maximum contributor
myplclust(hclustering, lab.col = numericActivity)
## Error: could not find function "myplclust"
names(samsungData)[maxContrib]  ## Name of the maximum-contributor column
## [1] "tBodyGyroMag.arCoeff..2"
## K-means with 6 centers on subject 1's rows (columns 562:564 dropped), run four
## times: twice with a single random start and twice with nstart = 100. After each
## fit, the cluster assignments are cross-tabulated against the recorded activity.
## No seed is set, so cluster numbering differs from run to run.
## First run: nstart is not passed, so kmeans() uses its default.
kClust <- kmeans(samsungData[samsungData$subject == 1, -c(562, 563, 564)], centers = 6)  ## K-means clustering (nstart=1, first try)
table(kClust$cluster, samsungData$activity[samsungData$subject == 1])
##    
##     laying sitting standing walk walkdown walkup
##   1     12      22       26    0        0      0
##   2      0       0        0    6       25     26
##   3     10      24       27    0        0      0
##   4      0       0        0    5       24     27
##   5     11       1        0   43        0      0
##   6     17       0        0   41        0      0
## Second run: same call with nstart = 1 written out explicitly.
kClust <- kmeans(samsungData[samsungData$subject == 1, -c(562, 563, 564)], centers = 6, 
    nstart = 1)  ## K-means clustering (nstart=1, second try)
table(kClust$cluster, samsungData$activity[samsungData$subject == 1])
##    
##     laying sitting standing walk walkdown walkup
##   1     17       0        0   41        0      0
##   2      0       0        0    6       25     26
##   3      0       0        0    5       24     27
##   4     11       1        0   43        0      0
##   5     12      22       26    0        0      0
##   6     10      24       27    0        0      0
## Third run: 100 random starts.
kClust <- kmeans(samsungData[samsungData$subject == 1, -c(562, 563, 564)], centers = 6, 
    nstart = 100)  ## K-means clustering (nstart=100, first try)
table(kClust$cluster, samsungData$activity[samsungData$subject == 1])
##    
##     laying sitting standing walk walkdown walkup
##   1     11       1        0   43        0      0
##   2     10      24       27    0        0      0
##   3     12      22       26    0        0      0
##   4      0       0        0    6       25     26
##   5     17       0        0   41        0      0
##   6      0       0        0    5       24     27
## Fourth run: 100 random starts again; kClust from this fit is what the code
## below the table goes on to use.
kClust <- kmeans(samsungData[samsungData$subject == 1, -c(562, 563, 564)], centers = 6, 
    nstart = 100)  ## K-means clustering (nstart=100, second try)
table(kClust$cluster, samsungData$activity[samsungData$subject == 1])
##    
##     laying sitting standing walk walkdown walkup
##   1     17       0        0   41        0      0
##   2     11       1        0   43        0      0
##   3      0       0        0    6       25     26
##   4     10      24       27    0        0      0
##   5      0       0        0    5       24     27
##   6     12      22       26    0        0      0
## Plot the first 10 coordinates of the center of cluster 1 from the most recent
## kmeans() fit. The component is spelled out as `centers` rather than relying on
## `$` partial matching. Cluster numbering changes between runs, so read the
## cross-tabulation above to see which activities cluster 1 holds.
plot(kClust$centers[1, 1:10], pch = 19, ylab = "Cluster Center", xlab = "")  ## Cluster 1 Variable Centers

plot of chunk unnamed-chunk-15

## Plot the first 10 coordinates of the center of cluster 6 from the most recent
## kmeans() fit. The component is spelled out as `centers` rather than relying on
## `$` partial matching. Cluster numbering changes between runs, so read the
## cross-tabulation above to see which activities cluster 6 holds.
plot(kClust$centers[6, 1:10], pch = 19, ylab = "Cluster Center", xlab = "")  ## Cluster 6 Variable Centers

plot of chunk unnamed-chunk-16