Data Analysis Project 2

Step 4 Use the Models on Test Data

George Fisher george@georgefisher.com

Observations


Singular value decomposition

with ALL quantitative variables in our test data, scaled

# columns 562, 563 are subject and activity; not accelerometer data
svd1 = svd(scale(test[, -c(562, 563)]))

Plot the sigmas to see how important they are

max.sig.num = 12
plot(svd1$d^2/sum(svd1$d^2) * 100, ylab = "% variance explained by each sigma", 
    xlab = expression(paste(sigma, "s in ", Sigma, ", in descending order of priority/explanatory power")), 
    main = paste("Variance explained by sigmas\n1st 2 =", round(sum(svd1$d[1:2]^2)/sum(svd1$d^2) * 
        100, digits = 2), "%\n1st", max.sig.num, " =", round(sum(svd1$d[1:max.sig.num]^2)/sum(svd1$d^2) * 
        100, digits = 2), "%"))

plot of chunk s4.sigma.plot


U vectors are orthogonal to each other: uncorrelated

X-Axis is each observation for subject 1

Y-Axis is the value recorded for the activity (coded by observation number in the numericActivity vector) for that observation

par(mfrow = c(2, 2))

for (i in 1:4) {

    plot(svd1$u[, i], col = numericActivity.test, pch = 19, xlab = "Observations for test data", 
        ylab = "value for a specific activity", main = paste("U vector #", i))
    abline(h = 0)
    if (i == 4) 
        legend(650, -0.025, legend = unique(test$activity), col = unique(numericActivity.test), 
            pch = 19)
}

plot of chunk s4.plotUs


Find maximum contributing variable

in the V vectors

X-Axis is each variable

Y-Axis is the weight each variable contributes to the value in the corresponding U vector (one U vector per observation)

par(mfrow = c(2, 2))
for (i in 1:12) {

    x = which.max(svd1$v[, i])  # max contributor variable
    plot(svd1$v[, i], pch = 19, xlab = paste(x, names(test[x]), "=", round(svd1$v[x, 
        i], digits = 3)), ylab = paste("weight of each variable in U_", i, sep = ""), 
        cex = 0.75, main = paste("V vector #", i))

    points(x, svd1$v[x, i], cex = 2, col = "red")

    abline(h = 0)
    print(paste("V_", i, ": ", x, " ", names(test[x]), " = ", round(svd1$v[x, 
        i], digits = 3), sep = ""))
}
## [1] "V_1: 13 tBodyAcc-min()-X = 0.056"
## [1] "V_2: 75 tGravityAcc-arCoeff()-Z,2 = 0.12"
## [1] "V_3: 51 tGravityAcc-max()-Y = 0.183"

plot of chunk s4.maxcontributor

## [1] "V_4: 515 fBodyAccMag-kurtosis() = 0.135"
## [1] "V_5: 56 tGravityAcc-sma() = 0.144"
## [1] "V_6: 461 fBodyGyro-bandsEnergy()-1,8 = 0.118"
## [1] "V_7: 153 tBodyGyro-arCoeff()-Y,4 = 0.153"

plot of chunk s4.maxcontributor

## [1] "V_8: 158 tBodyGyro-correlation()-X,Y = 0.116"
## [1] "V_9: 452 fBodyGyro-meanFreq()-X = 0.179"
## [1] "V_10: 32 tBodyAcc-arCoeff()-Y,3 = 0.18"
## [1] "V_11: 115 tBodyAccJerk-arCoeff()-Z,2 = 0.163"

plot of chunk s4.maxcontributor

## [1] "V_12: 122 tBodyGyro-mean()-Y = 0.23"

Try the cluster again

par(mfrow = c(2, 2))

# establish a tree height for cluster grouping
max.tree.height.orig = 1.25
max.tree.height.plus1 = 1.25
max.tree.height.plus12 = 2.75
max.tree.height.all = 12

distanceMatOrig <- dist(test[, 10:12], method = "euclidean")
hclustOrig <- hclust(distanceMatOrig)
myplclust(hclustOrig, lab.col = numericActivity.test, main = "Original")
abline(h = max.tree.height.orig, col = "red")

distanceMatrix.plus1 <- dist(test[, c(10:12, 14)], method = "euclidean")
hclustering.plus1 <- hclust(distanceMatrix.plus1)
myplclust(hclustering.plus1, lab.col = numericActivity.test, main = "Add #14")
abline(h = max.tree.height.plus1, col = "red")

distanceMatrix.plus12 <- dist(test[, c(10:12, 14, 296, 67, 451, 559, 41, 33, 
    188, 377, 152, 459, 111)], method = "euclidean")
hclustering.plus12 <- hclust(distanceMatrix.plus12)
myplclust(hclustering.plus12, lab.col = numericActivity.test, main = "Add #s 14,296,67,451,559,41,33,188,\n377,152,459,111")
abline(h = max.tree.height.plus12, col = "red")

distanceMatrix.all <- dist(test[, 1:561], method = "euclidean")
hclustering.all <- hclust(distanceMatrix.all)
myplclust(hclustering.all, lab.col = numericActivity.test, main = "Use them all 1:561")
abline(h = max.tree.height.all, col = "red")

plot of chunk s4.newcluster


Analyze the dendograms

library(dynamicTreeCut)
par(mfrow = c(2, 2))

cut.orig = cutreeDynamicTree(hclustOrig, maxTreeHeight = max.tree.height.orig, 
    deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.orig ~ numericActivity.test, main = "original", xaxt = "n", xlab = "activity", 
    ylab = "clusters")
axis(1, at = unique(numericActivity.test), labels = unique(test$activity))

cut.plus1 = cutreeDynamicTree(hclustering.plus1, maxTreeHeight = max.tree.height.plus1, 
    deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.plus1 ~ numericActivity.test, main = "plus1", xaxt = "n", xlab = "activity", 
    ylab = "clusters")
axis(1, at = unique(numericActivity.test), labels = unique(test$activity))

cut.plus12 = cutreeDynamicTree(hclustering.plus12, maxTreeHeight = max.tree.height.plus12, 
    deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.plus12 ~ numericActivity.test, main = "plus12", xaxt = "n", xlab = "activity", 
    ylab = "clusters")
axis(1, at = unique(numericActivity.test), labels = unique(test$activity))

cut.all = cutreeDynamicTree(hclustering.all, maxTreeHeight = max.tree.height.all, 
    deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.all ~ numericActivity.test, main = "all", xaxt = "n", xlab = "activity", 
    ylab = "clusters")
axis(1, at = unique(numericActivity.test), labels = unique(test$activity))

plot of chunk s4.analyze.dendograms

# table.orig = table(cut.orig, numericActivity.test)
table.plus1 = table(cut.plus1, numericActivity.test)

table.plus12 = table(cut.plus12, numericActivity.test)
colnames(table.plus12) = unique(test$activity)
print(table.plus12)
##           numericActivity.test
## cut.plus12 standing sitting laying walk walkdown walkup
##         1         0       0      0  180      200     50
##         2         0       0      0   49        0    166
##         3         0      51     81    0        0      0
##         4       114       0      0    0        0      0
##         5       111       0      0    0        0      0
##         6         0      68     31    0        0      0
##         7         0      38     46    0        0      0
##         8         0      29     55    0        0      0
##         9         0      53     16    0        0      0
##         10       68       0      0    0        0      0
##         11        0      14     28    0        0      0
##         12        0      11     26    0        0      0

table.all = table(cut.all, numericActivity.test)

activity.names = unique(test$activity)
print("activity all plus12 plus1")
## [1] "activity all plus12 plus1"

for (i in 1:6) {
    activity.name = activity.names[i]
    # activity.orig =
    # round(max(table.orig[,i])/sum(table.orig[,i])*100,digits=0)
    activity.plus1 = round(max(table.plus1[, i])/sum(table.plus1[, i]) * 100, 
        digits = 0)
    activity.plus12 = round(max(table.plus12[, i])/sum(table.plus12[, i]) * 
        100, digits = 0)
    activity.all = round(max(table.all[, i])/sum(table.all[, i]) * 100, digits = 0)

    cat("\n", activity.name, ":\t", activity.all, "%", "\t", activity.plus12, 
        "%", "\t", activity.plus1, "%", sep = "")
}
## 
## standing:    27% 39% 51%
## sitting: 40% 26% 49%
## laying:  80% 29% 36%
## walk:    67% 79% 44%
## walkdown:    52% 100%    30%
## walkup:  100%    77% 50%

Let's try K-Means Clustering

kClust <- kmeans(test[, -c(562, 563)], centers = 6, nstart = 100, iter.max = 100)
str(kClust)
## List of 9
##  $ cluster     : Named int [1:1485] 3 3 3 3 3 3 3 3 3 3 ...
##   ..- attr(*, "names")= chr [1:1485] "5868" "5869" "5870" "5871" ...
##  $ centers     : num [1:6, 1:561] 0.275 0.269 0.278 0.299 0.278 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:6] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:561] "tBodyAcc-mean()-X" "tBodyAcc-mean()-Y" "tBodyAcc-mean()-Z" "tBodyAcc-std()-X" ...
##  $ totss       : num 72935
##  $ withinss    : num [1:6] 3662 1586 3491 2886 4013 ...
##  $ tot.withinss: num 20853
##  $ betweenss   : num 52082
##  $ size        : int [1:6] 219 115 345 169 276 361
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
table(kClust$cluster, test$activity)
##    
##     laying sitting standing walk walkdown walkup
##   1     18      69      132    0        0      0
##   2      0       0        0    0        1    114
##   3      0     194      151    0        0      0
##   4      0       0        0    7      162      0
##   5    275       1        0    0        0      0
##   6      0       0        0  222       37    102
par(mfrow = c(2, 2))
for (i in 1:6) {
    titlelab = paste("Cluster #", i)
    if (i == 1) 
        titlelab = paste(titlelab, "50/50 sitting/standing")
    if (i == 2) 
        titlelab = paste(titlelab, "60% walk, 23/17 walkdown/walkup")
    if (i == 3) 
        titlelab = paste(titlelab, "100% laying")
    if (i == 4) 
        titlelab = paste(titlelab, "standing, sitting, laying")
    if (i == 5) 
        titlelab = paste(titlelab, "75% walkdown,\n12.5/12.5 walk/walkup")
    if (i == 6) 
        titlelab = paste(titlelab, "Mostly walkup with other bits")
    y = which.max(kClust$center[i, ])  # max contributor variable
    plot(kClust$center[i, ], pch = 19, cex = 0.5, ylab = "Cluster Center", xlab = paste(y, 
        names(test[y]), "=", round(kClust$center[i, y], digits = 3)), main = titlelab)
    abline(h = 0)

    points(y, kClust$center[i, y], cex = 2, col = "red")
}

plot of chunk s4.kmeans

dim(kClust$center)
## [1]   6 561

plot of chunk s4.kmeans

par(opar)
par(mfrow = c(1, 3))
for (i in c(174, 41, 53)) {
    if (i == 174) 
        xlabtitle = "Crosses are KMeans centers"
    if (i == 41) 
        xlabtitle = paste("X-axis = each of", nrow(test), "observations")
    if (i == 53) 
        xlabtitle = "If I did this right, it ain't much help"
    plot(1:nrow(test), test[1:nrow(test), i], pch = 19, cex = 0.5, main = paste(i, 
        "=", names(test)[i]), xlab = xlabtitle, ylab = "Value of each observation", 
        col = numericActivity.test)
    abline(h = 0)
    points(kClust$centers[, i], col = 1:6, pch = 3, cex = 3, lwd = 3)

    if (i == 41) 
        legend(500, 0.6, legend = unique(test$activity), col = unique(numericActivity.test), 
            pch = 19)
}

plot of chunk s4.kmeansplot

KMeans Confidence Estimates

# kClust is all 561 variables kClust.plus1 adds one SVD variable
# kClust.plus12 adds 12 SVD variables

kClust.plus1 <- kmeans(test[, c(10:12, 14)], centers = 6, nstart = 100, iter.max = 100)
kClust.plus12 <- kmeans(test[, c(10:12, 14, 296, 67, 451, 559, 41, 33, 188, 
    377, 152, 459, 111)], centers = 6, nstart = 100, iter.max = 100)


(table.data = table(kClust$cluster, test$activity))
##    
##     laying sitting standing walk walkdown walkup
##   1     18      69      132    0        0      0
##   2      0       0        0    0        1    114
##   3      0     194      151    0        0      0
##   4      0       0        0    7      162      0
##   5    275       1        0    0        0      0
##   6      0       0        0  222       37    102
activity.names = dimnames(table.data)[[2]]

table.data.plus1 = table(kClust.plus1$cluster, test$activity)
table.data.plus12 = table(kClust.plus12$cluster, test$activity)
cat("\n Confidence Predictions from KMeans \n")
## 
##  Confidence Predictions from KMeans
cat("Activity 561 plus12  plus1 \n")
## Activity 561 plus12  plus1
for (i in 1:6) {
    activity.name = activity.names[i]
    activity.best = round(max(table.data[, i])/sum(table.data[, i]) * 100, digits = 0)
    activity.best1 = round(max(table.data.plus1[, i])/sum(table.data.plus1[, 
        i]) * 100, digits = 0)
    activity.best12 = round(max(table.data.plus12[, i])/sum(table.data.plus12[, 
        i]) * 100, digits = 0)

    cat("\n", activity.name, ":\t", activity.best, "%", "\t", activity.best12, 
        "%", "\t", activity.best1, "%", sep = "")
}
## 
## laying:  94% 100%    100%
## sitting: 73% 52% 100%
## standing:    53% 64% 100%
## walk:    97% 71% 58%
## walkdown:    81% 98% 48%
## walkup:  53% 52% 39%

Plot all the centers

boxplot(as.data.frame(t(kClust$center)), main = "Distributions of the KMeans centers for each cluster", 
    xlab = "Clusters")

plot of chunk s4.allkmeancenters



for (i in 1:6) {
    d <- density(t(kClust$center)[, i])
    if (i == 1) 
        plot(d, type = "n", main = "Densities of the centers for each cluster")
    lines(d, col = i)
}

plot of chunk s4.allkmeancenters


Confounding and Collinearity

Confounding is either omitted variables or variables that are highly correlated with the response variable. Collinearity is predictor variables that are highly correlated with each other.

test.cor = cor(test[, 1:561])
dim(test.cor)
## [1] 561 561
for (i in 1:dim(test.cor)[1]) {
    test.cor[i, i] = 0  # get rid of correaltion with self
}
dim(which(test.cor == 1, arr.ind = TRUE, useNames = TRUE))
## [1] 42  2
which(test.cor == 1, arr.ind = TRUE, useNames = TRUE)
##                             row col
## tBodyAccMag-sma()           206 201
## tGravityAccMag-mean()       214 201
## tGravityAccMag-sma()        219 201
## tGravityAccMag-std()        215 202
## tGravityAccMag-mad()        216 203
## tGravityAccMag-max()        217 204
## tGravityAccMag-min()        218 205
## tBodyAccMag-mean()          201 206
## tGravityAccMag-mean()       214 206
## tGravityAccMag-sma()        219 206
## tGravityAccMag-energy()     220 207
## tGravityAccMag-iqr()        221 208
## tGravityAccMag-entropy()    222 209
## tGravityAccMag-arCoeff()2   224 211
## tGravityAccMag-arCoeff()3   225 212
## tGravityAccMag-arCoeff()4   226 213
## tBodyAccMag-mean()          201 214
## tBodyAccMag-sma()           206 214
## tGravityAccMag-sma()        219 214
## tBodyAccMag-std()           202 215
## tBodyAccMag-mad()           203 216
## tBodyAccMag-max()           204 217
## tBodyAccMag-min()           205 218
## tBodyAccMag-mean()          201 219
## tBodyAccMag-sma()           206 219
## tGravityAccMag-mean()       214 219
## tBodyAccMag-energy()        207 220
## tBodyAccMag-iqr()           208 221
## tBodyAccMag-entropy()       209 222
## tBodyAccMag-arCoeff()2      211 224
## tBodyAccMag-arCoeff()3      212 225
## tBodyAccMag-arCoeff()4      213 226
## tBodyGyroMag-sma()          245 240
## tBodyGyroMag-mean()         240 245
## fBodyAccMag-sma()           508 503
## fBodyAccMag-mean()          503 508
## fBodyBodyAccJerkMag-sma()   521 516
## fBodyBodyAccJerkMag-mean()  516 521
## fBodyBodyGyroMag-sma()      534 529
## fBodyBodyGyroMag-mean()     529 534
## fBodyBodyGyroJerkMag-sma()  547 542
## fBodyBodyGyroJerkMag-mean() 542 547

Info about the system running this code

print(str(.Platform))
## List of 8
##  $ OS.type   : chr "windows"
##  $ file.sep  : chr "/"
##  $ dynlib.ext: chr ".dll"
##  $ GUI       : chr "RTerm"
##  $ endian    : chr "little"
##  $ pkgType   : chr "win.binary"
##  $ path.sep  : chr ";"
##  $ r_arch    : chr "x64"
## NULL
print(version)
##                _                           
## platform       x86_64-w64-mingw32          
## arch           x86_64                      
## os             mingw32                     
## system         x86_64, mingw32             
## status                                     
## major          3                           
## minor          0.2                         
## year           2013                        
## month          09                          
## day            25                          
## svn rev        63987                       
## language       R                           
## version.string R version 3.0.2 (2013-09-25)
## nickname       Frisbee Sailing
print(sessionInfo(), locale = FALSE)
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dynamicTreeCut_1.60-1 knitr_1.5            
## 
## loaded via a namespace (and not attached):
## [1] evaluate_0.5.1 formatR_0.10   stringr_0.6.2  tools_3.0.2
print(Sys.time())
## [1] "2013-12-07 14:59:11 EST"