# columns 562, 563 are subject and activity; not accelerometer data
svd1 = svd(scale(test[, -c(562, 563)]))
max.sig.num = 12
plot(svd1$d^2/sum(svd1$d^2) * 100, ylab = "% variance explained by each sigma",
xlab = expression(paste(sigma, "s in ", Sigma, ", in descending order of priority/explanatory power")),
main = paste("Variance explained by sigmas\n1st 2 =", round(sum(svd1$d[1:2]^2)/sum(svd1$d^2) *
100, digits = 2), "%\n1st", max.sig.num, " =", round(sum(svd1$d[1:max.sig.num]^2)/sum(svd1$d^2) *
100, digits = 2), "%"))
par(mfrow = c(2, 2))
for (i in 1:4) {
plot(svd1$u[, i], col = numericActivity.test, pch = 19, xlab = "Observations for test data",
ylab = "value for a specific activity", main = paste("U vector #", i))
abline(h = 0)
if (i == 4)
legend(650, -0.025, legend = unique(test$activity), col = unique(numericActivity.test),
pch = 19)
}
par(mfrow = c(2, 2))
for (i in 1:12) {
x = which.max(svd1$v[, i]) # max contributor variable
plot(svd1$v[, i], pch = 19, xlab = paste(x, names(test[x]), "=", round(svd1$v[x,
i], digits = 3)), ylab = paste("weight of each variable in U_", i, sep = ""),
cex = 0.75, main = paste("V vector #", i))
points(x, svd1$v[x, i], cex = 2, col = "red")
abline(h = 0)
print(paste("V_", i, ": ", x, " ", names(test[x]), " = ", round(svd1$v[x,
i], digits = 3), sep = ""))
}
## [1] "V_1: 13 tBodyAcc-min()-X = 0.056"
## [1] "V_2: 75 tGravityAcc-arCoeff()-Z,2 = 0.12"
## [1] "V_3: 51 tGravityAcc-max()-Y = 0.183"
## [1] "V_4: 515 fBodyAccMag-kurtosis() = 0.135"
## [1] "V_5: 56 tGravityAcc-sma() = 0.144"
## [1] "V_6: 461 fBodyGyro-bandsEnergy()-1,8 = 0.118"
## [1] "V_7: 153 tBodyGyro-arCoeff()-Y,4 = 0.153"
## [1] "V_8: 158 tBodyGyro-correlation()-X,Y = 0.116"
## [1] "V_9: 452 fBodyGyro-meanFreq()-X = 0.179"
## [1] "V_10: 32 tBodyAcc-arCoeff()-Y,3 = 0.18"
## [1] "V_11: 115 tBodyAccJerk-arCoeff()-Z,2 = 0.163"
## [1] "V_12: 122 tBodyGyro-mean()-Y = 0.23"
par(mfrow = c(2, 2))
# establish a tree height for cluster grouping
max.tree.height.orig = 1.25
max.tree.height.plus1 = 1.25
max.tree.height.plus12 = 2.75
max.tree.height.all = 12
distanceMatOrig <- dist(test[, 10:12], method = "euclidean")
hclustOrig <- hclust(distanceMatOrig)
myplclust(hclustOrig, lab.col = numericActivity.test, main = "Original")
abline(h = max.tree.height.orig, col = "red")
distanceMatrix.plus1 <- dist(test[, c(10:12, 14)], method = "euclidean")
hclustering.plus1 <- hclust(distanceMatrix.plus1)
myplclust(hclustering.plus1, lab.col = numericActivity.test, main = "Add #14")
abline(h = max.tree.height.plus1, col = "red")
distanceMatrix.plus12 <- dist(test[, c(10:12, 14, 296, 67, 451, 559, 41, 33,
188, 377, 152, 459, 111)], method = "euclidean")
hclustering.plus12 <- hclust(distanceMatrix.plus12)
myplclust(hclustering.plus12, lab.col = numericActivity.test, main = "Add #s 14,296,67,451,559,41,33,188,\n377,152,459,111")
abline(h = max.tree.height.plus12, col = "red")
distanceMatrix.all <- dist(test[, 1:561], method = "euclidean")
hclustering.all <- hclust(distanceMatrix.all)
myplclust(hclustering.all, lab.col = numericActivity.test, main = "Use them all 1:561")
abline(h = max.tree.height.all, col = "red")
library(dynamicTreeCut)
par(mfrow = c(2, 2))
cut.orig = cutreeDynamicTree(hclustOrig, maxTreeHeight = max.tree.height.orig,
deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.orig ~ numericActivity.test, main = "original", xaxt = "n", xlab = "activity",
ylab = "clusters")
axis(1, at = unique(numericActivity.test), labels = unique(test$activity))
cut.plus1 = cutreeDynamicTree(hclustering.plus1, maxTreeHeight = max.tree.height.plus1,
deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.plus1 ~ numericActivity.test, main = "plus1", xaxt = "n", xlab = "activity",
ylab = "clusters")
axis(1, at = unique(numericActivity.test), labels = unique(test$activity))
cut.plus12 = cutreeDynamicTree(hclustering.plus12, maxTreeHeight = max.tree.height.plus12,
deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.plus12 ~ numericActivity.test, main = "plus12", xaxt = "n", xlab = "activity",
ylab = "clusters")
axis(1, at = unique(numericActivity.test), labels = unique(test$activity))
cut.all = cutreeDynamicTree(hclustering.all, maxTreeHeight = max.tree.height.all,
deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.all ~ numericActivity.test, main = "all", xaxt = "n", xlab = "activity",
ylab = "clusters")
axis(1, at = unique(numericActivity.test), labels = unique(test$activity))
# table.orig = table(cut.orig, numericActivity.test)
table.plus1 = table(cut.plus1, numericActivity.test)
table.plus12 = table(cut.plus12, numericActivity.test)
colnames(table.plus12) = unique(test$activity)
print(table.plus12)
## numericActivity.test
## cut.plus12 standing sitting laying walk walkdown walkup
## 1 0 0 0 180 200 50
## 2 0 0 0 49 0 166
## 3 0 51 81 0 0 0
## 4 114 0 0 0 0 0
## 5 111 0 0 0 0 0
## 6 0 68 31 0 0 0
## 7 0 38 46 0 0 0
## 8 0 29 55 0 0 0
## 9 0 53 16 0 0 0
## 10 68 0 0 0 0 0
## 11 0 14 28 0 0 0
## 12 0 11 26 0 0 0
table.all = table(cut.all, numericActivity.test)
activity.names = unique(test$activity)
print("activity all plus12 plus1")
## [1] "activity all plus12 plus1"
for (i in 1:6) {
activity.name = activity.names[i]
# activity.orig =
# round(max(table.orig[,i])/sum(table.orig[,i])*100,digits=0)
activity.plus1 = round(max(table.plus1[, i])/sum(table.plus1[, i]) * 100,
digits = 0)
activity.plus12 = round(max(table.plus12[, i])/sum(table.plus12[, i]) *
100, digits = 0)
activity.all = round(max(table.all[, i])/sum(table.all[, i]) * 100, digits = 0)
cat("\n", activity.name, ":\t", activity.all, "%", "\t", activity.plus12,
"%", "\t", activity.plus1, "%", sep = "")
}
##
## standing: 27% 39% 51%
## sitting: 40% 26% 49%
## laying: 80% 29% 36%
## walk: 67% 79% 44%
## walkdown: 52% 100% 30%
## walkup: 100% 77% 50%
kClust <- kmeans(test[, -c(562, 563)], centers = 6, nstart = 100, iter.max = 100)
str(kClust)
## List of 9
## $ cluster : Named int [1:1485] 3 3 3 3 3 3 3 3 3 3 ...
## ..- attr(*, "names")= chr [1:1485] "5868" "5869" "5870" "5871" ...
## $ centers : num [1:6, 1:561] 0.275 0.269 0.278 0.299 0.278 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:6] "1" "2" "3" "4" ...
## .. ..$ : chr [1:561] "tBodyAcc-mean()-X" "tBodyAcc-mean()-Y" "tBodyAcc-mean()-Z" "tBodyAcc-std()-X" ...
## $ totss : num 72935
## $ withinss : num [1:6] 3662 1586 3491 2886 4013 ...
## $ tot.withinss: num 20853
## $ betweenss : num 52082
## $ size : int [1:6] 219 115 345 169 276 361
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
table(kClust$cluster, test$activity)
##
## laying sitting standing walk walkdown walkup
## 1 18 69 132 0 0 0
## 2 0 0 0 0 1 114
## 3 0 194 151 0 0 0
## 4 0 0 0 7 162 0
## 5 275 1 0 0 0 0
## 6 0 0 0 222 37 102
par(mfrow = c(2, 2))
for (i in 1:6) {
titlelab = paste("Cluster #", i)
if (i == 1)
titlelab = paste(titlelab, "50/50 sitting/standing")
if (i == 2)
titlelab = paste(titlelab, "60% walk, 23/17 walkdown/walkup")
if (i == 3)
titlelab = paste(titlelab, "100% laying")
if (i == 4)
titlelab = paste(titlelab, "standing, sitting, laying")
if (i == 5)
titlelab = paste(titlelab, "75% walkdown,\n12.5/12.5 walk/walkup")
if (i == 6)
titlelab = paste(titlelab, "Mostly walkup with other bits")
y = which.max(kClust$center[i, ]) # max contributor variable
plot(kClust$center[i, ], pch = 19, cex = 0.5, ylab = "Cluster Center", xlab = paste(y,
names(test[y]), "=", round(kClust$center[i, y], digits = 3)), main = titlelab)
abline(h = 0)
points(y, kClust$center[i, y], cex = 2, col = "red")
}
dim(kClust$center)
## [1] 6 561
par(opar)
par(mfrow = c(1, 3))
for (i in c(174, 41, 53)) {
if (i == 174)
xlabtitle = "Crosses are KMeans centers"
if (i == 41)
xlabtitle = paste("X-axis = each of", nrow(test), "observations")
if (i == 53)
xlabtitle = "If I did this right, it ain't much help"
plot(1:nrow(test), test[1:nrow(test), i], pch = 19, cex = 0.5, main = paste(i,
"=", names(test)[i]), xlab = xlabtitle, ylab = "Value of each observation",
col = numericActivity.test)
abline(h = 0)
points(kClust$centers[, i], col = 1:6, pch = 3, cex = 3, lwd = 3)
if (i == 41)
legend(500, 0.6, legend = unique(test$activity), col = unique(numericActivity.test),
pch = 19)
}
# kClust is all 561 variables kClust.plus1 adds one SVD variable
# kClust.plus12 adds 12 SVD variables
kClust.plus1 <- kmeans(test[, c(10:12, 14)], centers = 6, nstart = 100, iter.max = 100)
kClust.plus12 <- kmeans(test[, c(10:12, 14, 296, 67, 451, 559, 41, 33, 188,
377, 152, 459, 111)], centers = 6, nstart = 100, iter.max = 100)
(table.data = table(kClust$cluster, test$activity))
##
## laying sitting standing walk walkdown walkup
## 1 18 69 132 0 0 0
## 2 0 0 0 0 1 114
## 3 0 194 151 0 0 0
## 4 0 0 0 7 162 0
## 5 275 1 0 0 0 0
## 6 0 0 0 222 37 102
activity.names = dimnames(table.data)[[2]]
table.data.plus1 = table(kClust.plus1$cluster, test$activity)
table.data.plus12 = table(kClust.plus12$cluster, test$activity)
cat("\n Confidence Predictions from KMeans \n")
##
## Confidence Predictions from KMeans
cat("Activity 561 plus12 plus1 \n")
## Activity 561 plus12 plus1
for (i in 1:6) {
activity.name = activity.names[i]
activity.best = round(max(table.data[, i])/sum(table.data[, i]) * 100, digits = 0)
activity.best1 = round(max(table.data.plus1[, i])/sum(table.data.plus1[,
i]) * 100, digits = 0)
activity.best12 = round(max(table.data.plus12[, i])/sum(table.data.plus12[,
i]) * 100, digits = 0)
cat("\n", activity.name, ":\t", activity.best, "%", "\t", activity.best12,
"%", "\t", activity.best1, "%", sep = "")
}
##
## laying: 94% 100% 100%
## sitting: 73% 52% 100%
## standing: 53% 64% 100%
## walk: 97% 71% 58%
## walkdown: 81% 98% 48%
## walkup: 53% 52% 39%
boxplot(as.data.frame(t(kClust$center)), main = "Distributions of the KMeans centers for each cluster",
xlab = "Clusters")
for (i in 1:6) {
d <- density(t(kClust$center)[, i])
if (i == 1)
plot(d, type = "n", main = "Densities of the centers for each cluster")
lines(d, col = i)
}
Confounding is either omitted variables or variables that are highly correlated with the response variable. Collinearity is predictor variables that are highly correlated with each other.
test.cor = cor(test[, 1:561])
dim(test.cor)
## [1] 561 561
for (i in 1:dim(test.cor)[1]) {
test.cor[i, i] = 0 # get rid of correaltion with self
}
dim(which(test.cor == 1, arr.ind = TRUE, useNames = TRUE))
## [1] 42 2
which(test.cor == 1, arr.ind = TRUE, useNames = TRUE)
## row col
## tBodyAccMag-sma() 206 201
## tGravityAccMag-mean() 214 201
## tGravityAccMag-sma() 219 201
## tGravityAccMag-std() 215 202
## tGravityAccMag-mad() 216 203
## tGravityAccMag-max() 217 204
## tGravityAccMag-min() 218 205
## tBodyAccMag-mean() 201 206
## tGravityAccMag-mean() 214 206
## tGravityAccMag-sma() 219 206
## tGravityAccMag-energy() 220 207
## tGravityAccMag-iqr() 221 208
## tGravityAccMag-entropy() 222 209
## tGravityAccMag-arCoeff()2 224 211
## tGravityAccMag-arCoeff()3 225 212
## tGravityAccMag-arCoeff()4 226 213
## tBodyAccMag-mean() 201 214
## tBodyAccMag-sma() 206 214
## tGravityAccMag-sma() 219 214
## tBodyAccMag-std() 202 215
## tBodyAccMag-mad() 203 216
## tBodyAccMag-max() 204 217
## tBodyAccMag-min() 205 218
## tBodyAccMag-mean() 201 219
## tBodyAccMag-sma() 206 219
## tGravityAccMag-mean() 214 219
## tBodyAccMag-energy() 207 220
## tBodyAccMag-iqr() 208 221
## tBodyAccMag-entropy() 209 222
## tBodyAccMag-arCoeff()2 211 224
## tBodyAccMag-arCoeff()3 212 225
## tBodyAccMag-arCoeff()4 213 226
## tBodyGyroMag-sma() 245 240
## tBodyGyroMag-mean() 240 245
## fBodyAccMag-sma() 508 503
## fBodyAccMag-mean() 503 508
## fBodyBodyAccJerkMag-sma() 521 516
## fBodyBodyAccJerkMag-mean() 516 521
## fBodyBodyGyroMag-sma() 534 529
## fBodyBodyGyroMag-mean() 529 534
## fBodyBodyGyroJerkMag-sma() 547 542
## fBodyBodyGyroJerkMag-mean() 542 547
print(str(.Platform))
## List of 8
## $ OS.type : chr "windows"
## $ file.sep : chr "/"
## $ dynlib.ext: chr ".dll"
## $ GUI : chr "RTerm"
## $ endian : chr "little"
## $ pkgType : chr "win.binary"
## $ path.sep : chr ";"
## $ r_arch : chr "x64"
## NULL
print(version)
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 3
## minor 0.2
## year 2013
## month 09
## day 25
## svn rev 63987
## language R
## version.string R version 3.0.2 (2013-09-25)
## nickname Frisbee Sailing
print(sessionInfo(), locale = FALSE)
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dynamicTreeCut_1.60-1 knitr_1.5
##
## loaded via a namespace (and not attached):
## [1] evaluate_0.5.1 formatR_0.10 stringr_0.6.2 tools_3.0.2
print(Sys.time())
## [1] "2013-12-07 14:59:11 EST"