# columns 562, 563 are subject and activity; not accelerometer data
svd1 = svd(scale(train[, -c(562, 563)]))
# dimensions of our vectors
dim(svd1$u)
## [1] 1315 561
dim(diag(svd1$d))
## [1] 561 561
dim(svd1$v)
## [1] 561 561
plot(svd1$u[, 1], col = numericActivity.train, pch = 19, xlab = "Observations for training data",
ylab = "value for a specific activity", main = paste("U vector #", 1))
abline(h = 0)
x = which.max(svd1$v[, 1]) # max contributor variable
plot(svd1$v[, 1], pch = 19, xlab = paste(x, names(train[x]), "=", round(svd1$v[x,
1], digits = 3)), ylab = paste("weight of each variable in U_", 1, sep = ""),
cex = 0.75, main = paste("V vector #", 1))
points(x, svd1$v[x, 1], cex = 2, col = "red")
abline(h = 0)
plot(colSums(svd1$v), main = "sums of every V column vector ... these are weights?",
xlab = "V column vector index", ylab = "sum of each V column vector", pch = 19,
cex = 0.5)
abline(h = 0, col = "red")
for (i in 1:5) {
points(i, sum(svd1$v[, i]), cex = 2, col = "red")
text(i + 15, sum(svd1$v[, i]) + 0.05, labels = as.character(i), col = "blue")
}
par(mfrow = c(2, 1))
plot(colSums(abs(svd1$v)), main = "colSums(abs(svd1$v))", xlab = "V column vector index",
ylab = "sum of the abs()", pch = 19, cex = 0.5)
plot(colSums(svd1$v^2), main = "colSums(svd1$v^2)", xlab = "V column vector index",
ylab = "sum of the squares", pch = 19, cex = 0.5)
## Warning: relative range of values = 26 * EPS, is small (axis 2)
par(opar)
par(mfrow = c(2, 1))
boxplot(svd1$v[, 1:5], main = "1st 5 V column vector values", xlab = "V column vector index")
abline(h = 0, col = "red")
plot(svd1$v[, 1], main = "1st V column vector values", xlab = "1st V column vector entry index",
type = "n")
lines(svd1$v[, 1])
abline(h = 0, col = "red")
par(opar)
plot(rowSums(svd1$v), main = "sums of every V row vector ... these are certainly not weights",
xlab = "V row vector index", ylab = "sum of each V row vector", pch = 19,
cex = 0.5)
abline(h = 0, col = "red")
for (i in 1:5) {
points(i, sum(svd1$v[i, ]), cex = 2, col = "red")
text(i + 15, sum(svd1$v[i, ]) + 0.05, labels = as.character(i), col = "blue")
}
# sum of the sums
print(sum(colSums(svd1$v)))
## [1] -4.27
# 1st 25 sums
print(colSums(svd1$v)[1:25])
## [1] -16.77647 3.88188 3.58181 -1.49766 2.33871 1.73334 -2.51696
## [8] 1.34879 -1.23793 -0.87557 0.55897 1.65384 0.69773 -0.59660
## [15] 0.03812 -0.93638 1.30374 -1.28119 0.18812 0.25587 0.26074
## [22] -0.19345 0.94895 2.07743 -0.21892
scatter.smooth(apply(svd1$v, 2, max), pch = 19, cex = 0.75, col = "blue", main = "max of every V column vector",
xlab = "V column vector index", ylab = "max of each V column vector")
index.v = seq(1:dim(svd1$v)[2])
abline(lm(apply(svd1$v, 2, max) ~ index.v), col = "red")
scatter.smooth(svd1$d, apply(svd1$v, 2, max), main = "loess regression: sigma predicts max?",
xlab = "V column vector index", ylab = "max of V column vector")
max.sig.num = 12
plot(svd1$d^2/sum(svd1$d^2) * 100, ylab = "% variance explained by each sigma",
xlab = expression(paste(sigma, "s in ", Sigma, ", in descending order of priority/explanatory power")),
main = paste("Variance explained by sigmas\n1st 2 =", round(sum(svd1$d[1:2]^2)/sum(svd1$d^2) *
100, digits = 2), "%\n1st", max.sig.num, " =", round(sum(svd1$d[1:max.sig.num]^2)/sum(svd1$d^2) *
100, digits = 2), "%"))
par(mfrow = c(2, 2))
for (i in 1:4) {
plot(svd1$u[, i], col = numericActivity.train, pch = 19, xlab = "Observations for training data",
ylab = "value for a specific activity", main = paste("U vector #", i))
abline(h = 0)
if (i == 4)
legend(650, -0.025, legend = unique(train$activity), col = unique(numericActivity.train),
pch = 19)
}
par(mfrow = c(2, 2))
max.Vs = ""
max.indexes = integer()
for (i in 1:max.sig.num) {
x = which.max(svd1$v[, i]) # max contributor variable
plot(svd1$v[, i], pch = 19, xlab = paste(x, names(train[x]), "=", round(svd1$v[x,
i], digits = 3)), ylab = paste("weight of each variable in U_", i, sep = ""),
cex = 0.75, main = paste("V vector #", i))
points(x, svd1$v[x, i], cex = 2, col = "red")
abline(h = 0)
# print(paste('V_',i,': ',x,' ',names(train[x]),' =
# ',round(svd1$v[x,i],digits=3),sep=''))
max.Vs = paste(max.Vs, "\nV_", i, ": ", x, " ", names(train[x]), " = ",
round(svd1$v[x, i], digits = 3), sep = "")
max.indexes = c(max.indexes, x)
}
cat(max.Vs) # the max variables in priority order
##
## V_1: 14 tBodyAcc-min()-Y = 0.056
## V_2: 296 fBodyAcc-meanFreq()-Z = 0.132
## V_3: 67 tGravityAcc-arCoeff()-X,2 = 0.153
## V_4: 451 fBodyGyro-maxInds-Z = 0.086
## V_5: 559 angle(X,gravityMean) = 0.154
## V_6: 41 tGravityAcc-mean()-X = 0.15
## V_7: 33 tBodyAcc-arCoeff()-Y,4 = 0.137
## V_8: 188 tBodyGyroJerk-arCoeff()-X,3 = 0.114
## V_9: 377 fBodyAccJerk-kurtosis()-X = 0.124
## V_10: 152 tBodyGyro-arCoeff()-Y,3 = 0.179
## V_11: 459 fBodyGyro-skewness()-Z = 0.173
## V_12: 111 tBodyAccJerk-arCoeff()-Y,2 = 0.155
print(max.indexes)
## [1] 14 296 67 451 559 41 33 188 377 152 459 111
# save the max contributing variables
save(numericActivity.test, numericActivity.train, test, train, max.indexes,
file = "train_and_test.Rda")
par(mfrow = c(2, 2))
# establish a tree height for cluster grouping
max.tree.height.orig = 1.5
max.tree.height.plus1 = 1.25
max.tree.height.plus12 = 2.5
max.tree.height.all = 12
distanceMatOrig <- dist(train[, 10:12], method = "euclidean")
hclustOrig <- hclust(distanceMatOrig)
myplclust(hclustOrig, lab.col = numericActivity.train, main = "Original")
abline(h = max.tree.height.orig, col = "red")
distanceMatrix.plus1 <- dist(train[, c(10:12, max.indexes[1])], method = "euclidean")
hclustering.plus1 <- hclust(distanceMatrix.plus1)
myplclust(hclustering.plus1, lab.col = numericActivity.train, main = paste("Add #",
max.indexes[1], sep = ""))
abline(h = max.tree.height.plus1, col = "red")
# distanceMatrix.plus12 <- dist(train[,
# c(10:12,14,296,67,451,559,41,33,188,377,152,459,111)], method =
# 'euclidean')
distanceMatrix.plus12 <- dist(train[, c(10:12, max.indexes)], method = "euclidean")
hclustering.plus12 <- hclust(distanceMatrix.plus12)
myplclust(hclustering.plus12, lab.col = numericActivity.train, main = paste("1st",
length(max.indexes)))
abline(h = max.tree.height.plus12, col = "red")
distanceMatrix.all <- dist(train[, 1:561], method = "euclidean")
hclustering.all <- hclust(distanceMatrix.all)
myplclust(hclustering.all, lab.col = numericActivity.train, main = "Use them all 1:561")
abline(h = max.tree.height.all, col = "red")
library(dynamicTreeCut)
par(mfrow = c(2, 2))
cut.orig = cutreeDynamicTree(hclustOrig, maxTreeHeight = max.tree.height.orig,
deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.orig ~ numericActivity.train, main = "original", xaxt = "n", xlab = "activity",
ylab = "clusters")
axis(1, at = unique(numericActivity.train), labels = unique(train$activity))
cut.plus1 = cutreeDynamicTree(hclustering.plus1, maxTreeHeight = max.tree.height.plus1,
deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.plus1 ~ numericActivity.train, main = "plus1", xaxt = "n", xlab = "activity",
ylab = "clusters")
axis(1, at = unique(numericActivity.train), labels = unique(train$activity))
cut.plus12 = cutreeDynamicTree(hclustering.plus12, maxTreeHeight = max.tree.height.plus12,
deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.plus12 ~ numericActivity.train, main = "plus12", xaxt = "n", xlab = "activity",
ylab = "clusters")
axis(1, at = unique(numericActivity.train), labels = unique(train$activity))
cut.all = cutreeDynamicTree(hclustering.all, maxTreeHeight = max.tree.height.all,
deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.all ~ numericActivity.train, main = "all", xaxt = "n", xlab = "activity",
ylab = "clusters")
axis(1, at = unique(numericActivity.train), labels = unique(train$activity))
# table.orig = table(cut.orig, numericActivity.train)
table.plus1 = table(cut.plus1, numericActivity.train)
table.plus12 = table(cut.plus12, numericActivity.train)
colnames(table.plus12) = unique(train$activity)
print(table.plus12)
## numericActivity.train
## cut.plus12 standing sitting laying walk walkdown walkup
## 0 2 4 4 0 0 0
## 1 0 0 0 163 146 22
## 2 0 128 129 0 0 0
## 3 219 0 0 0 0 0
## 4 0 0 0 33 32 109
## 5 0 42 75 0 0 0
## 6 0 0 0 24 14 55
## 7 0 0 0 46 1 24
## 8 0 24 19 0 0 0
table.all = table(cut.all, numericActivity.train)
activity.names = unique(train$activity)
print("activity all plus12 plus1")
## [1] "activity all plus12 plus1"
for (i in 1:6) {
activity.name = activity.names[i]
# activity.orig =
# round(max(table.orig[,i])/sum(table.orig[,i])*100,digits=0)
activity.plus1 = round(max(table.plus1[, i])/sum(table.plus1[, i]) * 100,
digits = 0)
activity.plus12 = round(max(table.plus12[, i])/sum(table.plus12[, i]) *
100, digits = 0)
activity.all = round(max(table.all[, i])/sum(table.all[, i]) * 100, digits = 0)
cat("\n", activity.name, ":\t", activity.all, "%", "\t", activity.plus12,
"%", "\t", activity.plus1, "%", sep = "")
}
##
## standing: 48% 99% 66%
## sitting: 42% 65% 55%
## laying: 33% 57% 53%
## walk: 58% 61% 74%
## walkdown: 47% 76% 54%
## walkup: 40% 52% 44%
kClust <- kmeans(train[, -c(562, 563)], centers = 6, nstart = 100, iter.max = 100)
str(kClust)
## List of 9
## $ cluster : Named int [1:1315] 6 6 6 6 6 6 6 6 6 6 ...
## ..- attr(*, "names")= chr [1:1315] "1" "2" "3" "4" ...
## $ centers : num [1:6, 1:561] 0.275 0.265 0.292 0.277 0.239 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:6] "1" "2" "3" "4" ...
## .. ..$ : chr [1:561] "tBodyAcc-mean()-X" "tBodyAcc-mean()-Y" "tBodyAcc-mean()-Z" "tBodyAcc-std()-X" ...
## $ totss : num 71373
## $ withinss : num [1:6] 2537 3563 3366 6042 2319 ...
## $ tot.withinss: num 21464
## $ betweenss : num 49909
## $ size : int [1:6] 186 163 138 391 146 291
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
table(kClust$cluster, train$activity)
##
## laying sitting standing walk walkdown walkup
## 1 186 0 0 0 0 0
## 2 30 57 76 0 0 0
## 3 0 0 0 24 101 13
## 4 0 0 0 232 92 67
## 5 5 1 0 10 0 130
## 6 0 140 151 0 0 0
par(mfrow = c(2, 2))
for (i in 1:6) {
titlelab = paste("Cluster #", i)
if (i == 1)
titlelab = paste(titlelab, "50/50 sitting/standing")
if (i == 2)
titlelab = paste(titlelab, "60% walk, 23/17 walkdown/walkup")
if (i == 3)
titlelab = paste(titlelab, "100% laying")
if (i == 4)
titlelab = paste(titlelab, "standing, sitting, laying")
if (i == 5)
titlelab = paste(titlelab, "75% walkdown,\n12.5/12.5 walk/walkup")
if (i == 6)
titlelab = paste(titlelab, "Mostly walkup with other bits")
y = which.max(kClust$center[i, ]) # max contributor variable
plot(kClust$center[i, ], pch = 19, cex = 0.5, ylab = "Cluster Center", xlab = paste(y,
names(train[y]), "=", round(kClust$center[i, y], digits = 3)), main = titlelab)
abline(h = 0)
points(y, kClust$center[i, y], cex = 2, col = "red")
}
dim(kClust$center)
## [1] 6 561
par(opar)
par(mfrow = c(1, 3))
for (i in c(174, 41, 53)) {
if (i == 174)
xlabtitle = "Crosses are KMeans centers"
if (i == 41)
xlabtitle = paste("X-axis = each of", nrow(train), "observations")
if (i == 53)
xlabtitle = "If I did this right, it ain't much help"
plot(1:nrow(train), train[1:nrow(train), i], pch = 19, cex = 0.5, main = paste(i,
"=", names(train)[i]), xlab = xlabtitle, ylab = "Value of each observation",
col = numericActivity.train)
abline(h = 0)
points(kClust$centers[, i], col = 1:6, pch = 3, cex = 3, lwd = 3)
if (i == 41)
legend(500, 0.6, legend = unique(train$activity), col = unique(numericActivity.train),
pch = 19)
}
# kClust is all 561 variables kClust.plus1 adds one SVD variable
# kClust.plus12 adds 12 SVD variables
kClust.plus1 <- kmeans(train[, c(10:12, max.indexes[1])], centers = 6, nstart = 100,
iter.max = 100)
kClust.plus12 <- kmeans(train[, c(10:12, max.indexes)], centers = 6, nstart = 100,
iter.max = 100)
(table.data = table(kClust$cluster, train$activity))
##
## laying sitting standing walk walkdown walkup
## 1 186 0 0 0 0 0
## 2 30 57 76 0 0 0
## 3 0 0 0 24 101 13
## 4 0 0 0 232 92 67
## 5 5 1 0 10 0 130
## 6 0 140 151 0 0 0
activity.names = dimnames(table.data)[[2]]
table.data.plus1 = table(kClust.plus1$cluster, train$activity)
table.data.plus12 = table(kClust.plus12$cluster, train$activity)
cat("\n Confidence Predictions from KMeans \n")
##
## Confidence Predictions from KMeans
cat("Activity 561 plus12 plus1 \n")
## Activity 561 plus12 plus1
for (i in 1:6) {
activity.name = activity.names[i]
activity.best = round(max(table.data[, i])/sum(table.data[, i]) * 100, digits = 0)
activity.best1 = round(max(table.data.plus1[, i])/sum(table.data.plus1[,
i]) * 100, digits = 0)
activity.best12 = round(max(table.data.plus12[, i])/sum(table.data.plus12[,
i]) * 100, digits = 0)
cat("\n", activity.name, ":\t", activity.best, "%", "\t", activity.best12,
"%", "\t", activity.best1, "%", sep = "")
}
##
## laying: 84% 100% 94%
## sitting: 71% 52% 91%
## standing: 67% 56% 93%
## walk: 87% 62% 65%
## walkdown: 52% 83% 59%
## walkup: 62% 85% 42%
boxplot(as.data.frame(t(kClust$center)), main = "Distributions of the KMeans centers for each cluster",
xlab = "Clusters")
for (i in 1:6) {
d <- density(t(kClust$center)[, i])
if (i == 1)
plot(d, type = "n", main = "Densities of the centers for each cluster")
lines(d, col = i)
}
Confounding is either omitted variables or variables that are highly correlated with the response variable. Collinearity is predictor variables that are highly correlated with each other.
training.cor = cor(train[, 1:561])
dim(training.cor)
## [1] 561 561
for (i in 1:dim(training.cor)[1]) {
training.cor[i, i] = 0 # get rid of correaltion with self
}
dim(which(training.cor == 1, arr.ind = TRUE, useNames = TRUE))
## [1] 32 2
which(training.cor == 1, arr.ind = TRUE, useNames = TRUE)
## row col
## tGravityAccMag-std() 215 202
## tGravityAccMag-mad() 216 203
## tGravityAccMag-max() 217 204
## tGravityAccMag-min() 218 205
## tGravityAccMag-energy() 220 207
## tGravityAccMag-iqr() 221 208
## tGravityAccMag-arCoeff()1 223 210
## tGravityAccMag-arCoeff()2 224 211
## tGravityAccMag-arCoeff()3 225 212
## tGravityAccMag-arCoeff()4 226 213
## tBodyAccMag-std() 202 215
## tBodyAccMag-mad() 203 216
## tBodyAccMag-max() 204 217
## tBodyAccMag-min() 205 218
## tBodyAccMag-energy() 207 220
## tBodyAccMag-iqr() 208 221
## tBodyAccMag-arCoeff()1 210 223
## tBodyAccMag-arCoeff()2 211 224
## tBodyAccMag-arCoeff()3 212 225
## tBodyAccMag-arCoeff()4 213 226
## tBodyAccJerkMag-sma() 232 227
## tBodyAccJerkMag-mean() 227 232
## tBodyGyroJerkMag-sma() 258 253
## tBodyGyroJerkMag-mean() 253 258
## fBodyAccMag-sma() 508 503
## fBodyAccMag-mean() 503 508
## fBodyBodyAccJerkMag-sma() 521 516
## fBodyBodyAccJerkMag-mean() 516 521
## fBodyBodyGyroMag-sma() 534 529
## fBodyBodyGyroMag-mean() 529 534
## fBodyBodyGyroJerkMag-sma() 547 542
## fBodyBodyGyroJerkMag-mean() 542 547
print(str(.Platform))
## List of 8
## $ OS.type : chr "windows"
## $ file.sep : chr "/"
## $ dynlib.ext: chr ".dll"
## $ GUI : chr "RTerm"
## $ endian : chr "little"
## $ pkgType : chr "win.binary"
## $ path.sep : chr ";"
## $ r_arch : chr "x64"
## NULL
print(version)
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 3
## minor 0.2
## year 2013
## month 09
## day 25
## svn rev 63987
## language R
## version.string R version 3.0.2 (2013-09-25)
## nickname Frisbee Sailing
print(sessionInfo(), locale = FALSE)
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dynamicTreeCut_1.60-1 knitr_1.5
##
## loaded via a namespace (and not attached):
## [1] evaluate_0.5.1 formatR_0.10 stringr_0.6.2 tools_3.0.2
print(Sys.time())
## [1] "2013-12-07 14:57:21 EST"