Data Analysis Project 2

Step 3 Singular Value Decomposition (SVD)

George Fisher george@georgefisher.com

Observations

Singular value decomposition

with ALL quantitative variables in our training data, scaled

# columns 562, 563 are subject and activity; not accelerometer data
svd1 = svd(scale(train[, -c(562, 563)]))

# dimensions of our vectors
dim(svd1$u)

## [1] 1315  561

dim(diag(svd1$d))

## [1] 561 561

dim(svd1$v)

## [1] 561 561


plot(svd1$u[, 1], col = numericActivity.train, pch = 19, xlab = "Observations for training data", 
    ylab = "value for a specific activity", main = paste("U vector #", 1))
abline(h = 0)

plot of chunk s3.svd


x = which.max(svd1$v[, 1])  # max contributor variable
plot(svd1$v[, 1], pch = 19, xlab = paste(x, names(train[x]), "=", round(svd1$v[x, 
    1], digits = 3)), ylab = paste("weight of each variable in U_", 1, sep = ""), 
    cex = 0.75, main = paste("V vector #", 1))
points(x, svd1$v[x, 1], cex = 2, col = "red")
abline(h = 0)

plot of chunk s3.svd


plot(colSums(svd1$v), main = "sums of every V column vector ... these are weights?", 
    xlab = "V column vector index", ylab = "sum of each V column vector", pch = 19, 
    cex = 0.5)
abline(h = 0, col = "red")
for (i in 1:5) {
    points(i, sum(svd1$v[, i]), cex = 2, col = "red")
    text(i + 15, sum(svd1$v[, i]) + 0.05, labels = as.character(i), col = "blue")
}

plot of chunk s3.svd


par(mfrow = c(2, 1))
plot(colSums(abs(svd1$v)), main = "colSums(abs(svd1$v))", xlab = "V column vector index", 
    ylab = "sum of the abs()", pch = 19, cex = 0.5)

plot(colSums(svd1$v^2), main = "colSums(svd1$v^2)", xlab = "V column vector index", 
    ylab = "sum of the squares", pch = 19, cex = 0.5)

## Warning: relative range of values = 26 * EPS, is small (axis 2)

plot of chunk s3.svd


par(opar)
par(mfrow = c(2, 1))
boxplot(svd1$v[, 1:5], main = "1st 5 V column vector values", xlab = "V column vector index")
abline(h = 0, col = "red")

plot(svd1$v[, 1], main = "1st V column vector values", xlab = "1st V column vector entry index", 
    type = "n")
lines(svd1$v[, 1])
abline(h = 0, col = "red")

plot of chunk s3.svd

par(opar)



plot(rowSums(svd1$v), main = "sums of every V row vector ... these are certainly not weights", 
    xlab = "V row vector index", ylab = "sum of each V row vector", pch = 19, 
    cex = 0.5)
abline(h = 0, col = "red")
for (i in 1:5) {
    points(i, sum(svd1$v[i, ]), cex = 2, col = "red")
    text(i + 15, sum(svd1$v[i, ]) + 0.05, labels = as.character(i), col = "blue")
}

plot of chunk s3.svd


# sum of the sums
print(sum(colSums(svd1$v)))

## [1] -4.27


# 1st 25 sums
print(colSums(svd1$v)[1:25])

##  [1] -16.77647   3.88188   3.58181  -1.49766   2.33871   1.73334  -2.51696
##  [8]   1.34879  -1.23793  -0.87557   0.55897   1.65384   0.69773  -0.59660
## [15]   0.03812  -0.93638   1.30374  -1.28119   0.18812   0.25587   0.26074
## [22]  -0.19345   0.94895   2.07743  -0.21892


scatter.smooth(apply(svd1$v, 2, max), pch = 19, cex = 0.75, col = "blue", main = "max of every V column vector", 
    xlab = "V column vector index", ylab = "max of each V column vector")
index.v = seq(1:dim(svd1$v)[2])
abline(lm(apply(svd1$v, 2, max) ~ index.v), col = "red")

plot of chunk s3.svd



scatter.smooth(svd1$d, apply(svd1$v, 2, max), main = "loess regression: sigma predicts max?", 
    xlab = "V column vector index", ylab = "max of V column vector")

plot of chunk s3.svd

Plot the sigmas to see how important they are

max.sig.num = 12

plot(svd1$d^2/sum(svd1$d^2) * 100, ylab = "% variance explained by each sigma", 
    xlab = expression(paste(sigma, "s in ", Sigma, ", in descending order of priority/explanatory power")), 
    main = paste("Variance explained by sigmas\n1st 2 =", round(sum(svd1$d[1:2]^2)/sum(svd1$d^2) * 
        100, digits = 2), "%\n1st", max.sig.num, " =", round(sum(svd1$d[1:max.sig.num]^2)/sum(svd1$d^2) * 
        100, digits = 2), "%"))

plot of chunk s3.sigma.plot

U vectors are orthogonal to each other: uncorrelated

X-Axis is each observation for subject 1

Y-Axis is the value recorded for the activity (coded by observation number in the numericActivity vector) for that observation

par(mfrow = c(2, 2))

for (i in 1:4) {

    plot(svd1$u[, i], col = numericActivity.train, pch = 19, xlab = "Observations for training data", 
        ylab = "value for a specific activity", main = paste("U vector #", i))
    abline(h = 0)
    if (i == 4) 
        legend(650, -0.025, legend = unique(train$activity), col = unique(numericActivity.train), 
            pch = 19)
}

plot of chunk s3.plotUs

Find maximum contributing variable

in the V vectors

X-Axis is each variable

Y-Axis is the weight each variable contributes to the value in the corresponding U vector (one U vector per observation)

par(mfrow = c(2, 2))
max.Vs = ""
max.indexes = integer()
for (i in 1:max.sig.num) {

    x = which.max(svd1$v[, i])  # max contributor variable
    plot(svd1$v[, i], pch = 19, xlab = paste(x, names(train[x]), "=", round(svd1$v[x, 
        i], digits = 3)), ylab = paste("weight of each variable in U_", i, sep = ""), 
        cex = 0.75, main = paste("V vector #", i))

    points(x, svd1$v[x, i], cex = 2, col = "red")

    abline(h = 0)
    # print(paste('V_',i,': ',x,' ',names(train[x]),' =
    # ',round(svd1$v[x,i],digits=3),sep=''))
    max.Vs = paste(max.Vs, "\nV_", i, ": ", x, " ", names(train[x]), " = ", 
        round(svd1$v[x, i], digits = 3), sep = "")
    max.indexes = c(max.indexes, x)
}

plot of chunk s3.maxcontributor

cat(max.Vs)  # the max variables in priority order

## 
## V_1: 14 tBodyAcc-min()-Y = 0.056
## V_2: 296 fBodyAcc-meanFreq()-Z = 0.132
## V_3: 67 tGravityAcc-arCoeff()-X,2 = 0.153
## V_4: 451 fBodyGyro-maxInds-Z = 0.086
## V_5: 559 angle(X,gravityMean) = 0.154
## V_6: 41 tGravityAcc-mean()-X = 0.15
## V_7: 33 tBodyAcc-arCoeff()-Y,4 = 0.137
## V_8: 188 tBodyGyroJerk-arCoeff()-X,3 = 0.114
## V_9: 377 fBodyAccJerk-kurtosis()-X = 0.124
## V_10: 152 tBodyGyro-arCoeff()-Y,3 = 0.179
## V_11: 459 fBodyGyro-skewness()-Z = 0.173
## V_12: 111 tBodyAccJerk-arCoeff()-Y,2 = 0.155

print(max.indexes)

##  [1]  14 296  67 451 559  41  33 188 377 152 459 111

# save the max contributing variables
save(numericActivity.test, numericActivity.train, test, train, max.indexes, 
    file = "train_and_test.Rda")

Try the cluster again

par(mfrow = c(2, 2))

# establish a tree height for cluster grouping
max.tree.height.orig = 1.5
max.tree.height.plus1 = 1.25
max.tree.height.plus12 = 2.5
max.tree.height.all = 12

distanceMatOrig <- dist(train[, 10:12], method = "euclidean")
hclustOrig <- hclust(distanceMatOrig)
myplclust(hclustOrig, lab.col = numericActivity.train, main = "Original")
abline(h = max.tree.height.orig, col = "red")

distanceMatrix.plus1 <- dist(train[, c(10:12, max.indexes[1])], method = "euclidean")
hclustering.plus1 <- hclust(distanceMatrix.plus1)
myplclust(hclustering.plus1, lab.col = numericActivity.train, main = paste("Add #", 
    max.indexes[1], sep = ""))
abline(h = max.tree.height.plus1, col = "red")

# distanceMatrix.plus12 <- dist(train[,
# c(10:12,14,296,67,451,559,41,33,188,377,152,459,111)], method =
# 'euclidean')
distanceMatrix.plus12 <- dist(train[, c(10:12, max.indexes)], method = "euclidean")
hclustering.plus12 <- hclust(distanceMatrix.plus12)
myplclust(hclustering.plus12, lab.col = numericActivity.train, main = paste("1st", 
    length(max.indexes)))
abline(h = max.tree.height.plus12, col = "red")

distanceMatrix.all <- dist(train[, 1:561], method = "euclidean")
hclustering.all <- hclust(distanceMatrix.all)
myplclust(hclustering.all, lab.col = numericActivity.train, main = "Use them all 1:561")
abline(h = max.tree.height.all, col = "red")

plot of chunk s3.newcluster

Analyze the dendograms

library(dynamicTreeCut)
par(mfrow = c(2, 2))

cut.orig = cutreeDynamicTree(hclustOrig, maxTreeHeight = max.tree.height.orig, 
    deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.orig ~ numericActivity.train, main = "original", xaxt = "n", xlab = "activity", 
    ylab = "clusters")
axis(1, at = unique(numericActivity.train), labels = unique(train$activity))

cut.plus1 = cutreeDynamicTree(hclustering.plus1, maxTreeHeight = max.tree.height.plus1, 
    deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.plus1 ~ numericActivity.train, main = "plus1", xaxt = "n", xlab = "activity", 
    ylab = "clusters")
axis(1, at = unique(numericActivity.train), labels = unique(train$activity))

cut.plus12 = cutreeDynamicTree(hclustering.plus12, maxTreeHeight = max.tree.height.plus12, 
    deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.plus12 ~ numericActivity.train, main = "plus12", xaxt = "n", xlab = "activity", 
    ylab = "clusters")
axis(1, at = unique(numericActivity.train), labels = unique(train$activity))

cut.all = cutreeDynamicTree(hclustering.all, maxTreeHeight = max.tree.height.all, 
    deepSplit = FALSE, minModuleSize = 30)
boxplot(cut.all ~ numericActivity.train, main = "all", xaxt = "n", xlab = "activity", 
    ylab = "clusters")
axis(1, at = unique(numericActivity.train), labels = unique(train$activity))

plot of chunk s3.analyze.dendograms

# table.orig = table(cut.orig, numericActivity.train)
table.plus1 = table(cut.plus1, numericActivity.train)

table.plus12 = table(cut.plus12, numericActivity.train)
colnames(table.plus12) = unique(train$activity)
print(table.plus12)

##           numericActivity.train
## cut.plus12 standing sitting laying walk walkdown walkup
##          0        2       4      4    0        0      0
##          1        0       0      0  163      146     22
##          2        0     128    129    0        0      0
##          3      219       0      0    0        0      0
##          4        0       0      0   33       32    109
##          5        0      42     75    0        0      0
##          6        0       0      0   24       14     55
##          7        0       0      0   46        1     24
##          8        0      24     19    0        0      0


table.all = table(cut.all, numericActivity.train)

activity.names = unique(train$activity)
print("activity all plus12 plus1")

## [1] "activity all plus12 plus1"


for (i in 1:6) {
    activity.name = activity.names[i]
    # activity.orig =
    # round(max(table.orig[,i])/sum(table.orig[,i])*100,digits=0)
    activity.plus1 = round(max(table.plus1[, i])/sum(table.plus1[, i]) * 100, 
        digits = 0)
    activity.plus12 = round(max(table.plus12[, i])/sum(table.plus12[, i]) * 
        100, digits = 0)
    activity.all = round(max(table.all[, i])/sum(table.all[, i]) * 100, digits = 0)

    cat("\n", activity.name, ":\t", activity.all, "%", "\t", activity.plus12, 
        "%", "\t", activity.plus1, "%", sep = "")
}

## 
## standing:    48% 99% 66%
## sitting: 42% 65% 55%
## laying:  33% 57% 53%
## walk:    58% 61% 74%
## walkdown:    47% 76% 54%
## walkup:  40% 52% 44%

Let's try K-Means Clustering

kClust <- kmeans(train[, -c(562, 563)], centers = 6, nstart = 100, iter.max = 100)
str(kClust)

## List of 9
##  $ cluster     : Named int [1:1315] 6 6 6 6 6 6 6 6 6 6 ...
##   ..- attr(*, "names")= chr [1:1315] "1" "2" "3" "4" ...
##  $ centers     : num [1:6, 1:561] 0.275 0.265 0.292 0.277 0.239 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:6] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:561] "tBodyAcc-mean()-X" "tBodyAcc-mean()-Y" "tBodyAcc-mean()-Z" "tBodyAcc-std()-X" ...
##  $ totss       : num 71373
##  $ withinss    : num [1:6] 2537 3563 3366 6042 2319 ...
##  $ tot.withinss: num 21464
##  $ betweenss   : num 49909
##  $ size        : int [1:6] 186 163 138 391 146 291
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

table(kClust$cluster, train$activity)

##    
##     laying sitting standing walk walkdown walkup
##   1    186       0        0    0        0      0
##   2     30      57       76    0        0      0
##   3      0       0        0   24      101     13
##   4      0       0        0  232       92     67
##   5      5       1        0   10        0    130
##   6      0     140      151    0        0      0

par(mfrow = c(2, 2))
for (i in 1:6) {
    titlelab = paste("Cluster #", i)
    if (i == 1) 
        titlelab = paste(titlelab, "50/50 sitting/standing")
    if (i == 2) 
        titlelab = paste(titlelab, "60% walk, 23/17 walkdown/walkup")
    if (i == 3) 
        titlelab = paste(titlelab, "100% laying")
    if (i == 4) 
        titlelab = paste(titlelab, "standing, sitting, laying")
    if (i == 5) 
        titlelab = paste(titlelab, "75% walkdown,\n12.5/12.5 walk/walkup")
    if (i == 6) 
        titlelab = paste(titlelab, "Mostly walkup with other bits")
    y = which.max(kClust$center[i, ])  # max contributor variable
    plot(kClust$center[i, ], pch = 19, cex = 0.5, ylab = "Cluster Center", xlab = paste(y, 
        names(train[y]), "=", round(kClust$center[i, y], digits = 3)), main = titlelab)
    abline(h = 0)

    points(y, kClust$center[i, y], cex = 2, col = "red")
}

plot of chunk s3.kmeans

dim(kClust$center)

## [1]   6 561

plot of chunk s3.kmeans

par(opar)
par(mfrow = c(1, 3))
for (i in c(174, 41, 53)) {
    if (i == 174) 
        xlabtitle = "Crosses are KMeans centers"
    if (i == 41) 
        xlabtitle = paste("X-axis = each of", nrow(train), "observations")
    if (i == 53) 
        xlabtitle = "If I did this right, it ain't much help"
    plot(1:nrow(train), train[1:nrow(train), i], pch = 19, cex = 0.5, main = paste(i, 
        "=", names(train)[i]), xlab = xlabtitle, ylab = "Value of each observation", 
        col = numericActivity.train)
    abline(h = 0)
    points(kClust$centers[, i], col = 1:6, pch = 3, cex = 3, lwd = 3)

    if (i == 41) 
        legend(500, 0.6, legend = unique(train$activity), col = unique(numericActivity.train), 
            pch = 19)
}

plot of chunk s3.kmeansplot

KMeans Confidence Estimates

# kClust is all 561 variables kClust.plus1 adds one SVD variable
# kClust.plus12 adds 12 SVD variables

kClust.plus1 <- kmeans(train[, c(10:12, max.indexes[1])], centers = 6, nstart = 100, 
    iter.max = 100)
kClust.plus12 <- kmeans(train[, c(10:12, max.indexes)], centers = 6, nstart = 100, 
    iter.max = 100)


(table.data = table(kClust$cluster, train$activity))

##    
##     laying sitting standing walk walkdown walkup
##   1    186       0        0    0        0      0
##   2     30      57       76    0        0      0
##   3      0       0        0   24      101     13
##   4      0       0        0  232       92     67
##   5      5       1        0   10        0    130
##   6      0     140      151    0        0      0

activity.names = dimnames(table.data)[[2]]

table.data.plus1 = table(kClust.plus1$cluster, train$activity)
table.data.plus12 = table(kClust.plus12$cluster, train$activity)
cat("\n Confidence Predictions from KMeans \n")

## 
##  Confidence Predictions from KMeans

cat("Activity 561 plus12  plus1 \n")

## Activity 561 plus12  plus1

for (i in 1:6) {
    activity.name = activity.names[i]
    activity.best = round(max(table.data[, i])/sum(table.data[, i]) * 100, digits = 0)
    activity.best1 = round(max(table.data.plus1[, i])/sum(table.data.plus1[, 
        i]) * 100, digits = 0)
    activity.best12 = round(max(table.data.plus12[, i])/sum(table.data.plus12[, 
        i]) * 100, digits = 0)

    cat("\n", activity.name, ":\t", activity.best, "%", "\t", activity.best12, 
        "%", "\t", activity.best1, "%", sep = "")
}

## 
## laying:  84% 100%    94%
## sitting: 71% 52% 91%
## standing:    67% 56% 93%
## walk:    87% 62% 65%
## walkdown:    52% 83% 59%
## walkup:  62% 85% 42%

Plot all the centers

boxplot(as.data.frame(t(kClust$center)), main = "Distributions of the KMeans centers for each cluster", 
    xlab = "Clusters")

plot of chunk s3.allkmeancenters



for (i in 1:6) {
    d <- density(t(kClust$center)[, i])
    if (i == 1) 
        plot(d, type = "n", main = "Densities of the centers for each cluster")
    lines(d, col = i)
}

plot of chunk s3.allkmeancenters

Confounding and Collinearity

Confounding is either omitted variables or variables that are highly correlated with the response variable. Collinearity is predictor variables that are highly correlated with each other.

training.cor = cor(train[, 1:561])
dim(training.cor)

## [1] 561 561

for (i in 1:dim(training.cor)[1]) {
    training.cor[i, i] = 0  # get rid of correaltion with self
}
dim(which(training.cor == 1, arr.ind = TRUE, useNames = TRUE))

## [1] 32  2

which(training.cor == 1, arr.ind = TRUE, useNames = TRUE)

##                             row col
## tGravityAccMag-std()        215 202
## tGravityAccMag-mad()        216 203
## tGravityAccMag-max()        217 204
## tGravityAccMag-min()        218 205
## tGravityAccMag-energy()     220 207
## tGravityAccMag-iqr()        221 208
## tGravityAccMag-arCoeff()1   223 210
## tGravityAccMag-arCoeff()2   224 211
## tGravityAccMag-arCoeff()3   225 212
## tGravityAccMag-arCoeff()4   226 213
## tBodyAccMag-std()           202 215
## tBodyAccMag-mad()           203 216
## tBodyAccMag-max()           204 217
## tBodyAccMag-min()           205 218
## tBodyAccMag-energy()        207 220
## tBodyAccMag-iqr()           208 221
## tBodyAccMag-arCoeff()1      210 223
## tBodyAccMag-arCoeff()2      211 224
## tBodyAccMag-arCoeff()3      212 225
## tBodyAccMag-arCoeff()4      213 226
## tBodyAccJerkMag-sma()       232 227
## tBodyAccJerkMag-mean()      227 232
## tBodyGyroJerkMag-sma()      258 253
## tBodyGyroJerkMag-mean()     253 258
## fBodyAccMag-sma()           508 503
## fBodyAccMag-mean()          503 508
## fBodyBodyAccJerkMag-sma()   521 516
## fBodyBodyAccJerkMag-mean()  516 521
## fBodyBodyGyroMag-sma()      534 529
## fBodyBodyGyroMag-mean()     529 534
## fBodyBodyGyroJerkMag-sma()  547 542
## fBodyBodyGyroJerkMag-mean() 542 547

Info about the system running this code

print(str(.Platform))

## List of 8
##  $ OS.type   : chr "windows"
##  $ file.sep  : chr "/"
##  $ dynlib.ext: chr ".dll"
##  $ GUI       : chr "RTerm"
##  $ endian    : chr "little"
##  $ pkgType   : chr "win.binary"
##  $ path.sep  : chr ";"
##  $ r_arch    : chr "x64"
## NULL

print(version)

##                _                           
## platform       x86_64-w64-mingw32          
## arch           x86_64                      
## os             mingw32                     
## system         x86_64, mingw32             
## status                                     
## major          3                           
## minor          0.2                         
## year           2013                        
## month          09                          
## day            25                          
## svn rev        63987                       
## language       R                           
## version.string R version 3.0.2 (2013-09-25)
## nickname       Frisbee Sailing

print(sessionInfo(), locale = FALSE)

## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dynamicTreeCut_1.60-1 knitr_1.5            
## 
## loaded via a namespace (and not attached):
## [1] evaluate_0.5.1 formatR_0.10   stringr_0.6.2  tools_3.0.2

print(Sys.time())

## [1] "2013-12-07 14:57:21 EST"