UCI HAR dataset: "Human Activity Recognition Using Smartphones"

One of the most exciting areas in all of data science right now is wearable computing - see for example this article.

Dataset information was downloaded from http://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones#. "The experiments have been carried out with a group of 30 volunteers within an age bracket of 19-48 years. Each person performed six activities (WALKING, WALKING UPSTAIRS, WALKING DOWNSTAIRS, SITTING, STANDING, LAYING) wearing a smartphone (Samsung Galaxy S II) on the waist. Using its embedded accelerometer and gyroscope, we captured 3-axial linear acceleration and 3-axial angular velocity at a constant rate of 50Hz. The experiments have been video-recorded to label the data manually. The obtained dataset has been randomly partitioned into two sets, where 70% of the volunteers was selected for generating the training data and 30% the test data. The sensor signals (accelerometer and gyroscope) were pre-processed by applying noise filters and then sampled in fixed-width sliding windows of 2.56 sec and 50% overlap (128 readings/window). The sensor acceleration signal, which has gravitational and body motion components, was separated using a Butterworth low-pass filter into body acceleration and gravity. The gravitational force is assumed to have only low frequency components, therefore a filter with 0.3 Hz cutoff frequency was used. From each window, a vector of features was obtained by calculating variables from the time and frequency domain."

For each record in the raw dataset, the following is provided:

* Triaxial acceleration from the accelerometer (total acceleration) and the estimated body acceleration.
* Triaxial angular velocity from the gyroscope.
* A 561-feature vector with time and frequency domain variables.
* Its activity label.
* An identifier of the subject who carried out the experiment.

Download and unzip the files
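
The readData() helper used in the next chunk is not defined in this write-up, and neither is the download step that the heading refers to. Below is a minimal sketch of both, under stated assumptions: the archive URL, the renaming of the unpacked folder, and the internals of readData() are guesses based on the standard layout of the UCI HAR archive, not the author's original code. DataDirectory, referenced later when model files are saved, is assumed to be "UCI_HAR_Dataset/".

# Assumed setup -- not the author's original helper code.
DataDirectory <- "UCI_HAR_Dataset/"
zipUrl <- "http://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip"

if (!dir.exists(DataDirectory)) {
    download.file(zipUrl, destfile = "UCI_HAR_Dataset.zip", mode = "wb")
    unzip("UCI_HAR_Dataset.zip")
    file.rename("UCI HAR Dataset", "UCI_HAR_Dataset")  # the archive unpacks with spaces in the name
}

# Minimal sketch of readData(): read the 561-column feature matrix, name the
# columns from features.txt (make.names() yields the dotted names seen later,
# e.g. tBodyAcc.mean...X), and attach the subject and activity identifiers.
readData <- function(path, partition, sampleSize = -1L) {
    features <- read.table(file.path(dirname(path), "features.txt"),
                           col.names = c("index", "name"), stringsAsFactors = FALSE)
    x <- read.table(file.path(path, paste0("X_", partition, ".txt")), nrows = sampleSize,
                    col.names = make.names(features$name, unique = TRUE))
    x$subjectID <- read.table(file.path(path, paste0("subject_", partition, ".txt")),
                              nrows = sampleSize)[, 1]
    x$activityID <- read.table(file.path(path, paste0("y_", partition, ".txt")),
                               nrows = sampleSize)[, 1]
    x
}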

nsampleSize = -1L
train <- readData("UCI_HAR_Dataset/train", "train", nsampleSize)
test <- readData("UCI_HAR_Dataset/test", "test", nsampleSize)
# train <- activityLabel(train)
# test <- activityLabel(test)
train <- transform(train, subjectID = factor(subjectID), activityID = factor(activityID))
test <- transform(test, subjectID = factor(subjectID), activityID = factor(activityID))

train$partition = "train"
test$partition = "test"
samsungData <- rbind(train, test)
summary(samsungData$subjectID)
##   1   3   5   6   7   8  11  14  15  16  17  19  21  22  23  25  26  27 
## 347 341 302 325 308 281 316 323 328 366 368 360 408 321 372 409 392 376 
##  28  29  30   2   4   9  10  12  13  18  20  24 
## 382 344 383 302 317 288 294 320 327 364 354 381

Plotting average acceleration for the first subject

library(ggplot2)

qplot(data = samsungData, x = subjectID, fill = activityID)

[Figure: record counts per subjectID, filled by activityID]

qplot(data = samsungData, x = subjectID, fill = partition)

[Figure: record counts per subjectID, filled by train/test partition]

# grid.arrange(q1, q2, ncol = 1)
# samsungData$part <- as.factor(samsungData$partition)
# qplot(data = samsungData, x = subjectID, fill = activityID, binwidth = 0.5)
sub1 <- subset(samsungData, subjectID == 1)

par(mfrow = c(1, 2), mar = c(5, 4, 1, 1))
plot(sub1[, 1], col = sub1$activityID, ylab = names(sub1)[1])
plot(sub1[, 2], col = sub1$activityID, ylab = names(sub1)[2])

legend("bottomright", legend = unique(sub1$activity), col = unique(sub1$activity), 
    pch = 1)

[Figure: tBodyAcc.mean...X and tBodyAcc.mean...Y for subject 1, colored by activity]

par(mfrow = c(1, 1))

Clustering based just on average acceleration
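
The commented-out source() call below pulls in myplclust(), a helper from the course notes (originally by Eva K.F. Chan) that draws a dendrogram with its leaf labels colored by class; the function itself is not shown in this write-up. A minimal sketch of the commonly circulated version, included here as an assumption in case myplclust.R is not at hand:

# Plot an hclust object with leaf labels drawn in the colours given by lab.col.
myplclust <- function(hclust, lab = hclust$labels,
                      lab.col = rep(1, length(hclust$labels)), hang = 0.1, ...) {
    # Recover the (x, y) position of each leaf from the merge/height structure,
    # plot the dendrogram without labels, then redraw the labels in colour.
    y <- rep(hclust$height, 2)
    x <- as.numeric(hclust$merge)
    y <- y[which(x < 0)]
    x <- x[which(x < 0)]
    x <- abs(x)
    y <- y[order(x)]
    x <- x[order(x)]
    plot(hclust, labels = FALSE, hang = hang, ...)
    text(x = x, y = y[hclust$order] - (max(hclust$height) * hang),
         labels = lab[hclust$order], col = lab.col[hclust$order],
         srt = 90, adj = c(1, 0.5), xpd = NA, ...)
}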

# source('myplclust.R')
distanceMatrix <- dist(sub1[, 1:3])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering, lab.col = unclass(sub1$activityID))

[Figure: dendrogram from hierarchical clustering on the mean-acceleration features, leaves colored by activity]

Plotting maximum acceleration for the first subject

par(mfrow = c(1, 2))
plot(sub1[, 10], pch = 19, col = sub1$activityID, ylab = names(sub1)[10])
plot(sub1[, 11], pch = 19, col = sub1$activityID, ylab = names(sub1)[11])

[Figure: maximum acceleration features (columns 10 and 11) for subject 1, colored by activity]

par(mfrow = c(1, 1))

Clustering based on maximum acceleration

# source('myplclust.R')
distanceMatrix <- dist(sub1[, 10:12])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering, lab.col = unclass(sub1$activityID))

[Figure: dendrogram from hierarchical clustering on the maximum-acceleration features, leaves colored by activity]

Singular Value Decomposition

# svd1 = svd(scale(sub1[, -c(562, 563)]))
svd1 = svd(scale(sub1[, -c(562:564)]))
par(mfrow = c(1, 2))
plot(svd1$u[, 1], col = sub1$activityID, pch = 19)
plot(svd1$u[, 2], col = sub1$activityID, pch = 19)

[Figure: first and second left singular vectors (svd1$u), colored by activity]

par(mfrow = c(1, 1))

Find maximum contributor

plot(svd1$v[, 2], pch = 19)

[Figure: second right singular vector (svd1$v[, 2])]


plot(sub1[, 296], col = sub1$activityID, ylab = names(sub1[296]))

[Figure: feature 296, fBodyAcc.meanFreq...Z, for subject 1, colored by activity]

New clustering with the maximum contributor

maxContrib <- which.max(svd1$v[, 2])
distanceMatrix <- dist(sub1[, c(10:12, maxContrib)])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering, lab.col = unclass(sub1$activityID))

[Figure: dendrogram from clustering on maximum acceleration plus the maximum contributor, leaves colored by activity]


Identifying the top contributing variables

names(samsungData)[maxContrib]
## [1] "fBodyAcc.meanFreq...Z"
(topN <- order(svd1$v[, 2], decreasing = T)[1:5])
## [1] 296 249 210 223  74
n <- names(samsungData)[topN]
r <- svd1$v[, 2][topN]
rbind(n, r)
##   [,1]                    [,2]                     
## n "fBodyAcc.meanFreq...Z" "tBodyGyroMag.arCoeff..1"
## r "0.12232138827709"      "0.121065159901176"      
##   [,3]                     [,4]                       
## n "tBodyAccMag.arCoeff..1" "tGravityAccMag.arCoeff..1"
## r "0.11408410152257"       "0.11408410152257"         
##   [,5]                       
## n "tGravityAcc.arCoeff...Z.1"
## r "0.112031658115089"

qplot(sub1[, 296], col = sub1$activityID, xlab = names(sub1[296]), binwidth = 2/30, 
    fill = sub1$activityID)

[Figure: histogram of fBodyAcc.meanFreq...Z for subject 1, filled by activity]

K-means clustering

# kClust <- kmeans(sub1[, -c(562, 563)], centers = 6)
kClust <- kmeans(sub1[, -c(562:564)], centers = 6)
table(kClust$cluster, sub1$activityID)
##    
##      1  2  3  4  5  6
##   1  0  0  0 11  3 14
##   2  0  0  0  0  0 27
##   3  0 53  0  0  0  0
##   4  0  0  0 34 50  0
##   5  0  0  0  2  0  9
##   6 95  0 49  0  0  0

# kClust <- kmeans(sub1[, -c(562, 563)], centers = 6, nstart = 1)
kClust <- kmeans(sub1[, -c(562:564)], centers = 6, nstart = 1)
table(kClust$cluster, sub1$activityID)
##    
##      1  2  3  4  5  6
##   1 95  0 49  0  0  0
##   2  0  0  0  0  0 27
##   3  0  0  0 34 50  0
##   4  0  0  0  2  0  9
##   5  0  0  0 11  3 14
##   6  0 53  0  0  0  0

# kClust <- kmeans(sub1[, -c(562, 563)], centers = 6, nstart = 100)
kClust <- kmeans(sub1[, -c(562:564)], centers = 6, nstart = 100)
table(kClust$cluster, sub1$activityID)
##    
##      1  2  3  4  5  6
##   1  0  0 49  0  0  0
##   2  0 53  0  0  0  3
##   3  0  0  0 10  2 18
##   4  0  0  0  0  0 29
##   5 95  0  0  0  0  0
##   6  0  0  0 37 51  0
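
One way to see why nstart matters is to compare the objective value of the fits: kmeans() tries to minimise the total within-cluster sum of squares, which is returned in tot.withinss. A quick check (a sketch re-running the configurations above; the seed is hypothetical and only there for reproducibility):

set.seed(2015)  # hypothetical seed
# Lower tot.withinss means a better local optimum for the same k.
kmeans(sub1[, -c(562:564)], centers = 6, nstart = 1)$tot.withinss
kmeans(sub1[, -c(562:564)], centers = 6, nstart = 100)$tot.withinss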

Analysis of all subjects


Check that the predictor variables are normalized and scaled to the range [-1, 1].

library(plyr)
numPredictors = ncol(samsungData) - 3
dataSd = colwise(sd)(samsungData[, 1:numPredictors])
dataSd$stat = "Predictor Variable Standard Deviation"
dataMean = colwise(mean)(samsungData[, 1:numPredictors])
dataMean$stat = "Predictor Variable Mean"
library(reshape2)
temp = melt(rbind(dataMean, dataSd), c("stat"))
qplot(data = temp, x = value, binwidth = 0.025) + facet_wrap(~stat, ncol = 1)

[Figure: histograms of predictor variable means and standard deviations]

remove(dataSd, dataMean, temp)

If each variable were z-scaled, the mean would be approximately zero and the standard deviation would be 1. These variables may be normalized, but they are not z-scaled. If we intend to use modeling methods that are sensitive to feature scaling, we might want to do some preprocessing.


Preprocessing

The caret package offers several options for preprocessing continuous variables such as the predictors in the UCI HAR dataset. We will prepare several different versions of the predictor matrix to compare how they perform when we build a predictive model.

Z-scaling

library(lattice)
library(caret)
zScaleTrain = preProcess(train[, 1:numPredictors])
scaledX = predict(zScaleTrain, train[, 1:numPredictors])
head(names(scaledX))
## [1] "tBodyAcc.mean...X" "tBodyAcc.mean...Y" "tBodyAcc.mean...Z"
## [4] "tBodyAcc.std...X"  "tBodyAcc.std...Y"  "tBodyAcc.std...Z"

Near Zero Variance Predictor Detection

nzv = nearZeroVar(scaledX, saveMetrics = TRUE)
summary(nzv)
##    freqRatio      percentUnique     zeroVar           nzv         
##  Min.   :  1.00   Min.   :  0.34   Mode :logical   Mode :logical  
##  1st Qu.:  1.00   1st Qu.: 96.03   FALSE:561       FALSE:561      
##  Median :  1.00   Median : 99.95   NA's :0         NA's :0        
##  Mean   :  2.05   Mean   : 92.42                                  
##  3rd Qu.:  1.25   3rd Qu.: 99.99                                  
##  Max.   :207.73   Max.   :100.00
head(nzv[order(nzv$percentUnique, decreasing = FALSE), ], n = 20)
##                               freqRatio percentUnique zeroVar   nzv
## fBodyGyro.maxInds.Z               1.574        0.3400   FALSE FALSE
## fBodyAcc.maxInds.Y                1.999        0.3536   FALSE FALSE
## fBodyAcc.maxInds.Z                1.645        0.3536   FALSE FALSE
## fBodyGyro.maxInds.X               1.505        0.3672   FALSE FALSE
## fBodyBodyGyroMag.maxInds          2.544        0.3672   FALSE FALSE
## fBodyAcc.maxInds.X                1.155        0.3945   FALSE FALSE
## fBodyGyro.maxInds.Y               1.227        0.3945   FALSE FALSE
## fBodyAccMag.maxInds               1.794        0.3945   FALSE FALSE
## fBodyAccJerk.maxInds.X            1.238        0.6529   FALSE FALSE
## fBodyAccJerk.maxInds.Y            1.084        0.6529   FALSE FALSE
## fBodyAccJerk.maxInds.Z            1.018        0.6665   FALSE FALSE
## fBodyBodyGyroJerkMag.maxInds      1.073        0.7073   FALSE FALSE
## fBodyBodyAccJerkMag.maxInds       1.422        0.7753   FALSE FALSE
## tGravityAcc.entropy...Y         207.731       16.0365   FALSE FALSE
## tGravityAcc.entropy...Z         114.581       36.8607   FALSE FALSE
## tGravityAcc.entropy...X         111.357       43.0903   FALSE FALSE
## fBodyAccJerk.entropy...X         15.613       45.0626   FALSE FALSE
## fBodyAccJerk.entropy...Z          6.303       45.2258   FALSE FALSE
## fBodyAccJerk.entropy...Y         12.491       45.6882   FALSE FALSE
## fBodyBodyAccJerkMag.entropy..     9.534       46.1915   FALSE FALSE
head(nzv[order(nzv$freqRatio, decreasing = TRUE), ], n = 20)
##                                freqRatio percentUnique zeroVar   nzv
## tGravityAcc.entropy...Y          207.731       16.0365   FALSE FALSE
## tGravityAcc.entropy...Z          114.581       36.8607   FALSE FALSE
## tGravityAcc.entropy...X          111.357       43.0903   FALSE FALSE
## fBodyAccJerk.entropy...X          15.613       45.0626   FALSE FALSE
## fBodyAccJerk.entropy...Y          12.491       45.6882   FALSE FALSE
## fBodyBodyAccJerkMag.entropy..      9.534       46.1915   FALSE FALSE
## tBodyGyro.entropy...X              6.440       81.0800   FALSE FALSE
## tBodyGyro.entropy...Z              6.345       74.1567   FALSE FALSE
## tBodyGyro.entropy...Y              6.316       81.7057   FALSE FALSE
## fBodyAccJerk.entropy...Z           6.303       45.2258   FALSE FALSE
## tBodyAcc.entropy...Z               3.833       87.8945   FALSE FALSE
## fBodyBodyGyroJerkMag.entropy..     3.655       50.4081   FALSE FALSE
## fBodyBodyGyroMag.maxInds           2.544        0.3672   FALSE FALSE
## tBodyAcc.std...Y                   2.000       99.9864   FALSE FALSE
## tBodyAcc.mad...Z                   2.000       99.9864   FALSE FALSE
## tBodyAcc.max...X                   2.000       70.9875   FALSE FALSE
## tBodyAcc.max...Y                   2.000       70.7835   FALSE FALSE
## tBodyAcc.sma..                     2.000       99.9864   FALSE FALSE
## tBodyAcc.arCoeff...X.3             2.000       99.9864   FALSE FALSE
## tBodyAcc.arCoeff...Z.3             2.000       99.9864   FALSE FALSE

Find and Remove Highly Correlated Predictors

# correlatedPredictors = findCorrelation(cor(scaledX), cutoff = 0.95)
correlatedPredictors = findCorrelation(cor(scaledX), cutoff = 0.8)

With a correlation cutoff of 0.8, there are 389 correlated predictors to remove.

reducedCorrelationX = scaledX[, -correlatedPredictors]
head(names(reducedCorrelationX))
## [1] "tBodyAcc.mean...X"      "tBodyAcc.mean...Y"     
## [3] "tBodyAcc.mean...Z"      "tBodyAcc.entropy...X"  
## [5] "tBodyAcc.entropy...Z"   "tBodyAcc.arCoeff...X.3"

The reduced-correlation predictor set has 7352 observations and retains 172 of the 561 predictor variables.
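
These figures can be confirmed directly from the objects created above:

length(correlatedPredictors)  # 389 predictors flagged at the 0.8 cutoff
dim(reducedCorrelationX)      # 7352 observations of the 172 remaining predictors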

PCA Transformed Predictors

pcaTrain = preProcess(scaledX, method = "pca", thresh = 0.99)

The PCA-transformed data retains enough components to capture 99% of the variance.

pcaX = predict(pcaTrain, scaledX)
head(names(pcaX))
## [1] "PC1" "PC2" "PC3" "PC4" "PC5" "PC6"

After PCA, the original predictor names are no longer available in a straightforward manner.
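
If we do need to relate components back to the original features, the loadings can be pulled out of the preProcess object. A sketch, assuming pcaTrain$rotation holds the rotation matrix (as it does in current caret versions):

ncol(pcaX)  # number of components kept to reach the 99% variance threshold
# Original features with the largest absolute loading on the first component
head(sort(abs(pcaTrain$rotation[, "PC1"]), decreasing = TRUE))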

Data Splitting

An important aspect of predictive modeling is ensuring that we can accurately predict model performance on unseen data; when we deploy our model, our reputation and that of our employer are often impacted by how our model performs. For the UCI HAR data, our test set is a strict hold-out sample. We will not use this set for model development or model selection.

Our training data set is not tiny, but neither is it large. We have data examples from a limited number of experimental subjects. My technical approach will be to use cross-validation by experimental subject for model selection. There are 21 training subjects. If we wish to cross-validate over every experimental subject, we can generate sets of training sample indices like this:

leaveOneSubjectOutIndices = lapply(levels(train$subjectID), function(X) {
    which(train$subjectID != X)
})

If instead we want to control computation time, we can create a different partition.

cvBreaks = 7
temp = sample(levels(train$subjectID), length(levels(train$subjectID)))  # randomize subjects
temp = split(temp, cut(1:length(temp), breaks = cvBreaks, labels = FALSE))  # split into CV groups
cvGroupIndices = lapply(temp, function(X) {
    which(!train$subjectID %in% X)
})

Model Training

Set up parallel processing

library(parallel)
cl = parallel::makeForkCluster(nnodes = detectCores()/2)
setDefaultCluster(cl)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators

registerDoParallel(cl)
getDoParWorkers()
## [1] 4
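
Once the last of the models below has finished training, the workers should be released again; a short sketch to run before saving the workspace:

stopCluster(cl)   # shut down the forked worker processes
registerDoSEQ()   # fall back to sequential foreach execution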

Train a Random Forest model using method='rf'

saveFile = paste(DataDirectory, "modelRF.RData", sep = "")
if (!file.exists(saveFile)) {
    rfCtrl = trainControl(method = "cv", number = length(cvGroupIndices), index = cvGroupIndices, 
        classProbs = TRUE)
    modelRF = train(reducedCorrelationX, train$activityID, method = "rf", trControl = rfCtrl, 
        tuneGrid = data.frame(.mtry = c(2, 5, 10, 15, 20)), importance = TRUE)
    save(rfCtrl, modelRF, correlatedPredictors, zScaleTrain, file = saveFile)
}
if (!exists("modelRF")) {
    load(saveFile)
}
print(modelRF)
## Random Forest 
## 
## 7352 samples
##  172 predictors
##    6 classes: '1', '2', '3', '4', '5', '6' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## 
## Summary of sample sizes: 6243, 6243, 6348, 6365, 6321, 6295, ... 
## 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy  Kappa  Accuracy SD  Kappa SD
##   2     0.9       0.9    0.04         0.05    
##   5     0.9       0.9    0.04         0.05    
##   10    0.9       0.9    0.03         0.04    
##   15    0.9       0.9    0.03         0.04    
##   20    0.9       0.9    0.03         0.04    
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 10.
plot(modelRF)

[Figure: cross-validated accuracy as a function of mtry]

print(confusionMatrix(modelRF))
## Cross-Validated (7 fold) Confusion Matrix 
## 
## (entries are percentages of table totals)
##  
##           Reference
## Prediction    1    2    3    4    5    6
##          1 15.2  0.2  0.2  0.0  0.0  0.0
##          2  1.1 13.9  0.6  0.0  0.0  0.0
##          3  0.4  0.5 12.6  0.0  0.0  0.0
##          4  0.0  0.0  0.0 14.8  2.1  0.0
##          5  0.0  0.0  0.0  2.0 16.6  0.3
##          6  0.0  0.0  0.0  0.6  0.0 18.7
m = as.data.frame(modelRF$finalModel$importance)
m = m[order(m$MeanDecreaseAccuracy, decreasing = TRUE), ]
head(m, n = 20)
##                                           1        2         3        4
## tGravityAcc.energy...Y             0.034264 0.063421 0.0555405 0.045001
## tGravityAcc.energy...Z             0.009105 0.004639 0.0067839 0.024489
## angle.Z.gravityMean.               0.005881 0.023636 0.0049358 0.031425
## fBodyAccJerk.bandsEnergy...17.24   0.042882 0.039346 0.0454118 0.025874
## fBodyGyro.bandsEnergy...9.16.2     0.032239 0.034152 0.0772552 0.017559
## fBodyGyro.bandsEnergy...17.24.2    0.035457 0.025070 0.0431805 0.018291
## fBodyGyro.bandsEnergy...9.16       0.036729 0.027841 0.0228709 0.051921
## fBodyAccJerk.bandsEnergy...25.32   0.029549 0.032032 0.0522116 0.021050
## tBodyAcc.correlation...X.Y         0.052318 0.028511 0.0598720 0.008411
## fBodyGyro.bandsEnergy...1.8.1      0.029698 0.036682 0.0281993 0.014587
## fBodyAccJerk.bandsEnergy...33.40.1 0.037173 0.024114 0.0303588 0.019923
## fBodyAcc.bandsEnergy...17.24.1     0.044781 0.019821 0.0303136 0.014032
## fBodyGyro.bandsEnergy...1.8        0.024628 0.010685 0.0176389 0.056234
## fBodyAccJerk.bandsEnergy...49.64   0.021262 0.023390 0.0402193 0.017677
## tGravityAcc.arCoeff...X.1          0.049258 0.036772 0.0274978 0.004721
## tGravityAcc.entropy...Y            0.001623 0.002296 0.0005602 0.042153
## tBodyGyroJerkMag.min..             0.022149 0.023322 0.0179692 0.020453
## tBodyAcc.entropy...X               0.063651 0.026901 0.0063441 0.008339
## tGravityAcc.sma..                  0.010058 0.018154 0.0170301 0.022645
## fBodyGyro.bandsEnergy...17.24      0.032877 0.016656 0.0165683 0.020406
##                                           5        6 MeanDecreaseAccuracy
## tGravityAcc.energy...Y             0.061561 0.191651              0.07842
## tGravityAcc.energy...Z             0.045944 0.119272              0.03883
## angle.Z.gravityMean.               0.054067 0.093275              0.03853
## fBodyAccJerk.bandsEnergy...17.24   0.029863 0.029942              0.03476
## fBodyGyro.bandsEnergy...9.16.2     0.019566 0.020594              0.03133
## fBodyGyro.bandsEnergy...17.24.2    0.029018 0.026455              0.02901
## fBodyGyro.bandsEnergy...9.16       0.019537 0.014403              0.02875
## fBodyAccJerk.bandsEnergy...25.32   0.018897 0.019869              0.02762
## tBodyAcc.correlation...X.Y         0.013157 0.008402              0.02641
## fBodyGyro.bandsEnergy...1.8.1      0.023787 0.021601              0.02525
## fBodyAccJerk.bandsEnergy...33.40.1 0.021345 0.017637              0.02464
## fBodyAcc.bandsEnergy...17.24.1     0.020564 0.015508              0.02365
## fBodyGyro.bandsEnergy...1.8        0.014156 0.010916              0.02256
## fBodyAccJerk.bandsEnergy...49.64   0.016705 0.017302              0.02190
## tGravityAcc.arCoeff...X.1          0.007912 0.007201              0.02093
## tGravityAcc.entropy...Y            0.047036 0.019427              0.02057
## tBodyGyroJerkMag.min..             0.015721 0.019468              0.01975
## tBodyAcc.entropy...X               0.011795 0.001072              0.01925
## tGravityAcc.sma..                  0.017383 0.025098              0.01862
## fBodyGyro.bandsEnergy...17.24      0.012342 0.011745              0.01826
##                                    MeanDecreaseGini
## tGravityAcc.energy...Y                       397.44
## tGravityAcc.energy...Z                       206.51
## angle.Z.gravityMean.                         214.41
## fBodyAccJerk.bandsEnergy...17.24             108.84
## fBodyGyro.bandsEnergy...9.16.2               101.53
## fBodyGyro.bandsEnergy...17.24.2               93.29
## fBodyGyro.bandsEnergy...9.16                 109.13
## fBodyAccJerk.bandsEnergy...25.32              98.29
## tBodyAcc.correlation...X.Y                   140.04
## fBodyGyro.bandsEnergy...1.8.1                 87.53
## fBodyAccJerk.bandsEnergy...33.40.1            74.94
## fBodyAcc.bandsEnergy...17.24.1                76.29
## fBodyGyro.bandsEnergy...1.8                   93.70
## fBodyAccJerk.bandsEnergy...49.64              71.89
## tGravityAcc.arCoeff...X.1                    120.16
## tGravityAcc.entropy...Y                      113.38
## tBodyGyroJerkMag.min..                        71.13
## tBodyAcc.entropy...X                          66.22
## tGravityAcc.sma..                            109.37
## fBodyGyro.bandsEnergy...17.24                 69.80
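
The importance table above comes straight from the underlying randomForest object. caret's varImp() gives a rescaled view of the same information and has a convenient plot method; a sketch:

rfImportance <- varImp(modelRF)
plot(rfImportance, top = 20)  # top 20 predictors by scaled importance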

Train a Random Forest model using method='parRF'

This run did not succeed: tuning failed with missing resampled performance values, so the model object was never created or saved, and the load, print, and plot calls below fail as well.

saveFile = paste(DataDirectory, "modelParRF.RData", sep = "")
if (!file.exists(saveFile)) {
    parRfCtrl = trainControl(method = "cv", number = length(cvGroupIndices), 
        index = cvGroupIndices, classProbs = TRUE)
    modelParRF = train(reducedCorrelationX, train$activityID, method = "parRF", 
        trControl = parRfCtrl, tuneGrid = data.frame(.mtry = c(2, 5, 10, 15, 
            20)), importance = TRUE)
    save(parRfCtrl, modelParRF, correlatedPredictors, zScaleTrain, file = saveFile)
}
## Loading required package: randomForest
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
## Warning: At least one of the class levels are not valid R variables names; This may cause errors if class probabilities are generated because the variables names will be converted to: X1, X2, X3, X4, X5, X6
## Warning: There were missing values in resampled performance measures.
## Warning: missing values found in aggregated results
## Error: final tuning parameters could not be determined
if (!exists("modelParRF")) {
    load(saveFile)
}
## Warning: cannot open compressed file 'UCI_HAR_Dataset/modelParRF.RData',
## probable reason 'No such file or directory'
## Error: cannot open the connection
print(modelParRF)
## Error: object 'modelParRF' not found
plot(modelParRF)
## Error: object 'modelParRF' not found
print(confusionMatrix(modelParRF))
## Error: object 'modelParRF' not found

Train a simpler model: k-nearest neighbors

saveFile = paste(DataDirectory, "modelKnn.RData", sep = "")
if (!file.exists(saveFile)) {
    knnCtrl = trainControl(method = "cv", number = length(cvGroupIndices), index = cvGroupIndices, 
        classProbs = TRUE)
    modelKnn = train(reducedCorrelationX, train$activityID, method = "knn", 
        trControl = knnCtrl, tuneGrid = data.frame(.k = c(5, 10, 15, 20)))
    save(knnCtrl, modelKnn, correlatedPredictors, zScaleTrain, file = saveFile)
}
if (!exists("modelKnn")) {
    load(saveFile)
}
print(modelKnn)
## k-Nearest Neighbors 
## 
## 7352 samples
##  172 predictors
##    6 classes: '1', '2', '3', '4', '5', '6' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## 
## Summary of sample sizes: 6243, 6243, 6348, 6365, 6321, 6295, ... 
## 
## Resampling results across tuning parameters:
## 
##   k   Accuracy  Kappa  Accuracy SD  Kappa SD
##   5   0.8       0.8    0.04         0.05    
##   10  0.8       0.8    0.04         0.04    
##   15  0.8       0.8    0.04         0.05    
##   20  0.8       0.8    0.04         0.05    
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was k = 20.
confusionMatrix(modelKnn)
## Cross-Validated (7 fold) Confusion Matrix 
## 
## (entries are percentages of table totals)
##  
##           Reference
## Prediction    1    2    3    4    5    6
##          1 15.7  1.1  2.2  0.0  0.0  0.0
##          2  0.6 13.3  1.8  0.0  0.0  0.0
##          3  0.4  0.2  9.4  0.0  0.0  0.0
##          4  0.0  0.0  0.0 12.4  1.4  1.3
##          5  0.0  0.0  0.0  4.9 17.2  1.0
##          6  0.0  0.0  0.0  0.2  0.0 16.8

Selecting the “best” model

For simplicity, we will choose the best model based on overall cross-validation accuracy, which leaves us with the random forest model.

bestModel = modelRF
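
Because the random forest and k-nearest-neighbors models were tuned on the same cross-validation folds, their resampled accuracies can also be compared side by side with caret's resamples(); a sketch, assuming both model objects are still in memory:

cvComparison <- resamples(list(rf = modelRF, knn = modelKnn))
summary(cvComparison)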

Predicting generalization performance

holdoutX = predict(zScaleTrain, test[, 1:numPredictors])[, -correlatedPredictors]
holdoutLabels = test$ActivityID
holdoutPrediction = predict(bestModel, holdoutX)
head(holdoutPrediction)
## [1] 5 5 5 5 5 5
## Levels: 1 2 3 4 5 6
classProbPrediction = predict(bestModel, holdoutX, type = "prob")
## Error: undefined columns selected
head(classProbPrediction)
## Error: object 'classProbPrediction' not found

holdoutConfusionMatrix = confusionMatrix(holdoutPrediction, holdoutLabels)
## Error: the data and reference factors must have the same number of levels
print(holdoutConfusionMatrix)
## Error: object 'holdoutConfusionMatrix' not found
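
The failures above trace back to a column-name mismatch: the label column in test is activityID, not ActivityID, so holdoutLabels is NULL and confusionMatrix() cannot pair it with the predictions. The type = "prob" error is most likely related to the earlier warning that the class levels ('1'..'6') are not valid R variable names. A corrected sketch (not re-run here); with this fix the comparison in the next section also goes through:

holdoutLabels <- test$activityID  # the label column is 'activityID', lower-case 'a'
holdoutPrediction <- predict(bestModel, holdoutX)
holdoutConfusionMatrix <- confusionMatrix(holdoutPrediction, holdoutLabels)
print(holdoutConfusionMatrix)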

Comparison of holdout predictions and cross-validation predictions

print(confusionMatrix(bestModel), digits = 2)
## Cross-Validated (7 fold) Confusion Matrix 
## 
## (entries are percentages of table totals)
##  
##           Reference
## Prediction     1     2     3     4     5     6
##          1 15.23  0.17  0.19  0.00  0.00  0.00
##          2  1.06 13.90  0.65  0.01  0.00  0.03
##          3  0.39  0.54 12.57  0.00  0.00  0.00
##          4  0.00  0.00  0.00 14.77  2.06  0.04
##          5  0.00  0.00  0.00  2.04 16.62  0.33
##          6  0.00  0.00  0.00  0.65  0.01 18.74
print(100 * (holdoutConfusionMatrix$table/sum(holdoutConfusionMatrix$table)), 
    digits = 1)
## Error: object 'holdoutConfusionMatrix' not found
save.image(file = "workspaceImage.RData")