One of the most exciting areas in all of data science right now is wearable computing - see for example this article.
Data set information was download from http://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones#. "The experiments have been carried out with a group of 30 volunteers within an age bracket of 19-48 years. Each person performed six activities (WALKING, WALKING UPSTAIRS, WALKING DOWNSTAIRS, SITTING, STANDING, LAYING) wearing a smartphone (Samsung Galaxy S II) on the waist. Using its embedded accelerometer and gyroscope, we captured 3-axial linear acceleration and 3-axial angular velocity at a constant rate of 50Hz. The experiments have been video-recorded to label the data manually. The obtained dataset has been randomly partitioned into two sets, where 70% of the volunteers was selected for generating the training data and 30% the test data. The sensor signals (accelerometer and gyroscope) were pre-processed by applying noise filters and then sampled in fixed-width sliding windows of 2.56 sec and 50% overlap (128 readings/window). The sensor acceleration signal, which has gravitational and body motion components, was separated using a Butterworth low-pass filter into body acceleration and gravity. The gravitational force is assumed to have only low frequency components, therefore a filter with 0.3 Hz cutoff frequency was used. From each window, a vector of features was obtained by calculating variables from the time and frequency domain.”
For each record in the raw dataset it is provided:
* Triaxial acceleration from the accelerometer (total acceleration) and the estimated body acceleration.
* Triaxial Angular velocity from the gyroscope.
* A 561-feature vector with time and frequency domain variables.
* Its activity label.
* An identifier of the subject who carried out the experiment.
nsampleSize = -1L
train <- readData("UCI_HAR_Dataset/train", "train", nsampleSize)
test <- readData("UCI_HAR_Dataset/test", "test", nsampleSize)
# train <- activityLabel(train) test <- activityLabel(test)
train <- transform(train, subjectID = factor(subjectID), activityID = factor(activityID))
test <- transform(test, subjectID = factor(subjectID), activityID = factor(activityID))
train$partition = "train"
test$partition = "test"
samsungData <- rbind(train, test)
summary(samsungData$subjectID)
## 1 3 5 6 7 8 11 14 15 16 17 19 21 22 23 25 26 27
## 347 341 302 325 308 281 316 323 328 366 368 360 408 321 372 409 392 376
## 28 29 30 2 4 9 10 12 13 18 20 24
## 382 344 383 302 317 288 294 320 327 364 354 381
library(ggplot2)
qplot(data = samsungData, x = subjectID, fill = activityID)
qplot(data = samsungData, x = subjectID, fill = partition)
# grid.arrange(q1,q2,ncol=1) samsungData$part <-
# as.factor(samsungData$partition) qplot(data=samsungData, x=subjectID,
# fill=activityID,binwidth=0.5)
sub1 <- subset(samsungData, subjectID == 1)
par(mfrow = c(1, 2), mar = c(5, 4, 1, 1))
plot(sub1[, 1], col = sub1$activityID, ylab = names(sub1)[1])
plot(sub1[, 2], col = sub1$activityID, ylab = names(sub1)[2])
legend("bottomright", legend = unique(sub1$activity), col = unique(sub1$activity),
pch = 1)
par(mfrow = c(1, 1))
# source('myplclust.R')
distanceMatrix <- dist(sub1[, 1:3])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering, lab.col = unclass(sub1$activityID))
par(mfrow = c(1, 2))
plot(sub1[, 10], pch = 19, col = sub1$activityID, ylab = names(sub1)[10])
plot(sub1[, 11], pch = 19, col = sub1$activityID, ylab = names(sub1)[11])
par(mfrow = c(1, 1))
# source('myplclust.R')
distanceMatrix <- dist(sub1[, 10:12])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering, lab.col = unclass(sub1$activityID))
# svd1 = svd(scale(sub1[, -c(562, 563)]))
svd1 = svd(scale(sub1[, -c(562:564)]))
par(mfrow = c(1, 2))
plot(svd1$u[, 1], col = sub1$activityID, pch = 19)
plot(svd1$u[, 2], col = sub1$activityID, pch = 19)
par(mfrow = c(1, 1))
plot(svd1$v[, 2], pch = 19)
plot(sub1[, 296], col = sub1$activityID, ylab = names(sub1[296]))
maxContrib <- which.max(svd1$v[, 2])
distanceMatrix <- dist(sub1[, c(10:12, maxContrib)])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering, lab.col = unclass(sub1$activityID))
names(samsungData)[maxContrib]
## [1] "fBodyAcc.meanFreq...Z"
(topN <- order(svd1$v[, 2], decreasing = T)[1:5])
## [1] 296 249 210 223 74
n <- names(samsungData)[topN]
r <- svd1$v[, 2][topN]
rbind(n, r)
## [,1] [,2]
## n "fBodyAcc.meanFreq...Z" "tBodyGyroMag.arCoeff..1"
## r "0.12232138827709" "0.121065159901176"
## [,3] [,4]
## n "tBodyAccMag.arCoeff..1" "tGravityAccMag.arCoeff..1"
## r "0.11408410152257" "0.11408410152257"
## [,5]
## n "tGravityAcc.arCoeff...Z.1"
## r "0.112031658115089"
qplot(sub1[, 296], col = sub1$activityID, xlab = names(sub1[296]), binwidth = 2/30,
fill = sub1$activityID)
# kClust <- kmeans(sub1[, -c(562, 563)], centers = 6)
kClust <- kmeans(sub1[, -c(562:564)], centers = 6)
table(kClust$cluster, sub1$activityID)
##
## 1 2 3 4 5 6
## 1 0 0 0 11 3 14
## 2 0 0 0 0 0 27
## 3 0 53 0 0 0 0
## 4 0 0 0 34 50 0
## 5 0 0 0 2 0 9
## 6 95 0 49 0 0 0
# kClust <- kmeans(sub1[, -c(562, 563)], centers = 6, nstart = 1)
kClust <- kmeans(sub1[, -c(562:564)], centers = 6, nstart = 1)
table(kClust$cluster, sub1$activityID)
##
## 1 2 3 4 5 6
## 1 95 0 49 0 0 0
## 2 0 0 0 0 0 27
## 3 0 0 0 34 50 0
## 4 0 0 0 2 0 9
## 5 0 0 0 11 3 14
## 6 0 53 0 0 0 0
# kClust <- kmeans(sub1[, -c(562, 563)], centers = 6, nstart = 100)
kClust <- kmeans(sub1[, -c(562:564)], centers = 6, nstart = 100)
table(kClust$cluster, sub1$activityID)
##
## 1 2 3 4 5 6
## 1 0 0 49 0 0 0
## 2 0 53 0 0 0 3
## 3 0 0 0 10 2 18
## 4 0 0 0 0 0 29
## 5 95 0 0 0 0 0
## 6 0 0 0 37 51 0
##
library(plyr)
numPredictors = ncol(samsungData) - 3
dataSd = colwise(sd)(samsungData[, 1:numPredictors])
dataSd$stat = "Predictor Variable Standard Deviation"
dataMean = colwise(mean)(samsungData[, 1:numPredictors])
dataMean$stat = "Predictor Variable Mean"
library(reshape2)
temp = melt(rbind(dataMean, dataSd), c("stat"))
qplot(data = temp, x = value, binwidth = 0.025) + facet_wrap(~stat, ncol = 1)
remove(dataSd, dataMean, temp)
If each variable was z-scaled, the mean would be approximately zero and the standard deviation would be 1. These variables may be normalized, but they are not z-scaled. If we intend to use modeling methods that are sensitive to feature scaling, we might want to do some preprocessing.
Looking one subject
caret package offers several options for preprocessing continuous variables such as the predictors in the UCI HAR dataset. We will prepare several different versions of the predictor matrix to compare how these perform when we build a predictive model.
library(lattice)
library(caret)
zScaleTrain = preProcess(train[, 1:numPredictors])
scaledX = predict(zScaleTrain, train[, 1:numPredictors])
head(names(scaledX))
## [1] "tBodyAcc.mean...X" "tBodyAcc.mean...Y" "tBodyAcc.mean...Z"
## [4] "tBodyAcc.std...X" "tBodyAcc.std...Y" "tBodyAcc.std...Z"
nzv = nearZeroVar(scaledX, saveMetrics = TRUE)
summary(nzv)
## freqRatio percentUnique zeroVar nzv
## Min. : 1.00 Min. : 0.34 Mode :logical Mode :logical
## 1st Qu.: 1.00 1st Qu.: 96.03 FALSE:561 FALSE:561
## Median : 1.00 Median : 99.95 NA's :0 NA's :0
## Mean : 2.05 Mean : 92.42
## 3rd Qu.: 1.25 3rd Qu.: 99.99
## Max. :207.73 Max. :100.00
head(nzv[order(nzv$percentUnique, decreasing = FALSE), ], n = 20)
## freqRatio percentUnique zeroVar nzv
## fBodyGyro.maxInds.Z 1.574 0.3400 FALSE FALSE
## fBodyAcc.maxInds.Y 1.999 0.3536 FALSE FALSE
## fBodyAcc.maxInds.Z 1.645 0.3536 FALSE FALSE
## fBodyGyro.maxInds.X 1.505 0.3672 FALSE FALSE
## fBodyBodyGyroMag.maxInds 2.544 0.3672 FALSE FALSE
## fBodyAcc.maxInds.X 1.155 0.3945 FALSE FALSE
## fBodyGyro.maxInds.Y 1.227 0.3945 FALSE FALSE
## fBodyAccMag.maxInds 1.794 0.3945 FALSE FALSE
## fBodyAccJerk.maxInds.X 1.238 0.6529 FALSE FALSE
## fBodyAccJerk.maxInds.Y 1.084 0.6529 FALSE FALSE
## fBodyAccJerk.maxInds.Z 1.018 0.6665 FALSE FALSE
## fBodyBodyGyroJerkMag.maxInds 1.073 0.7073 FALSE FALSE
## fBodyBodyAccJerkMag.maxInds 1.422 0.7753 FALSE FALSE
## tGravityAcc.entropy...Y 207.731 16.0365 FALSE FALSE
## tGravityAcc.entropy...Z 114.581 36.8607 FALSE FALSE
## tGravityAcc.entropy...X 111.357 43.0903 FALSE FALSE
## fBodyAccJerk.entropy...X 15.613 45.0626 FALSE FALSE
## fBodyAccJerk.entropy...Z 6.303 45.2258 FALSE FALSE
## fBodyAccJerk.entropy...Y 12.491 45.6882 FALSE FALSE
## fBodyBodyAccJerkMag.entropy.. 9.534 46.1915 FALSE FALSE
head(nzv[order(nzv$freqRatio, decreasing = TRUE), ], n = 20)
## freqRatio percentUnique zeroVar nzv
## tGravityAcc.entropy...Y 207.731 16.0365 FALSE FALSE
## tGravityAcc.entropy...Z 114.581 36.8607 FALSE FALSE
## tGravityAcc.entropy...X 111.357 43.0903 FALSE FALSE
## fBodyAccJerk.entropy...X 15.613 45.0626 FALSE FALSE
## fBodyAccJerk.entropy...Y 12.491 45.6882 FALSE FALSE
## fBodyBodyAccJerkMag.entropy.. 9.534 46.1915 FALSE FALSE
## tBodyGyro.entropy...X 6.440 81.0800 FALSE FALSE
## tBodyGyro.entropy...Z 6.345 74.1567 FALSE FALSE
## tBodyGyro.entropy...Y 6.316 81.7057 FALSE FALSE
## fBodyAccJerk.entropy...Z 6.303 45.2258 FALSE FALSE
## tBodyAcc.entropy...Z 3.833 87.8945 FALSE FALSE
## fBodyBodyGyroJerkMag.entropy.. 3.655 50.4081 FALSE FALSE
## fBodyBodyGyroMag.maxInds 2.544 0.3672 FALSE FALSE
## tBodyAcc.std...Y 2.000 99.9864 FALSE FALSE
## tBodyAcc.mad...Z 2.000 99.9864 FALSE FALSE
## tBodyAcc.max...X 2.000 70.9875 FALSE FALSE
## tBodyAcc.max...Y 2.000 70.7835 FALSE FALSE
## tBodyAcc.sma.. 2.000 99.9864 FALSE FALSE
## tBodyAcc.arCoeff...X.3 2.000 99.9864 FALSE FALSE
## tBodyAcc.arCoeff...Z.3 2.000 99.9864 FALSE FALSE
# correlatedPredictors = findCorrelation(cor(scaledX), cutoff = 0.95)
correlatedPredictors = findCorrelation(cor(scaledX), cutoff = 0.8)
There are 389 correlated predictors to remove
reducedCorrelationX = scaledX[, -correlatedPredictors]
head(names(reducedCorrelationX))
## [1] "tBodyAcc.mean...X" "tBodyAcc.mean...Y"
## [3] "tBodyAcc.mean...Z" "tBodyAcc.entropy...X"
## [5] "tBodyAcc.entropy...Z" "tBodyAcc.arCoeff...X.3"
The reduced correlation predictor set retains 7352, 172 variables.
pcaTrain = preProcess(scaledX, method = "pca", thresh = 0.99)
The PCA transformed data retains components to capture 99% of the variance.
pcaX = predict(pcaTrain, scaledX)
head(names(pcaX))
## [1] "PC1" "PC2" "PC3" "PC4" "PC5" "PC6"
After PCA, the original predictor names are no longer available in a straightforward manner.
An important aspect of predictive modeling is ensuring that we can accurately predict model performance on unseen data; when we deploy our model, our reputation and that of our employer are often impacted by how our model performs. For the UCI HAR data, our test set is a strict hold-out sample. We will not use this set for model development or model selection.
Our training data set is not tiny, but neither is it large. We have data examples from a limited number of experimental subjects. My technical approach will be to use cross-validation by experimental subject for model selection. There are 21 training subjects. If we wish to cross-validate over every experimental subject, we can generate sets of training sample indices like this:
leaveOneSubjectOutIndices = lapply(levels(train$subject), function(X) {
which(!X == train$subject)
})
If instead we want to control computation time, we can create a different partition.
cvBreaks = 7
temp = sample(levels(train$subjectID), length(levels(train$subjectID))) # randomize subjects
temp = split(temp, cut(1:length(temp), breaks = cvBreaks, labels = FALSE)) # split into CV groups
cvGroupIndices = lapply(temp, function(X) {
which(!train$subject %in% X)
})
library(parallel)
cl = parallel::makeForkCluster(nnodes = detectCores()/2)
setDefaultCluster(cl)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
registerDoParallel(cl)
getDoParWorkers()
## [1] 4
saveFile = paste(DataDirectory, "modelRF.RData", sep = "")
if (!file.exists(saveFile)) {
rfCtrl = trainControl(method = "cv", number = length(cvGroupIndices), index = cvGroupIndices,
classProbs = TRUE)
modelRF = train(reducedCorrelationX, train$activityID, method = "rf", trControl = rfCtrl,
tuneGrid = data.frame(.mtry = c(2, 5, 10, 15, 20)), importance = TRUE)
save(rfCtrl, modelRF, correlatedPredictors, zScaleTrain, file = saveFile)
}
if (!exists("modelRF")) {
load(saveFile)
}
print(modelRF)
## Random Forest
##
## 7352 samples
## 172 predictors
## 6 classes: '1', '2', '3', '4', '5', '6'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
##
## Summary of sample sizes: 6243, 6243, 6348, 6365, 6321, 6295, ...
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9 0.9 0.04 0.05
## 5 0.9 0.9 0.04 0.05
## 10 0.9 0.9 0.03 0.04
## 20 0.9 0.9 0.03 0.04
## 20 0.9 0.9 0.03 0.04
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
plot(modelRF)
print(confusionMatrix(modelRF))
## Cross-Validated (7 fold) Confusion Matrix
##
## (entries are percentages of table totals)
##
## Reference
## Prediction 1 2 3 4 5 6
## 1 15.2 0.2 0.2 0.0 0.0 0.0
## 2 1.1 13.9 0.6 0.0 0.0 0.0
## 3 0.4 0.5 12.6 0.0 0.0 0.0
## 4 0.0 0.0 0.0 14.8 2.1 0.0
## 5 0.0 0.0 0.0 2.0 16.6 0.3
## 6 0.0 0.0 0.0 0.6 0.0 18.7
m = as.data.frame(modelRF$finalModel$importance)
m = m[order(m$MeanDecreaseAccuracy, decreasing = TRUE), ]
head(m, n = 20)
## 1 2 3 4
## tGravityAcc.energy...Y 0.034264 0.063421 0.0555405 0.045001
## tGravityAcc.energy...Z 0.009105 0.004639 0.0067839 0.024489
## angle.Z.gravityMean. 0.005881 0.023636 0.0049358 0.031425
## fBodyAccJerk.bandsEnergy...17.24 0.042882 0.039346 0.0454118 0.025874
## fBodyGyro.bandsEnergy...9.16.2 0.032239 0.034152 0.0772552 0.017559
## fBodyGyro.bandsEnergy...17.24.2 0.035457 0.025070 0.0431805 0.018291
## fBodyGyro.bandsEnergy...9.16 0.036729 0.027841 0.0228709 0.051921
## fBodyAccJerk.bandsEnergy...25.32 0.029549 0.032032 0.0522116 0.021050
## tBodyAcc.correlation...X.Y 0.052318 0.028511 0.0598720 0.008411
## fBodyGyro.bandsEnergy...1.8.1 0.029698 0.036682 0.0281993 0.014587
## fBodyAccJerk.bandsEnergy...33.40.1 0.037173 0.024114 0.0303588 0.019923
## fBodyAcc.bandsEnergy...17.24.1 0.044781 0.019821 0.0303136 0.014032
## fBodyGyro.bandsEnergy...1.8 0.024628 0.010685 0.0176389 0.056234
## fBodyAccJerk.bandsEnergy...49.64 0.021262 0.023390 0.0402193 0.017677
## tGravityAcc.arCoeff...X.1 0.049258 0.036772 0.0274978 0.004721
## tGravityAcc.entropy...Y 0.001623 0.002296 0.0005602 0.042153
## tBodyGyroJerkMag.min.. 0.022149 0.023322 0.0179692 0.020453
## tBodyAcc.entropy...X 0.063651 0.026901 0.0063441 0.008339
## tGravityAcc.sma.. 0.010058 0.018154 0.0170301 0.022645
## fBodyGyro.bandsEnergy...17.24 0.032877 0.016656 0.0165683 0.020406
## 5 6 MeanDecreaseAccuracy
## tGravityAcc.energy...Y 0.061561 0.191651 0.07842
## tGravityAcc.energy...Z 0.045944 0.119272 0.03883
## angle.Z.gravityMean. 0.054067 0.093275 0.03853
## fBodyAccJerk.bandsEnergy...17.24 0.029863 0.029942 0.03476
## fBodyGyro.bandsEnergy...9.16.2 0.019566 0.020594 0.03133
## fBodyGyro.bandsEnergy...17.24.2 0.029018 0.026455 0.02901
## fBodyGyro.bandsEnergy...9.16 0.019537 0.014403 0.02875
## fBodyAccJerk.bandsEnergy...25.32 0.018897 0.019869 0.02762
## tBodyAcc.correlation...X.Y 0.013157 0.008402 0.02641
## fBodyGyro.bandsEnergy...1.8.1 0.023787 0.021601 0.02525
## fBodyAccJerk.bandsEnergy...33.40.1 0.021345 0.017637 0.02464
## fBodyAcc.bandsEnergy...17.24.1 0.020564 0.015508 0.02365
## fBodyGyro.bandsEnergy...1.8 0.014156 0.010916 0.02256
## fBodyAccJerk.bandsEnergy...49.64 0.016705 0.017302 0.02190
## tGravityAcc.arCoeff...X.1 0.007912 0.007201 0.02093
## tGravityAcc.entropy...Y 0.047036 0.019427 0.02057
## tBodyGyroJerkMag.min.. 0.015721 0.019468 0.01975
## tBodyAcc.entropy...X 0.011795 0.001072 0.01925
## tGravityAcc.sma.. 0.017383 0.025098 0.01862
## fBodyGyro.bandsEnergy...17.24 0.012342 0.011745 0.01826
## MeanDecreaseGini
## tGravityAcc.energy...Y 397.44
## tGravityAcc.energy...Z 206.51
## angle.Z.gravityMean. 214.41
## fBodyAccJerk.bandsEnergy...17.24 108.84
## fBodyGyro.bandsEnergy...9.16.2 101.53
## fBodyGyro.bandsEnergy...17.24.2 93.29
## fBodyGyro.bandsEnergy...9.16 109.13
## fBodyAccJerk.bandsEnergy...25.32 98.29
## tBodyAcc.correlation...X.Y 140.04
## fBodyGyro.bandsEnergy...1.8.1 87.53
## fBodyAccJerk.bandsEnergy...33.40.1 74.94
## fBodyAcc.bandsEnergy...17.24.1 76.29
## fBodyGyro.bandsEnergy...1.8 93.70
## fBodyAccJerk.bandsEnergy...49.64 71.89
## tGravityAcc.arCoeff...X.1 120.16
## tGravityAcc.entropy...Y 113.38
## tBodyGyroJerkMag.min.. 71.13
## tBodyAcc.entropy...X 66.22
## tGravityAcc.sma.. 109.37
## fBodyGyro.bandsEnergy...17.24 69.80
saveFile = paste(DataDirectory, "modelParRF.RData", sep = "")
if (!file.exists(saveFile)) {
parRfCtrl = trainControl(method = "cv", number = length(cvGroupIndices),
index = cvGroupIndices, classProbs = TRUE)
modelParRF = train(reducedCorrelationX, train$activityID, method = "parRF",
trControl = parRfCtrl, tuneGrid = data.frame(.mtry = c(2, 5, 10, 15,
20)), importance = TRUE)
save(parRfCtrl, modelParRF, correlatedPredictors, zScaleTrain, file = saveFile)
}
## Loading required package: randomForest
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
## Warning: At least one of the class levels are not valid R variables names; This may cause errors if class probabilities are generated because the variables names will be converted to: X1, X2, X3, X4, X5, X6
## Warning: There were missing values in resampled performance measures.
## Warning: missing values found in aggregated results
## Error: final tuning parameters could not be determined
if (!exists("modelParRF")) {
load(saveFile)
}
## Warning: cannot open compressed file 'UCI_HAR_Dataset/modelParRF.RData',
## probable reason 'No such file or directory'
## Error: cannot open the connection
print(modelParRF)
## Error: object 'modelParRF' not found
plot(modelParRF)
## Error: object 'modelParRF' not found
print(confusionMatrix(modelParRF))
## Error: object 'modelParRF' not found
saveFile = paste(DataDirectory, "modelKnn.RData", sep = "")
if (!file.exists(saveFile)) {
knnCtrl = trainControl(method = "cv", number = length(cvGroupIndices), index = cvGroupIndices,
classProbs = TRUE)
modelKnn = train(reducedCorrelationX, train$activityID, method = "knn",
trControl = knnCtrl, tuneGrid = data.frame(.k = c(5, 10, 15, 20)))
save(knnCtrl, modelKnn, correlatedPredictors, zScaleTrain, file = saveFile)
}
if (!exists("modelKnn")) {
load(saveFile)
}
print(modelKnn)
## k-Nearest Neighbors
##
## 7352 samples
## 172 predictors
## 6 classes: '1', '2', '3', '4', '5', '6'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
##
## Summary of sample sizes: 6243, 6243, 6348, 6365, 6321, 6295, ...
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa Accuracy SD Kappa SD
## 5 0.8 0.8 0.04 0.05
## 10 0.8 0.8 0.04 0.04
## 20 0.8 0.8 0.04 0.05
## 20 0.8 0.8 0.04 0.05
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 20.
confusionMatrix(modelKnn)
## Cross-Validated (7 fold) Confusion Matrix
##
## (entries are percentages of table totals)
##
## Reference
## Prediction 1 2 3 4 5 6
## 1 15.7 1.1 2.2 0.0 0.0 0.0
## 2 0.6 13.3 1.8 0.0 0.0 0.0
## 3 0.4 0.2 9.4 0.0 0.0 0.0
## 4 0.0 0.0 0.0 12.4 1.4 1.3
## 5 0.0 0.0 0.0 4.9 17.2 1.0
## 6 0.0 0.0 0.0 0.2 0.0 16.8
For simplicity, we will choose the best model based on overall cross-validation accuracy which leaves us with one of the random forest models.
bestModel = modelRF
holdoutX = predict(zScaleTrain, test[, 1:numPredictors])[, -correlatedPredictors]
holdoutLabels = test$ActivityID
holdoutPrediction = predict(bestModel, holdoutX)
head(holdoutPrediction)
## [1] 5 5 5 5 5 5
## Levels: 1 2 3 4 5 6
classProbPrediction = predict(bestModel, holdoutX, type = "prob")
## Error: undefined columns selected
head(classProbPrediction)
## Error: object 'classProbPrediction' not found
holdoutConfusionMatrix = confusionMatrix(holdoutPrediction, holdoutLabels)
## Error: the data and reference factors must have the same number of levels
print(holdoutConfusionMatrix)
## Error: object 'holdoutConfusionMatrix' not found
print(confusionMatrix(bestModel), digits = 2)
## Cross-Validated (7 fold) Confusion Matrix
##
## (entries are percentages of table totals)
##
## Reference
## Prediction 1 2 3 4 5 6
## 1 15.23 0.17 0.19 0.00 0.00 0.00
## 2 1.06 13.90 0.65 0.01 0.00 0.03
## 3 0.39 0.54 12.57 0.00 0.00 0.00
## 4 0.00 0.00 0.00 14.77 2.06 0.04
## 5 0.00 0.00 0.00 2.04 16.62 0.33
## 6 0.00 0.00 0.00 0.65 0.01 18.74
print(100 * (holdoutConfusionMatrix$table/sum(holdoutConfusionMatrix$table)),
digits = 1)
## Error: object 'holdoutConfusionMatrix' not found
save.image(file = "workspaceImage.RData")