1. Load the data
library(caret)
library(RWeka)
library(Amelia)
library(ggplot2)
set.seed(12345)
url1 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
#download.file(url1,method="curl",destfile="/Users/alex/Documents/R directory/Machine learning/training.csv")
#download.file(url2,method="curl",destfile="/Users/alex/Documents/R directory/Machine learning/testing.csv")
fileDownload <- date()
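If the files are already on disk, the download step can be skipped; a minimal sketch, assuming the same local paths as above:
# Download each file only if no local copy exists yet
dataDir <- "/Users/alex/Documents/R directory/Machine learning"
if (!file.exists(file.path(dataDir, "training.csv")))
    download.file(url1, destfile = file.path(dataDir, "training.csv"), method = "curl")
if (!file.exists(file.path(dataDir, "testing.csv")))
    download.file(url2, destfile = file.path(dataDir, "testing.csv"), method = "curl")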
2. Read the files into separate tables.
training <- read.csv("/Users/alex/Documents/R directory/Machine learning/training.csv")
test <- read.csv("/Users/alex/Documents/R directory/Machine learning/testing.csv")
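Depending on how missing data are encoded in the raw CSVs, blank fields and spreadsheet error strings can be mapped to NA at read time; a sketch, assuming such strings occur in the files (note this would change the NA counts below):
#training <- read.csv("/Users/alex/Documents/R directory/Machine learning/training.csv",
#                     na.strings = c("NA", "", "#DIV/0!"))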
3. Clean the dataset
## Assign factor levels to the predicted variable
training$classe <- as.factor(training$classe)
# Visualize missing values
missmap(training, main="Missingness map of the Human Activity Recognition training dataset", col=c("salmon", "purple"), legend=FALSE)

## Remove columns that consist almost entirely of missing values
table(colSums(is.na(training)))
##
##     0 19216
##    93    67
# 93 columns contain no NAs; the other 67 are NA in 19216 of the 19622 rows
miss <- colnames(training)[colSums(is.na(training)) > 0]
training <- training[,!colnames(training) %in% miss]
training <- na.omit(training) # safety net; no rows with NAs remain at this point
## Remove near-zero-variance predictors
nzv <- nearZeroVar(training)
training <- training[,-nzv]
# Drop the first six bookkeeping columns (row index, user name, timestamps, window number)
training <- training[,-c(1:6)]
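# For reference, the diagnostics behind this filter can be inspected directly
# (run on the data before the nzv columns are dropped):
#nzvMetrics <- nearZeroVar(training, saveMetrics = TRUE)
#head(nzvMetrics[nzvMetrics$nzv, ])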
str(training)
## 'data.frame': 19622 obs. of 53 variables:
## $ roll_belt : num 1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
## $ pitch_belt : num 8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
## $ yaw_belt : num -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
## $ total_accel_belt : int 3 3 3 3 3 3 3 3 3 3 ...
## $ gyros_belt_x : num 0 0.02 0 0.02 0.02 0.02 0.02 0.02 0.02 0.03 ...
## $ gyros_belt_y : num 0 0 0 0 0.02 0 0 0 0 0 ...
## $ gyros_belt_z : num -0.02 -0.02 -0.02 -0.03 -0.02 -0.02 -0.02 -0.02 -0.02 0 ...
## $ accel_belt_x : int -21 -22 -20 -22 -21 -21 -22 -22 -20 -21 ...
## $ accel_belt_y : int 4 4 5 3 2 4 3 4 2 4 ...
## $ accel_belt_z : int 22 22 23 21 24 21 21 21 24 22 ...
## $ magnet_belt_x : int -3 -7 -2 -6 -6 0 -4 -2 1 -3 ...
## $ magnet_belt_y : int 599 608 600 604 600 603 599 603 602 609 ...
## $ magnet_belt_z : int -313 -311 -305 -310 -302 -312 -311 -313 -312 -308 ...
## $ roll_arm : num -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 ...
## $ pitch_arm : num 22.5 22.5 22.5 22.1 22.1 22 21.9 21.8 21.7 21.6 ...
## $ yaw_arm : num -161 -161 -161 -161 -161 -161 -161 -161 -161 -161 ...
## $ total_accel_arm : int 34 34 34 34 34 34 34 34 34 34 ...
## $ gyros_arm_x : num 0 0.02 0.02 0.02 0 0.02 0 0.02 0.02 0.02 ...
## $ gyros_arm_y : num 0 -0.02 -0.02 -0.03 -0.03 -0.03 -0.03 -0.02 -0.03 -0.03 ...
## $ gyros_arm_z : num -0.02 -0.02 -0.02 0.02 0 0 0 0 -0.02 -0.02 ...
## $ accel_arm_x : int -288 -290 -289 -289 -289 -289 -289 -289 -288 -288 ...
## $ accel_arm_y : int 109 110 110 111 111 111 111 111 109 110 ...
## $ accel_arm_z : int -123 -125 -126 -123 -123 -122 -125 -124 -122 -124 ...
## $ magnet_arm_x : int -368 -369 -368 -372 -374 -369 -373 -372 -369 -376 ...
## $ magnet_arm_y : int 337 337 344 344 337 342 336 338 341 334 ...
## $ magnet_arm_z : int 516 513 513 512 506 513 509 510 518 516 ...
## $ roll_dumbbell : num 13.1 13.1 12.9 13.4 13.4 ...
## $ pitch_dumbbell : num -70.5 -70.6 -70.3 -70.4 -70.4 ...
## $ yaw_dumbbell : num -84.9 -84.7 -85.1 -84.9 -84.9 ...
## $ total_accel_dumbbell: int 37 37 37 37 37 37 37 37 37 37 ...
## $ gyros_dumbbell_x : num 0 0 0 0 0 0 0 0 0 0 ...
## $ gyros_dumbbell_y : num -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 ...
## $ gyros_dumbbell_z : num 0 0 0 -0.02 0 0 0 0 0 0 ...
## $ accel_dumbbell_x : int -234 -233 -232 -232 -233 -234 -232 -234 -232 -235 ...
## $ accel_dumbbell_y : int 47 47 46 48 48 48 47 46 47 48 ...
## $ accel_dumbbell_z : int -271 -269 -270 -269 -270 -269 -270 -272 -269 -270 ...
## $ magnet_dumbbell_x : int -559 -555 -561 -552 -554 -558 -551 -555 -549 -558 ...
## $ magnet_dumbbell_y : int 293 296 298 303 292 294 295 300 292 291 ...
## $ magnet_dumbbell_z : num -65 -64 -63 -60 -68 -66 -70 -74 -65 -69 ...
## $ roll_forearm : num 28.4 28.3 28.3 28.1 28 27.9 27.9 27.8 27.7 27.7 ...
## $ pitch_forearm : num -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.8 -63.8 -63.8 ...
## $ yaw_forearm : num -153 -153 -152 -152 -152 -152 -152 -152 -152 -152 ...
## $ total_accel_forearm : int 36 36 36 36 36 36 36 36 36 36 ...
## $ gyros_forearm_x : num 0.03 0.02 0.03 0.02 0.02 0.02 0.02 0.02 0.03 0.02 ...
## $ gyros_forearm_y : num 0 0 -0.02 -0.02 0 -0.02 0 -0.02 0 0 ...
## $ gyros_forearm_z : num -0.02 -0.02 0 0 -0.02 -0.03 -0.02 0 -0.02 -0.02 ...
## $ accel_forearm_x : int 192 192 196 189 189 193 195 193 193 190 ...
## $ accel_forearm_y : int 203 203 204 206 206 203 205 205 204 205 ...
## $ accel_forearm_z : int -215 -216 -213 -214 -214 -215 -215 -213 -214 -215 ...
## $ magnet_forearm_x : int -17 -18 -18 -16 -17 -9 -18 -9 -16 -22 ...
## $ magnet_forearm_y : num 654 661 658 658 655 660 659 660 653 656 ...
## $ magnet_forearm_z : num 476 473 469 469 473 478 470 474 476 473 ...
## $ classe : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
4. Model training
inTrain <- createDataPartition(y=training$classe,p=.7,list=F)
testing1 <- training[-inTrain,]
training1 <- training[inTrain,]
nrow(training1)
## [1] 13737
nrow(testing1)
## [1] 5885
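Because createDataPartition samples within each level of classe, the class balance should be preserved in both partitions; a quick check:
# Class proportions should be nearly identical across the two partitions
round(rbind(training1 = prop.table(table(training1$classe)),
            testing1  = prop.table(table(testing1$classe))), 3)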
# A trainControl object for 3-fold cross-validation was set up; note, however,
# that it was not passed to train() below, so the saved model was tuned with
# caret's default bootstrap resampling (25 reps), as the print output confirms:
#fitControl <- trainControl(method = "cv", number = 3)
#fit <- train(classe~.,method="gbm",data=training1,verbose=F,tuneLength = 5)
#saveRDS(fit, "gbm.RDS")
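# For reference, a sketch of the cross-validated variant; passing
# trControl = fitControl is what would actually enable the 3 folds:
#fitCV <- train(classe ~ ., method = "gbm", data = training1,
#               trControl = fitControl, verbose = FALSE, tuneLength = 5)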
modelFit <- readRDS("gbm.RDS")
varImp(modelFit$finalModel, scale=F)
## Overall
## roll_belt 3004.093477
## pitch_belt 679.993968
## yaw_belt 1648.889106
## total_accel_belt 20.856205
## gyros_belt_x 5.491205
## gyros_belt_y 86.565344
## gyros_belt_z 434.596881
## accel_belt_x 6.604805
## accel_belt_y 42.358991
## accel_belt_z 191.374454
## magnet_belt_x 197.878859
## magnet_belt_y 210.818436
## magnet_belt_z 623.077487
## roll_arm 117.009951
## pitch_arm 20.422464
## yaw_arm 333.345001
## total_accel_arm 33.315643
## gyros_arm_x 39.760450
## gyros_arm_y 122.725285
## gyros_arm_z 0.000000
## accel_arm_x 76.385023
## accel_arm_y 52.890282
## accel_arm_z 22.002730
## magnet_arm_x 72.423170
## magnet_arm_y 159.564827
## magnet_arm_z 185.908975
## roll_dumbbell 456.561032
## pitch_dumbbell 30.514983
## yaw_dumbbell 99.766805
## total_accel_dumbbell 112.713723
## gyros_dumbbell_x 80.708692
## gyros_dumbbell_y 278.494692
## gyros_dumbbell_z 20.389388
## accel_dumbbell_x 217.137012
## accel_dumbbell_y 294.569673
## accel_dumbbell_z 341.248980
## magnet_dumbbell_x 252.266311
## magnet_dumbbell_y 1013.621658
## magnet_dumbbell_z 1106.217746
## roll_forearm 807.672343
## pitch_forearm 1577.666871
## yaw_forearm 17.568953
## total_accel_forearm 41.468195
## gyros_forearm_x 9.576312
## gyros_forearm_y 50.887809
## gyros_forearm_z 29.283814
## accel_forearm_x 311.183222
## accel_forearm_y 74.968003
## accel_forearm_z 369.010177
## magnet_forearm_x 145.863822
## magnet_forearm_y 60.842073
## magnet_forearm_z 358.652175
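The ranking is easier to read graphically; a sketch using caret's plot method for variable importance (top sets how many predictors are shown):
# Plot the 20 most influential predictors of the boosted model
plot(varImp(modelFit), top = 20)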
print(modelFit)
## Stochastic Gradient Boosting
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 13737, 13737, 13737, 13737, 13737, 13737, ...
##
## Resampling results across tuning parameters:
##
##   interaction.depth  n.trees  Accuracy   Kappa      Accuracy SD  Kappa SD
##   1                   50      0.7516434  0.6849270  0.009168248  0.011676189
##   1                  100      0.8196163  0.7716189  0.005586482  0.007064147
##   1                  150      0.8506829  0.8109837  0.005845992  0.007375821
##   1                  200      0.8704462  0.8360045  0.005620265  0.007091741
##   1                  250      0.8839624  0.8530955  0.004864760  0.006135850
##   2                   50      0.8537421  0.8146310  0.005198881  0.006560892
##   2                  100      0.9034158  0.8777253  0.004886976  0.006174827
##   2                  150      0.9282319  0.9091575  0.004043820  0.005120780
##   2                  200      0.9443170  0.9295246  0.003364400  0.004246717
##   2                  250      0.9556296  0.9438435  0.002573835  0.003249857
##   3                   50      0.8931105  0.8646357  0.006337308  0.008019292
##   3                  100      0.9385834  0.9222610  0.004063111  0.005130304
##   3                  150      0.9585646  0.9475615  0.002034929  0.002559469
##   3                  200      0.9682432  0.9598137  0.002076093  0.002613941
##   3                  250      0.9744680  0.9676934  0.002215972  0.002791496
##   4                   50      0.9193958  0.8979421  0.004746302  0.005992431
##   4                  100      0.9566473  0.9451295  0.003316027  0.004182738
##   4                  150      0.9704781  0.9626419  0.002475504  0.003126037
##   4                  200      0.9775544  0.9715989  0.002262608  0.002854093
##   4                  250      0.9817048  0.9768510  0.002065833  0.002609490
##   5                   50      0.9364768  0.9195725  0.003606252  0.004569988
##   5                  100      0.9665538  0.9576745  0.002197911  0.002754739
##   5                  150      0.9767470  0.9705757  0.001904366  0.002403089
##   5                  200      0.9820477  0.9772850  0.001886927  0.002377884
##   5                  250      0.9846874  0.9806250  0.001495849  0.001890803
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 250,
## interaction.depth = 5, shrinkage = 0.1 and n.minobsinnode = 10.
# Plot the model's accuracy across the tuning grid:
plot(modelFit, metric="Accuracy")
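Since ggplot2 is already loaded, the same tuning profile can also be drawn with caret's ggplot method for train objects:
# ggplot2 view of accuracy across boosting iterations and tree depth
ggplot(modelFit, metric = "Accuracy")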

5. Decision tree trial using Recursive Partitioning and Regression Trees
library(rpart) # Recursive Partitioning and Regression Trees
library(rpart.plot) # Decision Tree plot
treeModel <- rpart(classe ~ ., data=training1, method="class")
# Predicting:
treePrediction <- predict(treeModel, testing1, type = "class")
# Plot of the Decision Tree
rpart.plot(treeModel, main="Classification Tree", extra=102, under=TRUE, faclen=0)
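Before trusting the tree, rpart's complexity-parameter table can show whether pruning would help; a quick sketch using the package's built-in diagnostics:
# Cross-validated error (xerror) by complexity parameter; a smaller tree
# with comparable xerror would be a candidate for pruning
printcp(treeModel)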

confusionMatrix(treePrediction, testing1$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1498 196 69 106 25
## B 42 669 85 86 92
## C 43 136 739 129 131
## D 33 85 98 553 44
## E 58 53 35 90 790
##
## Overall Statistics
##
## Accuracy : 0.722
## 95% CI : (0.7104, 0.7334)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6467
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8949 0.5874 0.7203 0.57365 0.7301
## Specificity 0.9060 0.9357 0.9097 0.94717 0.9509
## Pos Pred Value 0.7909 0.6869 0.6273 0.68020 0.7700
## Neg Pred Value 0.9559 0.9043 0.9390 0.91897 0.9399
## Prevalence 0.2845 0.1935 0.1743 0.16381 0.1839
## Detection Rate 0.2545 0.1137 0.1256 0.09397 0.1342
## Detection Prevalence 0.3218 0.1655 0.2002 0.13815 0.1743
## Balanced Accuracy 0.9004 0.7615 0.8150 0.76041 0.8405
6. Outcome prediction using Stochastic Gradient Boosting
The single tree's ~72% held-out accuracy falls well short of the boosted model's resampling estimate, so the GBM fit is used for the final predictions:
predictions <- predict(modelFit,testing1)
confusionMatrix(predictions,testing1$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1670 10 0 0 0
## B 4 1119 17 0 1
## C 0 10 1005 18 1
## D 0 0 4 944 5
## E 0 0 0 2 1075
##
## Overall Statistics
##
## Accuracy : 0.9878
## 95% CI : (0.9846, 0.9904)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9845
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9976 0.9824 0.9795 0.9793 0.9935
## Specificity 0.9976 0.9954 0.9940 0.9982 0.9996
## Pos Pred Value 0.9940 0.9807 0.9720 0.9906 0.9981
## Neg Pred Value 0.9990 0.9958 0.9957 0.9959 0.9985
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2838 0.1901 0.1708 0.1604 0.1827
## Detection Prevalence 0.2855 0.1939 0.1757 0.1619 0.1830
## Balanced Accuracy 0.9976 0.9889 0.9868 0.9887 0.9966
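The expected out-of-sample error follows directly from the held-out accuracy above, roughly 1.2%:
# Estimated out-of-sample error = 1 - held-out accuracy
cm <- confusionMatrix(predictions, testing1$classe)
round(1 - as.numeric(cm$overall["Accuracy"]), 4)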
predictfinal <- predict(modelFit, test)
predictfinal
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
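Finally, a minimal sketch for saving the 20 predictions to individual files; the writePredictions helper below is hypothetical, not part of the original analysis:
# Hypothetical helper: write each prediction to its own problem_id_<i>.txt
writePredictions <- function(preds) {
  for (i in seq_along(preds)) {
    write.table(as.character(preds[i]), file = paste0("problem_id_", i, ".txt"),
                quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
#writePredictions(predictfinal)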