1. Load the data

library(caret)
library(RWeka)
library(Amelia)
library(ggplot2)

# Fix the RNG seed so later random steps (e.g. createDataPartition) are
# repeatable between runs.
set.seed(12345)
url1 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
url2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Local copies of the two CSV files (same locations the read step uses).
# NOTE(review): these are machine-specific absolute paths; the directory must
# already exist for the download to succeed.
training_csv <- "/Users/alex/Documents/R directory/Machine learning/training.csv"
testing_csv <- "/Users/alex/Documents/R directory/Machine learning/testing.csv"

# Download each file only when it is not already on disk, so the script runs
# from scratch without re-fetching on every execution.
if (!file.exists(training_csv)) {
  download.file(url1, destfile = training_csv, method = "curl")
}
if (!file.exists(testing_csv)) {
  download.file(url2, destfile = testing_csv, method = "curl")
}

# Record when the training file was actually written, not when the script
# happens to run. Formatted like the string returned by date().
fileDownload <- format(file.mtime(training_csv), "%a %b %e %H:%M:%S %Y")

2. Read the files into separate tables.

# Load the labelled data set used for model building.
training <- read.csv(
  file = "/Users/alex/Documents/R directory/Machine learning/training.csv"
)
# Load the unlabelled cases that the final model is applied to.
test <- read.csv(
  file = "/Users/alex/Documents/R directory/Machine learning/testing.csv"
)

3. Clean the dataset

# Treat the outcome column as a categorical variable.
training[["classe"]] <- as.factor(training[["classe"]])

# Draw a map of missing vs. observed cells for the training data.
missmap(
  training,
  legend = FALSE,
  col = c("salmon", "purple"),
  main = "Missings Map in Human Activity Recognition training dataset"
)

## Remove columns that contain missing values
# Tabulate how many columns have each per-column NA count.
table(colSums(is.na(training)))
## 
##     0 19216 
##    93    67
# Columns with more than one NA are dropped outright; any row that still has
# an NA afterwards is removed by na.omit().
miss <- colnames(training)[colSums(is.na(training)) > 1]
training <- training[, !colnames(training) %in% miss, drop = FALSE]
training <- na.omit(training)

## Remove the near zero variance predictors
nzv <- nearZeroVar(training)
# Guard against an empty result: training[, -integer(0)] would select zero
# columns and silently wipe the whole data frame.
if (length(nzv) > 0) {
  training <- training[, -nzv, drop = FALSE]
}

# Drop the first six remaining columns by position.
# NOTE(review): presumably row id / user / timestamp / window bookkeeping
# columns -- confirm against the CSV header, since positional removal depends
# on which columns the two filters above removed.
training <- training[, -(1:6), drop = FALSE]
str(training)
## 'data.frame':    19622 obs. of  53 variables:
##  $ roll_belt           : num  1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
##  $ pitch_belt          : num  8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
##  $ yaw_belt            : num  -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
##  $ total_accel_belt    : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ gyros_belt_x        : num  0 0.02 0 0.02 0.02 0.02 0.02 0.02 0.02 0.03 ...
##  $ gyros_belt_y        : num  0 0 0 0 0.02 0 0 0 0 0 ...
##  $ gyros_belt_z        : num  -0.02 -0.02 -0.02 -0.03 -0.02 -0.02 -0.02 -0.02 -0.02 0 ...
##  $ accel_belt_x        : int  -21 -22 -20 -22 -21 -21 -22 -22 -20 -21 ...
##  $ accel_belt_y        : int  4 4 5 3 2 4 3 4 2 4 ...
##  $ accel_belt_z        : int  22 22 23 21 24 21 21 21 24 22 ...
##  $ magnet_belt_x       : int  -3 -7 -2 -6 -6 0 -4 -2 1 -3 ...
##  $ magnet_belt_y       : int  599 608 600 604 600 603 599 603 602 609 ...
##  $ magnet_belt_z       : int  -313 -311 -305 -310 -302 -312 -311 -313 -312 -308 ...
##  $ roll_arm            : num  -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 ...
##  $ pitch_arm           : num  22.5 22.5 22.5 22.1 22.1 22 21.9 21.8 21.7 21.6 ...
##  $ yaw_arm             : num  -161 -161 -161 -161 -161 -161 -161 -161 -161 -161 ...
##  $ total_accel_arm     : int  34 34 34 34 34 34 34 34 34 34 ...
##  $ gyros_arm_x         : num  0 0.02 0.02 0.02 0 0.02 0 0.02 0.02 0.02 ...
##  $ gyros_arm_y         : num  0 -0.02 -0.02 -0.03 -0.03 -0.03 -0.03 -0.02 -0.03 -0.03 ...
##  $ gyros_arm_z         : num  -0.02 -0.02 -0.02 0.02 0 0 0 0 -0.02 -0.02 ...
##  $ accel_arm_x         : int  -288 -290 -289 -289 -289 -289 -289 -289 -288 -288 ...
##  $ accel_arm_y         : int  109 110 110 111 111 111 111 111 109 110 ...
##  $ accel_arm_z         : int  -123 -125 -126 -123 -123 -122 -125 -124 -122 -124 ...
##  $ magnet_arm_x        : int  -368 -369 -368 -372 -374 -369 -373 -372 -369 -376 ...
##  $ magnet_arm_y        : int  337 337 344 344 337 342 336 338 341 334 ...
##  $ magnet_arm_z        : int  516 513 513 512 506 513 509 510 518 516 ...
##  $ roll_dumbbell       : num  13.1 13.1 12.9 13.4 13.4 ...
##  $ pitch_dumbbell      : num  -70.5 -70.6 -70.3 -70.4 -70.4 ...
##  $ yaw_dumbbell        : num  -84.9 -84.7 -85.1 -84.9 -84.9 ...
##  $ total_accel_dumbbell: int  37 37 37 37 37 37 37 37 37 37 ...
##  $ gyros_dumbbell_x    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ gyros_dumbbell_y    : num  -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 ...
##  $ gyros_dumbbell_z    : num  0 0 0 -0.02 0 0 0 0 0 0 ...
##  $ accel_dumbbell_x    : int  -234 -233 -232 -232 -233 -234 -232 -234 -232 -235 ...
##  $ accel_dumbbell_y    : int  47 47 46 48 48 48 47 46 47 48 ...
##  $ accel_dumbbell_z    : int  -271 -269 -270 -269 -270 -269 -270 -272 -269 -270 ...
##  $ magnet_dumbbell_x   : int  -559 -555 -561 -552 -554 -558 -551 -555 -549 -558 ...
##  $ magnet_dumbbell_y   : int  293 296 298 303 292 294 295 300 292 291 ...
##  $ magnet_dumbbell_z   : num  -65 -64 -63 -60 -68 -66 -70 -74 -65 -69 ...
##  $ roll_forearm        : num  28.4 28.3 28.3 28.1 28 27.9 27.9 27.8 27.7 27.7 ...
##  $ pitch_forearm       : num  -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.8 -63.8 -63.8 ...
##  $ yaw_forearm         : num  -153 -153 -152 -152 -152 -152 -152 -152 -152 -152 ...
##  $ total_accel_forearm : int  36 36 36 36 36 36 36 36 36 36 ...
##  $ gyros_forearm_x     : num  0.03 0.02 0.03 0.02 0.02 0.02 0.02 0.02 0.03 0.02 ...
##  $ gyros_forearm_y     : num  0 0 -0.02 -0.02 0 -0.02 0 -0.02 0 0 ...
##  $ gyros_forearm_z     : num  -0.02 -0.02 0 0 -0.02 -0.03 -0.02 0 -0.02 -0.02 ...
##  $ accel_forearm_x     : int  192 192 196 189 189 193 195 193 193 190 ...
##  $ accel_forearm_y     : int  203 203 204 206 206 203 205 205 204 205 ...
##  $ accel_forearm_z     : int  -215 -216 -213 -214 -214 -215 -215 -213 -214 -215 ...
##  $ magnet_forearm_x    : int  -17 -18 -18 -16 -17 -9 -18 -9 -16 -22 ...
##  $ magnet_forearm_y    : num  654 661 658 658 655 660 659 660 653 656 ...
##  $ magnet_forearm_z    : num  476 473 469 469 473 478 470 474 476 473 ...
##  $ classe              : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...

Model training

# Split the cleaned data 70/30 on the outcome: training1 is used to fit the
# models, testing1 is held out to estimate out-of-sample accuracy.
# `list = FALSE` (spelled out; `F` is an ordinary variable that can be
# reassigned) makes createDataPartition return row indices usable for
# subsetting.
inTrain <- createDataPartition(y = training$classe, p = 0.7, list = FALSE)
training1 <- training[inTrain, ]
testing1 <- training[-inTrain, ]
nrow(training1)
## [1] 13737
nrow(testing1)
## [1] 5885
# Stochastic gradient boosting (caret method "gbm") fitted on training1.
# No trainControl object is passed to train(), so caret uses its default
# resampling; the printed model reports 25 bootstrap reps, not 3-fold CV.
# To train with 3-fold cross-validation instead, add
#   trControl = trainControl(method = "cv", number = 3)
# to the train() call below.
#
# The fitted model is cached in "gbm.RDS". It is re-fitted and saved only when
# the cache file is missing, so the script also works on a fresh checkout.
if (file.exists("gbm.RDS")) {
  modelFit <- readRDS("gbm.RDS")
} else {
  modelFit <- train(classe ~ ., method = "gbm", data = training1,
                    verbose = FALSE, tuneLength = 5)
  saveRDS(modelFit, "gbm.RDS")
}

# Unscaled variable importance of the final boosted model.
varImp(modelFit$finalModel, scale = FALSE)
##                          Overall
## roll_belt            3004.093477
## pitch_belt            679.993968
## yaw_belt             1648.889106
## total_accel_belt       20.856205
## gyros_belt_x            5.491205
## gyros_belt_y           86.565344
## gyros_belt_z          434.596881
## accel_belt_x            6.604805
## accel_belt_y           42.358991
## accel_belt_z          191.374454
## magnet_belt_x         197.878859
## magnet_belt_y         210.818436
## magnet_belt_z         623.077487
## roll_arm              117.009951
## pitch_arm              20.422464
## yaw_arm               333.345001
## total_accel_arm        33.315643
## gyros_arm_x            39.760450
## gyros_arm_y           122.725285
## gyros_arm_z             0.000000
## accel_arm_x            76.385023
## accel_arm_y            52.890282
## accel_arm_z            22.002730
## magnet_arm_x           72.423170
## magnet_arm_y          159.564827
## magnet_arm_z          185.908975
## roll_dumbbell         456.561032
## pitch_dumbbell         30.514983
## yaw_dumbbell           99.766805
## total_accel_dumbbell  112.713723
## gyros_dumbbell_x       80.708692
## gyros_dumbbell_y      278.494692
## gyros_dumbbell_z       20.389388
## accel_dumbbell_x      217.137012
## accel_dumbbell_y      294.569673
## accel_dumbbell_z      341.248980
## magnet_dumbbell_x     252.266311
## magnet_dumbbell_y    1013.621658
## magnet_dumbbell_z    1106.217746
## roll_forearm          807.672343
## pitch_forearm        1577.666871
## yaw_forearm            17.568953
## total_accel_forearm    41.468195
## gyros_forearm_x         9.576312
## gyros_forearm_y        50.887809
## gyros_forearm_z        29.283814
## accel_forearm_x       311.183222
## accel_forearm_y        74.968003
## accel_forearm_z       369.010177
## magnet_forearm_x      145.863822
## magnet_forearm_y       60.842073
## magnet_forearm_z      358.652175
# Show the resampling results for every tuning-parameter combination.
print(x = modelFit)
## Stochastic Gradient Boosting 
## 
## 13737 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 13737, 13737, 13737, 13737, 13737, 13737, ... 
## 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa      Accuracy SD
##   1                   50      0.7516434  0.6849270  0.009168248
##   1                  100      0.8196163  0.7716189  0.005586482
##   1                  150      0.8506829  0.8109837  0.005845992
##   1                  200      0.8704462  0.8360045  0.005620265
##   1                  250      0.8839624  0.8530955  0.004864760
##   2                   50      0.8537421  0.8146310  0.005198881
##   2                  100      0.9034158  0.8777253  0.004886976
##   2                  150      0.9282319  0.9091575  0.004043820
##   2                  200      0.9443170  0.9295246  0.003364400
##   2                  250      0.9556296  0.9438435  0.002573835
##   3                   50      0.8931105  0.8646357  0.006337308
##   3                  100      0.9385834  0.9222610  0.004063111
##   3                  150      0.9585646  0.9475615  0.002034929
##   3                  200      0.9682432  0.9598137  0.002076093
##   3                  250      0.9744680  0.9676934  0.002215972
##   4                   50      0.9193958  0.8979421  0.004746302
##   4                  100      0.9566473  0.9451295  0.003316027
##   4                  150      0.9704781  0.9626419  0.002475504
##   4                  200      0.9775544  0.9715989  0.002262608
##   4                  250      0.9817048  0.9768510  0.002065833
##   5                   50      0.9364768  0.9195725  0.003606252
##   5                  100      0.9665538  0.9576745  0.002197911
##   5                  150      0.9767470  0.9705757  0.001904366
##   5                  200      0.9820477  0.9772850  0.001886927
##   5                  250      0.9846874  0.9806250  0.001495849
##   Kappa SD   
##   0.011676189
##   0.007064147
##   0.007375821
##   0.007091741
##   0.006135850
##   0.006560892
##   0.006174827
##   0.005120780
##   0.004246717
##   0.003249857
##   0.008019292
##   0.005130304
##   0.002559469
##   0.002613941
##   0.002791496
##   0.005992431
##   0.004182738
##   0.003126037
##   0.002854093
##   0.002609490
##   0.004569988
##   0.002754739
##   0.002403089
##   0.002377884
##   0.001890803
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were n.trees = 250,
##  interaction.depth = 5, shrinkage = 0.1 and n.minobsinnode = 10.
# Plot resampled accuracy across the tuning grid.
plot(x = modelFit, metric = "Accuracy")

Decision tree trial using Recursive Partitioning and Regression Trees (rpart)

library(rpart)      # Recursive Partitioning and Regression Trees
library(rpart.plot) # plotting of rpart decision trees

# Fit a single classification tree on the training partition.
treeModel <- rpart(formula = classe ~ ., data = training1, method = "class")

# Predicted class labels for the held-out partition.
treePrediction <- predict(
  object = treeModel,
  newdata = testing1,
  type = "class"
)

# Draw the fitted tree.
rpart.plot(
  treeModel,
  main = "Classification Tree",
  extra = 102,
  under = TRUE,
  faclen = 0
)

# Compare the tree's predictions with the true labels of the held-out rows.
confusionMatrix(data = treePrediction, reference = testing1$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1498  196   69  106   25
##          B   42  669   85   86   92
##          C   43  136  739  129  131
##          D   33   85   98  553   44
##          E   58   53   35   90  790
## 
## Overall Statistics
##                                           
##                Accuracy : 0.722           
##                  95% CI : (0.7104, 0.7334)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6467          
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.8949   0.5874   0.7203  0.57365   0.7301
## Specificity            0.9060   0.9357   0.9097  0.94717   0.9509
## Pos Pred Value         0.7909   0.6869   0.6273  0.68020   0.7700
## Neg Pred Value         0.9559   0.9043   0.9390  0.91897   0.9399
## Prevalence             0.2845   0.1935   0.1743  0.16381   0.1839
## Detection Rate         0.2545   0.1137   0.1256  0.09397   0.1342
## Detection Prevalence   0.3218   0.1655   0.2002  0.13815   0.1743
## Balanced Accuracy      0.9004   0.7615   0.8150  0.76041   0.8405

Predicting the outcome using Stochastic Gradient Boosting

# Score the held-out partition with the boosted model and compare the
# predictions with the true labels.
predictions <- predict(object = modelFit, newdata = testing1)
confusionMatrix(data = predictions, reference = testing1$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1670   10    0    0    0
##          B    4 1119   17    0    1
##          C    0   10 1005   18    1
##          D    0    0    4  944    5
##          E    0    0    0    2 1075
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9878          
##                  95% CI : (0.9846, 0.9904)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9845          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9976   0.9824   0.9795   0.9793   0.9935
## Specificity            0.9976   0.9954   0.9940   0.9982   0.9996
## Pos Pred Value         0.9940   0.9807   0.9720   0.9906   0.9981
## Neg Pred Value         0.9990   0.9958   0.9957   0.9959   0.9985
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2838   0.1901   0.1708   0.1604   0.1827
## Detection Prevalence   0.2855   0.1939   0.1757   0.1619   0.1830
## Balanced Accuracy      0.9976   0.9889   0.9868   0.9887   0.9966
# Apply the boosted model to the separate test file and show the predicted
# classes.
predictfinal <- predict(object = modelFit, newdata = test)
predictfinal
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E