In this section the required libraries are loaded and the random seed is fixed for reproducibility.
library(RGtk2)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
library(rpart)
library(rpart.plot)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(corrplot)
library(ipred)
library(e1071)
library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
library(parallel)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
set.seed(333)
if(!file.exists("Prediction Assignment")){
  dir.create("Prediction Assignment")
}
fileUrl<-"https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
download.file(fileUrl,destfile="./Prediction Assignment/training.csv")
training<-read.csv("./Prediction Assignment/training.csv")
fileUrl<-"https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
download.file(fileUrl,destfile="./Prediction Assignment/testing.csv")
testing<-read.csv("./Prediction Assignment/testing.csv")
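As a quick sanity check, the dimensions of the raw data can be inspected before any cleaning (the sizes in the comments are what these files are expected to contain):
dim(training)   # 19622 observations of 160 variables
dim(testing)    # 20 observations of 160 variables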
The training set is partitioned into training_0 (80%) and testing_0 (20%) so that each model can be cross-validated on held-out data.
inTrain<-createDataPartition(y=training$classe,p=0.8,list=FALSE)
training_0<-training[inTrain,]
testing_0<-training[-inTrain,]
In this section I remove the first 7 columns (row index, user name, timestamps, and window indicators) and every column containing more than 50% NA values. This reduces the number of columns from 160 to 86.
training_0<-training_0[,-(1:7)]
testing_0<-testing_0[,-(1:7)]
testing<-testing[,-(1:7)]
l<-nrow(training_0)*0.5 # 50% threshold on the number of rows
delete<-which(colSums(is.na(training_0))>l) # columns that are mostly NA
training_0<-training_0[,-c(delete)]
testing_0<-testing_0[,-c(delete)]
testing<-testing[,-c(delete)]
In this section I remove all the columns that do not show significant variability, identified with the function “nearZeroVar”. This reduces the number of columns to 53. Rows with NA values are also removed.
nsv<-nearZeroVar(training_0,saveMetrics = TRUE)
nsv <- tibble::rownames_to_column(nsv, "Name")
nsv<-as.data.frame(nsv)
nsv<-filter(nsv,nzv==TRUE)
delete<-nsv$Name
delete<-delete[delete!= "classe"]
training_0<-training_0[ , -which(names(training_0) %in% delete)]
training_0<-training_0[complete.cases(training_0),]
testing_0<-testing_0[ , -which(names(testing_0) %in% delete)]
testing_0<-testing_0[complete.cases(testing_0),]
testing<-testing[ , -which(names(testing) %in% delete)]
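The removal above can also be written more compactly: called with the default saveMetrics=FALSE, nearZeroVar returns the offending column indices directly. A minimal equivalent sketch:
nzv_idx<-nearZeroVar(training_0)   # integer positions of near-zero-variance columns
if(length(nzv_idx)>0) training_0<-training_0[,-nzv_idx]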
The findCorrelation function is used next; it identifies columns that exhibit high pairwise correlations so they can be removed.
corrPlot <- cor(training_0[, -53])
corrplot(corrPlot, method="circle",tl.cex=0.7,type="upper")
The correlation plot shows the pairwise correlations among the remaining features; those higher than 0.70 are removed in the next lines. This reduces both the risk of overfitting the regression tree model and the computation time. The number of features is reduced to 31.
fc<-findCorrelation(corrPlot,cutoff=0.70)
fc<-sort(fc)
training_0<-training_0[,-c(fc)]
testing_0<-testing_0[,-c(fc)]
testing<-testing[,-c(fc)]
testing<-testing[,-32] # drop "problem_id", which takes the place of "classe" in the test set
modFit<-train(classe~.,method="rpart", data=training_0)
modFit
## CART
##
## 15699 samples
## 31 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 15699, 15699, 15699, 15699, 15699, 15699, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01794986 0.5509786 0.4261322
## 0.02136182 0.5083866 0.3636180
## 0.04205607 0.4318434 0.2421631
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01794986.
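Since rpart.plot and rattle are already loaded, the fitted classification tree can also be inspected visually:
fancyRpartPlot(modFit$finalModel)   # plot of the final rpart tree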
prediction<- predict(modFit, newdata=testing_0)
confusionMatrix(prediction, testing_0$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 934 277 287 186 143
## B 89 330 40 108 184
## C 26 122 334 104 117
## D 66 30 21 177 34
## E 1 0 2 68 243
##
## Overall Statistics
##
## Accuracy : 0.5144
## 95% CI : (0.4986, 0.5302)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.37
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8369 0.43478 0.48830 0.27527 0.33703
## Specificity 0.6819 0.86694 0.88608 0.95396 0.97783
## Pos Pred Value 0.5112 0.43941 0.47511 0.53963 0.77389
## Neg Pred Value 0.9132 0.86475 0.89130 0.87038 0.86755
## Prevalence 0.2845 0.19347 0.17436 0.16391 0.18379
## Detection Rate 0.2381 0.08412 0.08514 0.04512 0.06194
## Detection Prevalence 0.4657 0.19144 0.17920 0.08361 0.08004
## Balanced Accuracy 0.7594 0.65086 0.68719 0.61462 0.65743
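The expected out-of-sample error can be estimated directly from the held-out set as one minus the accuracy reported above:
cm<-confusionMatrix(prediction, testing_0$classe)
1-as.numeric(cm$overall["Accuracy"])   # estimated out-of-sample error, about 0.49 here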
Validation on the held-out set shows an overall accuracy of only 51.4%, which is very low. The sensitivity of the model is poor for most classes, while its specificity is better. A bagging model is therefore applied in the next section.
modFit2<-train(classe~.,method="treebag", data=training_0)
prediction2<- predict(modFit2, newdata=testing_0)
confusionMatrix(prediction2, testing_0$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1106 21 2 2 1
## B 6 726 14 0 2
## C 3 9 662 15 1
## D 1 2 6 623 1
## E 0 1 0 3 716
##
## Overall Statistics
##
## Accuracy : 0.9771
## 95% CI : (0.9719, 0.9815)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.971
## Mcnemar's Test P-Value : 0.03847
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9910 0.9565 0.9678 0.9689 0.9931
## Specificity 0.9907 0.9930 0.9914 0.9970 0.9988
## Pos Pred Value 0.9770 0.9706 0.9594 0.9842 0.9944
## Neg Pred Value 0.9964 0.9896 0.9932 0.9939 0.9984
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2819 0.1851 0.1687 0.1588 0.1825
## Detection Prevalence 0.2886 0.1907 0.1759 0.1614 0.1835
## Balanced Accuracy 0.9909 0.9748 0.9796 0.9829 0.9959
With this model, the accuracy is largely improved (97.7%). A boosting model is applied in the next section to try to improve the sensitivity further.
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)
fitControl <- trainControl(allowParallel = TRUE)
modFit3<-train(classe~.,method="gbm", data=training_0,verbose=FALSE,trControl =fitControl)
prediction3<- predict(modFit3, newdata=testing_0)
confusionMatrix(prediction3, testing_0$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1075 66 4 4 4
## B 13 643 39 8 26
## C 9 40 624 43 27
## D 17 7 15 576 18
## E 2 3 2 12 646
##
## Overall Statistics
##
## Accuracy : 0.9085
## 95% CI : (0.899, 0.9173)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8841
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9633 0.8472 0.9123 0.8958 0.8960
## Specificity 0.9722 0.9728 0.9633 0.9826 0.9941
## Pos Pred Value 0.9324 0.8820 0.8398 0.9100 0.9714
## Neg Pred Value 0.9852 0.9637 0.9811 0.9796 0.9770
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2740 0.1639 0.1591 0.1468 0.1647
## Detection Prevalence 0.2939 0.1858 0.1894 0.1614 0.1695
## Balanced Accuracy 0.9677 0.9100 0.9378 0.9392 0.9450
stopCluster(cluster)
registerDoSEQ()
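caret's plot method shows how the boosted model's resampled accuracy varies across the gbm tuning grid (number of trees and interaction depth), which helps diagnose the lower accuracy:
plot(modFit3)   # accuracy across the gbm tuning grid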
The accuracy of this last model (90.85%) is worse than that of the bagging model, as can be seen in the confusionMatrix output. A random forest model will be applied as a fourth attempt.
ptm <- proc.time()
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)
fitControl <- trainControl(allowParallel = TRUE)
modFit4<-train(classe~.,method="rf", data=training_0,prox=TRUE,trControl =fitControl)
proc.time() - ptm
## user system elapsed
## 255.016 11.276 8343.869
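The elapsed time above is roughly 2.3 hours. If the model ever needs to be refit, a common way to reduce this is to drop prox=TRUE (the proximity matrix is expensive and not used here) and to resample with k-fold cross-validation instead of the default 25 bootstrap repetitions. A hypothetical sketch (modFit4cv is not part of the original analysis):
fitControl<-trainControl(method="cv", number=5, allowParallel=TRUE)
modFit4cv<-train(classe~., method="rf", data=training_0, trControl=fitControl)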
prediction4<- predict(modFit4, newdata=testing_0)
confusionMatrix(prediction4, testing_0$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1113 7 0 0 0
## B 2 750 7 0 0
## C 0 2 672 12 1
## D 1 0 5 630 0
## E 0 0 0 1 720
##
## Overall Statistics
##
## Accuracy : 0.9903
## 95% CI : (0.9867, 0.9931)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9877
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9973 0.9881 0.9825 0.9798 0.9986
## Specificity 0.9975 0.9972 0.9954 0.9982 0.9997
## Pos Pred Value 0.9938 0.9881 0.9782 0.9906 0.9986
## Neg Pred Value 0.9989 0.9972 0.9963 0.9960 0.9997
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2837 0.1912 0.1713 0.1606 0.1835
## Detection Prevalence 0.2855 0.1935 0.1751 0.1621 0.1838
## Balanced Accuracy 0.9974 0.9926 0.9889 0.9890 0.9992
stopCluster(cluster)
registerDoSEQ()
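caret's varImp function shows which of the 31 remaining features drive the random forest's predictions:
varImp(modFit4)   # scaled variable importance for the final model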
The accuracy of the random forest model (99.03%) is the highest among the methods I tried, so I will use this model to make the predictions on the final testing set (the testing data frame).
prediction5<- predict(modFit4, newdata=testing)
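To prepare the 20 predictions for submission, one file per test case can be written out. A minimal sketch (the pml_write_files helper is an assumption, not part of the original analysis):
pml_write_files<-function(x){
  # write each predicted class to its own problem_id_N.txt file
  for(i in seq_along(x)){
    write.table(x[i], file=paste0("problem_id_", i, ".txt"),
                quote=FALSE, row.names=FALSE, col.names=FALSE)
  }
}
pml_write_files(prediction5)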