Preliminary work

Background of the project

Using devices such as Jawbone Up, Nike FuelBand, and Fitbit, it is now possible to collect a large amount of data about personal activity relatively inexpensively. These types of devices are part of the quantified self movement – a group of enthusiasts who take measurements about themselves regularly to improve their health, to find patterns in their behavior, or because they are tech geeks. One thing that people regularly do is quantify how much of a particular activity they do, but they rarely quantify how well they do it. In this project, the goal is to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants, who were asked to perform barbell lifts correctly and incorrectly in 5 different ways. More information is available from the website here: http://groupware.les.inf.puc-rio.br/har (see the section on the Weight Lifting Exercise Dataset).

Data

The training data for this project are available here: https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv

The test data are available here: https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv

The data for this project come from this source: http://groupware.les.inf.puc-rio.br/har. If you use the document you create for this class for any purpose, please cite them, as they have been very generous in allowing their data to be used for this kind of assignment.

Required submissions

The goal of your project is to predict the manner in which the participants did the exercise. This is the “classe” variable in the training set. You may use any of the other variables to predict with. You should create a report describing how you built your model, how you used cross validation, what you think the expected out-of-sample error is, and why you made the choices you did. You will also use your prediction model to predict 20 different test cases.

Analysis

Preparing the work environment

First, we load the required libraries. (In the R Markdown source, the chunk option message=FALSE can be used to suppress the startup messages shown below.)

library(RCurl)
## Loading required package: bitops
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
library(rpart)
library(rpart.plot)
library(reshape2)

Download data

download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
              destfile = "pml-training.csv", method = "curl")
training <- read.csv("pml-training.csv", na.strings = c("NA", "#DIV/0!", ""))
download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
              destfile = "pml-testing.csv", method = "curl")
testing <- read.csv("pml-testing.csv", na.strings = c("NA", "#DIV/0!", ""))
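
Note that method = "curl" requires the curl binary, which may not be available on all platforms. A more portable variant (a sketch, not the code used above) skips the download when the file is already present and lets R choose a suitable method:

# Portable alternative (a sketch): skip the download if the file already
# exists and let R pick a default download method
if (!file.exists("pml-training.csv")) {
  download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
                destfile = "pml-training.csv")
}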

Exploratory analysis and data cleaning

We begin by exploring the structure and contents of the training data (output omitted for brevity):

summary(training)
str(training)

Exploring the data reveals three main things:

1. The dataset contains many variables that are mostly NAs.
2. The dataset contains experiment-related variables, such as the row id and timestamps, which are not useful for prediction.
3. The dataset contains variables with near-zero variance, which are unlikely to be informative given the large number of available variables.
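
As a quick check of the first point, the fraction of missing values per column can be computed directly (a minimal sketch, not part of the original analysis):

# Fraction of missing values per column; the summary shows that many
# columns are almost entirely NA
na_frac <- colMeans(is.na(training))
summary(na_frac)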

Thus, I choose to remove all of the variables mentioned above: first the experiment-related columns, then the mostly-NA columns, and finally the near-zero-variance columns.

# Remove the first 7 variables, which hold experiment metadata
# (row id, user name, timestamps, window indicators)
training <- training[, -c(1:7)]

# Remove variables that contain NA or empty values
variablesWithNA <- sapply(training, function(x) any(is.na(x) | x == ""))
training <- training[, names(variablesWithNA[!variablesWithNA])]

# Remove near-zero-variance (NZV) variables
near.zero.vars <- nearZeroVar(training, saveMetrics = TRUE)
training <- training[, !near.zero.vars$nzv]

# Selected variables
names(training)
##  [1] "roll_belt"            "pitch_belt"           "yaw_belt"            
##  [4] "total_accel_belt"     "gyros_belt_x"         "gyros_belt_y"        
##  [7] "gyros_belt_z"         "accel_belt_x"         "accel_belt_y"        
## [10] "accel_belt_z"         "magnet_belt_x"        "magnet_belt_y"       
## [13] "magnet_belt_z"        "roll_arm"             "pitch_arm"           
## [16] "yaw_arm"              "total_accel_arm"      "gyros_arm_x"         
## [19] "gyros_arm_y"          "gyros_arm_z"          "accel_arm_x"         
## [22] "accel_arm_y"          "accel_arm_z"          "magnet_arm_x"        
## [25] "magnet_arm_y"         "magnet_arm_z"         "roll_dumbbell"       
## [28] "pitch_dumbbell"       "yaw_dumbbell"         "total_accel_dumbbell"
## [31] "gyros_dumbbell_x"     "gyros_dumbbell_y"     "gyros_dumbbell_z"    
## [34] "accel_dumbbell_x"     "accel_dumbbell_y"     "accel_dumbbell_z"    
## [37] "magnet_dumbbell_x"    "magnet_dumbbell_y"    "magnet_dumbbell_z"   
## [40] "roll_forearm"         "pitch_forearm"        "yaw_forearm"         
## [43] "total_accel_forearm"  "gyros_forearm_x"      "gyros_forearm_y"     
## [46] "gyros_forearm_z"      "accel_forearm_x"      "accel_forearm_y"     
## [49] "accel_forearm_z"      "magnet_forearm_x"     "magnet_forearm_y"    
## [52] "magnet_forearm_z"     "classe"
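A quick sanity check (a sketch) confirms that the cleaning left no missing values and that 53 columns remain (52 predictors plus classe):

# Sanity check: no NAs remain; 53 columns = 52 predictors + classe
stopifnot(sum(is.na(training)) == 0)
dim(training)
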
# Visualize violin plots of each predictor per classe type
variables <- names(training)[!names(training) %in% "classe"]
v_per_plot <- 14

for (i in 1:ceiling(length(variables) / v_per_plot)) {
  first <- 1 + (i - 1) * v_per_plot
  last <- min(i * v_per_plot, length(variables))
  vars_to_plot <- c(variables[first:last], "classe")

  # Melt to long format so each predictor gets its own facet
  p <- ggplot(melt(training[, vars_to_plot], id.vars = "classe"),
              aes(y = value, x = classe, color = classe, fill = classe)) +
    geom_violin() +
    facet_wrap(~ variable, ncol = 5)
  print(p)
}

Model

set.seed(591852)
# Ensure the outcome is a factor so caret treats this as classification
training$classe <- factor(training$classe)
# Split into a training set (60%) and a validation set (40%)
inTrain <- createDataPartition(training$classe, p = 0.6)
train_set <- training[inTrain[[1]],]
validate_set <- training[-inTrain[[1]],]
library(parallel)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)

ctrl <- trainControl(classProbs = TRUE, savePredictions = TRUE, allowParallel = TRUE)
# Fit a random forest, passing the control object via trControl so it takes effect
mod_rf <- train(classe ~ ., method = "rf", data = train_set, trControl = ctrl)
## Loading required namespace: e1071
stopCluster(cl)
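
Note that, as configured, caret tunes the model with its default resampling scheme (the bootstrap). If explicit k-fold cross-validation is preferred, trainControl supports it directly; the sketch below (not run here, with 5 folds as an arbitrary choice) shows how:

# Sketch: explicit 5-fold cross-validation instead of the default bootstrap
ctrl_cv <- trainControl(method = "cv", number = 5, classProbs = TRUE,
                        savePredictions = TRUE, allowParallel = TRUE)
# mod_rf_cv <- train(classe ~ ., method = "rf", data = train_set,
#                    trControl = ctrl_cv)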

Validating the model

# Test on the training set: 100% accuracy, as expected, since a random
# forest typically fits its own training data perfectly
confusionMatrix(predict(mod_rf, train_set), train_set$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 3348    0    0    0    0
##          B    0 2279    0    0    0
##          C    0    0 2054    0    0
##          D    0    0    0 1930    0
##          E    0    0    0    0 2165
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9997, 1)
##     No Information Rate : 0.2843     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   1.0000   1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Prevalence             0.2843   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2843   0.1935   0.1744   0.1639   0.1838
## Detection Prevalence   0.2843   0.1935   0.1744   0.1639   0.1838
## Balanced Accuracy      1.0000   1.0000   1.0000   1.0000   1.0000
# Test on the validation set: 99.24% accuracy
confusionMatrix(predict(mod_rf, validate_set), validate_set$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 2231    8    0    0    0
##          B    1 1497    6    0    0
##          C    0   11 1357   18    3
##          D    0    0    5 1266    4
##          E    0    2    0    2 1435
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9924          
##                  95% CI : (0.9902, 0.9942)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9903          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9996   0.9862   0.9920   0.9844   0.9951
## Specificity            0.9986   0.9989   0.9951   0.9986   0.9994
## Pos Pred Value         0.9964   0.9953   0.9770   0.9929   0.9972
## Neg Pred Value         0.9998   0.9967   0.9983   0.9970   0.9989
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2843   0.1908   0.1730   0.1614   0.1829
## Detection Prevalence   0.2854   0.1917   0.1770   0.1625   0.1834
## Balanced Accuracy      0.9991   0.9925   0.9935   0.9915   0.9973
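
From the validation accuracy above, the expected out-of-sample error can be estimated directly (a minimal sketch):

# Expected out-of-sample error = 1 - validation accuracy (roughly 0.0076)
cm_val <- confusionMatrix(predict(mod_rf, validate_set), validate_set$classe)
1 - as.numeric(cm_val$overall["Accuracy"])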

Prediction of testing set and final model

# Keep only the columns that were used as predictors in training
test_set <- testing[, names(testing) %in% names(training)]
# Predict classe for the 20 test cases of the testing dataset
data.frame(testing[,1:6], classe_predicted = predict(mod_rf, test_set))
##     X user_name raw_timestamp_part_1 raw_timestamp_part_2   cvtd_timestamp
## 1   1     pedro           1323095002               868349 05/12/2011 14:23
## 2   2    jeremy           1322673067               778725 30/11/2011 17:11
## 3   3    jeremy           1322673075               342967 30/11/2011 17:11
## 4   4    adelmo           1322832789               560311 02/12/2011 13:33
## 5   5    eurico           1322489635               814776 28/11/2011 14:13
## 6   6    jeremy           1322673149               510661 30/11/2011 17:12
## 7   7    jeremy           1322673128               766645 30/11/2011 17:12
## 8   8    jeremy           1322673076                54671 30/11/2011 17:11
## 9   9  carlitos           1323084240               916313 05/12/2011 11:24
## 10 10   charles           1322837822               384285 02/12/2011 14:57
## 11 11  carlitos           1323084277                36553 05/12/2011 11:24
## 12 12    jeremy           1322673101               442731 30/11/2011 17:11
## 13 13    eurico           1322489661               298656 28/11/2011 14:14
## 14 14    jeremy           1322673043               178652 30/11/2011 17:10
## 15 15    jeremy           1322673156               550750 30/11/2011 17:12
## 16 16    eurico           1322489713               706637 28/11/2011 14:15
## 17 17     pedro           1323094971               920315 05/12/2011 14:22
## 18 18  carlitos           1323084285               176314 05/12/2011 11:24
## 19 19     pedro           1323094999               828379 05/12/2011 14:23
## 20 20    eurico           1322489658               106658 28/11/2011 14:14
##    new_window classe_predicted
## 1          no                B
## 2          no                A
## 3          no                B
## 4          no                A
## 5          no                A
## 6          no                E
## 7          no                D
## 8          no                B
## 9          no                A
## 10         no                A
## 11         no                B
## 12         no                C
## 13         no                B
## 14         no                A
## 15         no                E
## 16         no                E
## 17         no                A
## 18         no                B
## 19         no                B
## 20         no                B
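As an optional sanity check (a sketch), we can verify that the cleaned test set contains exactly the predictor columns used in training:

# Verify the test-set columns match the training predictors
stopifnot(setequal(names(test_set), setdiff(names(training), "classe")))
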
# Importance of the variables
varImp(mod_rf)
## rf variable importance
## 
##   only 20 most important variables shown (out of 52)
## 
##                      Overall
## roll_belt             100.00
## pitch_forearm          63.32
## yaw_belt               55.84
## magnet_dumbbell_z      44.89
## pitch_belt             44.53
## magnet_dumbbell_y      43.47
## roll_forearm           41.02
## accel_dumbbell_y       21.05
## roll_dumbbell          19.84
## magnet_dumbbell_x      18.11
## accel_forearm_x        16.67
## magnet_belt_z          15.34
## accel_belt_z           14.26
## total_accel_dumbbell   14.17
## accel_dumbbell_z       14.09
## magnet_belt_y          13.28
## magnet_forearm_z       13.02
## gyros_belt_z           12.53
## yaw_arm                10.79
## magnet_belt_x          10.08
# Resampled accuracy vs. the tuning parameter (y-axis on a log scale)
plot(mod_rf, log="y")
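
The plot shows resampled accuracy against the number of randomly selected predictors (mtry); the value caret selected can also be inspected directly:

# Tuning parameter (mtry) chosen by resampling
mod_rf$bestTune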

Conclusions

  • I used Random Forest because it is easy to use, performs well with almost no additional configuration, and is quite robust.
  • The train function in the caret package automatically performs resampling (bootstrap by default) while fitting the model, so the default parameters suffice and no separate cross-validation loop is needed.
  • The model correctly predicted all 20 test cases of the testing set (100% accuracy).