Load libraries

library(jpeg)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caTools)
## Warning: package 'caTools' was built under R version 3.6.1
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(adabag)
## Warning: package 'adabag' was built under R version 3.6.1
## Loading required package: rpart
## Loading required package: caret
## Warning: package 'caret' was built under R version 3.6.1
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: foreach
## Loading required package: doParallel
## Warning: package 'doParallel' was built under R version 3.6.1
## Loading required package: iterators
## Loading required package: parallel
library(caret)
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.1

Read the files

setwd("C:/Users/Javier/Documents/2-D Especializacion en Ciencia de Datos/8 Practical Machine Learning/Unidad 4/Proyecto")
training <- read.csv("train.csv")
testing <- read.csv("test.csv")

Split the datasets

Using the sample.split function from the caTools package and a seed of 1, the train dataset will be split into 80% for training and the remaining 20% for testing. The test dataset (with 20 rows) will be used for prediction and validation.

set.seed(1)
sample <- sample.split(training, SplitRatio = 0.80)
train <- subset(training, sample == TRUE)
test <- subset(training, sample == FALSE)
validate <- testing
c(dim(train), dim(test), dim(validate))
## [1] 15693   160  3929   160    20   160
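
For reference, an equivalent stratified 80/20 split could also be obtained with caret's createDataPartition. The sketch below is illustrative only (the names train_alt and test_alt are hypothetical); the split actually used above comes from caTools::sample.split.

# Alternative sketch, not the split used above: stratified partition on the outcome
inTrain <- createDataPartition(training$classe, p = 0.80, list = FALSE)
train_alt <- training[inTrain, ]
test_alt  <- training[-inTrain, ]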

Avoid multicollinearity

Even though no linear models will be used (Random Forest, Naive Bayes and AdaBoost), it is better to avoid independent variables that are highly correlated with each other, because they only add noise to the model.

Variables with high correlation were omitted from the models.

In addition, variables with many NA's were omitted.

attach(train)

temp <- as.data.frame(cbind(roll_belt, pitch_belt, yaw_belt, total_accel_belt, gyros_belt_x, gyros_belt_y, gyros_belt_z,
        accel_belt_x, accel_belt_y, accel_belt_z, magnet_belt_x, magnet_belt_y, magnet_belt_z,
        roll_arm, pitch_arm, yaw_arm, total_accel_arm, gyros_arm_x, gyros_arm_y, gyros_arm_z, accel_arm_x,
        accel_arm_y, accel_arm_z, magnet_arm_x, magnet_arm_y, magnet_arm_z, roll_dumbbell, pitch_dumbbell,
        yaw_dumbbell, total_accel_dumbbell, gyros_dumbbell_y, gyros_dumbbell_z, accel_dumbbell_x,
        accel_dumbbell_y, accel_dumbbell_z, magnet_dumbbell_x, magnet_dumbbell_y, magnet_dumbbell_z,
        roll_forearm, pitch_forearm, yaw_forearm, gyros_forearm_x, gyros_forearm_y, gyros_forearm_z,
        accel_forearm_x, accel_forearm_y, accel_forearm_z, magnet_forearm_x, magnet_forearm_y, magnet_forearm_z))

jj <- readJPEG("cor.jpg",native=TRUE)
plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
rasterImage(jj,0,0,1,1)
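
The correlation screening itself is shown only as a pre-computed image (cor.jpg). A minimal sketch of how it could be reproduced, assuming a cutoff of 0.90, uses cor() on the candidate predictors assembled above together with caret's findCorrelation:

# Sketch: pairwise correlations of the candidate predictors; flag highly
# correlated ones (the 0.90 cutoff is an assumed value)
corMat  <- cor(temp, use = "pairwise.complete.obs")
highCor <- findCorrelation(corMat, cutoff = 0.90)
colnames(temp)[highCor]   # candidate variables to drop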

Random Forest

Random Forest is a bagging-type classifier. It uses multiple decision trees to build a stronger model in which the winning class is the one that receives the most votes.

The variables with the least importance (mean Gini decrease) were omitted.

RF <- randomForest(classe ~ 
      pitch_belt
    + gyros_belt_z
    + magnet_belt_y
    + roll_arm
    + pitch_arm
    + yaw_arm
    + gyros_arm_x
    + accel_arm_x
    + accel_arm_y
    + accel_arm_z
    + magnet_arm_x
    + magnet_arm_y
    + magnet_arm_z
    + roll_dumbbell
    + pitch_dumbbell
    + yaw_dumbbell
    + total_accel_dumbbell
    + accel_dumbbell_y
    + accel_dumbbell_z
    + magnet_dumbbell_y
    + magnet_dumbbell_z
    + roll_forearm
    + pitch_forearm
    + gyros_forearm_y
    + accel_forearm_x
    + accel_forearm_y
    + accel_forearm_z
    + magnet_forearm_x
    + magnet_forearm_y
    + magnet_forearm_z,
    data = train,
    ntree = 100,
    mtry = 1)

# importance(RF)

R <- as.character(as.factor(test$classe))
P_RF <- as.character(predict(RF, test, type = "class"))
tabla <- table(R,P_RF)
accuracy <- round(sum(diag(tabla)) / sum(tabla),4)
modelo <- "RandomForest"
error <- round( 1 - accuracy,4)
modeloRF <- cbind(modelo, accuracy, error)

#summary(RF)
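
The screening by mean Gini decrease mentioned above can be inspected through the commented importance(RF) call; a short sketch of how the predictors could be ranked:

# Rank predictors by MeanDecreaseGini (largest first)
imp <- importance(RF)
imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE]
# varImpPlot(RF) shows the same ranking graphically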

AdaBoost

AdaBoost is a boosting-type classifier. In each iteration, the observations that were misclassified receive a greater weight in the next iteration. The idea is that a panel of expert judges can make a more reliable decision than a single judge.

AdaBoost also provides an importance measure that allows evaluating the significance of each independent variable.

      Ada <-  boosting(classe ~ 
               pitch_belt
             + gyros_belt_z
             + magnet_belt_y
             + roll_arm
             + pitch_arm
             + yaw_arm
             + gyros_arm_x
             + accel_arm_x
             + accel_arm_y
             + accel_arm_z
             + magnet_arm_x
             + magnet_arm_y
             + magnet_arm_z
             + roll_dumbbell
             + pitch_dumbbell
             + yaw_dumbbell
             + total_accel_dumbbell
             + accel_dumbbell_y
             + accel_dumbbell_z
             + magnet_dumbbell_y
             + magnet_dumbbell_z
             + roll_forearm
             + pitch_forearm
             + gyros_forearm_y
             + accel_forearm_x
             + accel_forearm_y
             + accel_forearm_z
             + magnet_forearm_x
             + magnet_forearm_y
             + magnet_forearm_z,
             data = train,
             mfinal = 10,
             coeflearn = "Breiman")
        
      R <- as.character(as.factor(test$classe))
      P_ADA <- predict(Ada, test)$class
      tabla <- table(R,P_ADA)
      accuracy <- round(sum(diag(tabla)) / sum(tabla),4)
      modelo <- "AdaBoost"
      error <- 1 - accuracy
      modeloAda <- cbind(modelo, accuracy, error)
      
      #summary(Ada)
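
The fitted adabag object stores the relative importance of each predictor, which is the measure referred to above; a brief sketch of how it could be inspected:

# Relative variable importance stored by adabag::boosting
sort(Ada$importance, decreasing = TRUE)
# importanceplot(Ada) plots the same values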

Naive Bayes

Naive Bayes is a probabilistic classifier based on Bayes' theorem.
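
The naive assumption is that the predictors are conditionally independent given the class, so the predicted class is the one that maximizes:

P(classe) x P(x1 | classe) x P(x2 | classe) x ... x P(xk | classe)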

            NB <-  naiveBayes(classe ~ 
                          pitch_belt
                        + gyros_belt_z
                        + magnet_belt_y
                        + roll_arm
                        + pitch_arm
                        + yaw_arm
                        + magnet_arm_y
                        + magnet_arm_z
                        + roll_dumbbell
                        + pitch_dumbbell
                        + yaw_dumbbell
                        + total_accel_dumbbell
                        + accel_dumbbell_y
                        + accel_dumbbell_z
                        + magnet_dumbbell_y
                        + magnet_dumbbell_z
                        + roll_forearm
                        + pitch_forearm
                        + accel_forearm_x
                        + accel_forearm_y
                        + accel_forearm_z
                        + magnet_forearm_x
                        + magnet_forearm_y
                        + magnet_forearm_z,
                        data = train)
      
            R <- as.character(as.factor(test$classe))
            P_NB <- as.character(predict(NB, test))
            tabla <- table(R,P_NB)
            accuracy <- round(sum(diag(tabla)) / sum(tabla),4)
            modelo <- "Naive Bayes"
            error <- 1 - accuracy
            modeloNB <- cbind(modelo, accuracy, error)

#summary(NB)

Bagging

A Random Forest classifier will be used as a meta-learner to combine the predictions of the 3 previous classifiers in order to obtain a more robust and accurate model.

Bagging1 combines Random Forest + AdaBoost + Naive Bayes; Bagging2 combines Random Forest + AdaBoost.

Y <- test$classe
X1 <- P_RF
X2 <- P_ADA
X3 <- P_NB
temp <- as.data.frame(cbind(Y,X1,X2,X3))
BAG <- randomForest(Y ~ X1 + X2 + X3, data = temp, ntree = 100, mtry = 1)

R <- as.character(as.factor(test$classe))
P <- as.character(predict(BAG, temp, type = "class"))
tabla <- table(R,P)
accuracy <- round(sum(diag(tabla)) / sum(tabla),4)
modelo <- "Bagging1"
error <- 1 - accuracy
modeloBag <- cbind(modelo, accuracy, error)

Y <- test$classe
X1 <- P_RF
X2 <- P_ADA
temp <- as.data.frame(cbind(Y,X1,X2))
BAG2 <- randomForest(Y ~ X1 + X2, data = temp, ntree = 100, mtry = 1)

R <- as.character(as.factor(test$classe))
P <- as.character(predict(BAG2, temp, type = "class"))
tabla <- table(R,P)
accuracy <- round(sum(diag(tabla)) / sum(tabla),4)
modelo <- "Bagging2"
error <- 1 - accuracy
modeloBag2 <- cbind(modelo, accuracy, error)

Choosing the best model

Accuracy will be used to decide which model is best.

Accuracy can be understood as the number of cases out of every 100 that were correctly classified.

Accuracy is given by: (TP + TN) / (TP + TN + FP + FN)

The error is given by 1 - accuracy
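
The accuracies reported for each model were computed as the sum of the diagonal of the confusion matrix divided by its total. The same number, plus per-class sensitivity and specificity, could be obtained with caret's confusionMatrix; a sketch for the Random Forest predictions:

# Sketch: accuracy plus per-class statistics for the Random Forest model
cm <- confusionMatrix(factor(P_RF, levels = levels(test$classe)), test$classe)
cm$overall["Accuracy"]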

The winning classifier is Random Forest (it has the highest accuracy and the lowest error).

rbind(modeloRF, modeloAda, modeloNB, modeloBag, modeloBag2)
##      modelo         accuracy error   
## [1,] "RandomForest" "0.9814" "0.0186"
## [2,] "AdaBoost"     "0.8183" "0.1817"
## [3,] "Naive Bayes"  "0.5401" "0.4599"
## [4,] "Bagging1"     "0.9812" "0.0188"
## [5,] "Bagging2"     "0.9809" "0.0191"

Prediction

Finally, we are going to use our best model (Random Forest) to predict the class of the 20-row validation dataset.

predict(RF, validate, type = "class")
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E
table(predict(RF, validate, type = "class"))
## 
## A B C D E 
## 7 8 1 1 3