Load initial Libraries

library(caret)

## Loading required package: lattice
## Loading required package: ggplot2

library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)

## Rattle: A free graphical interface for data mining with R.
## Version 3.4.1 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

set.seed(2048)
filesDirectory="./Data"
options(warn=-1)

Get the data

if (!file.exists(filesDirectory)) {
    dir.create(filesDirectory)
}
  trainUrl <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
  testUrl <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
  trainFile <- "train.csv"
  testFile <- "test.csv"
  trainFilePath <- paste(filesDirectory, trainFile, sep = "/")
  testFilePath <- paste(filesDirectory, testFile, sep = "/")
# Download Data
  download.file(trainUrl, destfile = trainFilePath, method="curl")
  download.file(testUrl, destfile = testFilePath, method="curl")
# Load Data
  training <- read.csv(trainFilePath, na.strings=c("NA","#DIV/0!",""))
  testing <- read.csv(testFilePath, na.strings=c("NA","#DIV/0!",""))

Partition Data

inTrain <- createDataPartition(y=training$classe, p=0.6, list=FALSE)
myTraining <- training[inTrain, ]; myTesting <- training[-inTrain, ]
dim(myTraining); dim(myTesting)

## [1] 11776   160

## [1] 7846  160

Clean Data, remove near zero covariates and missing values

# remove zero values
nsv <- nearZeroVar(training, saveMetrics = T)
training <- training[, !nsv$nzv]

# remove missing values
nav <- sapply(colnames(training), function(x) if(sum(is.na(training[, x])) > 0.8*nrow(training)){return(T)}else{return(F)})
training <- training[, !nav]

Calculate Correlations

cor <- abs(sapply(colnames(training[, -ncol(training)]), function(x) cor(as.numeric(training[, x]), as.numeric(training$classe), method = "spearman")))
head(cor)

##                    X            user_name raw_timestamp_part_1 
##          0.976646598          0.012000639          0.201244517 
## raw_timestamp_part_2       cvtd_timestamp           num_window 
##          0.014748691          0.131346255          0.005798365

plot(training[, names(which.max(cor))], training[, names(which.max(cor[-which.max(cor)]))], col = training$classe, pch = 19, cex = 0.1, xlab = names(which.max(cor)), ylab = names(which.max(cor[-which.max(cor)])))

First step, build a machine learning algorithm to predict activity quality from activity monitors

Test using Boosting method, then compare results with Random Trees ## Boosting Basic ideas are: 1. Take lots of (possibly) weak predictors 2. Weight them and add them up 3. Get a stronger predictor

To begin our boosting observations we need to:

Start with a set of classifiers \(h_1,\ldots,h_k\)

Examples: All possible trees, all possible regression models, all possible cutoffs.

Create a classifier that combines classification functions: \(f(x) = \rm{sgn}\left(\sum_{t=1}^T \alpha_t h_t(x)\right)\).

Goal is to minimize error (on training set)
Iterative, select one \(h\) at each step
Calculate weights based on errors
Upweight missed classifications and select next \(h\)
gbm - boosting with trees.

# set the seet
set.seed(123)

# train the data using gbm method
boostFit <- train(classe ~ ., method = "gbm", data = training, verbose = F, trControl = trainControl(method = "cv", number = 10))

## Loading required package: gbm
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1
## Loading required package: plyr

# display the results from our model
boostFit

## Stochastic Gradient Boosting 
## 
## 19622 samples
##    58 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 17660, 17661, 17659, 17660, 17658, 17660, ... 
## 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa      Accuracy SD 
##   1                   50      0.9998981  0.9998711  0.0002147923
##   1                  100      0.9998471  0.9998067  0.0002461172
##   1                  150      0.9998471  0.9998067  0.0002461172
##   2                   50      0.9998471  0.9998067  0.0002461172
##   2                  100      0.9998471  0.9998067  0.0002461172
##   2                  150      0.9998471  0.9998067  0.0002461172
##   3                   50      0.9998471  0.9998067  0.0002461172
##   3                  100      0.9998471  0.9998067  0.0002461172
##   3                  150      0.9998471  0.9998067  0.0002461172
##   Kappa SD   
##   0.000271656
##   0.000311286
##   0.000311286
##   0.000311286
##   0.000311286
##   0.000311286
##   0.000311286
##   0.000311286
##   0.000311286
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were n.trees = 50, interaction.depth
##  = 1 and shrinkage = 0.1.

# plot the results
plot(boostFit, ylim = c(0.9, 1))

Cross Validation using Random forests

Basic concepts of Random Forest are:

Bootstrap samples
At each split, bootstrap variables
Grow multiple trees and vote Pros:
Accuracy Cons:
Speed
Interpretability
Overfitting

# set the seed
set.seed(123)

# call caret model using random forest
rfFit <- train(classe ~ ., method = "rf", data = training, importance = T, trControl = trainControl(method = "cv", number = 10))

## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

# examine the results
rfFit

## Random Forest 
## 
## 19622 samples
##    58 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 17660, 17661, 17659, 17660, 17658, 17660, ... 
## 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa      Accuracy SD   Kappa SD    
##    2    0.9971974  0.9964551  0.0022688160  0.0028697342
##   41    0.9999490  0.9999355  0.0001612584  0.0002039895
##   80    0.9998470  0.9998065  0.0003441560  0.0004353440
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 41.

# plot random forest fit
plot(rfFit, ylim = c(0.9, 1))

# examine the importance of the fit
imp <- varImp(rfFit)$importance
imp$max <- apply(imp, 1, max)
imp <- imp[order(imp$max, decreasing = T), ]

Final Results

When we compare the analysis from Random Forest versus Boosting we observe that * The out of sample error observed in the training model was 0.04% and the prediction using the test data were the same

# Final Model
rfFit$finalModel

## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry, importance = ..1) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 41
## 
##         OOB estimate of  error rate: 0.01%
## Confusion matrix:
##      A    B    C    D    E  class.error
## A 5580    0    0    0    0 0.0000000000
## B    1 3796    0    0    0 0.0002633658
## C    0    1 3421    0    0 0.0002922268
## D    0    0    0 3216    0 0.0000000000
## E    0    0    0    0 3607 0.0000000000

# prediction
(prediction <- as.character(predict(rfFit, testing)))

##  [1] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [18] "A" "A" "A"

Complete the assignment by writing out the results to our prediction files

# write prediction files
pml_write_files = function(x){
  n = length(x)
  for(i in 1:n){
    filename = paste0("./prediction/problem_id_", i, ".txt")
    write.table(x[i], file = filename, quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(prediction)

Practical Machine Learning - Course Project

Al Shain