Introduction

This is a prediction model of the “classe” variable from the Weight Lifting Exercises Dataset gathered from: http://groupware.les.inf.puc-rio.br/har. The “classe” of an exercise will be predicted from various predictors in the training dataset. These predictors will be those variables whose names contain the words “belt”, “arm”, “forearm” or “dumbbell”.

Import data

# Both CSV files sit in the same local folder; build each path from it.
data_dir <- "C:/Users/Erika/Desktop/Data Science/Practical Machine Learning"
training <- read.csv(file.path(data_dir, "pml-training.csv"))
testing <- read.csv(file.path(data_dir, "pml-testing.csv"))

Import libraries

library(caret); library(ggplot2)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2

Preprocessing

I split the dataset into the readings from the arm, belt, forearm and dumbbell accelerometers, respectively, and then recombine them with the outcome.

# Assemble the modelling data: one group of columns per sensor location,
# selected by column name, plus the outcome `classe`.

# The leading underscore keeps names containing "forearm" out of this group.
arm_str <- grep("_arm", colnames(training), value = TRUE)
arm_data <- training[, arm_str, drop = FALSE]

belt_str <- grep("belt", colnames(training), value = TRUE)
belt_data <- training[, belt_str, drop = FALSE]

farm_str <- grep("fore", colnames(training), value = TRUE)
farm_data <- training[, farm_str, drop = FALSE]

bell_str <- grep("dumbbell", colnames(training), value = TRUE)
bell_data <- training[, bell_str, drop = FALSE]

classe_str <- grep("classe", colnames(training), value = TRUE)
classe_data <- training[, classe_str]

# Combine into master data. The outcome column is named directly in cbind()
# instead of being renamed afterwards by a hard-coded position, and it is
# forced to a factor: read.csv() returns character columns on R >= 4.0, while
# the classification steps downstream compare factor levels.
master_data <- cbind(
  arm_data, belt_data, farm_data, bell_data,
  classe = factor(classe_data)
)

# Remove columns in which more than 95% of the values are NA or "".
# Columns are inspected one at a time (no matrix coercion of the data frame),
# and na.rm = TRUE keeps a handful of NAs from turning the test itself into NA.
threshold <- nrow(master_data) * 0.95
is_sparse <- vapply(
  master_data,
  function(x) {
    sum(is.na(x)) > threshold || sum(x == "", na.rm = TRUE) > threshold
  },
  logical(1)
)
gCols <- !is_sparse

master_data <- master_data[, gCols, drop = FALSE]

# Drop the predictors that caret flags as near-zero variance.
bCols <- nearZeroVar(master_data, saveMetrics = TRUE)

master_data <- master_data[, !bCols$nzv, drop = FALSE]

Generating Models

Models will be generated using random forests and boosting, since both typically achieve high accuracy. They will then be stacked into a single combined prediction algorithm. For validation, the data were split 75%/25% into training and testing sets, respectively.

# Carve the cleaned data into a 75% fitting set and a 25% hold-out set.
# The seed makes the partition reproducible.
set.seed(14641)
inTrain <- createDataPartition(
  master_data$classe,
  p = 0.75,
  list = FALSE
)[, 1]
train2 <- master_data[inTrain, ]
test2 <- master_data[-inTrain, ]

# Build two different models.
# Model 1: random forest on every remaining predictor. No trControl or tuning
# grid is supplied, so caret's default resampling and tuning apply.
mod1 <- train(classe ~ ., data = train2, method = "rf")
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Model 2: boosted trees ("gbm") on the same predictors; verbose = FALSE is
# passed through to keep the fitting output quiet.
mod2 <- train(classe ~ ., data = train2, method = "gbm", verbose = FALSE)
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.2.5
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.2.5
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr

Calculating Predictions

# Predict on the hold-out set with each base model.
pred1 <- predict(mod1, newdata = test2)
pred2 <- predict(mod2, newdata = test2)

# Compare the two models' predictions, coloured by the true class.
# ggplot() + geom_point() draws the same scatter as the former qplot() call,
# which is deprecated in current ggplot2 releases. pred1 and pred2 are not
# columns of test2, so they are resolved from the calling environment.
ggplot(test2, aes(x = pred2, y = pred1, colour = classe)) +
  geom_point()

# Stack the two base models: a random forest learns the true class from the
# pair of base-model predictions.
# NOTE(review): the stacker is fitted on the hold-out predictions and then
# scored on those same rows, so its accuracy is not a truly out-of-sample
# figure -- a separate validation split would be needed for that.
predDF <- data.frame(pred2 = pred2, pred1 = pred1, classe = test2$classe)
combModFit <- train(classe ~ ., data = predDF, method = "rf")
combPred <- predict(combModFit, newdata = predDF)

Checking out-of-sample errors via confusion matrices

# Confusion matrices of each set of predictions against the hold-out labels:
# c1 = random forest, c2 = boosting, c3 = stacked model.
c1 <- confusionMatrix(data = pred1, reference = test2$classe)
c2 <- confusionMatrix(data = pred2, reference = test2$classe)
c3 <- confusionMatrix(data = combPred, reference = test2$classe)

# Cross-tabulation for the random forest
c1[["table"]]
##           Reference
## Prediction    A    B    C    D    E
##          A 1395    1    0    0    0
##          B    0  948    8    0    0
##          C    0    0  845   11    0
##          D    0    0    2  791    4
##          E    0    0    0    2  897
# Cross-tabulation for the boosted model
c2[["table"]]
##           Reference
## Prediction    A    B    C    D    E
##          A 1381   22    0    1    1
##          B    7  905   27    4   18
##          C    5   19  820   27    7
##          D    1    3    8  764   20
##          E    1    0    0    8  855
# Cross-tabulation for the stacked model
c3[["table"]]
##           Reference
## Prediction    A    B    C    D    E
##          A 1395    1    0    0    0
##          B    0  948    8    0    0
##          C    0    0  845   11    0
##          D    0    0    2  791    4
##          E    0    0    0    2  897
# Accuracy of each model on the hold-out set, shown side by side.
# cbind() of a character and a numeric vector yields a character matrix;
# deparse.level = 0 keeps the columns unlabelled.
model_labels <- c("rf", "gbm", "combrf")
model_accuracy <- c(
  c1$overall["Accuracy"],
  c2$overall["Accuracy"],
  c3$overall["Accuracy"]
)
print(cbind(model_labels, model_accuracy, deparse.level = 0))
##          [,1]     [,2]               
## Accuracy "rf"     "0.994290375203915"
## Accuracy "gbm"    "0.963499184339315"
## Accuracy "combrf" "0.994290375203915"

This puts the estimated out-of-sample error at about 0.57% (1 − 0.9943, using the accuracy reported above for the random forest and the combined model).

Below is a plot showing how well the model did in predicting:

To get a better sense of the counts of erroneous predictions: