Synopsis

The goal of this project is to quantify how well people perform a particular activity, i.e., to predict the manner in which they did the exercise, as recorded by the ‘classe’ variable in the given training set.

Sourcing The Data Into R

# Read the training and testing CSV files. The tokens "NA", "#DIV/0!" and the
# empty string are all treated as missing values on import.
datatr <- read.csv(
  file = "F:/Machine Learning/Week 4 Programming Assignment/pml-training.csv",
  na.strings = c("NA", "#DIV/0!", "")
)

testingdata <- read.csv(
  file = "F:/Machine Learning/Week 4 Programming Assignment/pml-testing.csv",
  na.strings = c("NA", "#DIV/0!", "")
)

Loading the required libraries

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RMySQL)
## Loading required package: DBI
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
library(gbm)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loaded gbm 2.1.1
library(Hmisc)
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(corrplot)

Only use features used in testing cases

# `features` is not defined anywhere else in this script, so derive it here:
# keep the columns of the pml-testing data whose names contain "belt", "arm"
# (which also matches "forearm") or "dumbbell", that contain no missing
# values, and that are also present in the training data.
# NOTE(review): the original feature-selection code is not shown in this
# report -- confirm that this selection reproduces the column count printed
# by dim() below.
is_complete <- vapply(testingdata, function(col) !anyNA(col), logical(1))
is_sensor <- grepl("belt|arm|dumbbell", names(testingdata))
features <- intersect(
  names(testingdata)[is_complete & is_sensor],
  names(datatr)
)

# Restrict both data sets to the selected predictors plus their respective
# outcome ("classe") or identifier ("problem_id") column.
datatr <- datatr[, c(features, "classe")]
testingdata <- testingdata[, c(features, "problem_id")]
dim(datatr)
## [1] 19622    53

Creating Training & Test Data Sets

# Fix the RNG seed so the split is reproducible, then draw a partition on the
# outcome with p = 0.7: the selected rows become `training`, the remaining
# rows become `testing`.
set.seed(6)
inTrain <- createDataPartition(
  y = datatr[["classe"]],
  p = 0.7,
  list = FALSE
)
training <- datatr[inTrain, ]
testing <- datatr[-inTrain, ]

Computing & Checking whether Area Under Curve > 0.5 by plotting ROC Curve

# Fit a linear model of `classe` on all other columns of the training split.
# NOTE(review): the warnings recorded directly below show that `classe` is a
# factor, so an ordinary linear regression is not an appropriate model for
# this response -- treat the resulting predictions with caution.
lm1 <- lm(classe ~.,data = training)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a
## factor response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
# Copy the hold-out split into a data frame and predict on it with the
# linear model.
newdata <- data.frame(testing)
pred <- predict(lm1,newdata)
# Print the area under the ROC curve of `pred` against the observed classes.
# NOTE(review): auc() is presumably pROC's (loaded above). The recorded
# plot.roc output further down reports only classes A (controls) and
# B (cases), so this AUC does not appear to cover all five classes -- confirm.
print(auc(testing$classe,pred))
## Area under the curve: 0.8137

Plotting The ROC Curve

# Draw the ROC curve of the linear-model predictions against the observed
# classes of the hold-out split.
plot.roc(testing$classe, pred)

## 
## Call:
## plot.roc.default(x = testing$classe, predictor = pred)
## 
## Data: pred in 1674 controls (testing$classe A) < 1139 cases (testing$classe B).
## Area under the curve: 0.8137

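As the area under the curve is 0.8137, which is greater than 0.8, the fit is considered good. Note that, as the output above shows, this ROC curve compares only classes A (controls) and B (cases).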

Predictions using Decision Tree Model

# Fit a classification tree on the training split, predict the class of each
# hold-out row, and keep the overall accuracy from the confusion matrix in c1.
fitDT <- rpart(classe ~ ., data = training, method = "class")
predDT <- predict(fitDT, newdata = testing, type = "class")
c1 <- confusionMatrix(
  data = predDT,
  reference = testing$classe
)$overall["Accuracy"]
c1
##  Accuracy 
## 0.7345794

Prediction Using Linear Discriminant Analysis Method

# Train a linear discriminant analysis model through caret's train(), predict
# the hold-out split, and keep the overall accuracy in c2.
fitLDA <- train(classe ~ ., data = training, method = "lda")
## Loading required package: MASS
predLDA <- predict(fitLDA, newdata = testing)
c2 <- confusionMatrix(
  data = predLDA,
  reference = testing$classe
)$overall["Accuracy"]
c2
##  Accuracy 
## 0.7026338

Prediction Using Random Forest Method

# Grow a 500-tree random forest on the training split, predict the hold-out
# split, and keep the overall accuracy in c3.
fitRF <- randomForest(classe ~ ., data = training, ntree = 500)
predRF <- predict(fitRF, newdata = testing, type = "class")
c3 <- confusionMatrix(
  data = predRF,
  reference = testing$classe
)$overall["Accuracy"]
c3
##  Accuracy 
## 0.9940527

As c3 > c1 > c2, we conclude that the Random Forest method is the most accurate of the three. Hence, we will use the Random Forest model to predict the cases in the ‘pml-testing’ set.

RESULTS

Applying the Random Forest model to testingdata set and getting the predictions

# Predict the class of every case in the pml-testing data with the random
# forest, show the individual predictions and their per-class counts, then
# print the fitted forest itself.
predictionRF <- predict(fitRF, newdata = testingdata, type = "class")
predictionRF
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E
summary(predictionRF)
## A B C D E 
## 7 8 1 1 3
print(fitRF)
## 
## Call:
##  randomForest(formula = classe ~ ., data = training, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 0.43%
## Confusion matrix:
##      A    B    C    D    E  class.error
## A 3905    1    0    0    0 0.0002560164
## B   10 2644    4    0    0 0.0052671181
## C    0   13 2381    2    0 0.0062604341
## D    0    0   21 2230    1 0.0097690941
## E    0    0    1    6 2518 0.0027722772

Out of sample error Calculation

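The out-of-bag (OOB) error rate estimated on the training data is 0.43% (about 0.4%), as reported in the random forest summary above.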

Predicting on testing Data using Random Forest Method

# Re-compute the random forest predictions for the hold-out split and report
# how many predictions were produced.
predRF <- predict(fitRF, newdata = testing, type = "class")
length(predRF)
## [1] 5885

True accuracy of the predicted model

# Accuracy on the hold-out split: the share of predictions that equal the
# observed class.
outOfSampleError.accuracy <- sum(predRF == testing$classe) / length(predRF)

outOfSampleError.accuracy
## [1] 0.9940527
# The out-of-sample error is the complement of that accuracy.
outOfSampleError <- 1 - outOfSampleError.accuracy
outOfSampleError
## [1] 0.005947324
# Express the error as a percentage, rounded to two digits for the message.
e <- 100 * outOfSampleError
paste0("Out of sample error estimation: ", round(e, 2), "%")
## [1] "Out of sample error estimation: 0.59%"

Write up

Write each predicted class label to its own “.txt” file

#' Write each prediction to its own text file
#'
#' Writes element `i` of `x` to "problem_id_<i>.txt" in the current working
#' directory, one value per file, without quotes, row names or column names.
#'
#' @param x A vector of predictions (e.g. a factor or character vector).
#' @return `NULL`, invisibly; called for its side effect of writing files.
pml_write_files <- function(x) {
  # seq_along() performs no iterations for empty input, whereas 1:length(x)
  # would iterate over c(1, 0) and write spurious files.
  for (i in seq_along(x)) {
    filename <- paste0("problem_id_", i, ".txt")
    write.table(
      x[i],
      file = filename,
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE
    )
  }
  invisible(NULL)
}

# Write the random forest predictions for the pml-testing cases to disk, one
# "problem_id_<i>.txt" file per prediction, in the current working directory.
pml_write_files(predictionRF)

Plotting Confusion Matrix

# Pair each observed hold-out class with the random forest prediction.
dt <- data.frame(test = testing$classe, predict = predict(fitRF, testing))
# Count the records in every (observed, predicted) cell. Without these counts
# mapped to `fill`, every tile is drawn in the same colour and the
# "Number of records" fill scale has nothing to display.
cm_counts <- as.data.frame(table(dt))
g <- ggplot(data = cm_counts, aes(x = test, y = predict, fill = Freq))
g <- g + geom_tile(alpha = 0.8)
g <- g + ggtitle("Confusion matrix. Predicted vs Actual outcomes on test set")
g <- g + scale_fill_continuous(name = "Number of records")
print(g)

Plotting Correlation Plot

# Correlation matrix of every column of datatr except the last one.
CP <- cor(datatr[, -ncol(datatr)])

# Candidate colour ramps; only col1 is passed to corrplot() below.
col1 <- colorRampPalette(
  c("#7F0000", "red", "#FF7F00", "yellow", "white",
    "cyan", "#007FFF", "blue", "#00007F")
)
col2 <- colorRampPalette(
  c("#67001F", "#B2182B", "#D6604D", "#F4A582", "#FDDBC7", "#FFFFFF",
    "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC", "#053061")
)
col3 <- colorRampPalette(c("red", "white", "blue"))
col4 <- colorRampPalette(
  c("#7F0000", "red", "#FF7F00", "yellow", "#7FFF7F",
    "cyan", "#007FFF", "blue", "#00007F")
)
wb <- c("white", "black")
## using these color spectrums
# Plot the correlations as numbers, ordered by hierarchical clustering, with
# rectangles drawn around two clusters.
corrplot(
  CP,
  order = "hclust",
  addrect = 2,
  col = col1(100),
  method = "number"
)

Decision Tree Plot

# Draw the decision tree fitted earlier (fitDT) with rattle's fancyRpartPlot().
fancyRpartPlot(fitDT)