Synopsis
The goal of this project is to quantify how well people perform a particular activity, i.e. to predict the manner in which they did the exercise, using the ‘classe’ variable in the given training set.
Sourcing The Data Into R
datatr <- read.csv("F:/Machine Learning/Week 4 Programming Assignment/pml-training.csv",na.strings=c("NA","#DIV/0!",""))
testingdata <- read.csv("F:/Machine Learning/Week 4 Programming Assignment/pml-testing.csv",na.strings=c("NA","#DIV/0!",""))
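A quick structural check of the imports helps catch path or na.strings mistakes early (a minimal sketch; the column count of 160 assumes the standard course files):
dim(datatr)       # expect 19622 rows, 160 columns
dim(testingdata)  # expect 20 rows, 160 columns
table(datatr$classe)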
Loading the required libraries
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RMySQL)
## Loading required package: DBI
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loaded gbm 2.1.1
library(Hmisc)
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(corrplot)
Keeping Only the Features Available in the Testing Cases
# Keep the 52 raw sensor columns that are complete (no NAs) in the test
# set; the first seven columns are row ids, user names and timestamps
features <- names(testingdata[, colSums(is.na(testingdata)) == 0])[8:59]
datatr <- datatr[, c(features, "classe")]
testingdata <- testingdata[, c(features, "problem_id")]
dim(datatr)
## [1] 19622 53
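Since the selection keeps only columns that are complete in the test set (and the corresponding raw sensor columns are complete in the training set too), neither set should contain missing values any more:
sum(is.na(datatr))       # expect 0
sum(is.na(testingdata))  # expect 0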
Creating Training & Test Data Sets
set.seed(6)
inTrain <- createDataPartition(y=datatr$classe,p=0.7,list=FALSE)
training <- datatr[inTrain,]
testing <- datatr[-inTrain,]
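Because createDataPartition samples within each level of classe, the class proportions in the two splits should match closely (an optional check):
round(prop.table(table(training$classe)), 3)
round(prop.table(table(testing$classe)), 3)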
Computing the ROC Curve and Checking that the Area Under the Curve Exceeds 0.5
As a rough sanity check, a linear model is fitted to the outcome; R warns because ‘classe’ is a factor, which lm cannot model properly, but the coerced numeric predictions still allow an AUC check.
lm1 <- lm(classe ~.,data = training)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a
## factor response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
newdata <- data.frame(testing)
pred <- predict(lm1,newdata)
print(auc(testing$classe,pred))
## Area under the curve: 0.8137
Plotting The ROC Curve
plot.roc(testing$classe,pred)

##
## Call:
## plot.roc.default(x = testing$classe, predictor = pred)
##
## Data: pred in 1674 controls (testing$classe A) < 1139 cases (testing$classe B).
## Area under the curve: 0.8137
The area under the curve is 0.8137, well above the 0.5 chance level, so the predictors clearly carry a useful signal. Note from the call output above that pROC built this ROC from only the first two outcome levels (A as controls vs B as cases).
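For a single summary over all five classes rather than just A vs B, pROC's multiclass.roc averages the pairwise AUCs (a sketch reusing the lm predictions above; the value will differ from the two-class figure):
mroc <- multiclass.roc(testing$classe, pred)
auc(mroc)  # multi-class AUC as defined by Hand and Till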
Prediction Using the Decision Tree Model
fitDT <- rpart(classe~.,data=training,method="class")
predDT <- predict(fitDT,testing,type="class")
c1 <- confusionMatrix(predDT, testing$classe)$overall[1]
c1
## Accuracy
## 0.7345794
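Instead of relying on a single train/test split, the tree's out-of-sample accuracy can also be estimated by k-fold cross-validation through caret (an illustrative sketch; caret tunes the complexity parameter cp over its default grid, so the selected tree may differ from fitDT):
set.seed(6)
fitDTcv <- train(classe ~ ., data = training, method = "rpart",
                 trControl = trainControl(method = "cv", number = 5))
fitDTcv$results  # cross-validated accuracy for each candidate cp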
Prediction Using Linear Discriminant Analysis Method
fitLDA <- train(classe ~ ., data=training, method="lda")
## Loading required package: MASS
predLDA <- predict(fitLDA, testing)
c2 <- confusionMatrix(predLDA, testing$classe)$overall[1]
c2
## Accuracy
## 0.7026338
Prediction Using Random Forest Method
fitRF <- randomForest(classe ~ ., data=training, ntree=500)
predRF <- predict(fitRF, testing,type="class")
c3 <- confusionMatrix(predRF, testing$classe)$overall[1]
c3
## Accuracy
## 0.9940527
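The fitted forest also indicates which predictors matter most via the mean decrease in Gini impurity (an optional diagnostic, not needed for the predictions below):
imp <- importance(fitRF)
# ten most important predictors by mean decrease in Gini impurity
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE], 10)
varImpPlot(fitRF, n.var = 10)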
As c3 > c1 > c2, the Random Forest method is the most accurate of the three. Hence, we will use the Random Forest model to predict the cases in the ‘pml-testing’ set.
Results
Applying the Random Forest model to the testingdata set and obtaining the predictions
predictionRF <- predict(fitRF,testingdata,type="class")
predictionRF
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
summary(predictionRF)
## A B C D E
## 7 8 1 1 3
print(fitRF)
##
## Call:
## randomForest(formula = classe ~ ., data = training, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 0.43%
## Confusion matrix:
## A B C D E class.error
## A 3905 1 0 0 0 0.0002560164
## B 10 2644 4 0 0 0.0052671181
## C 0 13 2381 2 0 0.0062604341
## D 0 0 21 2230 1 0.0097690941
## E 0 0 1 6 2518 0.0027722772
Out of Sample Error Calculation
The random forest reports an out-of-bag (OOB) error estimate of 0.43%. Since each tree is scored on the observations left out of its bootstrap sample, the OOB figure already approximates the out-of-sample error; the held-out testing set provides an independent estimate below.
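The same OOB figure can be read directly off the fitted forest object: err.rate holds the running OOB error as trees are added, so its final row matches the printed estimate up to rounding:
tail(fitRF$err.rate[, "OOB"], 1)  # OOB error after all 500 trees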
Predicting on the Testing Data Using the Random Forest Method
predRF <- predict(fitRF, testing,type="class")
length(predRF)
## [1] 5885
Accuracy of the model on the held-out testing data
outOfSampleError.accuracy <- sum(predRF == testing$classe)/length(predRF)
outOfSampleError.accuracy
## [1] 0.9940527
outOfSampleError <- 1 - outOfSampleError.accuracy
outOfSampleError
## [1] 0.005947324
e <- outOfSampleError * 100
paste0("Out of sample error estimation: ", round(e, digits = 2), "%")
## [1] "Out of sample error estimation: 0.59%"
Write Up
Writing the predicted classe for each test case to a separate “.txt” file
pml_write_files = function(x) {
  n = length(x)
  for (i in 1:n) {
    filename = paste0("problem_id_", i, ".txt")
    write.table(x[i], file = filename, quote = FALSE,
                row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(predictionRF)
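The same predictions can also be collected into a single CSV for reference (an optional convenience, not part of the assignment's required submission format):
write.csv(data.frame(problem_id = 1:length(predictionRF),
                     predicted = predictionRF),
          "predictions.csv", row.names = FALSE)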
Plotting the Confusion Matrix
# Tabulate predicted vs actual classe counts and map the count to the
# tile fill so the colour scale has something to display
dt <- as.data.frame(table(test = testing$classe, predict = predict(fitRF, testing)))
g <- ggplot(data = dt, aes(x = test, y = predict, fill = Freq))
g <- g + geom_tile(alpha = 0.8)
g <- g + ggtitle("Confusion matrix: predicted vs actual outcomes on the test set")
g <- g + scale_fill_continuous(name = "Number of records")
print(g)
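The underlying counts can also be printed as a plain table through caret, which makes individual cells easier to read than the tile colours:
confusionMatrix(predRF, testing$classe)$table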

Plotting the Correlation Plot
CP <- cor(datatr[, -length(names(datatr))])  # correlations among the 52 predictors (classe dropped)
## colour spectrum used for the correlation plot
col1 <- colorRampPalette(c("#7F0000", "red", "#FF7F00", "yellow", "white",
                           "cyan", "#007FFF", "blue", "#00007F"))
corrplot(CP, order = "hclust", addrect = 2, col = col1(100), method = "number")
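With 52 predictors the numeric correlation plot is dense; caret's findCorrelation can instead list the columns involved in the strongest pairwise correlations (an illustrative follow-up; the 0.8 cutoff is arbitrary):
highCorr <- findCorrelation(CP, cutoff = 0.8)
names(datatr)[highCorr]  # predictors flagged as highly correlated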

Decision Tree Plot
fancyRpartPlot(fitDT)
