1. Description of HAR data

Human activity recognition (HAR) research has focused on predicting “which” activity was performed at a specific point in time (as with the Daily Living Activities).

The Weight Lifting Exercises dataset instead investigated “how (well)” an activity was performed by the wearer.

Six young, healthy participants were asked to perform one set of 10 repetitions of the Unilateral Dumbbell Biceps Curl in five different fashions:

exactly according to the specification (Class A) — the correct execution

throwing the elbows to the front (Class B)

lifting the dumbbell only halfway (Class C)

lowering the dumbbell only halfway (Class D)

throwing the hips to the front (Class E)

Participants were supervised by an experienced weight lifter to make sure that all participants could easily simulate the mistakes

The exercises were kept safe and controlled by using a relatively light dumbbell (1.25 kg).

source: http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har

  1. Data Processing: loaded the required packages
library(dplyr)
library(ggplot2)
library(caret)
library(rpart)
library(randomForest)
library(rattle)

Data cleaning, to prepare the training and testing sets

# Download the training data from the course URL. The strings "NA",
# "#DIV/0!" and "" are parsed as missing values (NA); no column is removed
# at this step.
training_data <- read.csv(
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
  na.strings = c("NA", "#DIV/0!", ""), header = TRUE, sep = ","
)

# Count the missing values in a single column.
trfun <- function(x) {
  sum(is.na(x))
}

# Keep only the columns that contain no missing value at all. Note that this
# drops every column with at least one NA, not only the all-NA columns.
# vapply() guarantees one integer per column, whatever the input looks like.
trainSet <- training_data[vapply(training_data, trfun, integer(1)) == 0]

# Drop the first 7 columns, which this analysis treats as not necessary for
# prediction.
trainSet <- trainSet[, -(1:7)]

# The outcome has to be a factor for the classifiers and for
# confusionMatrix(). Since R 4.0.0 read.csv() returns strings as character,
# so convert explicitly (a no-op on older R, where it is already a factor).
trainSet$classe <- factor(trainSet$classe)
  1. Partitioned the data to create training and testing sets for model tuning
# Reproducible split on the outcome `classe`: 69% of the rows go to the
# model-building set, the remaining rows are held out for evaluation.
set.seed(1977)
Intrain <- createDataPartition(y = trainSet[["classe"]], p = 0.69, list = FALSE)

# Rows selected by the partition train the models; all other rows test them.
trainingOne <- trainSet[Intrain, ]
testingOne <- trainSet[-Intrain, ]
  1. Applied ML algorithms/methods (i). method = Decision tree (Recursive Partitioning and Regression Trees)
# Decision tree (rpart) fitted on the training partition and evaluated on the
# held-out partition.
set.seed(1977)
library(rattle)
library(rpart.plot)

# Name the response by column (`classe`) so that it is resolved inside `data`
# rather than taken from the global `trainingOne` object; the fitted tree is
# the same, but the formula no longer depends on an object outside `data`.
modelTree <- rpart(classe ~ ., data = trainingOne, method = "class")
fancyRpartPlot(modelTree, cex = 0.4)

# Predicted class labels for the held-out rows, compared with the true labels.
PredTree <- predict(modelTree, testingOne, type = "class")
confusionMatrix(PredTree, testingOne$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1421  207   20   51   12
##          B   85  763   71   59   46
##          C   67   75  848  206   95
##          D   97   72   47  602   69
##          E   59   60   74   78  896
## 
## Overall Statistics
##                                          
##                Accuracy : 0.7451         
##                  95% CI : (0.7339, 0.756)
##     No Information Rate : 0.2844         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6778         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.8219   0.6483   0.8000  0.60442   0.8014
## Specificity            0.9333   0.9468   0.9118  0.94394   0.9454
## Pos Pred Value         0.8305   0.7451   0.6569  0.67869   0.7678
## Neg Pred Value         0.9295   0.9181   0.9557  0.92413   0.9548
## Prevalence             0.2844   0.1936   0.1743  0.16382   0.1839
## Detection Rate         0.2337   0.1255   0.1395  0.09901   0.1474
## Detection Prevalence   0.2814   0.1684   0.2123  0.14589   0.1919
## Balanced Accuracy      0.8776   0.7975   0.8559  0.77418   0.8734

Accuracy of the model = 74.51%

(ii). method = Random Forest

# Random forest fitted on the training partition with the package defaults.
set.seed(1980)
library(randomForest)

# Name the response by column (`classe`) so that it is resolved inside `data`
# rather than taken from the global `trainingOne` object.
modelRF <- randomForest(classe ~ ., data = trainingOne)
modelRF
## 
## Call:
##  randomForest(formula = classe ~ ., data = trainingOne) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 0.56%
## Confusion matrix:
##      A    B    C    D    E  class.error
## A 3849    2    0    0    0 0.0005193456
## B   15 2601    4    0    0 0.0072519084
## C    0   14 2344    4    0 0.0076206605
## D    0    0   25 2190    5 0.0135135135
## E    0    0    1    6 2482 0.0028123744

# Evaluate on the held-out partition.
PredRF <- predict(modelRF, testingOne)
confRF <- confusionMatrix(PredRF, testingOne$classe)

# Mosaic plot of the confusion table; the title reports the hold-out accuracy
# in percent. `color = TRUE` asks for the standard grey shading: the earlier
# `color = confRF$byClass` passed a matrix of per-class statistics where a
# colour specification is expected.
plot(
  confRF$table,
  color = TRUE,
  cex = .5,
  main = paste("Accuracy% =", 100 * round(confRF$overall['Accuracy'], 4))
)

(iii). method = Support Vector Machine

# Support vector machine (kernlab, nu-classification with nu = 0.1) fitted on
# the training partition and evaluated on the held-out partition.
set.seed(1990)
library(kernlab)

# Name the response by column (`classe`) so that it is resolved inside `data`
# rather than taken from the global `trainingOne` object.
modelSVM <- ksvm(classe ~ ., data = trainingOne, type = "nu-svc", nu = 0.1)

# Predicted class labels for the held-out rows, compared with the true labels.
PredSVM <- predict(modelSVM, testingOne, type = "response")
confusionMatrix(PredSVM, testingOne$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1723   43    0    2    0
##          B    5 1122    6    0    1
##          C    0    6 1045   48    3
##          D    0    0    6  946   13
##          E    1    6    3    0 1101
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9765          
##                  95% CI : (0.9724, 0.9801)
##     No Information Rate : 0.2844          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9702          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9965   0.9533   0.9858   0.9498   0.9848
## Specificity            0.9897   0.9976   0.9886   0.9963   0.9980
## Pos Pred Value         0.9745   0.9894   0.9483   0.9803   0.9910
## Neg Pred Value         0.9986   0.9889   0.9970   0.9902   0.9966
## Prevalence             0.2844   0.1936   0.1743   0.1638   0.1839
## Detection Rate         0.2834   0.1845   0.1719   0.1556   0.1811
## Detection Prevalence   0.2908   0.1865   0.1812   0.1587   0.1827
## Balanced Accuracy      0.9931   0.9754   0.9872   0.9730   0.9914

Accuracy of the model = 97.65%

  1. Selected the random forest model to predict the 20 test cases
# Download the 20 quiz cases from the course URL. The strings "NA",
# "#DIV/0!" and "" are parsed as missing values (NA); no column is removed.
test_cases <- read.csv(
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
  header = TRUE,
  sep = ",",
  na.strings = c("NA", "#DIV/0!", "")
)

# Predict the class of every quiz case with the random forest model and show
# the predicted labels.
Answers_Quiz <- predict(modelRF, newdata = test_cases)
print(Answers_Quiz)
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E