This exercise presents a summary and analysis of the weight lifting exercises dataset as part of the Coursera Practical Machine Learning course.
The data are available from the URLs used in the code below. They consist of sensor measurements taken from 6 healthy volunteers who each performed 10 repetitions of Unilateral Dumbbell Curls (UDC) in 5 different ways, only one of which was correct. The objective of the analysis is to predict which form of UDC was performed in each of 20 test cases.
First we need to download and examine the data.
# Download the training and test sets. Both "NA" and the spreadsheet error
# token "#DIV/0!" are read as missing values.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no longer
# converts strings to factors by default, and the classification steps further
# down need the outcome `classe` to be a factor.
train <- read.csv(
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
  na.strings = c("NA", "#DIV/0!"),
  stringsAsFactors = TRUE
)
test <- read.csv(
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
  na.strings = c("NA", "#DIV/0!"),
  stringsAsFactors = TRUE
)
dim(train)
## [1] 19622 160
We need to limit the dataset to the sensor measurement variables only, and we will exclude any variable that contains missing values.
# Select the sensor measurement columns (belt, arm, dumbbell, forearm) plus
# column 160 from both the training and the test data.
sensordata <- grep(pattern = "_belt|_arm|_dumbbell|_forearm", names(train))
data <- train[, c(sensordata, 160)]
datatest <- test[, c(sensordata, 160)]

# Share of missing values per column. colMeans(is.na(.)) gives the same named
# vector as the apply() form without coercing the data frame to a matrix.
nadata <- colMeans(is.na(data))
nadatatest <- colMeans(is.na(datatest))

# Keep only columns with no missing values. A logical mask is used instead of
# `-which(...)`: a negative index built from an empty which() result selects
# zero columns rather than all of them. This also avoids an index variable
# named `t`, which would mask base::t().
train1 <- data[, nadata == 0, drop = FALSE]
test1 <- datatest[, nadatatest == 0, drop = FALSE]

# The two sets are filtered independently, so fail loudly if a predictor kept
# in the training data was dropped from the test data.
lostpredictors <- setdiff(setdiff(names(train1), "classe"), names(test1))
if (length(lostpredictors) > 0) {
  stop(
    "Predictors missing from the test data: ",
    paste(lostpredictors, collapse = ", "),
    call. = FALSE
  )
}
dim(train1); dim(test1)
## [1] 19622 53
## [1] 20 53
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(corrplot)
## partition data
# Split the labelled data 60/40, stratified on `classe`, into a model-building
# set (inTrain) and a hold-out validation set (inTest).
set.seed(123)
# The row index is stored in `trainRows` rather than `train`, so the raw
# training data frame is not overwritten and the name does not collide with
# caret::train(), which is called later in the script.
trainRows <- createDataPartition(train1$classe, p = 0.6, list = FALSE)
inTrain <- train1[trainRows, ]
inTest <- train1[-trainRows, ]
dim(inTrain); dim(inTest)
## [1] 11776 53
## [1] 7846 53
library(corrplot)
# Correlation matrix of the predictors. The outcome column is excluded by name
# rather than by a hard-coded position, and the result is not stored under the
# name `cor`, which would mask stats::cor().
predictorCor <- cor(inTrain[, names(inTrain) != "classe"])
# Lower-triangle colour plot, variables ordered by first principal component.
corrplot(predictorCor, order = "FPC", method = "color", type = "lower",
         tl.cex = 0.7, tl.col = rgb(0, 0, 0))
# Fit a single classification tree (rpart, via caret) on all predictors.
modtree <- train(
  classe ~ .,
  data = inTrain,
  method = "rpart"
)
## Loading required package: rpart
# Draw the fitted tree and label its nodes.
plot(modtree$finalModel)
text(modtree$finalModel, pretty = 1)

# Score the hold-out set and compare predictions with the true classes.
predtree <- predict(modtree, newdata = inTest)
confusionMatrix(data = predtree, reference = inTest$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1356 239 42 68 20
## B 5 264 34 8 6
## C 441 263 907 397 269
## D 422 752 385 813 510
## E 8 0 0 0 637
##
## Overall Statistics
##
## Accuracy : 0.5069
## 95% CI : (0.4958, 0.518)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3865
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.6075 0.17391 0.6630 0.6322 0.44175
## Specificity 0.9343 0.99162 0.7885 0.6846 0.99875
## Pos Pred Value 0.7861 0.83281 0.3983 0.2821 0.98760
## Neg Pred Value 0.8569 0.83344 0.9172 0.9047 0.88821
## Prevalence 0.2845 0.19347 0.1744 0.1639 0.18379
## Detection Rate 0.1728 0.03365 0.1156 0.1036 0.08119
## Detection Prevalence 0.2199 0.04040 0.2902 0.3673 0.08221
## Balanced Accuracy 0.7709 0.58277 0.7258 0.6584 0.72025
Overall accuracy is poor at 0.51.
# Fit a 500-tree random forest on all predictors; the seed fixes the
# bootstrap samples and per-split variable selection.
set.seed(321)
modrf <- randomForest(
  classe ~ .,
  data = inTrain,
  ntree = 500
)
# Show the model summary, including the out-of-bag error estimate.
print(modrf)
##
## Call:
## randomForest(formula = classe ~ ., data = inTrain, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 0.69%
## Confusion matrix:
## A B C D E class.error
## A 3343 2 1 1 1 0.001493429
## B 10 2264 5 0 0 0.006581834
## C 1 19 2029 5 0 0.012171373
## D 1 0 24 1903 2 0.013989637
## E 0 0 2 7 2156 0.004157044
# Dot chart of variable importance for the random forest (small labels).
varImpPlot(x = modrf, cex = 0.5)
The random forest is much more accurate, with an out-of-bag error estimate of only 0.69%.
# Validate the random forest on the hold-out set.
pred <- predict(modrf, newdata = inTest)
confusionMatrix(data = pred, reference = inTest$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2230 9 0 0 0
## B 2 1506 9 0 0
## C 0 3 1357 12 3
## D 0 0 2 1274 4
## E 0 0 0 0 1435
##
## Overall Statistics
##
## Accuracy : 0.9944
## 95% CI : (0.9925, 0.9959)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9929
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9991 0.9921 0.9920 0.9907 0.9951
## Specificity 0.9984 0.9983 0.9972 0.9991 1.0000
## Pos Pred Value 0.9960 0.9927 0.9869 0.9953 1.0000
## Neg Pred Value 0.9996 0.9981 0.9983 0.9982 0.9989
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2842 0.1919 0.1730 0.1624 0.1829
## Detection Prevalence 0.2854 0.1933 0.1752 0.1631 0.1829
## Balanced Accuracy 0.9988 0.9952 0.9946 0.9949 0.9976
# Predict the class of each case in the supplied test set with the forest.
pred1 <- predict(modrf, newdata = test1)
print(pred1)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E