1. Data preprocessing 2. Building models using different methods (e.g. rpart, rf) 3. Applying cross-validation 4. Estimating the out-of-sample error 5. Predictions
1. Loading packages and importing data
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version
## 3.1.3
library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.1.3
library(rattle)
## Warning: package 'rattle' was built under R version 3.1.3
## Loading required package: RGtk2
## Warning: package 'RGtk2' was built under R version 3.1.3
## Rattle: A free graphical interface for data mining with R.
## Version 3.5.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
## Loading required package: rpart
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.3
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Paths to the raw data files, kept separate from the loaded data frames so
# the file names are not overwritten by the data.
training_file <- "pml-training.csv"
testing_file <- "pml-testing.csv"
# Import the data, treating both the literal "NA" and empty fields as missing.
training <- read.csv(training_file, na.strings = c("NA", ""), header = TRUE)
column_training <- colnames(training)
testing <- read.csv(testing_file, na.strings = c("NA", ""), header = TRUE)
column_testing <- colnames(testing)
# Verify that the column names (excluding the last column of each file:
# classe in training, problem_id in testing) are identical in the two sets.
# head(x, -1) drops the last element explicitly; the earlier
# `1:length(x)-1` parsed as `(1:length(x)) - 1` and only worked because R
# silently ignores a 0 index.
all.equal(head(column_training, -1), head(column_testing, -1))
## [1] TRUE
# Split the labelled data 60/40 into a model-building part (trainpart) and a
# held-out validation part (testpart), stratified on classe.
# createDataPartition samples randomly, so fix the seed first; without it the
# split (and every number reported below) changes from run to run.
set.seed(666)
Train <- createDataPartition(y = training$classe, p = 0.6, list = FALSE)
trainpart <- training[Train, ]
testpart <- training[-Train, ]
dim(trainpart)
## [1] 11776 160
# Show the dimensions (rows, columns) of the held-out split testpart.
dim(testpart)
## [1] 7846 160
# Remove variables that are almost always NA (more than 95% missing).
# The mask is computed on trainpart only and then applied to both splits, so
# both keep exactly the same columns.
NAs <- vapply(trainpart, function(x) mean(is.na(x)) > 0.95, logical(1))
trainpart <- trainpart[, !NAs, drop = FALSE]
testpart <- testpart[, !NAs, drop = FALSE]
# Remove variables with nearly zero variance.
# Guard the empty case: negative indexing with integer(0) would select zero
# columns instead of keeping all of them.
nzv <- nearZeroVar(trainpart)
if (length(nzv) > 0) {
  trainpart <- trainpart[, -nzv, drop = FALSE]
  testpart <- testpart[, -nzv, drop = FALSE]
}
# Remove variables that don't make sense for prediction (row index, user,
# timestamps and window bookkeeping). They are dropped by NAME rather than by
# position: the filters above may already have removed some of them
# (presumably new_window, via nearZeroVar), in which case dropping columns
# 1:7 would also discard a genuine sensor predictor.
id_columns <- c(
  "X", "user_name", "raw_timestamp_part_1", "raw_timestamp_part_2",
  "cvtd_timestamp", "new_window", "num_window"
)
keep_columns <- setdiff(colnames(trainpart), id_columns)
trainpart <- trainpart[, keep_columns, drop = FALSE]
testpart <- testpart[, keep_columns, drop = FALSE]
# Fit a single classification tree (method = "rpart") with caret's default
# resampling. The seed is fixed so the resampling is reproducible.
set.seed(666)
# Name the response by column (classe ~ .) so the whole formula is resolved
# inside `data`, instead of reaching out to the global trainpart object.
modFit <- train(classe ~ ., data = trainpart, method = "rpart")
print(modFit, digits = 3)
## CART
##
## 11776 samples
## 51 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 11776, 11776, 11776, 11776, 11776, 11776, ...
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.0299 0.526 0.392 0.0399 0.0604
## 0.0323 0.515 0.377 0.0398 0.0609
## 0.0638 0.349 0.106 0.0860 0.1447
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0299.
# Show the splits of the tree that caret selected as the final model.
print(modFit[["finalModel"]], digits = 3)
## n= 11776
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 11776 8430 A (0.28 0.19 0.17 0.16 0.18)
## 2) pitch_forearm< -33.7 947 11 A (0.99 0.012 0 0 0) *
## 3) pitch_forearm>=-33.7 10829 8420 A (0.22 0.21 0.19 0.18 0.2)
## 6) accel_belt_z>=-186 10165 7770 A (0.24 0.22 0.2 0.19 0.15)
## 12) magnet_dumbbell_y< 440 8516 6170 A (0.28 0.18 0.23 0.19 0.13)
## 24) roll_forearm< 124 5370 3260 A (0.39 0.18 0.17 0.16 0.092) *
## 25) roll_forearm>=124 3146 2100 C (0.074 0.17 0.33 0.23 0.19)
## 50) accel_forearm_x>=-104 2216 1390 C (0.084 0.21 0.37 0.1 0.23) *
## 51) accel_forearm_x< -104 930 444 D (0.052 0.091 0.23 0.52 0.1) *
## 13) magnet_dumbbell_y>=440 1649 886 B (0.031 0.46 0.045 0.21 0.25) *
## 7) accel_belt_z< -186 664 16 E (0.024 0 0 0 0.98) *
# Draw the selected tree, then score the model on the held-out split.
fancyRpartPlot(modFit[["finalModel"]])
predictions <- predict(object = modFit, newdata = testpart)
print(
  confusionMatrix(data = predictions, reference = testpart$classe),
  digits = 4
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2016 614 649 580 345
## B 30 523 33 225 289
## C 129 327 542 155 325
## D 40 53 144 326 73
## E 17 1 0 0 410
##
## Overall Statistics
##
## Accuracy : 0.4865
## 95% CI : (0.4754, 0.4976)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3281
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9032 0.34453 0.39620 0.25350 0.28433
## Specificity 0.6103 0.90882 0.85551 0.95274 0.99719
## Pos Pred Value 0.4795 0.47545 0.36671 0.51258 0.95794
## Neg Pred Value 0.9407 0.85251 0.87029 0.86685 0.86088
## Prevalence 0.2845 0.19347 0.17436 0.16391 0.18379
## Detection Rate 0.2569 0.06666 0.06908 0.04155 0.05226
## Detection Prevalence 0.5358 0.14020 0.18838 0.08106 0.05455
## Balanced Accuracy 0.7567 0.62668 0.62585 0.60312 0.64076
# Same tree model, now with predictors centered and scaled before fitting.
set.seed(666)
# The response is named by column so the formula is resolved inside `data`
# rather than against the global trainpart object.
modFit <- train(
  classe ~ .,
  data = trainpart,
  method = "rpart",
  preProcess = c("center", "scale")
)
print(modFit, digits = 3)
## CART
##
## 11776 samples
## 51 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## Pre-processing: centered, scaled
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 11776, 11776, 11776, 11776, 11776, 11776, ...
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.0299 0.526 0.392 0.0399 0.0604
## 0.0323 0.515 0.377 0.0398 0.0609
## 0.0638 0.349 0.106 0.0860 0.1447
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0299.
# Same tree model, resampled with 4-fold cross-validation instead of the
# default scheme.
set.seed(666)
# The response is named by column so the formula is resolved inside `data`
# rather than against the global trainpart object.
modFit <- train(
  classe ~ .,
  data = trainpart,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 4)
)
print(modFit, digits = 3)
## CART
##
## 11776 samples
## 51 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
##
## Summary of sample sizes: 8831, 8833, 8831, 8833
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.0299 0.519 0.377 0.0324 0.0518
## 0.0323 0.507 0.361 0.0389 0.0609
## 0.0638 0.374 0.150 0.1042 0.1729
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0299.
# Tree model with both centering/scaling and 4-fold cross-validation.
set.seed(666)
# The response is named by column so the formula is resolved inside `data`
# rather than against the global trainpart object.
modFit <- train(
  classe ~ .,
  data = trainpart,
  method = "rpart",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 4)
)
print(modFit, digits = 3)
## CART
##
## 11776 samples
## 51 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (4 fold)
##
## Summary of sample sizes: 8831, 8833, 8831, 8833
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.0299 0.519 0.377 0.0324 0.0518
## 0.0323 0.507 0.361 0.0389 0.0609
## 0.0638 0.374 0.150 0.1042 0.1729
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0299.
# Score the current model on the held-out split and report the confusion
# matrix with its summary statistics.
predictions <- predict(object = modFit, newdata = testpart)
print(
  confusionMatrix(data = predictions, reference = testpart$classe),
  digits = 4
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2016 614 649 580 345
## B 30 523 33 225 289
## C 129 327 542 155 325
## D 40 53 144 326 73
## E 17 1 0 0 410
##
## Overall Statistics
##
## Accuracy : 0.4865
## 95% CI : (0.4754, 0.4976)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3281
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9032 0.34453 0.39620 0.25350 0.28433
## Specificity 0.6103 0.90882 0.85551 0.95274 0.99719
## Pos Pred Value 0.4795 0.47545 0.36671 0.51258 0.95794
## Neg Pred Value 0.9407 0.85251 0.87029 0.86685 0.86088
## Prevalence 0.2845 0.19347 0.17436 0.16391 0.18379
## Detection Rate 0.2569 0.06666 0.06908 0.04155 0.05226
## Detection Prevalence 0.5358 0.14020 0.18838 0.08106 0.05455
## Balanced Accuracy 0.7567 0.62668 0.62585 0.60312 0.64076
Incorporating both preprocessing and cross-validation has no impact on the accuracy of the tree model (0.4865 on the held-out split in both cases). Let's try a different method for building the model.
# Train a random forest on trainpart with only cross-validation (4 folds, no
# pre-processing).
set.seed(666)
# The response is named by column so the formula is resolved inside `data`
# rather than against the global trainpart object.
modFit <- train(
  classe ~ .,
  data = trainpart,
  method = "rf",
  trControl = trainControl(method = "cv", number = 4)
)
print(modFit, digits = 3)
## Random Forest
##
## 11776 samples
## 51 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
##
## Summary of sample sizes: 8831, 8833, 8831, 8833
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.988 0.984 0.00102 0.00129
## 26 0.990 0.987 0.00185 0.00234
## 51 0.985 0.981 0.00179 0.00226
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 26.
# Score the random forest on the held-out split and report the confusion
# matrix with its summary statistics.
predictions <- predict(object = modFit, newdata = testpart)
print(
  confusionMatrix(data = predictions, reference = testpart$classe),
  digits = 4
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2229 10 0 0 0
## B 3 1502 4 1 0
## C 0 5 1358 13 5
## D 0 0 6 1270 4
## E 0 1 0 2 1433
##
## Overall Statistics
##
## Accuracy : 0.9931
## 95% CI : (0.991, 0.9948)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9913
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9987 0.9895 0.9927 0.9876 0.9938
## Specificity 0.9982 0.9987 0.9964 0.9985 0.9995
## Pos Pred Value 0.9955 0.9947 0.9833 0.9922 0.9979
## Neg Pred Value 0.9995 0.9975 0.9985 0.9976 0.9986
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2841 0.1914 0.1731 0.1619 0.1826
## Detection Prevalence 0.2854 0.1925 0.1760 0.1631 0.1830
## Balanced Accuracy 0.9984 0.9941 0.9946 0.9930 0.9966
# Apply the model built in the previous step to the 20 cases of the provided
# testing set and print the predicted classes.
print(predict(object = modFit, newdata = testing))
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Random forest with both centering/scaling and 4-fold cross-validation.
set.seed(666)
# The response is named by column so the formula is resolved inside `data`
# rather than against the global trainpart object.
modFit <- train(
  classe ~ .,
  data = trainpart,
  method = "rf",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 4)
)
print(modFit, digits = 3)
## Random Forest
##
## 11776 samples
## 51 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (4 fold)
##
## Summary of sample sizes: 8831, 8833, 8831, 8833
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.988 0.985 0.000833 0.00106
## 26 0.990 0.987 0.001186 0.00150
## 51 0.984 0.980 0.002282 0.00289
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 26.
# Score the pre-processed random forest on the held-out split and report the
# confusion matrix with its summary statistics.
predictions <- predict(object = modFit, newdata = testpart)
print(
  confusionMatrix(data = predictions, reference = testpart$classe),
  digits = 4
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2229 8 0 0 0
## B 3 1503 6 0 0
## C 0 6 1356 12 4
## D 0 0 6 1272 4
## E 0 1 0 2 1434
##
## Overall Statistics
##
## Accuracy : 0.9934
## 95% CI : (0.9913, 0.995)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9916
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9987 0.9901 0.9912 0.9891 0.9945
## Specificity 0.9986 0.9986 0.9966 0.9985 0.9995
## Pos Pred Value 0.9964 0.9940 0.9840 0.9922 0.9979
## Neg Pred Value 0.9995 0.9976 0.9981 0.9979 0.9988
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2841 0.1916 0.1728 0.1621 0.1828
## Detection Prevalence 0.2851 0.1927 0.1756 0.1634 0.1832
## Balanced Accuracy 0.9986 0.9943 0.9939 0.9938 0.9970
# Print the predicted classes for the provided testing set using the current
# model.
print(predict(object = modFit, newdata = testing))
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
Preprocessing raised the accuracy slightly, from 0.9931 to 0.9934, on the held-out split (testpart), which gives an estimated out-of-sample error of about 0.66%. Thus I decided to apply both preprocessing and cross-validation to the final model.