library(lattice)
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart)
## Warning: package 'rpart' was built under R version 3.5.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.3
# Data Cleaning and Preparation ----
# Fix the RNG seed so the random partition below is reproducible.
set.seed(717)
trainurl <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
testurl <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
# Download each file only when no local copy exists, so re-running the
# analysis does not require network access.
if (!file.exists("pml-training.csv")) {
  download.file(trainurl, "pml-training.csv")
}
if (!file.exists("pml-testing.csv")) {
  download.file(testurl, "pml-testing.csv")
}
# Read the data, treating "NA", "#DIV/0!" and empty strings as missing.
training <- read.csv("pml-training.csv", na.strings = c("NA", "#DIV/0!", ""))
testing <- read.csv("pml-testing.csv", na.strings = c("NA", "#DIV/0!", ""))
# Keep only the columns that contain no missing values.
# NOTE(review): each data set is filtered on its own NA pattern, so the two
# results are not guaranteed to share the same columns -- confirm if needed.
training <- training[, colSums(is.na(training)) == 0]
testing <- testing[, colSums(is.na(testing)) == 0]
# Drop the first seven columns, which are treated as irrelevant to the
# prediction.
newtraining <- training[, -(1:7)]
newtesting <- testing[, -(1:7)]
# The outcome must be a factor for classification. Since R 4.0 read.csv() no
# longer converts strings to factors by default, so convert explicitly; this
# is a no-op when classe is already a factor.
newtraining$classe <- factor(newtraining$classe)
# For cross-validation, split the training data 75/25 into a sub-training set
# (training_train) and a held-out validation set (training_test).
cv <- createDataPartition(y = newtraining$classe, p = 0.75, list = FALSE)
training_train <- newtraining[cv, ]
training_test <- newtraining[-cv, ]
summary(newtraining)
## roll_belt pitch_belt yaw_belt total_accel_belt
## Min. :-28.90 Min. :-55.8000 Min. :-180.00 Min. : 0.00
## 1st Qu.: 1.10 1st Qu.: 1.7600 1st Qu.: -88.30 1st Qu.: 3.00
## Median :113.00 Median : 5.2800 Median : -13.00 Median :17.00
## Mean : 64.41 Mean : 0.3053 Mean : -11.21 Mean :11.31
## 3rd Qu.:123.00 3rd Qu.: 14.9000 3rd Qu.: 12.90 3rd Qu.:18.00
## Max. :162.00 Max. : 60.3000 Max. : 179.00 Max. :29.00
## gyros_belt_x gyros_belt_y gyros_belt_z
## Min. :-1.040000 Min. :-0.64000 Min. :-1.4600
## 1st Qu.:-0.030000 1st Qu.: 0.00000 1st Qu.:-0.2000
## Median : 0.030000 Median : 0.02000 Median :-0.1000
## Mean :-0.005592 Mean : 0.03959 Mean :-0.1305
## 3rd Qu.: 0.110000 3rd Qu.: 0.11000 3rd Qu.:-0.0200
## Max. : 2.220000 Max. : 0.64000 Max. : 1.6200
## accel_belt_x accel_belt_y accel_belt_z magnet_belt_x
## Min. :-120.000 Min. :-69.00 Min. :-275.00 Min. :-52.0
## 1st Qu.: -21.000 1st Qu.: 3.00 1st Qu.:-162.00 1st Qu.: 9.0
## Median : -15.000 Median : 35.00 Median :-152.00 Median : 35.0
## Mean : -5.595 Mean : 30.15 Mean : -72.59 Mean : 55.6
## 3rd Qu.: -5.000 3rd Qu.: 61.00 3rd Qu.: 27.00 3rd Qu.: 59.0
## Max. : 85.000 Max. :164.00 Max. : 105.00 Max. :485.0
## magnet_belt_y magnet_belt_z roll_arm pitch_arm
## Min. :354.0 Min. :-623.0 Min. :-180.00 Min. :-88.800
## 1st Qu.:581.0 1st Qu.:-375.0 1st Qu.: -31.77 1st Qu.:-25.900
## Median :601.0 Median :-320.0 Median : 0.00 Median : 0.000
## Mean :593.7 Mean :-345.5 Mean : 17.83 Mean : -4.612
## 3rd Qu.:610.0 3rd Qu.:-306.0 3rd Qu.: 77.30 3rd Qu.: 11.200
## Max. :673.0 Max. : 293.0 Max. : 180.00 Max. : 88.500
## yaw_arm total_accel_arm gyros_arm_x gyros_arm_y
## Min. :-180.0000 Min. : 1.00 Min. :-6.37000 Min. :-3.4400
## 1st Qu.: -43.1000 1st Qu.:17.00 1st Qu.:-1.33000 1st Qu.:-0.8000
## Median : 0.0000 Median :27.00 Median : 0.08000 Median :-0.2400
## Mean : -0.6188 Mean :25.51 Mean : 0.04277 Mean :-0.2571
## 3rd Qu.: 45.8750 3rd Qu.:33.00 3rd Qu.: 1.57000 3rd Qu.: 0.1400
## Max. : 180.0000 Max. :66.00 Max. : 4.87000 Max. : 2.8400
## gyros_arm_z accel_arm_x accel_arm_y accel_arm_z
## Min. :-2.3300 Min. :-404.00 Min. :-318.0 Min. :-636.00
## 1st Qu.:-0.0700 1st Qu.:-242.00 1st Qu.: -54.0 1st Qu.:-143.00
## Median : 0.2300 Median : -44.00 Median : 14.0 Median : -47.00
## Mean : 0.2695 Mean : -60.24 Mean : 32.6 Mean : -71.25
## 3rd Qu.: 0.7200 3rd Qu.: 84.00 3rd Qu.: 139.0 3rd Qu.: 23.00
## Max. : 3.0200 Max. : 437.00 Max. : 308.0 Max. : 292.00
## magnet_arm_x magnet_arm_y magnet_arm_z roll_dumbbell
## Min. :-584.0 Min. :-392.0 Min. :-597.0 Min. :-153.71
## 1st Qu.:-300.0 1st Qu.: -9.0 1st Qu.: 131.2 1st Qu.: -18.49
## Median : 289.0 Median : 202.0 Median : 444.0 Median : 48.17
## Mean : 191.7 Mean : 156.6 Mean : 306.5 Mean : 23.84
## 3rd Qu.: 637.0 3rd Qu.: 323.0 3rd Qu.: 545.0 3rd Qu.: 67.61
## Max. : 782.0 Max. : 583.0 Max. : 694.0 Max. : 153.55
## pitch_dumbbell yaw_dumbbell total_accel_dumbbell
## Min. :-149.59 Min. :-150.871 Min. : 0.00
## 1st Qu.: -40.89 1st Qu.: -77.644 1st Qu.: 4.00
## Median : -20.96 Median : -3.324 Median :10.00
## Mean : -10.78 Mean : 1.674 Mean :13.72
## 3rd Qu.: 17.50 3rd Qu.: 79.643 3rd Qu.:19.00
## Max. : 149.40 Max. : 154.952 Max. :58.00
## gyros_dumbbell_x gyros_dumbbell_y gyros_dumbbell_z
## Min. :-204.0000 Min. :-2.10000 Min. : -2.380
## 1st Qu.: -0.0300 1st Qu.:-0.14000 1st Qu.: -0.310
## Median : 0.1300 Median : 0.03000 Median : -0.130
## Mean : 0.1611 Mean : 0.04606 Mean : -0.129
## 3rd Qu.: 0.3500 3rd Qu.: 0.21000 3rd Qu.: 0.030
## Max. : 2.2200 Max. :52.00000 Max. :317.000
## accel_dumbbell_x accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x
## Min. :-419.00 Min. :-189.00 Min. :-334.00 Min. :-643.0
## 1st Qu.: -50.00 1st Qu.: -8.00 1st Qu.:-142.00 1st Qu.:-535.0
## Median : -8.00 Median : 41.50 Median : -1.00 Median :-479.0
## Mean : -28.62 Mean : 52.63 Mean : -38.32 Mean :-328.5
## 3rd Qu.: 11.00 3rd Qu.: 111.00 3rd Qu.: 38.00 3rd Qu.:-304.0
## Max. : 235.00 Max. : 315.00 Max. : 318.00 Max. : 592.0
## magnet_dumbbell_y magnet_dumbbell_z roll_forearm pitch_forearm
## Min. :-3600 Min. :-262.00 Min. :-180.0000 Min. :-72.50
## 1st Qu.: 231 1st Qu.: -45.00 1st Qu.: -0.7375 1st Qu.: 0.00
## Median : 311 Median : 13.00 Median : 21.7000 Median : 9.24
## Mean : 221 Mean : 46.05 Mean : 33.8265 Mean : 10.71
## 3rd Qu.: 390 3rd Qu.: 95.00 3rd Qu.: 140.0000 3rd Qu.: 28.40
## Max. : 633 Max. : 452.00 Max. : 180.0000 Max. : 89.80
## yaw_forearm total_accel_forearm gyros_forearm_x
## Min. :-180.00 Min. : 0.00 Min. :-22.000
## 1st Qu.: -68.60 1st Qu.: 29.00 1st Qu.: -0.220
## Median : 0.00 Median : 36.00 Median : 0.050
## Mean : 19.21 Mean : 34.72 Mean : 0.158
## 3rd Qu.: 110.00 3rd Qu.: 41.00 3rd Qu.: 0.560
## Max. : 180.00 Max. :108.00 Max. : 3.970
## gyros_forearm_y gyros_forearm_z accel_forearm_x accel_forearm_y
## Min. : -7.02000 Min. : -8.0900 Min. :-498.00 Min. :-632.0
## 1st Qu.: -1.46000 1st Qu.: -0.1800 1st Qu.:-178.00 1st Qu.: 57.0
## Median : 0.03000 Median : 0.0800 Median : -57.00 Median : 201.0
## Mean : 0.07517 Mean : 0.1512 Mean : -61.65 Mean : 163.7
## 3rd Qu.: 1.62000 3rd Qu.: 0.4900 3rd Qu.: 76.00 3rd Qu.: 312.0
## Max. :311.00000 Max. :231.0000 Max. : 477.00 Max. : 923.0
## accel_forearm_z magnet_forearm_x magnet_forearm_y magnet_forearm_z
## Min. :-446.00 Min. :-1280.0 Min. :-896.0 Min. :-973.0
## 1st Qu.:-182.00 1st Qu.: -616.0 1st Qu.: 2.0 1st Qu.: 191.0
## Median : -39.00 Median : -378.0 Median : 591.0 Median : 511.0
## Mean : -55.29 Mean : -312.6 Mean : 380.1 Mean : 393.6
## 3rd Qu.: 26.00 3rd Qu.: -73.0 3rd Qu.: 737.0 3rd Qu.: 653.0
## Max. : 291.00 Max. : 672.0 Max. :1480.0 Max. :1090.0
## classe
## A:5580
## B:3797
## C:3422
## D:3216
## E:3607
##
# Decision tree ----
# Fit a classification tree for classe on the sub-training set, using every
# other column as a predictor.
tree_mod <- rpart(
  formula = classe ~ .,
  data = training_train,
  method = "class"
)
# Draw the fitted tree.
rpart.plot(
  tree_mod,
  main = "Classification Tree",
  extra = 102,
  under = TRUE,
  faclen = 0,
  cex = 0.5
)
# Predict class labels for the held-out validation rows and compare them
# with the true labels.
tree_pred <- predict(tree_mod, newdata = training_test, type = "class")
confusionMatrix(data = tree_pred, reference = training_test$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1284 182 13 81 32
## B 31 522 90 38 74
## C 38 123 667 56 58
## D 27 68 55 532 70
## E 15 54 30 97 667
##
## Overall Statistics
##
## Accuracy : 0.7488
## 95% CI : (0.7364, 0.7609)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6807
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9204 0.5501 0.7801 0.6617 0.7403
## Specificity 0.9122 0.9411 0.9321 0.9463 0.9510
## Pos Pred Value 0.8065 0.6914 0.7081 0.7074 0.7729
## Neg Pred Value 0.9665 0.8971 0.9525 0.9345 0.9421
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2618 0.1064 0.1360 0.1085 0.1360
## Detection Prevalence 0.3246 0.1540 0.1921 0.1533 0.1760
## Balanced Accuracy 0.9163 0.7456 0.8561 0.8040 0.8457
# Random forest ----
# Fit a random forest for classe on the sub-training set. randomForest() has
# no `method` argument (the original `method = "class"` was silently ignored
# via `...`); classification is selected because the response is a factor.
rf_mod <- randomForest(classe ~ ., data = training_train)
# Predict class labels for the held-out validation rows. "response" is the
# documented value; "class" is only accepted as a legacy alias for it.
rf_pred <- predict(rf_mod, newdata = training_test, type = "response")
# Compare the predictions with the true validation labels.
confusionMatrix(rf_pred, training_test$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1394 4 0 0 0
## B 1 944 7 0 0
## C 0 1 848 6 0
## D 0 0 0 798 2
## E 0 0 0 0 899
##
## Overall Statistics
##
## Accuracy : 0.9957
## 95% CI : (0.9935, 0.9973)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9946
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9993 0.9947 0.9918 0.9925 0.9978
## Specificity 0.9989 0.9980 0.9983 0.9995 1.0000
## Pos Pred Value 0.9971 0.9916 0.9918 0.9975 1.0000
## Neg Pred Value 0.9997 0.9987 0.9983 0.9985 0.9995
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2843 0.1925 0.1729 0.1627 0.1833
## Detection Prevalence 0.2851 0.1941 0.1743 0.1631 0.1833
## Balanced Accuracy 0.9991 0.9964 0.9950 0.9960 0.9989
Looking at the results, the random forest model clearly provides a more accurate prediction of classe, with an accuracy of 0.9957 compared to the decision tree's 0.7488. The expected out-of-sample error, estimated as 1 - accuracy on the held-out validation split, is 0.0043 (about 0.4%).