## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.4
## Warning: package 'kernlab' was built under R version 3.2.5
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
## Warning: package 'e1071' was built under R version 3.2.5
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## Warning: package 'rpart.plot' was built under R version 3.2.5
## Loading required package: rpart
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## Warning: package 'gbm' was built under R version 3.2.5
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
# ---- Load data -------------------------------------------------------------
# Read the CSVs through an explicit directory instead of calling setwd(), so
# the session's working directory is left alone. Set the PML_DATA_DIR
# environment variable to point somewhere else; when it is unset or empty the
# original author's location is used.
data_dir <- Sys.getenv("PML_DATA_DIR")
if (!nzchar(data_dir)) {
  data_dir <- "/Users/Mood/Documents/Coursera/Machine Learning"
}

# Both "NA" and the empty string are treated as missing values.
training <- read.csv(
  file.path(data_dir, "pml-training.csv"),
  na.strings = c("NA", "")
)
testing <- read.csv(
  file.path(data_dir, "pml-testing.csv"),
  na.strings = c("NA", "")
)

# ---- Clean -----------------------------------------------------------------
# Keep only columns with no missing values at all. drop = FALSE guarantees a
# data frame even if a single column survives.
# NOTE(review): the two files are filtered independently, so the surviving
# columns are not guaranteed to match between them -- confirm for new data.
training <- training[, colSums(is.na(training)) == 0, drop = FALSE]
testing <- testing[, colSums(is.na(testing)) == 0, drop = FALSE]

# Drop the first seven columns.
# NOTE(review): presumably row-id / bookkeeping fields rather than
# predictors -- confirm against the data dictionary.
trainData <- training[, -(1:7)]
testData <- testing[, -(1:7)]

# ---- Split -----------------------------------------------------------------
# 70% of the labelled rows go to model fitting, 30% to validation. The seed
# makes the partition reproducible.
set.seed(5000)
inTrain <- createDataPartition(trainData$classe, p = 0.7, list = FALSE)
# `train` (the data frame) shares its name with caret's train() function.
# Calls still resolve to the function, but the name is kept only because
# later chunks refer to it.
train <- trainData[inTrain, ]
valid <- trainData[-inTrain, ]

# ---- Classification tree ---------------------------------------------------
# 10-fold cross-validation, reused by every model fitted below.
Path <- trainControl(method = "cv", number = 10)
fit_rpart <- train(
  classe ~ .,
  data = train,
  method = "rpart",
  trControl = Path
)
print(fit_rpart, digits = 4)
## CART
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 12364, 12362, 12366, 12362, 12363, 12363, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.03672 0.5037 0.35188
## 0.05971 0.4277 0.22899
## 0.11637 0.3245 0.06142
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03672.
# Plot the tree selected by cross-validation.
fancyRpartPlot(fit_rpart$finalModel)

# Apply the classification tree to the validation set.
predict_rpart <- predict(fit_rpart, valid)

# confusionMatrix() expects the predictions first (`data`) and the true
# labels second (`reference`). The previous call had them swapped, which
# transposed the table and computed the per-class statistics (sensitivity,
# specificity, prevalence, ...) against the predictions instead of the truth.
# Overall accuracy and kappa are unaffected by the swap.
# Re-knit the report to refresh the rendered output that follows.
conf_part <- confusionMatrix(
  data = predict_rpart,
  reference = valid$classe
)
print(conf_part)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1493 30 122 0 29
## B 485 394 260 0 0
## C 465 39 522 0 0
## D 437 179 348 0 0
## E 142 128 302 0 510
##
## Overall Statistics
##
## Accuracy : 0.496
## 95% CI : (0.4832, 0.5089)
## No Information Rate : 0.5135
## P-Value [Acc > NIR] : 0.9965
##
## Kappa : 0.3418
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.4940 0.51169 0.3359 NA 0.94620
## Specificity 0.9368 0.85435 0.8836 0.8362 0.89300
## Pos Pred Value 0.8919 0.34592 0.5088 NA 0.47135
## Neg Pred Value 0.6369 0.92078 0.7876 NA 0.99396
## Prevalence 0.5135 0.13084 0.2641 0.0000 0.09159
## Detection Rate 0.2537 0.06695 0.0887 0.0000 0.08666
## Detection Prevalence 0.2845 0.19354 0.1743 0.1638 0.18386
## Balanced Accuracy 0.7154 0.68302 0.6098 NA 0.91960
## Accuracy
## 0.4960068
## [1] "Sample Error = 0.503993203058624"
# Fit a random forest on the same training split, using the same 10-fold
# cross-validation settings as the classification tree.
fit_rf <- train(
  classe ~ .,
  data = train,
  method = "rf",
  trControl = Path
)
# Print the random-forest fit. The previous code printed `fit_rpart` here,
# repeating the CART summary instead of showing this model's resampling
# results. Re-knit the report to refresh the rendered output that follows.
print(fit_rf, digits = 4)
## CART
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 12364, 12362, 12366, 12362, 12363, 12363, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.03672 0.5037 0.35188
## 0.05971 0.4277 0.22899
## 0.11637 0.3245 0.06142
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03672.
# Apply the random forest to the validation set.
predict_rf <- predict(fit_rf, valid)

# Predictions go in `data`, true labels in `reference`. The previous call
# had them swapped, so the table was transposed and the per-class statistics
# were computed against the predictions. Overall accuracy and kappa are
# unaffected by the swap.
# Re-knit the report to refresh the rendered output that follows.
conf_rf <- confusionMatrix(
  data = predict_rf,
  reference = valid$classe
)
print(conf_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1674 0 0 0 0
## B 7 1129 3 0 0
## C 0 11 1013 2 0
## D 0 0 12 951 1
## E 0 0 0 4 1078
##
## Overall Statistics
##
## Accuracy : 0.9932
## 95% CI : (0.9908, 0.9951)
## No Information Rate : 0.2856
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9914
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9958 0.9904 0.9854 0.9937 0.9991
## Specificity 1.0000 0.9979 0.9973 0.9974 0.9992
## Pos Pred Value 1.0000 0.9912 0.9873 0.9865 0.9963
## Neg Pred Value 0.9983 0.9977 0.9969 0.9988 0.9998
## Prevalence 0.2856 0.1937 0.1747 0.1626 0.1833
## Detection Rate 0.2845 0.1918 0.1721 0.1616 0.1832
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9979 0.9941 0.9914 0.9955 0.9991
## Accuracy
## 0.9932031
## [1] "Sample Error = 0.00679694137638065"
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
## [1] "2017-05-01 23:26:39 HKT"