Data Processing

Import the data

We first load the R packages needed for the analysis and then read in the training and testing data sets (downloaded from the given URLs).
# load the required packages
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
library(repmis)
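The CSV files are read from the working directory. If they are not already there, a minimal sketch for fetching them first (the URLs below are placeholders standing in for "the given URLs"; this step is not part of the original code):
# hedged sketch: download the files first if they are not present locally;
# the URLs are placeholders, substitute the ones given in the assignment
train_url <- "https://example.com/pml-training.csv"
test_url <- "https://example.com/pml-testing.csv"
if (!file.exists("pml-training.csv")) download.file(train_url, "pml-training.csv")
if (!file.exists("pml-testing.csv")) download.file(test_url, "pml-testing.csv")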
training <- read.csv("pml-training.csv", na.strings = c("NA", ""))
testing <- read.csv("pml-testing.csv", na.strings = c("NA", ""))
# We now delete the columns (predictors) that contain any missing values from both the training and testing sets.
training <- training[, colSums(is.na(training)) == 0]
testing <- testing[, colSums(is.na(testing)) == 0]
# We also remove the first seven columns, since these variables have little predictive power for the outcome classe.
trainData <- training[, -c(1:7)]
testData <- testing[, -c(1:7)]
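As a quick sanity check (not part of the original output), we can confirm the dimensions of the cleaned data sets; trainData should now contain the 52 predictors reported by caret below plus the outcome classe.
# check dimensions of the cleaned data sets (output omitted)
dim(trainData)
dim(testData)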
# To estimate the out-of-sample error, we split the cleaned training set trainData into a training set (train, 70%) for building the models and a validation set (valid, 30%) for computing the out-of-sample error.
set.seed(7826)
inTrain <- createDataPartition(trainData$classe, p = 0.7, list = FALSE)
train <- trainData[inTrain, ]
valid <- trainData[-inTrain, ]
# use 5-fold cross-validation to tune and evaluate the models
control <- trainControl(method = "cv", number = 5)
fit_rpart <- train(classe ~ ., data = train, method = "rpart", trControl = control)
print(fit_rpart, digits = 4)
## CART
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10989, 10989, 10989, 10991, 10990
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.03723 0.5181 0.37581
## 0.05954 0.4154 0.20866
## 0.11423 0.3488 0.09842
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03723.
fancyRpartPlot(fit_rpart$finalModel)
# predict outcomes using validation set
predict_rpart <- predict(fit_rpart, valid)
# Show prediction result
(conf_rpart <- confusionMatrix(valid$classe, predict_rpart))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1544 21 107 0 2
## B 492 391 256 0 0
## C 474 38 514 0 0
## D 436 175 353 0 0
## E 155 138 293 0 496
##
## Overall Statistics
##
## Accuracy : 0.5004
## 95% CI : (0.4876, 0.5133)
## No Information Rate : 0.5269
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3464
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.4979 0.51245 0.33749 NA 0.99598
## Specificity 0.9533 0.85396 0.88262 0.8362 0.89122
## Pos Pred Value 0.9223 0.34328 0.50097 NA 0.45841
## Neg Pred Value 0.6303 0.92162 0.79234 NA 0.99958
## Prevalence 0.5269 0.12965 0.25879 0.0000 0.08462
## Detection Rate 0.2624 0.06644 0.08734 0.0000 0.08428
## Detection Prevalence 0.2845 0.19354 0.17434 0.1638 0.18386
## Balanced Accuracy 0.7256 0.68321 0.61006 NA 0.94360
(accuracy_rpart <- conf_rpart$overall[1])
## Accuracy
## 0.5004248
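Since the estimated out-of-sample error is one minus the validation accuracy, the classification tree misclassifies about half of the validation samples. A one-line calculation (not part of the original output):
# estimated out-of-sample error for the classification tree (about 0.4996)
(1 - accuracy_rpart)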
# Random forests
# Since the classification tree does not perform well, we try a random forest instead.
fit_rf <- train(classe ~ ., data = train, method = "rf", trControl = control)
print(fit_rf, digits = 4)
## Random Forest
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10991, 10990, 10988, 10990, 10989
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9901 0.9875
## 27 0.9918 0.9896
## 52 0.9865 0.9830
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 27.
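Not part of the original output, but caret's varImp() gives a quick view of which predictors the selected forest (mtry = 27) relies on most:
# inspect variable importance for the fitted random forest
varImp(fit_rf)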
# predict outcomes using validation set
predict_rf <- predict(fit_rf, valid)
# Show prediction result
(conf_rf <- confusionMatrix(valid$classe, predict_rf))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1669 3 0 0 2
## B 8 1129 2 0 0
## C 0 2 1016 8 0
## D 0 0 16 944 4
## E 2 2 1 3 1074
##
## Overall Statistics
##
## Accuracy : 0.991
## 95% CI : (0.9882, 0.9932)
## No Information Rate : 0.2853
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9886
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9940 0.9938 0.9816 0.9885 0.9944
## Specificity 0.9988 0.9979 0.9979 0.9959 0.9983
## Pos Pred Value 0.9970 0.9912 0.9903 0.9793 0.9926
## Neg Pred Value 0.9976 0.9985 0.9961 0.9978 0.9988
## Prevalence 0.2853 0.1930 0.1759 0.1623 0.1835
## Detection Rate 0.2836 0.1918 0.1726 0.1604 0.1825
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9964 0.9959 0.9898 0.9922 0.9964
(accuracy_rf <- conf_rf$overall[1])
## Accuracy
## 0.9909941
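The corresponding out-of-sample error estimate for the random forest is therefore about 0.009 (one minus the validation accuracy; not part of the original output):
# estimated out-of-sample error for the random forest (about 0.009)
(1 - accuracy_rf)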
# Prediction on Testing Set
# We now use the random forest model to predict the outcome variable classe for the testing set.
(predict(fit_rf, testData))
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
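If the 20 predictions need to be saved for submission, a hedged sketch (the one-file-per-prediction naming is an assumption, not something specified in this document):
# hedged sketch: write each predicted class to its own text file
pred_test <- predict(fit_rf, testData)
for (i in seq_along(pred_test)) {
  writeLines(as.character(pred_test[i]), paste0("problem_id_", i, ".txt"))
}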