
Data Processing

Import the data

We first load the R packages needed for the analysis and then read in the training and testing data sets, which were downloaded from the course URLs.

# load the required packages
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(repmis)
training <- read.csv("pml-training.csv", na.strings = c("NA", ""))
testing <- read.csv("pml-testing.csv", na.strings = c("NA", ""))
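The read.csv calls above assume the two CSV files are already in the working directory. For completeness, a minimal sketch of the download step that would produce them; the URLs are an assumption here (the standard Coursera Practical Machine Learning links), since this section does not list them:

# download the data files once if not already present
# (assumed URLs: the standard Coursera Practical Machine Learning links)
train_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
if (!file.exists("pml-training.csv")) download.file(train_url, "pml-training.csv")
if (!file.exists("pml-testing.csv")) download.file(test_url, "pml-testing.csv")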

We now delete the columns (predictors) that contain any missing values, from both the training and testing sets.

training <- training[, colSums(is.na(training)) == 0]
testing <- testing[, colSums(is.na(testing)) == 0]

We also remove the first seven columns (row index, user name, timestamps, and window indicators), since these variables have little predictive power for the outcome classe.

trainData <- training[, -c(1:7)]
testData <- testing[, -c(1:7)]
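As a sanity check, both cleaned sets should now have 53 columns: the 52 predictors reported in the model summaries below, plus classe in trainData (the testing set carries a case identifier, assumed here to be problem_id, in place of classe). A minimal sketch:

# verify the cleaned data sets have the expected dimensions
dim(trainData)  # expect 53 columns: 52 predictors + classe
dim(testData)   # expect 53 columns: 52 predictors + the case identifier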

To estimate the out-of-sample error, we split the cleaned training set trainData into a pure training set (train, 70%) for fitting the models and a validation set (valid, 30%) on which that error is computed.

set.seed(7826) 
inTrain <- createDataPartition(trainData$classe, p = 0.7, list = FALSE)
train <- trainData[inTrain, ]
valid <- trainData[-inTrain, ]
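createDataPartition samples within each level of classe, so the 70/30 split should preserve the class proportions; this can be checked directly (a sketch):

# compare class proportions across the two splits
round(prop.table(table(train$classe)), 3)
round(prop.table(table(valid$classe)), 3)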

Classification trees

We first fit a classification tree with rpart, using 5-fold cross-validation to tune the model; this is less computationally expensive than caret's default bootstrap resampling.

control <- trainControl(method = "cv", number = 5)
fit_rpart <- train(classe ~ ., data = train, method = "rpart", trControl = control)
print(fit_rpart, digits = 4)
## CART 
## 
## 13737 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10989, 10989, 10989, 10991, 10990 
## Resampling results across tuning parameters:
## 
##   cp       Accuracy  Kappa  
##   0.03723  0.5181    0.37581
##   0.05954  0.4154    0.20866
##   0.11423  0.3488    0.09842
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.03723.
fancyRpartPlot(fit_rpart$finalModel)

# predict outcomes using validation set
predict_rpart <- predict(fit_rpart, valid)
# Show prediction result
(conf_rpart <- confusionMatrix(valid$classe, predict_rpart))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1544   21  107    0    2
##          B  492  391  256    0    0
##          C  474   38  514    0    0
##          D  436  175  353    0    0
##          E  155  138  293    0  496
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5004          
##                  95% CI : (0.4876, 0.5133)
##     No Information Rate : 0.5269          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3464          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.4979  0.51245  0.33749       NA  0.99598
## Specificity            0.9533  0.85396  0.88262   0.8362  0.89122
## Pos Pred Value         0.9223  0.34328  0.50097       NA  0.45841
## Neg Pred Value         0.6303  0.92162  0.79234       NA  0.99958
## Prevalence             0.5269  0.12965  0.25879   0.0000  0.08462
## Detection Rate         0.2624  0.06644  0.08734   0.0000  0.08428
## Detection Prevalence   0.2845  0.19354  0.17434   0.1638  0.18386
## Balanced Accuracy      0.7256  0.68321  0.61006       NA  0.94360
(accuracy_rpart <- conf_rpart$overall[1])
##  Accuracy 
## 0.5004248
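The validation-set accuracy is therefore only about 0.50, so the estimated out-of-sample error for the classification tree is roughly 0.50. A minimal sketch of that calculation:

# estimated out-of-sample error = 1 - validation-set accuracy
(err_rpart <- 1 - as.numeric(accuracy_rpart))
# with the accuracy above: 1 - 0.5004 = 0.4996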
Random forests

Since the classification tree does not perform well, we try the random forest method instead.

fit_rf <- train(classe ~ ., data = train, method = "rf", trControl = control)
print(fit_rf, digits = 4)
## Random Forest 
## 
## 13737 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10991, 10990, 10988, 10990, 10989 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy  Kappa 
##    2    0.9901    0.9875
##   27    0.9918    0.9896
##   52    0.9865    0.9830
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 27.
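Before evaluating on the validation set, it can be instructive to see which predictors the forest relies on; caret's varImp works directly on the fitted train object (a sketch, not part of the original analysis):

# rank predictors by importance in the fitted random forest
imp_rf <- varImp(fit_rf)
plot(imp_rf, top = 20)  # plot the 20 most important predictors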
# predict outcomes using validation set
predict_rf <- predict(fit_rf, valid)
# Show prediction result
(conf_rf <- confusionMatrix(valid$classe, predict_rf))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1669    3    0    0    2
##          B    8 1129    2    0    0
##          C    0    2 1016    8    0
##          D    0    0   16  944    4
##          E    2    2    1    3 1074
## 
## Overall Statistics
##                                           
##                Accuracy : 0.991           
##                  95% CI : (0.9882, 0.9932)
##     No Information Rate : 0.2853          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9886          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9940   0.9938   0.9816   0.9885   0.9944
## Specificity            0.9988   0.9979   0.9979   0.9959   0.9983
## Pos Pred Value         0.9970   0.9912   0.9903   0.9793   0.9926
## Neg Pred Value         0.9976   0.9985   0.9961   0.9978   0.9988
## Prevalence             0.2853   0.1930   0.1759   0.1623   0.1835
## Detection Rate         0.2836   0.1918   0.1726   0.1604   0.1825
## Detection Prevalence   0.2845   0.1935   0.1743   0.1638   0.1839
## Balanced Accuracy      0.9964   0.9959   0.9898   0.9922   0.9964
(accuracy_rf <- conf_rf$overall[1])
##  Accuracy 
## 0.9909941
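So the estimated out-of-sample error for the random forest is about 0.009, computed the same way:

# estimated out-of-sample error = 1 - validation-set accuracy
(err_rf <- 1 - as.numeric(accuracy_rf))
# with the accuracy above: 1 - 0.9910 = 0.0090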
Prediction on Testing Set

We now use the random forest model to predict the outcome variable classe for the 20 cases in the testing set.

(predict(fit_rf, testData))
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
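For readability, the 20 predictions can be paired with their case identifiers; this assumes the identifier column in pml-testing.csv is named problem_id and survived the cleaning steps above (a sketch):

# pair each test case with its predicted class
data.frame(problem_id = testData$problem_id, predicted = predict(fit_rf, testData))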