Executive Summary

In this project, our goal is to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants to evaluate how well a person performs a particular activity. First, I will extract useful features from the training and testing datasets. Then I will use the training dataset to train several models and evaluate the performance of each one. Finally, I will choose the best model to make predictions on the testing dataset.

Download and read both training and testing datasets

fileUrl1 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
fileUrl2 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
# download datasets
# Both files are fetched: the training file is read below, so downloading only
# the testing file would make a fresh run fail with "cannot open file".
download.file(fileUrl1, destfile = "pml-training.csv", method = "curl")
download.file(fileUrl2, destfile = "pml-testing.csv", method = "curl")
dataDownloaded <- date()
# read datasets
# "#DIV/0!" and empty strings are spreadsheet artefacts and are treated as NA.
# stringsAsFactors = TRUE is spelled out because the default changed to FALSE
# in R 4.0; the classifiers below need `classe` to be a factor.
train <- read.csv("pml-training.csv", na.strings = c("NA", "#DIV/0!", ""),
                  stringsAsFactors = TRUE)
validate <- read.csv("pml-testing.csv", na.strings = c("NA", "#DIV/0!", ""),
                     stringsAsFactors = TRUE)

Data cleaning and processing

1. Check the dimensions of train and validate datasets

# Report (rows, columns) of each dataset as read, before any cleaning.
dim(train)
## [1] 19622   160
dim(validate)
## [1]  20 160

2. Remove all columns that contain NA values

Here we use “colSums(is.na()) == 0” to check whether each column is free of NA values.

# Choose the columns to keep from the training data only, then apply that same
# selection to both datasets. Filtering each dataset independently could leave
# them with different feature sets if their NA patterns ever differed.
# The logical mask is applied by position, so both must share one column layout.
stopifnot(ncol(train) == ncol(validate))
complete_cols <- colSums(is.na(train)) == 0
train <- train[, complete_cols]
validate <- validate[, complete_cols]

Then we check the dimensions of the train and validate datasets again.

# Re-check (rows, columns) after the NA-containing columns were removed.
dim(train)
## [1] 19622    60
dim(validate)
## [1] 20 60

3. Check the feature names and remove unrelated features

# List the remaining column names so unrelated (non-sensor) fields can be spotted.
names(train)
##  [1] "X"                    "user_name"            "raw_timestamp_part_1"
##  [4] "raw_timestamp_part_2" "cvtd_timestamp"       "new_window"          
##  [7] "num_window"           "roll_belt"            "pitch_belt"          
## [10] "yaw_belt"             "total_accel_belt"     "gyros_belt_x"        
## [13] "gyros_belt_y"         "gyros_belt_z"         "accel_belt_x"        
## [16] "accel_belt_y"         "accel_belt_z"         "magnet_belt_x"       
## [19] "magnet_belt_y"        "magnet_belt_z"        "roll_arm"            
## [22] "pitch_arm"            "yaw_arm"              "total_accel_arm"     
## [25] "gyros_arm_x"          "gyros_arm_y"          "gyros_arm_z"         
## [28] "accel_arm_x"          "accel_arm_y"          "accel_arm_z"         
## [31] "magnet_arm_x"         "magnet_arm_y"         "magnet_arm_z"        
## [34] "roll_dumbbell"        "pitch_dumbbell"       "yaw_dumbbell"        
## [37] "total_accel_dumbbell" "gyros_dumbbell_x"     "gyros_dumbbell_y"    
## [40] "gyros_dumbbell_z"     "accel_dumbbell_x"     "accel_dumbbell_y"    
## [43] "accel_dumbbell_z"     "magnet_dumbbell_x"    "magnet_dumbbell_y"   
## [46] "magnet_dumbbell_z"    "roll_forearm"         "pitch_forearm"       
## [49] "yaw_forearm"          "total_accel_forearm"  "gyros_forearm_x"     
## [52] "gyros_forearm_y"      "gyros_forearm_z"      "accel_forearm_x"     
## [55] "accel_forearm_y"      "accel_forearm_z"      "magnet_forearm_x"    
## [58] "magnet_forearm_y"     "magnet_forearm_z"     "classe"

Since our analysis does not rely on time dependence, we decided to remove the first 7 features, which hold the row index, user name, timestamps, and window information.

# Columns 1-7 (row index, user name, timestamps, window markers) are
# bookkeeping fields rather than sensor readings; drop them from both datasets.
metadata_cols <- seq_len(7L)
train <- train[, -metadata_cols]
validate <- validate[, -metadata_cols]

Data Partitioning

Here I decided to separate the train dataset into two parts: 60% will be used to train our models, while the remaining 40% will be used to evaluate them.

library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
# Seed the RNG before the random split: createDataPartition() samples rows,
# so without a seed the 60/40 partition (and every accuracy figure computed
# from it) changes on each run.
set.seed(123)
inTrain <- createDataPartition(y = train$classe, p = 0.6, list = FALSE)
# Rows selected by inTrain fit the models; the remaining rows evaluate them.
training <- train[inTrain, ]
testing <- train[-inTrain, ]

Use three machine learning strategies to train models

1. Classification Trees

library(rpart)
set.seed(123)
# Fit a single classification tree by calling rpart directly.
# An alternative via caret, kept for reference:
#   train(classe ~ ., method = "rpart", data = training,
#         trControl = trainControl(method = "cv", number = 10),
#         tuneGrid = data.frame(cp = 0.01))
modFit_rpart <- rpart(
  formula = classe ~ .,
  data = training,
  method = "class"
)
# Predict hard class labels for the held-out rows and tabulate the errors.
prediction_rpart <- predict(modFit_rpart, newdata = testing, type = "class")
confusionMatrix(data = prediction_rpart, reference = testing$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1951  268   17   58   21
##          B   70  835   84   93  115
##          C   59  212 1101  216  191
##          D  113  112   86  809   84
##          E   39   91   80  110 1031
## 
## Overall Statistics
##                                         
##                Accuracy : 0.7299        
##                  95% CI : (0.72, 0.7397)
##     No Information Rate : 0.2845        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.6582        
##  Mcnemar's Test P-Value : < 2.2e-16     
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.8741   0.5501   0.8048   0.6291   0.7150
## Specificity            0.9352   0.9428   0.8953   0.9398   0.9500
## Pos Pred Value         0.8428   0.6976   0.6189   0.6719   0.7631
## Neg Pred Value         0.9492   0.8973   0.9560   0.9282   0.9367
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2487   0.1064   0.1403   0.1031   0.1314
## Detection Prevalence   0.2951   0.1526   0.2267   0.1535   0.1722
## Balanced Accuracy      0.9046   0.7464   0.8501   0.7844   0.8325

We can see that the accuracy of this Classification Tree is 0.73.

2. Random Forest

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
set.seed(123)
# Alternative via caret (this one really does run 10-fold cross-validation):
#modFit_rf <- train(classe ~ ., method = "rf", data = training, prox = TRUE,
#                   trControl=trainControl(method="cv",number = 10))
# Fit the forest with randomForest() directly. The former `method = "rf"` and
# `trControl = trainControl(...)` arguments belong to caret::train(), not to
# randomForest(); they had no effect on the fit, so no cross-validation was
# performed. They are removed so the call states what actually runs.
# `importance = TRUE` keeps the variable-importance measures on the model.
modFit_rf <- randomForest(classe ~ ., data = training, importance = TRUE)
# Evaluate on the held-out rows.
prediction_rf <- predict(modFit_rf, newdata = testing)
confusionMatrix(prediction_rf, testing$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 2232    5    0    0    0
##          B    0 1510   16    0    0
##          C    0    3 1349   20    2
##          D    0    0    3 1265    2
##          E    0    0    0    1 1438
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9934         
##                  95% CI : (0.9913, 0.995)
##     No Information Rate : 0.2845         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9916         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9947   0.9861   0.9837   0.9972
## Specificity            0.9991   0.9975   0.9961   0.9992   0.9998
## Pos Pred Value         0.9978   0.9895   0.9818   0.9961   0.9993
## Neg Pred Value         1.0000   0.9987   0.9971   0.9968   0.9994
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2845   0.1925   0.1719   0.1612   0.1833
## Detection Prevalence   0.2851   0.1945   0.1751   0.1619   0.1834
## Balanced Accuracy      0.9996   0.9961   0.9911   0.9915   0.9985

We can see that the accuracy of the Random Forest is 0.99.

3. Boosting:

library(gbm)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
set.seed(123)
# Gradient boosting through caret, resampled with 10-fold cross-validation;
# verbose = FALSE silences gbm's per-iteration log.
modFit_boosting <- train(
  classe ~ .,
  data = training,
  method = "gbm",
  trControl = trainControl(method = "cv", number = 10),
  verbose = FALSE
)
# Evaluate on the held-out rows.
prediction_boosting <- predict(modFit_boosting, newdata = testing)
confusionMatrix(data = prediction_boosting, reference = testing$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 2197   50    0    0    2
##          B   18 1416   58    2   11
##          C   10   44 1291   44   15
##          D    4    2   15 1230   12
##          E    3    6    4   10 1402
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9605          
##                  95% CI : (0.9559, 0.9647)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.95            
##  Mcnemar's Test P-Value : 6.128e-08       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9843   0.9328   0.9437   0.9565   0.9723
## Specificity            0.9907   0.9859   0.9826   0.9950   0.9964
## Pos Pred Value         0.9769   0.9409   0.9195   0.9739   0.9839
## Neg Pred Value         0.9937   0.9839   0.9880   0.9915   0.9938
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2800   0.1805   0.1645   0.1568   0.1787
## Detection Prevalence   0.2866   0.1918   0.1789   0.1610   0.1816
## Balanced Accuracy      0.9875   0.9594   0.9631   0.9757   0.9843

We can see that the accuracy of the boosting model is 0.96.

Make predictions

Here we choose Random Forest as our model because it has the highest accuracy. The predictions of “classe” for the validate dataset are shown below.

# Apply the random forest model to the validate dataset and display the
# predicted class for each case.
prediction_validate <- predict(object = modFit_rf, newdata = validate)
prediction_validate
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E