Use data from accelerometers on the belt, forearm, arm, and dumbell of 6 participants to predict which activity they were performing. They were asked to perform barbell lifts correctly and incorrectly in 5 different ways.
Training data for this project is available here
Test data is available here
Data for this project comes from the source: http://groupware.les.inf.puc-rio.br/har.
More information is available from the website here: Human Activity Recognition (see the section on the Weight Lifting Exercise Dataset).
Download traning data and testing data into working directory and load data.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
training <- read.csv("pml-training.csv")
testing <- read.csv("pml-testing.csv")
dim(training)
## [1] 19622 160
dim(testing)
## [1] 20 160
The training dataset should have 19622 rows and 160 columns.
Select most useful features.
names<-names(training)
features <- names[grepl("^roll|^pitch|^yaw|^gyros|^accel|^magnet.*",names)]
training_feature <- training[,c(features,"classe")]
summary(training_feature$classe)
## A B C D E
## 5580 3797 3422 3216 3607
baseline <- summary(training_feature$classe)[1]/nrow(training_feature)
Our traning data contains 5 classes. Our model accuracy should be larger than 0.2843747, which is accuracy of predicting the majority class all the time.
Train gbm and rf model.
set.seed(123)
inTrain <- createDataPartition(training_feature$classe,p=0.8,list = FALSE)
train <- training_feature[inTrain,]
validation <- training_feature[-inTrain,]
tc <- trainControl("cv",10)
if(file.exists("gbmmodel.rds")){
gbmmodel <- readRDS("gbmmodel.rds")
}else{
gbmmodel <- train(classe~.,method="gbm",trControl=tc,data=train)
saveRDS(gbmmodel,"gbmmodel.rds")
}
if(file.exists("rfmodel.rds")){
rfmodel <- readRDS("rfmodel.rds")
}else{
rfmodel <- train(classe~.,method="rf",trControl=tc,data=train)
saveRDS(rfmodel,"rfmodel.rds")
}
Combine two models to train combine_model.
gbm_est <- predict(gbmmodel,train)
rf_est <- predict(rfmodel,train)
if(file.exists("combine_model.rds")){
combine_model <- readRDS("combine_model.rds")
}else{
ests <- data.frame(gbm_est=gbm_est,rf_est=rf_est,classe=train$classe)
combine_model <- train(classe~.,method="rf",data=ests)
saveRDS(combine_model,"combine_model.rds")
}
Evaluation on validation data.
summary(validation)
## roll_belt pitch_belt yaw_belt
## Min. :-28.60 Min. :-53.0000 Min. :-179.000
## 1st Qu.: 1.09 1st Qu.: 1.6650 1st Qu.: -88.300
## Median :115.00 Median : 5.3200 Median : -11.500
## Mean : 64.93 Mean : 0.2182 Mean : -9.943
## 3rd Qu.:123.00 3rd Qu.: 15.8000 3rd Qu.: 13.900
## Max. :162.00 Max. : 60.3000 Max. : 179.000
## gyros_belt_x gyros_belt_y gyros_belt_z accel_belt_x
## Min. :-1.040000 Min. :-0.53000 Min. :-1.1500 Min. :-83.00
## 1st Qu.:-0.050000 1st Qu.: 0.00000 1st Qu.:-0.2000 1st Qu.:-21.00
## Median : 0.030000 Median : 0.02000 Median :-0.1100 Median :-14.00
## Mean :-0.004963 Mean : 0.03952 Mean :-0.1306 Mean : -5.42
## 3rd Qu.: 0.110000 3rd Qu.: 0.11000 3rd Qu.:-0.0200 3rd Qu.: -5.00
## Max. : 2.020000 Max. : 0.51000 Max. : 1.3300 Max. : 76.00
## accel_belt_y accel_belt_z magnet_belt_x magnet_belt_y
## Min. :-69.00 Min. :-263.0 Min. :-49.00 Min. :376.0
## 1st Qu.: 3.00 1st Qu.:-162.0 1st Qu.: 9.00 1st Qu.:582.0
## Median : 38.00 Median :-153.0 Median : 35.00 Median :601.0
## Mean : 30.26 Mean : -73.2 Mean : 56.68 Mean :594.3
## 3rd Qu.: 61.00 3rd Qu.: 28.0 3rd Qu.: 60.00 3rd Qu.:610.0
## Max. : 90.00 Max. : 105.0 Max. :440.00 Max. :669.0
## magnet_belt_z roll_arm pitch_arm yaw_arm
## Min. :-607.0 Min. :-178.00 Min. :-88.800 Min. :-180.000
## 1st Qu.:-374.0 1st Qu.: -31.55 1st Qu.:-25.350 1st Qu.: -42.800
## Median :-319.0 Median : 0.00 Median : 0.000 Median : 0.000
## Mean :-343.3 Mean : 17.05 Mean : -4.414 Mean : -1.271
## 3rd Qu.:-306.0 3rd Qu.: 77.15 3rd Qu.: 10.900 3rd Qu.: 43.200
## Max. : 287.0 Max. : 178.00 Max. : 84.600 Max. : 179.000
## gyros_arm_x gyros_arm_y gyros_arm_z accel_arm_x
## Min. :-5.94000 Min. :-3.400 Min. :-2.330 Min. :-371.00
## 1st Qu.:-1.29000 1st Qu.:-0.800 1st Qu.:-0.070 1st Qu.:-242.00
## Median : 0.08000 Median :-0.260 Median : 0.260 Median : -38.00
## Mean : 0.07679 Mean :-0.278 Mean : 0.282 Mean : -58.22
## 3rd Qu.: 1.57000 3rd Qu.: 0.130 3rd Qu.: 0.720 3rd Qu.: 86.00
## Max. : 4.87000 Max. : 2.790 Max. : 3.020 Max. : 435.00
## accel_arm_y accel_arm_z magnet_arm_x magnet_arm_y
## Min. :-246.00 Min. :-612.00 Min. :-570.0 Min. :-372.0
## 1st Qu.: -55.00 1st Qu.:-143.50 1st Qu.:-292.0 1st Qu.: -10.0
## Median : 10.00 Median : -47.00 Median : 303.0 Median : 198.0
## Mean : 31.36 Mean : -71.69 Mean : 199.7 Mean : 154.9
## 3rd Qu.: 136.00 3rd Qu.: 24.00 3rd Qu.: 642.0 3rd Qu.: 322.0
## Max. : 299.00 Max. : 221.00 Max. : 779.0 Max. : 583.0
## magnet_arm_z roll_dumbbell pitch_dumbbell yaw_dumbbell
## Min. :-597.0 Min. :-153.71 Min. :-127.23 Min. :-140.076
## 1st Qu.: 121.0 1st Qu.: -16.03 1st Qu.: -39.98 1st Qu.: -77.654
## Median : 441.0 Median : 49.15 Median : -20.98 Median : -8.103
## Mean : 303.3 Mean : 24.96 Mean : -10.81 Mean : 1.134
## 3rd Qu.: 542.0 3rd Qu.: 68.45 3rd Qu.: 17.53 3rd Qu.: 78.584
## Max. : 687.0 Max. : 148.80 Max. : 129.52 Max. : 151.044
## gyros_dumbbell_x gyros_dumbbell_y gyros_dumbbell_z accel_dumbbell_x
## Min. :-1.860 Min. :-2.07000 Min. :-1.7400 Min. :-237.00
## 1st Qu.:-0.030 1st Qu.:-0.14000 1st Qu.:-0.3100 1st Qu.: -51.00
## Median : 0.130 Median : 0.03000 Median :-0.1300 Median : -8.00
## Mean : 0.173 Mean : 0.04342 Mean :-0.1491 Mean : -28.96
## 3rd Qu.: 0.370 3rd Qu.: 0.21000 3rd Qu.: 0.0300 3rd Qu.: 10.00
## Max. : 2.140 Max. : 2.73000 Max. : 1.6100 Max. : 234.00
## accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x magnet_dumbbell_y
## Min. :-182.00 Min. :-273.00 Min. :-639.0 Min. :-736.0
## 1st Qu.: -8.00 1st Qu.:-141.00 1st Qu.:-534.0 1st Qu.: 232.0
## Median : 44.00 Median : -2.00 Median :-478.0 Median : 312.0
## Mean : 54.02 Mean : -38.79 Mean :-329.9 Mean : 223.1
## 3rd Qu.: 113.00 3rd Qu.: 37.50 3rd Qu.:-313.0 3rd Qu.: 391.5
## Max. : 299.00 Max. : 318.00 Max. : 583.0 Max. : 633.0
## magnet_dumbbell_z roll_forearm pitch_forearm yaw_forearm
## Min. :-245.00 Min. :-180.00 Min. :-71.60 Min. :-180.0
## 1st Qu.: -47.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: -70.3
## Median : 15.00 Median : 19.00 Median : 8.24 Median : 0.0
## Mean : 45.63 Mean : 34.14 Mean : 10.48 Mean : 17.5
## 3rd Qu.: 97.00 3rd Qu.: 140.00 3rd Qu.: 27.80 3rd Qu.: 108.0
## Max. : 443.00 Max. : 180.00 Max. : 87.20 Max. : 180.0
## gyros_forearm_x gyros_forearm_y gyros_forearm_z accel_forearm_x
## Min. :-2.990 Min. :-6.62000 Min. :-4.2800 Min. :-468.00
## 1st Qu.:-0.220 1st Qu.:-1.45000 1st Qu.:-0.1800 1st Qu.:-179.00
## Median : 0.050 Median : 0.03000 Median : 0.0700 Median : -56.00
## Mean : 0.153 Mean : 0.08567 Mean : 0.1438 Mean : -61.67
## 3rd Qu.: 0.550 3rd Qu.: 1.65000 3rd Qu.: 0.4900 3rd Qu.: 74.00
## Max. : 3.520 Max. : 6.13000 Max. : 3.2800 Max. : 477.00
## accel_forearm_y accel_forearm_z magnet_forearm_x magnet_forearm_y
## Min. :-496.0 Min. :-371.00 Min. :-1280.0 Min. :-882.0
## 1st Qu.: 62.0 1st Qu.:-181.00 1st Qu.: -614.0 1st Qu.: 33.5
## Median : 200.0 Median : -37.00 Median : -377.0 Median : 604.0
## Mean : 166.7 Mean : -55.19 Mean : -314.8 Mean : 392.2
## 3rd Qu.: 315.0 3rd Qu.: 25.00 3rd Qu.: -78.5 3rd Qu.: 740.0
## Max. : 569.0 Max. : 277.00 Max. : 661.0 Max. :1480.0
## magnet_forearm_z classe
## Min. :-960.0 A:1116
## 1st Qu.: 219.5 B: 759
## Median : 520.0 C: 684
## Mean : 400.9 D: 643
## 3rd Qu.: 656.0 E: 721
## Max. :1030.0
gbm_est <- predict(gbmmodel,validation)
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
rf_est <- predict(rfmodel,validation)
ests <- data.frame(gbm_est=gbm_est,rf_est=rf_est)
combine_est <- predict(combine_model,ests)
confusionMatrix(gbm_est,validation$classe)$overall["Accuracy"]
## Accuracy
## 0.9638032
confusionMatrix(rf_est,validation$classe)$overall["Accuracy"]
## Accuracy
## 0.994392
confusionMatrix(combine_est,validation$classe)$overall["Accuracy"]
## Accuracy
## 0.994392
On validation data, combined model performs equally well as rf model. Choose rf model for the convenience of data preprocessing.
varImp(rfmodel$finalModel)
## Overall
## roll_belt 643.1432
## pitch_belt 420.7276
## yaw_belt 517.2509
## gyros_belt_x 147.5908
## gyros_belt_y 146.3473
## gyros_belt_z 255.8693
## accel_belt_x 169.8875
## accel_belt_y 160.0943
## accel_belt_z 326.7118
## magnet_belt_x 227.0831
## magnet_belt_y 317.5985
## magnet_belt_z 334.2471
## roll_arm 278.0508
## pitch_arm 198.0575
## yaw_arm 223.5487
## gyros_arm_x 184.8183
## gyros_arm_y 176.8710
## gyros_arm_z 103.8034
## accel_arm_x 250.3441
## accel_arm_y 192.4414
## accel_arm_z 179.9224
## magnet_arm_x 242.8204
## magnet_arm_y 249.7541
## magnet_arm_z 198.6414
## roll_dumbbell 322.3415
## pitch_dumbbell 212.0130
## yaw_dumbbell 252.6936
## gyros_dumbbell_x 157.7407
## gyros_dumbbell_y 251.7384
## gyros_dumbbell_z 120.5823
## accel_dumbbell_x 256.4273
## accel_dumbbell_y 350.9134
## accel_dumbbell_z 308.1369
## magnet_dumbbell_x 370.6638
## magnet_dumbbell_y 419.8749
## magnet_dumbbell_z 448.1362
## roll_forearm 364.8781
## pitch_forearm 443.4977
## yaw_forearm 188.9111
## gyros_forearm_x 124.6428
## gyros_forearm_y 164.3684
## gyros_forearm_z 122.4945
## accel_forearm_x 266.6877
## accel_forearm_y 183.4110
## accel_forearm_z 223.6186
## magnet_forearm_x 226.1433
## magnet_forearm_y 235.8194
## magnet_forearm_z 239.6934
Belt matters most!
test <- testing[,c(features)]
est_rf_test <- predict(rfmodel,test)
summary(est_rf_test)
## A B C D E
## 7 8 1 1 3