Setting up a few things…
setwd("C:/Users/An/Documents/Study/Coursera/Practical Machine Learning/Course Project/")
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
## Loading required package: RGtk2
## Rattle: A free graphical interface for data mining with R.
## Version 3.5.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
set.seed(60640)
Read and partition the data
training <- read.csv(url("http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"), na.strings=c("NA","#DIV/0!",""))
testing <- read.csv(url("http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"), na.strings=c("NA","#DIV/0!",""))
Partition the training data into training1 and validation:
trainingIndex <- createDataPartition(y=training$classe, p=0.6, list=FALSE)
training1 <- training[trainingIndex, ]
validation <- training[-trainingIndex, ]
Let’s see the size of training1 and validation:
dim(training1)
## [1] 11776 160
dim(validation)
## [1] 7846 160
Take a quick look at the training data:
str(training1)
## 'data.frame': 11776 obs. of 160 variables:
## $ X : int 1 3 5 9 10 12 13 16 18 21 ...
## $ user_name : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ raw_timestamp_part_1 : int 1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
## $ raw_timestamp_part_2 : int 788290 820366 196328 484323 484434 528316 560359 644302 732306 876301 ...
## $ cvtd_timestamp : Factor w/ 20 levels "02/12/2011 13:32",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ new_window : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ num_window : int 11 11 12 12 12 12 12 12 12 12 ...
## $ roll_belt : num 1.41 1.42 1.48 1.43 1.45 1.43 1.42 1.48 1.55 1.6 ...
## $ pitch_belt : num 8.07 8.07 8.07 8.16 8.17 8.18 8.2 8.15 8.08 8.1 ...
## $ yaw_belt : num -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
## $ total_accel_belt : int 3 3 3 3 3 3 3 3 3 3 ...
## $ kurtosis_roll_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ kurtosis_picth_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ kurtosis_yaw_belt : logi NA NA NA NA NA NA ...
## $ skewness_roll_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ skewness_roll_belt.1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ skewness_yaw_belt : logi NA NA NA NA NA NA ...
## $ max_roll_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ max_picth_belt : int NA NA NA NA NA NA NA NA NA NA ...
## $ max_yaw_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ min_roll_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ min_pitch_belt : int NA NA NA NA NA NA NA NA NA NA ...
## $ min_yaw_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ amplitude_roll_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ amplitude_pitch_belt : int NA NA NA NA NA NA NA NA NA NA ...
## $ amplitude_yaw_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ var_total_accel_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ avg_roll_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ stddev_roll_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ var_roll_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ avg_pitch_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ stddev_pitch_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ var_pitch_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ avg_yaw_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ stddev_yaw_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ var_yaw_belt : num NA NA NA NA NA NA NA NA NA NA ...
## $ gyros_belt_x : num 0 0 0.02 0.02 0.03 0.02 0.02 0 0 0.02 ...
## $ gyros_belt_y : num 0 0 0.02 0 0 0 0 0 0.02 0 ...
## $ gyros_belt_z : num -0.02 -0.02 -0.02 -0.02 0 -0.02 0 0 0 -0.02 ...
## $ accel_belt_x : int -21 -20 -21 -20 -21 -22 -22 -21 -21 -20 ...
## $ accel_belt_y : int 4 5 2 2 4 2 4 4 5 1 ...
## $ accel_belt_z : int 22 23 24 24 22 23 21 23 21 20 ...
## $ magnet_belt_x : int -3 -2 -6 1 -3 -2 -3 0 1 -10 ...
## $ magnet_belt_y : int 599 600 600 602 609 602 606 592 600 607 ...
## $ magnet_belt_z : int -313 -305 -302 -312 -308 -319 -309 -305 -316 -304 ...
## $ roll_arm : num -128 -128 -128 -128 -128 -128 -128 -129 -129 -129 ...
## $ pitch_arm : num 22.5 22.5 22.1 21.7 21.6 21.5 21.4 21.3 21.2 20.9 ...
## $ yaw_arm : num -161 -161 -161 -161 -161 -161 -161 -161 -161 -161 ...
## $ total_accel_arm : int 34 34 34 34 34 34 34 34 34 34 ...
## $ var_accel_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ avg_roll_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ stddev_roll_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ var_roll_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ avg_pitch_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ stddev_pitch_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ var_pitch_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ avg_yaw_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ stddev_yaw_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ var_yaw_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ gyros_arm_x : num 0 0.02 0 0.02 0.02 0.02 0.02 0.02 0.02 0.03 ...
## $ gyros_arm_y : num 0 -0.02 -0.03 -0.03 -0.03 -0.03 -0.02 0 -0.02 -0.02 ...
## $ gyros_arm_z : num -0.02 -0.02 0 -0.02 -0.02 0 -0.02 -0.03 -0.03 -0.02 ...
## $ accel_arm_x : int -288 -289 -289 -288 -288 -288 -287 -289 -288 -288 ...
## $ accel_arm_y : int 109 110 111 109 110 111 111 109 108 111 ...
## $ accel_arm_z : int -123 -126 -123 -122 -124 -123 -124 -121 -124 -124 ...
## $ magnet_arm_x : int -368 -368 -374 -369 -376 -363 -372 -367 -373 -375 ...
## $ magnet_arm_y : int 337 344 337 341 334 343 338 340 336 337 ...
## $ magnet_arm_z : int 516 513 506 518 516 520 509 509 510 513 ...
## $ kurtosis_roll_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ kurtosis_picth_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ kurtosis_yaw_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ skewness_roll_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ skewness_pitch_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ skewness_yaw_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ max_roll_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ max_picth_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ max_yaw_arm : int NA NA NA NA NA NA NA NA NA NA ...
## $ min_roll_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ min_pitch_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ min_yaw_arm : int NA NA NA NA NA NA NA NA NA NA ...
## $ amplitude_roll_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ amplitude_pitch_arm : num NA NA NA NA NA NA NA NA NA NA ...
## $ amplitude_yaw_arm : int NA NA NA NA NA NA NA NA NA NA ...
## $ roll_dumbbell : num 13.1 12.9 13.4 13.2 13.3 ...
## $ pitch_dumbbell : num -70.5 -70.3 -70.4 -70.4 -70.9 ...
## $ yaw_dumbbell : num -84.9 -85.1 -84.9 -84.9 -84.4 ...
## $ kurtosis_roll_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ kurtosis_picth_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ kurtosis_yaw_dumbbell : logi NA NA NA NA NA NA ...
## $ skewness_roll_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ skewness_pitch_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ skewness_yaw_dumbbell : logi NA NA NA NA NA NA ...
## $ max_roll_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ max_picth_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ max_yaw_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ min_roll_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ min_pitch_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ min_yaw_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## $ amplitude_roll_dumbbell : num NA NA NA NA NA NA NA NA NA NA ...
## [list output truncated]
Select predictors
There is a lot of variables! We will first fit a Classification Decision Tree to see which variables are important as predictors of “classe”. Note that this Classification Tree is not the final model (it may not as accurate as Random Forest). However, this Classication Tree uses much less computing resoures than Random Forest so we use it to give us a glimpse of which predictors are important.
# Fit one Classification Tree with "classe" as target and the rest (except first column, which is ID)
# as predictors.
fit <- rpart(classe ~ ., data=training1[,-1], cp=10^(-6))
# Put the tree's variable.importance into a vector called "predictors"
predictors <- fit$variable.importance
# How many predictors are important?
length(predictors)
## [1] 57
# What are the names and importance of those predictors?
predictors
## cvtd_timestamp raw_timestamp_part_1 roll_belt
## 3913.097455 2919.021031 1513.074747
## yaw_belt pitch_belt num_window
## 1408.769802 1190.141851 1173.211527
## magnet_dumbbell_x magnet_belt_y magnet_dumbbell_z
## 1011.626517 931.207608 906.027972
## pitch_forearm accel_belt_z user_name
## 852.985323 809.378032 765.854734
## accel_belt_x magnet_belt_z magnet_dumbbell_y
## 681.275893 625.109797 607.989768
## total_accel_belt roll_dumbbell magnet_belt_x
## 571.367813 553.202447 535.186796
## magnet_forearm_x accel_dumbbell_y accel_dumbbell_x
## 521.098197 505.826607 503.247721
## accel_dumbbell_z accel_forearm_x total_accel_dumbbell
## 482.118025 399.886643 389.544425
## yaw_dumbbell gyros_belt_y gyros_belt_z
## 362.857252 287.568760 250.752047
## pitch_dumbbell roll_forearm accel_forearm_z
## 195.640802 195.099316 172.227168
## accel_belt_y gyros_dumbbell_y pitch_arm
## 162.259859 132.364340 129.457873
## total_accel_forearm yaw_arm accel_arm_y
## 83.299942 66.289762 66.174565
## accel_forearm_y accel_arm_x magnet_forearm_z
## 58.242084 55.460849 53.867704
## magnet_forearm_y gyros_arm_x gyros_belt_x
## 48.039456 47.330050 44.618884
## magnet_arm_x yaw_forearm magnet_arm_y
## 44.111297 38.701299 37.654517
## roll_arm gyros_arm_y gyros_dumbbell_x
## 28.123059 26.905617 26.416358
## gyros_forearm_y accel_arm_z raw_timestamp_part_2
## 23.861977 19.732437 12.692308
## gyros_forearm_z gyros_forearm_x gyros_dumbbell_z
## 11.634196 10.747008 6.456892
## gyros_arm_z magnet_arm_z total_accel_arm
## 6.004130 4.475286 2.237643
# Put those important predictors' names into a vector
predictors.names <- names(predictors)
From the above Classification Tree, we know that only 57 predictors out of 158 are important and we have their names! Let’s revise the training and validation datasets so that they only have those 57 predictors and the target (classe), and testing dataset so that it only has 57 predictors.
keep <- c(predictors.names,"classe")
training1 <- training1[keep]
validation <- validation[keep]
testing <- testing[predictors.names]
# Double check the new dimensions of training1, validation and testing data
dim(training1)
## [1] 11776 58
dim(validation)
## [1] 7846 58
dim(testing)
## [1] 20 57
In order to ensure that we can use our trained model to predict the testing data, we need to coerce all predictors with same name to the same data type.
testing <- rbind(training1[2, -58] , testing)
testing <- testing[-1,]
Develop predictive models
Now that we know only 57 predictors are useful to predict classe, let’s develop a Random Forest model to predict classe:
model <- randomForest(classe ~. , data=training1)
Use the model to predict classe in validation data:
predictions <- predict(model, validation, type = "class")
Create a confusion matrix to see how good is our predictions:
confusionMatrix(predictions, validation$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2232 0 0 0 0
## B 0 1518 2 0 0
## C 0 0 1366 3 0
## D 0 0 0 1280 4
## E 0 0 0 3 1438
##
## Overall Statistics
##
## Accuracy : 0.9985
## 95% CI : (0.9973, 0.9992)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9981
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 1.0000 0.9985 0.9953 0.9972
## Specificity 1.0000 0.9997 0.9995 0.9994 0.9995
## Pos Pred Value 1.0000 0.9987 0.9978 0.9969 0.9979
## Neg Pred Value 1.0000 1.0000 0.9997 0.9991 0.9994
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2845 0.1935 0.1741 0.1631 0.1833
## Detection Prevalence 0.2845 0.1937 0.1745 0.1637 0.1837
## Balanced Accuracy 1.0000 0.9998 0.9990 0.9974 0.9984
Great result! Accuracy 99.9%!
Create predictions for testing set
# Predict testing set
testingPredictions <- predict(model, testing, type = "class")
# Write each prediction into a file
write_prediction <- function(result){
n = length(result)
for(i in 1:n){
filename = paste0("problem_id_",i,".txt")
write.table(result[i],file=filename,quote=FALSE,row.names=FALSE,col.names=FALSE)
}
}
write_prediction(testingPredictions)