Synopsis
In this project we will be building a prediction model to know how a certain user is lifting weights based on data obtained from an accelerometer.
The dataset consists of 5 classes:
. The subject is lifting weights exactly according to the specification (Class A).
. Throwing the elbow to the front (Class B).
. Lifting the dumbbell only halfway (Class C).
. Lowering the dumbbell only halfway (Class D).
. Throwing the hips to the front (Class E).
For more information and description about the dataset, see the official website: http://groupware.les.inf.puc-rio.br/har
Getting the data
The file “pml-training” will serve as our training set. The file “pml-testing” is a data set without the classes; in other words, we will predict its classes with our model.
# Download each raw CSV only when it is not already present in the working
# directory, so re-running the analysis does not fetch the files again.
if (!file.exists("pml-training.csv")) {
  download.file(
    "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
    "pml-training.csv",
    method = "curl"
  )
}
# Treat both "NA" and empty fields as missing values.
# stringsAsFactors = TRUE keeps `classe` a factor on R >= 4.0 (where the
# default became FALSE); the classification model and the confusion matrix
# further down need a factor outcome.
dataset <- read.csv(
  "pml-training.csv",
  na.strings = c("NA", ""),
  stringsAsFactors = TRUE
)

if (!file.exists("pml-testing.csv")) {
  download.file(
    "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
    "pml-testing.csv",
    method = "curl"
  )
}
# Parse the unlabeled file with the same missing-value and factor rules as the
# labeled one so both data frames are read consistently.
validation <- read.csv(
  "pml-testing.csv",
  na.strings = c("NA", ""),
  stringsAsFactors = TRUE
)
Data preprocessing
Import the necessary packages.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
Set a seed for reproducibility.
# Fix the RNG seed so the random steps that follow can be reproduced.
set.seed(17)
Create the data partitions. 70% of it will go to the training set and the rest will be the test set.
# Row indices of the 70% of observations (sampled with respect to `classe`)
# that form the training set; returned as a matrix because list = FALSE.
inTrain <- createDataPartition(y = dataset$classe, p = 0.7, list = FALSE)
# Rows picked above go to training; every remaining row goes to testing.
training <- dataset[inTrain, ]
testing <- dataset[-inTrain, ]
Eliminate the columns that contain NA entries.
# Named integer vector: for every column of the training set, the number of
# NA entries it contains. vapply() guarantees an integer vector regardless of
# the input, unlike sapply(), whose return type depends on the data.
naColumns <- vapply(training, function(x) sum(is.na(x)), integer(1))
naColumns
## X user_name raw_timestamp_part_1
## 0 0 0
## raw_timestamp_part_2 cvtd_timestamp new_window
## 0 0 0
## num_window roll_belt pitch_belt
## 0 0 0
## yaw_belt total_accel_belt kurtosis_roll_belt
## 0 0 13460
## kurtosis_picth_belt kurtosis_yaw_belt skewness_roll_belt
## 13460 13460 13460
## skewness_roll_belt.1 skewness_yaw_belt max_roll_belt
## 13460 13460 13460
## max_picth_belt max_yaw_belt min_roll_belt
## 13460 13460 13460
## min_pitch_belt min_yaw_belt amplitude_roll_belt
## 13460 13460 13460
## amplitude_pitch_belt amplitude_yaw_belt var_total_accel_belt
## 13460 13460 13460
## avg_roll_belt stddev_roll_belt var_roll_belt
## 13460 13460 13460
## avg_pitch_belt stddev_pitch_belt var_pitch_belt
## 13460 13460 13460
## avg_yaw_belt stddev_yaw_belt var_yaw_belt
## 13460 13460 13460
## gyros_belt_x gyros_belt_y gyros_belt_z
## 0 0 0
## accel_belt_x accel_belt_y accel_belt_z
## 0 0 0
## magnet_belt_x magnet_belt_y magnet_belt_z
## 0 0 0
## roll_arm pitch_arm yaw_arm
## 0 0 0
## total_accel_arm var_accel_arm avg_roll_arm
## 0 13460 13460
## stddev_roll_arm var_roll_arm avg_pitch_arm
## 13460 13460 13460
## stddev_pitch_arm var_pitch_arm avg_yaw_arm
## 13460 13460 13460
## stddev_yaw_arm var_yaw_arm gyros_arm_x
## 13460 13460 0
## gyros_arm_y gyros_arm_z accel_arm_x
## 0 0 0
## accel_arm_y accel_arm_z magnet_arm_x
## 0 0 0
## magnet_arm_y magnet_arm_z kurtosis_roll_arm
## 0 0 13460
## kurtosis_picth_arm kurtosis_yaw_arm skewness_roll_arm
## 13460 13460 13460
## skewness_pitch_arm skewness_yaw_arm max_roll_arm
## 13460 13460 13460
## max_picth_arm max_yaw_arm min_roll_arm
## 13460 13460 13460
## min_pitch_arm min_yaw_arm amplitude_roll_arm
## 13460 13460 13460
## amplitude_pitch_arm amplitude_yaw_arm roll_dumbbell
## 13460 13460 0
## pitch_dumbbell yaw_dumbbell kurtosis_roll_dumbbell
## 0 0 13460
## kurtosis_picth_dumbbell kurtosis_yaw_dumbbell skewness_roll_dumbbell
## 13460 13460 13460
## skewness_pitch_dumbbell skewness_yaw_dumbbell max_roll_dumbbell
## 13460 13460 13460
## max_picth_dumbbell max_yaw_dumbbell min_roll_dumbbell
## 13460 13460 13460
## min_pitch_dumbbell min_yaw_dumbbell amplitude_roll_dumbbell
## 13460 13460 13460
## amplitude_pitch_dumbbell amplitude_yaw_dumbbell total_accel_dumbbell
## 13460 13460 0
## var_accel_dumbbell avg_roll_dumbbell stddev_roll_dumbbell
## 13460 13460 13460
## var_roll_dumbbell avg_pitch_dumbbell stddev_pitch_dumbbell
## 13460 13460 13460
## var_pitch_dumbbell avg_yaw_dumbbell stddev_yaw_dumbbell
## 13460 13460 13460
## var_yaw_dumbbell gyros_dumbbell_x gyros_dumbbell_y
## 13460 0 0
## gyros_dumbbell_z accel_dumbbell_x accel_dumbbell_y
## 0 0 0
## accel_dumbbell_z magnet_dumbbell_x magnet_dumbbell_y
## 0 0 0
## magnet_dumbbell_z roll_forearm pitch_forearm
## 0 0 0
## yaw_forearm kurtosis_roll_forearm kurtosis_picth_forearm
## 0 13460 13460
## kurtosis_yaw_forearm skewness_roll_forearm skewness_pitch_forearm
## 13460 13460 13460
## skewness_yaw_forearm max_roll_forearm max_picth_forearm
## 13460 13460 13460
## max_yaw_forearm min_roll_forearm min_pitch_forearm
## 13460 13460 13460
## min_yaw_forearm amplitude_roll_forearm amplitude_pitch_forearm
## 13460 13460 13460
## amplitude_yaw_forearm total_accel_forearm var_accel_forearm
## 13460 0 13460
## avg_roll_forearm stddev_roll_forearm var_roll_forearm
## 13460 13460 13460
## avg_pitch_forearm stddev_pitch_forearm var_pitch_forearm
## 13460 13460 13460
## avg_yaw_forearm stddev_yaw_forearm var_yaw_forearm
## 13460 13460 13460
## gyros_forearm_x gyros_forearm_y gyros_forearm_z
## 0 0 0
## accel_forearm_x accel_forearm_y accel_forearm_z
## 0 0 0
## magnet_forearm_x magnet_forearm_y magnet_forearm_z
## 0 0 0
## classe
## 0
# Names of the columns that have at least one NA value.
columnsWithNA <- names(naColumns)[naColumns > 0]
# Keep only the training columns that are not in that list.
training <- training[, setdiff(names(training), columnsWithNA)]
names(training)
## [1] "X" "user_name" "raw_timestamp_part_1"
## [4] "raw_timestamp_part_2" "cvtd_timestamp" "new_window"
## [7] "num_window" "roll_belt" "pitch_belt"
## [10] "yaw_belt" "total_accel_belt" "gyros_belt_x"
## [13] "gyros_belt_y" "gyros_belt_z" "accel_belt_x"
## [16] "accel_belt_y" "accel_belt_z" "magnet_belt_x"
## [19] "magnet_belt_y" "magnet_belt_z" "roll_arm"
## [22] "pitch_arm" "yaw_arm" "total_accel_arm"
## [25] "gyros_arm_x" "gyros_arm_y" "gyros_arm_z"
## [28] "accel_arm_x" "accel_arm_y" "accel_arm_z"
## [31] "magnet_arm_x" "magnet_arm_y" "magnet_arm_z"
## [34] "roll_dumbbell" "pitch_dumbbell" "yaw_dumbbell"
## [37] "total_accel_dumbbell" "gyros_dumbbell_x" "gyros_dumbbell_y"
## [40] "gyros_dumbbell_z" "accel_dumbbell_x" "accel_dumbbell_y"
## [43] "accel_dumbbell_z" "magnet_dumbbell_x" "magnet_dumbbell_y"
## [46] "magnet_dumbbell_z" "roll_forearm" "pitch_forearm"
## [49] "yaw_forearm" "total_accel_forearm" "gyros_forearm_x"
## [52] "gyros_forearm_y" "gyros_forearm_z" "accel_forearm_x"
## [55] "accel_forearm_y" "accel_forearm_z" "magnet_forearm_x"
## [58] "magnet_forearm_y" "magnet_forearm_z" "classe"
# Drop the first 7 columns (row id, user name, timestamps and window
# markers), which are not used as predictors.
training <- training[, setdiff(
  names(training),
  c(
    "X", "user_name", "raw_timestamp_part_1", "raw_timestamp_part_2",
    "cvtd_timestamp", "new_window", "num_window"
  )
)]
Do the same for the validation set
# Keep in the validation set exactly the predictors that remain in the
# training set (plus the `problem_id` identifier), instead of re-deriving the
# column list from the validation set's own NA pattern. This guarantees the
# model is given the same features it is trained on.
keepColumns <- c(setdiff(names(training), "classe"), "problem_id")
# Fail early if a predictor used for training is absent from the validation
# file.
stopifnot(all(setdiff(names(training), "classe") %in% names(validation)))
validation <- validation[, names(validation) %in% keepColumns]
Do the same for the testing set.
# Give the testing set exactly the columns kept in the training set (the
# predictors and `classe`), in the same order. Selecting by the training
# column names, rather than by the testing set's own NA pattern, keeps the two
# sets aligned and raises an error if a training column is missing.
testing <- testing[, names(training)]
Now, we build the prediction model using Random Forest.
# Fit a random forest of 50 trees predicting `classe` from all other columns.
model <- randomForest(
  classe ~ .,
  data = training,
  ntree = 50
)
# Predict the classes of the held-out testing rows and compare them with the
# true labels.
predictions <- predict(model, newdata = testing)
confusionMatrix(data = predictions, reference = testing$classe)
## Loading required namespace: e1071
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1674 6 0 0 0
## B 0 1129 13 0 0
## C 0 4 1008 10 0
## D 0 0 5 953 3
## E 0 0 0 1 1079
##
## Overall Statistics
##
## Accuracy : 0.993
## 95% CI : (0.99, 0.995)
## No Information Rate : 0.284
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.991
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.000 0.991 0.982 0.989 0.997
## Specificity 0.999 0.997 0.997 0.998 1.000
## Pos Pred Value 0.996 0.989 0.986 0.992 0.999
## Neg Pred Value 1.000 0.998 0.996 0.998 0.999
## Prevalence 0.284 0.194 0.174 0.164 0.184
## Detection Rate 0.284 0.192 0.171 0.162 0.183
## Detection Prevalence 0.285 0.194 0.174 0.163 0.184
## Balanced Accuracy 0.999 0.994 0.990 0.993 0.999
# Overall accuracy on the testing set, extracted by name from the confusion
# matrix statistics.
testingConfusion <- confusionMatrix(predictions, testing$classe)
modelAcc <- testingConfusion$overall[["Accuracy"]]
Our model reaches an accuracy of 0.9929 on the testing set, i.e. an estimated out-of-sample error of about 0.7%.
Now, we will predict the unknown classes of the validation set.
# Predict the unknown class of each validation observation with the fitted
# forest, then display the predicted labels.
predictions <- predict(model, newdata = validation)
predictions
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E