Using devices such as Jawbone Up, Nike FuelBand, and Fitbit, it is now possible to collect a large amount of data about personal activity relatively inexpensively. These types of devices are part of the quantified self movement - a group of enthusiasts who take measurements about themselves regularly to improve their health, to find patterns in their behavior, or because they are tech geeks. One thing that people regularly do is quantify how much of a particular activity they do, but they rarely quantify how well they do it.
In this project, the goal is to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants. They were asked to perform barbell lifts correctly and incorrectly in 5 different ways. More information is available from the website: http://groupware.les.inf.puc-rio.br/har (see the section on the Weight Lifting Exercise Dataset).
The following libraries were used within the report to produce the predictions.
library(Hmisc)
library(caret)
library(randomForest)
library(foreach)
library(doParallel)
# Turn on the knitr chunk option `cache` for every chunk in the report.
knitr::opts_chunk$set(cache=TRUE)
# Fix the seed of the session's random number generator.
set.seed(223)
In this project the pml-training.csv data is used to derive both the training and the testing sets. The pml-testing.csv data is used to predict and answer the 20 questions based on the trained model. Follow the links provided to acquire the needed data files.
# URLs of the training and testing data
train.url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test.url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
# Local file names
train.name <- "./data/pml-training.csv"
test.name <- "./data/pml-testing.csv"
# Create the data directory if it does not exist yet
if (!dir.exists("./data")) {
  dir.create("./data")
}
# Download each file only when no local copy is present
if (!file.exists(train.name)) {
  download.file(train.url, destfile = train.name)
}
if (!file.exists(test.name)) {
  download.file(test.url, destfile = test.name)
}
# Load the CSV files as data.frames, reading from the same paths that were
# downloaded to. "NA", blank fields and the spreadsheet error marker "#DIV/0!"
# are all read as missing values, so measurement columns are parsed as numbers
# instead of text. stringsAsFactors = TRUE keeps text columns (user_name,
# classe, ...) as factors regardless of the R version's default.
train.df <- read.csv(train.name, na.strings = c("NA", "", "#DIV/0!"),
                     stringsAsFactors = TRUE)
test.df <- read.csv(test.name, na.strings = c("NA", "", "#DIV/0!"),
                    stringsAsFactors = TRUE)
The raw training data has 19622 rows of observations and 160 variables, while the testing data has 20 rows and the same 160 variables. Column X is just the row number and is unusable as a predictor. There is one target outcome column named classe.
# Dimensions (rows, columns) of the raw training data
dim(train.df)
## [1] 19622 160
# Dimensions (rows, columns) of the raw testing data
dim(test.df)
## [1] 20 160
# Column names of the raw training data
names(train.df)
## [1] "X" "user_name"
## [3] "raw_timestamp_part_1" "raw_timestamp_part_2"
## [5] "cvtd_timestamp" "new_window"
## [7] "num_window" "roll_belt"
## [9] "pitch_belt" "yaw_belt"
## [11] "total_accel_belt" "kurtosis_roll_belt"
## [13] "kurtosis_picth_belt" "kurtosis_yaw_belt"
## [15] "skewness_roll_belt" "skewness_roll_belt.1"
## [17] "skewness_yaw_belt" "max_roll_belt"
## [19] "max_picth_belt" "max_yaw_belt"
## [21] "min_roll_belt" "min_pitch_belt"
## [23] "min_yaw_belt" "amplitude_roll_belt"
## [25] "amplitude_pitch_belt" "amplitude_yaw_belt"
## [27] "var_total_accel_belt" "avg_roll_belt"
## [29] "stddev_roll_belt" "var_roll_belt"
## [31] "avg_pitch_belt" "stddev_pitch_belt"
## [33] "var_pitch_belt" "avg_yaw_belt"
## [35] "stddev_yaw_belt" "var_yaw_belt"
## [37] "gyros_belt_x" "gyros_belt_y"
## [39] "gyros_belt_z" "accel_belt_x"
## [41] "accel_belt_y" "accel_belt_z"
## [43] "magnet_belt_x" "magnet_belt_y"
## [45] "magnet_belt_z" "roll_arm"
## [47] "pitch_arm" "yaw_arm"
## [49] "total_accel_arm" "var_accel_arm"
## [51] "avg_roll_arm" "stddev_roll_arm"
## [53] "var_roll_arm" "avg_pitch_arm"
## [55] "stddev_pitch_arm" "var_pitch_arm"
## [57] "avg_yaw_arm" "stddev_yaw_arm"
## [59] "var_yaw_arm" "gyros_arm_x"
## [61] "gyros_arm_y" "gyros_arm_z"
## [63] "accel_arm_x" "accel_arm_y"
## [65] "accel_arm_z" "magnet_arm_x"
## [67] "magnet_arm_y" "magnet_arm_z"
## [69] "kurtosis_roll_arm" "kurtosis_picth_arm"
## [71] "kurtosis_yaw_arm" "skewness_roll_arm"
## [73] "skewness_pitch_arm" "skewness_yaw_arm"
## [75] "max_roll_arm" "max_picth_arm"
## [77] "max_yaw_arm" "min_roll_arm"
## [79] "min_pitch_arm" "min_yaw_arm"
## [81] "amplitude_roll_arm" "amplitude_pitch_arm"
## [83] "amplitude_yaw_arm" "roll_dumbbell"
## [85] "pitch_dumbbell" "yaw_dumbbell"
## [87] "kurtosis_roll_dumbbell" "kurtosis_picth_dumbbell"
## [89] "kurtosis_yaw_dumbbell" "skewness_roll_dumbbell"
## [91] "skewness_pitch_dumbbell" "skewness_yaw_dumbbell"
## [93] "max_roll_dumbbell" "max_picth_dumbbell"
## [95] "max_yaw_dumbbell" "min_roll_dumbbell"
## [97] "min_pitch_dumbbell" "min_yaw_dumbbell"
## [99] "amplitude_roll_dumbbell" "amplitude_pitch_dumbbell"
## [101] "amplitude_yaw_dumbbell" "total_accel_dumbbell"
## [103] "var_accel_dumbbell" "avg_roll_dumbbell"
## [105] "stddev_roll_dumbbell" "var_roll_dumbbell"
## [107] "avg_pitch_dumbbell" "stddev_pitch_dumbbell"
## [109] "var_pitch_dumbbell" "avg_yaw_dumbbell"
## [111] "stddev_yaw_dumbbell" "var_yaw_dumbbell"
## [113] "gyros_dumbbell_x" "gyros_dumbbell_y"
## [115] "gyros_dumbbell_z" "accel_dumbbell_x"
## [117] "accel_dumbbell_y" "accel_dumbbell_z"
## [119] "magnet_dumbbell_x" "magnet_dumbbell_y"
## [121] "magnet_dumbbell_z" "roll_forearm"
## [123] "pitch_forearm" "yaw_forearm"
## [125] "kurtosis_roll_forearm" "kurtosis_picth_forearm"
## [127] "kurtosis_yaw_forearm" "skewness_roll_forearm"
## [129] "skewness_pitch_forearm" "skewness_yaw_forearm"
## [131] "max_roll_forearm" "max_picth_forearm"
## [133] "max_yaw_forearm" "min_roll_forearm"
## [135] "min_pitch_forearm" "min_yaw_forearm"
## [137] "amplitude_roll_forearm" "amplitude_pitch_forearm"
## [139] "amplitude_yaw_forearm" "total_accel_forearm"
## [141] "var_accel_forearm" "avg_roll_forearm"
## [143] "stddev_roll_forearm" "var_roll_forearm"
## [145] "avg_pitch_forearm" "stddev_pitch_forearm"
## [147] "var_pitch_forearm" "avg_yaw_forearm"
## [149] "stddev_yaw_forearm" "var_yaw_forearm"
## [151] "gyros_forearm_x" "gyros_forearm_y"
## [153] "gyros_forearm_z" "accel_forearm_x"
## [155] "accel_forearm_y" "accel_forearm_z"
## [157] "magnet_forearm_x" "magnet_forearm_y"
## [159] "magnet_forearm_z" "classe"
# Check which users took part in the study (distinct user_name values in the
# training data, then in the testing data).
unique(train.df$user_name)
## [1] carlitos pedro adelmo charles eurico jeremy
## Levels: adelmo carlitos charles eurico jeremy pedro
unique(test.df$user_name)
## [1] pedro jeremy adelmo eurico carlitos charles
## Levels: adelmo carlitos charles eurico jeremy pedro
It is a good idea to cast the measurement columns (everything after the leading bookkeeping columns, up to but not including the classe outcome) to numeric to help with the prediction in future steps. suppressWarnings() was used to silence the “Warning: NAs introduced by coercion” messages that were produced while converting to numeric.
# Cast the measurement columns -- column 8 through the column just before the
# last one -- to numeric in both data sets.
# Note: `8:ncol(df)-1` would parse as `(8:ncol(df)) - 1` and start at column 7,
# because `:` binds tighter than binary minus; the parentheses are required.
# Values that cannot be parsed as numbers deliberately become NA, so only the
# resulting "NAs introduced by coercion" warnings are suppressed.
train.cols <- 8:(ncol(train.df) - 1)
train.df[train.cols] <- suppressWarnings(
  lapply(train.df[train.cols], function(v) as.numeric(as.character(v)))
)
test.cols <- 8:(ncol(test.df) - 1)
test.df[test.cols] <- suppressWarnings(
  lapply(test.df[test.cols], function(v) as.numeric(as.character(v)))
)
All blank (""), ‘#DIV/0!’ and ‘NA’ values need to be converted to NA. Any column containing NA values is then removed from both downloaded data sets to help the accuracy of the prediction.
# Count the columns that contain missing values (FALSE) and the columns that
# are fully observed (TRUE), first for the training data, then the testing data.
table(colSums(is.na(train.df)) == 0)
##
## FALSE TRUE
## 100 60
table(colSums(is.na(test.df)) == 0)
##
## FALSE TRUE
## 100 60
# Names of the columns without any missing value, minus the first seven of them
train.clean <- names(which(colSums(is.na(train.df)) == 0))[-seq_len(7)]
test.clean <- names(which(colSums(is.na(test.df)) == 0))[-seq_len(7)]
# Reduce both data sets to the retained columns
model <- train.df[train.clean]
test.data <- test.df[test.clean]
# Show the columns kept for modelling
train.clean
## [1] "roll_belt" "pitch_belt" "yaw_belt"
## [4] "total_accel_belt" "gyros_belt_x" "gyros_belt_y"
## [7] "gyros_belt_z" "accel_belt_x" "accel_belt_y"
## [10] "accel_belt_z" "magnet_belt_x" "magnet_belt_y"
## [13] "magnet_belt_z" "roll_arm" "pitch_arm"
## [16] "yaw_arm" "total_accel_arm" "gyros_arm_x"
## [19] "gyros_arm_y" "gyros_arm_z" "accel_arm_x"
## [22] "accel_arm_y" "accel_arm_z" "magnet_arm_x"
## [25] "magnet_arm_y" "magnet_arm_z" "roll_dumbbell"
## [28] "pitch_dumbbell" "yaw_dumbbell" "total_accel_dumbbell"
## [31] "gyros_dumbbell_x" "gyros_dumbbell_y" "gyros_dumbbell_z"
## [34] "accel_dumbbell_x" "accel_dumbbell_y" "accel_dumbbell_z"
## [37] "magnet_dumbbell_x" "magnet_dumbbell_y" "magnet_dumbbell_z"
## [40] "roll_forearm" "pitch_forearm" "yaw_forearm"
## [43] "total_accel_forearm" "gyros_forearm_x" "gyros_forearm_y"
## [46] "gyros_forearm_z" "accel_forearm_x" "accel_forearm_y"
## [49] "accel_forearm_z" "magnet_forearm_x" "magnet_forearm_y"
## [52] "magnet_forearm_z" "classe"
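Six young healthy male participants aged between 20-28 years, with little weight lifting experience, were asked to perform one set of 10 repetitions of the Unilateral Dumbbell Biceps Curl in five different fashions: exactly according to the specification (Class A), throwing the elbows to the front (Class B), lifting the dumbbell only halfway (Class C), lowering the dumbbell only halfway (Class D) and throwing the hips to the front (Class E). Class A corresponds to the correct execution of the exercise, while the other four classes correspond to common mistakes.
All participants could simulate the mistakes in a safe and controlled manner by using a relatively light dumbbell (1.25kg).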
[Read more](http://groupware.les.inf.puc-rio.br/har#ixzz4Igy0toT2)
Ugulino, W.; Cardador, D.; Vega, K.; Velloso, E.; Milidiu, R.; Fuks, H. Wearable Computing: Accelerometers’ Data Classification of Body Postures and Movements. Proceedings of 21st Brazilian Symposium on Artificial Intelligence. Advances in Artificial Intelligence - SBIA 2012. In: Lecture Notes in Computer Science. , pp. 52-61. Curitiba, PR: Springer Berlin / Heidelberg, 2012. ISBN 978-3-642-34458-9. DOI: 10.1007/978-3-642-34459-6_6.
Split the dataset into a 60% training and 40% probing dataset.
# Partition the cleaned data on the outcome: 60% of the rows go to
# training.model, the remaining 40% to testing.model.
indx <- createDataPartition(y = model[["classe"]], p = 0.60, list = FALSE)
training.model <- model[indx, ]
testing.model <- model[-indx, ]
To build this model we make use of parallel processing, which speeds up the building of the random forest.
# Register the doParallel backend (default settings) so that %dopar% can run
# the loop iterations in parallel.
registerDoParallel()
# Predictors: every column except the outcome. Outcome: the classe column.
x <- training.model[setdiff(names(training.model), "classe")]
y <- training.model$classe
# Grow six forests of 150 trees each and merge them into a single forest with
# randomForest::combine. A seed set in the main session does not make the
# parallel workers reproducible, so every task sets its own fixed seed before
# growing its trees.
training.rf <- foreach(
  ntree = rep(150, 6),
  seed = 223 + seq_len(6),
  .combine = randomForest::combine,
  .packages = "randomForest"
) %dopar% {
  set.seed(seed)
  randomForest(x, y, ntree = ntree)
}
# Predict on the same rows the forest was trained on (in-sample check).
predictions1 <- predict(training.rf, newdata=training.model)
# Cross-tabulate the predictions against the true classe labels and print the
# resulting confusion matrix and statistics.
pdtree1 <- confusionMatrix(predictions1,training.model$classe)
pdtree1
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 3348 0 0 0 0
## B 0 2279 0 0 0
## C 0 0 2054 0 0
## D 0 0 0 1930 0
## E 0 0 0 0 2165
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9997, 1)
## No Information Rate : 0.2843
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 1.0000 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000 1.0000 1.0000
## Prevalence 0.2843 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2843 0.1935 0.1744 0.1639 0.1838
## Detection Prevalence 0.2843 0.1935 0.1744 0.1639 0.1838
## Balanced Accuracy 1.0000 1.0000 1.0000 1.0000 1.0000
# Plot the training-set confusion table; the title reports the overall accuracy
# rounded to four decimals. The model is a random forest, so the title names it
# as such.
plot(
  pdtree1$table,
  col = pdtree1$byClass,
  main = paste(
    "Random Forest Confusion Matrix: Accuracy =",
    round(pdtree1$overall["Accuracy"], 4)
  )
)
Figure 1
# Predict on the held-out 40% partition, which was not used to grow the forest.
predictions2 <- predict(training.rf, newdata=testing.model)
# Cross-tabulate the predictions against the true classe labels and print the
# resulting confusion matrix and statistics.
pdtree2 <- confusionMatrix(predictions2,testing.model$classe)
pdtree2
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2229 9 0 0 0
## B 2 1508 12 0 0
## C 0 1 1355 17 3
## D 0 0 1 1268 5
## E 1 0 0 1 1434
##
## Overall Statistics
##
## Accuracy : 0.9934
## 95% CI : (0.9913, 0.995)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9916
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9987 0.9934 0.9905 0.9860 0.9945
## Specificity 0.9984 0.9978 0.9968 0.9991 0.9997
## Pos Pred Value 0.9960 0.9908 0.9847 0.9953 0.9986
## Neg Pred Value 0.9995 0.9984 0.9980 0.9973 0.9988
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2841 0.1922 0.1727 0.1616 0.1828
## Detection Prevalence 0.2852 0.1940 0.1754 0.1624 0.1830
## Balanced Accuracy 0.9985 0.9956 0.9936 0.9925 0.9971
# Plot the hold-out confusion table; the title reports the overall accuracy
# rounded to four decimals. The model is a random forest, so the title names it
# as such.
plot(
  pdtree2$table,
  col = pdtree2$byClass,
  main = paste(
    "Random Forest Confusion Matrix: Accuracy =",
    round(pdtree2$overall["Accuracy"], 4)
  )
)
Figure 2
The confusion matrix shows that this model is very accurate. Accuracy on the held-out test data was around 99%, which suggests that nearly all of the 20 submitted test cases will be predicted correctly.
# Predict the class of every case in the cleaned downloaded test set with the
# trained forest, then print the predictions.
final_pred <- predict(
  training.rf,
  newdata = test.data,
  type = "class"
)
final_pred
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E