Summary

People regularly quantify how much of a particular activity they do, but they rarely quantify how well they do it. In this project, the goal is to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants.

The goal of your project is to predict the manner in which they did the exercise. This is the “classe” variable in the training set.

Development of the model

Libraries

library(tidyverse)
library(skimr)          ## skim() makes a nice summary of a dataset
library(doParallel)     ## to use the available cores via makePSOCKcluster(<#cores>)
library(caret)
library(corrplot)
library(nnet)           ## multinomial logistic regression
library(randomForest)
library(xgboost)        ## Boosting
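
The doParallel package only becomes useful once a parallel backend is registered before calling train(). A minimal sketch, assuming four available cores (adjust the number to your machine):

cl <- makePSOCKcluster(4)   # create a cluster of 4 worker processes
registerDoParallel(cl)      # register it so caret/foreach can distribute the resampling
# ... run the train() calls below ...
stopCluster(cl)             # release the workers when training is done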

Reading Data

data <- read_csv("./data/training_proj.csv", na = c("", NA), 
                 col_types = cols(kurtosis_roll_belt = col_double(),
                                  kurtosis_picth_belt = col_double(),
                                  kurtosis_yaw_belt = col_double(),
                                  skewness_roll_belt = col_double(),
                                  skewness_roll_belt.1 = col_double(),
                                  skewness_yaw_belt = col_double(),
                                  max_yaw_belt = col_double(),
                                  min_yaw_belt = col_double(),
                                  amplitude_yaw_belt = col_double(),
                                  kurtosis_picth_arm = col_double(),
                                  kurtosis_yaw_arm = col_double(),
                                  skewness_pitch_arm = col_double(),
                                  skewness_yaw_arm = col_double(),
                                  kurtosis_yaw_dumbbell = col_double(),
                                  skewness_yaw_dumbbell = col_double(),
                                  kurtosis_roll_forearm = col_double(),
                                  kurtosis_picth_forearm = col_double(),
                                  kurtosis_yaw_forearm = col_double(),
                                  skewness_roll_forearm = col_double(),
                                  skewness_pitch_forearm = col_double(),
                                  skewness_yaw_forearm = col_double(),
                                  max_yaw_forearm = col_double(),
                                  min_yaw_forearm = col_double(),
                                  amplitude_yaw_forearm = col_double() ) )

data <- data %>% rename( "skewness_pitch_belt" = "skewness_roll_belt.1")

val.data <- read_csv("./data/validation_data.csv", na = c("", NA), 
                 col_types = cols(kurtosis_roll_belt = col_double(),
                                  kurtosis_picth_belt = col_double(),
                                  kurtosis_yaw_belt = col_double(),
                                  skewness_roll_belt = col_double(),
                                  skewness_roll_belt.1 = col_double(),
                                  skewness_yaw_belt = col_double(),
                                  max_yaw_belt = col_double(),
                                  min_yaw_belt = col_double(),
                                  amplitude_yaw_belt = col_double(),
                                  kurtosis_picth_arm = col_double(),
                                  kurtosis_yaw_arm = col_double(),
                                  skewness_pitch_arm = col_double(),
                                  skewness_yaw_arm = col_double(),
                                  kurtosis_yaw_dumbbell = col_double(),
                                  skewness_yaw_dumbbell = col_double(),
                                  kurtosis_roll_forearm = col_double(),
                                  kurtosis_picth_forearm = col_double(),
                                  kurtosis_yaw_forearm = col_double(),
                                  skewness_roll_forearm = col_double(),
                                  skewness_pitch_forearm = col_double(),
                                  skewness_yaw_forearm = col_double(),
                                  max_yaw_forearm = col_double(),
                                  min_yaw_forearm = col_double(),
                                  amplitude_yaw_forearm = col_double() ) )

val.data <- val.data %>% rename( "skewness_pitch_belt" = "skewness_roll_belt.1")
val.data <- val.data %>% mutate("classe" = 1)   # placeholder column so val.data has the same columns as the training data

Tidy Data

skim <- data %>% skim()                                    # Some inspection of the data
rmv <- sapply(data, function(x){ mean(is.na(x)) < 0.95 } ) # TRUE for columns that are not mostly NAs
# keep only those columns and drop variables that are not useful as predictors,
# such as the user name and the time-related columns (the first seven columns)
tidydata <- data[,rmv] %>% select(-c(1:7))
tidydata %>% skim()                                        # Summarizing variables. 
Data summary
Name Piped data
Number of rows 19622
Number of columns 53
_______________________
Column type frequency:
character 1
numeric 52
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
classe 0 1 1 1 0 5 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
roll_belt 0 1 64.41 62.75 -28.90 1.10 113.00 123.00 162.00 ▇▁▁▅▅
pitch_belt 0 1 0.31 22.35 -55.80 1.76 5.28 14.90 60.30 ▃▁▇▅▁
yaw_belt 0 1 -11.21 95.19 -180.00 -88.30 -13.00 12.90 179.00 ▁▇▅▁▃
total_accel_belt 0 1 11.31 7.74 0.00 3.00 17.00 18.00 29.00 ▇▁▂▆▁
gyros_belt_x 0 1 -0.01 0.21 -1.04 -0.03 0.03 0.11 2.22 ▁▇▁▁▁
gyros_belt_y 0 1 0.04 0.08 -0.64 0.00 0.02 0.11 0.64 ▁▁▇▁▁
gyros_belt_z 0 1 -0.13 0.24 -1.46 -0.20 -0.10 -0.02 1.62 ▁▂▇▁▁
accel_belt_x 0 1 -5.59 29.64 -120.00 -21.00 -15.00 -5.00 85.00 ▁▁▇▁▂
accel_belt_y 0 1 30.15 28.58 -69.00 3.00 35.00 61.00 164.00 ▁▇▇▁▁
accel_belt_z 0 1 -72.59 100.45 -275.00 -162.00 -152.00 27.00 105.00 ▁▇▁▅▃
magnet_belt_x 0 1 55.60 64.18 -52.00 9.00 35.00 59.00 485.00 ▇▁▂▁▁
magnet_belt_y 0 1 593.68 35.68 354.00 581.00 601.00 610.00 673.00 ▁▁▁▇▃
magnet_belt_z 0 1 -345.48 65.21 -623.00 -375.00 -320.00 -306.00 293.00 ▁▇▁▁▁
roll_arm 0 1 17.83 72.74 -180.00 -31.78 0.00 77.30 180.00 ▁▃▇▆▂
pitch_arm 0 1 -4.61 30.68 -88.80 -25.90 0.00 11.20 88.50 ▁▅▇▂▁
yaw_arm 0 1 -0.62 71.36 -180.00 -43.10 0.00 45.88 180.00 ▁▃▇▃▂
total_accel_arm 0 1 25.51 10.52 1.00 17.00 27.00 33.00 66.00 ▃▆▇▁▁
gyros_arm_x 0 1 0.04 1.99 -6.37 -1.33 0.08 1.57 4.87 ▁▃▇▆▂
gyros_arm_y 0 1 -0.26 0.85 -3.44 -0.80 -0.24 0.14 2.84 ▁▂▇▂▁
gyros_arm_z 0 1 0.27 0.55 -2.33 -0.07 0.23 0.72 3.02 ▁▂▇▂▁
accel_arm_x 0 1 -60.24 182.04 -404.00 -242.00 -44.00 84.00 437.00 ▇▅▇▅▁
accel_arm_y 0 1 32.60 109.87 -318.00 -54.00 14.00 139.00 308.00 ▁▃▇▆▂
accel_arm_z 0 1 -71.25 134.65 -636.00 -143.00 -47.00 23.00 292.00 ▁▁▅▇▁
magnet_arm_x 0 1 191.72 443.64 -584.00 -300.00 289.00 637.00 782.00 ▆▃▂▃▇
magnet_arm_y 0 1 156.61 201.91 -392.00 -9.00 202.00 323.00 583.00 ▁▅▅▇▂
magnet_arm_z 0 1 306.49 326.62 -597.00 131.25 444.00 545.00 694.00 ▁▂▂▃▇
roll_dumbbell 0 1 23.84 69.93 -153.71 -18.49 48.17 67.61 153.55 ▂▂▃▇▂
pitch_dumbbell 0 1 -10.78 36.99 -149.59 -40.89 -20.96 17.50 149.40 ▁▆▇▂▁
yaw_dumbbell 0 1 1.67 82.52 -150.87 -77.64 -3.32 79.64 154.95 ▃▇▅▅▆
total_accel_dumbbell 0 1 13.72 10.23 0.00 4.00 10.00 19.00 58.00 ▇▅▃▁▁
gyros_dumbbell_x 0 1 0.16 1.51 -204.00 -0.03 0.13 0.35 2.22 ▁▁▁▁▇
gyros_dumbbell_y 0 1 0.05 0.61 -2.10 -0.14 0.03 0.21 52.00 ▇▁▁▁▁
gyros_dumbbell_z 0 1 -0.13 2.29 -2.38 -0.31 -0.13 0.03 317.00 ▇▁▁▁▁
accel_dumbbell_x 0 1 -28.62 67.32 -419.00 -50.00 -8.00 11.00 235.00 ▁▁▆▇▁
accel_dumbbell_y 0 1 52.63 80.75 -189.00 -8.00 41.50 111.00 315.00 ▁▇▇▅▁
accel_dumbbell_z 0 1 -38.32 109.47 -334.00 -142.00 -1.00 38.00 318.00 ▁▆▇▃▁
magnet_dumbbell_x 0 1 -328.48 339.72 -643.00 -535.00 -479.00 -304.00 592.00 ▇▂▁▁▂
magnet_dumbbell_y 0 1 220.97 326.87 -3600.00 231.00 311.00 390.00 633.00 ▁▁▁▁▇
magnet_dumbbell_z 0 1 46.05 139.96 -262.00 -45.00 13.00 95.00 452.00 ▁▇▆▂▂
roll_forearm 0 1 33.83 108.04 -180.00 -0.74 21.70 140.00 180.00 ▃▂▇▂▇
pitch_forearm 0 1 10.71 28.15 -72.50 0.00 9.24 28.40 89.80 ▁▁▇▃▁
yaw_forearm 0 1 19.21 103.22 -180.00 -68.60 0.00 110.00 180.00 ▅▅▇▆▇
total_accel_forearm 0 1 34.72 10.06 0.00 29.00 36.00 41.00 108.00 ▁▇▂▁▁
gyros_forearm_x 0 1 0.16 0.65 -22.00 -0.22 0.05 0.56 3.97 ▁▁▁▁▇
gyros_forearm_y 0 1 0.08 3.10 -7.02 -1.46 0.03 1.62 311.00 ▇▁▁▁▁
gyros_forearm_z 0 1 0.15 1.75 -8.09 -0.18 0.08 0.49 231.00 ▇▁▁▁▁
accel_forearm_x 0 1 -61.65 180.59 -498.00 -178.00 -57.00 76.00 477.00 ▂▆▇▅▁
accel_forearm_y 0 1 163.66 200.13 -632.00 57.00 201.00 312.00 923.00 ▁▂▇▅▁
accel_forearm_z 0 1 -55.29 138.40 -446.00 -182.00 -39.00 26.00 291.00 ▁▇▅▅▃
magnet_forearm_x 0 1 -312.58 346.96 -1280.00 -616.00 -378.00 -73.00 672.00 ▁▇▇▅▁
magnet_forearm_y 0 1 380.12 509.37 -896.00 2.00 591.00 737.00 1480.00 ▂▂▂▇▁
magnet_forearm_z 0 1 393.61 369.27 -973.00 191.00 511.00 653.00 1090.00 ▁▁▂▇▃
tidydata$classe <- as.factor(tidydata$classe)         # outcome as a factor
val.data <- val.data %>% select(colnames(tidydata))   # keep the same columns as the tidy training data

Finally, I have a tidy dataset made up of 52 predictor variables and one outcome (the categorical variable classe).

Data transformation

set.seed(123)

training.samples <- tidydata$classe %>% createDataPartition(p=0.7, list = FALSE) 
train.data <- tidydata[training.samples,]
test.data <- tidydata[-training.samples,]

# estimate preprocessing parameters
preproc.param <- train.data %>% preProcess(method = c("center","scale")) 
# Transform the data using the estimated parameters
train.transformed <- preproc.param %>% predict(train.data)
test.transformed <- preproc.param %>% predict(test.data)
valid.transformed <- preproc.param %>% predict(val.data)

Exploratory Analysis

A convenient first step in exploring the data set is to compute the correlation matrix. It can give clues about which predictors are highly correlated and could therefore be removed, and it can also suggest a strategy for dimension reduction.

cor_mt <- cor(tidydata[,-53])                           # Correlation matrix
corrplot(cor_mt, order = "FPC", method = "color", type = "lower", 
         tl.cex = 0.8, tl.col = rgb(0, 0, 0))           

The plot above shows that a large number of predictors are highly correlated.

Selecting Predictors

The data set contains 52 predictors and many of them are highly correlated, so it is convenient to apply a methodology to reduce its dimensionality. A simple and, in many cases, effective tool for dimension reduction is the correlation method, which simply removes predictors that are highly correlated (see Applied Predictive Modeling, Max Kuhn and Kjell Johnson).

The algorithm is as follows (a small illustrative sketch in R appears after the list):

  1. Calculate the correlation matrix of the predictors.
  2. Determine the two predictors associated with the largest absolute pairwise correlation (call them predictors A and B).
  3. Determine the average correlation between A and the other variables. Do the same for predictor B.
  4. If A has a larger average correlation, remove it; otherwise, remove predictor B.
  5. Repeat Steps 2–4 until no absolute correlations are above the threshold.
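
As a purely illustrative sketch of these steps (the function name remove_high_cor is hypothetical and is not used later; the findCorrelation function from caret, shown in the next section, does this job):

remove_high_cor <- function(cor_mat, cutoff = 0.75) {
  removed <- character(0)
  repeat {
    abs_cor <- abs(cor_mat)
    diag(abs_cor) <- 0
    if (max(abs_cor) <= cutoff) break                        # Step 5: stop once nothing exceeds the threshold
    # Step 2: pair (A, B) with the largest absolute correlation
    idx <- which(abs_cor == max(abs_cor), arr.ind = TRUE)[1, ]
    a <- idx[1]; b <- idx[2]
    # Step 3: average absolute correlation of A, and of B, with the other variables
    mean_a <- mean(abs_cor[a, -a])
    mean_b <- mean(abs_cor[b, -b])
    # Step 4: remove whichever has the larger average correlation
    drop <- if (mean_a > mean_b) a else b
    removed <- c(removed, colnames(cor_mat)[drop])
    cor_mat <- cor_mat[-drop, -drop, drop = FALSE]
  }
  removed
}
# e.g. remove_high_cor(cor_mt, cutoff = 0.75) should flag roughly the same columns as findCorrelation()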

Correlation method

In the caret package, the findCorrelation function returns the variables to be removed in accordance with the correlation method.

rmv <- findCorrelation(cor_mt, cutoff = 0.75)   # returns the columns to remove
# rmv contains the predictors to remove from the data sets; with a cutoff of 0.75
# the number of predictors goes from 52 to 32.
names <- colnames(cor_mt)
train.transformed.reduced.1 <- train.transformed %>% select(-names[rmv])
test.transformed.reduced.1 <- test.transformed %>% select(-names[rmv])

Training

Code to Train Final KNN Model:

set.seed(123)
model_knn <- train(classe ~., data = train.transformed.reduced.1, method = "knn",
               trControl = trainControl("cv", number = 10),
               tuneLength = 10)
# Training is really fast
# plot model accuracy vs different values of k
plot(model_knn)
# print the best tuning parameter k
model_knn$bestTune 

saveRDS(model_knn, "./C8_ML/project_models/model_knn.rds")

Code to Train Final Random Forest Model:

set.seed(123)
model_rf <- train(classe ~. , data = train.transformed.reduced.1, method = "rf",
                  trControl = trainControl("cv", number = 5),
                  importance = TRUE)
model_rf$bestTune
# model_rf$finalModel 
# Variable importance
# importance(model_rf$finalModel)
saveRDS(model_rf, "./C8_ML/project_models/model_rf.rds")

I previously spent some time playing with a bunch of models and their tuning parameters. The code above corresponds to my final models, but I ran it in separate files. Here I load the final models to make the predictions:

knn_model <- readRDS("./project_models/model_knn.rds")
rf_model <- readRDS("./project_models/model_rf.rds")

Models Testing

Here I compute just the accuracy of each model, for comparison:

# make predictions on the test data
predicted.classes <- knn_model %>% predict(test.transformed.reduced.1)
# compute model accuracy rate
knn_acc <- mean(predicted.classes == test.transformed.reduced.1$classe) 

The accuracy for the KNN classifier is 0.957859.

# make predictions 
predicted.classes <- rf_model %>% predict(test.transformed.reduced.1)
#compute model accuracy
rf_acc <- mean(predicted.classes == test.transformed.reduced.1$classe)

The accuracy for the random forest model is 0.9942226.
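
Accuracy alone hides where a model confuses one class with another; caret also provides confusionMatrix() for per-class statistics. A minimal sketch for the random forest predictions (not run here):

predicted.classes <- rf_model %>% predict(test.transformed.reduced.1)
confusionMatrix(predicted.classes, test.transformed.reduced.1$classe)  # confusion table plus per-class sensitivity/specificity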

Validation

For the final validation, we apply the machine learning algorithms to the 20 test cases available in the validation data.

# make predictions on the validation data
classes.knn <- knn_model %>% predict(valid.transformed)
classes.knn
##  [1] B A A A A C D B A A D C B A E E A B B B
## Levels: A B C D E
classes.rf <- rf_model %>% predict(valid.transformed)
classes.rf
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
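
The two sets of predictions disagree on a few of the 20 cases; a quick way to locate the disagreements:

which(classes.knn != classes.rf)   # indices where KNN and random forest predict different classes
# from the printed vectors above, these are cases 3, 6 and 11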

Conclusions

The random forest model obtained 100% accuracy in the final quiz. The KNN model also achieved a high accuracy, but it made 3 errors in the final quiz, which corresponds to 85% accuracy.