Summary

People regularly quantify how much of a particular activity they do, but they rarely quantify how well they do it. In this project, the goal is to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants.

The goal of your project is to predict the manner in which they did the exercise. This is the “classe” variable in the training set.

Development of the model

Libraries

library(tidyverse)
library(skimr)          ## skim() makes a nice summary of a dataset
library(doParallel)     ## to use the available cores via makePSOCKcluster(<#cores>)
library(caret)
library(corrplot)
library(nnet)           ## multinomial logistic regression
library(randomForest)
library(xgboost)        ## Boosting
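
The doParallel package only becomes useful once a parallel backend is registered before calling train(). A minimal sketch, assuming four available cores (adjust the number to your machine):

cl <- makePSOCKcluster(4)   # create a cluster of 4 worker processes
registerDoParallel(cl)      # register it so caret/foreach can distribute the resampling
# ... run the train() calls below ...
stopCluster(cl)             # release the workers when training is done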

Reading Data

data <- read_csv("./data/training_proj.csv", na = c("", NA), 
                 col_types = cols(kurtosis_roll_belt = col_double(),
                                  kurtosis_picth_belt = col_double(),
                                  kurtosis_yaw_belt = col_double(),
                                  skewness_roll_belt = col_double(),
                                  skewness_roll_belt.1 = col_double(),
                                  skewness_yaw_belt = col_double(),
                                  max_yaw_belt = col_double(),
                                  min_yaw_belt = col_double(),
                                  amplitude_yaw_belt = col_double(),
                                  kurtosis_picth_arm = col_double(),
                                  kurtosis_yaw_arm = col_double(),
                                  skewness_pitch_arm = col_double(),
                                  skewness_yaw_arm = col_double(),
                                  kurtosis_yaw_dumbbell = col_double(),
                                  skewness_yaw_dumbbell = col_double(),
                                  kurtosis_roll_forearm = col_double(),
                                  kurtosis_picth_forearm = col_double(),
                                  kurtosis_yaw_forearm = col_double(),
                                  skewness_roll_forearm = col_double(),
                                  skewness_pitch_forearm = col_double(),
                                  skewness_yaw_forearm = col_double(),
                                  max_yaw_forearm = col_double(),
                                  min_yaw_forearm = col_double(),
                                  amplitude_yaw_forearm = col_double() ) )

data <- data %>% rename( "skewness_pitch_belt" = "skewness_roll_belt.1")

val.data <- read_csv("./data/validation_data.csv", na = c("", NA), 
                 col_types = cols(kurtosis_roll_belt = col_double(),
                                  kurtosis_picth_belt = col_double(),
                                  kurtosis_yaw_belt = col_double(),
                                  skewness_roll_belt = col_double(),
                                  skewness_roll_belt.1 = col_double(),
                                  skewness_yaw_belt = col_double(),
                                  max_yaw_belt = col_double(),
                                  min_yaw_belt = col_double(),
                                  amplitude_yaw_belt = col_double(),
                                  kurtosis_picth_arm = col_double(),
                                  kurtosis_yaw_arm = col_double(),
                                  skewness_pitch_arm = col_double(),
                                  skewness_yaw_arm = col_double(),
                                  kurtosis_yaw_dumbbell = col_double(),
                                  skewness_yaw_dumbbell = col_double(),
                                  kurtosis_roll_forearm = col_double(),
                                  kurtosis_picth_forearm = col_double(),
                                  kurtosis_yaw_forearm = col_double(),
                                  skewness_roll_forearm = col_double(),
                                  skewness_pitch_forearm = col_double(),
                                  skewness_yaw_forearm = col_double(),
                                  max_yaw_forearm = col_double(),
                                  min_yaw_forearm = col_double(),
                                  amplitude_yaw_forearm = col_double() ) )

val.data <- val.data %>% rename( "skewness_pitch_belt" = "skewness_roll_belt.1")
val.data <- val.data %>% mutate("classe" = 1)   # placeholder column so val.data has the same columns as the training data

Tidy Data

skim <- data %>% skim()                                    # Some inspection of the data
rmv <- sapply(data, function(x){ mean(is.na(x)) < 0.95 } ) # TRUE for columns that are not mostly NAs
# keep only those columns and drop variables that are not useful as predictors,
# such as the user name and the time-related columns (the first seven columns)
tidydata <- data[,rmv] %>% select(-c(1:7))
tidydata %>% skim()                                        # Summarizing variables. 
Data summary
Name Piped data
Number of rows 19622
Number of columns 53
_______________________
Column type frequency:
character 1
numeric 52
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
classe 0 1 1 1 0 5 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
roll_belt 0 1 64.41 62.75 -28.90 1.10 113.00 123.00 162.00 ▇▁▁▅▅
pitch_belt 0 1 0.31 22.35 -55.80 1.76 5.28 14.90 60.30 ▃▁▇▅▁
yaw_belt 0 1 -11.21 95.19 -180.00 -88.30 -13.00 12.90 179.00 ▁▇▅▁▃
total_accel_belt 0 1 11.31 7.74 0.00 3.00 17.00 18.00 29.00 ▇▁▂▆▁
gyros_belt_x 0 1 -0.01 0.21 -1.04 -0.03 0.03 0.11 2.22 ▁▇▁▁▁
gyros_belt_y 0 1 0.04 0.08 -0.64 0.00 0.02 0.11 0.64 ▁▁▇▁▁
gyros_belt_z 0 1 -0.13 0.24 -1.46 -0.20 -0.10 -0.02 1.62 ▁▂▇▁▁
accel_belt_x 0 1 -5.59 29.64 -120.00 -21.00 -15.00 -5.00 85.00 ▁▁▇▁▂
accel_belt_y 0 1 30.15 28.58 -69.00 3.00 35.00 61.00 164.00 ▁▇▇▁▁
accel_belt_z 0 1 -72.59 100.45 -275.00 -162.00 -152.00 27.00 105.00 ▁▇▁▅▃
magnet_belt_x 0 1 55.60 64.18 -52.00 9.00 35.00 59.00 485.00 ▇▁▂▁▁
magnet_belt_y 0 1 593.68 35.68 354.00 581.00 601.00 610.00 673.00 ▁▁▁▇▃
magnet_belt_z 0 1 -345.48 65.21 -623.00 -375.00 -320.00 -306.00 293.00 ▁▇▁▁▁
roll_arm 0 1 17.83 72.74 -180.00 -31.78 0.00 77.30 180.00 ▁▃▇▆▂
pitch_arm 0 1 -4.61 30.68 -88.80 -25.90 0.00 11.20 88.50 ▁▅▇▂▁
yaw_arm 0 1 -0.62 71.36 -180.00 -43.10 0.00 45.88 180.00 ▁▃▇▃▂
total_accel_arm 0 1 25.51 10.52 1.00 17.00 27.00 33.00 66.00 ▃▆▇▁▁
gyros_arm_x 0 1 0.04 1.99 -6.37 -1.33 0.08 1.57 4.87 ▁▃▇▆▂
gyros_arm_y 0 1 -0.26 0.85 -3.44 -0.80 -0.24 0.14 2.84 ▁▂▇▂▁
gyros_arm_z 0 1 0.27 0.55 -2.33 -0.07 0.23 0.72 3.02 ▁▂▇▂▁
accel_arm_x 0 1 -60.24 182.04 -404.00 -242.00 -44.00 84.00 437.00 ▇▅▇▅▁
accel_arm_y 0 1 32.60 109.87 -318.00 -54.00 14.00 139.00 308.00 ▁▃▇▆▂
accel_arm_z 0 1 -71.25 134.65 -636.00 -143.00 -47.00 23.00 292.00 ▁▁▅▇▁
magnet_arm_x 0 1 191.72 443.64 -584.00 -300.00 289.00 637.00 782.00 ▆▃▂▃▇
magnet_arm_y 0 1 156.61 201.91 -392.00 -9.00 202.00 323.00 583.00 ▁▅▅▇▂
magnet_arm_z 0 1 306.49 326.62 -597.00 131.25 444.00 545.00 694.00 ▁▂▂▃▇
roll_dumbbell 0 1 23.84 69.93 -153.71 -18.49 48.17 67.61 153.55 ▂▂▃▇▂
pitch_dumbbell 0 1 -10.78 36.99 -149.59 -40.89 -20.96 17.50 149.40 ▁▆▇▂▁
yaw_dumbbell 0 1 1.67 82.52 -150.87 -77.64 -3.32 79.64 154.95 ▃▇▅▅▆
total_accel_dumbbell 0 1 13.72 10.23 0.00 4.00 10.00 19.00 58.00 ▇▅▃▁▁
gyros_dumbbell_x 0 1 0.16 1.51 -204.00 -0.03 0.13 0.35 2.22 ▁▁▁▁▇
gyros_dumbbell_y 0 1 0.05 0.61 -2.10 -0.14 0.03 0.21 52.00 ▇▁▁▁▁
gyros_dumbbell_z 0 1 -0.13 2.29 -2.38 -0.31 -0.13 0.03 317.00 ▇▁▁▁▁
accel_dumbbell_x 0 1 -28.62 67.32 -419.00 -50.00 -8.00 11.00 235.00 ▁▁▆▇▁
accel_dumbbell_y 0 1 52.63 80.75 -189.00 -8.00 41.50 111.00 315.00 ▁▇▇▅▁
accel_dumbbell_z 0 1 -38.32 109.47 -334.00 -142.00 -1.00 38.00 318.00 ▁▆▇▃▁
magnet_dumbbell_x 0 1 -328.48 339.72 -643.00 -535.00 -479.00 -304.00 592.00 ▇▂▁▁▂
magnet_dumbbell_y 0 1 220.97 326.87 -3600.00 231.00 311.00 390.00 633.00 ▁▁▁▁▇
magnet_dumbbell_z 0 1 46.05 139.96 -262.00 -45.00 13.00 95.00 452.00 ▁▇▆▂▂
roll_forearm 0 1 33.83 108.04 -180.00 -0.74 21.70 140.00 180.00 ▃▂▇▂▇
pitch_forearm 0 1 10.71 28.15 -72.50 0.00 9.24 28.40 89.80 ▁▁▇▃▁
yaw_forearm 0 1 19.21 103.22 -180.00 -68.60 0.00 110.00 180.00 ▅▅▇▆▇
total_accel_forearm 0 1 34.72 10.06 0.00 29.00 36.00 41.00 108.00 ▁▇▂▁▁
gyros_forearm_x 0 1 0.16 0.65 -22.00 -0.22 0.05 0.56 3.97 ▁▁▁▁▇
gyros_forearm_y 0 1 0.08 3.10 -7.02 -1.46 0.03 1.62 311.00 ▇▁▁▁▁
gyros_forearm_z 0 1 0.15 1.75 -8.09 -0.18 0.08 0.49 231.00 ▇▁▁▁▁
accel_forearm_x 0 1 -61.65 180.59 -498.00 -178.00 -57.00 76.00 477.00 ▂▆▇▅▁
accel_forearm_y 0 1 163.66 200.13 -632.00 57.00 201.00 312.00 923.00 ▁▂▇▅▁
accel_forearm_z 0 1 -55.29 138.40 -446.00 -182.00 -39.00 26.00 291.00 ▁▇▅▅▃
magnet_forearm_x 0 1 -312.58 346.96 -1280.00 -616.00 -378.00 -73.00 672.00 ▁▇▇▅▁
magnet_forearm_y 0 1 380.12 509.37 -896.00 2.00 591.00 737.00 1480.00 ▂▂▂▇▁
magnet_forearm_z 0 1 393.61 369.27 -973.00 191.00 511.00 653.00 1090.00 ▁▁▂▇▃
tidydata$classe <- as.factor(tidydata$classe)         # outcome as a factor
val.data <- val.data %>% select(colnames(tidydata))   # keep the same columns as the tidy training data

Finally, I have a tidy dataset made up of 52 predictor variables and one outcome (the categorical variable classe).

Data transformation

set.seed(123)

training.samples <- tidydata$classe %>% createDataPartition(p=0.7, list = FALSE) 
train.data <- tidydata[training.samples,]
test.data <- tidydata[-training.samples,]

# estimate preprocessing parameters
preproc.param <- train.data %>% preProcess(method = c("center","scale")) 
# Transform the data using the estimated parameters
train.transformed <- preproc.param %>% predict(train.data)
test.transformed <- preproc.param %>% predict(test.data)
valid.transformed <- preproc.param %>% predict(val.data)

Exploratory Analysis

A convenient first step in exploring the data set is to compute the correlation matrix. It can give clues about which predictors are highly correlated and could therefore be removed, and it can also suggest a strategy for dimension reduction.

cor_mt <- cor(tidydata[,-53])                           # Correlation matrix
corrplot(cor_mt, order = "FPC", method = "color", type = "lower", 
         tl.cex = 0.8, tl.col = rgb(0, 0, 0))           

The plot above shows that a large number of predictors are highly correlated.

Selecting Predictors

The data set contains 52 predictors and many of them are highly correlated, so it is convenient to apply a methodology to reduce its dimensionality. A simple and, in many cases, effective tool for dimension reduction is the correlation method, which simply removes predictors that are highly correlated (see Applied Predictive Modeling, Max Kuhn and Kjell Johnson).

The algorithm is as follows (a small illustrative sketch in R appears after the list):

  1. Calculate the correlation matrix of the predictors.
  2. Determine the two predictors associated with the largest absolute pairwise correlation (call them predictors A and B).
  3. Determine the average correlation between A and the other variables. Do the same for predictor B.
  4. If A has a larger average correlation, remove it; otherwise, remove predictor B.
  5. Repeat Steps 2–4 until no absolute correlations are above the threshold.
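
As a purely illustrative sketch of these steps (the function name remove_high_cor is hypothetical and is not used later; the findCorrelation function from caret, shown in the next section, does this job):

remove_high_cor <- function(cor_mat, cutoff = 0.75) {
  removed <- character(0)
  repeat {
    abs_cor <- abs(cor_mat)
    diag(abs_cor) <- 0
    if (max(abs_cor) <= cutoff) break                        # Step 5: stop once nothing exceeds the threshold
    # Step 2: pair (A, B) with the largest absolute correlation
    idx <- which(abs_cor == max(abs_cor), arr.ind = TRUE)[1, ]
    a <- idx[1]; b <- idx[2]
    # Step 3: average absolute correlation of A, and of B, with the other variables
    mean_a <- mean(abs_cor[a, -a])
    mean_b <- mean(abs_cor[b, -b])
    # Step 4: remove whichever has the larger average correlation
    drop <- if (mean_a > mean_b) a else b
    removed <- c(removed, colnames(cor_mat)[drop])
    cor_mat <- cor_mat[-drop, -drop, drop = FALSE]
  }
  removed
}
# e.g. remove_high_cor(cor_mt, cutoff = 0.75) should flag roughly the same columns as findCorrelation()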

Correlation method

In the caret package, the findCorrelation function returns the variables to be removed in accordance with the correlation method.

rmv <- findCorrelation(cor_mt, cutoff = 0.75)   # returns the columns to remove
# rmv contains the predictors to remove from the data sets; with a cutoff of 0.75
# the number of predictors goes from 52 to 32.
names <- colnames(cor_mt)
train.transformed.reduced.1 <- train.transformed %>% select(-names[rmv])
test.transformed.reduced.1 <- test.transformed %>% select(-names[rmv])

Training

Code to Train Final KNN Model:

set.seed(123)
model_knn <- train(classe ~., data = train.transformed.reduced.1, method = "knn",
               trControl = trainControl("cv", number = 10),
               tuneLength = 10)
# Training is really fast
# plot model accuracy vs different values of k
plot(model_knn)
# print the best tuning parameter k
model_knn$bestTune 

saveRDS(model_knn, "./C8_ML/project_models/model_knn.rds")

Code to Train Final Random Forest Model:

set.seed(123)
model_rf <- train(classe ~. , data = train.transformed.reduced.1, method = "rf",
                  trControl = trainControl("cv", number = 5),
                  importance = TRUE)
model_rf$bestTune
# model_rf$finalModel 
# Variable importance
# importance(model_rf$finalModel)
saveRDS(model_rf, "./C8_ML/project_models/model_rf.rds")

I previously spent some time playing with a bunch of models and their tuning parameters. The code above corresponds to my final models, but I ran it in separate files. Here I load the final models to make the predictions:

knn_model <- readRDS("./project_models/model_knn.rds")
rf_model <- readRDS("./project_models/model_rf.rds")

Models Testing

Here I compute just the accuracy of each model, for comparison:

# make predictions on the test data
predicted.classes <- knn_model %>% predict(test.transformed.reduced.1)
# compute model accuracy rate
knn_acc <- mean(predicted.classes == test.transformed.reduced.1$classe) 

The accuracy for the KNN classifier is 0.957859.

# make predictions 
predicted.classes <- rf_model %>% predict(test.transformed.reduced.1)
#compute model accuracy
rf_acc <- mean(predicted.classes == test.transformed.reduced.1$classe)

The accuracy for the random forest model is 0.9942226.
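
Accuracy alone hides where a model confuses one class with another; caret also provides confusionMatrix() for per-class statistics. A minimal sketch for the random forest predictions (not run here):

predicted.classes <- rf_model %>% predict(test.transformed.reduced.1)
confusionMatrix(predicted.classes, test.transformed.reduced.1$classe)  # confusion table plus per-class sensitivity/specificity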

Validation

For the final validation, we apply the machine learning algorithms to the 20 test cases available in the validation data.

# make predictions on the validation data
classes.knn <- knn_model %>% predict(valid.transformed)
classes.knn
##  [1] B A A A A C D B A A D C B A E E A B B B
## Levels: A B C D E
classes.rf <- rf_model %>% predict(valid.transformed)
classes.rf
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
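
The two sets of predictions disagree on a few of the 20 cases; a quick way to locate the disagreements:

which(classes.knn != classes.rf)   # indices where KNN and random forest predict different classes
# from the printed vectors above, these are cases 3, 6 and 11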

Conclusions

The random forest model obtained 100% accuracy in the final quiz. The KNN model also achieved a high accuracy, but it made 3 errors in the final quiz, which corresponds to 85% accuracy.