classe variable) based on
sensor data.Six young health participants were asked to perform one set
of 10 repetitions of the Unilateral Dumbbell Biceps Curl in five
different fashions:
You can find more information about the study
training <- read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv", na.strings = c("", NA))
testing <- read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv", na.strings = c("", NA))
library(DataExplorer)
library(dplyr)
library(randomForest)
library(caret)
library(ggplot2)
library(gridExtra)
library(reshape2)
library(rpart)
library(rpart.plot)
library(skimr)
library(corrplot)
library(caretEnsemble)
library(pryr)
library(gbm3)
library(glmnet)
introduce(training)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 19622 160 37 123 0
## total_missing_values complete_rows total_observations memory_usage
## 1 1921600 406 3139520 22810544
dim(testing)
## [1] 20 160
colnames(training)
## [1] "X" "user_name"
## [3] "raw_timestamp_part_1" "raw_timestamp_part_2"
## [5] "cvtd_timestamp" "new_window"
## [7] "num_window" "roll_belt"
## [9] "pitch_belt" "yaw_belt"
## [11] "total_accel_belt" "kurtosis_roll_belt"
## [13] "kurtosis_picth_belt" "kurtosis_yaw_belt"
## [15] "skewness_roll_belt" "skewness_roll_belt.1"
## [17] "skewness_yaw_belt" "max_roll_belt"
## [19] "max_picth_belt" "max_yaw_belt"
## [21] "min_roll_belt" "min_pitch_belt"
## [23] "min_yaw_belt" "amplitude_roll_belt"
## [25] "amplitude_pitch_belt" "amplitude_yaw_belt"
## [27] "var_total_accel_belt" "avg_roll_belt"
## [29] "stddev_roll_belt" "var_roll_belt"
## [31] "avg_pitch_belt" "stddev_pitch_belt"
## [33] "var_pitch_belt" "avg_yaw_belt"
## [35] "stddev_yaw_belt" "var_yaw_belt"
## [37] "gyros_belt_x" "gyros_belt_y"
## [39] "gyros_belt_z" "accel_belt_x"
## [41] "accel_belt_y" "accel_belt_z"
## [43] "magnet_belt_x" "magnet_belt_y"
## [45] "magnet_belt_z" "roll_arm"
## [47] "pitch_arm" "yaw_arm"
## [49] "total_accel_arm" "var_accel_arm"
## [51] "avg_roll_arm" "stddev_roll_arm"
## [53] "var_roll_arm" "avg_pitch_arm"
## [55] "stddev_pitch_arm" "var_pitch_arm"
## [57] "avg_yaw_arm" "stddev_yaw_arm"
## [59] "var_yaw_arm" "gyros_arm_x"
## [61] "gyros_arm_y" "gyros_arm_z"
## [63] "accel_arm_x" "accel_arm_y"
## [65] "accel_arm_z" "magnet_arm_x"
## [67] "magnet_arm_y" "magnet_arm_z"
## [69] "kurtosis_roll_arm" "kurtosis_picth_arm"
## [71] "kurtosis_yaw_arm" "skewness_roll_arm"
## [73] "skewness_pitch_arm" "skewness_yaw_arm"
## [75] "max_roll_arm" "max_picth_arm"
## [77] "max_yaw_arm" "min_roll_arm"
## [79] "min_pitch_arm" "min_yaw_arm"
## [81] "amplitude_roll_arm" "amplitude_pitch_arm"
## [83] "amplitude_yaw_arm" "roll_dumbbell"
## [85] "pitch_dumbbell" "yaw_dumbbell"
## [87] "kurtosis_roll_dumbbell" "kurtosis_picth_dumbbell"
## [89] "kurtosis_yaw_dumbbell" "skewness_roll_dumbbell"
## [91] "skewness_pitch_dumbbell" "skewness_yaw_dumbbell"
## [93] "max_roll_dumbbell" "max_picth_dumbbell"
## [95] "max_yaw_dumbbell" "min_roll_dumbbell"
## [97] "min_pitch_dumbbell" "min_yaw_dumbbell"
## [99] "amplitude_roll_dumbbell" "amplitude_pitch_dumbbell"
## [101] "amplitude_yaw_dumbbell" "total_accel_dumbbell"
## [103] "var_accel_dumbbell" "avg_roll_dumbbell"
## [105] "stddev_roll_dumbbell" "var_roll_dumbbell"
## [107] "avg_pitch_dumbbell" "stddev_pitch_dumbbell"
## [109] "var_pitch_dumbbell" "avg_yaw_dumbbell"
## [111] "stddev_yaw_dumbbell" "var_yaw_dumbbell"
## [113] "gyros_dumbbell_x" "gyros_dumbbell_y"
## [115] "gyros_dumbbell_z" "accel_dumbbell_x"
## [117] "accel_dumbbell_y" "accel_dumbbell_z"
## [119] "magnet_dumbbell_x" "magnet_dumbbell_y"
## [121] "magnet_dumbbell_z" "roll_forearm"
## [123] "pitch_forearm" "yaw_forearm"
## [125] "kurtosis_roll_forearm" "kurtosis_picth_forearm"
## [127] "kurtosis_yaw_forearm" "skewness_roll_forearm"
## [129] "skewness_pitch_forearm" "skewness_yaw_forearm"
## [131] "max_roll_forearm" "max_picth_forearm"
## [133] "max_yaw_forearm" "min_roll_forearm"
## [135] "min_pitch_forearm" "min_yaw_forearm"
## [137] "amplitude_roll_forearm" "amplitude_pitch_forearm"
## [139] "amplitude_yaw_forearm" "total_accel_forearm"
## [141] "var_accel_forearm" "avg_roll_forearm"
## [143] "stddev_roll_forearm" "var_roll_forearm"
## [145] "avg_pitch_forearm" "stddev_pitch_forearm"
## [147] "var_pitch_forearm" "avg_yaw_forearm"
## [149] "stddev_yaw_forearm" "var_yaw_forearm"
## [151] "gyros_forearm_x" "gyros_forearm_y"
## [153] "gyros_forearm_z" "accel_forearm_x"
## [155] "accel_forearm_y" "accel_forearm_z"
## [157] "magnet_forearm_x" "magnet_forearm_y"
## [159] "magnet_forearm_z" "classe"
The training data contains 19622 observations of 160 variables and the testing data contains 20 observations of 160 variables.
First we omit the metadata including user names, time data and window chunks since we will try to classify rows only based on sensor data. In fact, including the meta data like user names or time introduces significant overfitting. Especially user names are highly decisive on sensor readings. You can see about this details on appendix.
#omit metadata
training2 <- training %>% select(-c(X,user_name,raw_timestamp_part_1,raw_timestamp_part_2,cvtd_timestamp,new_window,num_window))
testing2 <- testing %>% select(-c(X,user_name,raw_timestamp_part_1,raw_timestamp_part_2,cvtd_timestamp,new_window,num_window))
#convert outcome into factor
training2$classe <- as.factor(training2$classe)
# Handle missing values: remove columns with more than 70% missing data
na_props <- training2 %>% summarise(across(everything(), ~ sum(is.na(.)) / length(.)))
high_na_cols <- na_props %>% select(where(~ . > 0.7))
training3 <- training2[, !(colnames(training2) %in% colnames(high_na_cols))]
testing3 <- testing2[, !(colnames(testing2) %in% colnames(high_na_cols))]
set.seed(123)
inTrain <- createDataPartition(y = training3$classe, p = 0.7, list = FALSE)
training4 <- training3[inTrain, ]
validation <- training3[-inTrain, ]
testing4 <- testing3
# a second look using skim function
skim(training4)
| Name | training4 |
| Number of rows | 13737 |
| Number of columns | 53 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 52 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| classe | 0 | 1 | FALSE | 5 | A: 3906, B: 2658, E: 2525, C: 2396 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| roll_belt | 0 | 1 | 64.25 | 62.72 | -28.80 | 1.10 | 113.00 | 123.00 | 162.00 | ▇▁▁▅▅ |
| pitch_belt | 0 | 1 | 0.45 | 22.24 | -55.80 | 1.92 | 5.32 | 15.00 | 60.30 | ▃▁▇▅▁ |
| yaw_belt | 0 | 1 | -11.67 | 94.86 | -180.00 | -88.30 | -13.60 | 11.90 | 179.00 | ▁▇▅▁▃ |
| total_accel_belt | 0 | 1 | 11.29 | 7.75 | 0.00 | 3.00 | 17.00 | 18.00 | 29.00 | ▇▁▂▆▁ |
| gyros_belt_x | 0 | 1 | -0.01 | 0.21 | -1.04 | -0.03 | 0.03 | 0.11 | 2.20 | ▁▇▁▁▁ |
| gyros_belt_y | 0 | 1 | 0.04 | 0.08 | -0.64 | 0.00 | 0.02 | 0.11 | 0.61 | ▁▁▇▃▁ |
| gyros_belt_z | 0 | 1 | -0.13 | 0.24 | -1.46 | -0.20 | -0.10 | -0.02 | 1.61 | ▁▂▇▁▁ |
| accel_belt_x | 0 | 1 | -5.79 | 29.47 | -120.00 | -21.00 | -15.00 | -5.00 | 85.00 | ▁▁▇▁▂ |
| accel_belt_y | 0 | 1 | 30.10 | 28.63 | -69.00 | 3.00 | 33.00 | 61.00 | 164.00 | ▁▇▇▁▁ |
| accel_belt_z | 0 | 1 | -72.38 | 100.40 | -275.00 | -162.00 | -152.00 | 27.00 | 105.00 | ▁▇▁▅▃ |
| magnet_belt_x | 0 | 1 | 55.00 | 63.93 | -52.00 | 8.00 | 34.00 | 58.00 | 485.00 | ▇▁▂▁▁ |
| magnet_belt_y | 0 | 1 | 593.76 | 35.35 | 354.00 | 581.00 | 601.00 | 610.00 | 673.00 | ▁▁▁▇▃ |
| magnet_belt_z | 0 | 1 | -345.41 | 64.17 | -621.00 | -375.00 | -319.00 | -306.00 | 287.00 | ▁▇▁▁▁ |
| roll_arm | 0 | 1 | 17.78 | 73.20 | -178.00 | -32.20 | 0.00 | 77.50 | 180.00 | ▁▃▇▆▂ |
| pitch_arm | 0 | 1 | -4.46 | 30.72 | -88.20 | -26.00 | 0.00 | 11.50 | 88.50 | ▁▅▇▂▁ |
| yaw_arm | 0 | 1 | -0.52 | 71.61 | -180.00 | -43.10 | 0.00 | 46.70 | 180.00 | ▁▃▇▃▂ |
| total_accel_arm | 0 | 1 | 25.53 | 10.45 | 1.00 | 17.00 | 27.00 | 33.00 | 65.00 | ▃▆▇▁▁ |
| gyros_arm_x | 0 | 1 | 0.02 | 2.00 | -6.37 | -1.36 | 0.06 | 1.53 | 4.87 | ▁▃▇▆▂ |
| gyros_arm_y | 0 | 1 | -0.25 | 0.86 | -3.44 | -0.79 | -0.24 | 0.16 | 2.81 | ▁▂▇▂▁ |
| gyros_arm_z | 0 | 1 | 0.27 | 0.55 | -2.33 | -0.08 | 0.23 | 0.72 | 2.66 | ▁▂▇▃▁ |
| accel_arm_x | 0 | 1 | -60.05 | 182.68 | -404.00 | -241.00 | -46.00 | 84.00 | 435.00 | ▇▆▇▅▁ |
| accel_arm_y | 0 | 1 | 32.61 | 109.66 | -318.00 | -54.00 | 14.00 | 139.00 | 308.00 | ▁▃▇▇▂ |
| accel_arm_z | 0 | 1 | -70.84 | 133.98 | -630.00 | -142.00 | -48.00 | 23.00 | 292.00 | ▁▁▅▇▁ |
| magnet_arm_x | 0 | 1 | 192.20 | 444.52 | -584.00 | -301.00 | 290.00 | 640.00 | 782.00 | ▆▃▂▃▇ |
| magnet_arm_y | 0 | 1 | 155.96 | 201.65 | -392.00 | -8.00 | 200.00 | 322.00 | 583.00 | ▁▃▅▇▂ |
| magnet_arm_z | 0 | 1 | 307.10 | 325.84 | -597.00 | 129.00 | 445.00 | 546.00 | 694.00 | ▁▂▂▃▇ |
| roll_dumbbell | 0 | 1 | 23.56 | 70.25 | -153.71 | -19.94 | 48.07 | 67.78 | 153.55 | ▂▂▃▇▂ |
| pitch_dumbbell | 0 | 1 | -10.84 | 37.03 | -149.59 | -40.96 | -20.85 | 17.35 | 149.40 | ▁▆▇▂▁ |
| yaw_dumbbell | 0 | 1 | 2.07 | 82.36 | -150.87 | -77.57 | 0.00 | 79.97 | 154.95 | ▃▇▅▅▆ |
| total_accel_dumbbell | 0 | 1 | 13.65 | 10.24 | 0.00 | 4.00 | 10.00 | 19.00 | 58.00 | ▇▅▃▁▁ |
| gyros_dumbbell_x | 0 | 1 | 0.15 | 1.78 | -204.00 | -0.03 | 0.13 | 0.35 | 2.20 | ▁▁▁▁▇ |
| gyros_dumbbell_y | 0 | 1 | 0.05 | 0.66 | -2.10 | -0.14 | 0.03 | 0.21 | 52.00 | ▇▁▁▁▁ |
| gyros_dumbbell_z | 0 | 1 | -0.12 | 2.72 | -2.38 | -0.31 | -0.13 | 0.03 | 317.00 | ▇▁▁▁▁ |
| accel_dumbbell_x | 0 | 1 | -28.43 | 67.35 | -419.00 | -50.00 | -8.00 | 10.00 | 235.00 | ▁▁▆▇▁ |
| accel_dumbbell_y | 0 | 1 | 51.96 | 80.82 | -189.00 | -9.00 | 40.00 | 110.00 | 315.00 | ▁▇▇▅▁ |
| accel_dumbbell_z | 0 | 1 | -37.52 | 109.28 | -273.00 | -141.00 | 0.00 | 38.00 | 318.00 | ▃▃▇▂▁ |
| magnet_dumbbell_x | 0 | 1 | -328.71 | 340.26 | -643.00 | -535.00 | -480.00 | -307.00 | 584.00 | ▇▂▁▁▂ |
| magnet_dumbbell_y | 0 | 1 | 219.91 | 328.00 | -3600.00 | 231.00 | 310.00 | 390.00 | 633.00 | ▁▁▁▁▇ |
| magnet_dumbbell_z | 0 | 1 | 45.00 | 139.39 | -262.00 | -46.00 | 12.00 | 95.00 | 452.00 | ▁▇▆▂▂ |
| roll_forearm | 0 | 1 | 33.19 | 108.26 | -180.00 | -2.44 | 20.60 | 140.00 | 180.00 | ▃▂▇▂▇ |
| pitch_forearm | 0 | 1 | 10.77 | 28.42 | -72.50 | 0.00 | 9.60 | 28.90 | 89.80 | ▁▁▇▃▁ |
| yaw_forearm | 0 | 1 | 19.03 | 103.90 | -180.00 | -70.50 | 0.00 | 111.00 | 180.00 | ▅▅▇▆▇ |
| total_accel_forearm | 0 | 1 | 34.72 | 10.08 | 0.00 | 29.00 | 36.00 | 41.00 | 108.00 | ▁▇▂▁▁ |
| gyros_forearm_x | 0 | 1 | 0.16 | 0.66 | -22.00 | -0.22 | 0.05 | 0.56 | 3.52 | ▁▁▁▁▇ |
| gyros_forearm_y | 0 | 1 | 0.09 | 3.43 | -7.02 | -1.46 | 0.03 | 1.64 | 311.00 | ▇▁▁▁▁ |
| gyros_forearm_z | 0 | 1 | 0.16 | 2.06 | -5.55 | -0.18 | 0.08 | 0.49 | 231.00 | ▇▁▁▁▁ |
| accel_forearm_x | 0 | 1 | -62.46 | 180.51 | -496.00 | -179.00 | -57.00 | 75.00 | 477.00 | ▂▆▇▅▁ |
| accel_forearm_y | 0 | 1 | 163.12 | 200.28 | -595.00 | 55.00 | 201.00 | 312.00 | 923.00 | ▁▃▇▃▁ |
| accel_forearm_z | 0 | 1 | -56.51 | 138.45 | -446.00 | -182.00 | -41.00 | 26.00 | 287.00 | ▁▇▅▅▃ |
| magnet_forearm_x | 0 | 1 | -314.03 | 346.44 | -1280.00 | -617.00 | -381.00 | -77.00 | 672.00 | ▁▇▆▅▁ |
| magnet_forearm_y | 0 | 1 | 377.48 | 507.68 | -892.00 | 4.00 | 586.00 | 735.00 | 1480.00 | ▂▂▂▇▁ |
| magnet_forearm_z | 0 | 1 | 391.60 | 368.89 | -966.00 | 186.00 | 509.00 | 651.00 | 1070.00 | ▁▁▃▇▃ |
#check correlations
numeric_features <- training4 %>% select_if(is.numeric)
cor_matrix <- cor(numeric_features)
corrplot(cor_matrix,
tl.cex = .7, # Reduce text label size
tl.srt = 70, # Rotate the labels by 45 degrees
method = "circle" # Use circles for a clean look
)
#check class imbalance
table(training4$classe)
##
## A B C D E
## 3906 2658 2396 2252 2525
#center and scale training4
preProc_train <- preProcess(training4, method = c("center", "scale"))
#transform the data
training5 <- predict(preProc_train, newdata = training4)
validation2 <- predict(preProc_train, newdata = validation)
testing5 <- predict(preProc_train, newdata = testing4)
#apply pca
pca <- prcomp(training5[, -which(names(training5) %in% c("classe"))])
pca_var <- pca$sdev^2 / sum(pca$sdev^2)
cum_var <- cumsum(pca_var)
#cumulative explained variance vs. number of principal components
plot(cum_var, xlab = "Number of Principal Components",
ylab = "Cumulative Proportion of Variance Explained",
type = "b", col = "blue", pch = 19,
main = "Cumulative Variance Explained by Principal Components")
abline(h = 0.9, col = "red", lty = 2)
pca_transformed_data <- pca$x[, 1:18]
#transform the data using principal components by 90% cutoff on explained variance
training6 <- data.frame(pca_transformed_data, classe = training5$classe)
validation3 <- data.frame(predict(pca, newdata = validation2)[, 1:18],classe = validation2$classe)
testing6 <- predict(pca, newdata = testing5)[, 1:18]
We will define a trainControl function control_base
because we will plan to stack models in the end.
First we train a multinomial logistic regression model.
control_base <- trainControl(method = "cv", number = 5, savePredictions = "final", classProbs = TRUE)
Rprof("memory_profile.out", memory.profiling = TRUE)
model_glm <- train(classe ~ .,
data = training6,
method = "multinom",
trControl = control_base,
verbose = FALSE)
Rprof(NULL)
time_glm1 <- summaryRprof("memory_profile.out", memory = "both")$by.total
memory_glm1 <- summaryRprof("memory_profile.out", memory = "both")$memory
tm_glm <- sapply(list(time_glm1$total.time,time_glm1$mem.total),max)
model_glm_preds <- predict(model_glm,validation3)
conf_matrix <- confusionMatrix(model_glm_preds, validation3$classe)
model_glm$results
## decay Accuracy Kappa AccuracySD KappaSD
## 1 0e+00 0.5145256 0.3815310 0.008621836 0.01095427
## 2 1e-04 0.5145256 0.3815310 0.008621836 0.01095427
## 3 1e-01 0.5145256 0.3815327 0.008621836 0.01095683
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.4968564
model_glmtune_grid <- expand.grid(decay = seq(0, 0.1, by = 0.01))
Rprof("memory_profile.out", memory.profiling = TRUE)
model_glm_tuned <- train(
classe ~ .,
data = training6,
method = "multinom",
trControl = control_base,
tuneGrid = tune_grid,
maxit = 200,
verbose = FALSE
)
Rprof(NULL)
time_glm_tuned <- summaryRprof("memory_profile.out", memory = "both")$by.total
tm_glm_tuned <- sapply(list(time_glm_tuned$total.time,time_glm_tuned$mem.total),max)
head(model_glm_tuned$results)
## decay Accuracy Kappa AccuracySD KappaSD
## 1 0.00 0.5146675 0.3817817 0.01229979 0.01566126
## 2 0.01 0.5147403 0.3818760 0.01229335 0.01565946
## 3 0.02 0.5147403 0.3818760 0.01229335 0.01565946
## 4 0.03 0.5146675 0.3817817 0.01229979 0.01566126
## 5 0.04 0.5147403 0.3818760 0.01229335 0.01565946
## 6 0.05 0.5146675 0.3817817 0.01229979 0.01566126
model_glm_tuned_preds <- predict(model_glm_tuned,validation3)
conf_matrix <- confusionMatrix(model_glm_tuned_preds, validation3$classe)
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.4968564
control_base
again and 10 different numbers of \(k\).Rprof("memory_profile.out", memory.profiling = TRUE)
model_knn <- train(classe ~ ., data = training6, method = "knn", trControl = control_base, tuneLength = 10)
Rprof(NULL)
time_knn <- summaryRprof("memory_profile.out", memory = "both")$by.total
tm_knn <- sapply(list(time_knn$total.time,time_knn$mem.total),max)
head(model_knn$results)
## k Accuracy Kappa AccuracySD KappaSD
## 1 5 0.9406720 0.9249422 0.005791014 0.007312515
## 2 7 0.9240012 0.9038475 0.005780462 0.007307526
## 3 9 0.9109709 0.8873732 0.007378862 0.009360324
## 4 11 0.8986685 0.8717871 0.005768835 0.007342700
## 5 13 0.8881855 0.8585090 0.004984859 0.006333097
## 6 15 0.8764658 0.8436722 0.004374847 0.005573689
model_knn_preds <- predict(model_knn,validation3)
conf_matrix <- confusionMatrix(model_knn_preds, validation3$classe)
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.9502124
tree_tune_grid <- expand.grid(cp = seq(0.001, 0.05, by = 0.001))
Rprof("memory_profile.out", memory.profiling = TRUE)
model_tree <- train(classe ~ ., data = training6,
method = "rpart",
trControl = control_base,
tuneGrid = tree_tune_grid)
Rprof(NULL)
time_tree <- summaryRprof("memory_profile.out", memory = "both")$by.total
tm_tree <- sapply(list(time_tree$total.time,time_tree$mem.total),max)
model_tree_preds <- predict(model_tree,validation3)
conf_matrix <- confusionMatrix(model_tree_preds, validation3$classe)
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.7231946
Rprof("memory_profile.out", memory.profiling = TRUE)
model_rf <- train(classe ~ ., data = training6, method = "rf",trControl = control_base, importance = TRUE)
Rprof(NULL)
time_rf <- summaryRprof("memory_profile.out", memory = "both")$by.total
tm_rf <- sapply(list(time_rf$total.time,time_rf$mem.total),max)
model_rf_preds <- predict(model_rf$finalModel,validation3)
conf_matrix <- confusionMatrix(model_rf_preds, validation3$classe)
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.9712829
Rprof("memory_profile.out", memory.profiling = TRUE)
model_boost <- train(
classe ~ .,
method = "gbm",
data = training6,
trControl = control_base,
verbose = FALSE, # Prevents printing during training
tuneLength = 10 # This will try 10 different hyperparameter combinations
)
Rprof(NULL)
time_boost <- summaryRprof("memory_profile.out", memory = "both")$by.total
tm_boost <- sapply(list(time_boost$total.time,time_boost$mem.total),max)
model_boost_preds <- predict(model_boost,validation3)
conf_matrix <- confusionMatrix(model_boost_preds, validation3$classe)
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.9604078
# Model Stacking with Profiling
base_learners <- list(
rpart = model_tree, # Using the caret model directly
rf = model_rf, # Full caret model
knn = model_knn, # Full caret model
glm = model_glm_tuned, # Logistic regression tuned model
gbm = model_boost # Boosted model
)
# Control for stacking models
stack_control <- trainControl(method = "cv", number = 5, savePredictions = "final", classProbs = TRUE)
# Profiling the stacking process
Rprof("memory_profile_stack.out", memory.profiling = TRUE)
# Stacking the models
model_stack <- caretStack(
base_learners,
method = "glmnet", # Use glmnet for model stacking
metric = "Accuracy", # Metric to optimize
trControl = stack_control
)
Rprof(NULL)
# Profiling results
time_stack <- summaryRprof("memory_profile_stack.out", memory = "both")$by.total
tm_stack <- sapply(list(time_stack$total.time, time_stack$mem.total), max)
summary(model_stack)
## The following models were ensembled: rpart, rf, knn, glm, gbm
##
## Model Importance:
## rpart_B rpart_C rpart_D rpart_E rf_B rf_C rf_D rf_E knn_B knn_C
## 0.0055 0.0031 0.0033 0.0008 0.2975 0.1697 0.1926 0.1654 0.0063 0.0176
## knn_D knn_E glm_B glm_C glm_D glm_E gbm_B gbm_C gbm_D gbm_E
## 0.0253 0.0298 0.0065 0.0017 0.0019 0.0029 0.0136 0.0340 0.0122 0.0104
##
## Model accuracy:
## model_name metric value sd
## <char> <char> <num> <num>
## 1: ensemble Accuracy 0.9708828 0.003681060
## 2: rpart Accuracy 0.7328383 0.003144346
## 3: rf Accuracy 0.9657134 0.004275948
## 4: knn Accuracy 0.9406720 0.005791014
## 5: multinom Accuracy 0.5147403 0.012293350
## 6: gbm Accuracy 0.9501347 0.002287768
model_stack_preds <- predict(model_stack, newdata = validation3)
model_stack_preds <- apply(model_stack_preds, 1, function(x) colnames(model_stack_preds)[which.max(x)])
model_stack_preds <- as.factor(model_stack_preds)
validation_actual <- as.factor(validation3$classe)
conf_matrix <- confusionMatrix(model_stack_preds, validation_actual)
conf_matrix$overall["Accuracy"]
## Accuracy
## 0.9785896
models <- list(
glm = model_glm,
glm_tuned = model_glm_tuned,
knn = model_knn,
tree = model_tree,
rf = model_rf,
boost = model_boost,
stack = model_stack
)
# Initialize a list to store predictions
predictions <- list()
# Generate predictions for each model
for (model_name in names(models)) {
if (model_name == "stack") {
# For the stacked model, predictions may be probabilities
pred_probs <- predict(models[[model_name]], newdata = validation3)
# Convert probabilities to class labels
preds <- apply(pred_probs, 1, function(x) colnames(pred_probs)[which.max(x)])
predictions[[model_name]] <- as.factor(preds)
} else {
# For other models, directly predict class labels
predictions[[model_name]] <- predict(models[[model_name]], newdata = validation3)
}
}
# Initialize a data frame to store performance metrics
performance_metrics <- data.frame(
Model = character(),
Class = character(),
Sensitivity = numeric(),
Specificity = numeric(),
Pos_Pred_Value = numeric(),
Neg_Pred_Value = numeric(),
Prevalence = numeric(),
Detection_Rate = numeric(),
Detection_Prevalence = numeric(),
Balanced_Accuracy = numeric(),
stringsAsFactors = FALSE
)
# Loop over each model to extract per-class metrics
for (model_name in names(predictions)) {
cm <- confusionMatrix(predictions[[model_name]], validation3$classe)
# Extract the per-class metrics
byClass <- as.data.frame(cm$byClass)
# Since in multiclass, the row names might be like "Class: A", extract the class labels
byClass$Class <- rownames(byClass)
byClass$Class <- sub("Class: ", "", byClass$Class)
# Add the model name
byClass$Model <- model_name
# Reorder columns
byClass <- byClass[, c("Model", "Class", names(byClass)[1:(ncol(byClass)-2)])]
# Append to the performance_metrics data frame
performance_metrics <- rbind(performance_metrics, byClass)
}
# Rename columns for clarity
colnames(performance_metrics) <- c(
"Model", "Class", "Sensitivity", "Specificity",
"Pos_Pred_Value", "Neg_Pred_Value", "Precision",
"Recall", "F1", "Prevalence", "Detection_Rate",
"Detection_Prevalence", "Balanced_Accuracy"
)
performance_metrics <- performance_metrics %>% select(-Recall) %>% arrange(desc(Balanced_Accuracy))
profiling_data <- data.frame(
Model = c("glm", "glm_tuned", "knn", "tree", "rf", "boost", "stack"),
Training_Time = c(tm_glm[1], tm_glm_tuned[1], tm_knn[1],
tm_tree[1], tm_rf[1], tm_boost[1], tm_stack[1]),
Memory_Usage = c(tm_glm[2], tm_glm_tuned[2], tm_knn[2],
tm_tree[2], tm_rf[2], tm_boost[2], tm_stack[2])
)
profiling_data <- profiling_data %>% arrange(desc(Memory_Usage),desc(Training_Time)) %>% mutate(Training_Time = round(Training_Time/60,3))
rownames(performance_metrics) <- 1:35
head(performance_metrics,10)
## Model Class Sensitivity Specificity Pos_Pred_Value Neg_Pred_Value Precision
## 1 stack E 0.9889094 0.9977098 0.9898242 0.9975021 0.9898242
## 2 stack A 0.9916368 0.9938257 0.9845789 0.9966659 0.9845789
## 3 rf A 0.9898447 0.9919259 0.9798936 0.9959466 0.9798936
## 4 boost A 0.9880526 0.9905011 0.9763872 0.9952279 0.9763872
## 5 rf E 0.9759704 0.9983344 0.9924812 0.9946069 0.9924812
## 6 knn A 0.9826762 0.9871764 0.9682166 0.9930721 0.9682166
## 7 stack B 0.9727831 0.9949431 0.9787986 0.9934778 0.9787986
## 8 stack C 0.9707602 0.9892982 0.9503817 0.9937978 0.9503817
## 9 boost E 0.9611830 0.9966687 0.9848485 0.9913025 0.9848485
## 10 stack D 0.9595436 0.9973583 0.9861407 0.9921164 0.9861407
## F1 Prevalence Detection_Rate Detection_Prevalence Balanced_Accuracy
## 1 0.9893666 0.1838573 0.1818182 0.1836873 0.9933096
## 2 0.9880952 0.2844520 0.2820731 0.2864911 0.9927312
## 3 0.9848440 0.2844520 0.2815633 0.2873407 0.9908853
## 4 0.9821853 0.2844520 0.2810535 0.2878505 0.9892768
## 5 0.9841566 0.1838573 0.1794393 0.1807986 0.9871524
## 6 0.9753928 0.2844520 0.2795242 0.2887001 0.9849263
## 7 0.9757816 0.1935429 0.1882753 0.1923534 0.9838631
## 8 0.9604629 0.1743415 0.1692438 0.1780799 0.9800292
## 9 0.9728718 0.1838573 0.1767205 0.1794393 0.9789259
## 10 0.9726604 0.1638063 0.1571793 0.1593883 0.9784509
profiling_data
## Model Training_Time Memory_Usage
## 1 rf 6.001 17746.9
## 2 boost 38.109 16900.4
## 3 tree 0.137 4141.1
## 4 stack 0.721 3347.8
## 5 knn 1.052 2349.2
## 6 glm_tuned 1.935 2207.5
## 7 glm 0.631 758.3