El objetivo es predecir la forma en que se realizó el ejercicio
(variable classe) usando mediciones de
acelerómetros. A continuación describo: adquisición y limpieza de datos,
partición y validación, modelos entrenados, evaluación fuera de muestra
y predicciones finales sobre los 20 casos de prueba.
# Packages
# Attach the modelling and data-handling packages used by this report.
# The whole block is wrapped so package start-up banners are not printed.
suppressPackageStartupMessages({
library(caret)
library(randomForest)
library(gbm)
library(dplyr)
library(lubridate)
# Avoid failures in case `multiClassSummary` requires MLmetrics.
# NOTE(review): the trainControl() call in this file uses defaultSummary, so
# MLmetrics looks optional here - confirm before keeping an install step.
if (!requireNamespace("MLmetrics", quietly = TRUE)) {
message("Instalando MLmetrics (necesario si usas multiClassSummary)...")
# Best-effort install: try(..., silent = TRUE) hides any installation error.
try(utils::install.packages("MLmetrics"), silent = TRUE)
}
})
# Official course download locations: labelled training data and the
# 20 unlabelled test cases.
url_train <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
url_test <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Read both files, treating the literal "NA", empty fields and the
# "#DIV/0!" marker as missing values.
train_raw <- read.csv(
  file = url_train,
  na.strings = c("NA", "", "#DIV/0!")
)
test_raw <- read.csv(
  file = url_test,
  na.strings = c("NA", "", "#DIV/0!")
)

# Rows x columns of each raw table.
dim(train_raw)
dim(test_raw)
## [1] 19622 160
## [1] 20 160
Reglas aplicadas:
- `classe` se usa como factor objetivo.
# 1) Columnas sin NA en train
# Keep only the columns that have no missing value in the training data.
noNA <- colSums(is.na(train_raw)) == 0
train_noNA <- train_raw[, noNA]

# Remove identifier / timestamp / window columns.
id_cols <- c("X", "user_name", "raw_timestamp_part_1", "raw_timestamp_part_2",
             "cvtd_timestamp", "new_window", "num_window")
keep_cols <- setdiff(names(train_noNA), id_cols)
train_cln <- train_noNA[, keep_cols]

# Make sure the outcome is a factor.
train_cln$classe <- factor(train_cln$classe)

# Predictor columns shared by the cleaned training data and the 20-case set
# (the outcome `classe` is excluded).
common_cols <- intersect(names(train_cln), names(test_raw))
common_cols <- setdiff(common_cols, c("classe"))

# Fail early if a training predictor is absent from the 20-case set.
stopifnot(
  "test_raw is missing predictors used for training" =
    all(setdiff(names(train_cln), "classe") %in% names(test_raw))
)

# Restrict the 20-case set to exactly the shared predictors. The previous
# expression appended every remaining column back, so it only reordered
# test_raw and filtered nothing.
test_cln <- test_raw[, common_cols, drop = FALSE]
Usaré una partición 70/30 para estimar el error fuera de muestra, y validación cruzada de 5 pliegues sobre el conjunto de entrenamiento para seleccionar los hiperparámetros.
# Reproducible 70/30 split of the cleaned training data on the outcome.
set.seed(12345)
idx <- createDataPartition(y = train_cln$classe, p = 0.7, list = FALSE)
valid_set <- train_cln[-idx, ]
train_set <- train_cln[idx, ]

# Resampling settings for train(): 5-fold cross-validation, one repeat.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 1,
  summaryFunction = defaultSummary,
  classProbs = TRUE
)
# Random forest tuned through caret on the 70% training split.
# The seed is reset right before train() so the fit is reproducible.
set.seed(12345)
rf_fit <- train(classe ~ ., data = train_set,
method = "rf",
trControl = ctrl,
# tuneLength = 3: the printed results list three candidate mtry values.
tuneLength = 3,
# NOTE(review): ntree is presumably forwarded to randomForest() - confirm.
ntree = 500,
metric = "Accuracy")
# Print the resampling summary and the selected mtry.
rf_fit
## Random Forest
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times)
## Summary of sample sizes: 10990, 10990, 10989, 10991, 10988
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9920649 0.9899614
## 27 0.9919194 0.9897776
## 52 0.9836935 0.9793717
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Score the held-out validation split with the tuned random forest and
# cross-tabulate predicted against reference classes.
rf_pred_valid <- predict(rf_fit, newdata = valid_set)
rf_cm <- confusionMatrix(
  data = rf_pred_valid,
  reference = valid_set$classe
)
rf_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1673 5 0 0 0
## B 1 1134 5 0 0
## C 0 0 1021 20 0
## D 0 0 0 943 0
## E 0 0 0 1 1082
##
## Overall Statistics
##
## Accuracy : 0.9946
## 95% CI : (0.9923, 0.9963)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9931
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9994 0.9956 0.9951 0.9782 1.0000
## Specificity 0.9988 0.9987 0.9959 1.0000 0.9998
## Pos Pred Value 0.9970 0.9947 0.9808 1.0000 0.9991
## Neg Pred Value 0.9998 0.9989 0.9990 0.9958 1.0000
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2843 0.1927 0.1735 0.1602 0.1839
## Detection Prevalence 0.2851 0.1937 0.1769 0.1602 0.1840
## Balanced Accuracy 0.9991 0.9972 0.9955 0.9891 0.9999
# Validation accuracy of the random forest (named numeric, name "Accuracy").
rf_acc <- rf_cm$overall["Accuracy"]
# Out-of-sample error estimate = 1 - accuracy, stored without the name.
rf_oose <- 1 - rf_acc[["Accuracy"]]
rf_oose
## [1] 0.005437553
# Gradient boosting (gbm) tuned through caret on the same training split and
# with the same resampling settings as the random forest, for comparison.
set.seed(12345)
gbm_fit <- train(classe ~ ., data = train_set,
method = "gbm",
trControl = ctrl,
# NOTE(review): presumably forwarded to gbm to silence its log - confirm.
verbose = FALSE,
# tuneLength = 5: the printed grid covers interaction.depth 1-5 and
# n.trees 50-250.
tuneLength = 5,
metric = "Accuracy")
# Print the resampling summary and the selected tuning values.
gbm_fit
## Stochastic Gradient Boosting
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times)
## Summary of sample sizes: 10990, 10990, 10989, 10991, 10988
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.7489996 0.6815523
## 1 100 0.8201198 0.7723057
## 1 150 0.8557894 0.8175610
## 1 200 0.8765362 0.8437606
## 1 250 0.8913138 0.8624814
## 2 50 0.8564465 0.8181690
## 2 100 0.9067469 0.8820142
## 2 150 0.9312799 0.9130599
## 2 200 0.9470038 0.9329594
## 2 250 0.9578506 0.9466841
## 3 50 0.8990307 0.8722037
## 3 100 0.9426357 0.9274277
## 3 150 0.9611267 0.9508242
## 3 200 0.9707356 0.9629802
## 3 250 0.9774328 0.9714537
## 4 50 0.9238536 0.9036365
## 4 100 0.9595243 0.9487976
## 4 150 0.9729184 0.9657410
## 4 200 0.9799804 0.9746773
## 4 250 0.9844933 0.9803856
## 5 50 0.9378308 0.9213291
## 5 100 0.9673131 0.9586528
## 5 150 0.9799074 0.9745852
## 5 200 0.9847122 0.9806633
## 5 250 0.9877697 0.9845307
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 250, interaction.depth =
## 5, shrinkage = 0.1 and n.minobsinnode = 10.
# GBM validation: score the held-out split and cross-tabulate predicted
# against reference classes.
gbm_pred_valid <- predict(gbm_fit, newdata = valid_set)
gbm_cm <- confusionMatrix(
  data = gbm_pred_valid,
  reference = valid_set$classe
)
gbm_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1673 6 0 0 0
## B 1 1129 9 0 1
## C 0 4 1013 12 0
## D 0 0 4 952 2
## E 0 0 0 0 1079
##
## Overall Statistics
##
## Accuracy : 0.9934
## 95% CI : (0.991, 0.9953)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9916
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9994 0.9912 0.9873 0.9876 0.9972
## Specificity 0.9986 0.9977 0.9967 0.9988 1.0000
## Pos Pred Value 0.9964 0.9904 0.9845 0.9937 1.0000
## Neg Pred Value 0.9998 0.9979 0.9973 0.9976 0.9994
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2843 0.1918 0.1721 0.1618 0.1833
## Detection Prevalence 0.2853 0.1937 0.1749 0.1628 0.1833
## Balanced Accuracy 0.9990 0.9945 0.9920 0.9932 0.9986
# Plot the 20 most important predictors of the tuned random forest.
plot(
  varImp(rf_fit),
  top = 20,
  main = "Importancia de variables – RF (Top 20)"
)
El estimador práctico del error fuera de muestra para el modelo final se toma como el error en el conjunto de validación (30%):
# Out-of-sample error estimate: 1 - validation accuracy of the random forest.
rf_oose
## [1] 0.005437553
Conclusión: Usaremos Random Forest como modelo final, por tener la mejor exactitud en validación (0.9946 frente a 0.9934 de GBM). GBM se reporta sólo como comparación.
Entrenamos nuevamente con todo
train_cln para aprovechar más datos al generar las
predicciones finales.
# Refit the random forest on the full cleaned training data (train_cln).
set.seed(12345)
rf_final <- train(classe ~ ., data = train_cln,
method = "rf",
# NOTE(review): with trControl = ctrl the 5-fold CV still runs for the single
# candidate; the accuracy printed below is that cross-validated estimate.
trControl = ctrl,
# Reuse the tuning value selected by rf_fit; the printed result reports
# mtry held constant at 2.
tuneGrid = rf_fit$bestTune,
ntree = 500,
metric = "Accuracy")
# Print the resampling summary of the final model.
rf_final
## Random Forest
##
## 19622 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times)
## Summary of sample sizes: 15698, 15698, 15697, 15698, 15697
## Resampling results:
##
## Accuracy Kappa
## 0.9941392 0.992586
##
## Tuning parameter 'mtry' was held constant at a value of 2
# Predict the class of the 20 test cases with the final random forest.
final_pred <- predict(rf_final, newdata = test_cln)
final_pred
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Sanity check: one prediction per test case (20 expected).
length(final_pred)
## [1] 20
# Write one answer file per prediction: problem_id_<i>.txt holds the i-th
# element of `x`.
#
# Args:
#   x:   vector (or factor) of predictions; element i goes to file i.
#   dir: directory that receives the files. Defaults to "." (the current
#        working directory), which matches the previous behaviour.
#
# Returns: invisibly, the character vector of file paths written
#          (zero-length when `x` is empty).
pml_write_files <- function(x, dir = ".") {
  # sprintf() yields character(0) for an empty `x`, so no stray file name is
  # produced when there is nothing to write.
  paths <- file.path(dir, sprintf("problem_id_%d.txt", seq_along(x)))
  for (i in seq_along(x)) {
    write.table(x[i], file = paths[i], quote = FALSE,
                row.names = FALSE, col.names = FALSE)
  }
  invisible(paths)
}
# Write the final predictions to problem_id_<i>.txt files in the current
# working directory, one file per test case.
pml_write_files(final_pred)
- Usar sólo columnas sin NA en `train` evita imputaciones y simplifica el pipeline.
- Error fuera de muestra estimado en `valid_set` (≈ 99.46% de exactitud; error ≈ 0.54%).
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.3.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Santiago
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] lubridate_1.9.4 dplyr_1.1.4 gbm_2.2.2
## [4] randomForest_4.7-1.2 caret_7.0-1 lattice_0.22-7
## [7] ggplot2_3.5.2
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.53 bslib_0.9.0
## [4] recipes_1.3.1 vctrs_0.6.5 tools_4.5.1
## [7] generics_0.1.4 stats4_4.5.1 parallel_4.5.1
## [10] proxy_0.4-27 tibble_3.3.0 pkgconfig_2.0.3
## [13] ModelMetrics_1.2.2.2 Matrix_1.7-3 data.table_1.17.8
## [16] RColorBrewer_1.1-3 lifecycle_1.0.4 compiler_4.5.1
## [19] farver_2.1.2 stringr_1.5.1 codetools_0.2-20
## [22] htmltools_0.5.8.1 class_7.3-23 sass_0.4.10
## [25] yaml_2.3.10 prodlim_2025.04.28 pillar_1.11.0
## [28] jquerylib_0.1.4 MASS_7.3-65 cachem_1.1.0
## [31] gower_1.0.2 iterators_1.0.14 rpart_4.1.24
## [34] foreach_1.5.2 nlme_3.1-168 parallelly_1.45.1
## [37] lava_1.8.1 tidyselect_1.2.1 digest_0.6.37
## [40] stringi_1.8.7 future_1.67.0 reshape2_1.4.4
## [43] purrr_1.1.0 listenv_0.9.1 splines_4.5.1
## [46] fastmap_1.2.0 grid_4.5.1 cli_3.6.5
## [49] magrittr_2.0.3 survival_3.8-3 e1071_1.7-16
## [52] future.apply_1.20.0 withr_3.0.2 scales_1.4.0
## [55] timechange_0.3.0 rmarkdown_2.29 globals_0.18.0
## [58] nnet_7.3-20 timeDate_4041.110 evaluate_1.0.4
## [61] knitr_1.50 hardhat_1.4.2 rlang_1.1.6
## [64] Rcpp_1.1.0 glue_1.8.0 pROC_1.19.0.1
## [67] ipred_0.9-15 rstudioapi_0.17.1 jsonlite_2.0.0
## [70] R6_2.6.1 plyr_1.8.9 MLmetrics_1.1.3