# Set working directory (optional)
# The path below exists only on the original author's machine, and an
# unguarded setwd() aborts the script everywhere else. The data used in this
# script are read from URLs, so the directory is switched only when present.
project_dir <- "C:/Users/maria/OneDrive/Escritorio/R/Hopkins/Practical machine learning"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Cargando paquete requerido: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Cargando paquete requerido: lattice
## Warning: package 'lattice' was built under R version 4.4.2
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Adjuntando el paquete: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
# ---- Load and clean the data ----
# Locations of the training and test CSV files.
train_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Besides "NA", treat empty cells and the literal "#DIV/0!" token as missing.
na_tokens <- c("NA", "", "#DIV/0!")
train_data <- read.csv(train_url, na.strings = na_tokens)
test_data <- read.csv(test_url, na.strings = na_tokens)

# Keep only the columns without a single missing value.
# drop = FALSE guarantees a data frame even if only one column survives.
train_data <- train_data[, colSums(is.na(train_data)) == 0, drop = FALSE]
test_data <- test_data[, colSums(is.na(test_data)) == 0, drop = FALSE]

# Remove the first seven columns from both sets; they are not used as
# predictors in this analysis.
train_data <- train_data[, -(1:7), drop = FALSE]
test_data <- test_data[, -(1:7), drop = FALSE]

# The outcome must be a factor so the model is fit as a classifier.
train_data$classe <- as.factor(train_data$classe)

# The two sets were filtered independently, so confirm that every training
# predictor is still present in the test set. Failing here is much cheaper
# than discovering the mismatch at predict() time, after the model is trained.
missing_predictors <- setdiff(
  setdiff(names(train_data), "classe"),
  names(test_data)
)
if (length(missing_predictors) > 0) {
  stop(
    "Test data is missing predictors used for training: ",
    paste(missing_predictors, collapse = ", "),
    call. = FALSE
  )
}
# ---- Hold out a validation set ----
# Fix the RNG state so the split is repeatable.
set.seed(123)

# Sample 75% of the rows based on the outcome column; list = FALSE returns
# the selected row indices as a matrix rather than a list.
inTrain <- createDataPartition(
  y = train_data[["classe"]],
  p = 0.75,
  list = FALSE
)

# Selected rows are used for fitting; the remaining rows are held out.
training <- train_data[inTrain, ]
validation <- train_data[-inTrain, ]
# ---- Fit the random forest ----
# Re-seed before model fitting so the run is repeatable.
set.seed(123)

# Resampling scheme: 5-fold cross-validation.
fitControl <- trainControl(
  method = "cv",
  number = 5
)

# Model classe on all remaining columns of the training split.
rf_model <- train(
  classe ~ .,
  data = training,
  method = "rf",
  trControl = fitControl
)

# Show the resampling summary for the fitted model.
rf_model
## Random Forest
##
## 14718 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 11775, 11773, 11774, 11774, 11776
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9921186 0.9900297
## 27 0.9918469 0.9896872
## 52 0.9853928 0.9815194
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# ---- Evaluate on the held-out validation split ----
# Predict the class of every validation row with the fitted model.
val_preds <- predict(rf_model, newdata = validation)

# Cross-tabulate predictions against the true labels and compute the
# accompanying accuracy statistics.
conf_matrix <- confusionMatrix(
  data = val_preds,
  reference = validation$classe
)

# Show the confusion matrix and statistics.
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1395 1 0 0 0
## B 0 946 8 0 0
## C 0 2 847 15 0
## D 0 0 0 789 2
## E 0 0 0 0 899
##
## Overall Statistics
##
## Accuracy : 0.9943
## 95% CI : (0.9918, 0.9962)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9928
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 0.9968 0.9906 0.9813 0.9978
## Specificity 0.9997 0.9980 0.9958 0.9995 1.0000
## Pos Pred Value 0.9993 0.9916 0.9803 0.9975 1.0000
## Neg Pred Value 1.0000 0.9992 0.9980 0.9964 0.9995
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2845 0.1929 0.1727 0.1609 0.1833
## Detection Prevalence 0.2847 0.1945 0.1762 0.1613 0.1833
## Balanced Accuracy 0.9999 0.9974 0.9932 0.9904 0.9989
# ---- Predict the test cases ----
# Apply the fitted model to the cleaned test set.
final_predictions <- predict(rf_model, newdata = test_data)

# Show the predicted classes.
final_predictions
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
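# In this project, we:
#   - loaded the training and test data, treating "NA", empty cells and
#     "#DIV/0!" as missing, and dropped every column containing missing
#     values along with the first seven columns;
#   - split the training data 75/25 into training and validation sets;
#   - trained a Random Forest classifier with 5-fold cross-validation
#     (best cross-validated accuracy 0.992 at mtry = 2);
#   - evaluated the model on the validation set (accuracy 0.9943,
#     95% CI 0.9918-0.9962);
#   - predicted the class of the 20 test cases.
#
# Overall, the Random Forest model demonstrated excellent predictive
# performance. The cross-validation approach helped verify that the model's
# accuracy was not simply the result of overfitting. This workflow, from data
# cleaning to validation, illustrates a standard approach to predictive
# modeling in machine learning.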