# Set Working Directory
setwd("C:/Users/maria/OneDrive/Escritorio/R/Hopkins/Practical machine learning")

Load necessary libraries

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.2
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ggplot2)

Load data

train_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_url  <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

train_data <- read.csv(train_url, na.strings=c("NA","","#DIV/0!"))
test_data  <- read.csv(test_url, na.strings=c("NA","","#DIV/0!"))
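
As a quick sanity check (an optional step, not part of the original pipeline), we can confirm that both files loaded with the expected shape; the raw training file should have around 19,622 rows and 160 columns, and the test file 20 rows with the same columns except that classe is replaced by problem_id.

dim(train_data)   # raw training data: rows x 160 columns expected
dim(test_data)    # raw test data: 20 rows x 160 columns expected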

Remove columns containing missing values (NA)

train_data <- train_data[, colSums(is.na(train_data)) == 0]
test_data  <- test_data[,  colSums(is.na(test_data)) == 0]
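
Because the NA filter is applied to each data set independently, it is worth verifying (an optional sketch, assuming the objects above) that both sets still refer to the same predictors; the only expected difference is classe in the training set versus problem_id in the test set.

setdiff(names(train_data), names(test_data))   # should list only "classe"
setdiff(names(test_data), names(train_data))   # should list only "problem_id"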

Remove identifier and timestamp metadata (columns 1:7), which are not useful predictors

train_data <- train_data[, -c(1:7)]
test_data  <- test_data[,  -c(1:7)]
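
A further cleanup step that is sometimes added at this point (not applied above, shown only as a sketch) is caret's near-zero-variance filter; after the NA filter it usually removes nothing, but it is a cheap safeguard.

# Optional: drop near-zero-variance predictors, if any remain
nzv <- nearZeroVar(train_data)
if (length(nzv) > 0) {
  train_data <- train_data[, -nzv]
  test_data  <- test_data[,  -nzv]
}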

MODELING: Convert target variable to factor

train_data$classe <- as.factor(train_data$classe)
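
Before modeling, it is useful to glance at the class balance (an optional check, assuming train_data from above); classe has five levels, A through E.

# Class distribution of the outcome variable
table(train_data$classe)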

Split into training & validation

set.seed(123)
inTrain <- createDataPartition(train_data$classe, p = 0.75, list = FALSE)
training <- train_data[inTrain, ]
validation <- train_data[-inTrain, ]
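
A quick check (optional) that createDataPartition produced the intended 75/25 split; because the partition is stratified on classe, class proportions should be nearly identical in both subsets.

# Row counts and class proportions after the split
nrow(training); nrow(validation)
round(prop.table(table(training$classe)), 3)
round(prop.table(table(validation$classe)), 3)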

Train random forest with cross-validation

set.seed(123)
fitControl <- trainControl(method = "cv", number = 5)

rf_model <- train(classe ~ ., 
                  data = training, 
                  method = "rf", 
                  trControl = fitControl)

rf_model
## Random Forest 
## 
## 14718 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 11775, 11773, 11774, 11774, 11776 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9921186  0.9900297
##   27    0.9918469  0.9896872
##   52    0.9853928  0.9815194
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
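
To see which sensor variables drive the model, caret's varImp can be applied to the fitted object (an optional diagnostic, not part of the original analysis).

# Variable importance for the final random forest (top 20 predictors)
imp <- varImp(rf_model)
plot(imp, top = 20)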

VALIDATION set performance

val_preds <- predict(rf_model, validation)
conf_matrix <- confusionMatrix(val_preds, validation$classe)
conf_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1395    1    0    0    0
##          B    0  946    8    0    0
##          C    0    2  847   15    0
##          D    0    0    0  789    2
##          E    0    0    0    0  899
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9943          
##                  95% CI : (0.9918, 0.9962)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9928          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9968   0.9906   0.9813   0.9978
## Specificity            0.9997   0.9980   0.9958   0.9995   1.0000
## Pos Pred Value         0.9993   0.9916   0.9803   0.9975   1.0000
## Neg Pred Value         1.0000   0.9992   0.9980   0.9964   0.9995
## Prevalence             0.2845   0.1935   0.1743   0.1639   0.1837
## Detection Rate         0.2845   0.1929   0.1727   0.1609   0.1833
## Detection Prevalence   0.2847   0.1945   0.1762   0.1613   0.1833
## Balanced Accuracy      0.9999   0.9974   0.9932   0.9904   0.9989
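
The expected out-of-sample error can be read directly off the validation results; a minimal sketch, assuming conf_matrix from above:

# Out-of-sample error estimate = 1 - validation accuracy (about 0.6% here)
oos_error <- 1 - as.numeric(conf_matrix$overall["Accuracy"])
round(oos_error, 4)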

Test set predictions

final_predictions <- predict(rf_model, test_data)
final_predictions
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
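
For the course submission, each of the 20 predictions is typically written to its own text file; the helper below is a hypothetical sketch (the function name and file-naming scheme are assumptions, not part of the original code).

# Hypothetical helper: write one prediction per problem_id file
pml_write_files <- function(x) {
  for (i in seq_along(x)) {
    write.table(x[i], file = paste0("problem_id_", i, ".txt"),
                quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(as.character(final_predictions))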

CONCLUSION

In this project, we:

  1. Read and cleaned the data by removing columns with excessive missing values and irrelevant metadata.
  2. Split the cleaned training data into a training set (75%) and a validation set (25%) for robust performance estimation.
  3. Built a Random Forest model using 5-fold cross-validation on the training set to tune the model and guard against overfitting.
  4. Evaluated performance on the held-out validation set via a confusion matrix, achieving 99.4% accuracy (95% CI: 0.992–0.996), i.e., an estimated out-of-sample error of about 0.6%.
  5. Predicted final test set classes using the trained model, producing the 20 predictions required for submission.

Overall, the Random Forest model demonstrated excellent predictive performance. The cross-validation approach helped verify that the model’s accuracy was not simply the result of overfitting. This workflow—from data cleaning to validation—illustrates a standard approach to predictive modeling in machine learning.