1. Introduction

This report describes the development of a machine learning model to classify weightlifting exercises into five categories (A, B, C, D, E) based on sensor data from accelerometers.

Data Sources

Training Data: pml-training.csv (19,622 observations)

Test Data: pml-testing.csv (20 observations)

2. Data Loading and Preprocessing

2.1 Load Libraries and Data

# Load Libraries and Data
library(caret)
library(randomForest)
library(dplyr)
library(tidyr)

# Download data if not already present.
# The platform's default download method is used rather than forcing
# method = "curl", which requires an external curl executable to be installed.
data_url_base <- "https://d396qusza40orc.cloudfront.net/predmachlearn/"
data_files <- c("pml-training.csv", "pml-testing.csv")
for (data_file in data_files) {
  if (!file.exists(data_file)) {
    status <- download.file(paste0(data_url_base, data_file),
                            destfile = data_file, mode = "wb")
    if (status != 0) {
      # Do not leave a partial file behind: it would satisfy file.exists()
      # on the next run and be read as if it were complete.
      unlink(data_file)
      stop("Download failed for ", data_file, call. = FALSE)
    }
  }
}

# Load data; "NA", empty cells and "#DIV/0!" markers are all read as NA
na_tokens <- c("NA", "", "#DIV/0!")
train_data <- read.csv("pml-training.csv", na.strings = na_tokens)
test_data <- read.csv("pml-testing.csv", na.strings = na_tokens)

2.2 Data Cleaning

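Drop columns that are almost entirely missing, remove non-predictive metadata columns, and keep only the numeric predictors plus the outcome `classe`.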

# Every column decision below is made on the TRAINING set only and then
# applied to the test set by name, so both frames always share one schema
# regardless of column order or of how sparse the small test set happens to be.

# Columns that exist only in the test file (not in the raw training file) are
# carried through unchanged; record them before any training columns are dropped.
test_only_cols <- setdiff(names(test_data), names(train_data))

# Remove columns with >95% NA values
train_data <- train_data[, colMeans(is.na(train_data)) < 0.95, drop = FALSE]

# Remove metadata columns (non-predictive)
cols_to_remove <- c("X", "user_name", "raw_timestamp_part_1",
                    "raw_timestamp_part_2", "cvtd_timestamp",
                    "new_window", "num_window")
train_data <- train_data %>% select(-all_of(cols_to_remove))

# Convert classe to factor
train_data$classe <- factor(train_data$classe, levels = c("A", "B", "C", "D", "E"))

# Ensure all remaining columns are numeric (the outcome is kept as a factor).
# vapply() guarantees a logical vector even for unusual inputs.
numeric_cols <- vapply(train_data, is.numeric, logical(1))
train_data <- train_data[, numeric_cols | names(train_data) == "classe", drop = FALSE]

# Give the test set exactly the predictors the model will be trained on,
# selected by name rather than by a positional logical mask.
predictor_names <- setdiff(names(train_data), "classe")
missing_in_test <- setdiff(predictor_names, names(test_data))
if (length(missing_in_test) > 0) {
  stop("Test data is missing predictor column(s): ",
       paste(missing_in_test, collapse = ", "), call. = FALSE)
}
test_data <- test_data[, c(predictor_names, test_only_cols), drop = FALSE]

3. Data Partitioning

Split the training data into training (70%) and validation (30%) sets

# Fix the RNG state so the split is reproducible
set.seed(123)

# Row indices for the 70% training share, sampled on the outcome classe
train_idx <- createDataPartition(
  y = train_data[["classe"]],
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form validation
validation <- train_data[-train_idx, ]
training <- train_data[train_idx, ]

4. Model Training (Random Forest)

# Resampling scheme: 5-fold cross-validation
ctrl <- trainControl(
  method = "cv",
  number = 5
)

# Fit a random forest on every predictor in `training`.
# ntree and importance are not caret arguments of their own; they are
# forwarded through `...` to the underlying random forest fit.
model_rf <- train(
  classe ~ .,
  data = training,
  method = "rf",
  trControl = ctrl,
  ntree = 100,
  importance = TRUE
)

# Show the resampling summary for the fitted model
print(model_rf)
## Random Forest 
## 
## 13737 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10990, 10990, 10989, 10988, 10991 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9898818  0.9871992
##   27    0.9906823  0.9882134
##   52    0.9843489  0.9801999
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 27.

5. Model Evaluation

# Score the held-out validation rows with the fitted model
pred_val <- predict(model_rf, newdata = validation)

# Compare predicted classes against the true labels
confusionMatrix(data = pred_val, reference = validation[["classe"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1674    6    0    0    0
##          B    0 1124    6    0    0
##          C    0    9 1018   10    4
##          D    0    0    2  954    4
##          E    0    0    0    0 1074
## 
## Overall Statistics
##                                          
##                Accuracy : 0.993          
##                  95% CI : (0.9906, 0.995)
##     No Information Rate : 0.2845         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9912         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9868   0.9922   0.9896   0.9926
## Specificity            0.9986   0.9987   0.9953   0.9988   1.0000
## Pos Pred Value         0.9964   0.9947   0.9779   0.9937   1.0000
## Neg Pred Value         1.0000   0.9968   0.9983   0.9980   0.9983
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2845   0.1910   0.1730   0.1621   0.1825
## Detection Prevalence   0.2855   0.1920   0.1769   0.1631   0.1825
## Balanced Accuracy      0.9993   0.9928   0.9937   0.9942   0.9963

6. Final Predictions on Test Set

# Predict the class of each test observation with the fitted model
test_pred <- predict(model_rf, newdata = test_data)
print(test_pred)
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E