Abstract

We build a classifier that predicts how barbell lifts were performed (the classe outcome, levels A–E) from accelerometer data collected at the belt, forearm, arm, and dumbbell. We clean the data, make a stratified train/validation split, tune a random forest with 5-fold cross-validation, estimate the expected out-of-sample error on a hold-out set, and generate predictions for the 20 quiz test cases.

Constraints: < 2000 words, < 5 figures.

Data

Source: Weight Lifting Exercise Dataset (see assignment links). Training and test CSVs are provided by the course.

set.seed(2025)  # reproducibility
suppressPackageStartupMessages({
  library(tidyverse)      # data manipulation
  library(caret)          # partitioning, resampling, training, confusion matrices
  library(randomForest)   # backend for caret's method = "rf"
  library(e1071)          # caret dependency used by some model utilities
})

train_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_url  <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Download each file once; binary mode avoids corruption on Windows
if (!file.exists("pml-training.csv")) download.file(train_url, "pml-training.csv", mode = "wb")
if (!file.exists("pml-testing.csv"))  download.file(test_url,  "pml-testing.csv",  mode = "wb")

# Treat empty strings and the spreadsheet artifact "#DIV/0!" as missing
raw_train <- read.csv("pml-training.csv", na.strings = c("NA","","#DIV/0!"))
raw_test  <- read.csv("pml-testing.csv",  na.strings = c("NA","","#DIV/0!"))

dim(raw_train); dim(raw_test)
## [1] 19622   160
## [1]  20 160
head(raw_train[,1:8])
##   X user_name raw_timestamp_part_1 raw_timestamp_part_2   cvtd_timestamp
## 1 1  carlitos           1323084231               788290 05/12/2011 11:23
## 2 2  carlitos           1323084231               808298 05/12/2011 11:23
## 3 3  carlitos           1323084231               820366 05/12/2011 11:23
## 4 4  carlitos           1323084232               120339 05/12/2011 11:23
## 5 5  carlitos           1323084232               196328 05/12/2011 11:23
## 6 6  carlitos           1323084232               304277 05/12/2011 11:23
##   new_window num_window roll_belt
## 1         no         11      1.41
## 2         no         11      1.41
## 3         no         11      1.42
## 4         no         12      1.48
## 5         no         12      1.48
## 6         no         12      1.45

Cleaning

We remove columns with excessive missingness, non-predictive identifier and timestamp columns, and near-zero-variance features. One important detail: the final predictor set must exist in both the training data and the 20-row test file, otherwise subsetting the test set fails.

# 1) Drop columns with >95% NA *in training*
na_frac <- sapply(raw_train, function(x) mean(is.na(x)))
train_keep <- names(na_frac[na_frac <= 0.95])

# 2) Remove obvious non-predictors / identifiers
id_cols <- c("X","user_name","raw_timestamp_part_1","raw_timestamp_part_2",
             "cvtd_timestamp","new_window","num_window")
train_keep <- setdiff(train_keep, id_cols)

# 3) Keep only features present in BOTH training and the 20-case test file
features <- intersect(train_keep, names(raw_test))

# 4) Build aligned train/test
train <- raw_train[, c(features, "classe")]
test  <- raw_test[,  features]

# 5) Outcome as factor
train$classe <- factor(train$classe)

# 6) Remove near-zero variance predictors (based on training only)
nzv <- nearZeroVar(train[, features], saveMetrics = TRUE)
good_features <- features[!nzv$nzv]
train <- train[, c(good_features, "classe")]
test  <- test[,  good_features]

dim(train); length(good_features)
## [1] 19622    53
## [1] 52
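
As an optional sanity check, we can confirm that no missing values remain and that the training and test feature sets line up exactly (the only column unique to train should be the outcome):

# Optional sanity check: no NAs remain, and train/test features align
stopifnot(!anyNA(train[, good_features]))
stopifnot(identical(good_features, names(test)))
setdiff(names(train), names(test))  # expected: "classe"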

Train/Validation Split

We hold out 30% of the training data with a stratified 70/30 split, preserving class proportions, so the validation set gives an honest estimate of generalization error.

set.seed(2025)
inTrain <- createDataPartition(train$classe, p = 0.7, list = FALSE)
train_set <- train[inTrain, ]
valid_set <- train[-inTrain, ]

prop.table(table(train_set$classe))
## 
##         A         B         C         D         E 
## 0.2843416 0.1934920 0.1744195 0.1639368 0.1838101
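
For comparison, the hold-out set should show nearly identical class proportions; a quick check:

# Class proportions in the hold-out set should closely match the training split
round(prop.table(table(valid_set$classe)), 4)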

Modeling

We fit a random forest (500 trees), tuning mtry over a small grid with 5-fold cross-validation.
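
Fitting 500 trees under 5-fold cross-validation can take several minutes on one core. Optionally, a parallel backend can be registered first so caret runs the folds concurrently; this sketch assumes the doParallel package is installed:

# Optional: parallel backend for caret (assumes doParallel is installed)
library(doParallel)
cl <- parallel::makePSOCKcluster(max(1, parallel::detectCores() - 1))
registerDoParallel(cl)
# ... run caret::train() as below, then release the workers:
# parallel::stopCluster(cl)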

ctrl <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
rf_fit <- caret::train(
  classe ~ .,
  data = train_set,
  method = "rf",
  trControl = ctrl,
  tuneLength = 3,     # small grid over mtry
  ntree = 500
)
rf_fit
## Random Forest 
## 
## 13737 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10990, 10991, 10989, 10988, 10990 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9909008  0.9884890
##   27    0.9902456  0.9876601
##   52    0.9838401  0.9795544
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
plot(rf_fit)

Figure 1. Cross-validated accuracy as a function of mtry.

Variable Importance

vi <- varImp(rf_fit)
plot(vi, top = 20)

Figure 2. The 20 most important predictors.
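
The same ranking can be read off numerically; a short sketch using caret's varImp output (for method = "rf" the score column is named Overall):

# Five highest-ranked predictors by (scaled) importance
imp <- vi$importance
head(imp[order(-imp$Overall), , drop = FALSE], 5)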

Validation Performance

pred_valid <- predict(rf_fit, newdata = valid_set)
cm <- confusionMatrix(pred_valid, valid_set$classe)
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1673    6    0    0    0
##          B    1 1128   10    0    0
##          C    0    5 1015   14    0
##          D    0    0    1  949    1
##          E    0    0    0    1 1081
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9934         
##                  95% CI : (0.991, 0.9953)
##     No Information Rate : 0.2845         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9916         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9994   0.9903   0.9893   0.9844   0.9991
## Specificity            0.9986   0.9977   0.9961   0.9996   0.9998
## Pos Pred Value         0.9964   0.9903   0.9816   0.9979   0.9991
## Neg Pred Value         0.9998   0.9977   0.9977   0.9970   0.9998
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2843   0.1917   0.1725   0.1613   0.1837
## Detection Prevalence   0.2853   0.1935   0.1757   0.1616   0.1839
## Balanced Accuracy      0.9990   0.9940   0.9927   0.9920   0.9994
oose <- 1 - cm$overall["Accuracy"]
oose
##    Accuracy 
## 0.006627018

Result: validation accuracy is 99.34%, so the expected out-of-sample error is approximately 0.66%.
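
Because the hold-out set is finite, this point estimate carries sampling uncertainty; an exact binomial confidence interval for the error is easy to attach with base R's binom.test:

# 95% binomial CI for the out-of-sample error on the hold-out set
n_err <- sum(pred_valid != valid_set$classe)
binom.test(n_err, nrow(valid_set))$conf.int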

Predictions for the 20 Quiz Cases

We refit on all cleaned training data, fixing mtry at the cross-validated optimum, and predict the 20 quiz cases.

set.seed(2025)
rf_all <- caret::train(
  classe ~ .,
  data = train,
  method = "rf",
  trControl = ctrl,
  tuneGrid = rf_fit$bestTune,
  ntree = 500
)

quiz_preds <- predict(rf_all, newdata = test)
quiz_preds
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
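
As a consistency check, the model tuned on the 70% split should produce the same 20 predictions; agreement is not guaranteed, but it is reassuring:

# Do the 70%-fit model and the full-data refit agree on the quiz cases?
all(predict(rf_fit, newdata = test) == quiz_preds)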

To write the 20 individual files (optional helper):

write_prediction_files <- function(preds, outdir = "predictions") {
  # Write one prediction per file (problem_id_i.txt), the original submission format
  dir.create(outdir, showWarnings = FALSE)
  for (i in seq_along(preds)) {
    fn <- file.path(outdir, paste0("problem_id_", i, ".txt"))
    write.table(preds[i], file = fn, quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
write_prediction_files(quiz_preds)
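
A quick check that all 20 files landed in the output directory:

# Expect 20 files named problem_id_1.txt ... problem_id_20.txt
length(list.files("predictions", pattern = "^problem_id_"))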