We build a classifier to predict how barbell lifts were performed (classe, levels A–E) using accelerometer data from belt, forearm, arm, and dumbbell sensors. We clean the data, perform a stratified train/validation split, tune a Random Forest via 5-fold cross-validation, report the expected out-of-sample error from a hold-out set, and generate predictions for 20 test cases.
Constraints: < 2000 words, < 5 figures.
Source: Weight Lifting Exercise Dataset (see assignment links). Training and test CSVs are provided by the course.
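The analysis uses tidyverse, caret, randomForest, and e1071. If any of these are not yet installed, a one-time setup step along these lines (a sketch, run interactively rather than on every knit) installs the missing ones:
# Install any packages that are missing from the local library
pkgs <- c("tidyverse", "caret", "randomForest", "e1071")
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)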
set.seed(2025)
suppressPackageStartupMessages({
  library(tidyverse)
  library(caret)
  library(randomForest)
  library(e1071)
})
train_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
if (!file.exists("pml-training.csv")) download.file(train_url, "pml-training.csv", mode = "wb")
if (!file.exists("pml-testing.csv")) download.file(test_url, "pml-testing.csv", mode = "wb")
raw_train <- read.csv("pml-training.csv", na.strings = c("NA","","#DIV/0!"))
raw_test <- read.csv("pml-testing.csv", na.strings = c("NA","","#DIV/0!"))
dim(raw_train); dim(raw_test)
## [1] 19622 160
## [1] 20 160
head(raw_train[,1:8])
## X user_name raw_timestamp_part_1 raw_timestamp_part_2 cvtd_timestamp
## 1 1 carlitos 1323084231 788290 05/12/2011 11:23
## 2 2 carlitos 1323084231 808298 05/12/2011 11:23
## 3 3 carlitos 1323084231 820366 05/12/2011 11:23
## 4 4 carlitos 1323084232 120339 05/12/2011 11:23
## 5 5 carlitos 1323084232 196328 05/12/2011 11:23
## 6 6 carlitos 1323084232 304277 05/12/2011 11:23
## new_window num_window roll_belt
## 1 no 11 1.41
## 2 no 11 1.41
## 3 no 11 1.42
## 4 no 12 1.48
## 5 no 12 1.48
## 6 no 12 1.45
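Before cleaning, it helps to look at how missingness is distributed: the derived summary columns in this dataset are almost entirely NA, while the raw sensor columns are essentially complete. A quick check along these lines (sketch) shows the split and motivates the 95% threshold used below:
# Per-column NA fractions; expect a roughly bimodal pattern (near 0 or near 1)
na_frac_all <- sapply(raw_train, function(x) mean(is.na(x)))
summary(na_frac_all)
table(na_frac_all > 0.95)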
We remove columns with excessive missingness, non-predictive identifier and timestamp columns, and near-zero-variance features. One important detail: the final predictor set must exist in both the training data and the 20-row test file, otherwise subsetting the test set fails.
# 1) Drop columns with >95% NA *in training*
na_frac <- sapply(raw_train, function(x) mean(is.na(x)))
train_keep <- names(na_frac[na_frac <= 0.95])
# 2) Remove obvious non-predictors / identifiers
id_cols <- c("X","user_name","raw_timestamp_part_1","raw_timestamp_part_2",
"cvtd_timestamp","new_window","num_window")
train_keep <- setdiff(train_keep, id_cols)
# 3) Keep only features present in BOTH training and the 20-case test file
features <- intersect(train_keep, names(raw_test))
# 4) Build aligned train/test
train <- raw_train[, c(features, "classe")]
test <- raw_test[, features]
# 5) Outcome as factor
train$classe <- factor(train$classe)
# 6) Remove near-zero variance predictors (based on training only)
nzv <- nearZeroVar(train[, features], saveMetrics = TRUE)
good_features <- features[!nzv$nzv]
train <- train[, c(good_features, "classe")]
test <- test[, good_features]
dim(train); length(good_features)
## [1] 19622 53
## [1] 52
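A quick sanity check (sketch) confirms that the cleaned predictor set is fully aligned with the 20-case test file and that no missing values remain in the retained training columns:
# Every retained predictor must exist in the test file; no NAs left in training predictors
stopifnot(all(good_features %in% names(test)))
stopifnot(!any(is.na(train[, good_features])))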
We make a stratified 70/30 train/validation split so that the hold-out set gives an honest estimate of the generalization error.
set.seed(2025)
inTrain <- createDataPartition(train$classe, p = 0.7, list = FALSE)
train_set <- train[inTrain, ]
valid_set <- train[-inTrain, ]
prop.table(table(train_set$classe))
##
## A B C D E
## 0.2843416 0.1934920 0.1744195 0.1639368 0.1838101
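Because createDataPartition stratifies on classe, the validation set should show essentially the same class proportions; a quick check (sketch) guards against an accidentally unstratified split:
# Validation-set class proportions should closely match the training split above
round(prop.table(table(valid_set$classe)), 4)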
We fit a Random Forest (500 trees), tuning mtry over a small grid with 5-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
rf_fit <- caret::train(
  classe ~ .,
  data = train_set,
  method = "rf",
  trControl = ctrl,
  tuneLength = 3,   # small grid over mtry
  ntree = 500
)
rf_fit
## Random Forest
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10990, 10991, 10989, 10988, 10990
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9909008 0.9884890
## 27 0.9902456 0.9876601
## 52 0.9838401 0.9795544
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
plot(rf_fit)
Figure 1. Cross-validated accuracy versus mtry.
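The selected mtry and its cross-validated accuracy can also be extracted programmatically from the caret object rather than read off the printed summary (sketch):
# Best tuning parameter and the corresponding resampled accuracy
rf_fit$bestTune
max(rf_fit$results$Accuracy)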
vi <- varImp(rf_fit)
plot(vi, top = 20)
Figure 2. Top 20 predictors by importance.
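If the top predictors are needed as a character vector (for reporting or for fitting a reduced model), they can be pulled from the varImp object; this sketch assumes the single "Overall" importance column that caret returns for the default Gini importance:
# Names of the 20 highest-importance predictors, in decreasing order
imp <- vi$importance
head(rownames(imp)[order(imp$Overall, decreasing = TRUE)], 20)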
pred_valid <- predict(rf_fit, newdata = valid_set)
cm <- confusionMatrix(pred_valid, valid_set$classe)
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1673 6 0 0 0
## B 1 1128 10 0 0
## C 0 5 1015 14 0
## D 0 0 1 949 1
## E 0 0 0 1 1081
##
## Overall Statistics
##
## Accuracy : 0.9934
## 95% CI : (0.991, 0.9953)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9916
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9994 0.9903 0.9893 0.9844 0.9991
## Specificity 0.9986 0.9977 0.9961 0.9996 0.9998
## Pos Pred Value 0.9964 0.9903 0.9816 0.9979 0.9991
## Neg Pred Value 0.9998 0.9977 0.9977 0.9970 0.9998
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2843 0.1917 0.1725 0.1613 0.1837
## Detection Prevalence 0.2853 0.1935 0.1757 0.1616 0.1839
## Balanced Accuracy 0.9990 0.9940 0.9927 0.9920 0.9994
oose <- 1 - unname(cm$overall["Accuracy"])  # unname() so the error is not labelled "Accuracy"
oose
## [1] 0.006627018
Result: the expected out-of-sample error, estimated on the 30% hold-out set, is approximately 0.66% (hold-out accuracy ≈ 99.3%).
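Because confusionMatrix also reports a 95% confidence interval for the hold-out accuracy, that interval can be turned around into an interval for the out-of-sample error (sketch):
# 95% interval for the out-of-sample error, derived from the accuracy CI above
err_ci <- 1 - unname(cm$overall[c("AccuracyUpper", "AccuracyLower")])
round(err_ci, 4)  # roughly 0.005 to 0.009 given the accuracy CI reported above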
We refit the model on all cleaned training data, fixing mtry at the cross-validated optimum, and predict the 20 quiz cases.
set.seed(2025)
rf_all <- caret::train(
  classe ~ .,
  data = train,
  method = "rf",
  trControl = ctrl,              # 5-fold CV again; "none" would also work since mtry is fixed
  tuneGrid = rf_fit$bestTune,    # reuse the mtry selected above
  ntree = 500
)
quiz_preds <- predict(rf_all, newdata = test)
quiz_preds
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
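As a sanity check (sketch), the model trained on the 70% split should agree with the refit model on these 20 cases; a mismatch here would point to instability in the fit or the preprocessing:
# Compare quiz predictions from the hold-out model and the refit model
identical(as.character(predict(rf_fit, newdata = test)), as.character(quiz_preds))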
To write the 20 individual files (optional helper):
write_prediction_files <- function(preds, outdir = "predictions") {
  dir.create(outdir, showWarnings = FALSE)
  for (i in seq_along(preds)) {
    fn <- file.path(outdir, paste0("problem_id_", i, ".txt"))
    write.table(preds[i], file = fn, quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
write_prediction_files(quiz_preds)