Build a machine learning algorithm to predict activity quality from personal activity monitors.
Course project for Practical Machine Learning, part of the Johns Hopkins Data Science Specialization taught by Jeff Leek
Velloso, E.; Bulling, A.; Gellersen, H.; Ugulino, W.; Fuks, H. Qualitative Activity Recognition of Weight Lifting Exercises. Proceedings of 4th International Conference in Cooperation with SIGCHI (Augmented Human '13). Stuttgart, Germany: ACM SIGCHI, 2013.
10 reps unilateral dumbbell biceps curls
### download data, if necessary
# Each entry pairs a source URL with its local destination; a file is fetched
# only when it is not already present on disk.
downloads <- list(
  list(url = trainingURL, dest = trainingfile),
  list(url = testingURL, dest = testingfile)
)
for (item in downloads) {
  if (!file.exists(item$dest)) {
    download.file(item$url, item$dest, mode = "w")
  }
}
### read in files
# "NA", the spreadsheet error marker "#DIV/0!" and empty cells are all read as
# missing values.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no longer
# converts strings to factors by default, and the outcome column `classe` is
# later passed to rfcv(), which only performs classification when the response
# is a factor.
na_markers <- c("NA", "#DIV/0!", "")
df <- read.csv(
  trainingfile,
  na.strings = na_markers,
  stringsAsFactors = TRUE
)
validation <- read.csv(
  testingfile,
  na.strings = na_markers,
  stringsAsFactors = TRUE
)
# Keep the validation identifiers aside before any columns are dropped.
validation_id <- validation$problem_id
160 Features, 19622 Observations
suppressMessages(library(caret))
# createDataPartition() samples at random; fix the seed so the 70/30 split
# (and every result derived from it) is reproducible between runs.
set.seed(1234)
# Row indices of the 70% of observations assigned to the training set,
# chosen with respect to the outcome `classe`.
inTrain <- createDataPartition(y = df$classe, p = 0.7, list = FALSE)
training <- df[inTrain, ]
training_labels <- training$classe
# The remaining 30% is held out for testing.
testing <- df[-inTrain, ]
testing_labels <- testing$classe
# remove columns with at least 80% NA values (100 features)
# The mask is computed once, from the training split only, and the same
# positional mask is then applied to all three data sets so that their columns
# stay aligned regardless of the order of the assignments below.
sparse_col <- colSums(is.na(training)) >= 0.8 * nrow(training)
# Keep the discarded training columns around for inspection.
badfeatures <- training[ , sparse_col]
validation <- validation[ , !sparse_col]
testing <- testing[ , !sparse_col]
training <- training[ , !sparse_col]
# remove identification columns (7 features)
# The range of columns to keep (8th through last) is derived once from the
# training set and reused, so all three data sets keep the same positions.
kept_cols <- 8:ncol(training)
validation <- validation[ , kept_cols]
testing <- testing[ , kept_cols]
training <- training[ , kept_cols]
# Load knitr quietly for its table formatter.
suppressMessages(library(knitr))
# Render the class summary as a table. NOTE(review): class_table is not
# defined in the visible code; presumably it is built in a hidden chunk from
# the training labels -- confirm.
kable(class_table)
| | Technique | Counts | Proportion |
|---|---|---|---|
| A | Proper Form | 3906 | 0.28 |
| B | Elbows Forward | 2658 | 0.19 |
| C | Halfway Up | 2396 | 0.17 |
| D | Halfway Down | 2252 | 0.16 |
| E | Hips Forward | 2525 | 0.18 |
# Perform Principal Component Analysis on the training set
# The outcome column is located by name instead of a hard-coded index, so the
# step keeps working if the number of retained predictors changes.
label_col <- match("classe", names(training))
stopifnot(!is.na(label_col))
# The transform is estimated from the training predictors only.
preProc <- preProcess(training[ , -label_col], method = "pca")
# Apply the transform to all 3 datasets
trainPC <- predict(preProc, training[ , -label_col])
testPC <- predict(preProc, testing[ , -label_col])
# The same position is dropped from validation, as before.
# NOTE(review): presumably that column is problem_id rather than classe --
# confirm.
validationPC <- predict(preProc, validation[ , -label_col])
# Use Random Forest with cross validation removing 20% of features
# Runs on the principal components of the training set with 10 folds;
# step = 0.8 sets how quickly the predictor count shrinks between rounds.
suppressMessages(library(randomForest))
modFit <- rfcv(
  trainx = trainPC,
  trainy = training_labels,
  cv.fold = 10,
  step = 0.8
)
Error in eval(expr, envir, enclos) :
could not find function "randomForest"