The goal of this project is to predict the manner in which participants performed weight lifting exercises. The outcome variable is classe.
library(caret)
library(randomForest)
# Locations of the labelled training CSV and the 20-row test CSV.
training_url <-
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
testing_url <-
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
# Read both files with the same rule: "NA", empty cells and "#DIV/0!" are all
# parsed as missing values.
training <- read.csv(
  file = training_url,
  na.strings = c("NA", "", "#DIV/0!")
)
testing <- read.csv(
  file = testing_url,
  na.strings = c("NA", "", "#DIV/0!")
)
dim(training)
## [1] 19622 160
dim(testing)
## [1] 20 160
# Flag the columns that have no missing values in the training data; the same
# mask is applied to the test data so both sets keep matching columns.
keep_cols <- !(colSums(is.na(training)) > 0)
# Keep the fully observed columns, then drop the first seven of those, which
# are not used as predictors.
training_clean <- training[, keep_cols][, -seq_len(7)]
testing_clean <- testing[, keep_cols][, -seq_len(7)]
# The outcome must be a factor for classification.
training_clean[["classe"]] <- as.factor(training_clean[["classe"]])
dim(training_clean)
## [1] 19622 53
dim(testing_clean)
## [1] 20 53
# Fix the RNG state so the train/validation split is reproducible.
set.seed(123)
# Sample 70% of the rows (by outcome class) for model fitting; the remaining
# rows form the validation set.
inTrain <- createDataPartition(
  y = training_clean[["classe"]],
  p = 0.7,
  list = FALSE
)
trainData <- training_clean[inTrain, ]
validData <- training_clean[-inTrain, ]
dim(trainData)
## [1] 13737 53
dim(validData)
## [1] 5885 53
# Re-seed the RNG so the fitted forest is reproducible.
set.seed(123)
# Fit a random forest of 100 trees predicting classe from every other column
# of trainData.
model_rf <- randomForest(classe ~ ., data = trainData, ntree = 100)
# Print the fit summary: the recorded output below reports 7 variables tried
# per split and an out-of-bag error estimate of 0.66%.
model_rf
##
## Call:
## randomForest(formula = classe ~ ., data = trainData, ntree = 100)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 0.66%
## Confusion matrix:
## A B C D E class.error
## A 3902 3 0 0 1 0.001024066
## B 17 2636 5 0 0 0.008276900
## C 1 17 2373 5 0 0.009599332
## D 0 0 32 2218 2 0.015097691
## E 0 1 2 4 2518 0.002772277
# Predict the held-out validation rows and compare the predictions with the
# known classes.
pred_valid <- predict(object = model_rf, newdata = validData)
conf_matrix <- confusionMatrix(
  data = pred_valid,
  reference = validData[["classe"]]
)
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1673 4 0 0 0
## B 1 1132 3 0 1
## C 0 3 1023 10 4
## D 0 0 0 954 4
## E 0 0 0 0 1073
##
## Overall Statistics
##
## Accuracy : 0.9949
## 95% CI : (0.9927, 0.9966)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9936
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9994 0.9939 0.9971 0.9896 0.9917
## Specificity 0.9991 0.9989 0.9965 0.9992 1.0000
## Pos Pred Value 0.9976 0.9956 0.9837 0.9958 1.0000
## Neg Pred Value 0.9998 0.9985 0.9994 0.9980 0.9981
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2843 0.1924 0.1738 0.1621 0.1823
## Detection Prevalence 0.2850 0.1932 0.1767 0.1628 0.1823
## Balanced Accuracy 0.9992 0.9964 0.9968 0.9944 0.9958
# Overall accuracy on the validation split, taken from the "Accuracy" element
# of the confusion-matrix summary.
accuracy <- conf_matrix$overall["Accuracy"]
# Estimated out-of-sample error is 1 - accuracy. The subtraction inherits the
# element name "Accuracy", so unname() strips it; otherwise the error rate
# would print under the misleading label "Accuracy".
out_of_sample_error <- unname(1 - accuracy)
accuracy
## Accuracy
## 0.9949023
out_of_sample_error
## [1] 0.005097706
# Apply the fitted forest to the 20 cleaned test rows and show the predicted
# class for each.
final_predictions <- predict(
  object = model_rf,
  newdata = testing_clean
)
final_predictions
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E