# Step 1: Load the data
library(readr)
library(dplyr)
library(caret)
library(caret)
library(randomForest)
library(doParallel)
# Load the training and test sets.
training <- read_csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv")
testing <- read_csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv")

# Columns that are mostly missing in the TRAINING data (>= 95% NA). The choice
# is made on training only and then applied to both sets by name, so the test
# set always ends up with the same predictors the model was fitted on.
na_threshold <- 0.95
sparse_cols <- colnames(training)[colMeans(is.na(training)) >= na_threshold]

# Identifier / bookkeeping columns that must not be used as predictors.
# "...1", "X1" and "X" are the names readr (2.x / 1.x) and read.csv give to an
# unnamed first CSV column.
# NOTE(review): that first column appears to be a row counter; leaving it in
# lets the model key on row order, which would explain a near-perfect
# validation accuracy together with identical predictions for every test
# case. Confirm with names(training)[1].
idCols <- intersect(c("...1", "X1", "X",
                      "user_name", "raw_timestamp_part_1",
                      "raw_timestamp_part_2", "cvtd_timestamp", "num_window"),
                    colnames(training))

# Drop the same columns from both sets; any_of() tolerates a column that is
# present in one set but absent from the other.
drop_cols <- union(sparse_cols, idCols)
training <- training %>% select(-any_of(drop_cols))
testing <- testing %>% select(-any_of(drop_cols))

# Ensure 'classe' is a factor for classification
training$classe <- as.factor(training$classe)
# Split the labelled data 70% / 30% into a fitting set and a validation set.
set.seed(1)  # fixed seed so the split is reproducible
# With list = FALSE, createDataPartition() returns a one-column matrix of row
# numbers. Take that column as a plain integer vector: `training` is a tibble
# (it comes from read_csv), and subsetting tibble rows with a matrix is
# deprecated as of tibble 3.0.0.
inTrain <- createDataPartition(y = training$classe, p = 0.7, list = FALSE)[, 1]
training_sub <- training[inTrain, ]
testing_sub <- training[-inTrain, ]
library(parallel)
# Set up a parallel backend, leaving one core free for the rest of the system.
# detectCores() can return NA (unknown) or 1, so clamp to at least one worker
# instead of asking makeCluster() for NA or 0 nodes.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(n_workers)
registerDoParallel(cluster)

# 5-fold cross-validation, with the resampling loop allowed to run in parallel.
fitControl <- trainControl(method = "cv", number = 5, allowParallel = TRUE)

set.seed(1)
# Fit a random forest on all remaining predictors. The `finally` clause shuts
# the workers down and restores the sequential backend even when train()
# fails, so an error does not leave worker processes behind.
modFit <- tryCatch(
  train(classe ~ .,
        data = training_sub,
        method = "rf",
        trControl = fitControl),
  finally = {
    stopCluster(cluster)
    registerDoSEQ()
  }
)
# Score the held-out validation split with the fitted model.
pred_sub <- predict(modFit, newdata = testing_sub)
# Cross-tabulate predicted against actual classes and print the summary.
confMat <- confusionMatrix(data = pred_sub, reference = testing_sub$classe)
confMat
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1674 1 0 0 0
## B 0 1138 2 0 0
## C 0 0 1024 0 0
## D 0 0 0 964 0
## E 0 0 0 0 1082
##
## Overall Statistics
##
## Accuracy : 0.9995
## 95% CI : (0.9985, 0.9999)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9994
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 0.9991 0.9981 1.0000 1.0000
## Specificity 0.9998 0.9996 1.0000 1.0000 1.0000
## Pos Pred Value 0.9994 0.9982 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 0.9998 0.9996 1.0000 1.0000
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2845 0.1934 0.1740 0.1638 0.1839
## Detection Prevalence 0.2846 0.1937 0.1740 0.1638 0.1839
## Balanced Accuracy 0.9999 0.9994 0.9990 1.0000 1.0000
# Predict the class of every case in the downloaded test set and preview the
# first few predictions.
pred <- predict(modFit, newdata = testing)
head(pred)
## [1] A A A A A A
## Levels: A B C D E
# Compute unscaled variable importance for the fitted model and plot the
# 20 highest-ranked predictors.
importance <- varImp(modFit, scale = FALSE)
plot(importance, top = 20)
