Machine Learning Course Project

Coursera course: Practical Machine Learning

Step 1: Load and clean the data

library(readr)
library(dplyr)
library(caret)
library(randomForest)
library(parallel)
library(doParallel)

# load datasets
training <- read_csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv")
testing  <- read_csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv")

# drop columns that are mostly NA (more than 95% missing); the decision is made
# on the training set and applied to both sets so they keep the same columns
na_threshold <- 0.95
keep_cols <- colMeans(is.na(training)) < na_threshold
training <- training[, keep_cols]
testing  <- testing[, keep_cols]

# check what identifier columns exist and drop them safely
idCols <- intersect(c("user_name","raw_timestamp_part_1",
                      "raw_timestamp_part_2","cvtd_timestamp","num_window"), 
                    colnames(training))

training <- training %>% select(-all_of(idCols))
testing  <- testing %>% select(-all_of(idCols))

# Ensure 'classe' is a factor for classification
training$classe <- as.factor(training$classe)
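
As a quick sanity check (not shown in the original write-up), the cleaned data can be inspected before modelling:

# dimensions of the cleaned sets and the distribution of the outcome classes
dim(training)
dim(testing)
table(training$classe)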

Step 2: Split the training data

set.seed(1)
# create partition (70% model training, 30% validation)
inTrain <- createDataPartition(y = training$classe, p = 0.7, list = FALSE)
training_sub <- training[inTrain, ]
testing_sub  <- training[-inTrain, ]
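
A minimal check (not in the original script) that the partition came out roughly 70/30 and that all five classes appear in the validation subset:

# proportion of rows in each subset and class counts in the validation subset
nrow(training_sub) / nrow(training)
nrow(testing_sub) / nrow(training)
table(testing_sub$classe)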

Step 3: Train a random forest with 5-fold cross-validation

# set up a parallel cluster to speed up model training
# (parallel and doParallel are loaded at the top of the script)
cluster <- makeCluster(detectCores() - 1)
registerDoParallel(cluster)
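# optional check (not in the original): foreach's getDoParWorkers() reports how
# many workers are registered for the parallel backend
getDoParWorkers()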

fitControl <- trainControl(method="cv", number=5, allowParallel=TRUE)

set.seed(1)
modFit <- train(classe ~ ., data=training_sub, 
                method="rf", 
                trControl=fitControl)

# stop cluster
stopCluster(cluster)
registerDoSEQ()
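
Before predicting, the fitted model is worth a look (this output was not included in the original report): printing modFit shows the cross-validated accuracy for each mtry value that caret tried, and the final forest reports its own out-of-bag error estimate.

# cross-validation results and the selected mtry
print(modFit)
# out-of-bag error estimate from the final random forest
modFit$finalModel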

Step 4: Evaluate on the validation set

# predict on the held-out validation set
pred_sub <- predict(modFit, testing_sub)

# confusion matrix
confMat <- confusionMatrix(pred_sub, testing_sub$classe)
confMat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1674    1    0    0    0
##          B    0 1138    2    0    0
##          C    0    0 1024    0    0
##          D    0    0    0  964    0
##          E    0    0    0    0 1082
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9995          
##                  95% CI : (0.9985, 0.9999)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9994          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9991   0.9981   1.0000   1.0000
## Specificity            0.9998   0.9996   1.0000   1.0000   1.0000
## Pos Pred Value         0.9994   0.9982   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   0.9998   0.9996   1.0000   1.0000
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2845   0.1934   0.1740   0.1638   0.1839
## Detection Prevalence   0.2846   0.1937   0.1740   0.1638   0.1839
## Balanced Accuracy      0.9999   0.9994   0.9990   1.0000   1.0000
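
The expected out-of-sample error can be read off this confusion matrix as one minus the validation accuracy; a short sketch of that calculation (not printed in the original report):

# estimated out-of-sample error from the held-out validation set
oos_error <- 1 - as.numeric(confMat$overall["Accuracy"])
oos_error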

Step 5: Predict the 20 test cases

# predict the 20 graded test cases
pred <- predict(modFit, testing)
head(pred)
## [1] A A A A A A
## Levels: A B C D E

# top 20 most important predictors in the final model
importance <- varImp(modFit, scale = FALSE)
plot(importance, top = 20)
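
Finally, a minimal sketch (not part of the original script) of saving the 20 predictions for submission; it assumes the problem_id column from pml-testing.csv survived the cleaning steps above.

# write the predictions next to their problem IDs (assumes 'problem_id' was kept)
submission <- data.frame(problem_id = testing$problem_id, predicted = as.character(pred))
write.csv(submission, "predictions.csv", row.names = FALSE)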