Goal :

To build a model that predicts the manner in which an individual performed an exercise (the “classe” variable)

Introduction :

Six young, healthy participants were asked to perform one set of 10 repetitions of the Unilateral Dumbbell Biceps Curl in five different fashions (Class A to Class E). Proper technique is a key requirement for training to have a positive impact on cardio-respiratory fitness, and incorrect technique has been identified as the main cause of training injuries. The researchers behind this dataset investigated the feasibility of automatically assessing the quality of execution of weight-lifting exercises and the impact of providing real-time feedback to the athlete, so-called qualitative activity recognition.

The classe variable encodes the quality with which the exercise was performed. Its five levels are defined as follows:

  * Exactly according to the specification (Class A)
  * Throwing the elbows to the front (Class B)
  * Lifting the dumbbell only halfway (Class C)
  * Lowering the dumbbell only halfway (Class D)
  * Throwing the hips to the front (Class E)

Data loading and standardizing :
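
How trainData (and the held-out testData used later) is created is not shown in the original code, so the sketch below fills that gap under stated assumptions: the raw file path, the 75/25 split via createDataPartition, and the rawData/inTrain names are all assumptions, chosen to mirror the validation-set loading code further below.

library(caret) # preProcess(), train(), nearZeroVar(), confusionMatrix()

# Assumed loading step (path and split are guesses, not from the original report)
rawData <- read.csv("./Data/pml-training.csv", header = TRUE)
rawData$classe <- as.factor(rawData$classe) # ensure the outcome is a factor
set.seed(825)
inTrain <- createDataPartition(rawData$classe, p = 0.75, list = FALSE)
trainData <- rawData[inTrain, ]
testData <- rawData[-inTrain, ] # processed later, in "Testing the model"

# Same bookkeeping-column drop and typecasting as used on the validation set below
trainData <- trainData[, -(1:7)]
Cols <- colnames(trainData)[-153] # 152 candidate predictors + classe
for (i in Cols) {
   trainData[, i] <- as.numeric(as.character(trainData[, i]))
}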

# Finding missing data: keep a column only if at most ~66% of its values are NA
Blank_Cols <- apply(trainData[, -153], 2, function(Col) {
   sum(is.na(Col)) <= (length(Col) / 1.5)
})
Blank_Cols <- c(Blank_Cols, TRUE) # explicitly keep the outcome column (classe)

# Subsetting to the non-blank variables plus classe
trainData <- trainData[, Blank_Cols]
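
A quick sanity check on how much survives the filter; the exact counts are not printed in the original, but the [,-53] indexing below implies 53 retained columns:

sum(Blank_Cols) # number of retained columns (predictors + classe)
dim(trainData)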

# Outlier detection: flag values outside the Tukey fences (1.5 x IQR beyond the quartiles)
Indices <- apply(trainData[, -53], 2, function(Col) {
                  IQR_var <- IQR(Col, na.rm = TRUE)
                  quantiles <- quantile(Col, probs = c(0.25, 0.75), na.rm = TRUE)
                  down <- quantiles[1] - (1.5 * IQR_var)
                  up <- quantiles[2] + (1.5 * IQR_var)
                  which(Col > up | Col < down)
})
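
The 1.5 x IQR fence is easiest to see on a toy vector; this worked example is purely illustrative and not part of the analysis:

x <- c(1, 2, 3, 4, 100)
quantile(x, probs = c(0.25, 0.75)) # 2 and 4, so IQR = 2
# fences: 2 - 1.5*2 = -1 and 4 + 1.5*2 = 7, so only 100 is flagged
which(x > 7 | x < -1)
## [1] 5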

# Replacing flagged outliers with NA so the imputation step below fills them in
for (i in seq_along(Indices)) {
   trainData[Indices[[i]], i] <- NA
}
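
Before committing to the replacement above, it is worth checking how many values the fences actually flag (an illustrative check, not in the original):

sum(lengths(Indices)) # total count of flagged values across all predictors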

# Impute any remaining missing values (k-nearest neighbours) and standardize
preProcObj <- preProcess(trainData, method = c("knnImpute", "BoxCox", "center", "scale"))
trainData <- data.frame(predict(preProcObj, trainData))
trainData.copy <- trainData # checkpoint copy, handy for interactive re-runs
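
One detail worth knowing about this step: caret's knnImpute fills each missing value from its k nearest neighbours (k = 5 by default) and forces centering and scaling as part of the method, so listing "center" and "scale" explicitly is belt-and-braces rather than strictly required. Printing the object summarizes what will be applied:

preProcObj # lists which transformations apply to how many variables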


Exploratory Analysis :

A near-zero-variance check confirms that none of the retained predictors is degenerate:

nZero_Predictors <- nearZeroVar(trainData, saveMetrics = TRUE)
sum(nZero_Predictors$nzv == TRUE)
## [1] 0

prCom <- preProcess(trainData, method = "pca")
trainData.PCA <- predict(prCom, newdata = trainData)
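
By default preProcess retains enough principal components to capture 95% of the variance (thresh = 0.95), and the retained count is stored on the returned object. Note that trainData.PCA is kept for exploration only; the models in the next section are fit on the full standardized predictor set.

prCom$numComp # number of principal components retained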


Building a model :

Three classifiers are trained and compared: a single classification tree (rpart), stochastic gradient boosting (gbm), and a random forest (rf), the latter two with explicit cross-validation controls.

set.seed(825)
# Model 1 : a single classification tree
mod_rPART <- train(classe ~ ., data = trainData, method = "rpart") # rpart : recursive partitioning trees

# Model 2 : a boosting alternative
controlBM <- trainControl(method = "repeatedcv", number = 5, repeats = 1)
mod_GBM <- train(classe ~ ., data = trainData, trControl = controlBM, method = "gbm", verbose = FALSE)

# Model 3 : random forest, a known strong performer on this kind of data
# (allowParallel is a trainControl argument, so it belongs in the control object)
controlRF <- trainControl(method = "cv", number = 3, verboseIter = FALSE, allowParallel = TRUE)
mod_RF <- train(classe ~ ., data = trainData, method = "rf", trControl = controlRF)
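
Before touching the test set, each model's resampled (cross-validated) performance already gives a first estimate of the out-of-sample error. Both accessors below are standard caret:

mod_RF$results # accuracy and kappa per tuning value from the 3-fold CV
confusionMatrix(mod_RF) # resampled confusion matrix, reported in percent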

Testing the model :

The held-out test set is pushed through exactly the same pipeline as the training data (column drop, typecasting, column filter, imputation and scaling) before prediction.

# Subsetting: drop bookkeeping columns and keep the true labels aside
testData <- testData[, -(1:7)]
testClass <- as.factor(testData$classe)

Cols <- colnames(testData)[-153]
# Typecasting: force every predictor to numeric (blank or character values become NA)
for (i in Cols) {
   testData[, i] <- as.numeric(as.character(testData[, i]))
}
# Apply the same column filter as on the training set
testData <- testData[, Blank_Cols]

# Impute and scale with the preProcess object fitted on the training set
testData <- data.frame(predict(preProcObj, testData))
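Because the models were fit with the formula interface on trainData, a cheap defensive check (not in the original) is that the processed test frame carries the same columns before predicting:

stopifnot(identical(colnames(testData), colnames(trainData)))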

pred_rPART <- predict(mod_rPART, newdata = testData)
pred_GBM <- predict(mod_GBM, newdata = testData)
pred_RF <- predict(mod_RF, newdata = testData)

# confusionMatrix(data, reference): predictions first, true labels second
T_rPART <- confusionMatrix(pred_rPART, testClass)
T_GBM <- confusionMatrix(pred_GBM, testClass)
T_RF <- confusionMatrix(pred_RF, testClass)
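
The tables below can be assembled from each confusionMatrix object: the per-class rows come from the $byClass matrix and the overall figures from $overall, for example:

T_RF$byClass[, c("Sensitivity", "Specificity")]
T_RF$overall[c("Accuracy", "Kappa")]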
Per-class sensitivity on the test set:

| Models | Class-A | Class-B | Class-C | Class-D | Class-E |
|---|---|---|---|---|---|
| rPART | 0.4222283 | NA | 0.2982558 | NA | 0.9979757 |
| Random Forest | 0.9934524 | 0.9774306 | 0.9841584 | 0.9906250 | 0.9981533 |
| Gradient Boosting (GBM) | 0.9767025 | 0.9332748 | 0.9122807 | 0.9014523 | 0.9676525 |

Per-class specificity on the test set:

| Models | Class-A | Class-B | Class-C | Class-D | Class-E |
|---|---|---|---|---|---|
| rPART | 0.9439928 | 0.8064571 | 0.8768307 | 0.8361937 | 0.8907438 |
| Random Forest | 0.9988109 | 0.9972533 | 0.9934359 | 0.9973604 | 0.9997918 |
| Gradient Boosting (GBM) | 0.9919259 | 0.9715550 | 0.9823009 | 0.9951229 | 0.9883406 |

Overall test-set accuracy and kappa:

| Models | Accuracy | Kappa_Est |
|---|---|---|
| rPART | 0.4343246 | 0.2519242 |
| Random Forest | 0.9891249 | 0.9862416 |
| Gradient Boosting (GBM) | 0.9430756 | 0.9279746 |

The random forest is the clear winner at roughly 98.9% accuracy, implying an expected out-of-sample error of about 1.1%; GBM follows at about 94.3%, while the single rpart tree performs poorly (its NA entries correspond to classes it never predicts).

Predicting with the models on the validation set :

Validation_Data <- read.csv("./Data/pml-testing.csv", header = TRUE) # comma-separated, so read.csv rather than read.csv2
Validation_Data <- Validation_Data[, -(1:7)]
Problem <- Validation_Data$problem_id

Cols <- colnames(Validation_Data)[-153]
for (i in Cols) {
   Validation_Data[, i] <- as.numeric(as.character(Validation_Data[, i]))
}
# Apply the same column filter as on the training set
Validation_Data <- Validation_Data[, Blank_Cols]

# Impute and scale with the training-set preProcess object
Validation_Data <- data.frame(predict(preProcObj, Validation_Data))
Val_pred_rPART <- predict(mod_rPART, newdata = Validation_Data)
Val_pred_GBM <- predict(mod_GBM, newdata = Validation_Data)
Val_pred_RF <- predict(mod_RF, newdata = Validation_Data)
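
The predictions are factors with levels A-E; in the table below they appear as the underlying integer level codes (1 = A, 2 = B, ..., 5 = E). Recovering the letters is one call:

as.character(Val_pred_RF) # "B" "A" "B" ... rather than 2 1 2 ...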
Predicted classes for the 20 validation problems:

| Models | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | X12 | X13 | X14 | X15 | X16 | X17 | X18 | X19 | X20 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| rPART | 3 | 1 | 1 | 1 | 1 | 3 | 3 | 1 | 1 | 1 | 3 | 3 | 3 | 1 | 3 | 1 | 1 | 1 | 1 | 3 |
| Random Forest | 2 | 1 | 2 | 1 | 1 | 5 | 4 | 2 | 1 | 1 | 2 | 3 | 2 | 1 | 5 | 5 | 1 | 2 | 2 | 2 |
| Gradient Boosting (GBM) | 2 | 1 | 2 | 1 | 1 | 5 | 4 | 2 | 1 | 1 | 2 | 3 | 2 | 1 | 5 | 5 | 1 | 2 | 2 | 2 |

The random forest and GBM models agree on all 20 problems, while the rpart tree, consistent with its weak test-set accuracy, diverges on most of them.

Concluding remarks :