Packages, Libraries and Seed

Installing the required packages, loading their libraries, and setting the seed for reproducibility:

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.3
library(RColorBrewer)
library(rattle)
## Warning: package 'rattle' was built under R version 4.0.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 4.0.3
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
## 
##     importance
set.seed(1234)
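
If any of these packages are missing, they can be installed first. A minimal sketch (package names taken from the library calls above) that only installs what is absent:

#Install any packages that are not yet present (run once, before the library calls)
pkgs <- c("caret", "randomForest", "rpart", "rpart.plot", "RColorBrewer", "rattle")
missing <- pkgs[!(pkgs %in% rownames(installed.packages()))]
if (length(missing) > 0) install.packages(missing)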

Getting and cleaning data

The training data set can be found on the following URL:

trainUrl <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"

The testing data set can be found on the following URL:

testUrl <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

Load the data into memory.

training <- read.csv(url(trainUrl), na.strings=c("NA","#DIV/0!",""))
testing <- read.csv(url(testUrl), na.strings=c("NA","#DIV/0!",""))
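
Reading straight from the URLs refetches the data on every run. As an optional alternative (the local file names are my own choice), the CSVs can be cached on disk first:

#Optional: download once, then read from disk on later runs
if (!file.exists("pml-training.csv")) download.file(trainUrl, destfile="pml-training.csv")
if (!file.exists("pml-testing.csv"))  download.file(testUrl,  destfile="pml-testing.csv")
training <- read.csv("pml-training.csv", na.strings=c("NA","#DIV/0!",""))
testing  <- read.csv("pml-testing.csv",  na.strings=c("NA","#DIV/0!",""))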

Partitioning the training set into two

Partition the training data set into two: 60% for myTraining, 40% for myTesting:

inTrain <- createDataPartition(y=training$classe, p=0.6, list=FALSE)
myTraining <- training[inTrain, ]; myTesting <- training[-inTrain, ]
dim(myTraining); dim(myTesting)
## [1] 11776   160
## [1] 7846  160

Cleaning the data

The following transformations were used to clean the data:

Transformation 1: Cleaning NearZeroVariance variables. Run this code to view possible NZV variables:

myDataNZV <- nearZeroVar(myTraining, saveMetrics=TRUE)

Build another subset with the NZV variables removed by name:

myNZVvars <- names(myTraining) %in% c("new_window", "kurtosis_roll_belt", "kurtosis_picth_belt",
"kurtosis_yaw_belt", "skewness_roll_belt", "skewness_roll_belt.1", "skewness_yaw_belt",
"max_yaw_belt", "min_yaw_belt", "amplitude_yaw_belt", "avg_roll_arm", "stddev_roll_arm",
"var_roll_arm", "avg_pitch_arm", "stddev_pitch_arm", "var_pitch_arm", "avg_yaw_arm",
"stddev_yaw_arm", "var_yaw_arm", "kurtosis_roll_arm", "kurtosis_picth_arm",
"kurtosis_yaw_arm", "skewness_roll_arm", "skewness_pitch_arm", "skewness_yaw_arm",
"max_roll_arm", "min_roll_arm", "min_pitch_arm", "amplitude_roll_arm", "amplitude_pitch_arm",
"kurtosis_roll_dumbbell", "kurtosis_picth_dumbbell", "kurtosis_yaw_dumbbell", "skewness_roll_dumbbell",
"skewness_pitch_dumbbell", "skewness_yaw_dumbbell", "max_yaw_dumbbell", "min_yaw_dumbbell",
"amplitude_yaw_dumbbell", "kurtosis_roll_forearm", "kurtosis_picth_forearm", "kurtosis_yaw_forearm",
"skewness_roll_forearm", "skewness_pitch_forearm", "skewness_yaw_forearm", "max_roll_forearm",
"max_yaw_forearm", "min_roll_forearm", "min_yaw_forearm", "amplitude_roll_forearm",
"amplitude_yaw_forearm", "avg_roll_forearm", "stddev_roll_forearm", "var_roll_forearm",
"avg_pitch_forearm", "stddev_pitch_forearm", "var_pitch_forearm", "avg_yaw_forearm",
"stddev_yaw_forearm", "var_yaw_forearm")
myTraining <- myTraining[!myNZVvars]

dim(myTraining)
## [1] 11776   100
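
As an aside, the same removal can be done programmatically from the nearZeroVar output instead of hardcoding the names. A sketch of that approach (the resulting column set may differ slightly from the hand-picked list above):

#Hedged alternative: drop NZV columns by index rather than by name
nzvIdx <- nearZeroVar(myTraining) #default call returns the column indices of NZV variables
if (length(nzvIdx) > 0) myTraining <- myTraining[, -nzvIdx]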

Transformation 2: Removing the first column of the data set (the ID variable), so that it does not interfere with the ML algorithms:

myTraining <- myTraining[c(-1)]

Transformation 3: Cleaning variables with too many NAs. Variables with more than 60% NAs are left out:

trainingV3 <- myTraining #creating another subset to iterate over in the loop
for(i in 1:length(myTraining)) { #for every column in the training data set
        if( sum( is.na( myTraining[, i] ) ) /nrow(myTraining) >= .6 ) { #if the number of NAs > 60% of total observations
        for(j in 1:length(trainingV3)) {
            if( length( grep(names(myTraining[i]), names(trainingV3)[j]) ) ==1)  { #if the column names match:
                trainingV3 <- trainingV3[ , -j] #remove that column
            }   
        } 
    }
}
#Check the new dimensions:
dim(trainingV3)
## [1] 11776    58
#Setting back to our set:
myTraining <- trainingV3
rm(trainingV3)  
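
For reference, the nested loop above can be replaced by one vectorized step. A sketch under the same 60% threshold (shown for illustration only; it is not run in this analysis):

#Hedged alternative to the loop: drop columns whose NA fraction is 60% or more
naFrac <- colMeans(is.na(myTraining))
myTraining <- myTraining[, naFrac < 0.6]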

Now apply the same transformations to the myTesting and testing data sets, by keeping only the columns retained in myTraining:

clean1 <- colnames(myTraining)
clean2 <- colnames(myTraining[, -58]) #same columns with the classe column (58) removed, since testing has no classe
myTesting <- myTesting[clean1]
testing <- testing[clean2]

#Check the new dimensions:
dim(myTesting)
## [1] 7846   58
#Check the new dimensions:
dim(testing)
## [1] 20 57

In order to ensure that the Decision Tree and especially the Random Forest algorithms work properly on the provided test set, we need to coerce its columns to the same classes as the training data.

for (i in 1:length(testing) ) {            #for every column in the testing set
        for(j in 1:length(myTraining)) {   #find the column with the same name in myTraining
        if( names(testing)[i] == names(myTraining)[j] )  {
            class(testing[[i]]) <- class(myTraining[[j]]) #coerce to the training column's class
        }      
    }      
}
#And to make sure the coercion really worked, a simple trick:
testing <- rbind(myTraining[2, -58] , testing) #row 2 is arbitrary; rbind forces matching column classes and factor levels, and the extra row is removed right away:
testing <- testing[-1,]
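
As a quick sanity check (my addition, not part of the original workflow), the column classes of the two sets can be compared directly:

#Should be TRUE if the coercion worked: every testing column has the training class
identical(sapply(testing, class), sapply(myTraining[, -58], class))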

Using ML algorithms for prediction: Decision Tree

modFitA1 <- rpart(classe ~ ., data=myTraining, method="class")

To view the decision tree with fancyRpartPlot:

fancyRpartPlot(modFitA1)

Predicting:

predictionsA1 <- predict(modFitA1, myTesting, type = "class")

Using a confusion matrix to test the results:

confusionMatrix(predictionsA1, as.factor(myTesting$classe))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 2157   68   10    1    0
##          B   60 1265   73   67    0
##          C   15  177 1261  141   70
##          D    0    8   15  962  111
##          E    0    0    9  115 1261
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8802          
##                  95% CI : (0.8728, 0.8873)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8484          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9664   0.8333   0.9218   0.7481   0.8745
## Specificity            0.9859   0.9684   0.9378   0.9796   0.9806
## Pos Pred Value         0.9647   0.8635   0.7578   0.8777   0.9105
## Neg Pred Value         0.9866   0.9604   0.9827   0.9520   0.9720
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2749   0.1612   0.1607   0.1226   0.1607
## Detection Prevalence   0.2850   0.1867   0.2121   0.1397   0.1765
## Balanced Accuracy      0.9762   0.9009   0.9298   0.8638   0.9276

Using ML algorithms for prediction: Random Forests

myTraining$classe <- factor(myTraining$classe) #the outcome must be a factor for classification
modFitB1 <- randomForest(classe ~. , data=myTraining)
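
Before predicting, it can be useful to inspect the fitted forest itself (an aside, not in the original write-up): printing the model shows the out-of-bag (OOB) error estimate, and varImpPlot shows which sensor variables matter most:

print(modFitB1)      #reports the OOB error estimate and the per-class confusion matrix
varImpPlot(modFitB1) #plots variable importance (mean decrease in Gini)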

Predicting on the held-out myTesting set (an estimate of the out-of-sample error):

predictionsB1 <- predict(modFitB1, myTesting, type = "class")

Using a confusion matrix to test the results:

confusionMatrix(predictionsB1, as.factor(myTesting$classe))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 2232    5    0    0    0
##          B    0 1513    3    0    0
##          C    0    0 1363    7    0
##          D    0    0    2 1278    2
##          E    0    0    0    1 1440
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9975          
##                  95% CI : (0.9961, 0.9984)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9968          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9967   0.9963   0.9938   0.9986
## Specificity            0.9991   0.9995   0.9989   0.9994   0.9998
## Pos Pred Value         0.9978   0.9980   0.9949   0.9969   0.9993
## Neg Pred Value         1.0000   0.9992   0.9992   0.9988   0.9997
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2845   0.1928   0.1737   0.1629   0.1835
## Detection Prevalence   0.2851   0.1932   0.1746   0.1634   0.1837
## Balanced Accuracy      0.9996   0.9981   0.9976   0.9966   0.9992

Random Forests yielded better results.

Generating Files to submit as answers for the Assignment

Finally, we apply the chosen model to the provided test set to produce the out-of-sample predictions.

We use the Random Forest model, which yielded the much better accuracy on the validation set:

predictionsB2 <- predict(modFitB1, testing, type = "class")

Function to generate files with the predictions to submit for the assignment:

pml_write_files = function(x){
  n = length(x)
  for(i in 1:n){
    filename = paste0("problem_id_",i,".txt") #one file per test case
    write.table(x[i],file=filename,quote=FALSE,row.names=FALSE,col.names=FALSE) #write the single prediction
  }
}

pml_write_files(predictionsB2)