MachineLearning.Rmd

Loading data.

# Labelled training data. The download is skipped when a local copy already
# exists, so re-running the document does not fetch the file again.
fileURL <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
if (!file.exists("pml-training.csv")) {
  download.file(fileURL, destfile = "pml-training.csv", method = "curl")
}
# "NA", "#DIV/0!" and empty cells are all read as missing values.
# stringsAsFactors = TRUE is spelled out because its default became FALSE in
# R 4.0.0; the modelling code below needs text columns (including the outcome
# `classe`) to be factors.
training <- read.csv("pml-training.csv", header = TRUE,
                     na.strings = c("NA", "#DIV/0!", ""),
                     stringsAsFactors = TRUE)

# Testing data, loaded exactly the same way as the training data.
fileURL1 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
if (!file.exists("pml-testing.csv")) {
  download.file(fileURL1, destfile = "pml-testing.csv", method = "curl")
}
testing <- read.csv("pml-testing.csv", header = TRUE,
                    na.strings = c("NA", "#DIV/0!", ""),
                    stringsAsFactors = TRUE)

Libraries and seed for reproducibility. Partitioning the training dataset into two subsets, one for training and one for testing.

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(rpart)
library(rattle)

## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

## 
## Attaching package: 'rattle'

## The following object is masked from 'package:randomForest':
## 
##     importance

# Fix the RNG seed so the random partition (and everything after it) can be
# reproduced.
set.seed(1234)
# Row indices for the model-building subset: p = 0.6 puts 60% of the labelled
# rows in it, sampled with respect to the outcome `classe`. list = FALSE asks
# for the indices in a form that can be used directly for row subsetting.
inTrain <- createDataPartition(y = training$classe, p = 0.6, list = FALSE)
myTrain <- training[inTrain, ]   # rows used to fit the model
myTest <- training[-inTrain, ]   # remaining rows, held out for validation
# Show the dimensions of the separate testing dataset.
dim(testing)

## [1]  20 160

Remove every column in which 60 percent or more of the values are NA.

# Drop the first column of the training subset.
# NOTE(review): presumably this is the row-index column of the CSV - confirm.
myTrain <- myTrain[-1]

# Fraction of missing values in each remaining column.
na_fraction <- colMeans(is.na(myTrain))

# Keep only the columns with fewer than 60% missing values. This is done in
# one vectorized step instead of deleting columns from a data frame while
# looping over it and matching names by regular expression. drop = FALSE
# guarantees the result is still a data frame, however many columns survive.
trainingV3 <- myTrain[, na_fraction < 0.6, drop = FALSE]
myTrain <- trainingV3

# clean1: every retained column, including the outcome `classe`.
# clean2: the retained predictors only; the outcome is excluded by name rather
# than by a hard-coded column position.
clean1 <- colnames(myTrain)
clean2 <- setdiff(clean1, "classe")

# Restrict the validation and testing sets to the same columns.
myTest <- myTest[clean1]
testing <- testing[clean2]



# Coercion
# Give the columns of `testing` the same types (and factor levels) as the
# matching training predictors, so the fitted model accepts `testing` as new
# data. This is done by temporarily stacking one training row on top of
# `testing`: rbind() reconciles the column types and factor levels of the two
# data frames. The borrowed row is then removed again.
# The predictor columns are selected by name (everything except the outcome
# `classe`) rather than by a hard-coded column index.
predictor_names <- setdiff(names(myTrain), "classe")
testing <- rbind(myTrain[2, predictor_names, drop = FALSE], testing)
testing <- testing[-1, ]

Machine Learning

A random forest model is fitted to the training subset.

# Fit a random forest that predicts `classe` from all other columns of myTrain.
# `.` on the right-hand side already leaves out the response, so no explicit
# `- classe` term is needed.
model <- randomForest(classe ~ ., data = myTrain)

Predictions

The held-out part of the training data (`myTest`) is used to assess the model's effectiveness.

# Predict the class of every held-out row, then compare the predictions with
# the known `classe` labels of those rows.
prediction <- predict(object = model, newdata = myTest, type = "class")
confusionMatrix(data = prediction, reference = myTest$classe)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 2232    2    0    0    0
##          B    0 1516    4    0    0
##          C    0    0 1364    5    0
##          D    0    0    0 1280    0
##          E    0    0    0    1 1442
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9985          
##                  95% CI : (0.9973, 0.9992)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9981          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9987   0.9971   0.9953   1.0000
## Specificity            0.9996   0.9994   0.9992   1.0000   0.9998
## Pos Pred Value         0.9991   0.9974   0.9963   1.0000   0.9993
## Neg Pred Value         1.0000   0.9997   0.9994   0.9991   1.0000
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2845   0.1932   0.1738   0.1631   0.1838
## Detection Prevalence   0.2847   0.1937   0.1745   0.1631   0.1839
## Balanced Accuracy      0.9998   0.9990   0.9982   0.9977   0.9999

Final Predictions

Predictions for the untouched testing dataset.

# Predict the class of each row of the testing dataset and display the result.
# The previous second line re-printed the confusion matrix of the validation
# predictions (`prediction` vs. `myTest$classe`), so the final predictions were
# computed but never shown.
# NOTE: the rendered output that follows this chunk has to be regenerated by
# re-knitting the document.
testpredictions <- predict(model, testing, type = "class")
testpredictions

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 2232    2    0    0    0
##          B    0 1516    4    0    0
##          C    0    0 1364    5    0
##          D    0    0    0 1280    0
##          E    0    0    0    1 1442
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9985          
##                  95% CI : (0.9973, 0.9992)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9981          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9987   0.9971   0.9953   1.0000
## Specificity            0.9996   0.9994   0.9992   1.0000   0.9998
## Pos Pred Value         0.9991   0.9974   0.9963   1.0000   0.9993
## Neg Pred Value         1.0000   0.9997   0.9994   0.9991   1.0000
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2845   0.1932   0.1738   0.1631   0.1838
## Detection Prevalence   0.2847   0.1937   0.1745   0.1631   0.1839
## Balanced Accuracy      0.9998   0.9990   0.9982   0.9977   0.9999