Loading data.
# Fetch the two CSV files only when they are not already on disk, so that
# re-running the analysis does not hit the network again. The download method
# is left to R's default instead of forcing the external `curl` binary, which
# is not installed on every platform.
fileURL <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
if (!file.exists("pml-training.csv")) {
  download.file(fileURL, destfile = "pml-training.csv")
}
# Treat "NA", "#DIV/0!" and empty cells as missing values when parsing.
training <- read.csv(
  "pml-training.csv",
  header = TRUE,
  na.strings = c("NA", "#DIV/0!", "")
)

fileURL1 <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
if (!file.exists("pml-testing.csv")) {
  download.file(fileURL1, destfile = "pml-testing.csv")
}
# Same missing-value markers as for the training file.
testing <- read.csv(
  "pml-testing.csv",
  header = TRUE,
  na.strings = c("NA", "#DIV/0!", "")
)
Load the libraries and set a seed for reproducibility. Partition the training dataset into two subsets: one for training the model and one for testing it.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
# Fix the RNG state so the random split below is reproducible.
set.seed(1234)
# Row indices for a 60% sample of `training`, drawn on the `classe` column;
# `list = FALSE` asks for the indices in a non-list form usable for row
# subsetting.
inTrain <- createDataPartition(y = training$classe, p = 0.6, list = FALSE)
# Rows selected by the partition become the model-fitting subset ...
myTrain <- training[inTrain, ]
# ... and the remaining rows become the held-out subset.
myTest <- training[-inTrain, ]
# Report the dimensions of the separate testing data frame.
dim(testing)
## [1] 20 160
Remove every column in which 60 percent or more of the values are NA.
# Drop the first column of the training subset (presumably a row-index
# column; confirm against the raw CSV).
myTrain <- myTrain[-1]

# Fraction of missing values per column, computed in one vectorized pass.
na_fraction <- colMeans(is.na(myTrain))

# Keep only columns whose NA fraction is below 60%. Selecting with a logical
# mask avoids using column names as regular expressions and avoids deleting
# columns from a data frame while iterating over it. `drop = FALSE` keeps the
# result a data frame even if a single column survives.
trainingV3 <- myTrain[, na_fraction < 0.6, drop = FALSE]
myTrain <- trainingV3
# Column names that survived the cleaning of the training subset.
clean1 <- colnames(myTrain)
# Predictor names only: exclude the response `classe` by name rather than by
# assuming it sits in a fixed position after the first 58 columns.
clean2 <- setdiff(clean1, "classe")
# The held-out subset keeps the same columns as the training subset,
# including the response.
myTest <- myTest[clean1]
# The testing data frame is reduced to the predictor columns.
testing <- testing[clean2]
# Coercion
# Make the column types (and factor levels) of `testing` agree with those of
# the training subset. The previous element-by-element loop only assigned the
# class "data.frame" to one-column data frames, which changed nothing; the
# alignment is actually achieved by binding one training row on top of
# `testing`, which makes rbind() reconcile every column against the training
# column, and then removing that borrowed row again.
# The template row is selected by the column names of `testing` instead of by
# dropping a hard-coded column position.
template_row <- myTrain[2, names(testing), drop = FALSE]
testing <- rbind(template_row, testing)
testing <- testing[-1, , drop = FALSE]
A random forest model is fitted to the training subset.
# Fit a random forest with `classe` as the response and the columns of
# `myTrain` as predictors.
# NOTE(review): the `- classe` term looks redundant, since `.` in a formula
# already stands for the columns not otherwise in the formula; confirm before
# simplifying.
model <- randomForest(classe ~ . - classe,data = myTrain)
The held-out portion of the training data is used to evaluate the model's effectiveness.
# Predicted class for every row of the held-out subset.
prediction <- predict(model, newdata = myTest, type = "class")
# Cross-tabulate the predictions against the known `classe` labels and
# report the accuracy statistics.
confusionMatrix(data = prediction, reference = myTest$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2232 2 0 0 0
## B 0 1516 4 0 0
## C 0 0 1364 5 0
## D 0 0 0 1280 0
## E 0 0 0 1 1442
##
## Overall Statistics
##
## Accuracy : 0.9985
## 95% CI : (0.9973, 0.9992)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9981
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 0.9987 0.9971 0.9953 1.0000
## Specificity 0.9996 0.9994 0.9992 1.0000 0.9998
## Pos Pred Value 0.9991 0.9974 0.9963 1.0000 0.9993
## Neg Pred Value 1.0000 0.9997 0.9994 0.9991 1.0000
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2845 0.1932 0.1738 0.1631 0.1838
## Detection Prevalence 0.2847 0.1937 0.1745 0.1631 0.1839
## Balanced Accuracy 0.9998 0.9990 0.9982 0.9977 0.9999
Predictions are then made on the untouched testing dataset.
# Predict the class of every row of the testing data frame with the fitted
# forest.
testpredictions <- predict(model, testing, type = "class")
# Display the predictions themselves. `testing` was reduced to predictor
# columns earlier, so it has no reference labels for a confusion matrix. The
# call that used to stand here re-evaluated `prediction` against
# `myTest$classe`, which only repeated the held-out results and never showed
# `testpredictions`.
testpredictions
## (Output not recorded here: re-run this chunk to display the predictions.
## The output previously shown was a duplicate of the held-out confusion
## matrix above.)