library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 3.4.1 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
set.seed(2048)
filesDirectory="./Data"
options(warn=-1)
if (!file.exists(filesDirectory)) {
dir.create(filesDirectory)
}
trainUrl <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
testUrl <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
trainFile <- "train.csv"
testFile <- "test.csv"
trainFilePath <- paste(filesDirectory, trainFile, sep = "/")
testFilePath <- paste(filesDirectory, testFile, sep = "/")
# Download Data
download.file(trainUrl, destfile = trainFilePath, method="curl")
download.file(testUrl, destfile = testFilePath, method="curl")
# Load Data
training <- read.csv(trainFilePath, na.strings=c("NA","#DIV/0!",""))
testing <- read.csv(testFilePath, na.strings=c("NA","#DIV/0!",""))
inTrain <- createDataPartition(y=training$classe, p=0.6, list=FALSE)
myTraining <- training[inTrain, ]; myTesting <- training[-inTrain, ]
dim(myTraining); dim(myTesting)
## [1] 11776 160
## [1] 7846 160
# remove zero values
nsv <- nearZeroVar(training, saveMetrics = T)
training <- training[, !nsv$nzv]
# remove missing values
nav <- sapply(colnames(training), function(x) if(sum(is.na(training[, x])) > 0.8*nrow(training)){return(T)}else{return(F)})
training <- training[, !nav]
cor <- abs(sapply(colnames(training[, -ncol(training)]), function(x) cor(as.numeric(training[, x]), as.numeric(training$classe), method = "spearman")))
head(cor)
## X user_name raw_timestamp_part_1
## 0.976646598 0.012000639 0.201244517
## raw_timestamp_part_2 cvtd_timestamp num_window
## 0.014748691 0.131346255 0.005798365
plot(training[, names(which.max(cor))], training[, names(which.max(cor[-which.max(cor)]))], col = training$classe, pch = 19, cex = 0.1, xlab = names(which.max(cor)), ylab = names(which.max(cor[-which.max(cor)])))
Test using Boosting method, then compare results with Random Trees ## Boosting Basic ideas are: 1. Take lots of (possibly) weak predictors 2. Weight them and add them up 3. Get a stronger predictor
# set the seet
set.seed(123)
# train the data using gbm method
boostFit <- train(classe ~ ., method = "gbm", data = training, verbose = F, trControl = trainControl(method = "cv", number = 10))
## Loading required package: gbm
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1
## Loading required package: plyr
# display the results from our model
boostFit
## Stochastic Gradient Boosting
##
## 19622 samples
## 58 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 17660, 17661, 17659, 17660, 17658, 17660, ...
##
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa Accuracy SD
## 1 50 0.9998981 0.9998711 0.0002147923
## 1 100 0.9998471 0.9998067 0.0002461172
## 1 150 0.9998471 0.9998067 0.0002461172
## 2 50 0.9998471 0.9998067 0.0002461172
## 2 100 0.9998471 0.9998067 0.0002461172
## 2 150 0.9998471 0.9998067 0.0002461172
## 3 50 0.9998471 0.9998067 0.0002461172
## 3 100 0.9998471 0.9998067 0.0002461172
## 3 150 0.9998471 0.9998067 0.0002461172
## Kappa SD
## 0.000271656
## 0.000311286
## 0.000311286
## 0.000311286
## 0.000311286
## 0.000311286
## 0.000311286
## 0.000311286
## 0.000311286
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth
## = 1 and shrinkage = 0.1.
# plot the results
plot(boostFit, ylim = c(0.9, 1))
# set the seed
set.seed(123)
# call caret model using random forest
rfFit <- train(classe ~ ., method = "rf", data = training, importance = T, trControl = trainControl(method = "cv", number = 10))
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# examine the results
rfFit
## Random Forest
##
## 19622 samples
## 58 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 17660, 17661, 17659, 17660, 17658, 17660, ...
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9971974 0.9964551 0.0022688160 0.0028697342
## 41 0.9999490 0.9999355 0.0001612584 0.0002039895
## 80 0.9998470 0.9998065 0.0003441560 0.0004353440
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 41.
# plot random forest fit
plot(rfFit, ylim = c(0.9, 1))
# examine the importance of the fit
imp <- varImp(rfFit)$importance
imp$max <- apply(imp, 1, max)
imp <- imp[order(imp$max, decreasing = T), ]
When we compare the analysis from Random Forest versus Boosting we observe that * The out of sample error observed in the training model was 0.04% and the prediction using the test data were the same
# Final Model
rfFit$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry, importance = ..1)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 41
##
## OOB estimate of error rate: 0.01%
## Confusion matrix:
## A B C D E class.error
## A 5580 0 0 0 0 0.0000000000
## B 1 3796 0 0 0 0.0002633658
## C 0 1 3421 0 0 0.0002922268
## D 0 0 0 3216 0 0.0000000000
## E 0 0 0 0 3607 0.0000000000
# prediction
(prediction <- as.character(predict(rfFit, testing)))
## [1] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [18] "A" "A" "A"
Complete the assignment by writing out the results to our prediction files
# write prediction files
pml_write_files = function(x){
n = length(x)
for(i in 1:n){
filename = paste0("./prediction/problem_id_", i, ".txt")
write.table(x[i], file = filename, quote = FALSE, row.names = FALSE, col.names = FALSE)
}
}
pml_write_files(prediction)