The scripts were developed, tested, and executed solely in the following environment:
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] magrittr_1.5 formatR_1.2 tools_3.2.1 htmltools_0.2.6
## [5] yaml_2.1.13 stringi_0.4-1 rmarkdown_0.7.3 knitr_1.10
## [9] stringr_1.0.0 digest_0.6.8 evaluate_0.7
The goal of this project is to predict the manner in which a person performs an exercise, using data from wearable fitness devices.
The training data for this project are available here: https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv
The test data are available here: https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv
The outcome (Y) is the “classe” variable (dependent) in the training set; the remaining columns are the independent variables.
## Loading the Data
train.url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test.url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
if (file.exists("pml-training.csv")) {
training <- read.csv("pml-training.csv", na.strings=c("NA","#DIV/0!",""))
} else {
download.file(train.url,"pml-training.csv")
training <- read.csv("pml-training.csv", na.strings=c("NA","#DIV/0!",""))
}
if (file.exists("pml-testing.csv")) {
testing <- read.csv("pml-testing.csv", na.strings=c("NA","#DIV/0!",""))
} else {
download.file(test.url,"pml-testing.csv")
testing <- read.csv("pml-testing.csv", na.strings=c("NA","#DIV/0!",""))
}
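A quick sanity check on what was read (a minimal sketch; the expected dimensions are those of the published course files):
dim(training)  # expected: 19622 rows x 160 columns
dim(testing)   # expected: 20 rows x 160 columns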
library(abind)
library(arm)
## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lme4
## Loading required package: Rcpp
##
## arm (Version 1.8-4, built: 2015-04-07)
##
## Working directory is /Users/Bill/Dropbox/DataScience/8MachineLearning
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(kernlab)
library(klaR)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 3.4.1 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
library(rpart)
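If any of these packages are missing, they can be installed from CRAN first (a one-time setup step, shown as a sketch):
install.packages(c("abind", "arm", "caret", "kernlab", "klaR",
                   "rattle", "randomForest", "rpart"))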
It is not stated in the assignment whether the testing and training datasets share the same column names, so I check that they agree, excluding the final column (which is "classe" in the training set and "problem_id" in the testing set).
all.equal(colnames(testing)[-ncol(testing)], colnames(training)[-ncol(training)])
## [1] TRUE
The predictor names are indeed the same.
# Explore the data (output omitted here for brevity)
summary(training)
str(training)
names(training)
# Remove the near-zero-variance predictors
nearzero <- nearZeroVar(training, saveMetrics = TRUE) # from the caret package
training <- training[, !nearzero$nzv]
# Remove the variables with more than 50% missing data
md <- sapply(colnames(training), function(x) mean(is.na(training[, x])) > 0.50)
training <- training[, !md]
# Remove bookkeeping variables (row id, user name, timestamps) that should not be used for prediction
training <- training[, -(1:6)]
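After these steps the training set should contain 52 numeric predictors plus "classe", with no missing values; a quick check (a minimal sketch, printing whatever the run actually produced):
dim(training)        # expect 53 columns: 52 predictors + classe
sum(is.na(training)) # expect 0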
# Find the highly correlated variables (|r| > 0.8)
corr.analysis <- findCorrelation(cor(training[, -53]), cutoff=0.8)
corr.analysis
## [1] 10 1 9 36 8 2 21 34 25 19 46 31
names(training[corr.analysis])
## [1] "accel_belt_z" "roll_belt" "accel_belt_y"
## [4] "accel_dumbbell_z" "accel_belt_x" "pitch_belt"
## [7] "accel_arm_x" "accel_dumbbell_x" "magnet_arm_y"
## [10] "gyros_arm_y" "gyros_forearm_z" "gyros_dumbbell_x"
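If a decorrelated feature set were desired, these columns could simply be dropped; a sketch (not applied here, since the models below are trained on all 52 predictors):
# Optional: remove the highly correlated predictors identified above
training.decor <- training[, -corr.analysis]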
# PCA via eigen-decomposition of the correlation matrix.
# The (n-1)/n factor is the population (maximum-likelihood) scaling; it
# multiplies every eigenvalue by the same constant, so the percentage and
# cumulative rows below are unaffected by it.
n <- nrow(training[, -53])
S <- (n-1)/n * cov(training[, -53])  # population covariance matrix
#print(S)
R <- (n-1)/n * cor(training[, -53])
#print(R)
ei <- eigen(R)
per <- ei$values / sum(ei$values)  # proportion of variance per component
cumper <- cumsum(per)              # cumulative proportion
table <- rbind(ei$values, per, cumper)
rownames(table) <- c('eigenvalues', 'percentage', 'Cumulative Per')
print(table)
## [,1] [,2] [,3] [,4] [,5]
## eigenvalues 8.3560546 8.1028988 4.67578119 4.12942713 3.65177222
## percentage 0.1607015 0.1558329 0.08992345 0.07941611 0.07022997
## Cumulative Per 0.1607015 0.3165345 0.40645792 0.48587403 0.55610399
## [,6] [,7] [,8] [,9] [,10]
## eigenvalues 3.00340653 2.23984658 2.07271393 1.71714322 1.5087446
## percentage 0.05776076 0.04307617 0.03986191 0.03302367 0.0290158
## Cumulative Per 0.61386476 0.65694092 0.69680284 0.72982651 0.7588423
## [,11] [,12] [,13] [,14] [,15]
## eigenvalues 1.38542732 1.12918399 0.98662428 0.8906573 0.83601603
## percentage 0.02664419 0.02171618 0.01897451 0.0171289 0.01607805
## Cumulative Per 0.78548650 0.80720268 0.82617719 0.8433061 0.85938414
## [,16] [,17] [,18] [,19] [,20]
## eigenvalues 0.78921111 0.67790053 0.60968912 0.53240414 0.484816243
## percentage 0.01517791 0.01303721 0.01172539 0.01023906 0.009323864
## Cumulative Per 0.87456205 0.88759926 0.89932465 0.90956371 0.918887579
## [,21] [,22] [,23] [,24]
## eigenvalues 0.425619142 0.398574899 0.382675187 0.339283597
## percentage 0.008185401 0.007665293 0.007359513 0.006525017
## Cumulative Per 0.927072979 0.934738272 0.942097785 0.948622802
## [,25] [,26] [,27] [,28]
## eigenvalues 0.307690839 0.292949305 0.255978568 0.236240801
## percentage 0.005917433 0.005633928 0.004922916 0.004543324
## Cumulative Per 0.954540236 0.960174163 0.965097079 0.969640403
## [,29] [,30] [,31] [,32]
## eigenvalues 0.203435616 0.179879937 0.170105135 0.131735634
## percentage 0.003912423 0.003459406 0.003271419 0.002533507
## Cumulative Per 0.973552825 0.977012231 0.980283651 0.982817157
## [,33] [,34] [,35] [,36]
## eigenvalues 0.121825897 0.112441291 0.091976768 0.079714760
## percentage 0.002342925 0.002162443 0.001768874 0.001533054
## Cumulative Per 0.985160082 0.987322525 0.989091399 0.990624454
## [,37] [,38] [,39] [,40]
## eigenvalues 0.064208114 0.056534418 0.055185207 0.0407997580
## percentage 0.001234834 0.001087256 0.001061308 0.0007846507
## Cumulative Per 0.991859288 0.992946544 0.994007852 0.9947925025
## [,41] [,42] [,43] [,44]
## eigenvalues 0.0381015323 0.035455902 0.0337259313 0.032213765
## percentage 0.0007327591 0.000681879 0.0006486087 0.000619527
## Cumulative Per 0.9955252616 0.996207141 0.9968557493 0.997475276
## [,45] [,46] [,47] [,48]
## eigenvalues 0.0287155119 0.0268520914 0.021660795 0.0205948370
## percentage 0.0005522495 0.0005164127 0.000416575 0.0003960747
## Cumulative Per 0.9980275258 0.9985439385 0.998960514 0.9993565883
## [,49] [,50] [,51] [,52]
## eigenvalues 0.01347095 0.0118746988 0.0059612331 0.0021488217
## percentage 0.00025907 0.0002283712 0.0001146449 0.0000413256
## Cumulative Per 0.99961566 0.9998440295 0.9999586744 1.0000000000
# Scree plot of the eigenvalues
plot(1:52, ei$values, type='o', xlab="Principal component", ylab="Eigenvalue")
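The scree plot and the cumulative percentages above suggest that about 19 components already explain roughly 90% of the variance. caret can derive this threshold directly; a minimal sketch (the 0.90 threshold is my choice for illustration):
# PCA preprocessing keeping enough components for 90% of the variance
pca.pp <- preProcess(training[, -53], method = "pca", thresh = 0.90)
pca.pp$numComp  # number of components retained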
# 3-fold cross-validation
tc <- trainControl(method = "cv", number = 3, verboseIter=FALSE, preProcOptions="pca", allowParallel=TRUE)
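Note that preProcOptions on its own does not apply PCA; it only sets options for a preprocessing step requested elsewhere. If PCA-compressed predictors were actually wanted, the method would be passed to train() directly (a sketch, not run here):
# Example only: random forest trained on PCA-compressed predictors
rf.pca <- train(classe ~ ., data = training, method = "rf",
                trControl = tc, preProcess = "pca")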
# Random Forest
rf <- train(classe ~ ., data = training, method = "rf", trControl= tc)
# Support Vector Machine (radial)
svmr <- train(classe ~ ., data = training, method = "svmRadial", trControl= tc)
# Support Vector Machine (linear)
svml <- train(classe ~ ., data = training, method = "svmLinear", trControl= tc)
# Neural network
NN <- train(classe ~ ., data = training, method = "nnet", trControl= tc, verbose=FALSE)
# Bayesian generalized linear model (the warnings below come from the underlying binomial glm fit)
bayesglm <- train(classe ~ ., data = training, method = "bayesglm", trControl= tc)
## Warning: fitted probabilities numerically 0 or 1 occurred
## Warning: fitted probabilities numerically 0 or 1 occurred
## Warning: fitted probabilities numerically 0 or 1 occurred
## Warning: fitted probabilities numerically 0 or 1 occurred
# Boosted logistic regression (LogitBoost)
logitboost <- train(classe ~ ., data = training, method = "LogitBoost", trControl= tc)
# Decision tree, fitted directly with rpart (note: this does not go through
# caret, so it produces no cross-validated accuracy or kappa)
dt <- rpart(classe ~ ., data=training)
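For a comparison on equal footing, the tree could also be trained through caret so that it reports cross-validated accuracy and kappa like the other models (a sketch; its results are not part of the table below):
# Decision tree via caret with the same 3-fold CV
dt.cv <- train(classe ~ ., data = training, method = "rpart", trControl = tc)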
# Collect cross-validated performance. The names are listed in the same order
# as the accuracy/kappa values below; the plain rpart fit (dt) carries no
# caret $results, so it is omitted from this comparison.
model <- c("Random Forest", "SVM (radial)", "SVM (linear)", "LogitBoost",
           "Neural Net", "Bayes GLM")
Accuracy <- c(max(rf$results$Accuracy),
              max(svmr$results$Accuracy),
              max(svml$results$Accuracy),
              max(logitboost$results$Accuracy),
              max(NN$results$Accuracy),
              max(bayesglm$results$Accuracy))
Kappa <- c(max(rf$results$Kappa),
           max(svmr$results$Kappa),
           max(svml$results$Kappa),
           max(logitboost$results$Kappa),
           max(NN$results$Kappa),
           max(bayesglm$results$Kappa))
performance <- cbind(model, Accuracy, Kappa)
print(performance)
## model Accuracy Kappa
## [1,] "Random Forest" "0.99266121279077" "0.990716287137332"
## [2,] "SVM (radial)" "0.926154487626936" "0.906404277297741"
## [3,] "SVM (linear)" "0.784731259158689" "0.726318465619078"
## [4,] "LogitBoost" "0.904672056854091" "0.878911710156537"
## [5,] "Neural Net" "0.418257159692622" "0.261237039902888"
## [6,] "Bayes GLM" "0.40250725409442" "0.235825489112588"
From the results above, the random forest has the best cross-validated accuracy, at over 99%, so it is chosen as the final model; the expected out-of-sample error is accordingly below 1%.
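A more conservative out-of-sample estimate would hold out a validation set before fitting; a minimal sketch of that approach (the 75/25 split and the seed are my assumptions for illustration):
set.seed(1234)  # arbitrary seed, for reproducibility
inTrain <- createDataPartition(training$classe, p = 0.75, list = FALSE)
train.sub <- training[inTrain, ]
valid.sub <- training[-inTrain, ]
rf.sub <- train(classe ~ ., data = train.sub, method = "rf", trControl = tc)
confusionMatrix(predict(rf.sub, valid.sub), valid.sub$classe)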
prediction <- predict(rf, testing)
The predictions for the 20 cases in the testing data are:
print(prediction)
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Write one file per test case for submission
setwd("~/Dropbox/DataScience/8MachineLearning/")
pml_write_files <- function(x) {
  n <- length(x)
  for (i in 1:n) {
    filename <- paste0(i, ".txt")
    write.table(x[i], file = filename, quote = FALSE,
                row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(prediction)