The scripts were developed, tested, and executed solely in the following environment:
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] magrittr_1.5 formatR_1.2 tools_3.2.1 htmltools_0.2.6
## [5] yaml_2.1.13 stringi_0.4-1 rmarkdown_0.7.3 knitr_1.10
## [9] stringr_1.0.0 digest_0.6.8 evaluate_0.7
The goal of this project is to predict the manner in which a person performs an exercise, using data from wearable fitness devices.
The training data for this project are available here: https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv
The test data are available here: https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv
The outcome (Y) is the “classe” variable (dependent) in the training set; the remaining columns are the independent variables.
## Loading the Data
train.url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test.url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
if (file.exists("pml-training.csv")) {
training <- read.csv("pml-training.csv", na.strings=c("NA","#DIV/0!",""))
} else {
download.file(train.url,"pml-training.csv")
training <- read.csv("pml-training.csv", na.strings=c("NA","#DIV/0!",""))
}
if (file.exists("pml-testing.csv")) {
testing <- read.csv("pml-testing.csv", na.strings=c("NA","#DIV/0!",""))
} else {
download.file(test.url,"pml-testing.csv")
testing <- read.csv("pml-testing.csv", na.strings=c("NA","#DIV/0!",""))
}
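A quick sanity check on what was read (a minimal sketch; the expected dimensions are those of the published course files):
dim(training)  # expected: 19622 rows x 160 columns
dim(testing)   # expected: 20 rows x 160 columns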
library(abind)
library(arm)
## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lme4
## Loading required package: Rcpp
##
## arm (Version 1.8-4, built: 2015-04-07)
##
## Working directory is /Users/Bill/Dropbox/DataScience/8MachineLearning
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(kernlab)
library(klaR)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 3.4.1 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
library(rpart)
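If any of these packages are missing, they can be installed from CRAN first (a one-time setup step, shown as a sketch):
install.packages(c("abind", "arm", "caret", "kernlab", "klaR",
                   "rattle", "randomForest", "rpart"))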
It is not stated in the assignment whether the testing and training datasets share the same column names, so I check that they agree, excluding the final column (which is "classe" in the training set and "problem_id" in the testing set).
all.equal(colnames(testing)[-ncol(testing)], colnames(training)[-ncol(training)])
## [1] TRUE
The predictor names are indeed the same.
# Explore the data (output omitted here for brevity)
summary(training)
str(training)
names(training)
# Remove the near-zero-variance predictors
nearzero <- nearZeroVar(training, saveMetrics = TRUE) # from the caret package
training <- training[, !nearzero$nzv]
# Remove the variables with more than 50% missing data
md <- sapply(colnames(training), function(x) mean(is.na(training[, x])) > 0.50)
training <- training[, !md]
# Remove bookkeeping variables (row id, user name, timestamps) that should not be used for prediction
training <- training[, -(1:6)]
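After these steps the training set should contain 52 numeric predictors plus "classe", with no missing values; a quick check (a minimal sketch, printing whatever the run actually produced):
dim(training)        # expect 53 columns: 52 predictors + classe
sum(is.na(training)) # expect 0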
# Find the highly correlated variables (|r| > 0.8)
corr.analysis <- findCorrelation(cor(training[, -53]), cutoff=0.8)
corr.analysis
## [1] 10 1 9 36 8 2 21 34 25 19 46 31
names(training[corr.analysis])
## [1] "accel_belt_z" "roll_belt" "accel_belt_y"
## [4] "accel_dumbbell_z" "accel_belt_x" "pitch_belt"
## [7] "accel_arm_x" "accel_dumbbell_x" "magnet_arm_y"
## [10] "gyros_arm_y" "gyros_forearm_z" "gyros_dumbbell_x"
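If a decorrelated feature set were desired, these columns could simply be dropped; a sketch (not applied here, since the models below are trained on all 52 predictors):
# Optional: remove the highly correlated predictors identified above
training.decor <- training[, -corr.analysis]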
# PCA via eigen-decomposition of the correlation matrix.
# The (n-1)/n factor is the population (maximum-likelihood) scaling; it
# multiplies every eigenvalue by the same constant, so the percentage and
# cumulative rows below are unaffected by it.
n <- nrow(training[, -53])
S <- (n-1)/n * cov(training[, -53])  # population covariance matrix
#print(S)
R <- (n-1)/n * cor(training[, -53])
#print(R)
ei <- eigen(R)
per <- ei$values / sum(ei$values)  # proportion of variance per component
cumper <- cumsum(per)              # cumulative proportion
table <- rbind(ei$values, per, cumper)
rownames(table) <- c('eigenvalues', 'percentage', 'Cumulative Per')
print(table)
## [,1] [,2] [,3] [,4] [,5]
## eigenvalues 8.3560546 8.1028988 4.67578119 4.12942713 3.65177222
## percentage 0.1607015 0.1558329 0.08992345 0.07941611 0.07022997
## Cumulative Per 0.1607015 0.3165345 0.40645792 0.48587403 0.55610399
## [,6] [,7] [,8] [,9] [,10]
## eigenvalues 3.00340653 2.23984658 2.07271393 1.71714322 1.5087446
## percentage 0.05776076 0.04307617 0.03986191 0.03302367 0.0290158
## Cumulative Per 0.61386476 0.65694092 0.69680284 0.72982651 0.7588423
## [,11] [,12] [,13] [,14] [,15]
## eigenvalues 1.38542732 1.12918399 0.98662428 0.8906573 0.83601603
## percentage 0.02664419 0.02171618 0.01897451 0.0171289 0.01607805
## Cumulative Per 0.78548650 0.80720268 0.82617719 0.8433061 0.85938414
## [,16] [,17] [,18] [,19] [,20]
## eigenvalues 0.78921111 0.67790053 0.60968912 0.53240414 0.484816243
## percentage 0.01517791 0.01303721 0.01172539 0.01023906 0.009323864
## Cumulative Per 0.87456205 0.88759926 0.89932465 0.90956371 0.918887579
## [,21] [,22] [,23] [,24]
## eigenvalues 0.425619142 0.398574899 0.382675187 0.339283597
## percentage 0.008185401 0.007665293 0.007359513 0.006525017
## Cumulative Per 0.927072979 0.934738272 0.942097785 0.948622802
## [,25] [,26] [,27] [,28]
## eigenvalues 0.307690839 0.292949305 0.255978568 0.236240801
## percentage 0.005917433 0.005633928 0.004922916 0.004543324
## Cumulative Per 0.954540236 0.960174163 0.965097079 0.969640403
## [,29] [,30] [,31] [,32]
## eigenvalues 0.203435616 0.179879937 0.170105135 0.131735634
## percentage 0.003912423 0.003459406 0.003271419 0.002533507
## Cumulative Per 0.973552825 0.977012231 0.980283651 0.982817157
## [,33] [,34] [,35] [,36]
## eigenvalues 0.121825897 0.112441291 0.091976768 0.079714760
## percentage 0.002342925 0.002162443 0.001768874 0.001533054
## Cumulative Per 0.985160082 0.987322525 0.989091399 0.990624454
## [,37] [,38] [,39] [,40]
## eigenvalues 0.064208114 0.056534418 0.055185207 0.0407997580
## percentage 0.001234834 0.001087256 0.001061308 0.0007846507
## Cumulative Per 0.991859288 0.992946544 0.994007852 0.9947925025
## [,41] [,42] [,43] [,44]
## eigenvalues 0.0381015323 0.035455902 0.0337259313 0.032213765
## percentage 0.0007327591 0.000681879 0.0006486087 0.000619527
## Cumulative Per 0.9955252616 0.996207141 0.9968557493 0.997475276
## [,45] [,46] [,47] [,48]
## eigenvalues 0.0287155119 0.0268520914 0.021660795 0.0205948370
## percentage 0.0005522495 0.0005164127 0.000416575 0.0003960747
## Cumulative Per 0.9980275258 0.9985439385 0.998960514 0.9993565883
## [,49] [,50] [,51] [,52]
## eigenvalues 0.01347095 0.0118746988 0.0059612331 0.0021488217
## percentage 0.00025907 0.0002283712 0.0001146449 0.0000413256
## Cumulative Per 0.99961566 0.9998440295 0.9999586744 1.0000000000
# Scree plot of the eigenvalues
plot(1:52, ei$values, type='o', xlab="Principal component", ylab="Eigenvalue")
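The scree plot and the cumulative percentages above suggest that about 19 components already explain roughly 90% of the variance. caret can derive this threshold directly; a minimal sketch (the 0.90 threshold is my choice for illustration):
# PCA preprocessing keeping enough components for 90% of the variance
pca.pp <- preProcess(training[, -53], method = "pca", thresh = 0.90)
pca.pp$numComp  # number of components retained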
# 3-fold cross-validation
tc <- trainControl(method = "cv", number = 3, verboseIter=FALSE, preProcOptions="pca", allowParallel=TRUE)
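Note that preProcOptions on its own does not apply PCA; it only sets options for a preprocessing step requested elsewhere. If PCA-compressed predictors were actually wanted, the method would be passed to train() directly (a sketch, not run here):
# Example only: random forest trained on PCA-compressed predictors
rf.pca <- train(classe ~ ., data = training, method = "rf",
                trControl = tc, preProcess = "pca")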
# Random Forest
rf <- train(classe ~ ., data = training, method = "rf", trControl= tc)
# Support Vector Machine (radial)
svmr <- train(classe ~ ., data = training, method = "svmRadial", trControl= tc)
# Support Vector Machine (linear)
svml <- train(classe ~ ., data = training, method = "svmLinear", trControl= tc)
# Neural network
NN <- train(classe ~ ., data = training, method = "nnet", trControl= tc, verbose=FALSE)
# Bayesian generalized linear model (the warnings below come from the underlying binomial glm fit)
bayesglm <- train(classe ~ ., data = training, method = "bayesglm", trControl= tc)
## Warning: fitted probabilities numerically 0 or 1 occurred
## Warning: fitted probabilities numerically 0 or 1 occurred
## Warning: fitted probabilities numerically 0 or 1 occurred
## Warning: fitted probabilities numerically 0 or 1 occurred
# Boosted logistic regression (LogitBoost)
logitboost <- train(classe ~ ., data = training, method = "LogitBoost", trControl= tc)
# Decision tree, fitted directly with rpart (note: this does not go through
# caret, so it produces no cross-validated accuracy or kappa)
dt <- rpart(classe ~ ., data=training)
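For a comparison on equal footing, the tree could also be trained through caret so that it reports cross-validated accuracy and kappa like the other models (a sketch; its results are not part of the table below):
# Decision tree via caret with the same 3-fold CV
dt.cv <- train(classe ~ ., data = training, method = "rpart", trControl = tc)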
# Collect cross-validated performance. The names are listed in the same order
# as the accuracy/kappa values below; the plain rpart fit (dt) carries no
# caret $results, so it is omitted from this comparison.
model <- c("Random Forest", "SVM (radial)", "SVM (linear)", "LogitBoost",
           "Neural Net", "Bayes GLM")
Accuracy <- c(max(rf$results$Accuracy),
              max(svmr$results$Accuracy),
              max(svml$results$Accuracy),
              max(logitboost$results$Accuracy),
              max(NN$results$Accuracy),
              max(bayesglm$results$Accuracy))
Kappa <- c(max(rf$results$Kappa),
           max(svmr$results$Kappa),
           max(svml$results$Kappa),
           max(logitboost$results$Kappa),
           max(NN$results$Kappa),
           max(bayesglm$results$Kappa))
performance <- cbind(model, Accuracy, Kappa)
print(performance)
## model Accuracy Kappa
## [1,] "Random Forest" "0.99266121279077" "0.990716287137332"
## [2,] "SVM (radial)" "0.926154487626936" "0.906404277297741"
## [3,] "SVM (linear)" "0.784731259158689" "0.726318465619078"
## [4,] "LogitBoost" "0.904672056854091" "0.878911710156537"
## [5,] "Neural Net" "0.418257159692622" "0.261237039902888"
## [6,] "Bayes GLM" "0.40250725409442" "0.235825489112588"
From the results above, the random forest has the best cross-validated accuracy, at over 99%, so it is chosen as the final model; the expected out-of-sample error is accordingly below 1%.
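A more conservative out-of-sample estimate would hold out a validation set before fitting; a minimal sketch of that approach (the 75/25 split and the seed are my assumptions for illustration):
set.seed(1234)  # arbitrary seed, for reproducibility
inTrain <- createDataPartition(training$classe, p = 0.75, list = FALSE)
train.sub <- training[inTrain, ]
valid.sub <- training[-inTrain, ]
rf.sub <- train(classe ~ ., data = train.sub, method = "rf", trControl = tc)
confusionMatrix(predict(rf.sub, valid.sub), valid.sub$classe)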
prediction <- predict(rf, testing)
The predictions for the 20 cases in the testing data are:
print(prediction)
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Write one file per test case for submission
setwd("~/Dropbox/DataScience/8MachineLearning/")
pml_write_files <- function(x) {
  n <- length(x)
  for (i in 1:n) {
    filename <- paste0(i, ".txt")
    write.table(x[i], file = filename, quote = FALSE,
                row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(prediction)