This markdown is based on the Leaf Classification competition being run on Kaggle.
The objective of this markdown is to use binary leaf images and extracted features, including shape, margin & texture, to accurately identify 99 species of plants. Leaves, due to their volume, prevalence, and unique characteristics, are an effective means of differentiating plant species. They also provide a fun introduction to applying techniques that involve image-based features.
I will solve the problem using a Naive Bayes classifier, an XGBoost classifier, and a basic deep learning model implemented on the h2o platform. The performance and results of all three methods will then be compared against one another.
library(caret)
library(caTools)
library(xgboost)
library(MLmetrics)
library(h2o)
library(e1071)
library(ggplot2)
# Load the Kaggle Leaf Classification data.
#   train_raw: labelled rows (id, species, then the margin/shape/texture features)
#   validate : the Kaggle test set (id, then the same features; no species)
# stringsAsFactors is set explicitly: since R 4.0.0 read.csv() no longer turns
# strings into factors by default, and the classification calls further down
# expect `species` to be a factor.
train_raw <- read.csv("/Users/aditya/Desktop/Kaggle/Leaf/train.csv",
                      stringsAsFactors = TRUE)
validate <- read.csv("/Users/aditya/Desktop/Kaggle/Leaf/test.csv",
                     stringsAsFactors = TRUE)
# Count the NA values in every column (vapply guarantees an integer vector)
vapply(train_raw, function(x) sum(is.na(x)), integer(1))
## id species margin1 margin2 margin3 margin4 margin5
## 0 0 0 0 0 0 0
## margin6 margin7 margin8 margin9 margin10 margin11 margin12
## 0 0 0 0 0 0 0
## margin13 margin14 margin15 margin16 margin17 margin18 margin19
## 0 0 0 0 0 0 0
## margin20 margin21 margin22 margin23 margin24 margin25 margin26
## 0 0 0 0 0 0 0
## margin27 margin28 margin29 margin30 margin31 margin32 margin33
## 0 0 0 0 0 0 0
## margin34 margin35 margin36 margin37 margin38 margin39 margin40
## 0 0 0 0 0 0 0
## margin41 margin42 margin43 margin44 margin45 margin46 margin47
## 0 0 0 0 0 0 0
## margin48 margin49 margin50 margin51 margin52 margin53 margin54
## 0 0 0 0 0 0 0
## margin55 margin56 margin57 margin58 margin59 margin60 margin61
## 0 0 0 0 0 0 0
## margin62 margin63 margin64 shape1 shape2 shape3 shape4
## 0 0 0 0 0 0 0
## shape5 shape6 shape7 shape8 shape9 shape10 shape11
## 0 0 0 0 0 0 0
## shape12 shape13 shape14 shape15 shape16 shape17 shape18
## 0 0 0 0 0 0 0
## shape19 shape20 shape21 shape22 shape23 shape24 shape25
## 0 0 0 0 0 0 0
## shape26 shape27 shape28 shape29 shape30 shape31 shape32
## 0 0 0 0 0 0 0
## shape33 shape34 shape35 shape36 shape37 shape38 shape39
## 0 0 0 0 0 0 0
## shape40 shape41 shape42 shape43 shape44 shape45 shape46
## 0 0 0 0 0 0 0
## shape47 shape48 shape49 shape50 shape51 shape52 shape53
## 0 0 0 0 0 0 0
## shape54 shape55 shape56 shape57 shape58 shape59 shape60
## 0 0 0 0 0 0 0
## shape61 shape62 shape63 shape64 texture1 texture2 texture3
## 0 0 0 0 0 0 0
## texture4 texture5 texture6 texture7 texture8 texture9 texture10
## 0 0 0 0 0 0 0
## texture11 texture12 texture13 texture14 texture15 texture16 texture17
## 0 0 0 0 0 0 0
## texture18 texture19 texture20 texture21 texture22 texture23 texture24
## 0 0 0 0 0 0 0
## texture25 texture26 texture27 texture28 texture29 texture30 texture31
## 0 0 0 0 0 0 0
## texture32 texture33 texture34 texture35 texture36 texture37 texture38
## 0 0 0 0 0 0 0
## texture39 texture40 texture41 texture42 texture43 texture44 texture45
## 0 0 0 0 0 0 0
## texture46 texture47 texture48 texture49 texture50 texture51 texture52
## 0 0 0 0 0 0 0
## texture53 texture54 texture55 texture56 texture57 texture58 texture59
## 0 0 0 0 0 0 0
## texture60 texture61 texture62 texture63 texture64
## 0 0 0 0 0
# Remove near-zero-variance predictors, which contribute almost nothing to the
# variability in the data. The check is restricted to the feature columns
# (id and species are excluded) and flagged columns are dropped BY NAME, so the
# same predictors are removed from `train_raw` and `validate` even though the
# two frames have different column layouts (validate has no species column).
feature_cols <- setdiff(names(train_raw), c("id", "species"))
zero_var <- nearZeroVar(train_raw[, feature_cols, drop = FALSE],
                        saveMetrics = TRUE)
nzv_cols <- rownames(zero_var)[zero_var$nzv]
train_raw <- train_raw[, !(names(train_raw) %in% nzv_cols), drop = FALSE]
validate <- validate[, !(names(validate) %in% nzv_cols), drop = FALSE]
# Split predictors from identifier / label columns
train_X <- train_raw[, -(1:2)] # drop id and species
train_Y <- train_raw[, 2]      # species label
test_X <- validate[, -1]       # drop id
test_Y <- validate[, 1]        # NOTE: this is the test-set id column, not a label
data <- train_raw[, -(1:2)]
covX <- cov(data) # Covariance matrix
# NOTE(review): prcomp() is given the feature covariance matrix rather than the
# observations themselves, so the "variance explained" below describes the rows
# of covX, not the raw data. Left unchanged because the number of components
# chosen further down is based on this output -- confirm whether prcomp(data)
# was intended.
pca <- prcomp(covX) # Perform PCA
# Variance explained: per-component share and running total
var_exp <- as.data.frame(pca$sdev^2 / sum(pca$sdev^2))
var_exp <- cbind(seq_len(ncol(data)), var_exp, cumsum(var_exp[, 1]))
colnames(var_exp) <- c("Principal_Components", "Variance", "Cumulative_Variance")
# Plotting the variance curves
# Individual variance (first 50 components only)
plot(var_exp$Principal_Components, var_exp$Variance, type = "b",
     xlim = c(0, 50), pch = 16,
     xlab = "Principal Components", ylab = "Variance",
     main = "Principal Components vs Variance")
# Variance table
var_exp[20:30, ]
## Principal_Components Variance Cumulative_Variance
## 20 20 0.0017112945 0.9877123
## 21 21 0.0015509401 0.9892632
## 22 22 0.0014059423 0.9906692
## 23 23 0.0010389882 0.9917082
## 24 24 0.0009286826 0.9926368
## 25 25 0.0008952483 0.9935321
## 26 26 0.0007773609 0.9943095
## 27 27 0.0006309739 0.9949404
## 28 28 0.0006008982 0.9955413
## 29 29 0.0004934950 0.9960348
## 30 30 0.0004129614 0.9964478
We can see from the chart and the table above that we can use 22 components in order to explain over 99% of the variance in the data.
# Loadings of the first 22 principal components
# (one row per feature, one column per retained component).
pca_fin <- pca$rotation[, seq_len(22)]

# Project observations onto the retained components.
#   X: data frame or matrix with one column per feature, in the same order as
#      the rows of `pca_fin`.
# Returns a numeric matrix with nrow(X) rows and one column per component.
PCA <- function(X) {
  feature_matrix <- as.matrix(X)
  feature_matrix %*% pca_fin
}

train_pca_X <- PCA(train_X)
test_pca_X <- PCA(test_X)
# Tuning grid for caret's "xgbTree" method: 3 depths x 3 learning rates x
# 2 gamma values, with every other parameter held fixed.
xgb.grid <- expand.grid(
  nrounds = 100,
  max_depth = seq(5, 15, by = 5),
  eta = c(0.5, 0.2, 0.1),
  gamma = c(0, 0.5),
  colsample_bytree = 0.75,
  min_child_weight = 5,
  subsample = 0.66
)

# Resampling scheme: 3-fold cross-validation, keeping the results of every
# resample and computing class probabilities.
xgb.trcontrol <- trainControl(
  method = "cv",
  number = 3,
  verboseIter = TRUE,
  returnData = FALSE,
  returnResamp = "all",
  classProbs = TRUE,
  allowParallel = TRUE
)

# Fit the boosted-tree model on the PCA-projected training data and report
# how long the whole grid search takes.
system.time(
  xgb_m2 <- train(
    x = train_pca_X,
    y = train_Y,
    method = "xgbTree",
    tuneGrid = xgb.grid,
    trControl = xgb.trcontrol,
    verbose = 1
  )
)
## + Fold1: eta=0.1, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.1, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.1, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.1, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.1, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.1, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.1, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.1, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.1, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.1, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.1, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.1, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.2, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.2, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.2, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.2, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.2, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.2, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.2, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.2, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.2, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.2, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.2, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.2, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.5, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.5, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.5, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.5, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.5, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.5, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.5, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.5, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.5, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.5, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold1: eta=0.5, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold1: eta=0.5, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.1, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.1, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.1, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.1, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.1, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.1, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.1, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.1, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.1, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.1, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.1, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.1, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.2, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.2, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.2, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.2, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.2, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.2, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.2, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.2, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.2, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.2, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.2, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.2, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.5, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.5, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.5, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.5, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.5, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.5, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.5, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.5, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.5, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.5, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold2: eta=0.5, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold2: eta=0.5, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.1, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.1, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.1, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.1, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.1, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.1, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.1, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.1, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.1, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.1, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.1, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.1, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.2, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.2, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.2, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.2, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.2, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.2, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.2, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.2, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.2, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.2, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.2, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.2, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.5, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.5, max_depth= 5, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.5, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.5, max_depth= 5, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.5, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.5, max_depth=10, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.5, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.5, max_depth=10, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.5, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.5, max_depth=15, gamma=0.0, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## + Fold3: eta=0.5, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## - Fold3: eta=0.5, max_depth=15, gamma=0.5, colsample_bytree=0.75, min_child_weight=5, subsample=0.66, nrounds=100
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 100, max_depth = 15, eta = 0.5, gamma = 0.5, colsample_bytree = 0.75, min_child_weight = 5, subsample = 0.66 on full training set
## user system elapsed
## 1770.257 6.586 1791.523
# Class probabilities from the tuned XGBoost model for the rows it was trained
# on, scored with multi-class log loss (this is a training-set error).
pred <- predict(xgb_m2, newdata = train_pca_X, type = "prob")
error_xgb <- MultiLogLoss(
  y_true = train_Y,
  y_pred = as.matrix(pred)
)
print(error_xgb)
## [1] 0.6413769
# Combine the species label with the PCA-projected predictors; data.frame()
# names the label column `train_raw.species`, which the formula relies on.
pca_data <- data.frame(train_raw$species, train_pca_X)
# Fit a naive Bayes classifier on all projected components and time the fit.
system.time(
  NB <- naiveBayes(train_raw.species ~ ., pca_data)
)
## user system elapsed
## 0.080 0.001 0.081
# Posterior class probabilities from the naive Bayes model on the training
# predictors (label column 1 removed), scored with multi-class log loss.
pred <- predict(NB, newdata = pca_data[, -1], type = "raw")
error_nb <- MultiLogLoss(
  y_true = pca_data[, 1],
  y_pred = as.matrix(pred)
)
print(error_nb)
## [1] 0.1723988
# Set the id columns aside and remove them from both frames so they are not
# used as predictors; after this, species is the first column of train_raw.
train.id <- train_raw[["id"]]
train_raw[["id"]] <- NULL
test.id <- validate[["id"]]
validate[["id"]] <- NULL
# The test set carries no labels; add an all-NA species column.
validate[["species"]] <- NA
# Create (or connect to) a local H2O instance, used below to build the deep
# learning layers and to make predictions.
localH2O <- h2o.init(max_mem_size = "12g")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 38 minutes 2 seconds
## H2O cluster version: 3.10.0.8
## H2O cluster version age: 3 months and 6 days
## H2O cluster name: H2O_started_from_R_aditya_dio958
## H2O cluster total nodes: 1
## H2O cluster total memory: 11.90 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 2
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## R Version: R version 3.3.1 (2016-06-21)
# Upload the training data frame (species + features) to the H2O cluster.
h2o.train <- as.h2o(x = train_raw)
##
|
| | 0%
|
|=================================================================| 100%
# Upload the unlabelled test data frame to the H2O cluster.
h2o.test <- as.h2o(x = validate)
##
|
| | 0%
|
|=================================================================| 100%
# set.seed() only seeds R's own random number generator; H2O trains in a
# separate process and takes its seed through the `seed` argument, so the same
# value is passed there as well.
# NOTE(review): per the H2O documentation, exact run-to-run reproducibility of
# deep learning additionally needs `reproducible = TRUE` (slower); not enabled
# here -- confirm whether it is wanted.
set.seed(13579)
## Deep learning with two hidden layers (1024, 512).
## Column 1 of h2o.train is the response (species); all remaining columns are
## predictors. The call is wrapped in system.time() to report training time.
system.time(model <- h2o.deeplearning(x = 2:ncol(h2o.train),
                                      y = 1,
                                      training_frame = h2o.train,
                                      activation = "TanhWithDropout",
                                      input_dropout_ratio = 0,
                                      hidden_dropout_ratios = c(0.1, 0.1),
                                      balance_classes = FALSE,
                                      hidden = c(1024, 512),
                                      epochs = 250,
                                      loss = "CrossEntropy",
                                      categorical_encoding = "OneHotInternal",
                                      seed = 13579))
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|=================================================================| 100%
## user system elapsed
## 1.002 0.062 77.216
# Persist the trained network.
# save() only stores the R-side handle to the model; the model itself lives in
# the H2O cluster and disappears when the cluster is shut down. The .RData file
# is still written for backward compatibility, and the model is additionally
# exported with h2o.saveModel() so it can be restored with h2o.loadModel().
save(model, file = "h2omodel.RData")
model_path <- h2o.saveModel(object = model, path = getwd(), force = TRUE)
# prediction
# Score the training frame. The former `type = 'raw'` argument was dropped:
# h2o.predict() has no such parameter and silently ignored it.
ytrain <- h2o.predict(model, h2o.train)
##
|
| | 0%
|
|=================================================================| 100%
# Multi-class log loss of the deep learning model on the training data.
# train_raw[, 1] is the species label (the id column was removed earlier);
# the first column of ytrain is dropped -- presumably the predicted label,
# leaving the per-class probabilities.
error_dl <- MultiLogLoss(
  y_true = train_raw[, 1],
  y_pred = as.matrix(ytrain[, -1])
)
print(error_dl)
## [1] 0.0007641991
# Shut the local H2O instance down. prompt = FALSE skips the interactive
# "Are you sure ... (Y/N)?" confirmation, which would otherwise stall a
# non-interactive run (e.g. knitting this document).
h2o.shutdown(prompt = FALSE)
## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?
# Collect the log-loss value of each model into one table and draw a bar
# chart with one bar per model (bar height = the stored error value).
error <- data.frame(
  Model = c("XGBoost", "Naive Bayes", "Deep Learning"),
  Error = c(error_xgb, error_nb, error_dl)
)
ggplot(data = error, mapping = aes(x = Model, y = Error)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  labs(title = "Comparison of Model Accuracy")