Set up session for Deep Learning:

Set working directory:

setwd("~/Documents/R")

Clear R environment objects:

rm(list=ls())

Read in the JSON dump of Chris's Python dictionary and convert it to four dataframes:

# require(jsonlite)
# 
# dat <- fromJSON("data/text.txt")
# 
# zero <- dat[[1]]
# one <- dat[[2]]
# two <- dat[[3]]
# three <- dat[[4]]

Union these dataframes into one:

# all.dat <- data.frame(
#   rbind(
#     zero,
#     one,
#     two,
#     three
#   )
# )
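If the four frames ever disagree on column order or carry list-columns, dplyr's bind_rows() is a more forgiving union than rbind(); a minimal sketch using the same objects:

require(dplyr)
all.dat <- bind_rows(zero, one, two, three)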

Data Wrangling for Deep Learning:

require(dplyr)
# library(tidyr)
# 
# #isolate the layer columns
# layers <- data.frame(all.dat[,c(8,12,16,20,24,28)])
# 
# #choose an example that is not null
# fix <- layers[6,1]
# 
# #fill it with zeros, retaining its shape
# for(i in 1:nrow(fix[[1]])){
#   for(j in 1:ncol(fix[[1]])){
#     fix[[1]][i,j] <- 0
#   }
# }
# 
# #replace any null values with this matrix of zeroes
# nullToZero <- function(x) {
#     x[sapply(x, is.null)] <- fix
#     return(x)
# }
# 
# layers <- as.matrix(layers)
# layers <- nullToZero(layers)
# 
# layers <- cbind(all.dat[,1:3],layers)
# 
# #unnest each layer individually:
# #//TODO: create a function for this (a sketch follows this block)
# 
# #LAYER 0
# 
# a <- rep(NA,264)
# 
# for(i in layers$layer0){
#   add <- as.vector(unlist(i))
#   a <- rbind(a,add)
# }
# 
# a <- a[-1,]
# 
# layer0 <- cbind(all.dat[,c(1:7,9:11,13:15,17:19,21:23,25:27)],a)
# 
# #LAYER 1
# 
# a <- rep(NA,264)
# 
# for(i in layers$layer1){
#   add <- as.vector(unlist(i))
#   a <- rbind(a,add)
# }
# 
# a <- a[-1,]
# 
# layer0 <- cbind(layer0,a)
# 
# #LAYER 2
# 
# a <- rep(NA,264)
# 
# for(i in layers$layer2){
#   add <- as.vector(unlist(i))
#   a <- rbind(a,add)
# }
# 
# a <- a[-1,]
# 
# layer0 <- cbind(layer0,a)
# 
# #LAYER 3
# 
# a <- rep(NA,264)
# 
# for(i in layers$layer3){
#   add <- as.vector(unlist(i))
#   a <- rbind(a,add)
# }
# 
# a <- a[-1,]
# 
# layer0 <- cbind(layer0,a)
# 
# #LAYER 4
# 
# a <- rep(NA,264)
# 
# for(i in layers$layer4){
#   add <- as.vector(unlist(i))
#   a <- rbind(a,add)
# }
# 
# a <- a[-1,]
# 
# layer0 <- cbind(layer0,a)
# 
# #LAYER 5
# 
# a <- rep(NA,264)
# 
# for(i in layers$layer5){
#   add <- as.vector(unlist(i))
#   a <- rbind(a,add)
# }
# 
# a <- a[-1,]
# 
# layer0 <- cbind(layer0,a)
# 
# #make all missing values 0
# layer0[is.na(layer0)] <- 0
# 
# my.dat <- layer0[,4:100]
# 
# #make the outcome variable categorical
# my.dat$pdgCode <- as.factor(my.dat$pdgCode)
# 
# #save the manipulated dataset and clear the R environment
# save(my.dat,file="NNdata.RData")
# rm(list=ls())
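As the TODO above notes, the six per-layer blocks repeat one unnest-and-append pattern. A sketch of a helper that could replace them, assuming each layer element unlists to 264 values as in the loops above:

#unnest one list-column of layers into a 264-column numeric matrix
unnest_layer <- function(layer_col, width = 264){
  out <- matrix(NA, nrow = 0, ncol = width)
  for(i in layer_col){
    out <- rbind(out, as.vector(unlist(i)))
  }
  out
}

#the six blocks above would then collapse to roughly:
#layer0 <- all.dat[,c(1:7,9:11,13:15,17:19,21:23,25:27)]
#for(nm in paste0("layer",0:5)){
#  layer0 <- cbind(layer0, unnest_layer(layers[[nm]]))
#}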

Scale the wrangled data and remove the classes we do not need (pdgCode ±11):

#load the manipulated data set
load("NNdata.RData")
#scale numerical predictor variables
my.dat[,-1] <- scale(my.dat[,-1])
my.dat <- my.dat %>%
  subset(pdgCode!=-11) %>%
  subset(pdgCode!=11)
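subset() keeps the removed codes around as empty factor levels; one may want to drop them before handing the data to H2O. A one-line sketch:

#drop the now-empty factor levels so they are not carried along as zero-count classes
my.dat$pdgCode <- droplevels(my.dat$pdgCode)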

DEEP LEARNING:

require(h2o)
h2o.init(max_mem_size = "28G",nthreads = -1)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         50 minutes 21 seconds 
##     H2O cluster timezone:       Africa/Johannesburg 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.20.0.2 
##     H2O cluster version age:    2 months and 28 days  
##     H2O cluster name:           H2O_started_from_R_metamorphica_fuq880 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   24.68 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.4 (2018-03-15)
h2o.removeAll()
## [1] 0
#h2o.no_progress()

Upload data to H2O cluster, split into training (60%), validation (20%) and test (20%) sets:

dat.hex <- as.h2o(my.dat,"dat.hex")
splitz <- h2o.splitFrame(dat.hex,ratios=c(0.6,0.2),
                         destination_frames = c("train.hex","valid.hex","test.hex"))

train.hex <- splitz[[1]]
valid.hex <- splitz[[2]]
test.hex <- splitz[[3]]
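h2o.splitFrame splits probabilistically, so the realised proportions are close to, but not exactly, 60/20/20; a quick check (a sketch):

#realised split proportions
c(train = nrow(train.hex), valid = nrow(valid.hex), test = nrow(test.hex)) / nrow(dat.hex)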

Build the first neural network:

# nn_1 <- h2o.deeplearning(
#   x= 2:97,
#   y= 1,
#   model_id = "nn_1",
#   training_frame = train.hex,
#   validation_frame = valid.hex,
#   hidden = c(200,200),
#   nfolds=10,
#   standardize = F,
#   epochs=100,
#   fast_mode = F,
#   sparse = T
# )
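Before trying other shapes, it is worth checking nn_1's validation metrics and its training curve with the standard h2o accessors; a sketch:

h2o.performance(nn_1, valid = TRUE)  #metrics computed on valid.hex
h2o.scoreHistory(nn_1)               #scoring history over training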

Build a different architecture:

# nn_2 <- h2o.deeplearning(
#   x= 2:97,
#   y= 1,
#   model_id = "nn_2",
#   training_frame = train.hex,
#   validation_frame = valid.hex,
#   hidden = 500,
#   nfolds=10,
#   standardize = F,
#   fast_mode = F,
#   sparse = T,
#   epochs=100
# )

Build a different architecture:

# nn_3 <- h2o.deeplearning(
#   x= 2:97,
#   y= 1,
#   model_id = "nn_3",
#   training_frame = train.hex,
#   validation_frame = valid.hex,
#   hidden = c(32,32,32,32,32,32,32,32,32,32),
#   nfolds=10,
#   standardize = F,
#   fast_mode = F,
#   sparse = T,
#   epochs=100
# )

Compare the models to find the best-performing architecture (a grid-search alternative is sketched after the hand-tuned models):

# h2o.mean_per_class_error(nn_1)
# h2o.mean_per_class_error(nn_2)
# h2o.mean_per_class_error(nn_3)
# 
# h2o.confusionMatrix(nn_1)
# h2o.confusionMatrix(nn_2)
# h2o.confusionMatrix(nn_3)
# 
# 
# h2o.confusionMatrix(nn_1,newdata = test.hex)
# h2o.confusionMatrix(nn_2,newdata = test.hex)
# h2o.confusionMatrix(nn_3,newdata = test.hex)
# 
# plot(nn_1)
# plot(nn_2)
# plot(nn_3)
# nn_4 <- h2o.deeplearning(
#   x= 2:97,
#   y= 1,
#   model_id = "nn_4",
#   training_frame = train.hex,
#   validation_frame = valid.hex,
#   activation = "TanhWithDropout",
#   hidden = c(500,500),
#   nfolds=10,
#   standardize = F,
#   fast_mode = F,
#   sparse = T,
#   epochs=200,
#   balance_classes = T
# )
# 
# h2o.performance(nn_4,test.hex)
# plot(nn_4)
# nn_5 <- h2o.deeplearning(
#   x= 2:97,
#   y= 1,
#   model_id = "nn_5",
#   training_frame = train.hex,
#   validation_frame = valid.hex,
#   activation = "RectifierWithDropout",
#   hidden = c(500,500),
#   nfolds=10,
#   standardize = F,
#   fast_mode = F,
#   sparse = T,
#   epochs=200,
#   balance_classes = T
# )
# 
# h2o.performance(nn_5,test.hex)
# plot(nn_5)
# nn_6 <- h2o.deeplearning(
#   x= 2:97,
#   y= 1,
#   model_id = "nn_6",
#   training_frame = train.hex,
#   validation_frame = valid.hex,
#   activation = "MaxoutWithDropout",
#   hidden = c(500,500),
#   nfolds=10,
#   standardize = F,
#   fast_mode = F,
#   sparse = T,
#   epochs=200,
#   balance_classes = T
# )
# 
# h2o.performance(nn_6,test.hex)
# plot(nn_6)
# h2o.saveModel(nn_1,"nn1.RData")
# h2o.saveModel(nn_2,"nn2.RData")
# h2o.saveModel(nn_3,"nn3.RData")
# h2o.saveModel(nn_4,"nn4.RData")
# h2o.saveModel(nn_5,"nn5.RData")
# h2o.saveModel(nn_6,"nn6.RData")
# h2o.performance(nn_1,dat.hex)
# h2o.performance(nn_2,dat.hex)
# h2o.performance(nn_3,dat.hex)
# h2o.performance(nn_4,dat.hex)
# h2o.performance(nn_5,dat.hex)
# h2o.performance(nn_6,dat.hex)
# nn_7 <- h2o.deeplearning(
#   x= 2:97,
#   y= 1,
#   model_id = "nn_7",
#   training_frame = train.hex,
#   validation_frame = valid.hex,
#   activation = "RectifierWithDropout",
#   hidden = c(500,500,500,500),
#   nfolds=10,
#   standardize = F,
#   fast_mode = F,
#   sparse = T,
#   epochs=200,
#   balance_classes = T,
#   l1=1e-06
# )
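Trying architectures one at a time, as above, gets tedious; H2O's grid search can sweep the same choices automatically. A minimal sketch over the same frames (the hidden layouts and activations are illustrative values, not the exact ones used above):

nn_grid <- h2o.grid(
  algorithm = "deeplearning",
  grid_id = "nn_grid",
  x = 2:97,
  y = 1,
  training_frame = train.hex,
  validation_frame = valid.hex,
  hyper_params = list(
    hidden = list(c(200,200), c(500,500), rep(32,10)),
    activation = c("Rectifier","RectifierWithDropout")
  ),
  epochs = 100,
  standardize = FALSE,
  sparse = TRUE
)
#rank the grid models by mean per-class error on the validation frame
h2o.getGrid("nn_grid", sort_by = "mean_per_class_error", decreasing = FALSE)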

# h2o.saveModel(nn_7,"nn7.RData")

# h2o.performance(nn_7,dat.hex)
# plot(nn_7)
# nn_1 <- h2o.loadModel("nn_1")
# nn_2 <- h2o.loadModel("nn_2")
# nn_3 <- h2o.loadModel("nn_3")
# nn_4 <- h2o.loadModel("nn_4")
# nn_5 <- h2o.loadModel("nn_5")
# nn_6 <- h2o.loadModel("nn_6")
# nn_7 <- h2o.loadModel("nn_7")
# 
# p1 <- as.data.frame(h2o.predict(nn_1,dat.hex))
# p2 <- as.data.frame(h2o.predict(nn_2,dat.hex))
# p3 <- as.data.frame(h2o.predict(nn_3,dat.hex))
# p4 <- as.data.frame(h2o.predict(nn_4,dat.hex))
# p5 <- as.data.frame(h2o.predict(nn_5,dat.hex))
# p6 <- as.data.frame(h2o.predict(nn_6,dat.hex))
# p7 <- as.data.frame(h2o.predict(nn_7,dat.hex))
# 
# p1 <- p1[,3]
# p2 <- p2[,3]
# p3 <- p3[,3]
# p4 <- p4[,3]
# p5 <- p5[,3]
# p6 <- p6[,3]
# p7 <- p7[,3]
# 
# p <- data.frame(cbind(p1,p2,p3,p4,p5,p6,p7))
# 
# p <- scale(p)
# 
# my.dat <- data.frame(cbind(my.dat,p))
#h2o.shutdown(prompt=F)
# h2o.init(max_mem_size = "28G",nthreads = -1)
# h2o.removeAll()
# #h2o.no_progress()
# 
# dat.hex <- as.h2o(my.dat,"dat.hex")
# 
# splitz <- h2o.splitFrame(dat.hex,ratios=c(0.6,0.2),
#                          destination_frames = c("train.hex","valid.hex","test.hex"))
# 
# train.hex <- splitz[[1]]
# valid.hex <- splitz[[2]]
# test.hex <- splitz[[3]]
# rf1 <- h2o.randomForest(y=1,
#                         x=2:111,
#                         training_frame=train.hex,
#                         validation_frame=valid.hex,
#                         nfolds=30,
#                         ntrees=200)
# 
# h2o.performance(rf1,dat.hex)
# 
# p8 <- as.data.frame(h2o.predict(rf1,dat.hex))
# 
# plot(rf1)
# 
# p8 <- p8[,3]
# my.dat <- data.frame(my.dat,p8)
# 
# h2o.shutdown(prompt = F)
# k <- kmeans(x=my.dat[,-1],centers=10)
# k <- k$cluster
# k <- as.factor(k)
# 
# require(dummies)
# 
# k <- dummy(k)
# 
# my.dat <- data.frame(cbind(my.dat,k))
# 
# my.dat$k1 <- as.factor(my.dat$k1)
# my.dat$k2 <- as.factor(my.dat$k2)
# my.dat$k3 <- as.factor(my.dat$k3)
# my.dat$k4 <- as.factor(my.dat$k4)
# my.dat$k5 <- as.factor(my.dat$k5)
# my.dat$k6 <- as.factor(my.dat$k6)
# my.dat$k7 <- as.factor(my.dat$k7)
# my.dat$k8 <- as.factor(my.dat$k8)
# my.dat$k9 <- as.factor(my.dat$k9)
# my.dat$k10 <- as.factor(my.dat$k10)
#save(my.dat,file="mydat.RData")
rm(list=ls())
load("mydat.RData")
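As an aside, the k1-k10 dummy columns baked into mydat.RData were built with the dummies package above; base R's model.matrix() would give the same one-hot encoding if that package is unavailable (a sketch, assuming the cluster factor k from the k-means step):

#k is the factor of k-means cluster assignments from the earlier step
k.dummies <- model.matrix(~ k - 1)
colnames(k.dummies) <- paste0("k", 1:10)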

Support Vector Machines:

Linear Kernel:

require(e1071)
sv <- svm(pdgCode~.,data=my.dat,scale=F,kernel="linear")
svm.p <- data.frame(predict(sv,my.dat))
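The linear kernel above runs with the default cost of 1; if tuning were wanted, e1071's tune.svm() cross-validates over a small grid (a sketch only, and potentially slow on the full dataset):

sv.tune <- tune.svm(pdgCode ~ ., data = my.dat, kernel = "linear", cost = 10^(-1:2))
summary(sv.tune)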

Polynomial Kernel:

sv <- svm(pdgCode~.,data=my.dat,scale=F,kernel="polynomial")
svm.p <- data.frame(cbind(svm.p,predict(sv,my.dat)))

Radial Kernel:

require(e1071)
sv <- svm(pdgCode~.,data=my.dat,scale=F,kernel="radial")
svm.p <- data.frame(cbind(svm.p,predict(sv,my.dat)))

Sigmoid Kernel:

require(e1071)
sv <- svm(pdgCode~.,data=my.dat,scale=F,kernel="sigmoid")
svm.p <- data.frame(cbind(svm.p,predict(sv,my.dat)))
require(dummies)
svm.p <- as.data.frame(svm.p)
svm.p <- dummy.data.frame(svm.p)
my.dat <- data.frame(cbind(my.dat,svm.p))
#convert the dummy factors back to numeric 0/1 (via as.character, to avoid the 1/2 factor level codes)
for(i in paste0("k", 1:10)){
  my.dat[,i] <- as.numeric(as.character(my.dat[,i]))
}

pc <- princomp(my.dat[,-1])
biplot(pc)

std_dev <- pc$sdev

pr_var <- std_dev^2

prop_varex <- pr_var/sum(pr_var)

plot(prop_varex, xlab = "Principal Component",
             ylab = "Proportion of Variance Explained",
             type = "b")

plot(cumsum(prop_varex), xlab = "Principal Component",
              ylab = "Cumulative Proportion of Variance Explained",
              type = "b")

abline(v=min(which(cumsum(prop_varex)>=.99)),col="red")

min(which(cumsum(prop_varex)>=.99))
## [1] 38
new.dat <- data.frame(cbind(as.character(my.dat$pdgCode),pc$scores[,1:38]))
new.dat$V1 <- as.factor(new.dat$V1)
names(new.dat)[1] <- "pdgCode"
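pc$scores only covers the rows the PCA was fitted on; genuinely new observations can be placed in the same 38-component space via predict() on the princomp object (a sketch, where fresh.dat is a hypothetical data frame with the same columns as my.dat[,-1]):

#fresh.dat is hypothetical: new rows with the same predictor columns as my.dat[,-1]
fresh.scores <- predict(pc, newdata = fresh.dat)[,1:38]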

for(i in 2:ncol(new.dat)){
  new.dat[,i] <- as.numeric(as.character(new.dat[,i]))
}
require(randomForest)

rf <- randomForest(x=new.dat[1:100,2:39],y=new.dat$pdgCode[1:100])

pp <- predict(rf,newdata = new.dat)

pp <- data.frame(cbind(as.factor(new.dat$pdgCode),as.factor(pp)))
pp[,3] <- pp[,1]==pp[,2]

#overall accuracy: proportion of rows where the prediction matches the true class
length(which(pp[,3]==TRUE))/nrow(pp)
## [1] 0.7711443
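The overall accuracy hides which classes the forest confuses with one another; a per-class view is one table() call away (a sketch reusing rf and new.dat from above):

#confusion table: rows are true pdgCode, columns are predicted pdgCode
table(actual = new.dat$pdgCode, predicted = predict(rf, newdata = new.dat))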