Set working directory:
setwd("~/Documents/R")
Clear R environment objects:
rm(list=ls())
Read in the JSON dump of Chris’s Python dictionary and convert it to four dataframes:
# require(jsonlite)
#
# dat <- fromJSON("data/text.txt")
#
# zero <- dat[[1]]
# one <- dat[[2]]
# two <- dat[[3]]
# three <- dat[[4]]
Bind the four dataframes into one:
# all.dat <- data.frame(
# rbind(
# zero,
# one,
# two,
# three
# )
# )
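Since fromJSON() already returned the four frames in one list, the same union can be written in a single call (a sketch, assuming all four elements share identical columns):
# all.dat <- do.call(rbind, dat)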
require(dplyr)
# library(tidyr)
#
# #isolate the layer columns
# layers <- data.frame(all.dat[,c(8,12,16,20,24,28)])
#
# #choose an example that is not null
# fix <- layers[6,1]
#
# #fill it with zeros, retaining its shape
# fix[[1]][,] <- 0
#
# #replace any null values with this matrix of zeroes
# nullToZero <- function(x) {
# x[sapply(x, is.null)] <- fix
# return(x)
# }
#
# layers <- as.matrix(layers)
# layers <- nullToZero(layers)
#
# layers <- cbind(all.dat[,1:3],layers)
#
# #unnest each layer individually:
# #TODO: create a function for this (a sketch follows this code block)
#
# #LAYER 0
#
# a <- rep(NA,264)
#
# for(i in layers$layer0){
# add <- as.vector(unlist(i))
# a <- rbind(a,add)
# }
#
# a <- a[-1,]
#
# layer0 <- cbind(all.dat[,c(1:7,9:11,13:15,17:19,21:23,25:27)],a)
#
# #LAYER 1
#
# a <- rep(NA,264)
#
# for(i in layers$layer1){
# add <- as.vector(unlist(i))
# a <- rbind(a,add)
# }
#
# a <- a[-1,]
#
# layer0 <- cbind(layer0,a)
#
# #LAYER 2
#
# a <- rep(NA,264)
#
# for(i in layers$layer2){
# add <- as.vector(unlist(i))
# a <- rbind(a,add)
# }
#
# a <- a[-1,]
#
# layer0 <- cbind(layer0,a)
#
# #LAYER 3
#
# a <- rep(NA,264)
#
# for(i in layers$layer3){
# add <- as.vector(unlist(i))
# a <- rbind(a,add)
# }
#
# a <- a[-1,]
#
# layer0 <- cbind(layer0,a)
#
# #LAYER 4
#
# a <- rep(NA,264)
#
# for(i in layers$layer4){
# add <- as.vector(unlist(i))
# a <- rbind(a,add)
# }
#
# a <- a[-1,]
#
# layer0 <- cbind(layer0,a)
#
# #LAYER 5
#
# a <- rep(NA,264)
#
# for(i in layers$layer5){
# add <- as.vector(unlist(i))
# a <- rbind(a,add)
# }
#
# a <- a[-1,]
#
# layer0 <- cbind(layer0,a)
#
# #make all missing values 0
# layer0[is.na(layer0)] <- 0
#
# my.dat <- layer0[,4:100]
#
# #make the outcome variable categorical
# my.dat$pdgCode <- as.factor(my.dat$pdgCode)
#
# #save the manipulated dataset and clear the R environment
# save(my.dat,file="NNdata.RData")
# rm(list=ls())
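The TODO above asks for a function to replace the six near-identical unnesting blocks; a minimal sketch (unnest_layer is a hypothetical helper, and it assumes each list element flattens to 264 values, as the blocks above do):
# #flatten one list-column of matrices into a numeric matrix, one row per entry
# unnest_layer <- function(layer.col, width = 264){
#   out <- matrix(NA, nrow = 0, ncol = width)
#   for(i in layer.col){
#     out <- rbind(out, as.vector(unlist(i)))
#   }
#   out
# }
#
# #bind the identifier columns, then append each unnested layer in turn
# layer0 <- all.dat[,c(1:7,9:11,13:15,17:19,21:23,25:27)]
# for(nm in paste0("layer", 0:5)){
#   layer0 <- cbind(layer0, unnest_layer(layers[[nm]]))
# }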
Scale the wrangled data and drop the electron and positron events (pdgCode ±11):
#load the manipulated data set
load("NNdata.RData")
#scale numerical predictor variables
my.dat[,-1] <- scale(my.dat[,-1])
my.dat <- my.dat %>%
subset(pdgCode!=-11) %>%
subset(pdgCode!=11)
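Subsetting leaves -11 and 11 behind as empty factor levels; dropping them avoids zero-count classes in the models below (a small optional step):
#drop the now-empty factor levels
my.dat$pdgCode <- droplevels(my.dat$pdgCode)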
require(h2o)
h2o.init(max_mem_size = "28G",nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 50 minutes 21 seconds
## H2O cluster timezone: Africa/Johannesburg
## H2O data parsing timezone: UTC
## H2O cluster version: 3.20.0.2
## H2O cluster version age: 2 months and 28 days
## H2O cluster name: H2O_started_from_R_metamorphica_fuq880
## H2O cluster total nodes: 1
## H2O cluster total memory: 24.68 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.4 (2018-03-15)
h2o.removeAll()
## [1] 0
#h2o.no_progress()
Upload data to H2O cluster, split into training (60%), validation (20%) and test (20%) sets:
dat.hex <- as.h2o(my.dat,"dat.hex")
splitz <- h2o.splitFrame(dat.hex,ratios=c(0.6,0.2),
destination_frames = c("train.hex","valid.hex","test.hex"))
train.hex <- splitz[[1]]
valid.hex <- splitz[[2]]
test.hex <- splitz[[3]]
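h2o.splitFrame assigns rows probabilistically, so a quick check that the fractions landed near 60/20/20 is worthwhile:
h2o.nrow(train.hex)/h2o.nrow(dat.hex)
h2o.nrow(valid.hex)/h2o.nrow(dat.hex)
h2o.nrow(test.hex)/h2o.nrow(dat.hex)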
Build first neural network:
# nn_1 <- h2o.deeplearning(
# x= 2:97,
# y= 1,
# model_id = "nn_1",
# training_frame = train.hex,
# validation_frame = valid.hex,
# hidden = c(200,200),
# nfolds=10,
# standardize = F,
# epochs=100,
# fast_mode = F,
# sparse = T
# )
Build a different architecture:
# nn_2 <- h2o.deeplearning(
# x= 2:97,
# y= 1,
# model_id = "nn_2",
# training_frame = train.hex,
# validation_frame = valid.hex,
# hidden = 500,
# nfolds=10,
# standardize = F,
# fast_mode = F,
# sparse = T,
# epochs=100
# )
Build a third architecture:
# nn_3 <- h2o.deeplearning(
# x= 2:97,
# y= 1,
# model_id = "nn_3",
# training_frame = train.hex,
# validation_frame = valid.hex,
# hidden = c(32,32,32,32,32,32,32,32,32,32),
# nfolds=10,
# standardize = F,
# fast_mode = F,
# sparse = T,
# epochs=100
# )
Compare the three architectures and pick the best performer:
# h2o.mean_per_class_error(nn_1)
# h2o.mean_per_class_error(nn_2)
# h2o.mean_per_class_error(nn_3)
#
# h2o.confusionMatrix(nn_1)
# h2o.confusionMatrix(nn_2)
# h2o.confusionMatrix(nn_3)
#
#
# h2o.confusionMatrix(nn_1,newdata = test.hex)
# h2o.confusionMatrix(nn_2,newdata = test.hex)
# h2o.confusionMatrix(nn_3,newdata = test.hex)
#
# plot(nn_1)
# plot(nn_2)
# plot(nn_3)
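To line the three candidates up side by side, the validation metrics can also be gathered into one small table (a sketch):
# models <- list(nn_1 = nn_1, nn_2 = nn_2, nn_3 = nn_3)
# data.frame(model = names(models),
#            mean_per_class_error = sapply(models, h2o.mean_per_class_error, valid = TRUE))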
# nn_4 <- h2o.deeplearning(
# x= 2:97,
# y= 1,
# model_id = "nn_4",
# training_frame = train.hex,
# validation_frame = valid.hex,
# activation = "TanhWithDropout",
# hidden = c(500,500),
# nfolds=10,
# standardize = F,
# fast_mode = F,
# sparse = T,
# epochs=200,
# balance_classes = T
# )
#
# h2o.performance(nn_4,test.hex)
# plot(nn_4)
# nn_5 <- h2o.deeplearning(
# x= 2:97,
# y= 1,
# model_id = "nn_5",
# training_frame = train.hex,
# validation_frame = valid.hex,
# activation = "RectifierWithDropout",
# hidden = c(500,500),
# nfolds=10,
# standardize = F,
# fast_mode = F,
# sparse = T,
# epochs=200,
# balance_classes = T
# )
#
# h2o.performance(nn_5,test.hex)
# plot(nn_5)
# nn_6 <- h2o.deeplearning(
# x= 2:97,
# y= 1,
# model_id = "nn_6",
# training_frame = train.hex,
# validation_frame = valid.hex,
# activation = "MaxoutWithDropout",
# hidden = c(500,500),
# nfolds=10,
# standardize = F,
# fast_mode = F,
# sparse = T,
# epochs=200,
# balance_classes = T
# )
#
# h2o.performance(nn_6,test.hex)
# plot(nn_6)
# #note: h2o.saveModel treats its second argument as a directory and names
# #the file inside it after the model_id, so these are not really .RData files
# h2o.saveModel(nn_1,"nn1.RData")
# h2o.saveModel(nn_2,"nn2.RData")
# h2o.saveModel(nn_3,"nn3.RData")
# h2o.saveModel(nn_4,"nn4.RData")
# h2o.saveModel(nn_5,"nn5.RData")
# h2o.saveModel(nn_6,"nn6.RData")
# h2o.performance(nn_1,dat.hex)
# h2o.performance(nn_2,dat.hex)
# h2o.performance(nn_3,dat.hex)
# h2o.performance(nn_4,dat.hex)
# h2o.performance(nn_5,dat.hex)
# h2o.performance(nn_6,dat.hex)
# nn_7 <- h2o.deeplearning(
# x= 2:97,
# y= 1,
# model_id = "nn_7",
# training_frame = train.hex,
# validation_frame = valid.hex,
# activation = "RectifierWithDropout",
# hidden = c(500,500,500,500),
# nfolds=10,
# standardize = F,
# fast_mode = F,
# sparse = T,
# epochs=200,
# balance_classes = T,
# l1=1e-06
# )
# h2o.saveModel(nn_7,"nn7.RData")
# h2o.performance(nn_7,dat.hex)
# plot(nn_7)
# #h2o.loadModel expects the full path returned by h2o.saveModel
# nn_1 <- h2o.loadModel("nn_1")
# nn_2 <- h2o.loadModel("nn_2")
# nn_3 <- h2o.loadModel("nn_3")
# nn_4 <- h2o.loadModel("nn_4")
# nn_5 <- h2o.loadModel("nn_5")
# nn_6 <- h2o.loadModel("nn_6")
# nn_7 <- h2o.loadModel("nn_7")
#
# p1 <- as.data.frame(h2o.predict(nn_1,dat.hex))
# p2 <- as.data.frame(h2o.predict(nn_2,dat.hex))
# p3 <- as.data.frame(h2o.predict(nn_3,dat.hex))
# p4 <- as.data.frame(h2o.predict(nn_4,dat.hex))
# p5 <- as.data.frame(h2o.predict(nn_5,dat.hex))
# p6 <- as.data.frame(h2o.predict(nn_6,dat.hex))
# p7 <- as.data.frame(h2o.predict(nn_7,dat.hex))
#
# p1 <- p1[,3]
# p2 <- p2[,3]
# p3 <- p3[,3]
# p4 <- p4[,3]
# p5 <- p5[,3]
# p6 <- p6[,3]
# p7 <- p7[,3]
#
# p <- data.frame(cbind(p1,p2,p3,p4,p5,p6,p7))
#
# p <- scale(p)
#
# my.dat <- data.frame(cbind(my.dat,p))
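The seven blocks above repeat a single pattern; a compact equivalent (a sketch, assuming all seven models are loaded and column 3 holds the class probability of interest):
# models <- list(nn_1, nn_2, nn_3, nn_4, nn_5, nn_6, nn_7)
# p <- sapply(models, function(m) as.data.frame(h2o.predict(m, dat.hex))[,3])
# p <- scale(p)
# my.dat <- data.frame(cbind(my.dat, p))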
#h2o.shutdown(prompt=F)
# h2o.init(max_mem_size = "28G",nthreads = -1)
# h2o.removeAll()
# #h2o.no_progress()
#
# dat.hex <- as.h2o(my.dat,"dat.hex")
#
# splitz <- h2o.splitFrame(dat.hex,ratios=c(0.6,0.2),
# destination_frames = c("train.hex","valid.hex","test.hex"))
#
# train.hex <- splitz[[1]]
# valid.hex <- splitz[[2]]
# test.hex <- splitz[[3]]
# rf1 <- h2o.randomForest(y=1,
# x=2:111,
# training_frame=train.hex,
# validation_frame=valid.hex,
# nfolds=30,
# ntrees=200)
#
# h2o.performance(rf1,dat.hex)
#
# p8 <- as.data.frame(h2o.predict(rf1,dat.hex))
#
# plot(rf1)
#
# p8 <- p8[,3]
# my.dat <- data.frame(my.dat,p8)
#
# h2o.shutdown(prompt = F)
# k <- kmeans(x=my.dat[,-1],centers=10)
# k <- k$cluster
# k <- as.factor(k)
#
# require(dummies)
#
# k <- dummy(k)
#
# my.dat <- data.frame(cbind(my.dat,k))
#
# my.dat$k1 <- as.factor(my.dat$k1)
# my.dat$k2 <- as.factor(my.dat$k2)
# my.dat$k3 <- as.factor(my.dat$k3)
# my.dat$k4 <- as.factor(my.dat$k4)
# my.dat$k5 <- as.factor(my.dat$k5)
# my.dat$k6 <- as.factor(my.dat$k6)
# my.dat$k7 <- as.factor(my.dat$k7)
# my.dat$k8 <- as.factor(my.dat$k8)
# my.dat$k9 <- as.factor(my.dat$k9)
# my.dat$k10 <- as.factor(my.dat$k10)
#save(my.dat,file="mydat.RData")
rm(list=ls())
load("mydat.RData")
Linear Kernel:
require(e1071)
sv <- svm(pdgCode~.,data=my.dat,scale=F,kernel="linear")
svm.p <- data.frame(predict(sv,my.dat))
Polynomial Kernel:
sv <- svm(pdgCode~.,data=my.dat,scale=F,kernel="polynomial")
svm.p <- data.frame(cbind(svm.p,predict(sv,my.dat)))
Radial Kernel:
require(e1071)
sv <- svm(pdgCode~.,data=my.dat,scale=F,kernel="radial")
svm.p <- data.frame(cbind(svm.p,predict(sv,my.dat)))
Sigmoid Kernel:
require(e1071)
sv <- svm(pdgCode~.,data=my.dat,scale=F,kernel="sigmoid")
svm.p <- data.frame(cbind(svm.p,predict(sv,my.dat)))
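Before one-hot encoding the predictions, a quick look at each kernel’s accuracy on the training data (a sketch; the column order follows the cbind calls above):
sapply(svm.p, function(p) mean(p == my.dat$pdgCode))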
require(dummies)
svm.p <- as.data.frame(svm.p)
svm.p <- dummy.data.frame(svm.p)
my.dat <- data.frame(cbind(my.dat,svm.p))
#convert the k-means indicator columns back to numeric for the PCA below
kcols <- paste0("k",1:10)
my.dat[kcols] <- lapply(my.dat[kcols], as.numeric)
pc <- princomp(my.dat[,-1])
biplot(pc)
std_dev <- pc$sdev
pr_var <- std_dev^2
prop_varex <- pr_var/sum(pr_var)
plot(prop_varex, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")
plot(cumsum(prop_varex), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b")
abline(v=min(which(cumsum(prop_varex)>=.99)),col="red")
min(which(cumsum(prop_varex)>=.99))
## [1] 38
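As a cross-check, the cumulative proportion at component 38 should just clear the 0.99 threshold:
cumsum(prop_varex)[38]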
#cbind() with a character column coerces everything to character, so the
#principal component scores must be converted back to numeric column by column
new.dat <- data.frame(cbind(as.character(my.dat$pdgCode),pc$scores[,1:38]))
new.dat$V1 <- as.factor(new.dat$V1)
names(new.dat)[1] <- "pdgCode"
for(i in 2:ncol(new.dat)){
new.dat[,i] <- as.numeric(as.character(new.dat[,i]))
}
require(randomForest)
#fit a random forest on the first 100 rows only, then predict on the full set
rf <- randomForest(x=new.dat[1:100,2:39],y=new.dat$pdgCode[1:100])
pp <- predict(rf,newdata = new.dat)
#cbind() coerces both factors to their integer codes, which preserves equality
pp <- data.frame(cbind(as.factor(new.dat$pdgCode),as.factor(pp)))
pp[,3] <- pp[,1]==pp[,2]
length(which(pp[,3]==T))/nrow(pp)
## [1] 0.7711443
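The figure above includes the 100 rows the forest was trained on, so it is optimistic; accuracy on the remaining unseen rows is the more honest check (a sketch):
heldout <- new.dat[-(1:100),]
mean(predict(rf, newdata = heldout) == heldout$pdgCode)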