load necessary packages

library(neuralnet)
library(nnet)
library(NeuralNetTools)
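
If these packages are not installed yet, they can be obtained from CRAN first; plyr is also needed later for the cross-validation progress bar (a one-time setup step, only required if the library() calls fail):
# one-time installation from CRAN
install.packages(c("neuralnet", "nnet", "NeuralNetTools", "plyr"))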

step 1: collecting data

The data used here are the abalone measurements: 4,177 observations of 9 variables (sex, seven length and weight measurements, and the ring count), commonly distributed via the UCI Machine Learning Repository and saved locally as abalone.csv.

step 2: exploring and preparing the data

read in data and examine structure
abalone <- read.csv("abalone.csv")
str(abalone)
## 'data.frame':    4177 obs. of  9 variables:
##  $ Sex            : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 1 1 3 1 ...
##  $ Length         : num  0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
##  $ Diameter       : num  0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
##  $ Height         : num  0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
##  $ Whole.weight.  : num  0.514 0.226 0.677 0.516 0.205 ...
##  $ Shucked.weight : num  0.2245 0.0995 0.2565 0.2155 0.0895 ...
##  $ Viscera.weight.: num  0.101 0.0485 0.1415 0.114 0.0395 ...
##  $ Shell.weight   : num  0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
##  $ Rings.         : int  15 7 9 10 7 8 20 16 9 19 ...
Encode Sex as one-hot indicator columns (the multilabel targets)
train.aba <- cbind(abalone[, 2:9], class.ind(as.factor(abalone$Sex)))
Set the column names
names(train.aba) <- c(names(abalone)[2:9],"Female","Infant","Male")
Scale the predictors to the [0, 1] range
scl <- function(x){ (x - min(x))/(max(x) - min(x)) }
train.aba[, 1:8] <- data.frame(lapply(train.aba[, 1:8], scl))
head(train.aba)
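
A quick sanity check (an optional, minimal sketch) confirms that the eight scaled predictors now lie in [0, 1] and that every row has exactly one of the three indicator columns set to 1:
# each scaled predictor should range from 0 to 1
sapply(train.aba[, 1:8], range)
# every row should have exactly one indicator (Female/Infant/Male) equal to 1
table(rowSums(train.aba[, 9:11]))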

step 3: training a model on the data

n <- names(train.aba)
f <- as.formula(paste("Female + Infant + Male ~", paste(n[!n %in% c("Female","Infant","Male")], collapse = " + ")))
f
## Female + Infant + Male ~ Length + Diameter + Height + Whole.weight. + 
##     Shucked.weight + Viscera.weight. + Shell.weight + Rings.
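
Because neuralnet initializes its weights randomly, repeated runs give slightly different errors and step counts; setting a seed right before training (an optional one-liner, the seed value is arbitrary) makes a single run reproducible:
# fix the random weight initialization so the fit below can be reproduced
set.seed(123)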
nn <- neuralnet(f,
                data = train.aba,
                hidden = c(4,2),
                act.fct = "logistic",
                linear.output = FALSE,
                lifesign = "minimal", threshold = 0.1)
## hidden: 4, 2    thresh: 0.1    rep: 1/1    steps:   66850    error: 1045.98411   time: 5.85 mins
summary(nn)
##                     Length Class      Mode    
## call                    8  -none-     call    
## response            12531  -none-     numeric 
## covariate           33416  -none-     numeric 
## model.list              2  -none-     list    
## err.fct                 1  -none-     function
## act.fct                 1  -none-     function
## linear.output           1  -none-     logical 
## data                   11  data.frame list    
## net.result              1  -none-     list    
## weights                 1  -none-     list    
## startweights            1  -none-     list    
## generalized.weights     1  -none-     list    
## result.matrix          58  -none-     numeric
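
The fitted object also stores the final error, the reached threshold and the number of training steps in result.matrix; printing the first rows is a compact way to check a run (row names and ordering may differ slightly between neuralnet versions, so treat this as a sketch):
# overall error, reached threshold and step count for the single repetition
nn$result.matrix[1:3, ]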

step 4: evaluating model performance

visualize the network topology
plot(nn)
plotnet
par(mar = numeric(4), family = 'serif')
plotnet(nn, alpha = 0.6)
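
NeuralNetTools can also rank the inputs by their connection weights. One option is Olden's method, which handles multiple hidden layers; the out_var argument below assumes the installed NeuralNetTools version supports picking one of several response nodes (see ?olden), so treat this as a sketch rather than a guaranteed call:
# variable importance for the "Male" output node via Olden's connection-weight method
# (repeat with out_var = "Female" or "Infant" for the other responses; out_var is an assumption here)
olden(nn, out_var = "Male")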

Compute predictions
predicted.nn <- compute(nn, train.aba[, 1:8])
Extract results
result.predicted.nn <- predicted.nn$net.result
head(result.predicted.nn)
##              [,1]          [,2]         [,3]
## [1,] 0.3841716709 0.09724998684 0.5195035384
## [2,] 0.1240357442 0.66113806676 0.2589215381
## [3,] 0.2767986006 0.30819680837 0.3156220456
## [4,] 0.3396939275 0.15194794386 0.4488988752
## [5,] 0.1001855567 0.72977921580 0.2524183415
## [6,] 0.1587387678 0.62126107951 0.2257799228
Accuracy (training set)
original_values <- max.col(train.aba[, 9:11])
result.predicted.nn_2 <- max.col(result.predicted.nn)
mean(result.predicted.nn_2 == original_values)
## [1] 0.5846301173
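
The single accuracy figure hides where the mistakes are made; a quick cross-tabulation of predicted against actual class (plain base R) shows which of the three categories get confused with each other:
# rows = predicted class, columns = actual class (1 = Female, 2 = Infant, 3 = Male)
table(predicted = result.predicted.nn_2, actual = original_values)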

step 5: improving model performance

Cross-validate
set.seed(10)
k <- 10                 # number of random train/test resamples
outs <- NULL            # will hold the test-set accuracy of each resample
proportion <- 0.995     # fraction of rows used for training in each resample

library(plyr) 
pbar <- create_progress_bar('text')
pbar$init(k)
##   |                                                                 |   0%
for(i in 1:k)
{
  index <- sample(1:nrow(train.aba), round(proportion*nrow(train.aba)))
  cross.train <- train.aba[index, ]
  cross.test  <- train.aba[-index, ]
  cross.nn    <- neuralnet(f,
                     data = cross.train,
                     hidden = c(4,2),
                     act.fct = "logistic",
                     linear.output = FALSE, threshold = 0.1)
  

  # Compute predictions
  predicted.nn <- compute(cross.nn, cross.test[, 1:8])
  
  # Extract results
  result.predicted.nn <- predicted.nn$net.result
  
  # Accuracy (test set)
  original_values <- max.col(cross.test[, 9:11])
  result.predicted.nn_2 <- max.col(result.predicted.nn)
  outs[i] <- mean(result.predicted.nn_2 == original_values)
  pbar$step()
}
##   |=================================================================| 100%
Average accuracy
mean(outs)
## [1] 0.5857142857
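
Because each test set here is tiny (about 0.5% of the data, roughly 21 rows), the individual resample accuracies can vary quite a bit; looking at their spread alongside the mean gives a better sense of the estimate (a small optional check):
# spread of the k = 10 test-set accuracies
summary(outs)
sd(outs)
boxplot(outs, main = "Cross-validated accuracy", ylab = "accuracy")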

Appendix

#### R code ####
# load necessary packages
library(neuralnet)
library(nnet)
library(NeuralNetTools)
##### step 1: collecting data
##### step 2: exploring and preparing the data
# read in data and examine structure
abalone <- read.csv("abalone.csv")
str(abalone)

# Encode Sex as one-hot indicator columns (the multilabel targets)
train.aba <- cbind(abalone[, 2:9], class.ind(as.factor(abalone$Sex)))

# Set the column names
names(train.aba) <- c(names(abalone)[2:9],"Female","Infant","Male")

# Scale the predictors to the [0, 1] range
scl <- function(x){ (x - min(x))/(max(x) - min(x)) }
train.aba[, 1:8] <- data.frame(lapply(train.aba[, 1:8], scl))
head(train.aba)

##### step 3: training a model on the data
n <- names(train.aba)
f <- as.formula(paste("Female + Infant + Male ~", paste(n[!n %in% c("Female","Infant","Male")], collapse = " + ")))
f

nn <- neuralnet(f,
                data = train.aba,
                hidden = c(4,2),
                act.fct = "logistic",
                linear.output = FALSE,
                lifesign = "minimal", threshold = 0.1)

summary(nn)
##### step 4: evaluating model performance
# visualize the network topology
plot(nn)

# plotnet
par(mar = numeric(4), family = 'serif')
plotnet(nn, alpha = 0.6)

# Compute predictions
predicted.nn <- compute(nn, train.aba[, 1:8])

# Extract results
result.predicted.nn <- predicted.nn$net.result
head(result.predicted.nn)

# Accuracy (training set)
original_values <- max.col(train.aba[, 9:11])
result.predicted.nn_2 <- max.col(result.predicted.nn)
mean(result.predicted.nn_2 == original_values)


##### step 5: improving model performance
# Cross-validate
set.seed(10)
k <- 10                 # number of random train/test resamples
outs <- NULL            # will hold the test-set accuracy of each resample
proportion <- 0.995     # fraction of rows used for training in each resample

library(plyr) 
pbar <- create_progress_bar('text')
pbar$init(k)

for(i in 1:k)
{
  index <- sample(1:nrow(train.aba), round(proportion*nrow(train.aba)))
  cross.train <- train.aba[index, ]
  cross.test  <- train.aba[-index, ]
  cross.nn    <- neuralnet(f,
                     data = cross.train,
                     hidden = c(4,2),
                     act.fct = "logistic",
                     linear.output = FALSE, threshold = 0.1)
  

  # Compute predictions
  predicted.nn <- compute(cross.nn, cross.test[, 1:8])
  
  # Extract results
  result.predicted.nn <- predicted.nn$net.result
  
  # Accuracy (test set)
  original_values <- max.col(cross.test[, 9:11])
  result.predicted.nn_2 <- max.col(result.predicted.nn)
  outs[i] <- mean(result.predicted.nn_2 == original_values)
  pbar$step()
}

# Average accuracy
mean(outs)

#### R code ####