Introduction to Neural Networks

Data Acquisition and Management

CUNY MSDS DATA 607

Rose Koh

2018/04/09


Background

Load packages

library(MASS)      # Boston housing data
library(caTools)   # sample.split() for the train/test split
library(neuralnet) # neural network fitting
library(ggplot2)   # plotting
library(knitr)     # report rendering

Load data

# Check data
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Check for missing data
any(is.na(Boston))
## [1] FALSE
# copy the dataset to a shorter working name
data <- Boston

Preprocessing

# 1. Normalize the data
# Common options: min-max scaling, z-score standardization, etc. We use min-max scaling here.

# Grab the min and max of each column with apply().
maxs <- apply(data, MARGIN=2, max) # MARGIN=2 applies the function column-wise; see help(apply)
maxs # max value of each column
##     crim       zn    indus     chas      nox       rm      age      dis 
##  88.9762 100.0000  27.7400   1.0000   0.8710   8.7800 100.0000  12.1265 
##      rad      tax  ptratio    black    lstat     medv 
##  24.0000 711.0000  22.0000 396.9000  37.9700  50.0000
mins <- apply(data, MARGIN=2, min)
mins
##      crim        zn     indus      chas       nox        rm       age 
##   0.00632   0.00000   0.46000   0.00000   0.38500   3.56100   2.90000 
##       dis       rad       tax   ptratio     black     lstat      medv 
##   1.12960   1.00000 187.00000  12.60000   0.32000   1.73000   5.00000
# scale() returns a numeric matrix, so we will need to convert it back to a data frame; see help(scale).
scaled.data <- scale(data, center = mins, scale = maxs - mins) # subtract each column's min, then divide by (max - min)
scaled <- as.data.frame(scaled.data) # turn the matrix back into a data frame

head(scaled)
##           crim   zn      indus chas       nox        rm       age
## 1 0.0000000000 0.18 0.06781525    0 0.3148148 0.5775053 0.6416066
## 2 0.0002359225 0.00 0.24230205    0 0.1728395 0.5479977 0.7826982
## 3 0.0002356977 0.00 0.24230205    0 0.1728395 0.6943859 0.5993821
## 4 0.0002927957 0.00 0.06304985    0 0.1502058 0.6585553 0.4418126
## 5 0.0007050701 0.00 0.06304985    0 0.1502058 0.6871048 0.5283213
## 6 0.0002644715 0.00 0.06304985    0 0.1502058 0.5497222 0.5746653
##         dis        rad        tax   ptratio     black      lstat      medv
## 1 0.2692031 0.00000000 0.20801527 0.2872340 1.0000000 0.08967991 0.4222222
## 2 0.3489620 0.04347826 0.10496183 0.5531915 1.0000000 0.20447020 0.3688889
## 3 0.3489620 0.04347826 0.10496183 0.5531915 0.9897373 0.06346578 0.6600000
## 4 0.4485446 0.08695652 0.06679389 0.6489362 0.9942761 0.03338852 0.6311111
## 5 0.4485446 0.08695652 0.06679389 0.6489362 1.0000000 0.09933775 0.6933333
## 6 0.4485446 0.08695652 0.06679389 0.6489362 0.9929901 0.09602649 0.5266667
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
# ^ Normalizing like this matters: neural networks train poorly when inputs are on
# very different scales, so rescale the features (here to [0, 1]) before fitting.
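# A quick sanity check (a sketch, not part of the original run): every column of
# the scaled data should now lie in [0, 1].
apply(scaled, 2, range)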
# 2. Split into train / test sets

#library(caTools)
split <- sample.split(scaled$medv, SplitRatio = 0.7)
train <- subset(scaled, split == TRUE)
test <- subset(scaled, split == FALSE)
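# sample.split() draws randomly, so each run yields a different split. For a
# reproducible split, set a seed first (a sketch; the seed value is arbitrary,
# and the outputs shown below come from an unseeded run):
# set.seed(101)
# split <- sample.split(scaled$medv, SplitRatio = 0.7)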
# 3. Train the model

#install.packages('neuralnet')
#library(neuralnet)

# neuralnet expects an explicit formula of the form y ~ col1 + col2 + ...
# (it does not accept the y ~ . shorthand), so rather than type all 13
# predictors by hand, we build the formula programmatically:
n <- names(train)
n
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
f <- as.formula(paste("medv ~", paste(n[!n %in% "medv"], collapse = " + ")))
f
## medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + 
##     tax + ptratio + black + lstat
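# An equivalent construction using base R's reformulate(), which builds the same
# formula object as f above:
f2 <- reformulate(setdiff(n, "medv"), response = "medv")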
nn <- neuralnet(f, data = train, hidden = c(5,3), linear.output = TRUE) 
# hidden: a vector of integers giving the number of neurons in each hidden layer
# (here a first hidden layer of 5 neurons and a second of 3).
# linear.output: TRUE for regression on a continuous response, as in our case;
# for classification this should be FALSE.
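# For a classification task the call would instead look like this (a sketch, not
# run here, since medv is continuous):
# nn.class <- neuralnet(f, data = train, hidden = c(5, 3), linear.output = FALSE)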
plot(nn)

# The network is largely a black box: it is hard to interpret what each weight
# vector really means in terms of the columns of the data.
# In the plot, the black lines show the connections between layers and the weight of each connection;
# the blue lines show the bias term added at each step.
# The bias can be thought of as roughly the intercept of a linear model.
# We cannot say much about how the weights were fit or what they mean,
# but we can say the training algorithm converged, so the model is ready to be used on the test data.
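# Even so, the fitted weights can be inspected numerically (exact values vary
# with the random initialization): nn$weights is a list of weight matrices, and
# nn$result.matrix summarizes the final error, reached threshold, step count, and all weights.
head(nn$result.matrix)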
# Create predictions with the model.
predicted.nn.values <- compute(nn, test[1:13])
# neuralnet uses compute() instead of predict(): pass the fitted network and the
# test data without the response column (the data has 14 columns; we need only the first 13).
str(predicted.nn.values)
## List of 2
##  $ neurons   :List of 3
##   ..$ : num [1:139, 1:14] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:139] "1" "4" "8" "14" ...
##   .. .. ..$ : chr [1:14] "1" "crim" "zn" "indus" ...
##   ..$ : num [1:139, 1:6] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:139] "1" "4" "8" "14" ...
##   .. .. ..$ : NULL
##   ..$ : num [1:139, 1:4] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:139] "1" "4" "8" "14" ...
##   .. .. ..$ : NULL
##  $ net.result: num [1:139, 1] 0.463 0.674 0.298 0.321 0.287 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:139] "1" "4" "8" "14" ...
##   .. ..$ : NULL
# compute() returns a list of neurons and net.result; what we want is net.result.
# But we scaled the data before training the model,
# so we need to undo that operation to obtain the true predictions!

true.predictions <- predicted.nn.values$net.result * 
        (max(data$medv) - min(data$medv)) + min(data$medv)
# Normalization subtracted the center (min) value and divided by the scale (max - min).
# For true.predictions, we invert this: multiply by (max - min) and add the min back.
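# A reusable sketch of this inverse transform (`unscale` is a local helper defined
# for illustration, not a package function):
unscale <- function(x, col) {
  x * (max(data[[col]]) - min(data[[col]])) + min(data[[col]])
}
# unscale(predicted.nn.values$net.result, "medv") reproduces true.predictions.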

# Un-scale the test responses the same way, then compute the mean squared error.
# Note the parentheses around (max - min): without them the expression would
# multiply by the max alone.
test.r <- (test$medv) * (max(data$medv) - min(data$medv)) + min(data$medv)
MSE.nn <- sum((test.r - true.predictions)^2)/nrow(test)
MSE.nn
## [1] 23.30551677
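# Taking the square root gives the RMSE, which is on the scale of medv (median
# home value in $1000s) and easier to interpret; for the MSE above this is
# roughly 4.8, i.e., a typical prediction error of about $4,800.
sqrt(MSE.nn)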
# We can visualize the error with ggplot2 by plotting the true predictions against the test values.
error.df <- data.frame(test.r, true.predictions)
head(error.df)
##          test.r true.predictions
## 1  21.111111111      25.82120530
## 4  31.555555556      35.35040956
## 8  24.555555556      18.41440947
## 14 17.111111111      19.43677567
## 15 14.666666667      17.92280603
## 21  9.555555556      14.98378910
library(ggplot2)
ggplot(error.df, aes(x = test.r, y = true.predictions)) + geom_point() + stat_smooth()

# A perfect prediction would fall exactly on the straight line y = x...
# Overall, though, our graph doesn't look too bad,
# given that all we did was normalize the data and treat the neural net as a black box.
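# Sketch: overlaying the identity line y = x makes the "perfect prediction"
# benchmark explicit on the same plot:
ggplot(error.df, aes(x = test.r, y = true.predictions)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed")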

# Neural nets really do resemble black boxes: explaining their outputs and how they
# use the features is much harder than explaining the outcome of a simpler model like linear regression.
# Depending on the kind of application, you may want to take this into account.
# If you are doing work or a project where you must interpret the importance of each
# variable, a black-box neural network may not be as good a choice as a simpler model.