Machine learning intro in R: Neural Network

Data and packages
Data processing
Split test/train data
Lets make a neural network function!

Start by writing script, explanation will come later.

Data and packages

#install.packages('ISLR')
library(ISLR)

print(head(College,4))

##                              Private Apps Accept Enroll Top10perc
## Abilene Christian University     Yes 1660   1232    721        23
## Adelphi University               Yes 2186   1924    512        16
## Adrian College                   Yes 1428   1097    336        22
## Agnes Scott College              Yes  417    349    137        60
##                              Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University        52        2885         537     7440
## Adelphi University                  29        2683        1227    12280
## Adrian College                      50        1036          99    11250
## Agnes Scott College                 89         510          63    12960
##                              Room.Board Books Personal PhD Terminal
## Abilene Christian University       3300   450     2200  70       78
## Adelphi University                 6450   750     1500  29       30
## Adrian College                     3750   400     1165  53       66
## Agnes Scott College                5450   450      875  92       97
##                              S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University      18.1          12   7041        60
## Adelphi University                12.2          16  10527        56
## Adrian College                    12.9          30   8735        54
## Agnes Scott College                7.7          37  19016        59

Data processing

Need to normalize data for NN

# Create Vector of Column Max and Min Values
maxs <- apply(College[,2:18], 2, max)
mins <- apply(College[,2:18], 2, min)

# Use scale() and convert the resulting matrix to a data frame
scaled.data <- as.data.frame(scale(College[,2:18],center = mins, scale = maxs - mins))

# Check out results
print(head(scaled.data,2))

##                                    Apps     Accept     Enroll Top10perc
## Abilene Christian University 0.03288693 0.04417701 0.10791254 0.2315789
## Adelphi University           0.04384229 0.07053089 0.07503539 0.1578947
##                              Top25perc F.Undergrad P.Undergrad  Outstate
## Abilene Christian University 0.4725275  0.08716353  0.02454774 0.2634298
## Adelphi University           0.2197802  0.08075165  0.05614839 0.5134298
##                              Room.Board     Books  Personal       PhD
## Abilene Christian University  0.2395965 0.1577540 0.2977099 0.6526316
## Adelphi University            0.7361286 0.2914439 0.1908397 0.2210526
##                                Terminal S.F.Ratio perc.alumni    Expend
## Abilene Christian University 0.71052632 0.4182306      0.1875 0.0726714
## Adelphi University           0.07894737 0.2600536      0.2500 0.1383867
##                              Grad.Rate
## Abilene Christian University 0.4629630
## Adelphi University           0.4259259

# Convert Private column from Yes/No to 1/0
Private = as.numeric(College$Private)-1
# Add new 'Private' to scaled data as new df called 'data'
data = cbind(Private,scaled.data)
dim(data)

## [1] 777  18

Data has 18 features, and 777 inputs.

Could also check for missing values:

# find any missing values
sapply(data,function(x) sum(is.na(x)))

##     Private        Apps      Accept      Enroll   Top10perc   Top25perc 
##           0           0           0           0           0           0 
## F.Undergrad P.Undergrad    Outstate  Room.Board       Books    Personal 
##           0           0           0           0           0           0 
##         PhD    Terminal   S.F.Ratio perc.alumni      Expend   Grad.Rate 
##           0           0           0           0           0           0

Visualize missing data:

# install.packages("Amelia")
library(Amelia)

## Warning: package 'Amelia' was built under R version 3.2.3

## Loading required package: Rcpp

## Warning: package 'Rcpp' was built under R version 3.2.5

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

missmap(data, main = "Missing values vs observed")

# replace if missing like this:
# data$Feature[is.na(data$Feature)] <- mean(data$Feauture,na.rm=T)

Split test/train data

library(caTools)
set.seed(101)

# Create Split (any column is fine)
split = sample.split(data$Private, SplitRatio = 0.70)

# Split based off of split Boolean Vector
train = subset(data, split == TRUE)
test = subset(data, split == FALSE)

as.data.frame(table(test$Private))

##   Var1 Freq
## 1    0   64
## 2    1  169

Lets make a neural network function!

First, we need to make an equation to 'feed' the NN. We want to predict 'Private', so our equation will be Private ~ Apps + Accept +... {other features}

feats <- names(scaled.data)
feats

##  [1] "Apps"        "Accept"      "Enroll"      "Top10perc"   "Top25perc"  
##  [6] "F.Undergrad" "P.Undergrad" "Outstate"    "Room.Board"  "Books"      
## [11] "Personal"    "PhD"         "Terminal"    "S.F.Ratio"   "perc.alumni"
## [16] "Expend"      "Grad.Rate"

# Concatenate strings
f <- paste(feats,collapse=' + ')
f <- paste('Private ~',f)

# Convert to formula
f <- as.formula(f)

f

## Private ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Books + Personal + 
##     PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate

Prediction

#install.packages('neuralnet')
library(neuralnet)

## Warning: package 'neuralnet' was built under R version 3.2.5

hVal <- 10 #  vector of integers specifying the number of hidden neurons (vertices) in each layer.
nn <- neuralnet(f,train,err.fct="ce",hidden=c(hVal,hVal,hVal),linear.output=FALSE)

# Compute Predictions off Test Set
predicted.nn.values <- compute(nn,test[2:18])

# Check out net.result
print(head(predicted.nn.values$net.result))

##                                                 [,1]
## Adrian College                          1.0000000000
## Alfred University                       0.9999999982
## Allegheny College                       1.0000000000
## Allentown Coll. of St. Francis de Sales 1.0000000000
## Alma College                            1.0000000000
## Amherst College                         1.0000000000

Round prediction values to 0/1 using sapply:

predicted.nn.values$net.result <- sapply(predicted.nn.values$net.result,round,digits=0)
print(head(predicted.nn.values$net.result))

## [1] 1 1 1 1 1 1

Compare test data to predicted results and create confusion matrix table:

table(test$Private,predicted.nn.values$net.result)

##    
##       0   1
##   0  57   7
##   1  10 159

Visualize the NN:

plot(nn)