We'll start by writing the script; the explanation will come later.
#install.packages('ISLR')
library(ISLR)
print(head(College,4))
## Private Apps Accept Enroll Top10perc
## Abilene Christian University Yes 1660 1232 721 23
## Adelphi University Yes 2186 1924 512 16
## Adrian College Yes 1428 1097 336 22
## Agnes Scott College Yes 417 349 137 60
## Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University 52 2885 537 7440
## Adelphi University 29 2683 1227 12280
## Adrian College 50 1036 99 11250
## Agnes Scott College 89 510 63 12960
## Room.Board Books Personal PhD Terminal
## Abilene Christian University 3300 450 2200 70 78
## Adelphi University 6450 750 1500 29 30
## Adrian College 3750 400 1165 53 66
## Agnes Scott College 5450 450 875 92 97
## S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University 18.1 12 7041 60
## Adelphi University 12.2 16 10527 56
## Adrian College 12.9 30 8735 54
## Agnes Scott College 7.7 37 19016 59
We need to normalize the data before feeding it to the neural network.
# Create Vector of Column Max and Min Values
maxs <- apply(College[,2:18], 2, max)
mins <- apply(College[,2:18], 2, min)
# Use scale() and convert the resulting matrix to a data frame
scaled.data <- as.data.frame(scale(College[,2:18],center = mins, scale = maxs - mins))
# Check out results
print(head(scaled.data,2))
## Apps Accept Enroll Top10perc
## Abilene Christian University 0.03288693 0.04417701 0.10791254 0.2315789
## Adelphi University 0.04384229 0.07053089 0.07503539 0.1578947
## Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University 0.4725275 0.08716353 0.02454774 0.2634298
## Adelphi University 0.2197802 0.08075165 0.05614839 0.5134298
## Room.Board Books Personal PhD
## Abilene Christian University 0.2395965 0.1577540 0.2977099 0.6526316
## Adelphi University 0.7361286 0.2914439 0.1908397 0.2210526
## Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 0.71052632 0.4182306 0.1875 0.0726714
## Adelphi University 0.07894737 0.2600536 0.2500 0.1383867
## Grad.Rate
## Abilene Christian University 0.4629630
## Adelphi University 0.4259259
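As a quick sanity check (an extra step, not in the original script), every scaled column should now run from 0 to 1:
# Min and max of each scaled column -- should be 0 and 1 respectively
apply(scaled.data, 2, range)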
# Convert Private column from Yes/No to 1/0
Private = as.numeric(College$Private)-1
# Add new 'Private' to scaled data as new df called 'data'
data = cbind(Private,scaled.data)
dim(data)
## [1] 777 18
The data has 777 observations (rows) and 18 columns: the Private response plus the 17 scaled predictors.
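The recode above relies on the factor levels of College$Private being ordered 'No', 'Yes', so as.numeric() returns 1/2 and subtracting 1 gives No = 0, Yes = 1. A quick cross-tabulation (an extra check, not in the original script) confirms the mapping:
# Rows are the original factor, columns the new 0/1 coding
table(College$Private, data$Private)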
We can also check for missing values:
# find any missing values
sapply(data,function(x) sum(is.na(x)))
## Private Apps Accept Enroll Top10perc Top25perc
## 0 0 0 0 0 0
## F.Undergrad P.Undergrad Outstate Room.Board Books Personal
## 0 0 0 0 0 0
## PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
## 0 0 0 0 0 0
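A more compact equivalent is colSums(is.na(data)), and any(is.na(data)) answers the yes/no question directly:
# TRUE if any value anywhere in the data frame is missing
any(is.na(data))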
Visualize missing data:
# install.packages("Amelia")
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.2.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.2.5
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(data, main = "Missing values vs observed")
# If a column had missing values, we could impute with the column mean, e.g.:
# data$Feature[is.na(data$Feature)] <- mean(data$Feature, na.rm = TRUE)
Now split the data into training and test sets:
library(caTools)
set.seed(101)
# Create Split (any column is fine)
split = sample.split(data$Private, SplitRatio = 0.70)
# Split based off of split Boolean Vector
train = subset(data, split == TRUE)
test = subset(data, split == FALSE)
as.data.frame(table(test$Private))
## Var1 Freq
## 1 0 64
## 2 1 169
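sample.split() stratifies on the vector we pass it, so the proportion of private colleges should be roughly the same in both splits; a quick check (an addition to the original script) confirms this:
# Class balance in the training and test sets
prop.table(table(train$Private))
prop.table(table(test$Private))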
First, we need to build a formula to 'feed' the NN. We want to predict 'Private', so our formula will look like Private ~ Apps + Accept + ... {other features}. Rather than typing out every predictor, we can paste the column names together programmatically:
feats <- names(scaled.data)
feats
## [1] "Apps" "Accept" "Enroll" "Top10perc" "Top25perc"
## [6] "F.Undergrad" "P.Undergrad" "Outstate" "Room.Board" "Books"
## [11] "Personal" "PhD" "Terminal" "S.F.Ratio" "perc.alumni"
## [16] "Expend" "Grad.Rate"
# Concatenate strings
f <- paste(feats,collapse=' + ')
f <- paste('Private ~',f)
# Convert to formula
f <- as.formula(f)
f
## Private ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad +
## P.Undergrad + Outstate + Room.Board + Books + Personal +
## PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
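As an aside, base R's reformulate() builds the same formula in one call; the following is just an alternative to the paste() approach above:
# Equivalent formula built with reformulate()
f <- reformulate(termlabels = feats, response = "Private")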
#install.packages('neuralnet')
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 3.2.5
hVal <- 10 # hidden neurons per layer; hidden = c(hVal, hVal, hVal) below builds three hidden layers of 10 neurons each
nn <- neuralnet(f,train,err.fct="ce",hidden=c(hVal,hVal,hVal),linear.output=FALSE)
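# If training ever stops with a non-convergence warning, neuralnet() also accepts
# stepmax (maximum training steps) and rep (number of repetitions). A possible
# variant, left commented out because it was not part of the original run:
# nn <- neuralnet(f, train, err.fct = "ce", hidden = c(hVal, hVal, hVal),
#                 linear.output = FALSE, stepmax = 1e6, rep = 3)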
# Compute Predictions off Test Set
predicted.nn.values <- compute(nn,test[2:18])
# Check out net.result
print(head(predicted.nn.values$net.result))
## [,1]
## Adrian College 1.0000000000
## Alfred University 0.9999999982
## Allegheny College 1.0000000000
## Allentown Coll. of St. Francis de Sales 1.0000000000
## Alma College 1.0000000000
## Amherst College 1.0000000000
Round prediction values to 0/1 using sapply:
predicted.nn.values$net.result <- sapply(predicted.nn.values$net.result,round,digits=0)
print(head(predicted.nn.values$net.result))
## [1] 1 1 1 1 1 1
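Note that sapply() returns a plain vector and drops the row names; an equivalent thresholding step that keeps the matrix structure (the name predicted.class is just illustrative, not part of the original script) would be:
# Keeps row names; thresholds at 0.5 instead of rounding
# predicted.class <- ifelse(predicted.nn.values$net.result > 0.5, 1, 0)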
Compare the test labels to the predicted results and create a confusion matrix:
table(test$Private,predicted.nn.values$net.result)
##
## 0 1
## 0 57 7
## 1 10 159
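Reading the accuracy off this table: (57 + 159) / 233 ≈ 0.93, so roughly 93% of the test colleges are classified correctly. The same number in code (a small addition to the original script):
# Overall accuracy from the confusion matrix
tab <- table(test$Private, predicted.nn.values$net.result)
sum(diag(tab)) / sum(tab)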
Visualize the NN:
plot(nn)
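The neuralnet package also includes gwplot() for inspecting the generalized weights of a single covariate; as an optional extra (the choice of 'Outstate' here is arbitrary):
# Generalized weights for one predictor
gwplot(nn, selected.covariate = "Outstate")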