data(iris)
x <- as.matrix(iris[,1:4])
y <- ifelse(iris$Species=="setosa",1,0)
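This frames the task as binary classification: setosa versus everything else. Since iris contains 50 observations of each of its three species, y should hold 50 ones and 100 zeros, which a quick check confirms:
table(y)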
After standardizing the features and transposing x so that each observation sits in a column, we initialize the weights matrix for the first hidden layer:
x <- scale(x)
x <- t(x)
w1 <- matrix(
runif(nrow(x) * 56),
nrow = nrow(x),
ncol = 56,
byrow = TRUE
)
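As a quick sanity check, the transposed input should now be 4 x 150 and the layer 1 weights 4 x 56, so that t(w1) %*% x yields one row per hidden neuron and one column per observation:
dim(x)   #4 150: features in rows, observations in columns
dim(w1)  #4 56: one column of weights per hidden neuron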
Forward propagation
#The input to the network is the scaled, transposed data
a0 <- x
#Initialize the layer 1 bias term as 1
b1 <- 1
#Pre-activation output of layer 1
z1 <- (t(w1) %*% a0) + b1
#Tanh activation function applied to the layer 1 pre-activations
g1 <- function(z){
(2/(1+exp(-2*z)))-1
}
a1 <- g1(z=z1)
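As an aside, g1 is just the hyperbolic tangent written out explicitly, so it should agree with base R's built-in tanh(); a quick check on the layer 1 pre-activations:
all.equal(g1(z1), tanh(z1))  #should be TRUE, up to floating point tolerance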
#Weights matrix of layer 2
w2 <- matrix(
runif(28 * nrow(a1)),
nrow = nrow(a1),
ncol = 28,
byrow = TRUE
)
#Output of layer2, pre-nonlinear activation
b2 <- 1
z2 <- (t(w2) %*% a1) + b2
#Parametric Rectified Linear Unit (PReLU) activation function applied to the layer 2 pre-activations
alpha <- 0.05
g2 <- function(z, alpha){
for(i in 1:nrow(z)){
for(j in 1:ncol(z)){
if(z[i,j]>=0){
z[i,j] <- z[i,j]
}else{
z[i,j] <- alpha * z[i,j]
}
}
}
return(z)
}
a2 <- g2(z=z2,alpha)
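The double for loop in g2 spells the rule out element by element. As an aside, a vectorized sketch using ifelse(), which keeps the matrix dimensions of z, gives the same result and is closer to idiomatic R:
g2.vectorized <- function(z, alpha){
ifelse(z >= 0, z, alpha * z)
}
#all.equal(g2.vectorized(z2, alpha), a2) should be TRUE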
#We'll use the Tanh Activation function in layer 3
w3 <- matrix(
runif(nrow(a2) * 4),
nrow = nrow(a2),
ncol = 4,
byrow = TRUE
)
b3 <- 1
z3 <- (t(w3) %*% a2) + b3
a3 <- g1(z3)
#And in layer 4 we'll use SoftPlus
w4 <- matrix(
runif(nrow(a3)),
nrow = nrow(a3),
ncol = 1,
byrow = TRUE
)
b4 <- 1
z4 <- (t(w4) %*% a3) + b4
g3 <- function(z){
log(1 + exp(z))
}
a4 <- g3(z4)
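One caveat about g3: log(1 + exp(z)) overflows to Inf once z gets large (exp(z) is already Inf in double precision around z = 710), even though SoftPlus(z) is approximately z there. That is unlikely to matter for the small pre-activations in this network, but a numerically safer sketch uses the identity log(1 + exp(z)) = max(z, 0) + log(1 + exp(-|z|)):
g3.stable <- function(z){
pmax(z, 0) + log1p(exp(-abs(z)))
}
#g3.stable(1000) returns 1000, whereas g3(1000) returns Inf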
#To map the raw output to a probability between 0 and 1, we apply a "bounded ReLU" that clips values to [0, 1]
y.hat <- as.numeric(a4)
y.hat.prob <- function(y.hat){
y.hat[y.hat<0] <- 0
y.hat[y.hat>1] <- 1
y.hat
}
y.hat <- y.hat.prob(y.hat)
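Since the weights are still at their random initial values, these "probabilities" are not yet informative; thresholding at 0.5 simply shows where the untrained network starts from:
table(predicted = as.integer(y.hat > 0.5), actual = y)
mean(as.integer(y.hat > 0.5) == y)  #accuracy of the untrained forward pass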
As an aside, before we train this network, let’s plot the activation functions used in this neural network:
#Tanh
tanh.plot.x <- seq(-10,10,length.out = 100)
tanh.plot.y <- g1(z=tanh.plot.x)
plot(x=tanh.plot.x,y=tanh.plot.y,type="l",col="red",main="Tanh Activation Function", ylab = "Tanh(z)",xlab="z")
abline(h=0,lty="dashed")
abline(v=0,lty="dashed")

#PReLU
prelu.x <- as.matrix(seq(-10,10,length.out = 100))
prelu.y <- g2(z=prelu.x,alpha)
plot(x=prelu.x,y=prelu.y,type="l",col="red",main="PReLU Activation Function", ylab = "PReLU(z)",xlab="z")
abline(h=0,lty="dashed")
abline(v=0,lty="dashed")

#SoftPlus
softplus.x <- seq(-10,10,length.out = 100)
softplus.y <- g3(softplus.x)
plot(x=softplus.x,y=softplus.y,type="l",col="red",main="SoftPlus Activation Function", ylab = "SoftPlus(z)",xlab="z")
abline(h=0,lty="dashed")
abline(v=0,lty="dashed")

#"Bounded ReLU"
brelu.x <- as.numeric(seq(-2,2,length.out = 1000))
brelu.y <- y.hat.prob(y.hat=brelu.x)
plot(x=brelu.x,y=brelu.y,type="l",col="red",main="Bounded ReLU Transformation", ylab = "p.model(y|x)",xlab="a4: output layer")
abline(h=0,lty="dashed")
abline(v=0,lty="dashed")
abline(v=1,lty="dashed")
abline(h=1,lty="dashed")

After a single forward pass of all the data through the neural network, let’s view the states of the various elements:
par(mfrow=c(1,3))
image(a0,yaxt='n',xaxt='n',main="Input data: 150 observations\nof 4 features")
image(w1,yaxt='n',xaxt='n',main="Initialized weights for layer 1\nwith 56 neurons")
image(as.matrix(b1),yaxt='n',xaxt='n',main="Bias term for layer 1,\ninitialized as b=1")

image(a1,yaxt='n',xaxt='n',main="Input to layer 2 is\nTanh activated output of layer 1")
image(w2,yaxt='n',xaxt='n',main="Layer 2 has 28 neurons \nDimensions of layer 2 weights\nare therefore 56x28",cex.main=0.8)
image(as.matrix(b2),yaxt='n',xaxt='n',main="Bias term for layer 2,\ninitialized as b=1")

image(a2,yaxt='n',xaxt='n',main="Input to layer 3 is\nPReLU activated output of layer 2")
image(w3,yaxt='n',xaxt='n',main="Layer 3 has 4 neurons \nDimensions of layer 3 weights\nare therefore 28x4",cex.main=0.8)
image(a3,yaxt='n',xaxt='n',main="Input to layer 4 is\nTanh activated output of layer 3")

image(w4,yaxt='n',xaxt='n',main="Layer 4 has 1 neuron \nDimensions of layer 4 weights\nare therefore 4x1",cex.main=0.8)
par(mfrow=c(1,1))

image(t(a4),yaxt='n',xaxt='n',main="The Raw Output of the Neural Network")

image(as.matrix(y.hat),yaxt='n',xaxt='n',main="The Raw Output of the Neural Network\nTransformed to probabilities for each observation")

image(as.matrix(y),yaxt='n',xaxt='n',main="The True Class Labels\n(1 if 'setosa', else 0)")

Let’s train this network. First, we cache every quantity computed during the forward pass so the training step can reuse it:
cache.nn <- list(
#parameters for layer 1:
#input
a0 = a0,
#parameters tuned for this layer
w1 = w1,
b1 = b1,
z1 = z1,
#parameters for layer 2:
#input
a1 = a1,
#parameters tuned for this layer
w2 = w2,
b2 = b2,
z2 = z2,
alpha = alpha,
#parameters for layer 3:
#input
a2 = a2,
#parameters tuned for this layer
w3 = w3,
b3 = b3,
z3 = z3,
#parameters for layer 4:
#input
a3 = a3,
#parameters tuned for this layer
w4 = w4,
b4 = b4,
z4 = z4,
#output of neural network
a4 = a4,
#transformed to a probability
y.hat = y.hat
)
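Since list() with = gives every element a name, the cache can be inspected and indexed by name, which a later backpropagation step can conveniently rely on; for example:
str(cache.nn, max.level = 1)
dim(cache.nn$w2)  #56 28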