Key reference: MacKay (2003), Information Theory, Inference, and Learning Algorithms, Algorithm 39.5 (page 478)

data(iris)
library(nnet)
x <- as.matrix(iris[,1:4]) # N = 150 flowers, 4 features
t <- class.ind(iris[,5]) # 150 x 3 "one-hot" target matrix (note: this masks base::t)

softmax <- function (x){ # normalize the outputs so they sum to 1
  x.exp <- exp(x);
  return(x.exp/sum(x.exp))
}
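
# A numerically safer variant (a sketch, not used below): subtracting max(x)
# before exponentiating gives the same probabilities but avoids overflow when
# the activations get large.
softmax.stable <- function (x){
  x.exp <- exp(x - max(x));
  return(x.exp/sum(x.exp))
}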

# Each neuron trained separately; works better (the parameters converge quickly)
neuron2 <- function(training.mat, target.vec, max.epoch, learn.rate) {
  out.list <- list();
  x <- training.mat;
  N <- nrow(training.mat);
  S <- ncol(target.vec); # number of classes/neurons (3), taken from the target matrix
  w <- matrix(runif(n = S * ncol(x), min = 1e-4, max = 1e-3), nrow=S, ncol = ncol(x)) # 4 random wts for each species/neuron
  b <- runif(n = S, min = 1e-4, max=1e-3); # 3 random biases
  for (i in 1:max.epoch) {
    for (s in 1:S) { # for each of three neurons
      g <- rep(0,5); # gradient accumulator: (bias, 4 weights)
      for (j in 1:N) { # for each input row
        a <- sum(w[s,] * x[j,]) + b[s]; # Input/activation for each individual flower
        y <- 1/(1+exp(-1*a)); # Output/activity for each neuron
        e <- target.vec[j,s] - y; # no need for softmax
        g <- g - e * c(1,x[j,]); # gradient for (bias, weights), accumulated over rows
      }
      w[s,] <- w[s,] - learn.rate * g[2:5]; # gradient-descent update for the weights
      b[s] <- b[s] - learn.rate * g[1]; # update bias
    }
    out.list[[i]] <- list(epoch = i, bias = b, wts = w);
  }
  out.list; # return the per-epoch trace of biases and weights
}  
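
# A vectorized sketch of the per-neuron update above (an alternative, not
# called anywhere): one matrix product replaces the inner loop over the N rows
# and yields the same gradient step for neuron s.
neuron.step.vec <- function(x, target.mat, w, b, s, learn.rate) {
  a <- drop(x %*% w[s,]) + b[s];          # activations for all N rows at once
  y <- 1/(1+exp(-a));                     # sigmoid outputs
  e <- target.mat[,s] - y;                # errors
  g <- -c(sum(e), colSums(e * x));        # gradient: (bias, 4 weights)
  list(bias = b[s] - learn.rate * g[1], wts = w[s,] - learn.rate * g[2:5])
}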

# plot diagnostics
out <- neuron2(training.mat = x, target.vec = t, max.epoch = 1000, learn.rate = 0.1)
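
# Optional convergence diagnostic (a sketch): training accuracy per epoch,
# recomputed from the stored parameters (class = neuron with the largest
# sigmoid output; softmax is monotone, so it picks the same winner).
acc <- sapply(out, function(o) {
  y.hat <- sapply(1:3, function(s) 1/(1+exp(-(x %*% o$wts[s,] + o$bias[s]))));
  mean(apply(y.hat, 1, which.max) == as.integer(iris$Species))
})
plot(1:1000, acc, type="l", log="x", xlab="epoch", ylab="training accuracy", las=1)
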
# parameter evolution: biases
n1.bias <- unlist(lapply(out, function(x) x$bias[1]))
n2.bias <- unlist(lapply(out, function(x) x$bias[2]))
n3.bias <- unlist(lapply(out, function(x) x$bias[3]))
plot(1:1000, n1.bias, type="l", log="x", xlab="epoch", ylab="biases", las=1, ylim=range(n1.bias, n2.bias, n3.bias))
lines(1:1000, n2.bias, type="l", col=2)
lines(1:1000, n3.bias, type="l", col=3)
legend("topleft", c("b1", "b2", "b3"), lty=1, col=1:3, bty = "n", cex=0.8)

# weights for neuron 1
n1.w1 <- unlist(lapply(out, function(x) x$wts[1,1]))
n1.w2 <- unlist(lapply(out, function(x) x$wts[1,2]))
n1.w3 <- unlist(lapply(out, function(x) x$wts[1,3]))
n1.w4 <- unlist(lapply(out, function(x) x$wts[1,4]))
n1 <- data.frame(epoch=1:1000, w1=n1.w1, w2=n1.w2, w3=n1.w3, w4=n1.w4)
plot(n1$epoch, n1$w1, ylim =range(n1[,2:5]), type="l", log="x", xlab = "epoch", ylab="weights", las=1, main="neuron 1")
lines(n1$epoch, n1$w2,  type="l", col=2)
lines(n1$epoch, n1$w3,  type="l", col=3)
lines(n1$epoch, n1$w4,  type="l", col=4)
legend("topleft", c("w1", "w2", "w3", "w4"), col=1:4, lty = 1, cex = 0.5, bty = "n")

# weights for neuron 2
n2.w1 <- unlist(lapply(out, function(x) x$wts[2,1]))
n2.w2 <- unlist(lapply(out, function(x) x$wts[2,2]))
n2.w3 <- unlist(lapply(out, function(x) x$wts[2,3]))
n2.w4 <- unlist(lapply(out, function(x) x$wts[2,4]))
n2 <- data.frame(epoch=1:1000, w1=n2.w1, w2=n2.w2, w3=n2.w3, w4=n2.w4)
plot(n2$epoch, n2$w1, ylim =range(n2[,2:5]), type="l", log="x", xlab = "epoch", ylab="weights", las=1, main="neuron 2")
lines(n2$epoch, n2$w2,  type="l", col=2)
lines(n2$epoch, n2$w3,  type="l", col=3)
lines(n2$epoch, n2$w4,  type="l", col=4)
legend("topleft", c("w1", "w2", "w3", "w4"), col=1:4, lty = 1, cex = 0.5, bty = "n")

# weights for neuron 3
n3.w1 <- unlist(lapply(out, function(x) x$wts[3,1]))
n3.w2 <- unlist(lapply(out, function(x) x$wts[3,2]))
n3.w3 <- unlist(lapply(out, function(x) x$wts[3,3]))
n3.w4 <- unlist(lapply(out, function(x) x$wts[3,4]))
n3 <- data.frame(epoch=1:1000, w1=n3.w1, w2=n3.w2, w3=n3.w3, w4=n3.w4)
plot(n3$epoch, n3$w1, ylim =range(n3[,2:5]), type="l", log="x", xlab = "epoch", ylab="weights", las=1, main="neuron 3")
lines(n3$epoch, n3$w2,  type="l", col=2)
lines(n3$epoch, n3$w3,  type="l", col=3)
lines(n3$epoch, n3$w4,  type="l", col=4)
legend("topleft", c("w1", "w2", "w3", "w4"), col=1:4, lty = 1, cex = 0.5, bty = "n")

# predictions using softmax
col <- integer(); # predicted class per flower: 1-3, or 4 if rounding is ambiguous
for (i in 1:150) {
#  cat(i, "\t");
  n.1 <- 1/(1+exp(-sum(out[[1000]]$wts[1,] * x[i,]) - out[[1000]]$bias[1]));
  n.2 <- 1/(1+exp(-sum(out[[1000]]$wts[2,] * x[i,]) - out[[1000]]$bias[2]));
  n.3 <- 1/(1+exp(-sum(out[[1000]]$wts[3,] * x[i,]) - out[[1000]]$bias[3]));
  y.raw <- softmax(c(n.1, n.2, n.3));  # normalize to sum of 1
  y <- round(y.raw);
#  cat(y, "\t", round(y), "\n")
  if (y[1] == 0 & y[2] == 0 & y[3] == 1) {col[i] <- 3}
  else if (y[1] == 0 & y[2] == 1 & y[3] == 0) {col[i] <- 2}
  else if (y[1] == 1 & y[2] == 0 & y[3] == 0) {col[i] <- 1}
  else {col[i] <- 4} # ambiguous: rounding did not give a single clear winner
#  cat("\n")
}
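
# Equivalent vectorized predictions (a sketch): assign each flower to the
# neuron with the largest output via which.max; unlike the rounding rule
# above, this never produces the "ambiguous" class 4.
y.final <- sapply(1:3, function(s)
  1/(1+exp(-(x %*% out[[1000]]$wts[s,] + out[[1000]]$bias[s]))));
col.vec <- apply(y.final, 1, which.max); # 1 = setosa, 2 = versicolor, 3 = virginica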

# Visualize accuracy with cmdscale
dist.iris <- dist(iris[,1:4])
cmd.iris <- cmdscale(dist.iris, k=3)
plot(cmd.iris, col = iris[,5], las=1, pch=16)
text(cmd.iris[,1], cmd.iris[,2], col, col=col, pos=4, cex=0.8)

# overall misclassification rate (true species come in blocks of 50 rows)
error <- (length(which(col[1:50]!=1)) + length(which(col[51:100]!=2)) + length(which(col[101:150]!=3)))/150
error
## [1] 0.3466667
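
# Quick confusion table for the rounded predictions
# (rows = predicted class, 4 = ambiguous; columns = true species):
table(predicted = col, true = iris$Species)
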
# To Do: Add weight decay
# To Do: Estimate Bayesian error by MCMC
# To Do: scale with z-values (see the sketch below)
# To Do: Add a hidden layer with two neurons
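
# Sketch for the z-scaling To Do (commented out; standardizing the four
# features puts them on comparable scales before training):
# x.scaled <- scale(iris[, 1:4])
# out.scaled <- neuron2(training.mat = x.scaled, target.vec = t,
#                       max.epoch = 1000, learn.rate = 0.1)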