Adaline was developed by Bernard Widrow and Ted Hoff at Stanford University in 1960. The algorithm is generally considered an improvement over the Perceptron, and to me it is also the more intuitive of the two. With the Perceptron, the back-and-forth rotation of the weight vector and its eventual convergence still surprise me, whereas Adaline minimizes a convex, differentiable cost function, which makes its behaviour easy to reason about: the divergence of the weights for large learning rates and their convergence for small ones follows directly from the shape of that cost function.
In this document, we will play with two variants: gradient descent and stochastic gradient descent. Other strategies include marrying these two in so-called mini-batch algorithms.
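For reference, here is the cost function that both variants minimize, together with the two update rules, written in my own notation but matching the code below (the bias is absorbed into $w$ by appending a constant 1 to every input vector, as the code does):
$$
J(w) = \frac{1}{2}\sum_{i}\left(y_i - w^\top x_i\right)^2, \qquad
w \leftarrow w + \eta\, X^\top (y - Xw) \quad \text{(batch gradient descent)}, \qquad
w \leftarrow w + \eta\,(y_i - w^\top x_i)\, x_i \quad \text{(stochastic, one point at a time)}.
$$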
We're interested in both the regular iris dataset and its standardized version, in which each feature has its mean subtracted out and is scaled to unit standard deviation.
# load the iris dataset
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# sepal and petal dimensions
X <- iris[1:100, 1:4]
names(X) <- tolower(names(X))
# initialize X_std dataframe
X_std <- iris[1:100, 1:4]
# define standardized data set
for (i in 1:4) {
  X_std[, i] <- (X[, i] - mean(X[, i])) / sd(X[, i])
}
# first few rows of both data frames
head(X)
## sepal.length sepal.width petal.length petal.width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
head(X_std)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -0.5781533 0.837617378 -1.007900 -1.0368872
## 2 -0.8898262 -0.206793318 -1.007900 -1.0368872
## 3 -1.2014991 0.210970961 -1.076887 -1.0368872
## 4 -1.3573356 0.002088821 -0.938913 -1.0368872
## 5 -0.7339897 1.046499517 -1.007900 -1.0368872
## 6 -0.1106439 1.673145934 -0.800939 -0.6830008
# binary classification vector according to species: setosa(-1), versicolor(1)
y <- rep(1, 100)
y[which(iris[, 5] == "setosa")] <- -1
y[which(iris[, 5] == "versicolor")] <- 1
# pre-process non-separable data points (ns stands for non-separable)
# sepal and petal dimensions
Xns <- iris[51:150, 1:4]
names(Xns) <- tolower(names(Xns))
# initialize Xns_std dataframe
Xns_std <- iris[51:150, 1:4]
# define standardized data set
for (i in 1:4) {
  Xns_std[, i] <- (Xns[, i] - mean(Xns[, i])) / sd(Xns[, i])
}
# binary classification vector according to species: versicolor(-1), virginica(1)
yns <- rep(1, 100)
l1 <- sum(iris[, 5] == "versicolor")
l2 <- sum(iris[, 5] == "virginica")
yns[1:l1] <- -1
yns[(l1+1):(l1+l2)] <- 1
dim(Xns)
## [1] 100 4
length(yns)
## [1] 100
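Incidentally, the standardization loops above could also be written with base R's scale() function. As a quick sanity check (X_std_check is just a throwaway name I use for the comparison), the two approaches should agree up to floating-point error:
# equivalent standardization via scale(); X_std_check is a throwaway variable
X_std_check <- as.data.frame(scale(iris[1:100, 1:4]))
# should return TRUE (up to floating-point tolerance)
all.equal(unname(as.matrix(X_std_check)), unname(as.matrix(X_std)))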
Now, let us implement the Adaline gradient descent algorithm. We're interested in the update of the weights, the cost function and the number of errors made per epoch. In this batch version of Adaline, the weights are updated using the entire data set in one go, following the batch update rule given above.
adalineGD <- function(X, y, n_iter=10, eta=0.01) {
  # extend input vectors with a constant 1 and initialize the extended weight vector
  X[, dim(X)[2] + 1] <- 1
  X <- as.matrix(X)
  w <- as.matrix(rep(0, dim(X)[2]))
  # initialize cost and error vectors - one entry per epoch
  cost <- rep(0, n_iter)
  error <- rep(0, n_iter)
  # loop over the number of epochs
  for (i in 1:n_iter) {
    # count the number of wrong predictions before the weight update
    for (j in 1:dim(X)[1]) {
      # compute net input
      z <- sum(w * X[j, ])
      # quantizer
      if (z < 0.0) {
        ypred <- -1
      } else {
        ypred <- 1
      }
      # compare with the actual label and count errors
      if (ypred != y[j]) {
        error[i] <- error[i] + 1
      }
    }
    cost[i] <- sum((y - X %*% w)^2)/2
    # update the weights according to gradient descent
    w <- w + eta * t(X) %*% (y - X %*% w)
  }
  # data frame consisting of cost and error info
  infomatrix <- matrix(rep(0, 3 * n_iter), nrow = n_iter, ncol = 3)
  infomatrix[, 1] <- 1:n_iter
  infomatrix[, 2] <- log(cost)
  infomatrix[, 3] <- error
  infodf <- as.data.frame(infomatrix)
  names(infodf) <- c("epoch", "log(cost)", "error")
  return(infodf)
}
library(ggplot2)
library(reshape2)
# Standardized vs non-standard data: Cost and error function
eta <- 0.0001
n_iter <- 100
result1 <- adalineGD(X, y, n_iter, eta)
label <- rep("non-standard", dim(result1)[1])
result1 <- cbind(label, result1)
result2 <- adalineGD(X_std, y, n_iter, eta)
label <- rep("standard", dim(result2)[1])
result2 <- cbind(label, result2)
df <- rbind(result1, result2)
# long format of data frame
dflong <- melt(df, id.vars=c("epoch", "label"))
head(dflong)
## epoch label variable value
## 1 1 non-standard log(cost) 3.912023
## 2 2 non-standard log(cost) 3.863653
## 3 3 non-standard log(cost) 3.823895
## 4 4 non-standard log(cost) 3.786400
## 5 5 non-standard log(cost) 3.749531
## 6 6 non-standard log(cost) 3.712870
ggplot(dflong, aes(x=epoch, y=value)) +
geom_line(aes(color=label, linetype=label), size = 1) +
facet_grid(variable ~ .) + xlab("Epoch #") + ylab("") +
ggtitle("Cost and error function for a dataset \n and its standardized form: eta = 0.0001")
This makes me curious about the convergence properties for different learning rates, which is what we will look at next, again comparing the non-standardized and standardized data.
library(ggplot2)
library(reshape2)
# set the number of sweeps through the entire data set
n_iter <- 50
# the list of learning rates that interests me
eta <- c("0.00005", "0.0001", "0.00025", "0.0005")
# initialize data frames before looping
# adaline applied before standardizing data
result <- adalineGD(X, y, n_iter, as.numeric(eta[1]))
learnrate <- rep(eta[1], dim(result)[1])
result <- cbind(learnrate, result)
# adaline applied after standardizing data
result_std <- adalineGD(X_std, y, n_iter, as.numeric(eta[1]))
learnrate_std <- rep(eta[1], dim(result_std)[1])
result_std <- cbind(learnrate_std, result_std)
# updating the dataframe by row binding cost and error function for each new learning rate
df <- result
df_std <- result_std
# Standardized vs non-standard data: Cost and error function
for (i in 2:length(eta)) {
  # adaline applied before standardizing data
  result <- adalineGD(X, y, n_iter, as.numeric(eta[i]))
  learnrate <- rep(eta[i], dim(result)[1])
  result <- cbind(learnrate, result)
  # adaline applied after standardizing data
  result_std <- adalineGD(X_std, y, n_iter, as.numeric(eta[i]))
  learnrate_std <- rep(eta[i], dim(result_std)[1])
  result_std <- cbind(learnrate_std, result_std)
  # updating the dataframe by row binding cost and error function for each new learning rate
  df <- rbind(df, result)
  df_std <- rbind(df_std, result_std)
}
# long format of data frame
dflong <- melt(df, id.vars=c("epoch", "learnrate"))
df_stdlong <- melt(df_std, id.vars=c("epoch", "learnrate_std"))
head(dflong)
## epoch learnrate variable value
## 1 1 0.00005 log(cost) 3.912023
## 2 2 0.00005 log(cost) 3.887043
## 3 3 0.00005 log(cost) 3.864910
## 4 4 0.00005 log(cost) 3.844420
## 5 5 0.00005 log(cost) 3.824876
## 6 6 0.00005 log(cost) 3.805880
head(df_stdlong)
## epoch learnrate_std variable value
## 1 1 0.00005 log(cost) 3.912023
## 2 2 0.00005 log(cost) 3.883413
## 3 3 0.00005 log(cost) 3.854852
## 4 4 0.00005 log(cost) 3.826341
## 5 5 0.00005 log(cost) 3.797882
## 6 6 0.00005 log(cost) 3.769476
g <- ggplot(dflong, aes(x=epoch, y=value)) +
geom_line(aes(color = learnrate, linetype = learnrate), size = 1) +
facet_grid(variable ~ ., scales="free") + xlab("Epoch #") + ylab("") +
ggtitle("Cost and error function convergence for \n varying learning rates")
g_std <- ggplot(df_stdlong, aes(x=epoch, y=value)) +
geom_line(aes(color = learnrate_std, linetype = learnrate_std), size = 1) +
facet_grid(variable ~ ., scales="free") + xlab("Epoch #") + ylab("") +
ggtitle("Cost and error function convergence for \n varying learning rates - standardized data")
# print plots
print(g)
print(g_std)
Wow! Those were great plots to look at because they forced me to think about several new things. We can see that the cost function approaches its minimum faster, and the error drops to zero sooner, for the standardized data; with all features on a comparable scale, a single learning rate works well for every weight.
Now, let us look at what happens when we run adalineGD on non-separable data. This is the same dataset we applied the Perceptron to, where we found that the minimum attainable error was 2.
# Standardized vs non-standard data: Cost and error function
eta <- 0.0001
n_iter <- 300
result1 <- adalineGD(Xns, yns, n_iter, eta)
label <- rep("non-standard", dim(result1)[1])
result1 <- cbind(label, result1)
result2 <- adalineGD(Xns_std, yns, n_iter, eta)
label <- rep("standard", dim(result2)[1])
result2 <- cbind(label, result2)
df <- rbind(result1, result2)
# long format of data frame
dflong <- melt(df, id.vars=c("epoch", "label"))
head(dflong)
## epoch label variable value
## 1 1 non-standard log(cost) 3.912023
## 2 2 non-standard log(cost) 3.902632
## 3 3 non-standard log(cost) 3.899033
## 4 4 non-standard log(cost) 3.895763
## 5 5 non-standard log(cost) 3.892514
## 6 6 non-standard log(cost) 3.889270
ggplot(dflong, aes(x=epoch, y=value)) +
geom_line(aes(color=label, linetype=label), size = 1) +
facet_grid(variable ~ ., scales = "free") + xlab("Epoch #") + ylab("") +
ggtitle("Cost and error function for a non-separable dataset \n and its standardized form: eta = 0.0001")
Moral: (i) standardize the data set, and (ii) stop updating the weights once the cost function begins to increase.
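The second point can be turned into a simple early-stopping rule. Below is a minimal sketch of such a variant; adalineGD_earlystop is a name I am introducing here, and it simply breaks out of the epoch loop as soon as the cost rises above its value in the previous epoch.
# early-stopping variant of adalineGD (hypothetical helper, not part of the original code)
adalineGD_earlystop <- function(X, y, n_iter = 10, eta = 0.01) {
  # extend input vectors with a constant 1 and initialize the extended weight vector
  X[, dim(X)[2] + 1] <- 1
  X <- as.matrix(X)
  w <- as.matrix(rep(0, dim(X)[2]))
  cost <- rep(NA_real_, n_iter)
  for (i in 1:n_iter) {
    cost[i] <- sum((y - X %*% w)^2)/2
    # stop as soon as the cost starts increasing again
    if (i > 1 && cost[i] > cost[i - 1]) break
    w <- w + eta * t(X) %*% (y - X %*% w)
  }
  list(weights = w, cost = cost[!is.na(cost)])
}
# e.g. adalineGD_earlystop(Xns, yns, n_iter = 300, eta = 0.0001)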
Now that I am familiar with the Adaline gradient descent algorithm, it is time to implement the Adaline stochastic gradient descent algorithm, i.e. adalineSGD. A few comments:
* I would like to learn how to call C++ code from R.
* I will be implementing this in Python afterwards.
In the stochastic gradient descent method, as in the Perceptron algorithm, the weights are updated after examining each data point. This makes the convergence noisier than in the gradient descent method, but it also makes it easier to escape shallow minima of the cost function. A crucial aspect of the SGD algorithm is to shuffle the data so as to avoid cycles that lead to unwanted behaviour in the update of the weight vector.
adalineSGD <- function(X, y, n_iter=10, eta=0.0001) {
  # we are dealing with extended input and weight vectors
  X[, dim(X)[2] + 1] <- 1
  X <- as.matrix(X)
  w <- as.matrix(rep(0, dim(X)[2]))
  # initialize vectors to keep track of the cost and error per epoch
  cost <- rep(0, n_iter)
  error <- rep(0, n_iter)
  # loop through each epoch
  for (i in 1:n_iter) {
    # loop through the data points in a random order
    for (j in sample(1:dim(X)[1], dim(X)[1], replace = FALSE)) {
      # compute net input
      z <- sum(w * X[j, ])
      # quantizer
      if (z < 0.0) {
        ypred <- -1
      } else {
        ypred <- 1
      }
      # keep track of incorrect predictions
      if (ypred != y[j]) {
        error[i] <- error[i] + 1
      }
      # update the weights after each data point
      w <- w + eta * (y[j] - z) * X[j, ]
    }
    # compute the cost function at the end of the epoch
    cost[i] <- sum((y - X %*% w)^2)/2
  }
  # data frame consisting of cost and error info
  infomatrix <- matrix(rep(0, 3 * n_iter), nrow = n_iter, ncol = 3)
  infomatrix[, 1] <- 1:n_iter
  infomatrix[, 2] <- log(cost)
  infomatrix[, 3] <- error
  infodf <- as.data.frame(infomatrix)
  names(infodf) <- c("epoch", "log(cost)", "error")
  return(infodf)
}
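Note that adalineSGD shuffles the data each epoch, so the numbers below will vary slightly from run to run; if you want reproducible results, set a seed before calling it (the seed value here is arbitrary, just an example):
# optional: fix the random shuffling for reproducibility (arbitrary seed)
set.seed(42)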
# test the adalineSGD algorithm
# Standardized vs non-standard data: Cost and error function
eta <- 0.0001
n_iter <- 100
result1 <- adalineSGD(X, y, n_iter, eta)
label <- rep("non-standard", dim(result1)[1])
result1 <- cbind(label, result1)
result2 <- adalineSGD(X_std, y, n_iter, eta)
label <- rep("standard", dim(result2)[1])
result2 <- cbind(label, result2)
df <- rbind(result1, result2)
# long format of data frame
dflong <- melt(df, id.vars=c("epoch", "label"))
head(dflong)
## epoch label variable value
## 1 1 non-standard log(cost) 3.866236
## 2 2 non-standard log(cost) 3.825341
## 3 3 non-standard log(cost) 3.787215
## 4 4 non-standard log(cost) 3.750625
## 5 5 non-standard log(cost) 3.714274
## 6 6 non-standard log(cost) 3.677990
ggplot(dflong, aes(x=epoch, y=value)) +
geom_line(aes(color=label, linetype=label), size = 1) +
facet_grid(variable ~ .) + xlab("Epoch #") + ylab("") +
ggtitle("Cost and error function for a dataset \n and its standardized form: eta = 0.0001 (Stochastic Gradient Descent)")