library(mlbench)
data(Ozone)
# Wind will only be used for Q2
mydata = data.frame("time" = seq(1:nrow(Ozone))/nrow(Ozone), "ozone" = Ozone$V4, "wind" = Ozone$V6)
set.seed(1)
trainid = sample(1:nrow(Ozone), 250)
train = mydata[trainid, ]
test = mydata[-trainid, ]
par(mfrow=c(1,2))
plot(train$time, train$ozone, pch = 19, cex = 0.5)
plot(train$wind, train$ozone, pch = 19, cex = 0.5)
I consider two kernel functions: the Gaussian kernel and the Epanechnikov kernel.
For both kernels I use a bandwidth \(h\), starting from Silverman's rule-of-thumb choice of \(h\) and then tuning \(h\).
I fit the regression with both kernel functions and plot the two fitted curves together in a single figure.
I report the testing MSE of both methods.
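Both fits use the Nadaraya-Watson estimator; for reference, with \(u = (x - x_i)/h\), the estimator and the standard forms of the two kernels are
\[
\widehat f(x) = \frac{\sum_{i=1}^n K\!\left(\frac{x - x_i}{h}\right) y_i}{\sum_{i=1}^n K\!\left(\frac{x - x_i}{h}\right)},
\qquad
K_{\text{Gauss}}(u) = \frac{1}{\sqrt{2\pi}}\, e^{-u^2/2},
\qquad
K_{\text{Epa}}(u) = \frac{3}{4}\,(1 - u^2)\,\mathbf{1}\{|u| \le 1\}.
\]
Multiplicative constants in \(K\) cancel in this ratio, so only the kernel shape and the bandwidth affect the fit.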
# Remove NA data points
train = train[-which(is.na(train$ozone)), ]
test = test[-which(is.na(test$ozone)), ]
# Gaussian and Epanechnikov kernel functions
Gaussian.Kernel <- function(x, y, bw){
  # Gaussian-type weights; the 1/2 factor in the exponent is omitted here,
  # which only rescales the effective bandwidth
  exp(- ((x - y)/bw)^2) / sqrt(2*pi)
}
Epanechnikov.Kernel <- function(x, y, bw){
  # Epanechnikov weights: zero whenever |x - y| exceeds the bandwidth
  u = 1 - ((x - y)/bw)^2
  return(0.75 * u * (u > 0))
}
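# Illustrative check (not part of the original analysis): kernel weights shrink with
# distance, and the Epanechnikov weight is exactly zero beyond one bandwidth
Gaussian.Kernel(0, c(0, 0.05, 0.2), bw = 0.1)
Epanechnikov.Kernel(0, c(0, 0.05, 0.2), bw = 0.1)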
Kernel.pred <- function(x, y, testx, K = "Gaussian", bw){
  # Nadaraya-Watson estimator: each prediction is a weighted average of the training
  # responses, with weights given by the chosen kernel centered at the test point
  pred = rep(NA, length(testx))
  for (i in 1:length(testx)){
    if (K == "Gaussian")
      w = Gaussian.Kernel(testx[i], x, bw)
    if (K == "Epanechnikov")
      w = Epanechnikov.Kernel(testx[i], x, bw)
    pred[i] = sum(y * w) / sum(w)
  }
  return(pred)
}
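# Hypothetical sanity check on synthetic data (not part of the assignment):
# the fitted values should roughly track the noisy linear trend y = x
toy_x = seq(0, 1, by = 0.01)
toy_y = toy_x + rnorm(length(toy_x), sd = 0.05)
round(Kernel.pred(toy_x, toy_y, testx = c(0.25, 0.5, 0.75), bw = 0.1), 2)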
# Bandwidth based on Silverman's rule-of-thumb
h = 1.06*sd(train$time)*nrow(train)^(-1/5)
print(h)
## [1] 0.1015303
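This is the univariate rule-of-thumb \(h = 1.06 \, \widehat\sigma_x \, n^{-1/5}\), where \(\widehat\sigma_x\) is the sample standard deviation of the predictor and \(n\) is the training sample size.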
# Plot of the regression lines
#install.packages("ggplot2")
library(ggplot2)
Gaussian_Pred = Kernel.pred(train$time, train$ozone, testx = test$time, bw = h)
Epanechnikov_Pred = Kernel.pred(train$time, train$ozone, testx = test$time,
                                K = "Epanechnikov", bw = h)
p = ggplot(train, aes(time, ozone)) + geom_point(data = train, pch = 19, col = "blue") +
  geom_point(data = test, pch = 19, col = "darkorange") +
  geom_line(data = test, aes(y = Gaussian_Pred, col = "Gaussian")) +
  geom_line(data = test, aes(y = Epanechnikov_Pred, col = "Epanechnikov")) +
  theme_bw() + theme(legend.position = "right") +
  scale_color_discrete(name = "kernel functions:")
print(p)
# testing MSE of both methods
mse_Gaussian <- mean((Gaussian_Pred - test$ozone)^2)
mse_Epanechnikov <- mean((Epanechnikov_Pred - test$ozone)^2)
print(c(mse_Gaussian, mse_Epanechnikov))
## [1] 29.54065 29.87374
# Large bandwidth (over-smoothing)
h1 = 0.4
# Small bandwidth (under-smoothing)
h2 = 0.05
# Plot
Over_Gaussian_Pred = Kernel.pred(train$time, train$ozone, testx = test$time, bw = h1)
Under_Gaussian_Pred = Kernel.pred(train$time, train$ozone, testx = test$time, bw = h2)
p = ggplot(train, aes(time, ozone)) + geom_point(data = train, pch = 19, col = "blue") +
  geom_point(data = test, pch = 19, col = "darkorange") +
  geom_line(data = test, aes(y = Over_Gaussian_Pred, col = "Over-smoothing")) +
  geom_line(data = test, aes(y = Under_Gaussian_Pred, col = "Under-smoothing")) +
  theme_bw() + theme(legend.position = "right") +
  scale_color_discrete(name = "kernel curves:")
print(p)
When the bandwidth h is too small (under-smoothing), the fitted regression curve shows many wiggly structures, since each prediction depends on only a few nearby points; conversely, when h is large (over-smoothing), the curve becomes very smooth and washes out local features.
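To make the trade-off concrete, the testing MSE under each of the two extreme bandwidths can be computed in the same way as above (values not reported in the original write-up):
# Testing MSE under the over- and under-smoothed fits
mean((Over_Gaussian_Pred - test$ozone)^2)
mean((Under_Gaussian_Pred - test$ozone)^2)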
# 10-fold cross-validation on the training data
nfold = 10
infold = sample(rep(1:nfold, length.out = nrow(train)))
# the set of candidate bandwidth values
all_h = seq(0.1, 1, 0.1)
# save prediction errors of each fold
errorMatrix = matrix(NA, length(all_h), nfold)
# loop over all candidate bandwidths and all folds
for (l in 1:nfold)
{
  for (k in 1:length(all_h))
  {
    # fit on the folds kept in, predict the held-out fold
    Kernel.prediction = Kernel.pred(train$time[infold != l], train$ozone[infold != l],
                                    testx = train$time[infold == l],
                                    K = "Epanechnikov", bw = all_h[k])
    errorMatrix[k, l] = mean((Kernel.prediction - train$ozone[infold == l])^2)
  }
}
# plot the results for each fold
plot(rep(all_h, nfold), as.vector(errorMatrix), pch = 19, cex = 0.5, xlab = "h", ylab = "prediction error")
points(all_h, apply(errorMatrix, 1, mean), col = "red", pch = 19, type = "l", lwd = 3)
# which h is the best? average across all folds
all_h[which.min(apply(errorMatrix, 1, mean))]
## [1] 0.2
# Optimal plot
Optimal_E_Pred = Kernel.pred(train$time, train$ozone, testx = test$time,
K = "Epanechnikov", bw = 0.2)
p = ggplot(train, aes(time, ozone)) + geom_point(data=train, pch=19, col="blue")+
geom_point(data=test, pch=19, col="darkorange") +
geom_line(data= test, aes(y = Optimal_E_Pred, col = "red")) +
theme_bw() + theme(legend.position = "right") +
scale_color_discrete(name = "Optimal regression line:", labels = c("Epanechnikov curve"))
print(p)
The optimal bandwidth selected by cross-validation is h = 0.2.
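For completeness, the testing MSE of the tuned Epanechnikov fit can be computed in the same way as before (value not reported in the original write-up):
# Testing MSE of the Epanechnikov fit with the tuned bandwidth
mean((Optimal_E_Pred - test$ozone)^2)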
I now consider using both time and wind in the regression. I use the following multivariate kernel function, which is essentially a Gaussian kernel with a diagonal covariance matrix:
\[ K_{\boldsymbol \lambda}(x_i, x_j) = e^{-\frac{1}{2} \sum_{k=1}^p \left((x_{ik} - x_{jk})/\lambda_k\right)^2}.\]
Based on Silverman's formula, the bandwidth for the \(k\)th variable is given by
\[\lambda_k = \left(\frac{4}{p+2}\right)^{\frac{1}{p+4}} n^{-\frac{1}{p+4}} \, \widehat \sigma_k,\]
where \(\widehat\sigma_k\) is the estimated standard deviation of variable \(k\), \(p\) is the number of variables, and \(n\) is the sample size. I use the Nadaraya-Watson kernel estimator with this kernel to fit and predict the ozone level.
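As a small illustration with made-up numbers, the weight this kernel assigns to a pair of points is simply the product of the per-variable Gaussian factors:
# Hypothetical two-variable example of the product-kernel weight
xi <- c(0.3, 5); xj <- c(0.35, 6); lambda <- c(0.1, 0.9)
exp(-0.5 * sum(((xi - xj) / lambda)^2))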
n <- nrow(train)
p <- 2
bw.time <- (4 / (p+2))^(1/(p+4)) * n^(-1/(p+4)) * sd(train$time)
bw.wind <- (4 / (p+2))^(1/(p+4)) * n^(-1/(p+4)) * sd(train$wind)
# bandwidth
bw_all <- c(bw.time, bw.wind)
print(bw_all)
## [1] 0.1150768 0.8821737
# one-dimensional Gaussian factor of the product kernel
Multi.Kernel <- function(x, y, bw){
  exp(- ((x - y)/bw)^2 / 2)
}
Multi.kernel.pred <- function(x, y, testx, bw){
  # Nadaraya-Watson estimator with a product Gaussian kernel:
  # each training point's weight is the product of its per-variable kernel values
  pred = rep(NA, nrow(testx))
  p <- length(bw)
  for (i in 1:nrow(testx)){
    w_matrix <- matrix(0, nrow = nrow(x), ncol = p)
    for (j in 1:p){
      w_matrix[, j] = Multi.Kernel(testx[i, j], x[, j], bw[j])
    }
    w <- apply(w_matrix, 1, prod)
    pred[i] = sum(y * w) / sum(w)
  }
  return(pred)
}
Prediction = Multi.kernel.pred(cbind(train$time, train$wind), train$ozone,
testx = cbind(test$time, test$wind), bw = bw_all)
mse_multi_kernel <- mean((Prediction- test$ozone)^2)
print(mse_multi_kernel)
## [1] 32.38576
The testing MSE of the multivariate model (32.39) is higher than that of the two previous univariate models (29.54 and 29.87), so adding wind does not improve prediction on this test set.
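A compact side-by-side view of the three testing MSEs computed above:
# Summary of the three testing MSEs
data.frame(model = c("Gaussian (time only)", "Epanechnikov (time only)",
                     "Multivariate Gaussian (time + wind)"),
           test_MSE = c(mse_Gaussian, mse_Epanechnikov, mse_multi_kernel))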