library(mlbench)
data(Ozone)
# Wind will only be used for Q2
mydata = data.frame("time" = seq(1:nrow(Ozone))/nrow(Ozone), "ozone" = Ozone$V4, "wind" = Ozone$V6)
set.seed(1)
trainid = sample(1:nrow(Ozone), 250)
train = mydata[trainid, ]
test = mydata[-trainid, ]
par(mfrow=c(1,2))
plot(train$time, train$ozone, pch = 19, cex = 0.5)
plot(train$wind, train$ozone, pch = 19, cex = 0.5)
I consider two kernel functions: the Gaussian kernel and the Epanechnikov kernel.
For both kernels I use a bandwidth \(h\), starting from Silverman's rule-of-thumb choice of \(h\) and then tuning \(h\).
I fit the regression with both kernel functions and plot the two fitted curves together in a single figure.
I report the testing MSE of both methods.
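Both fits use the Nadaraya-Watson estimator; for reference, with \(u = (x - x_i)/h\), the estimator and the standard forms of the two kernels are
\[
\widehat f(x) = \frac{\sum_{i=1}^n K\!\left(\frac{x - x_i}{h}\right) y_i}{\sum_{i=1}^n K\!\left(\frac{x - x_i}{h}\right)},
\qquad
K_{\text{Gauss}}(u) = \frac{1}{\sqrt{2\pi}}\, e^{-u^2/2},
\qquad
K_{\text{Epa}}(u) = \frac{3}{4}\,(1 - u^2)\,\mathbf{1}\{|u| \le 1\}.
\]
Multiplicative constants in \(K\) cancel in this ratio, so only the kernel shape and the bandwidth affect the fit.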
# Remove NA data points
train = train[-which(is.na(train$ozone)), ]
test = test[-which(is.na(test$ozone)), ]
# Gaussian and Epanechnikov kernel functions
Gaussian.Kernel <- function(x, y, bw){
  # Gaussian-type weights; the 1/2 factor in the exponent is omitted here,
  # which only rescales the effective bandwidth
  exp(- ((x - y)/bw)^2) / sqrt(2*pi)
}
Epanechnikov.Kernel <- function(x, y, bw){
  # Epanechnikov weights: zero whenever |x - y| exceeds the bandwidth
  u = 1 - ((x - y)/bw)^2
  return(0.75 * u * (u > 0))
}
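# Illustrative check (not part of the original analysis): kernel weights shrink with
# distance, and the Epanechnikov weight is exactly zero beyond one bandwidth
Gaussian.Kernel(0, c(0, 0.05, 0.2), bw = 0.1)
Epanechnikov.Kernel(0, c(0, 0.05, 0.2), bw = 0.1)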
Kernel.pred <- function(x, y, testx, K = "Gaussian", bw){
  # Nadaraya-Watson estimator: each prediction is a weighted average of the training
  # responses, with weights given by the chosen kernel centered at the test point
  pred = rep(NA, length(testx))
  for (i in 1:length(testx)){
    if (K == "Gaussian")
      w = Gaussian.Kernel(testx[i], x, bw)
    if (K == "Epanechnikov")
      w = Epanechnikov.Kernel(testx[i], x, bw)
    pred[i] = sum(y * w) / sum(w)
  }
  return(pred)
}
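# Hypothetical sanity check on synthetic data (not part of the assignment):
# the fitted values should roughly track the noisy linear trend y = x
toy_x = seq(0, 1, by = 0.01)
toy_y = toy_x + rnorm(length(toy_x), sd = 0.05)
round(Kernel.pred(toy_x, toy_y, testx = c(0.25, 0.5, 0.75), bw = 0.1), 2)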
# Bandwidth based on Silverman's rule-of-thumb
h = 1.06*sd(train$time)*nrow(train)^(-1/5)
print(h)
## [1] 0.1015303
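This is the univariate rule-of-thumb \(h = 1.06 \, \widehat\sigma_x \, n^{-1/5}\), where \(\widehat\sigma_x\) is the sample standard deviation of the predictor and \(n\) is the training sample size.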
# Plot of the regression lines
#install.packages("ggplot2")
library(ggplot2)
Gaussian_Pred = Kernel.pred(train$time, train$ozone, testx = test$time, bw = h)
Epanechnikov_Pred = Kernel.pred(train$time, train$ozone, testx = test$time,
                                K = "Epanechnikov", bw = h)
p = ggplot(train, aes(time, ozone)) + geom_point(data = train, pch = 19, col = "blue") +
  geom_point(data = test, pch = 19, col = "darkorange") +
  geom_line(data = test, aes(y = Gaussian_Pred, col = "Gaussian")) +
  geom_line(data = test, aes(y = Epanechnikov_Pred, col = "Epanechnikov")) +
  theme_bw() + theme(legend.position = "right") +
  scale_color_discrete(name = "kernel functions:")
print(p)
# testing MSE of both methods
mse_Gaussian <- mean((Gaussian_Pred - test$ozone)^2)
mse_Epanechnikov <- mean((Epanechnikov_Pred - test$ozone)^2)
print(c(mse_Gaussian, mse_Epanechnikov))
## [1] 29.54065 29.87374
# Large bandwidth (over-smoothing)
h1 = 0.4
# Small bandwidth (under-smoothing)
h2 = 0.05
# Plot
Over_Gaussian_Pred = Kernel.pred(train$time, train$ozone, testx = test$time, bw = h1)
Under_Gaussian_Pred = Kernel.pred(train$time, train$ozone, testx = test$time, bw = h2)
p = ggplot(train, aes(time, ozone)) + geom_point(data = train, pch = 19, col = "blue") +
  geom_point(data = test, pch = 19, col = "darkorange") +
  geom_line(data = test, aes(y = Over_Gaussian_Pred, col = "Over-smoothing")) +
  geom_line(data = test, aes(y = Under_Gaussian_Pred, col = "Under-smoothing")) +
  theme_bw() + theme(legend.position = "right") +
  scale_color_discrete(name = "kernel curves:")
print(p)
When the bandwidth h is too small (under-smoothing), the fitted regression curve shows many wiggly structures, since each prediction depends on only a few nearby points; conversely, when h is large (over-smoothing), the curve becomes very smooth and washes out local features.
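To make the trade-off concrete, the testing MSE under each of the two extreme bandwidths can be computed in the same way as above (values not reported in the original write-up):
# Testing MSE under the over- and under-smoothed fits
mean((Over_Gaussian_Pred - test$ozone)^2)
mean((Under_Gaussian_Pred - test$ozone)^2)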
# 10-fold cross-validation on the training data
nfold = 10
infold = sample(rep(1:nfold, length.out = nrow(train)))
# the set of candidate bandwidth values
all_h = seq(0.1, 1, 0.1)
# save prediction errors of each fold
errorMatrix = matrix(NA, length(all_h), nfold)
# loop over all candidate bandwidths and all folds
for (l in 1:nfold)
{
  for (k in 1:length(all_h))
  {
    # fit on the folds kept in, predict the held-out fold
    Kernel.prediction = Kernel.pred(train$time[infold != l], train$ozone[infold != l],
                                    testx = train$time[infold == l],
                                    K = "Epanechnikov", bw = all_h[k])
    errorMatrix[k, l] = mean((Kernel.prediction - train$ozone[infold == l])^2)
  }
}
# plot the results for each fold
plot(rep(all_h, nfold), as.vector(errorMatrix), pch = 19, cex = 0.5, xlab = "h", ylab = "prediction error")
points(all_h, apply(errorMatrix, 1, mean), col = "red", pch = 19, type = "l", lwd = 3)
# which h is the best? average across all folds
all_h[which.min(apply(errorMatrix, 1, mean))]
## [1] 0.2
# Optimal plot
Optimal_E_Pred = Kernel.pred(train$time, train$ozone, testx = test$time,
K = "Epanechnikov", bw = 0.2)
p = ggplot(train, aes(time, ozone)) + geom_point(data=train, pch=19, col="blue")+
geom_point(data=test, pch=19, col="darkorange") +
geom_line(data= test, aes(y = Optimal_E_Pred, col = "red")) +
theme_bw() + theme(legend.position = "right") +
scale_color_discrete(name = "Optimal regression line:", labels = c("Epanechnikov curve"))
print(p)
The optimal bandwidth selected by cross-validation is h = 0.2.
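For completeness, the testing MSE of the tuned Epanechnikov fit can be computed in the same way as before (value not reported in the original write-up):
# Testing MSE of the Epanechnikov fit with the tuned bandwidth
mean((Optimal_E_Pred - test$ozone)^2)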
I now consider using both time and wind in the regression. I use the following multivariate kernel function, which is essentially a Gaussian kernel with a diagonal covariance matrix:
\[ K_{\boldsymbol \lambda}(x_i, x_j) = e^{-\frac{1}{2} \sum_{k=1}^p \left((x_{ik} - x_{jk})/\lambda_k\right)^2}.\]
Based on Silverman's formula, the bandwidth for the \(k\)th variable is given by
\[\lambda_k = \left(\frac{4}{p+2}\right)^{\frac{1}{p+4}} n^{-\frac{1}{p+4}} \, \widehat \sigma_k,\]
where \(\widehat\sigma_k\) is the estimated standard deviation of variable \(k\), \(p\) is the number of variables, and \(n\) is the sample size. I use the Nadaraya-Watson kernel estimator with this kernel to fit and predict the ozone level.
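As a small illustration with made-up numbers, the weight this kernel assigns to a pair of points is simply the product of the per-variable Gaussian factors:
# Hypothetical two-variable example of the product-kernel weight
xi <- c(0.3, 5); xj <- c(0.35, 6); lambda <- c(0.1, 0.9)
exp(-0.5 * sum(((xi - xj) / lambda)^2))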
n <- nrow(train)
p <- 2
bw.time <- (4 / (p+2))^(1/(p+4)) * n^(-1/(p+4)) * sd(train$time)
bw.wind <- (4 / (p+2))^(1/(p+4)) * n^(-1/(p+4)) * sd(train$wind)
# bandwidth
bw_all <- c(bw.time, bw.wind)
print(bw_all)
## [1] 0.1150768 0.8821737
# one-dimensional Gaussian factor of the product kernel
Multi.Kernel <- function(x, y, bw){
  exp(- ((x - y)/bw)^2 / 2)
}
Multi.kernel.pred <- function(x, y, testx, bw){
  # Nadaraya-Watson estimator with a product Gaussian kernel:
  # each training point's weight is the product of its per-variable kernel values
  pred = rep(NA, nrow(testx))
  p <- length(bw)
  for (i in 1:nrow(testx)){
    w_matrix <- matrix(0, nrow = nrow(x), ncol = p)
    for (j in 1:p){
      w_matrix[, j] = Multi.Kernel(testx[i, j], x[, j], bw[j])
    }
    w <- apply(w_matrix, 1, prod)
    pred[i] = sum(y * w) / sum(w)
  }
  return(pred)
}
Prediction = Multi.kernel.pred(cbind(train$time, train$wind), train$ozone,
testx = cbind(test$time, test$wind), bw = bw_all)
mse_multi_kernel <- mean((Prediction- test$ozone)^2)
print(mse_multi_kernel)
## [1] 32.38576
The testing MSE of the multivariate model (32.39) is higher than that of the two previous univariate models (29.54 and 29.87), so adding wind does not improve prediction on this test set.
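A compact side-by-side view of the three testing MSEs computed above:
# Summary of the three testing MSEs
data.frame(model = c("Gaussian (time only)", "Epanechnikov (time only)",
                     "Multivariate Gaussian (time + wind)"),
           test_MSE = c(mse_Gaussian, mse_Epanechnikov, mse_multi_kernel))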