Problem Set 1

SETUP

library(tidyverse)
library(extrafont)
library(plotly)

Excercice 1: Kernels

1. Write down the formulas for the following kernels:

Uniform: \(K(u) = \frac{1}{2}\)
Epanechnikov: \(K(u) = \frac{3}{4}(1-u^2)\) with support \((-1, 1)\)
Gaussian: \(K(u) = \frac{1}{\sqrt{2 \pi}}\exp(- \frac{1}{2} u^2)\)
Triangular \(K(u) = 1 - |u|\)

2. Implement these kernels in R and plot their graphs

Setup: here we define the data basis (i.e. the x-values) over which we calculate the kernels as well as the different kernels

## CALCULATE KERNELS OVER AN INTERVAL FROM -3 TO 3 IN 0.1 INCREMENTS
data <- tibble(u = (-300:300)/100) %>% 
  mutate(uniform = 1/2,
         epanechnikov = 0.75 * (1-u^2),
         gaussian = (1/(sqrt(2 * pi))) * exp(-0.5 * u^2),
         triangular = (1 - abs(u))
  )
## SHOW DATA
data

## # A tibble: 601 x 5
##        u uniform epanechnikov gaussian triangular
##    <dbl>   <dbl>        <dbl>    <dbl>      <dbl>
##  1 -3        0.5        -6     0.00443      -2   
##  2 -2.99     0.5        -5.96  0.00457      -1.99
##  3 -2.98     0.5        -5.91  0.00470      -1.98
##  4 -2.97     0.5        -5.87  0.00485      -1.97
##  5 -2.96     0.5        -5.82  0.00499      -1.96
##  6 -2.95     0.5        -5.78  0.00514      -1.95
##  7 -2.94     0.5        -5.73  0.00530      -1.94
##  8 -2.93     0.5        -5.69  0.00545      -1.93
##  9 -2.92     0.5        -5.64  0.00562      -1.92
## 10 -2.91     0.5        -5.60  0.00578      -1.91
## # ... with 591 more rows

Visualisation

## PLOT THE KERNELS
data %>% 
  pivot_longer(names_to = "kernel",
               values_to = "values",
               -u) %>% 
  ggplot(mapping = aes(x = u, y = values, colour = kernel)) +
  geom_line(size = 0.8) +
  theme_minimal() +
  theme(text = element_text(family = "CMU Serif"),
        legend.position = "bottom") +
  lims(y = c(-0, 1.1))

3. What is the order of a kernel? Calculate the order for kernels above.

Define: \[ \kappa_\lambda = \int v^\lambda K(v) dv \qquad \text{and} \qquad \bar{\kappa}_\lambda = \int v^\lambda K(v)^2dv \] Then, a kernel \(K\) is of order \(r\) if \[\begin{aligned} \kappa_0 &= 1 \\ \kappa_\lambda &= 0 \qquad \text{for} \qquad 1 \leq \lambda \leq \lambda- 1 \\ \infty > \kappa_\lambda &\neq 0 \qquad \text{for} \qquad \lambda = r \end{aligned}\]

The Epanechnikov kernel (\(K(u) = \frac{3}{4}(1-u^2)\)) has order \(r = 2\) \[\begin{aligned} \kappa_0 &= \frac{3}{4} \int_{-1}^1 (1-u^2) = \frac{3}{4}\left[u - \frac{1}{3}u^3\right]_{-1}^1 = \frac{3}{4}\left[ \left(1 - \frac{1}{3} \right) - \left(-1 + \frac{1}{3} \right) \right] = \frac{3}{4}\frac{4}{3} = 1\\ \kappa_1 &= \frac{3}{4} \int_{-1}^1 u(1-u^2) = \frac{3}{4}\left[\frac{1}{2}u^2 - \frac{1}{4}u^4\right]_{-1}^1 = \frac{3}{4}\left[ \left(\frac{1}{2} - \frac{1}{4} \right) - \left(\frac{1}{2} - \frac{1}{4} \right) \right] = 0\\ \kappa_2 &= \frac{3}{4} \int_{-1}^1 u^2(1-u^2) = \frac{3}{4}\left[\frac{1}{3}u^3 - \frac{1}{5}u^5\right]_{-1}^1 = \frac{3}{4}\left[ \left(\frac{1}{3} - \frac{1}{5} \right) - \left(-\frac{1}{3} + \frac{1}{5} \right) \right] = \frac{3}{4}\frac{4}{15} = \frac{1}{5} \end{aligned}\]

The Gaussian kernel (\(K(u) = \frac{1}{\sqrt{2 \pi}}\exp\left(- \frac{1}{2} u^2\right)\)) has order \(r=2\) \[\begin{aligned} \kappa_0 &= \int \frac{1}{\sqrt{2 \pi}}\exp\left(- \frac{1}{2} u^2\right) = 1 \\ \kappa_1 &= \frac{1}{\sqrt{2 \pi}} \int u \exp\left(- \frac{1}{2} u^2\right) = \left[ -\exp\left(-\frac{1}{2} u^2\right) \right]_{-\infty}^\infty = 0 \\ \kappa_2 &= \frac{1}{\sqrt{2 \pi}} \int u^2 \exp\left(- \frac{1}{2} u^2\right) \end{aligned}\]

Excercise 2: One-dimensional kernel smoothers

For this exercise, use the mars.dta data set. It contains data on temperature-pressure profile of the Martian atmosphere, as measured by the Mars Global Surveyor spacecraft in 2003 using a radio occultation technique.

1. Plot the radius and temperature variables on a scatter plot. Note the apparent nonlinearity of the relationship.

## READ IN DATA
data_mars <- read_delim(file = "../data/mars.dat",delim = " ")

## SCATTERPLOT OF RADIUS AND TEMPERATURE
data_mars %>% 
  ggplot(mapping = aes(x = radius, y = temperature)) +
  geom_point() +
  theme_minimal() +
  theme(text = element_text(family = "CMU Serif"),
        legend.position = "bottom")

2. Write down formulas for the following non-parametric kernel smoothers

Nadaraya–Watson estimator: \(\hat{m}(x_0; h) = \frac{\sum_{i=1}^n Y_i K \left(\frac{X_i-x_0}{h}\right)}{\sum_{i=1}^n K \left(\frac{X_i-x_0}{h}\right)}\)
Local linear estimator: \(\hat{m}(x_0; h) = \frac{\sum_{i=1}^n Y_i K_i^*}{\sum_{i=1}^n K_i^*}\) with \(K_i^* = [Q_2 - Q_1 (X_i - x_0)] K \left(\frac{X_i-x_0}{h}\right)\) and \(Q_l(x_0) = \sum_{i=1}^n K \left(\frac{X_i-x_0}{h}\right) (X_i - x_0\)
Local polynomial estimator

3. Implement estimators from above in R and apply them to estimate the regression of Mars temperature on altitude (radius). Try and compare different kernels.

Define functions that calculate \(u\) and \(K(u)\) for different kernels \(K\)

#' Calculate kernel weight u which will be used in `K(u)`
#'
#' @param x_0 the x_i of interest
#' @param X a vector of all x_i
#' @param h bandwith parameter
kernel_weight <- function(x_0, X, h){
  (X - x_0) / h
}

#' Epanechnikov kernel (the `K(u)` function)
#' @param u a vector of weights u, calculated by `kernel_weight()`
kernel_epan <- function(u){
  ret <- 0.75 * (1-(u^2))
  ret[which(u < -1 | u > 1)] <- 0
  ret
}

#' Uniform kernel (the `K(u)` function)
#' @param u a vector of weights u, calculated by `kernel_weight()`
kernel_uniform <- function(u, h ){
  abs(u) <= h
}

#' Triangular kernel (the `K(u)` function)
#' @param u a vector of weights u, calculated by `kernel_weight()`
kernel_trian <- function(u){
  ret <- (1 - abs(u))
  ret[which(u < -1 | u > 1)] <- 0
  ret
}

#' Gaussian kernel (the `K(u)` function)
#' @param u a vector of weights u, calculated by `kernel_weight()`
kernel_gauss <- function(u){
  (1/(sqrt(2 * pi))) * exp(-0.5 * u^2)
}

Define Nadaraya-Watson estimator function

#' Implementation of the Nadaraya-Watson estimator
#'
#' @param x_0 The x_i of interest
#' @param X The vector of all x_i
#' @param h Bandwith parameter
#' @param Y The vector of all y_i
#' @param kernel String indicating which kernel to use
mhat_NW <- function(x_0, Y, X, h, kernel){
  
  u <- kernel_weight(x_0, X, h = h)
  
  if(kernel == "epan"){
    K <- kernel_epan(u)
  } else if (kernel == "uniform"){
    K <- kernel_uniform(u, h)
  } else if (kernel == "triangular"){
    K <- kernel_trian(u)
  } else if (kernel == "gaussian"){
    K <- kernel_gauss(u)
  }
  sum(Y * K) / sum(K)
}

Calculate the kernels and show them on a plot

## BANDWIDTH FOR THIS CHUNK
bandwidth <- 0.5

## CALCULATE SMOOTH ESTIMATES
data_mars <- data_mars %>% 
  mutate(smooth_NW_epan = map_dbl(.x = radius,
                                  .f = mhat_NW,
                                  Y = temperature,
                                  X = radius,
                                  h = bandwidth,
                                  kernel = "epan"),
         smooth_NW_unif = map_dbl(.x = radius,
                                  .f = mhat_NW,
                                  Y = temperature,
                                  X = radius,
                                  h = bandwidth,
                                  kernel = "uniform"),
         smooth_NW_trian = map_dbl(.x = radius,
                                   .f = mhat_NW,
                                   Y = temperature,
                                   X = radius,
                                   h = bandwidth,
                                   kernel = "triangular"),
         smooth_NW_gauss = map_dbl(.x = radius,
                                   .f = mhat_NW,
                                   Y = temperature,
                                   X = radius,
                                   h = bandwidth,
                                   kernel = "gaussian"))

## CREATE PLOT
smooth_plot <- data_mars %>% 
  select(-sigmatemperature) %>% 
  pivot_longer(names_to = "kernel",
               values_to = "value",
               -c("radius", "temperature")) %>% 
  ggplot(mapping = aes(x = radius, y = value, color = kernel)) +
  geom_line() +
  geom_point(mapping = aes(y = temperature), color = "black") +
  theme_minimal() +
  theme(text = element_text(family = "CMU Serif"),
        legend.position = "bottom") +
  labs(title = "Bandwidth 0.5")

## SHOW INTERACTIVE PLOT
ggplotly(smooth_plot)

## BANDWIDTH FOR THIS CHUNK
bandwidth <- 1

## CALCULATE SMOOTH ESTIMATES
data_mars <- data_mars %>% 
  mutate(smooth_NW_epan = map_dbl(.x = radius,
                                  .f = mhat_NW,
                                  Y = temperature,
                                  X = radius,
                                  h = bandwidth,
                                  kernel = "epan"),
         smooth_NW_unif = map_dbl(.x = radius,
                                  .f = mhat_NW,
                                  Y = temperature,
                                  X = radius,
                                  h = bandwidth,
                                  kernel = "uniform"),
         smooth_NW_trian = map_dbl(.x = radius,
                                   .f = mhat_NW,
                                   Y = temperature,
                                   X = radius,
                                   h = bandwidth,
                                   kernel = "triangular"),
         smooth_NW_gauss = map_dbl(.x = radius,
                                   .f = mhat_NW,
                                   Y = temperature,
                                   X = radius,
                                   h = bandwidth,
                                   kernel = "gaussian"))

## CREATE PLOT
smooth_plot_2 <-  data_mars %>% 
  select(-sigmatemperature) %>% 
  pivot_longer(names_to = "kernel",
               values_to = "value",
               -c("radius", "temperature")) %>% 
  ggplot(mapping = aes(x = radius, y = value, color = kernel)) +
  geom_line() +
  geom_point(mapping = aes(y = temperature), color = "black") +
  theme_minimal() +
  theme(text = element_text(family = "CMU Serif"),
        legend.position = "bottom") +
  labs(title = "Bandwidth 1")

## SHOW INTERACTIVE PLOT
ggplotly(smooth_plot_2)

5. Select bandwidth using leave-one-out cross-validation

Define function that calculates MSE given data and bandwidth. In this case for N-W estimator

samplefun <- function(bandwidth, data){
  
  loo_fun <- function(row, data, bandwidth){
    Y_i <- data[row,]$temperature
    x_0 <- data[row,]$radius  
    data <- data[-row,]
    
    tibble(y_epan = mhat_NW(x_0 = x_0,
                            Y = data$temperature,
                            X = data$radius,
                            h = bandwidth,
                            kernel = "epan"),
           y_unif = mhat_NW(x_0 = x_0,
                            Y = data$temperature,
                            X = data$radius,
                            h = bandwidth,
                            kernel = "uniform"),
           y_trian = mhat_NW(x_0 = x_0,
                             Y = data$temperature,
                             X = data$radius,
                             h = bandwidth,
                             kernel = "triangular"),
           y_gauss = mhat_NW(x_0 = x_0,
                             Y = data$temperature,
                             X = data$radius,
                             h = bandwidth,
                             kernel = "gaussian"),
           Y_i = Y_i)
  }
  
  map_dfr(1:nrow(data),
          loo_fun,
          data = data,
          bandwidth = bandwidth) %>% 
      summarise(CV_epan = sum((Y_i-y_epan)^2),
                CV_unif = sum((Y_i-y_unif)^2),
                CV_trian = sum((Y_i-y_trian)^2),
                CV_gauss = sum((Y_i-y_gauss)^2),
                bandwidth = bandwidth)
  
}

## DATASET GIVING THE CV VALUE FOR RESPECTIVE BANDWIDTH VALUES
bandwidth <- map_dfr((1:300)/10,
        samplefun,
        data = data_mars)

## FILTERED DATASET CONTAINING THE OPTIMAL BANDWIDTH VALUES
bandwidth_opt <- bandwidth %>% 
  pivot_longer(-bandwidth,
               names_to = "kernel",
               values_to = "Jacknife_CV") %>% 
  group_by(kernel) %>% 
  filter(Jacknife_CV == min(Jacknife_CV, na.rm = T))

## CREATE PLOT OF CV-VALUE OVER BANDWIDTH AND SHOW OPTIMAL BANDWIDTH
bw_plot <- bandwidth %>% 
  pivot_longer(-bandwidth,
               names_to = "kernel",
               values_to = "Jacknife_CV") %>% 
  ggplot(mapping = aes(x = bandwidth, y = Jacknife_CV, colour = kernel)) +
  geom_vline(data = bandwidth_opt,
             mapping = aes(xintercept = bandwidth, colour = kernel),
             linetype = 2,
             size = 0.6) +
  geom_text(data = bandwidth_opt,
             mapping = aes(x = bandwidth - c(0.7, 0.7, 0.7, -0.7),
                           y = c(-100, -600, -100, -600), label = bandwidth)) +
  geom_line(size = 0.8) +
  labs(title = "Optimal Bandwidth for Nadaraya-Watson estimator / different kernels") +
  theme_minimal() +
  theme(text = element_text(family = "CMU Serif"),
        legend.position = "bottom")

## SHOW INTERACTIVE PLOT
ggplotly(bw_plot)