library(tidyverse)
library(extrafont)
library(plotly)
Setup: here we define the data basis (i.e. the x-values) over which we calculate the kernels as well as the different kernels
## CALCULATE KERNELS OVER AN INTERVAL FROM -3 TO 3 IN 0.1 INCREMENTS
data <- tibble(u = (-300:300)/100) %>%
mutate(uniform = 1/2,
epanechnikov = 0.75 * (1-u^2),
gaussian = (1/(sqrt(2 * pi))) * exp(-0.5 * u^2),
triangular = (1 - abs(u))
)
## SHOW DATA
data
## # A tibble: 601 x 5
## u uniform epanechnikov gaussian triangular
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -3 0.5 -6 0.00443 -2
## 2 -2.99 0.5 -5.96 0.00457 -1.99
## 3 -2.98 0.5 -5.91 0.00470 -1.98
## 4 -2.97 0.5 -5.87 0.00485 -1.97
## 5 -2.96 0.5 -5.82 0.00499 -1.96
## 6 -2.95 0.5 -5.78 0.00514 -1.95
## 7 -2.94 0.5 -5.73 0.00530 -1.94
## 8 -2.93 0.5 -5.69 0.00545 -1.93
## 9 -2.92 0.5 -5.64 0.00562 -1.92
## 10 -2.91 0.5 -5.60 0.00578 -1.91
## # ... with 591 more rows
Visualisation
## PLOT THE KERNELS
data %>%
pivot_longer(names_to = "kernel",
values_to = "values",
-u) %>%
ggplot(mapping = aes(x = u, y = values, colour = kernel)) +
geom_line(size = 0.8) +
theme_minimal() +
theme(text = element_text(family = "CMU Serif"),
legend.position = "bottom") +
lims(y = c(-0, 1.1))
Define: \[ \kappa_\lambda = \int v^\lambda K(v) dv \qquad \text{and} \qquad \bar{\kappa}_\lambda = \int v^\lambda K(v)^2dv \] Then, a kernel \(K\) is of order \(r\) if \[\begin{aligned} \kappa_0 &= 1 \\ \kappa_\lambda &= 0 \qquad \text{for} \qquad 1 \leq \lambda \leq \lambda- 1 \\ \infty > \kappa_\lambda &\neq 0 \qquad \text{for} \qquad \lambda = r \end{aligned}\]
The Epanechnikov kernel (\(K(u) = \frac{3}{4}(1-u^2)\)) has order \(r = 2\) \[\begin{aligned} \kappa_0 &= \frac{3}{4} \int_{-1}^1 (1-u^2) = \frac{3}{4}\left[u - \frac{1}{3}u^3\right]_{-1}^1 = \frac{3}{4}\left[ \left(1 - \frac{1}{3} \right) - \left(-1 + \frac{1}{3} \right) \right] = \frac{3}{4}\frac{4}{3} = 1\\ \kappa_1 &= \frac{3}{4} \int_{-1}^1 u(1-u^2) = \frac{3}{4}\left[\frac{1}{2}u^2 - \frac{1}{4}u^4\right]_{-1}^1 = \frac{3}{4}\left[ \left(\frac{1}{2} - \frac{1}{4} \right) - \left(\frac{1}{2} - \frac{1}{4} \right) \right] = 0\\ \kappa_2 &= \frac{3}{4} \int_{-1}^1 u^2(1-u^2) = \frac{3}{4}\left[\frac{1}{3}u^3 - \frac{1}{5}u^5\right]_{-1}^1 = \frac{3}{4}\left[ \left(\frac{1}{3} - \frac{1}{5} \right) - \left(-\frac{1}{3} + \frac{1}{5} \right) \right] = \frac{3}{4}\frac{4}{15} = \frac{1}{5} \end{aligned}\]
The Gaussian kernel (\(K(u) = \frac{1}{\sqrt{2 \pi}}\exp\left(- \frac{1}{2} u^2\right)\)) has order \(r=2\) \[\begin{aligned} \kappa_0 &= \int \frac{1}{\sqrt{2 \pi}}\exp\left(- \frac{1}{2} u^2\right) = 1 \\ \kappa_1 &= \frac{1}{\sqrt{2 \pi}} \int u \exp\left(- \frac{1}{2} u^2\right) = \left[ -\exp\left(-\frac{1}{2} u^2\right) \right]_{-\infty}^\infty = 0 \\ \kappa_2 &= \frac{1}{\sqrt{2 \pi}} \int u^2 \exp\left(- \frac{1}{2} u^2\right) \end{aligned}\]
For this exercise, use the mars.dta data set. It contains data on temperature-pressure profile of the Martian atmosphere, as measured by the Mars Global Surveyor spacecraft in 2003 using a radio occultation technique.
## READ IN DATA
data_mars <- read_delim(file = "../data/mars.dat",delim = " ")
## SCATTERPLOT OF RADIUS AND TEMPERATURE
data_mars %>%
ggplot(mapping = aes(x = radius, y = temperature)) +
geom_point() +
theme_minimal() +
theme(text = element_text(family = "CMU Serif"),
legend.position = "bottom")
Define functions that calculate \(u\) and \(K(u)\) for different kernels \(K\)
#' Calculate kernel weight u which will be used in `K(u)`
#'
#' @param x_0 the x_i of interest
#' @param X a vector of all x_i
#' @param h bandwith parameter
kernel_weight <- function(x_0, X, h){
(X - x_0) / h
}
#' Epanechnikov kernel (the `K(u)` function)
#' @param u a vector of weights u, calculated by `kernel_weight()`
kernel_epan <- function(u){
ret <- 0.75 * (1-(u^2))
ret[which(u < -1 | u > 1)] <- 0
ret
}
#' Uniform kernel (the `K(u)` function)
#' @param u a vector of weights u, calculated by `kernel_weight()`
kernel_uniform <- function(u, h ){
abs(u) <= h
}
#' Triangular kernel (the `K(u)` function)
#' @param u a vector of weights u, calculated by `kernel_weight()`
kernel_trian <- function(u){
ret <- (1 - abs(u))
ret[which(u < -1 | u > 1)] <- 0
ret
}
#' Gaussian kernel (the `K(u)` function)
#' @param u a vector of weights u, calculated by `kernel_weight()`
kernel_gauss <- function(u){
(1/(sqrt(2 * pi))) * exp(-0.5 * u^2)
}
Define Nadaraya-Watson estimator function
#' Implementation of the Nadaraya-Watson estimator
#'
#' @param x_0 The x_i of interest
#' @param X The vector of all x_i
#' @param h Bandwith parameter
#' @param Y The vector of all y_i
#' @param kernel String indicating which kernel to use
mhat_NW <- function(x_0, Y, X, h, kernel){
u <- kernel_weight(x_0, X, h = h)
if(kernel == "epan"){
K <- kernel_epan(u)
} else if (kernel == "uniform"){
K <- kernel_uniform(u, h)
} else if (kernel == "triangular"){
K <- kernel_trian(u)
} else if (kernel == "gaussian"){
K <- kernel_gauss(u)
}
sum(Y * K) / sum(K)
}
Calculate the kernels and show them on a plot
## BANDWIDTH FOR THIS CHUNK
bandwidth <- 0.5
## CALCULATE SMOOTH ESTIMATES
data_mars <- data_mars %>%
mutate(smooth_NW_epan = map_dbl(.x = radius,
.f = mhat_NW,
Y = temperature,
X = radius,
h = bandwidth,
kernel = "epan"),
smooth_NW_unif = map_dbl(.x = radius,
.f = mhat_NW,
Y = temperature,
X = radius,
h = bandwidth,
kernel = "uniform"),
smooth_NW_trian = map_dbl(.x = radius,
.f = mhat_NW,
Y = temperature,
X = radius,
h = bandwidth,
kernel = "triangular"),
smooth_NW_gauss = map_dbl(.x = radius,
.f = mhat_NW,
Y = temperature,
X = radius,
h = bandwidth,
kernel = "gaussian"))
## CREATE PLOT
smooth_plot <- data_mars %>%
select(-sigmatemperature) %>%
pivot_longer(names_to = "kernel",
values_to = "value",
-c("radius", "temperature")) %>%
ggplot(mapping = aes(x = radius, y = value, color = kernel)) +
geom_line() +
geom_point(mapping = aes(y = temperature), color = "black") +
theme_minimal() +
theme(text = element_text(family = "CMU Serif"),
legend.position = "bottom") +
labs(title = "Bandwidth 0.5")
## SHOW INTERACTIVE PLOT
ggplotly(smooth_plot)
## BANDWIDTH FOR THIS CHUNK
bandwidth <- 1
## CALCULATE SMOOTH ESTIMATES
data_mars <- data_mars %>%
mutate(smooth_NW_epan = map_dbl(.x = radius,
.f = mhat_NW,
Y = temperature,
X = radius,
h = bandwidth,
kernel = "epan"),
smooth_NW_unif = map_dbl(.x = radius,
.f = mhat_NW,
Y = temperature,
X = radius,
h = bandwidth,
kernel = "uniform"),
smooth_NW_trian = map_dbl(.x = radius,
.f = mhat_NW,
Y = temperature,
X = radius,
h = bandwidth,
kernel = "triangular"),
smooth_NW_gauss = map_dbl(.x = radius,
.f = mhat_NW,
Y = temperature,
X = radius,
h = bandwidth,
kernel = "gaussian"))
## CREATE PLOT
smooth_plot_2 <- data_mars %>%
select(-sigmatemperature) %>%
pivot_longer(names_to = "kernel",
values_to = "value",
-c("radius", "temperature")) %>%
ggplot(mapping = aes(x = radius, y = value, color = kernel)) +
geom_line() +
geom_point(mapping = aes(y = temperature), color = "black") +
theme_minimal() +
theme(text = element_text(family = "CMU Serif"),
legend.position = "bottom") +
labs(title = "Bandwidth 1")
## SHOW INTERACTIVE PLOT
ggplotly(smooth_plot_2)
Define function that calculates MSE given data and bandwidth. In this case for N-W estimator
samplefun <- function(bandwidth, data){
loo_fun <- function(row, data, bandwidth){
Y_i <- data[row,]$temperature
x_0 <- data[row,]$radius
data <- data[-row,]
tibble(y_epan = mhat_NW(x_0 = x_0,
Y = data$temperature,
X = data$radius,
h = bandwidth,
kernel = "epan"),
y_unif = mhat_NW(x_0 = x_0,
Y = data$temperature,
X = data$radius,
h = bandwidth,
kernel = "uniform"),
y_trian = mhat_NW(x_0 = x_0,
Y = data$temperature,
X = data$radius,
h = bandwidth,
kernel = "triangular"),
y_gauss = mhat_NW(x_0 = x_0,
Y = data$temperature,
X = data$radius,
h = bandwidth,
kernel = "gaussian"),
Y_i = Y_i)
}
map_dfr(1:nrow(data),
loo_fun,
data = data,
bandwidth = bandwidth) %>%
summarise(CV_epan = sum((Y_i-y_epan)^2),
CV_unif = sum((Y_i-y_unif)^2),
CV_trian = sum((Y_i-y_trian)^2),
CV_gauss = sum((Y_i-y_gauss)^2),
bandwidth = bandwidth)
}
## DATASET GIVING THE CV VALUE FOR RESPECTIVE BANDWIDTH VALUES
bandwidth <- map_dfr((1:300)/10,
samplefun,
data = data_mars)
## FILTERED DATASET CONTAINING THE OPTIMAL BANDWIDTH VALUES
bandwidth_opt <- bandwidth %>%
pivot_longer(-bandwidth,
names_to = "kernel",
values_to = "Jacknife_CV") %>%
group_by(kernel) %>%
filter(Jacknife_CV == min(Jacknife_CV, na.rm = T))
## CREATE PLOT OF CV-VALUE OVER BANDWIDTH AND SHOW OPTIMAL BANDWIDTH
bw_plot <- bandwidth %>%
pivot_longer(-bandwidth,
names_to = "kernel",
values_to = "Jacknife_CV") %>%
ggplot(mapping = aes(x = bandwidth, y = Jacknife_CV, colour = kernel)) +
geom_vline(data = bandwidth_opt,
mapping = aes(xintercept = bandwidth, colour = kernel),
linetype = 2,
size = 0.6) +
geom_text(data = bandwidth_opt,
mapping = aes(x = bandwidth - c(0.7, 0.7, 0.7, -0.7),
y = c(-100, -600, -100, -600), label = bandwidth)) +
geom_line(size = 0.8) +
labs(title = "Optimal Bandwidth for Nadaraya-Watson estimator / different kernels") +
theme_minimal() +
theme(text = element_text(family = "CMU Serif"),
legend.position = "bottom")
## SHOW INTERACTIVE PLOT
ggplotly(bw_plot)