# Entering data and making data in approprite form to do analysis
x <- c(11, 22, 33, 44, 56, 67, 78, 89, 100, 50, 70, 90)
y <- c(2337, 2750, 2301, 2500, 2100, 1100, 1000, 1642, 1932, 1700, 1750, 2000)
df <- data.frame(x, y)
# Library for plots
library(ggplot2)
library(ggpubr)
#Library for data manipulation
library(dplyr)
# Making distance matrix of xi's
M <- as.matrix(dist(df$x, diag = T, upper = T))
M
## 1 2 3 4 5 6 7 8 9 10 11 12
## 1 0 11 22 33 45 56 67 78 89 39 59 79
## 2 11 0 11 22 34 45 56 67 78 28 48 68
## 3 22 11 0 11 23 34 45 56 67 17 37 57
## 4 33 22 11 0 12 23 34 45 56 6 26 46
## 5 45 34 23 12 0 11 22 33 44 6 14 34
## 6 56 45 34 23 11 0 11 22 33 17 3 23
## 7 67 56 45 34 22 11 0 11 22 28 8 12
## 8 78 67 56 45 33 22 11 0 11 39 19 1
## 9 89 78 67 56 44 33 22 11 0 50 30 10
## 10 39 28 17 6 6 17 28 39 50 0 20 40
## 11 59 48 37 26 14 3 8 19 30 20 0 20
## 12 79 68 57 46 34 23 12 1 10 40 20 0
#Defining k
k = 5
# Loop to find the fitted_knearest value for above given k
fitted_knearest = c()
for (i in 1:nrow(M)) {
index = c()
index <- sort(M[i, ], index.return = T)$ix[1:k]
fitted_knearest <- c(fitted_knearest, mean(df$y[index]))
}
df$fitted_knearest = fitted_knearest
# Making plot for K = 5
ggplot(df, aes(x = x, y = fitted_knearest)) + geom_point() +
geom_point(aes( y = y), col = "red") + geom_line() + ggtitle("k=5")
Here the black line is the fitted_knearest line and red points are observed points
Similarly we can make this plot for different values of k = 1, 3, 7, 12
As we can clearly see that plot for k = 5 seems to be reasonably good.
Here we will use two commanly used examples
\[ K(x) = \frac{1}{\sqrt{2 \pi}}exp(-x^2/2) \] 2. Epanechnikov Kernel :
\[ K(x) = \left \{ 3/4(1-x^2) \ \ if \ |x| \leq1,0,otherwise \right. \] Then for a given bandwidth h>0 , Kerenl regression estimate is defined as:
\[ \hat{f(x)} = \frac{\sum_{i=1}^{n}K(\frac{x-x_i}{h})y_i}{\sum_{i=1}^{n}K(\frac{x-x_i}{h})} \]
Following is the code for the above given dataset:
#Defining both gaussian and epanechinkov kernel functions
gassian_ker <- function(x){
val = (1/sqrt(2*pi))*exp(-0.5*x^2)
return(val)
}
epan_ker <- function(x){
ind1 = ifelse((-1 <= x & x<= 1),1,0)
val = (3/4)*(1-x^2)*ind1
return(val)
}
# defining bandwidth
h = 10
fitted_gaussian <- c()
fitted_epan <- c()
# For loop to find fitted value for both kernel functions
for (i in 1:nrow(df)) {
x_temp <- df$x[i]
temp_denominator_gauss <- 0
temp_denominator_epan <- 0
temp_num_gauss <- 0
temp_num_epan <- 0
for (k in 1:nrow(df)) {
temp_denominator_gauss = temp_denominator_gauss + gassian_ker((x_temp - df$x[k])/h)
temp_denominator_epan = temp_denominator_epan + epan_ker((x_temp - df$x[k])/h)
temp_num_gauss = temp_num_gauss + gassian_ker((x_temp - df$x[k])/h)*df$y[k]
temp_num_epan = temp_num_epan + epan_ker((x_temp - df$x[k])/h)*df$y[k]
}
fitted_gaussian[i] = temp_num_gauss/temp_denominator_gauss
fitted_epan[i] = temp_num_epan/temp_denominator_epan
}
df$fitted_gaussian <- fitted_gaussian
df$fitted_epan <- fitted_epan
# Data frame for all fitted values is given as:
df
# For plotting
p1 <- ggplot(df, aes(x = x, y = fitted_gaussian)) + geom_point() +
geom_point(aes( y = y), col = "red")+ geom_line()
p2 <- ggplot(df, aes(x = x, y = fitted_epan)) + geom_point() +
geom_point(aes( y = y), col = "red") + geom_line()
ggarrange(p1, p2, labels = c("Gaussian", "Epanechnikov"), nrow = 1, ncol = 2 )
We can further plot it for different values of h