K-Nearest Neighbours

# Entering the data and putting it into an appropriate form for the analysis
x <- c(11, 22, 33, 44, 56, 67, 78, 89, 100, 50, 70, 90)
y <- c(2337, 2750, 2301, 2500, 2100, 1100, 1000, 1642, 1932, 1700, 1750, 2000)
df <- data.frame(x, y)

# Library for plots
library(ggplot2) 
library(ggpubr)

# Library for data manipulation
library(dplyr)

# Building the distance matrix of the x_i's

M <- as.matrix(dist(df$x, diag = T, upper = T))

M
##     1  2  3  4  5  6  7  8  9 10 11 12
## 1   0 11 22 33 45 56 67 78 89 39 59 79
## 2  11  0 11 22 34 45 56 67 78 28 48 68
## 3  22 11  0 11 23 34 45 56 67 17 37 57
## 4  33 22 11  0 12 23 34 45 56  6 26 46
## 5  45 34 23 12  0 11 22 33 44  6 14 34
## 6  56 45 34 23 11  0 11 22 33 17  3 23
## 7  67 56 45 34 22 11  0 11 22 28  8 12
## 8  78 67 56 45 33 22 11  0 11 39 19  1
## 9  89 78 67 56 44 33 22 11  0 50 30 10
## 10 39 28 17  6  6 17 28 39 50  0 20 40
## 11 59 48 37 26 14  3  8 19 30 20  0 20
## 12 79 68 57 46 34 23 12  1 10 40 20  0
# Defining k (the number of nearest neighbours)
k = 5

# Loop to find the fitted k-nearest-neighbour value for the k given above
fitted_knearest <- c()
for (i in 1:nrow(M)) {
  # Indices of the k smallest distances in row i (each point counts itself as a neighbour)
  index <- sort(M[i, ], index.return = T)$ix[1:k]
  fitted_knearest <- c(fitted_knearest, mean(df$y[index]))
}


df$fitted_knearest = fitted_knearest
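
As a quick cross-check, the same fitted values can be obtained with knn.reg() from the FNN package (this assumes FNN is installed; it is not used elsewhere in this document). Supplying the training x's as the query points means each point counts itself among its own k nearest neighbours, just as in the loop above, so the results should agree up to tie-breaking among equidistant neighbours.

# Cross-check with FNN::knn.reg (assumes the FNN package is available)
library(FNN)
knn_check <- knn.reg(train = matrix(df$x), test = matrix(df$x), y = df$y, k = k)
all.equal(knn_check$pred, df$fitted_knearest)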

# Making the plot for k = 5

ggplot(df, aes(x = x, y = fitted_knearest)) + geom_point() +
  geom_point(aes(y = y), col = "red") + geom_line() + ggtitle("k=5")

Here the black line joins the fitted values (fitted_knearest) and the red points are the observed data points.

Similarly, we can make this plot for other values of k, say k = 1, 3, 7 and 12, as sketched below.
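
A minimal sketch of how those plots could be produced, reusing the distance matrix M from above; knn_fit() is a small helper introduced here for illustration, not part of the original code.

# Helper: k-nearest-neighbour fitted values for a given k, reusing the distance matrix M
knn_fit <- function(k) {
  sapply(1:nrow(M), function(i) {
    index <- sort(M[i, ], index.return = T)$ix[1:k]
    mean(df$y[index])
  })
}

# One panel per candidate k
plots_k <- lapply(c(1, 3, 7, 12), function(k) {
  df_k <- data.frame(x = df$x, y = df$y, fit = knn_fit(k))
  ggplot(df_k, aes(x = x, y = fit)) + geom_point() +
    geom_point(aes(y = y), col = "red") + geom_line() + ggtitle(paste("k =", k))
})

ggarrange(plotlist = plots_k, nrow = 2, ncol = 2)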

Among these, the plot for k = 5 appears to give a reasonably good fit.

Kernel Smoothing

Here we will use two commonly used kernels:

  1. Gaussian kernel:

\[ K(x) = \frac{1}{\sqrt{2 \pi}} \exp(-x^2/2) \]

  2. Epanechnikov kernel:

\[ K(x) = \begin{cases} \frac{3}{4}(1 - x^2) & \text{if } |x| \leq 1 \\ 0 & \text{otherwise} \end{cases} \]

Then, for a given bandwidth h > 0, the kernel regression estimate is defined as:

\[ \hat{f}(x) = \frac{\sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right) y_i}{\sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)} \]

The following is the code for the dataset given above:

# Defining both the Gaussian and Epanechnikov kernel functions
gassian_ker <- function(x){
  val = (1/sqrt(2*pi))*exp(-0.5*x^2)
  return(val)
}

epan_ker <- function(x){
  ind1 = ifelse((-1 <= x & x <= 1), 1, 0)   # indicator of |x| <= 1
  val = (3/4)*(1-x^2)*ind1
  return(val)
}

# Defining the bandwidth
h = 10

fitted_gaussian <- c()
fitted_epan <- c()

# For loop to find the fitted values for both kernel functions
for (i in 1:nrow(df)) {
  x_temp <- df$x[i]
  
  temp_denominator_gauss <- 0
  temp_denominator_epan <- 0
  temp_num_gauss <- 0
  temp_num_epan <- 0
  
  # Inner loop over all data points; j is used so that the k defined earlier is not overwritten
  for (j in 1:nrow(df)) {
    temp_denominator_gauss = temp_denominator_gauss + gassian_ker((x_temp - df$x[j])/h)
    temp_denominator_epan = temp_denominator_epan + epan_ker((x_temp - df$x[j])/h)
    temp_num_gauss = temp_num_gauss + gassian_ker((x_temp - df$x[j])/h)*df$y[j]
    temp_num_epan = temp_num_epan + epan_ker((x_temp - df$x[j])/h)*df$y[j]
  }
  fitted_gaussian[i] = temp_num_gauss/temp_denominator_gauss
  fitted_epan[i] = temp_num_epan/temp_denominator_epan
}

df$fitted_gaussian <- fitted_gaussian
df$fitted_epan <- fitted_epan  

# Data frame with all the fitted values:
df
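
The double loop above can also be written in vectorised form. The following is a minimal sketch that builds the full weight matrix with outer() and should reproduce the loop-based fitted values up to floating-point error.

# Vectorised kernel regression weights: W[i, j] = K((x_i - x_j) / h)
W_gauss <- gassian_ker(outer(df$x, df$x, "-") / h)
W_epan  <- epan_ker(outer(df$x, df$x, "-") / h)

# Numerators are row-wise weighted sums of y; denominators are the row sums of the weights
fitted_gaussian_vec <- as.vector(W_gauss %*% df$y) / rowSums(W_gauss)
fitted_epan_vec     <- as.vector(W_epan %*% df$y) / rowSums(W_epan)

all.equal(fitted_gaussian_vec, df$fitted_gaussian)
all.equal(fitted_epan_vec, df$fitted_epan)
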
# For plotting
p1 <- ggplot(df, aes(x = x, y = fitted_gaussian)) + geom_point() +
  geom_point(aes(y = y), col = "red") + geom_line()

p2 <- ggplot(df, aes(x = x, y = fitted_epan)) + geom_point() +
  geom_point(aes(y = y), col = "red") + geom_line()

ggarrange(p1, p2, labels  = c("Gaussian", "Epanechnikov"), nrow = 1, ncol = 2 )
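
Base R's ksmooth() implements the same kind of kernel smoother with a normal kernel and can serve as a rough cross-check. According to its help page the kernel is scaled so that its quartiles lie at +/- 0.25 * bandwidth, so a bandwidth of 4 * qnorm(0.75) * h (roughly 2.7 * h) corresponds to a Gaussian kernel with standard deviation h, and the fitted values should be close to the manual Gaussian-kernel fit above.

# Rough cross-check with base R's ksmooth(); the bandwidth is rescaled so that the
# normal kernel's standard deviation matches h
ks <- ksmooth(df$x, df$y, kernel = "normal",
              bandwidth = 4 * qnorm(0.75) * h, x.points = df$x)

# ksmooth() returns its fitted values ordered by x, so compare against the sorted data
cbind(x = ks$x, ksmooth = ks$y, manual = df$fitted_gaussian[order(df$x)])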

We can further plot the fit for different values of h, for example as sketched below.
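
A sketch along the same lines as the k comparison above, using the Gaussian kernel and the vectorised weights; the bandwidths 5, 10, 20 and 40 are illustrative choices only.

# One panel per candidate bandwidth (Gaussian kernel)
plots_h <- lapply(c(5, 10, 20, 40), function(h) {
  W <- gassian_ker(outer(df$x, df$x, "-") / h)
  df_h <- data.frame(x = df$x, y = df$y, fit = as.vector(W %*% df$y) / rowSums(W))
  ggplot(df_h, aes(x = x, y = fit)) + geom_point() +
    geom_point(aes(y = y), col = "red") + geom_line() + ggtitle(paste("h =", h))
})

ggarrange(plotlist = plots_h, nrow = 2, ncol = 2)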