Kernel Density and Distribution Estimation for data with different supports: \({\mathbb R}\), \({\mathbb R}_+\), and \((0,1)\)

Kernel density and kernel distribution estimation are powerful tools for fitting samples without model assumptions. The following R code shows the implementation of kernel density and distribution estimators for data with support on \({\mathbb R}\), \({\mathbb R}_+\), and \((0,1)\) by using a transformation approach. That is, data on \({\mathbb R}\) are fitted directly; data on \({\mathbb R}_+\) are fitted on the logarithmic scale; and data on \((0,1)\) are fitted on the logit scale.

# Intentionally no rm(list = ls()) here: clearing the caller's workspace is a
# destructive side effect. Every object used below is defined in this script,
# so run it in a fresh R session if a clean environment is wanted.

library(kerdiest) # required package
## Warning: package 'kerdiest' was built under R version 3.3.3
## Loading required package: date
## Warning: package 'date' was built under R version 3.3.2
## Loading required package: chron
## Warning: package 'chron' was built under R version 3.3.3
## Loading required package: evir
## Warning: package 'evir' was built under R version 3.3.2
# Logit function: maps p in (0,1) to the real line as log(p) - log(1 - p).
# log() is already vectorized, so no Vectorize() wrapper is needed; without it
# an empty input returns numeric(0) rather than an empty list.
logit <- function(p) log(p) - log(1 - p)

######################################################################################################
# Kernel density estimator (PDF)
######################################################################################################
# x:    vector of quantiles (points at which the density is evaluated)
# h:    smoothing parameter (bandwidth); used as the sd of the Gaussian kernel
# data: data set (numeric vector of observations)

# Function
# Gaussian kernel density estimate. For each element of x, returns the mean of
# the normal densities (sd = h) of the differences between that point and the
# observations. Evaluation is element-wise over x, so a vector of quantiles
# gives a vector of the same length; a bare mean(dnorm(x - data, 0, h)) would
# silently recycle x against data and collapse to a single number.
kde.dnorm <- function(x, h, data) {
  vapply(x, function(xi) mean(dnorm(xi - data, 0, h)), numeric(1),
         USE.NAMES = FALSE)
}

#---------------------------------------------
# Example 1: Data with support on R
#---------------------------------------------
# Simulated data: equal mixture of two normals centred at -1 and 1
set.seed(123)
sim.data <- c(rnorm(250, mean = -1, sd = 0.5), rnorm(250, mean = 1, sd = 0.5))

# Bandwidth chosen by bw.nrd0 on the raw data
h0 <- bw.nrd0(sim.data)

# Fitted density, wrapped so curve() can hand it a vector of points
kde.dfit <- Vectorize(function(x) kde.dnorm(x, h0, sim.data))

# Histogram of the sample with the kernel estimate drawn on top
hist(
  sim.data,
  probability = TRUE, breaks = 10, ylim = c(0, 0.5),
  xlab = "Data", ylab = "Density", main = "Histogram vs KDE",
  cex.axis = 1.5, cex.lab = 1.5
)
curve(kde.dfit, from = -3, to = 3, n = 1000, add = TRUE, col = "blue", lwd = 3)
box()

#---------------------------------------------
# Example 2: Data with support on R+
# Logarithmic transformation of the data
#---------------------------------------------
# Simulated data: log-normal sample
set.seed(123)
sim.data <- rlnorm(1000, meanlog = 0, sdlog = 0.5)

# Bandwidth chosen by bw.nrd0 on the log scale
h0 <- bw.nrd0(log(sim.data))

# Transformed KDE: estimate on the log scale, then divide by x
# (the derivative of log(x) is 1/x) to return to the original scale
kde.dfit <- Vectorize(function(x) kde.dnorm(log(x), h0, log(sim.data)) / x)

# Histogram of the sample with the transformed kernel estimate on top
hist(
  sim.data,
  probability = TRUE, breaks = 30, ylim = c(0, 1),
  xlab = "Data", ylab = "Density", main = "Histogram vs Transformed KDE",
  cex.axis = 1.5, cex.lab = 1.5
)
curve(kde.dfit, from = 0, to = 5, n = 1000, add = TRUE, col = "blue", lwd = 3)
box()

#---------------------------------------------
# Example 3: Data with support on (0,1)
# Logit transformation of the data
#---------------------------------------------
# Simulated data: Beta(3, 3) sample
set.seed(123)
sim.data <- rbeta(1000, shape1 = 3, shape2 = 3)

# Bandwidth chosen by bw.nrd0 on the logit scale
h0 <- bw.nrd0(logit(sim.data))

# Transformed KDE: estimate on the logit scale, then divide by x * (1 - x)
# (the derivative of logit(x) is 1 / (x * (1 - x))) to return to (0,1)
kde.dfit <- Vectorize(
  function(x) kde.dnorm(logit(x), h0, logit(sim.data)) / (x * (1 - x))
)

# Histogram of the sample with the transformed kernel estimate on top
hist(
  sim.data,
  probability = TRUE, breaks = 30, ylim = c(0, 2.5),
  xlab = "Data", ylab = "Density", main = "Histogram vs Transformed KDE",
  cex.axis = 1.5, cex.lab = 1.5
)
curve(kde.dfit, from = 0, to = 1, n = 1000, add = TRUE, col = "blue", lwd = 3)
box()

######################################################################################################
# Kernel distribution estimator (CDF)
######################################################################################################
# x:    vector of quantiles (points at which the distribution is evaluated)
# h:    smoothing parameter (bandwidth); used as the sd of the Gaussian kernel
# data: data set (numeric vector of observations)

# Function
# Gaussian kernel distribution estimate. For each element of x, returns the
# mean of the normal CDFs (sd = h) of the differences between that point and
# the observations. Evaluation is element-wise over x, so a vector of quantiles
# gives a vector of the same length; a bare mean(pnorm(x - data, 0, h)) would
# silently recycle x against data and collapse to a single number.
kde.pnorm <- function(x, h, data) {
  vapply(x, function(xi) mean(pnorm(xi - data, 0, h)), numeric(1),
         USE.NAMES = FALSE)
}

#---------------------------------------------
# Example 1: Data with support on R
#---------------------------------------------
# Simulated data: equal mixture of two normals centred at -1 and 1
set.seed(123)
sim.data <- c(rnorm(250, mean = -1, sd = 0.5), rnorm(250, mean = 1, sd = 0.5))

# Bandwidth for the CDF estimator, selected by ALbw with a normal kernel
h1 <- ALbw(type_kernel = "n", vec_data = sim.data)

# Fitted distribution function, wrapped so curve() can pass a vector of points
kde.pfit <- Vectorize(function(x) kde.pnorm(x, h1, sim.data))

# Empirical CDF of the sample with the kernel estimate drawn on top
plot(
  ecdf(sim.data),
  ylim = c(0, 1),
  xlab = "Data", ylab = "Distribution", main = "ECDF vs KDE",
  cex.axis = 1.5, cex.lab = 1.5
)
curve(kde.pfit, from = -3, to = 3, n = 1000, add = TRUE, col = "blue", lwd = 3)
box()

#---------------------------------------------
# Example 2: Data with support on R+
# Logarithmic transformation of the data
#---------------------------------------------
set.seed(123)
sim.data <- rlnorm(1000, 0, 0.5) # Simulated data
h1 <- ALbw(type_kernel = "n", vec_data = log(sim.data)) # Bandwidth for CDF
# Transformed KDE: the CDF of X at x equals the CDF of log(X) at log(x), so no
# Jacobian term is needed. The CDF bandwidth h1 computed above is used here;
# h0 is a density bandwidth left over from an earlier example.
kde.pfit <- Vectorize(function(x) kde.pnorm(log(x), h1, log(sim.data)))
plot(ecdf(sim.data), ylim = c(0, 1), lwd = 3,
     xlab = "Data", ylab = "Distribution", main = "ECDF vs KDE",
     cex.axis = 1.5, cex.lab = 1.5)
curve(kde.pfit, 0, 6, lwd = 3, col = "blue", add = TRUE, n = 1000)
box()

#---------------------------------------------
# Example 3: Data with support on (0,1)
# Logit transformation of the data
#---------------------------------------------
set.seed(123)
sim.data <- rbeta(1000, 3, 3) # Simulated data
h1 <- ALbw(type_kernel = "n", vec_data = logit(sim.data)) # Bandwidth for CDF
# Transformed KDE: the CDF of X at x equals the CDF of logit(X) at logit(x), so
# no Jacobian term is needed. The CDF bandwidth h1 computed above is used here
# rather than the density bandwidth h0.
kde.pfit <- Vectorize(function(x) kde.pnorm(logit(x), h1, logit(sim.data)))
plot(ecdf(sim.data), ylim = c(0, 1), lwd = 3,
     xlab = "Data", ylab = "Distribution", main = "ECDF vs KDE",
     cex.axis = 1.5, cex.lab = 1.5)
curve(kde.pfit, 0, 1, lwd = 3, col = "blue", add = TRUE, n = 1000)
box()