First, set a few things up: humpcol is the colour of the KDE, and bgcol is the colour of the true distribution. closeup closes the density curve into a polygon with a baseline on the x axis; this is useful for shading in its area. tweens contains the landmark curves at regularly spaced bandwidths, and the tweened curves in between them. The do.call and rbind combination turns a list of tweened vectors into a matrix.
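As a quick aside (a toy illustration, not part of the analysis), do.call passes each element of a list to rbind as a separate argument, stacking equal-length vectors into a matrix:
# do.call(rbind, ...) hands each list element to rbind, one row per vector
frames <- list(c(1, 2, 3), c(4, 5, 6))
do.call(rbind, frames)
#      [,1] [,2] [,3]
# [1,]    1    2    3
# [2,]    4    5    6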
library(tweenr)
N <- 8   # number of landmark bandwidths
nn <- 8  # frames for tween_numeric to generate
set.seed(250162)
x <- rnorm(400)
bw <- seq(0.02, 0.7, l = N)
results <- vector(mode = 'list', length = N)
for (i in 1:N) {
  kd <- density(x, bw = bw[i], from = -3, to = 3)
  xx <- kd$x             # the x grid is the same for every bandwidth
  results[[i]] <- kd$y   # store each landmark KDE curve
}
# Rows follow the x grid; columns index the animation frames
tweens <- do.call(rbind, tween_numeric(results, n = nn))
humpcol <- adjustcolor('indianred', alpha.f = 0.5)  # KDE fill colour
bgcol <- adjustcolor('dodgerblue', alpha.f = 0.5)   # true density fill colour
nd <- dnorm(xx)  # the true (standard normal) density on the grid
# Close a curve into a polygon by dropping its endpoints to the x axis
closeup <- function(x, y) {
  xy <- cbind(x, y)
  top <- xy[1, ]
  btm <- xy[nrow(xy), ]
  top[2] <- btm[2] <- 0
  return(rbind(top, xy, btm))
}
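A quick check of what closeup returns (illustrative only) shows the two added baseline points:
closeup(1:3, c(2, 5, 3))
#     x y
# top 1 0
#     1 2
#     2 5
#     3 3
# btm 3 0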
Now that the preparation is done (much of the work was in the tweening), creating an animation is just a matter of drawing each KDE in turn.
for (i in 1:ncol(tweens)) {
  plot(closeup(xx, tweens[, i]), type = 'n', ylim = c(0, 0.6))
  polygon(closeup(xx, nd), border = bgcol, col = bgcol)               # true density
  polygon(closeup(xx, tweens[, i]), border = humpcol, col = humpcol)  # current KDE frame
}
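These frames only animate when drawn to a live graphics device. To make a shareable animation, one option (a minimal sketch; the file names are arbitrary and the PNGs still need assembling with an external tool such as gifski or ImageMagick) is to write each frame to disk:
# Sketch: save numbered PNG frames for later assembly into a GIF or video
for (i in 1:ncol(tweens)) {
  png(sprintf('frame%03d.png', i))
  plot(closeup(xx, tweens[, i]), type = 'n', ylim = c(0, 0.6))
  polygon(closeup(xx, nd), border = bgcol, col = bgcol)
  polygon(closeup(xx, tweens[, i]), border = humpcol, col = humpcol)
  dev.off()
}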
Another approach is to tween the KDE bandwidths rather than the KDEs themselves:
bwvec <- tween_numeric(c(0.02, 0.7), n = 25)[[1]]
for (bwi in bwvec) {
  kd <- density(x, bw = bwi, from = -3, to = 3)
  plot(closeup(kd$x, kd$y), type = 'n', ylim = c(0, 0.6))
  polygon(closeup(xx, nd), border = bgcol, col = bgcol)
  polygon(closeup(kd$x, kd$y), border = humpcol, col = humpcol)
}
The above isn’t that interesting; it’s not much different from using seq(0.02,0.7,l=25) to generate the bandwidths. It gets more interesting with other easing functions:
bwvec <- tween_numeric(c(0.02, 0.7), n = 65, ease = 'bounce-out')[[1]]
for (bwi in bwvec) {
  kd <- density(x, bw = bwi, from = -3, to = 3)
  plot(closeup(kd$x, kd$y), type = 'n', ylim = c(0, 0.6))
  polygon(closeup(xx, nd), border = bgcol, col = bgcol)
  polygon(closeup(kd$x, kd$y), border = humpcol, col = humpcol)
}
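To see what an easing function does to the interpolation before committing to an animation, tweenr has (if memory serves; worth checking against your installed version) a display_ease() helper that plots progression against time:
display_ease('bounce-out')  # preview the easing curve itself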
Here is another one:
bwvec <- tween_numeric(c(0.2, 0.7), n = 65, ease = 'elastic-in-out')[[1]]
for (bwi in bwvec) {
  # elastic easing overshoots its endpoints, so abs() guards against
  # a negative bandwidth being passed to density()
  kd <- density(x, bw = abs(bwi), from = -3, to = 3)
  plot(closeup(kd$x, kd$y), type = 'n', ylim = c(0, 0.6))
  polygon(closeup(xx, nd), border = bgcol, col = bgcol)
  polygon(closeup(kd$x, kd$y), border = humpcol, col = humpcol)
}
For a given sample, the best choice of bandwidth is one providing a KDE that is in some way ‘nearest’ to the true density. Here we know the true density is Gaussian (because it’s a simulation), and so some measure of the difference between that and the KDE is a reasonable criterion. One possibility is the absolute difference between the KDE and the Gaussian probability density function, integrated over the range of possible \(x\) values:
\[ \Delta_h = \int_{-\infty}^{\infty} | \hat{f}_h(x) - \phi(x) | \, dx \] where \(\phi\) is the Gaussian probability density function. However, a better option might be a weighted average absolute deviation: since most of the observations lie close to the median of the distribution, more weight should be given to \(|\hat{f}_h(x) - \phi(x)|\) for \(x\) near this point. This can be achieved by modifying the measure to \[ \Delta_h = \int_{-\infty}^{\infty} \phi(x) \, | \hat{f}_h(x) - \phi(x) | \, dx \] which can be approximated discretely as the weighted mean of \(|\hat{f}_h(x) - \phi(x)|\) over a range of \(x\) values, with weights \(\phi(x)\):
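Concretely, on the grid \(x_1, \dots, x_m\) returned by density(), the weighted mean computed below is \[ \Delta_h \approx \frac{\sum_{j=1}^{m} \phi(x_j) \, | \hat{f}_h(x_j) - \phi(x_j) |}{\sum_{j=1}^{m} \phi(x_j)} \] which is exactly what weighted.mean() evaluates with weights \(\phi(x_j)\).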
bwvec <- tween_numeric(c(0.1, 0.7), n = 50)[[1]]
score <- numeric(length(bwvec))
for (i in seq_along(bwvec)) {
  kd <- density(x, bw = bwvec[i], from = -3, to = 3)
  # phi-weighted mean absolute deviation between the KDE and true density
  score[i] <- weighted.mean(abs(kd$y - nd), nd)
}
plot(bwvec, score, type = 'l')
From this, a bandwidth of around 0.3938776 gives the best estimate for this sample. It is then possible to tween from a low value to a high one, then settle on the optimum, to create a simulated search.
min_ind <- which.min(score)
minval <- bwvec[min_ind]  # the best bandwidth found
bwvec <- c(tween_numeric(c(0.1, 0.7), n = 20)[[1]],
           tween_numeric(c(0.7, minval), n = 60, ease = 'elastic-out')[[1]])
for (bwi in bwvec) {
  kd <- density(x, bw = abs(bwi), from = -3, to = 3)
  plot(closeup(kd$x, kd$y), type = 'n', ylim = c(0, 0.6))
  polygon(closeup(xx, nd), border = bgcol, col = bgcol)
  polygon(closeup(kd$x, kd$y), border = humpcol, col = humpcol)
}
Finally, something of a novelty, but another way to visually suggest the optimisation process:
# Score along the simulated search path
score <- numeric(length(bwvec))
for (i in seq_along(bwvec)) {
  kd <- density(x, bw = bwvec[i], from = -3, to = 3)
  score[i] <- weighted.mean(abs(kd$y - nd), nd)
}
# Score along an evenly spaced bandwidth grid, for the background curve
bwvec2 <- tween_numeric(c(0.1, 0.7), n = 60)[[1]]
score2 <- numeric(length(bwvec2))
for (i in seq_along(bwvec2)) {
  kd <- density(x, bw = bwvec2[i], from = -3, to = 3)
  score2[i] <- weighted.mean(abs(kd$y - nd), nd)
}
# Animate the search: the full score curve as background, the current
# bandwidth as a moving point, with the previous position ghosted
for (i in seq_along(bwvec)) {
  plot(bwvec2, score2, type = 'l')
  if (i > 1)
    points(bwvec[i - 1], score[i - 1], pch = 16, cex = 3,
           col = adjustcolor('indianred', alpha.f = 0.2))
  points(bwvec[i], score[i], pch = 16, cex = 3,
         col = adjustcolor('indianred', alpha.f = 0.4))
}
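The two views could also be tied together, drawing each KDE frame beside the search's progress along the score curve (a sketch reusing the objects above; the side-by-side layout via par(mfrow=...) is just one choice):
# Sketch: KDE frame (left) and position on the score curve (right)
for (i in seq_along(bwvec)) {
  par(mfrow = c(1, 2))
  kd <- density(x, bw = abs(bwvec[i]), from = -3, to = 3)
  plot(closeup(kd$x, kd$y), type = 'n', ylim = c(0, 0.6))
  polygon(closeup(xx, nd), border = bgcol, col = bgcol)
  polygon(closeup(kd$x, kd$y), border = humpcol, col = humpcol)
  plot(bwvec2, score2, type = 'l')
  points(bwvec[i], score[i], pch = 16, cex = 3, col = humpcol)
  par(mfrow = c(1, 1))
}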