1 Parametric density estimation

1.1 Basic Gaussian simulation

normal.data <- rnorm(100, mean = 10, sd = 10)
hist(normal.data)

1.2 Reproducible RNG

par(mfrow=c(1,2)) 
set.seed(42); 
normal.data2  <-rnorm(100, mean = 10, sd = 10)
hist(normal.data2)
set.seed(42); 
normal.data3  <-rnorm(100, mean = 10, sd = 10)
hist(normal.data3)

identical(normal.data2,normal.data3)
## [1] TRUE
all.equal(normal.data2,normal.data3)
## [1] TRUE

1.3 Log likelihood

normal.data4  <-rnorm(100, mean = 8, sd = 10)
LL <- function(mu,sigma=10) {
    R = suppressWarnings(dnorm(normal.data4, mu,sigma)) 
    return(-sum(log(R))) 
}
mu=1:20
#sapply(mu, LL)
plot(mu,sapply(mu, LL),type="l",ylab="Log likelihood")

2 Parametrics and Nonparametric density estimation

2.1 Basic exploratory data analysis

height = read.delim("height-1.txt")
par(mfrow=c(2,2))
hist(height[,2], probability = TRUE, breaks = 5,main = "Histogram of The Heights", xlab = "Heights (m)")
hist(height[,2], probability = TRUE, breaks = 10,main = "Histogram of The Heights", xlab = "Heights (m)")
hist(height[,2], probability = TRUE, breaks = 15,main = "Histogram of The Heights", xlab = "Heights (m)")
hist(height[,2], probability = TRUE, breaks = 20,main = "Histogram of The Heights", xlab = "Heights (m)")

2.2 Kernel density estimation

d1 <- density(height[,2], bw = 0.01, kernel = "gaussian") 
d2 <- density(height[,2], bw = 0.01, kernel = "epanechnikov") 
d3 <- density(height[,2], bw = 0.01, kernel = "triangular") 

hist(height[,2], freq = FALSE, breaks = 20, main = "Histogram of The Heights", xlab = "Heights (m)")
lines(d1$x, d1$y, col = "red") 
lines(d2$x, d2$y, col = "green2") 
lines(d3$x, d3$y, col = "purple") 
legend("topright",c("Gaussian","Epanechnikov", "Triangular"),
      lty = c(1, 1, 1), col = c("red", "green2", "purple"))

2.3 Probability calculation

d4 <- density(height[,2], bw = 0.05, kernel = "gaussian") 
f <- approxfun(d4)
p <- integrate(f, 1.7,max(d4$x))
p
## 0.4278724 with absolute error < 8.4e-06
plot(d4, main = "Density estimation")
abline(v=1.7, col = "red")
abline(v=max(d4$x), col = "red")

### 2.4 MLE

library(stats4) 
mle.fit <- mle(LL, start = list(mu = 5, sigma = 8))
mle.fit
## 
## Call:
## mle(minuslogl = LL, start = list(mu = 5, sigma = 8))
## 
## Coefficients:
##       mu    sigma 
## 7.125134 8.996318

3 Bandwidth selection

3.1 Integrated square error

StandardNorm  <- rnorm(100, mean = 0, sd = 1)

ISE <- function(bw) {
    ds <- density(x = StandardNorm, from = -3, to = 3, n = 512,bw=bw)
    result <- suppressWarnings((ds$y - dnorm(ds$x))^2) 
    return(sum(result)) 
}
bw=seq(0.01, 1, length=10)
ISER=sapply(bw, ISE)
plot(bw,ISER,type="l",ylab="ISE")

ISERmin=min(ISER)
optimalbw=bw[which(ISER==ISERmin)]
#list(bw,ISER)
print(cat('min ISE',ISERmin,'\noptimal bandwidth:',as.double(optimalbw)))
## min ISE 0.1638532 
## optimal bandwidth: 0.34NULL
optimalds=density(x = StandardNorm, from = -3, to = 3, n = 512,bw=optimalbw)
plot(optimalds$x,dnorm(optimalds$x),col='red',main="Density Estimation vs Normal Distribution Density",xlab="",ylab="density")
lines(optimalds$x,optimalds$y,type="l",xlab="")

3.2 Bandwidth selection algorithms

bw <- bw.ucv(height[,2], lower = 0.005, upper = 0.05) 
d5 <- density(height[,2], bw = bw, kernel = "gaussian") 
plot(d5, main = "Density estimation")

3.3 Bandwidth selection app

click to view the shiny app:

Density Estimator

APP. Student Info

Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 3
Student Name: Yujun Yao(June Yao)
SID: 500316995
Email: