Height Data set
setwd("C:/Users/gabeg/Documents/Uni/Stat 5003/Week 3")
height <- read_delim('height.txt')
## Rows: 200 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (1): ID
## dbl (1): Height_m
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(ggplot2)
theme_set(
theme_bw()
)
ggplot(data = height, aes(x = Height_m)) +
geom_histogram(aes(y=..density..), bins = 5, colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") +
labs(title = "5 Bins for Probability Density Function of Height")

ggplot(data = height, aes(x = Height_m)) +
geom_histogram(aes(y=..density..), bins = 10, colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") +
labs(title = "10 Bins for Probability Density Function of Height")

ggplot(data = height, aes(x = Height_m)) +
geom_histogram(aes(y=..density..), bins = 20, colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") +
labs(title = "20 Bins for Probability Density Function of Height")

## 2.2 Kernel Density Estimation
kernels <- c("gaussian", "triangular", "epanechnikov")
par(mfrow = c(2,2))
invisible(lapply(kernels, function(k){
plot(density(height$Height_m, kernel = k),
main = paste(tools::toTitleCase(k), "kernel"))
rug(height$Height_m)
}
))

##2.3 Probability person is taller than 1.7m using Kernel desnity estimator
density_est <- density(height[["Height_m"]])
f <- approxfun(density_est)
prob <- integrate(f,1.7, max(density_est$x))
prob
## 0.4314333 with absolute error < 3.1e-05
##2.4
#If we assume the heights data follows a Gaussian distribution, what are the maximum likelihood estimates of the parameters
library(stats4)
log_likelihood <- function(mu,sigma) {
-sum(dnorm(height$Height_m, mean=mu, sd=sigma, log=TRUE))
}
fit <- mle(log_likelihood, start = c(mu = mean(height$Height_m), sigma = sd(height$Height_m)))
## Warning in dnorm(height$Height_m, mean = mu, sd = sigma, log = TRUE): NaNs
## produced
## Warning in dnorm(height$Height_m, mean = mu, sd = sigma, log = TRUE): NaNs
## produced
## Warning in dnorm(height$Height_m, mean = mu, sd = sigma, log = TRUE): NaNs
## produced
## Warning in dnorm(height$Height_m, mean = mu, sd = sigma, log = TRUE): NaNs
## produced
summary(fit)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = log_likelihood, start = c(mu = mean(height$Height_m),
## sigma = sd(height$Height_m)))
##
## Coefficients:
## Estimate Std. Error
## mu 1.68177698 0.005716404
## sigma 0.08084216 0.004040079
##
## -2 log L: -438.577
## 3.1
set.seed(530306322)
sim_data <- rnorm(100, mean = 0, sd = 1)
## get bandwidth matrix
sim_bands <- seq(from =0.01, to = 3, by = 0.01)
ise_calc <- function(data, bandwidth) {
density <- density(data, bw = bandwidth, n= 512, from = -3, to = 3)
x <- density$x
y_pop <- dnorm(x)
y_est <- density$y
ise <- sum((y_pop - y_est)^2)
return(ise)
}
ise_array <- matrix(ncol = 2, nrow = length(sim_bands))
for (h in 1:length(sim_bands)){
ise_array[h, 1] <- sim_bands[h]
ise_array[h, 2] <- ise_calc(sim_data, sim_bands[h])
}
colnames(ise_array) <- c("Bandwidth", "ISE")
ise_df <- as.data.frame(ise_array)
best_bw <- filter(ise_df, ISE == min(ISE))
cat("The Bandwidth that minimises ISE is", best_bw$Bandwidth, "with an ISE of", best_bw$ISE)
## The Bandwidth that minimises ISE is 0.36 with an ISE of 0.8872737
best_h <- best_bw$Bandwidth
ggplot(data = ise_df, aes(x = Bandwidth, y = ISE)) +
geom_line() +
scale_x_continuous(breaks = round(seq(min(ise_df$Bandwidth), max(ise_df$Bandwidth), by = 0.2),1)) +
geom_vline(xintercept = best_h, linetype = "dashed", color = "red", size = 0.5) +
labs(title = "Bandwidth of Gaussian Kernel Density Estimator vs. ISE")

optimal_dense <- density(sim_data, bw = best_h, n= 512, from = -3, to = 3)
x_values <- seq(min(sim_data), max(sim_data), length.out = 1000)
# Calculate the true standard normal density
y_actual <- dnorm(x_values)
plot(x_values, y_actual, type = "l", col = "blue", lwd = 2,
xlab = "Value", ylab = "Density",
main = "Kernel Density Estimate vs. Standard Normal Distribution")
lines(optimal_dense, col = "red", lwd = 2)
legend("topright", legend = c("Standard Normal", "Optimal Bandwidth KDE"),
col = c("blue", "red"), lwd = 2)
