library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
##1.1
smpl_data <- rnorm(100, mean = 10, sd = 10)

hist(smpl_data)

##1.2
set.seed(530306322)

smpl_data2 <- rnorm(100, mean = 10, sd = 10)

identical(smpl_data, smpl_data2)
## [1] FALSE
smpl_data3 <- rnorm(100, mean = 10, sd = 10)

identical(smpl_data3, smpl_data2)
## [1] FALSE
set.seed(530306322)

smpl_data4 <- rnorm(100, mean = 10, sd = 10)

identical(smpl_data4, smpl_data2)
## [1] TRUE

This is testing to see which value of mu (population mean) maximises the MLE

## 1.3 Log Likelihood
## to find Maximum Likelihood Estimation

data_mean8 <- rnorm(100, mean = 8, sd = 10)

mu <- 1:20

log.likelihood <- sapply(mu, function(x) sum(log(dnorm(data_mean8, mean = x, sd = 10))))

plot(mu, log.likelihood, type = "l", ylab = "Log Likelihood")
mu.mle <- mu[which.max(log.likelihood)]

abline(v = mu.mle, lty = 2)

mu.mle
## [1] 8

Height Data set

setwd("C:/Users/gabeg/Documents/Uni/Stat 5003/Week 3")
height <- read_delim('height.txt')
## Rows: 200 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (1): ID
## dbl (1): Height_m
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(ggplot2)
theme_set(
  theme_bw()
)
ggplot(data = height, aes(x = Height_m)) +
  geom_histogram(aes(y=..density..), bins = 5, colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") +
  labs(title = "5 Bins for Probability Density Function of Height")

ggplot(data = height, aes(x = Height_m)) +
  geom_histogram(aes(y=..density..), bins = 10, colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") +
  labs(title = "10 Bins for Probability Density Function of Height")

ggplot(data = height, aes(x = Height_m)) +
  geom_histogram(aes(y=..density..), bins = 20, colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") +
  labs(title = "20 Bins for Probability Density Function of Height")

## 2.2 Kernel Density Estimation

kernels <- c("gaussian", "triangular", "epanechnikov")

par(mfrow = c(2,2))

invisible(lapply(kernels, function(k){
  plot(density(height$Height_m, kernel = k),
       main = paste(tools::toTitleCase(k), "kernel"))
  rug(height$Height_m)
}
  ))

##2.3 Probability person is taller than 1.7m using Kernel desnity estimator

density_est <- density(height[["Height_m"]])
f <- approxfun(density_est)

prob <- integrate(f,1.7, max(density_est$x))

prob
## 0.4314333 with absolute error < 3.1e-05
##2.4
#If we assume the heights data follows a Gaussian distribution, what are the maximum likelihood estimates of the parameters 
library(stats4)



log_likelihood <- function(mu,sigma) {
  -sum(dnorm(height$Height_m, mean=mu, sd=sigma, log=TRUE))
}


fit <- mle(log_likelihood, start = c(mu = mean(height$Height_m), sigma = sd(height$Height_m)))
## Warning in dnorm(height$Height_m, mean = mu, sd = sigma, log = TRUE): NaNs
## produced

## Warning in dnorm(height$Height_m, mean = mu, sd = sigma, log = TRUE): NaNs
## produced

## Warning in dnorm(height$Height_m, mean = mu, sd = sigma, log = TRUE): NaNs
## produced

## Warning in dnorm(height$Height_m, mean = mu, sd = sigma, log = TRUE): NaNs
## produced
summary(fit)
## Maximum likelihood estimation
## 
## Call:
## mle(minuslogl = log_likelihood, start = c(mu = mean(height$Height_m), 
##     sigma = sd(height$Height_m)))
## 
## Coefficients:
##         Estimate  Std. Error
## mu    1.68177698 0.005716404
## sigma 0.08084216 0.004040079
## 
## -2 log L: -438.577
## 3.1
set.seed(530306322)
sim_data <- rnorm(100, mean = 0, sd = 1)

## get bandwidth matrix

sim_bands <- seq(from =0.01, to = 3, by = 0.01)

ise_calc <- function(data, bandwidth) {
  density <- density(data, bw = bandwidth, n= 512, from = -3, to = 3)
  x <- density$x
  y_pop <- dnorm(x)
  y_est <- density$y
  ise <- sum((y_pop - y_est)^2) * (x[2] - x[1])
  return(ise)
}

min_ise <- Inf
best_bw <- NULL


ise_array <- matrix(ncol = 2, nrow = length(sim_bands))

for (h in 1:length(sim_bands)){
  ise_array[h, 1] <- sim_bands[h]
  ise_array[h, 2] <- ise_calc(sim_data, sim_bands[h])
  
}

colnames(ise_array) <- c("Bandwidth", "ISE")

ise_df <- as.data.frame(ise_array)

best_bw <- filter(ise_df, ISE == min(ISE))


cat("The Bandwidth that minimises ISE is", best_bw$Bandwidth, "with an ISE of", best_bw$ISE)
## The Bandwidth that minimises ISE is 0.36 with an ISE of 0.01041809
best_h <- best_bw$Bandwidth
ggplot(data = ise_df, aes(x = Bandwidth, y = ISE)) +
  geom_line() +
  scale_x_continuous(breaks = round(seq(min(ise_df$Bandwidth), max(ise_df$Bandwidth), by = 0.2),1)) +
  scale_y_continuous(breaks = round(seq(min(ise_df$ISE), max(ise_df$ISE), by = 0.01),1)) +
  geom_vline(xintercept = best_h, linetype = "dashed", color = "red", size = 0.5) +
  labs(title = "Bandwidth of Gaussian Kernel Density Estimator vs. ISE")

optimal_dense <- density(sim_data, bw = best_h, n= 512, from = -3, to = 3)

x_values <- seq(min(sim_data), max(sim_data), length.out = 1000)

# Calculate the true standard normal density
y_actual <- dnorm(x_values)

plot(x_values, y_actual, type = "l", col = "blue", lwd = 2,
     xlab = "Value", ylab = "Density",
     main = "Kernel Density Estimate vs. Standard Normal Distribution")
lines(optimal_dense, col = "red", lwd = 2)
legend("topright", legend = c("Standard Normal", "Optimal Bandwidth KDE"),
       col = c("blue", "red"), lwd = 2)