Introduction

In this deliverable, a tutorial in which I will explain to the reader how one might use MLE and MM to model (a) Glycohemoglobin and (b) Height of adult females. The data will be from National Health and Nutrition Examination Survey 2009-2010 (NHANES), available from the Hmisc package. I will compare and contrast the two methods in addition to comparing and contrasting the choice of underlying distribution. The audience of this blog post is a new comer to data science, who is hoping to learn about these commonly used tools. The tutorial will 1. Show how you calculated estimates of parameters 2. Provide visuals that show the estimated distribution compared to the empirical distribution (comment on the quality of the fit) Overlay estimated pdf onto histogram Overlay estimated CDF onto eCDF QQ plot (sample vs estimated dist) 3. Explain how I calculated the median from the estimated distribution At the end of this tutorial, I will provide the reader with a set of “take-home messages”.

Data

library(tidyverse)
library(Hmisc)
require(stats4)
require(dplyr)

  Hmisc::getHdata(nhgh) 
  d1 <- nhgh %>%
  filter(sex == "female") %>%
  filter(age >= 18) %>%
  select(gh, ht) %>%
  filter(1:n()<=1000)

Normal Distribution

Height

require(stats4)
ht_mean_norm = mean(d1$ht)
ht_sd_norm = sd(d1$ht)
# parameters

nLL_norm <- function(mean, sd){
  fs_norm <- dnorm(
    x = d1$ht
    , mean = mean
    , sd = sd
    , log = T
  )
  -sum(fs_norm)
}
fit_norm <- mle(
  nLL_norm
  , start = list(mean = 1, sd = 1)
  , method = "L-BFGS-B"
  , lower = c(0, 0.01)
)
par(mfrow = c(1,2)); plot(profile(fit_norm), absVal = F)

# mean close to 160.7, sd close to 7.31

# Overlay estimated cdf onto ecdf
par(mfrow = c(1,2))
mm_Fht_norm <- function(x){
  pnorm(x, mean = ht_mean_norm, sd = ht_sd_norm)
}
plot(ecdf(d1$ht), main = "Height")
curve(mm_Fht_norm(x), add = T, col = "Blue", main = "Height")

par(mfrow = c(1,2))

plot(ecdf(d1$ht), main = "Height")
curve(
  pnorm(x, coef(fit_norm)[1], coef(fit_norm)[2])
  , add = T
  , col = "Blue"
  , lwd = 3
)

# Overlay estimated pdf on to histogram
mm_fht_norm <- function(x){
  dnorm(x, mean = ht_mean_norm, sd = ht_sd_norm)
}
hist(d1$ht, freq = F, main = "Height")
curve(mm_fht_norm(x), add = T, col = "Blue", main = "Height")

hist(d1$ht, freq = F, main = "Height")
curve(
  dnorm(x, coef(fit_norm)[1], coef(fit_norm)[2])
  , add = T
  , col = "Blue"
  , lwd = 3
)

# qqplot
par(mfrow = c(1,1))

p = ppoints(300)
y = quantile(d1$ht, probs = p)
x_norm_mm =  qnorm(p, mean = ht_mean_norm, sd = ht_sd_norm)
plot(x_norm_mm, y, main = "Normal QQ plot: Height")
abline(0,1)

p = ppoints(300)
y = quantile(d1$ht, prob = p)
x_norm_mle = qnorm(p, mean = coef(fit_norm)[1], sd = coef(fit_norm)[2])
par(mfrow = c(1,1))
plot(x_norm_mle, y, main = "Normal QQ plot: Height")
abline(0,1)

# Median
median_norm_mm_ht = qnorm(0.5, mean = ht_mean_norm, sd = ht_sd_norm)
median_norm_mle_ht = qnorm(0.5, mean = coef(fit_norm)[1], sd = coef(fit_norm)[2])

Glycohemoglobin

gh_mean_norm = mean(d1$gh)
gh_sd_norm = sd(d1$gh)
  
#parameters

nLL_norm <- function(mean, sd){
  fs_norm <- dnorm(
    x = d1$gh
    , mean = mean
    , sd = sd
    , log = T
  )
  -sum(fs_norm)
}
fit_norm <- mle(
  nLL_norm
  , start = list(mean = 1, sd = 1)
  , method = "L-BFGS-B"
  , lower = c(0, 0.01)
)
par(mfrow = c(1,2)); plot(profile(fit_norm), absVal = F)

# mean close to 5.72, sd close to 1.05
# Overlay estimated cdf onto ecdf
par(mfrow = c(1,2))
mm_Fgh_norm <- function(x){
  pnorm(x, mean = gh_mean_norm, sd = gh_sd_norm)
}
plot(ecdf(d1$gh), main = "Glycohemoglobin")
curve(mm_Fgh_norm(x), add = T, col = "Blue", main = "Glycohemoglobin")

# Overlay estimated pdf on to histogram
mm_fgh_norm <- function(x){
  dnorm(x, mean = gh_mean_norm, sd = gh_sd_norm)
}
hist(d1$gh, freq = F, main = "Glycohemoglobin")
curve(mm_fgh_norm(x), add = T, col = "Blue", main = "Glycohemoglobin")

# qqplot
par(mfrow = c(1,1))
p = ppoints(300)
y = quantile(d1$gh, probs = p)
x_norm_mm =  qnorm(p, mean = gh_mean_norm, sd = gh_sd_norm)
plot(x_norm_mm, y, main = "Normal QQ plot: Glycohemoglobin")
abline(0,1)

# Median
median_norm_mm_gh = qnorm(0.5, mean = gh_mean_norm, sd = gh_sd_norm)

Gamma Distribution

##Height

# Estimates of parameters

mean_ht_mm = mean(d1$ht)
var_ht_mm = var(d1$ht)
scale_ht_mm = var_ht_mm/mean_ht_mm
shape_ht_mm = mean_ht_mm^2/var_ht_mm

nLL_gamma <- function(shape, scale){
  fs <- dgamma( x=d1$ht, shape = shape ,scale=scale ,log=TRUE) 
  -sum(fs)
}

fit_gamma <- mle(nLL_gamma, start=list(shape=1,scale=1) , method = "L-BFGS-B",lower = c(0, 0.01))

# Overlay estimated pdf onto histogram

hist(d1$ht, freq = FALSE, main = "Overlay estimated pdf onto histogram") 
curve(dgamma(x, shape =coef(fit_gamma)[1], scale =coef(fit_gamma)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(dgamma(x, shape =shape_ht_mm, scale =scale_ht_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("topright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)

# Overlay estimated CDF onto eCDF

plot(ecdf(d1$ht), main = "Overlay estimated CDF onto eCDF") 
curve(pgamma(x, shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(pgamma(x, shape=shape_ht_mm,scale =scale_ht_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("bottomright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)

# QQ plot (sample vs estimated dist)


qqplot(d1$ht,rgamma(1000,shape=shape_ht_mm,scale =scale_ht_mm),xlab = "Sample",ylab = "Estimated Simulation", main="QQ plot of Glycohemoglobin(MM)" )
abline(0,1)

qqplot(d1$ht,rgamma(1000,shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]),xlab = "Sample",ylab = "Estimated Simulation",main="QQ plot of Glycohemoglobin(MLE)" )
abline(0,1)

# Estimated Median

(estimated_median_mm =  qgamma(0.5,shape=shape_ht_mm,scale =scale_ht_mm))
## [1] 160.6308
(estimated_median_mle =  qgamma(0.5,shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]))
## [1] 160.6312

Glycohemoglobin

mean_gh_mm = mean(d1$gh)
var_gh_mm = var(d1$gh)
scale_gh_mm = var_gh_mm/mean_gh_mm
shape_gh_mm = mean_gh_mm^2/var_gh_mm

nLL_gamma <- function(shape, scale){
  fs <- dgamma( x=d1$gh, shape = shape ,scale=scale ,log=TRUE) 
  -sum(fs)
}

fit_gamma <- mle(nLL_gamma, start=list(shape=1,scale=1) , method = "L-BFGS-B",lower = c(0, 0.01))

# Overlay estimated pdf onto histogram

hist(d1$gh, freq = FALSE, main = "Overlay estimated pdf onto histogram") 
curve(dgamma(x, shape =coef(fit_gamma)[1], scale =coef(fit_gamma)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(dgamma(x, shape =shape_gh_mm, scale =scale_gh_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("topright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)

# Overlay estimated CDF onto eCDF

plot(ecdf(d1$gh), main = "Overlay estimated CDF onto eCDF") 
curve(pgamma(x, shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(pgamma(x, shape=shape_gh_mm,scale =scale_gh_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("bottomright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)

# QQ plot (sample vs estimated dist)

qqplot(d1$gh,rgamma(1000,shape=shape_gh_mm,scale =scale_gh_mm),xlab = "Sample",ylab = "Estimated Simulation", main="QQ plot of Glycohemoglobin(MM)" )
abline(0,1)

qqplot(d1$gh,rgamma(1000,shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]),xlab = "Sample",ylab = "Estimated Simulation",main="QQ plot of Glycohemoglobin(MLE)" )
abline(0,1)

# Estimated Median

(estimated_median_mm =  qgamma(0.5,shape=shape_gh_mm,scale =scale_gh_mm))
## [1] 5.660259
(estimated_median_mle =  qgamma(0.5,shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]))
## [1] 5.677983

Weibull

Glycohemoglobin

# Estimates of parameters

mean_gh = mean(d1$gh)
var_gh = var(d1$gh)

scale_gh_mm = function(k){
  mean_gh/gamma(1+1/k)
}

minequation = function(k){
  abs(scale_gh_mm(k)^2*(gamma(1+2/k)-gamma(1+1/k)^2)-var_gh)
}

shape_gh_mm =optimize(f=minequation, lower = 0, upper = 100)
shape_gh_mm = shape_gh_mm$minimum
scale_gh_mm = scale_gh_mm(shape_gh_mm)


nLL_weibull <- function(shape, scale){
  fs <- dweibull( x=d1$gh, shape = shape ,scale=scale ,log=TRUE) 
  -sum(fs)
}

fit_weibull <- mle(nLL_weibull, start=list(shape=1,scale=1) , method = "L-BFGS-B",lower = c(0, 0.01))


# Overlay estimated pdf onto histogram

hist(d1$gh, freq = FALSE, main = "Overlay estimated pdf onto histogram") 
curve(dweibull(x, shape =coef(fit_weibull)[1], scale =coef(fit_weibull)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(dweibull(x, shape =shape_gh_mm, scale =scale_gh_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("topright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)

# Overlay estimated CDF onto eCDF

plot(ecdf(d1$gh), main = "Overlay estimated CDF onto eCDF") 
curve(pweibull(x, shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(pweibull(x, shape=shape_gh_mm,scale =scale_gh_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("bottomright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)

# QQ plot (sample vs estimated dist)

qqplot(d1$gh,rweibull(1000,shape=shape_gh_mm,scale =scale_gh_mm),xlab = "Sample",ylab = "Estimated Simulation", main="QQ plot of Glycohemoglobin(MM)" )
abline(0,1)

qqplot(d1$gh,rweibull(1000,shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]),xlab = "Sample",ylab = "Estimated Simulation",main="QQ plot of Glycohemoglobin(MLE)" )
abline(0,1)

# Estimated Median


(estimated_median_mm =  qweibull(0.5,shape=shape_gh_mm,scale =scale_gh_mm))
## [1] 5.806483
(estimated_median_mle =  qweibull(0.5,shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]))
## [1] 5.64902

#Height

# Estimates of parameters

mean_ht = mean(d1$ht)
var_ht = var(d1$ht)

scale_ht_mm = function(k){
  mean_ht/gamma(1+1/k)
}

minequation = function(k){
  abs(scale_ht_mm(k)^2*(gamma(1+2/k)-gamma(1+1/k)^2)-var_ht)
}

shape_ht_mm =optimize(f=minequation, lower = 0, upper = 100)
shape_ht_mm = shape_ht_mm$minimum
scale_ht_mm = scale_ht_mm(shape_ht_mm)


nLL_weibull <- function(shape, scale){
  fs <- dweibull( x=d1$ht, shape = shape ,scale=scale ,log=TRUE) 
  -sum(fs)
}

fit_weibull <- mle(nLL_weibull, start=list(shape=1,scale=1) , method = "L-BFGS-B",lower = c(0, 0.01))

# Overlay estimated pdf onto histogram

hist(d1$ht, freq = FALSE, main = "Overlay estimated pdf onto histogram") 
curve(dweibull(x, shape =coef(fit_weibull)[1], scale =coef(fit_weibull)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(dweibull(x, shape =shape_ht_mm, scale =scale_ht_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("topright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)

# Overlay estimated CDF onto eCDF

plot(ecdf(d1$ht), main = "Overlay estimated CDF onto eCDF") 
curve(pweibull(x, shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(pweibull(x, shape=shape_ht_mm,scale =scale_ht_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("bottomright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)

# QQ plot (sample vs estimated dist)

qqplot(d1$ht,rweibull(1000,shape=shape_ht_mm,scale =scale_ht_mm),xlab = "Sample",ylab = "Estimated Simulation", main="QQ plot of Height(MM)" )
abline(0,1)

qqplot(d1$ht,rweibull(1000,shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]),xlab = "Sample",ylab = "Estimated Simulation",main="QQ plot of Height(MLE)" )
abline(0,1)

# Estimated Median

(estimated_median_mm =  qweibull(0.5,shape=shape_ht_mm,scale =scale_ht_mm))
## [1] 161.8065
(estimated_median_mle =  qweibull(0.5,shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]))
## [1] 161.5156