In this deliverable, a tutorial in which I will explain to the reader how one might use MLE and MM to model (a) Glycohemoglobin and (b) Height of adult females. The data will be from National Health and Nutrition Examination Survey 2009-2010 (NHANES), available from the Hmisc package. I will compare and contrast the two methods in addition to comparing and contrasting the choice of underlying distribution. The audience of this blog post is a new comer to data science, who is hoping to learn about these commonly used tools. The tutorial will 1. Show how you calculated estimates of parameters 2. Provide visuals that show the estimated distribution compared to the empirical distribution (comment on the quality of the fit) Overlay estimated pdf onto histogram Overlay estimated CDF onto eCDF QQ plot (sample vs estimated dist) 3. Explain how I calculated the median from the estimated distribution At the end of this tutorial, I will provide the reader with a set of “take-home messages”.
library(tidyverse)
library(Hmisc)
require(stats4)
require(dplyr)
Hmisc::getHdata(nhgh)
d1 <- nhgh %>%
filter(sex == "female") %>%
filter(age >= 18) %>%
select(gh, ht) %>%
filter(1:n()<=1000)
require(stats4)
ht_mean_norm = mean(d1$ht)
ht_sd_norm = sd(d1$ht)
# parameters
nLL_norm <- function(mean, sd){
fs_norm <- dnorm(
x = d1$ht
, mean = mean
, sd = sd
, log = T
)
-sum(fs_norm)
}
fit_norm <- mle(
nLL_norm
, start = list(mean = 1, sd = 1)
, method = "L-BFGS-B"
, lower = c(0, 0.01)
)
par(mfrow = c(1,2)); plot(profile(fit_norm), absVal = F)
# mean close to 160.7, sd close to 7.31
# Overlay estimated cdf onto ecdf
par(mfrow = c(1,2))
mm_Fht_norm <- function(x){
pnorm(x, mean = ht_mean_norm, sd = ht_sd_norm)
}
plot(ecdf(d1$ht), main = "Height")
curve(mm_Fht_norm(x), add = T, col = "Blue", main = "Height")
par(mfrow = c(1,2))
plot(ecdf(d1$ht), main = "Height")
curve(
pnorm(x, coef(fit_norm)[1], coef(fit_norm)[2])
, add = T
, col = "Blue"
, lwd = 3
)
# Overlay estimated pdf on to histogram
mm_fht_norm <- function(x){
dnorm(x, mean = ht_mean_norm, sd = ht_sd_norm)
}
hist(d1$ht, freq = F, main = "Height")
curve(mm_fht_norm(x), add = T, col = "Blue", main = "Height")
hist(d1$ht, freq = F, main = "Height")
curve(
dnorm(x, coef(fit_norm)[1], coef(fit_norm)[2])
, add = T
, col = "Blue"
, lwd = 3
)
# qqplot
par(mfrow = c(1,1))
p = ppoints(300)
y = quantile(d1$ht, probs = p)
x_norm_mm = qnorm(p, mean = ht_mean_norm, sd = ht_sd_norm)
plot(x_norm_mm, y, main = "Normal QQ plot: Height")
abline(0,1)
p = ppoints(300)
y = quantile(d1$ht, prob = p)
x_norm_mle = qnorm(p, mean = coef(fit_norm)[1], sd = coef(fit_norm)[2])
par(mfrow = c(1,1))
plot(x_norm_mle, y, main = "Normal QQ plot: Height")
abline(0,1)
# Median
median_norm_mm_ht = qnorm(0.5, mean = ht_mean_norm, sd = ht_sd_norm)
median_norm_mle_ht = qnorm(0.5, mean = coef(fit_norm)[1], sd = coef(fit_norm)[2])
gh_mean_norm = mean(d1$gh)
gh_sd_norm = sd(d1$gh)
#parameters
nLL_norm <- function(mean, sd){
fs_norm <- dnorm(
x = d1$gh
, mean = mean
, sd = sd
, log = T
)
-sum(fs_norm)
}
fit_norm <- mle(
nLL_norm
, start = list(mean = 1, sd = 1)
, method = "L-BFGS-B"
, lower = c(0, 0.01)
)
par(mfrow = c(1,2)); plot(profile(fit_norm), absVal = F)
# mean close to 5.72, sd close to 1.05
# Overlay estimated cdf onto ecdf
par(mfrow = c(1,2))
mm_Fgh_norm <- function(x){
pnorm(x, mean = gh_mean_norm, sd = gh_sd_norm)
}
plot(ecdf(d1$gh), main = "Glycohemoglobin")
curve(mm_Fgh_norm(x), add = T, col = "Blue", main = "Glycohemoglobin")
# Overlay estimated pdf on to histogram
mm_fgh_norm <- function(x){
dnorm(x, mean = gh_mean_norm, sd = gh_sd_norm)
}
hist(d1$gh, freq = F, main = "Glycohemoglobin")
curve(mm_fgh_norm(x), add = T, col = "Blue", main = "Glycohemoglobin")
# qqplot
par(mfrow = c(1,1))
p = ppoints(300)
y = quantile(d1$gh, probs = p)
x_norm_mm = qnorm(p, mean = gh_mean_norm, sd = gh_sd_norm)
plot(x_norm_mm, y, main = "Normal QQ plot: Glycohemoglobin")
abline(0,1)
# Median
median_norm_mm_gh = qnorm(0.5, mean = gh_mean_norm, sd = gh_sd_norm)
##Height
# Estimates of parameters
mean_ht_mm = mean(d1$ht)
var_ht_mm = var(d1$ht)
scale_ht_mm = var_ht_mm/mean_ht_mm
shape_ht_mm = mean_ht_mm^2/var_ht_mm
nLL_gamma <- function(shape, scale){
fs <- dgamma( x=d1$ht, shape = shape ,scale=scale ,log=TRUE)
-sum(fs)
}
fit_gamma <- mle(nLL_gamma, start=list(shape=1,scale=1) , method = "L-BFGS-B",lower = c(0, 0.01))
# Overlay estimated pdf onto histogram
hist(d1$ht, freq = FALSE, main = "Overlay estimated pdf onto histogram")
curve(dgamma(x, shape =coef(fit_gamma)[1], scale =coef(fit_gamma)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(dgamma(x, shape =shape_ht_mm, scale =scale_ht_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("topright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)
# Overlay estimated CDF onto eCDF
plot(ecdf(d1$ht), main = "Overlay estimated CDF onto eCDF")
curve(pgamma(x, shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(pgamma(x, shape=shape_ht_mm,scale =scale_ht_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("bottomright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)
# QQ plot (sample vs estimated dist)
qqplot(d1$ht,rgamma(1000,shape=shape_ht_mm,scale =scale_ht_mm),xlab = "Sample",ylab = "Estimated Simulation", main="QQ plot of Glycohemoglobin(MM)" )
abline(0,1)
qqplot(d1$ht,rgamma(1000,shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]),xlab = "Sample",ylab = "Estimated Simulation",main="QQ plot of Glycohemoglobin(MLE)" )
abline(0,1)
# Estimated Median
(estimated_median_mm = qgamma(0.5,shape=shape_ht_mm,scale =scale_ht_mm))
## [1] 160.6308
(estimated_median_mle = qgamma(0.5,shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]))
## [1] 160.6312
mean_gh_mm = mean(d1$gh)
var_gh_mm = var(d1$gh)
scale_gh_mm = var_gh_mm/mean_gh_mm
shape_gh_mm = mean_gh_mm^2/var_gh_mm
nLL_gamma <- function(shape, scale){
fs <- dgamma( x=d1$gh, shape = shape ,scale=scale ,log=TRUE)
-sum(fs)
}
fit_gamma <- mle(nLL_gamma, start=list(shape=1,scale=1) , method = "L-BFGS-B",lower = c(0, 0.01))
# Overlay estimated pdf onto histogram
hist(d1$gh, freq = FALSE, main = "Overlay estimated pdf onto histogram")
curve(dgamma(x, shape =coef(fit_gamma)[1], scale =coef(fit_gamma)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(dgamma(x, shape =shape_gh_mm, scale =scale_gh_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("topright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)
# Overlay estimated CDF onto eCDF
plot(ecdf(d1$gh), main = "Overlay estimated CDF onto eCDF")
curve(pgamma(x, shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(pgamma(x, shape=shape_gh_mm,scale =scale_gh_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("bottomright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)
# QQ plot (sample vs estimated dist)
qqplot(d1$gh,rgamma(1000,shape=shape_gh_mm,scale =scale_gh_mm),xlab = "Sample",ylab = "Estimated Simulation", main="QQ plot of Glycohemoglobin(MM)" )
abline(0,1)
qqplot(d1$gh,rgamma(1000,shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]),xlab = "Sample",ylab = "Estimated Simulation",main="QQ plot of Glycohemoglobin(MLE)" )
abline(0,1)
# Estimated Median
(estimated_median_mm = qgamma(0.5,shape=shape_gh_mm,scale =scale_gh_mm))
## [1] 5.660259
(estimated_median_mle = qgamma(0.5,shape=coef(fit_gamma)[1], scale =coef(fit_gamma)[2]))
## [1] 5.677983
# Estimates of parameters
mean_gh = mean(d1$gh)
var_gh = var(d1$gh)
scale_gh_mm = function(k){
mean_gh/gamma(1+1/k)
}
minequation = function(k){
abs(scale_gh_mm(k)^2*(gamma(1+2/k)-gamma(1+1/k)^2)-var_gh)
}
shape_gh_mm =optimize(f=minequation, lower = 0, upper = 100)
shape_gh_mm = shape_gh_mm$minimum
scale_gh_mm = scale_gh_mm(shape_gh_mm)
nLL_weibull <- function(shape, scale){
fs <- dweibull( x=d1$gh, shape = shape ,scale=scale ,log=TRUE)
-sum(fs)
}
fit_weibull <- mle(nLL_weibull, start=list(shape=1,scale=1) , method = "L-BFGS-B",lower = c(0, 0.01))
# Overlay estimated pdf onto histogram
hist(d1$gh, freq = FALSE, main = "Overlay estimated pdf onto histogram")
curve(dweibull(x, shape =coef(fit_weibull)[1], scale =coef(fit_weibull)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(dweibull(x, shape =shape_gh_mm, scale =scale_gh_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("topright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)
# Overlay estimated CDF onto eCDF
plot(ecdf(d1$gh), main = "Overlay estimated CDF onto eCDF")
curve(pweibull(x, shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(pweibull(x, shape=shape_gh_mm,scale =scale_gh_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("bottomright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)
# QQ plot (sample vs estimated dist)
qqplot(d1$gh,rweibull(1000,shape=shape_gh_mm,scale =scale_gh_mm),xlab = "Sample",ylab = "Estimated Simulation", main="QQ plot of Glycohemoglobin(MM)" )
abline(0,1)
qqplot(d1$gh,rweibull(1000,shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]),xlab = "Sample",ylab = "Estimated Simulation",main="QQ plot of Glycohemoglobin(MLE)" )
abline(0,1)
# Estimated Median
(estimated_median_mm = qweibull(0.5,shape=shape_gh_mm,scale =scale_gh_mm))
## [1] 5.806483
(estimated_median_mle = qweibull(0.5,shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]))
## [1] 5.64902
#Height
# Estimates of parameters
mean_ht = mean(d1$ht)
var_ht = var(d1$ht)
scale_ht_mm = function(k){
mean_ht/gamma(1+1/k)
}
minequation = function(k){
abs(scale_ht_mm(k)^2*(gamma(1+2/k)-gamma(1+1/k)^2)-var_ht)
}
shape_ht_mm =optimize(f=minequation, lower = 0, upper = 100)
shape_ht_mm = shape_ht_mm$minimum
scale_ht_mm = scale_ht_mm(shape_ht_mm)
nLL_weibull <- function(shape, scale){
fs <- dweibull( x=d1$ht, shape = shape ,scale=scale ,log=TRUE)
-sum(fs)
}
fit_weibull <- mle(nLL_weibull, start=list(shape=1,scale=1) , method = "L-BFGS-B",lower = c(0, 0.01))
# Overlay estimated pdf onto histogram
hist(d1$ht, freq = FALSE, main = "Overlay estimated pdf onto histogram")
curve(dweibull(x, shape =coef(fit_weibull)[1], scale =coef(fit_weibull)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(dweibull(x, shape =shape_ht_mm, scale =scale_ht_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("topright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)
# Overlay estimated CDF onto eCDF
plot(ecdf(d1$ht), main = "Overlay estimated CDF onto eCDF")
curve(pweibull(x, shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]) ,add=TRUE, col = "blue",lwd=5)
curve(pweibull(x, shape=shape_ht_mm,scale =scale_ht_mm) ,add=TRUE, col = "red",lwd=3,lty=2)
legend("bottomright",legend =c("MM","MLE"), col = c("red","blue"), lwd = 3)
# QQ plot (sample vs estimated dist)
qqplot(d1$ht,rweibull(1000,shape=shape_ht_mm,scale =scale_ht_mm),xlab = "Sample",ylab = "Estimated Simulation", main="QQ plot of Height(MM)" )
abline(0,1)
qqplot(d1$ht,rweibull(1000,shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]),xlab = "Sample",ylab = "Estimated Simulation",main="QQ plot of Height(MLE)" )
abline(0,1)
# Estimated Median
(estimated_median_mm = qweibull(0.5,shape=shape_ht_mm,scale =scale_ht_mm))
## [1] 161.8065
(estimated_median_mle = qweibull(0.5,shape=coef(fit_weibull)[1], scale =coef(fit_weibull)[2]))
## [1] 161.5156