Suppose that X is a random variable that represents the height of a women ages 30-75 in Sweden. Assume that X is normally distributed with a mean of 166 cm and a standard deviation of 6 cm.
library(tidyverse)
library(gridExtra)
pn <- list(mu = 166, sd = 6)
dt_norm <- data_frame(
x = seq(pn$mu - 4*pn$sd, pn$mu + 4*pn$sd, length.out = 100),
density = dnorm(x, pn$mu, pn$sd),
cumulative = pnorm(x, pn$mu, pn$sd)
)
p_dn <- ggplot(dt_norm, aes(x, density)) +
geom_line() +
ggtitle("dnorm(x, 166, 6)") +
theme_classic() + theme(plot.title = element_text(hjust = 0.5))
p_pn <- ggplot(dt_norm, aes(x, cumulative)) +
geom_line() +
ggtitle("pnorm(x, 166, 6)") +
theme_classic() + theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p_dn, p_pn, ncol = 2)
\(P(X < 160) = P((X - mu)/sigma < (160 - 166)/6) = P(Z < (160 - 166)/6)\)
pnorm((160-166)/6)
[1] 0.1586553
pnorm(160, 166, 6)
[1] 0.1586553
1 - pnorm(175, 166, 6)
[1] 0.0668072
pnorm(175, 166, 6, lower.tail = F)
[1] 0.0668072
pnorm(170, 166, 6) - pnorm(160, 166, 6)
[1] 0.5888522
dt_qn <- data_frame(
cumsum = c(.25, .5, .75),
q = qnorm(cumsum, 166, 6),
d = dnorm(q, 166, 6)
)
qnorm(c(.25, .5, .75), 166, 6)
[1] 161.9531 166.0000 170.0469
grid.arrange(
p_dn +
geom_segment(data = dt_qn, aes(x = q, y = d, xend = q, yend = 0), linetype = "dotted") +
geom_area(data = filter(dt_norm, x <= dt_qn$q[1]), fill = "pink") +
annotate("text", label = "25%", x = 158, y = .01),
p_pn +
geom_segment(data = dt_qn, aes(x = q, y = cumsum, xend = q, yend = 0), linetype = "dotted") +
geom_segment(data = dt_qn, aes(x = q, y = cumsum, xend = 140, yend = cumsum), linetype = "dotted"),
ncol = 2
)
Use simulation to check the previous results
set.seed(1234)
x <- rnorm(1000, 166, 6)
summary(x)
Min. 1st Qu. Median Mean 3rd Qu. Max.
145.6 162.0 165.8 165.8 169.7 185.2
c(mean = mean(x), sd = sd(x))
mean sd
165.840417 5.984026
mean(x < 160)
[1] 0.157
mean(x > 176)
[1] 0.047
mean(x > 160 & x < 170)
[1] 0.609
q_sim <- quantile(x, c(.25, .5, .75))
q_sim
25% 50% 75%
161.9605 165.7612 169.6949
p_dn_sim <- ggplot(NULL, aes(x)) +
geom_histogram(aes(y = ..density..), fill = "blue", alpha = .7) +
geom_line(stat = "density")
p_n_sim <- ggplot(NULL, aes(x)) +
stat_ecdf() + labs(y = "cumulative")
grid.arrange(p_dn_sim, p_n_sim, ncol = 2)
grid.arrange(
p_dn_sim +
geom_vline(xintercept = q_sim, linetype = "dotted"),
p_n_sim +
geom_segment(aes(x = q_sim, y = c(.25, .5, .75), xend = q_sim, yend = 0), linetype = "dotted") +
geom_segment(aes(x = q_sim, y = c(.25, .5, .75), xend = 150, yend = c(.25, .5, .75)), linetype = "dotted"),
ncol = 2
)
ps <- list(mean = mean(x), sd = sd(x))
map_dbl(1:3, ~ mean(x >= ps$mean - .x*ps$sd & x <= ps$mean + .x*ps$sd))
[1] 0.696 0.954 0.996
pd_dat <- ggplot_build(ggplot(NULL, aes(x)) +
geom_line(stat = "density"))$data[[1]]
ggplot(NULL, aes(x)) +
geom_line(stat = "density") +
geom_area(data = filter(pd_dat, x >= ps$mean - 2*ps$sd,
x <= ps$mean + 2*ps$sd), aes(x, y), fill = "violet", alpha = .4) +
annotate("text", label = "95%", x = 165, y = .03)