Part 1: Normal distribution

Suppose that X is a random variable that represents the height of a women ages 30-75 in Sweden. Assume that X is normally distributed with a mean of 166 cm and a standard deviation of 6 cm.

  1. Plot the probability and cumulative density functions of the assumed distribution.
library(tidyverse)
library(gridExtra)

pn <- list(mu = 166, sd = 6) 
dt_norm <- data_frame(
  x = seq(pn$mu - 4*pn$sd, pn$mu + 4*pn$sd, length.out = 100),
  density = dnorm(x, pn$mu, pn$sd),
  cumulative = pnorm(x, pn$mu, pn$sd)
)

p_dn <- ggplot(dt_norm, aes(x, density)) +
  geom_line() +
  ggtitle("dnorm(x, 166, 6)") + 
  theme_classic() + theme(plot.title = element_text(hjust = 0.5))
p_pn <- ggplot(dt_norm, aes(x, cumulative)) +
  geom_line() +
  ggtitle("pnorm(x, 166, 6)") + 
  theme_classic() + theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p_dn, p_pn, ncol = 2)

  1. What is the probability that a randomly selected woman is less than 160 cm tall?

\(P(X < 160) = P((X - mu)/sigma < (160 - 166)/6) = P(Z < (160 - 166)/6)\)

pnorm((160-166)/6)
[1] 0.1586553
pnorm(160, 166, 6)
[1] 0.1586553
  1. What is the probability that a randomly selected woman is higher than 175 cm tall?
1 - pnorm(175, 166, 6)
[1] 0.0668072
pnorm(175, 166, 6, lower.tail = F)
[1] 0.0668072
  1. What is the probability that a randomly selected woman is between 160 and 170 cm tall?
pnorm(170, 166, 6) - pnorm(160, 166, 6)
[1] 0.5888522
  1. Calculate and interpret the theoretical quantiles with q = 0.25, 0.5, and 0.75.
dt_qn <- data_frame(
  cumsum = c(.25, .5, .75),
  q = qnorm(cumsum, 166, 6),
  d = dnorm(q, 166, 6)
)
qnorm(c(.25, .5, .75), 166, 6)
[1] 161.9531 166.0000 170.0469
grid.arrange(
p_dn +
  geom_segment(data = dt_qn, aes(x = q, y = d, xend = q, yend = 0), linetype = "dotted") +
  geom_area(data = filter(dt_norm, x <= dt_qn$q[1]), fill = "pink") +
  annotate("text", label = "25%", x = 158, y = .01),
p_pn +
  geom_segment(data = dt_qn, aes(x = q, y = cumsum, xend = q, yend = 0), linetype = "dotted") +
  geom_segment(data = dt_qn, aes(x = q, y = cumsum, xend = 140, yend = cumsum), linetype = "dotted"),
ncol = 2
)

Use simulation to check the previous results

set.seed(1234)
x <- rnorm(1000, 166, 6)
  1. Describe the simulated variable x. What is its mean and standard deviation?
summary(x)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  145.6   162.0   165.8   165.8   169.7   185.2 
c(mean = mean(x), sd = sd(x))
      mean         sd 
165.840417   5.984026 
  1. What is the percentage of observations with x < 160? What about with x > 175, and with 160 < x < 170? Compare with the answers to questions 2-4.
mean(x < 160)
[1] 0.157
mean(x > 176)
[1] 0.047
mean(x > 160 & x < 170)
[1] 0.609
  1. Calculate the quartiles of x. Compare the results with the answer to question 5.
q_sim <- quantile(x, c(.25, .5, .75))
q_sim
     25%      50%      75% 
161.9605 165.7612 169.6949 
  1. Provide a graphical presentation of the probability and cumulative distribution function based on the results of the simulation.
p_dn_sim <- ggplot(NULL, aes(x)) +
  geom_histogram(aes(y = ..density..), fill = "blue", alpha = .7) +
  geom_line(stat = "density")
p_n_sim <- ggplot(NULL, aes(x)) +
  stat_ecdf() + labs(y = "cumulative")
grid.arrange(p_dn_sim, p_n_sim, ncol = 2)

grid.arrange(
  p_dn_sim +
    geom_vline(xintercept = q_sim, linetype = "dotted"),
  p_n_sim +
    geom_segment(aes(x = q_sim, y = c(.25, .5, .75), xend = q_sim, yend = 0), linetype = "dotted") +
    geom_segment(aes(x = q_sim, y = c(.25, .5, .75), xend = 150, yend = c(.25, .5, .75)), linetype = "dotted"),
  ncol = 2
)

  1. Check that the percentage of x values that lie within around the mean with a width of two, four and six standard deviations is 68, 95, and 99.7% (aka 68-95-99.7 rule).
ps <- list(mean = mean(x), sd = sd(x))
map_dbl(1:3, ~ mean(x >= ps$mean - .x*ps$sd & x <= ps$mean + .x*ps$sd))
[1] 0.696 0.954 0.996
pd_dat <- ggplot_build(ggplot(NULL, aes(x)) +
                         geom_line(stat = "density"))$data[[1]]
ggplot(NULL, aes(x)) +
  geom_line(stat = "density") +
  geom_area(data = filter(pd_dat, x >= ps$mean - 2*ps$sd,
                          x <= ps$mean + 2*ps$sd), aes(x, y), fill = "violet", alpha = .4) +
  annotate("text", label = "95%", x = 165, y = .03)