The probability that a newborn baby is a girl is 0.488. Suppose that 10 babies are born in one day in a hospital.
\[X \sim \textrm{Bin}(n = 10, p = 0.488)\]
\(P(X = 3) = \frac{n!}{x!(n-x)!}p^x(1-p)^{n-x} = \frac{10!}{3!7!}(0.488)^3(1-0.488)^7\)
factorial(10)/(factorial(3)*factorial(7))*(0.488)^3*(1-0.488)^7
[1] 0.1286265
# using statistical functions in R
dbinom(3, 10, .488)
[1] 0.1286265
\(P(X < 3) = P(X = 0) + P(X = 1) + P(X = 2)\)
sum(dbinom(0:2, 10, .488))
[1] 0.0636442
pbinom(2, 10, .488)
[1] 0.0636442
\(P(X \ge 3) = 1 - P(X < 3)\)
1 - pbinom(2, 10, .488)
[1] 0.9363558
pbinom(2, 10, .488, lower.tail = FALSE)
[1] 0.9363558
\[E[X] = np = 10\cdot0.488 = 4.88\] \[\textrm{VAR}[X] = np(1-p) = 10\cdot0.488(1-.488) = 4.88\]
Plot the theoeretical distribution
library(tidyverse)
library(gridExtra)
dt_bin <- data_frame(
x = 0:10,
density = dbinom(x, 10, .488),
cumulative = cumsum(density)
)
p_dbin <- ggplot(dt_bin, aes(x, density)) +
geom_bar(stat = "identity", fill = "red") +
ggtitle("dbinom(x, 10, 0.488)") +
theme_classic() + theme(plot.title = element_text(hjust = 0.5))
p_bin <- ggplot(dt_bin, aes(x, cumulative)) +
geom_bar(stat = "identity", fill = "red") +
ggtitle("binom(x, 10, 0.488)") +
theme_classic() + theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p_dbin, p_bin, ncol = 2)
Use simulation to check the previous results
set.seed(1234)
x <- map_int(1:1000, function(i) sum(rbinom(1, n = 10, prob = .488)))
dat_sim_bin <- as_data_frame(table(x)) %>%
mutate(
x = as.integer(x),
freq = n/sum(n),
cum_freq = cumsum(freq)
)
dat_sim_bin
# A tibble: 11 x 4
x n freq cum_freq
<int> <int> <dbl> <dbl>
1 0 1 0.001 0.001
2 1 9 0.009 0.010
3 2 43 0.043 0.053
4 3 143 0.143 0.196
5 4 215 0.215 0.411
6 5 240 0.240 0.651
7 6 197 0.197 0.848
8 7 101 0.101 0.949
9 8 43 0.043 0.992
10 9 7 0.007 0.999
11 10 1 0.001 1.000
# Question 2
filter(dat_sim_bin, x == 3) %>% select(x, p = freq)
# A tibble: 1 x 2
x p
<int> <dbl>
1 3 0.143
# Question 3
filter(dat_sim_bin, x == 2) %>% select(x, p = cum_freq)
# A tibble: 1 x 2
x p
<int> <dbl>
1 2 0.053
# Question 4
filter(dat_sim_bin, x == 2) %>% transmute(x, p = 1 - cum_freq)
# A tibble: 1 x 2
x p
<int> <dbl>
1 2 0.947
p_dbin_sim <- ggplot(dat_sim_bin, aes(x, freq)) +
geom_bar(stat = "identity", fill = "blue") +
labs(y = "density") +
theme_classic()
p_bin_sim <- ggplot(dat_sim_bin, aes(x, cum_freq)) +
geom_bar(stat = "identity", fill = "blue") +
labs(y = "Cumulative") +
theme_classic()
grid.arrange(p_dbin_sim, p_bin_sim, ncol = 2)
p <- seq(.1, .9, by = .1)
data.frame(map(p, ~ dbinom(0:10, size = 10, .x))) %>%
set_names(p) %>%
mutate(x = 0:10) %>%
gather(p, density, -x) %>%
ggplot(aes(x, density, color = p)) +
geom_line()
Suppose that the expected number of deaths due to bladder cancer for all workers at a tire plant over the next 20 years is 1.8.
\[Y \sim \textrm{Poisson}(\lambda = 1.8)\]
\(P(Y = 4) = exp(-\lambda)\frac{\lambda^y}{4!} = exp(-1.8)1.8^4/4!\)
exp(-1.8)*1.8^4/factorial(4)
[1] 0.07230173
dpois(4, 1.8)
[1] 0.07230173
\(P(Y \öe 4) = P(Y = 0) + P(Y = 1) + P(Y = 2) + P(Y = 3) + P(Y = 4)\)
sum(dpois(0:4, 1.8))
[1] 0.9635933
ppois(4, 1.8)
[1] 0.9635933
\(P(Y > 4) = 1 - P(Y \ge 4)\)
1- ppois(4, 1.8)
[1] 0.03640666
ppois(4, 1.8, lower.tail = F)
[1] 0.03640666
\[E[Y] = \lambda = 1.8\] \[\textrm{VAR}[X] = \lambda = 1.8\]
Plot the theoeretical distribution
dt_pois <- data_frame(
y = 0:10,
density = dpois(y, 1.8),
cumulative = cumsum(density)
)
p_dpois <- ggplot(dt_pois, aes(y, density)) +
geom_bar(stat = "identity", fill = "red") +
ggtitle("dpois(y, 1.8)") +
theme_classic() + theme(plot.title = element_text(hjust = 0.5))
p_pois <- ggplot(dt_pois, aes(y, cumulative)) +
geom_bar(stat = "identity", fill = "red") +
ggtitle("pois(y, 1.8)") +
theme_classic() + theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p_dpois, p_pois, ncol = 2)
y <- rpois(1000, 1.8)
dat_sim_poi <- as_data_frame(table(y)) %>%
mutate(
y = as.integer(y),
freq = n/sum(n),
cum_freq = cumsum(freq)
)
dat_sim_poi
# A tibble: 9 x 4
y n freq cum_freq
<int> <int> <dbl> <dbl>
1 0 147 0.147 0.147
2 1 283 0.283 0.430
3 2 289 0.289 0.719
4 3 180 0.180 0.899
5 4 66 0.066 0.965
6 5 24 0.024 0.989
7 6 7 0.007 0.996
8 7 3 0.003 0.999
9 8 1 0.001 1.000
# Question 2
filter(dat_sim_poi, y == 4) %>% select(y, p = freq)
# A tibble: 1 x 2
y p
<int> <dbl>
1 4 0.066
# Question 3
filter(dat_sim_poi, y == 4) %>% select(y, p = cum_freq)
# A tibble: 1 x 2
y p
<int> <dbl>
1 4 0.965
# Question 4
filter(dat_sim_poi, y == 4) %>% transmute(y, p = 1 - cum_freq)
# A tibble: 1 x 2
y p
<int> <dbl>
1 4 0.035
p_dpoi_sim <- ggplot(dat_sim_poi, aes(y, freq)) +
geom_bar(stat = "identity", fill = "blue") +
labs(y = "density") +
theme_classic()
p_poi_sim <- ggplot(dat_sim_poi, aes(y, cum_freq)) +
geom_bar(stat = "identity", fill = "blue") +
labs(y = "Cumulative") +
theme_classic()
grid.arrange(p_dpoi_sim, p_poi_sim, ncol = 2)
lamda <- seq(1, 10, by = 1)
data.frame(map(lamda, ~ dpois(0:20, .x))) %>%
set_names(lamda) %>%
mutate(y = 0:20) %>%
gather(lamda, density, -y) %>%
ggplot(aes(y, density, color = lamda)) +
geom_line()
\[Y \sim \textrm{Poisson}(\lambda = 4.88)\] 11) Answer question 2-4 (Part 1) and compare the results.
# question 2
dpois(3, 1.8)
[1] 0.1606705
# question 3
ppois(2, 1.8)
[1] 0.7306211
# question 4
ppois(2, 1.8, lower.tail = F)
[1] 0.2693789