library(tidyverse)
library(magrittr) # %<>%
library(DT) # datatable()
library(GGally) # ggpairs()
library(infer) # rep_sample_n()
options(digits = 3, scipen = 999)
load("G:/My Drive/homework/Cooper P/covid_example_population.RData")
names(covid_pop) %<>% str_to_title()
covid_pop %>%
datatable(options = list(pageLength = 5))
covid_pop %>%
mutate(across(.cols = everything(), as_factor)) %>%
summary()
## Symptoms Contact Infected
## asymptomatic:9398 not contact:8956 uninfected:9704
## symptomatic : 602 contact :1044 infected : 296
covid_pop %>%
mutate(across(.cols = everything(), as_factor)) %>%
ggpairs()
with(covid_pop, table(Symptoms, Infected)) # %>% addmargins()
## Infected
## Symptoms infected uninfected
## asymptomatic 162 9236
## symptomatic 134 468
with(covid_pop, ftable(Contact, Symptoms, Infected))
## Infected infected uninfected
## Contact Symptoms
## contact asymptomatic 123 798
## symptomatic 87 36
## not contact asymptomatic 39 8438
## symptomatic 47 432
covid_pop_n500 <-
covid_pop %>%
slice_sample(n = 500, replace = FALSE)
p_hat_n500 <-
covid_pop_n500 %>%
filter(Infected == "infected") %>%
nrow() / 500
covid_pop_n500_symptomatic <-
covid_pop %>%
filter(Symptoms == "symptomatic") %>%
slice_sample(n = 500, replace = FALSE)
p_hat_n500_symptomatic <-
covid_pop_n500_symptomatic %>%
filter(Infected == "infected") %>%
nrow() / 500
covid_pop_n10000_contact_symptomatic <-
covid_pop %>%
filter(Contact == "contact" | Symptoms == "symptomatic")
covid_pop_n10000_contact_symptomatic %>%
filter(Infected == "infected") %>%
nrow()
## [1] 257
(p_hat_n10000_contact_symptomatic <-
covid_pop_n10000_contact_symptomatic %>%
filter(Infected == "infected") %>%
nrow() / nrow(covid_pop_n10000_contact_symptomatic)
)
## [1] 0.169
Sample
covid_pop_n10000_symptomatic <-
covid_pop %>%
filter(Symptoms == "symptomatic")
sum(covid_pop_n10000_contact_symptomatic$Infected == "infected") -
sum(covid_pop_n10000_symptomatic$Infected == "infected")
## [1] 123
(p_hat_n10000_symptomatic <-
covid_pop_n10000_symptomatic %>%
filter(Infected == "infected") %>%
nrow() / nrow(covid_pop_n10000_symptomatic)
)
## [1] 0.223
p_hat_n10000 <-
covid_pop %>%
filter(Infected == "infected") %>%
nrow() / 10000
data.frame(p_hat_n500,
p_hat_n500_symptomatic,
p_hat_n10000_contact_symptomatic,
p_hat_n10000_symptomatic,
p_hat_n10000,
row.names = "Proportion_Infected") %>%
t()
## Proportion_Infected
## p_hat_n500 0.0180
## p_hat_n500_symptomatic 0.2240
## p_hat_n10000_contact_symptomatic 0.1687
## p_hat_n10000_symptomatic 0.2226
## p_hat_n10000 0.0296
covid_pop_n500_rep1000 <-
covid_pop %>%
rep_sample_n(size = 500, reps = 1000, replace = FALSE)
covid_pop_n500_rep1000_distrubtion <-
covid_pop_n500_rep1000 %>%
group_by(replicate) %>%
summarize(Proportion_Infected = sum(Infected == "infected") / 500)
std_error_n500_rep1000 <-
sd(covid_pop_n500_rep1000_distrubtion$Proportion_Infected) / sqrt(500)
covid_pop_n1000_rep1000 <-
covid_pop %>%
rep_sample_n(size = 1000, reps = 1000, replace = TRUE)
covid_pop_n1000_rep1000_distrubtion <-
covid_pop_n1000_rep1000 %>%
group_by(replicate) %>%
summarize(Proportion_Infected = sum(Infected == "infected") / 1000)
std_error_n1000_rep1000 <-
sd(covid_pop_n1000_rep1000_distrubtion$Proportion_Infected) / sqrt(1000)
covid_pop_n2500_rep1000 <-
covid_pop %>%
rep_sample_n(size = 2500, reps = 1000, replace = TRUE)
covid_pop_n2500_rep1000_distrubtion <-
covid_pop_n2500_rep1000 %>%
group_by(replicate) %>%
summarize(Proportion_Infected = sum(Infected == "infected") / 2500)
std_error_n2500_rep1000 <-
sd(covid_pop_n2500_rep1000_distrubtion$Proportion_Infected) / sqrt(2500)
covid_pop_symptomatic_n500_rep1000 <-
covid_pop %>%
filter(Symptoms == "symptomatic") %>%
rep_sample_n(size = 500, reps = 1000, replace = TRUE)
covid_pop_symptomatic_n500_rep1000_distrubtion <-
covid_pop_symptomatic_n500_rep1000 %>%
group_by(replicate) %>%
summarize(Proportion_Infected = sum(Infected == "infected") / 500)
std_error_symptomatic_n500_rep1000 <-
sd(covid_pop_symptomatic_n500_rep1000_distrubtion$Proportion_Infected) / sqrt(500)
tibble("n500" = covid_pop_n500_rep1000_distrubtion$Proportion_Infected,
"n1000" = covid_pop_n1000_rep1000_distrubtion$Proportion_Infected,
"n2500" = covid_pop_n2500_rep1000_distrubtion$Proportion_Infected,
"n500symptomatic" =
covid_pop_symptomatic_n500_rep1000_distrubtion$Proportion_Infected
) %>%
slice_sample(n = 10)
## # A tibble: 10 x 4
## n500 n1000 n2500 n500symptomatic
## <dbl> <dbl> <dbl> <dbl>
## 1 0.026 0.039 0.0332 0.216
## 2 0.024 0.021 0.0276 0.208
## 3 0.024 0.037 0.0272 0.224
## 4 0.026 0.026 0.026 0.226
## 5 0.026 0.036 0.0228 0.228
## 6 0.026 0.032 0.0304 0.246
## 7 0.03 0.033 0.0256 0.222
## 8 0.038 0.031 0.0272 0.236
## 9 0.022 0.036 0.0248 0.194
## 10 0.026 0.02 0.032 0.23
bind_rows("n0500" = covid_pop_n500_rep1000_distrubtion,
"n1000" = covid_pop_n1000_rep1000_distrubtion,
"n2500" = covid_pop_n2500_rep1000_distrubtion,
"n500symptomatic" = covid_pop_symptomatic_n500_rep1000_distrubtion,
.id = "SampleSize"
) -> combined.distributions
combined.distributions %>%
ggplot(aes(Proportion_Infected, fill = SampleSize)) +
geom_histogram() + # How to adjust bins ?
geom_vline(xintercept = p_hat_n10000, color = "red") +
facet_grid(cols = vars(SampleSize))
combined.distributions %>%
ggplot(aes(Proportion_Infected, fill = SampleSize)) +
geom_histogram() +
geom_vline(xintercept = p_hat_n10000, color = "red", size = 0.66) +
facet_grid(cols = vars(SampleSize), scales = "free_x") +
labs(subtitle = "Free X Scales")
combined.distributions %>%
ggplot(aes(y = Proportion_Infected, fill = SampleSize)) +
geom_boxplot() +
geom_hline(yintercept = p_hat_n10000, color = "red") +
# facet_grid(cols = vars(SampleSize)) +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
combined.distributions %>%
group_by(SampleSize) %>%
summarise(Min = min(Proportion_Infected),
# Q1 = quantile(Proportion_Infected, 0.25),
Median = median(Proportion_Infected),
Mean = mean(Proportion_Infected),
# Q3 = quantile(Proportion_Infected, 0.75),
Max = max(Proportion_Infected)
) %>%
mutate("Std_ErrorEE-7" = c(std_error_n500_rep1000,
std_error_n1000_rep1000,
std_error_n2500_rep1000,
std_error_symptomatic_n500_rep1000) *
10^7
)
## # A tibble: 4 x 6
## SampleSize Min Median Mean Max `Std_ErrorEE-7`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 n0500 0.01 0.028 0.0292 0.062 3368.
## 2 n1000 0.014 0.029 0.0296 0.048 1618.
## 3 n2500 0.0204 0.0296 0.0296 0.04 662.
## 4 n500symptomatic 0.168 0.222 0.222 0.29 8127.
Quadrupled the tests and doubled the positive cases. 75% of positive cases are for asymptomatic & no-contact citizens.
1000 * 0.01
## [1] 10
4000 * 0.005
## [1] 20
https://moderndive.com/7-sampling.html#moral-of-the-story https://moderndive.com/7-sampling.html#fig:accuracy-vs-precision
https://cals.arizona.edu/classes/rnr321/Ch4.pdf#page=3
(covid_pop %>%
group_by(Symptoms) %>%
summarise(Infections = sum(Infected == "infected"),
N = NROW(Symptoms),
Proportion_Infected = Infections / N) ->
covid_pop_stratefied)
## # A tibble: 2 x 4
## Symptoms Infections N Proportion_Infected
## <chr> <int> <int> <dbl>
## 1 asymptomatic 162 9398 0.0172
## 2 symptomatic 134 602 0.223
(p_hat_n10000_stratefied <-
(500*covid_pop_stratefied$Proportion_Infected[1] +
500*covid_pop_stratefied$Proportion_Infected[2]) /
(500 + 500))
## [1] 0.12
(p_hat_n500_rep1000_asymptomatic <-
covid_pop %>%
filter(Symptoms == "asymptomatic") %>%
rep_sample_n(size = 500, reps = 1000, replace = TRUE) %>%
group_by(replicate) %>%
summarise(Proportion_Infected = sum(Infected == "infected") / 500) %>%
pull(Proportion_Infected) %>%
mean())
## [1] 0.0173
(p_hat_n500_rep1000_symptomatic <-
covid_pop %>%
filter(Symptoms == "symptomatic") %>%
rep_sample_n(size = 500, reps = 1000, replace = TRUE) %>%
group_by(replicate) %>%
summarise(Proportion_Infected = sum(Infected == "infected") / 500) %>%
pull(Proportion_Infected) %>%
mean())
## [1] 0.223
(p_hat_n500_rep1000_stratefied <-
(500 * p_hat_n500_rep1000_asymptomatic + 500 * p_hat_n500_rep1000_symptomatic) /
(500 + 500))
## [1] 0.12