library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(DT)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(skimr)
library(infer) # bootstrapping
options(digits = 3, scipen = 999)
load("G:/My Drive/homework/Cooper P/US_COVID19_21_03_23.RData")
us_counties %>% glimpse()
## Rows: 3,067
## Columns: 13
## $ date <date> 2021-03-22, 2021-03-22, 2021-03-22, 2021-03-22, 2021~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "B~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama"~
## $ fips <int> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017,~
## $ cases <int> 6517, 20361, 2213, 2529, 6387, 1194, 2072, 14165, 346~
## $ deaths <int> 98, 296, 54, 58, 130, 39, 66, 303, 114, 42, 107, 24, ~
## $ new_cases <dbl> 4, 14, 1, 0, 4, 0, 0, 3, 3, 0, 1, 1, 1, 0, 0, 5, 0, 1~
## $ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ cases_last_week <dbl> 46, 151, 18, 30, 34, 1, 6, 60, 12, 3, 32, 7, 9, 3, 10~
## $ deaths_last_week <dbl> 3, 2, 1, 0, 1, 0, 0, 2, 2, 0, 1, 1, 1, 0, 0, 2, 0, 0,~
## $ county_num <int> 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29~
## $ pop_2019 <int> 55869, 223234, 24686, 22394, 57826, 10101, 19448, 113~
## $ region <chr> "South", "South", "South", "South", "South", "South",~
us_counties %>% datatable(options = list(pageLength = 5))
us_counties %>% distinct(date)
## # A tibble: 1 x 1
## date
## <date>
## 1 2021-03-22
us_counties %<>% mutate(cases_per_100000 = cases / pop_2019 * 100000, .after = cases)
x_bar <- mean(us_counties$cases_per_100000)
us_counties %>% skim_without_charts()
Name | Piped data |
Number of rows | 3067 |
Number of columns | 14 |
_______________________ | |
Column type frequency: | |
character | 3 |
Date | 1 |
numeric | 10 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
county | 0 | 1 | 3 | 33 | 0 | 1804 | 0 |
state | 0 | 1 | 4 | 20 | 0 | 50 | 0 |
region | 0 | 1 | 4 | 9 | 0 | 4 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
date | 0 | 1 | 2021-03-22 | 2021-03-22 | 2021-03-22 | 1 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
fips | 0 | 1 | 30592.57 | 15253.70 | 1001 | 18150 | 30009 | 46028.0 | 56045 |
cases | 0 | 1 | 9237.85 | 34283.08 | 1 | 984 | 2373 | 6024.5 | 1215330 |
cases_per_100000 | 0 | 1 | 9336.92 | 3021.02 | 261 | 7574 | 9343 | 11052.5 | 35044 |
deaths | 0 | 1 | 162.03 | 618.55 | 0 | 17 | 45 | 105.5 | 22834 |
new_cases | 0 | 1 | 16.17 | 64.70 | -36 | 0 | 1 | 7.5 | 1137 |
new_deaths | 0 | 1 | 0.19 | 1.39 | -15 | 0 | 0 | 0.0 | 38 |
cases_last_week | 0 | 1 | 115.55 | 381.27 | -118 | 5 | 18 | 65.0 | 7535 |
deaths_last_week | 0 | 1 | 2.26 | 9.96 | -7 | 0 | 0 | 2.0 | 358 |
county_num | 0 | 1 | 104.47 | 108.63 | 1 | 35 | 79 | 135.0 | 840 |
pop_2019 | 0 | 1 | 102622.29 | 329881.08 | 86 | 10787 | 25619 | 67536.5 | 10039107 |
us_counties %>%
ggplot(aes(cases_per_100000)) +
geom_histogram(bins = sqrt(3067))
us_counties %>%
ggplot(aes(cases_per_100000)) +
geom_density()
us_counties %>%
ggplot(aes(y = cases_per_100000)) +
geom_boxplot() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
county_sample <-
us_counties %>%
select(state, county, cases_per_100000) %>%
slice_sample(n = 50)
county_sample %>% glimpse()
## Rows: 50
## Columns: 3
## $ state <chr> "Ohio", "Mississippi", "Wisconsin", "Mississippi", "S~
## $ county <chr> "Sandusky", "Madison", "Adams", "Grenada", "Walworth"~
## $ cases_per_100000 <dbl> 8363, 9146, 8769, 12145, 13634, 13799, 7861, 9172, 11~
county_sample %>% datatable(options = list(pageLength = 5))
county_sample %>% skim_without_charts()
Name | Piped data |
Number of rows | 50 |
Number of columns | 3 |
_______________________ | |
Column type frequency: | |
character | 2 |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
state | 0 | 1 | 4 | 14 | 0 | 28 | 0 |
county | 0 | 1 | 4 | 25 | 0 | 49 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
cases_per_100000 | 0 | 1 | 9551 | 3052 | 2550 | 7654 | 9164 | 12090 | 14788 |
county_sample %>%
ggplot(aes(cases_per_100000)) +
geom_histogram() +
geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")
county_sample %>%
ggplot(aes(cases_per_100000)) +
geom_density() +
geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")
county_sample %>%
ggplot(aes(y = cases_per_100000)) +
geom_boxplot() +
geom_hline(yintercept = mean(county_sample$cases_per_100000), color = "red")+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
county_sample %>%
ggplot(aes(sample = cases_per_100000)) +
geom_qq() +
geom_qq_line()
bootstrap1 <-
county_sample %>%
rep_sample_n(size = 50, reps = 1, replace = TRUE)
bootstrap1 %>%
group_by(county) %>%
summarize(Count = NROW(county)) %>%
arrange(desc(Count))
## # A tibble: 33 x 2
## county Count
## <chr> <int>
## 1 Sandusky 4
## 2 Cumberland 3
## 3 Currituck 3
## 4 Keokuk 3
## 5 Adams 2
## 6 Benton 2
## 7 Codington 2
## 8 Fresno 2
## 9 Lincoln 2
## 10 Nevada 2
## # ... with 23 more rows
mean(bootstrap1$cases_per_100000)
## [1] 10052
bootstrap35 <-
county_sample %>%
rep_sample_n(size = 50, reps = 35, replace = TRUE) %>%
group_by(replicate) %>%
summarise(mean_cases_per_100000 = mean(cases_per_100000))
bootstrap35 %>%
ggplot(aes(mean_cases_per_100000)) +
geom_histogram() +
geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")
bootstrap35 %>%
ggplot(aes(mean_cases_per_100000)) +
geom_density() +
geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")
bootstrap1000 <-
county_sample %>%
rep_sample_n(size = 50, reps = 1000, replace = TRUE) %>%
group_by(replicate) %>%
summarise(mean_cases_per_100000 = mean(cases_per_100000))
bootstrap1000 %>%
ggplot(aes(mean_cases_per_100000)) +
geom_histogram() +
geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")
bootstrap1000 %>%
ggplot(aes(mean_cases_per_100000)) +
geom_density() +
geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")
vignette("infer")
## starting httpd help server ... done
bootstrap1000 <-
county_sample %>%
specify(response = cases_per_100000) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean")
bootstrap1000 %>%
visualise(bins = 30)
(percentile_ci <-
bootstrap1000 %>%
get_confidence_interval(level = 0.95, type = "percentile")
)
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 8707. 10383.
bootstrap1000 %>%
visualize(bins = 30) +
shade_confidence_interval(endpoints = percentile_ci) +
geom_vline(xintercept = x_bar)
(standard_error_ci <-
bootstrap1000 %>%
get_confidence_interval(type = "se",
level = 0.95,
point_estimate = x_bar))
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 8493. 10181.
bootstrap1000 %>%
visualize() +
shade_confidence_interval(endpoints = standard_error_ci) +
geom_vline(xintercept = x_bar)
percentile_ci
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 8707. 10383.
standard_error_ci
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 8493. 10181.
x_bar
## [1] 9337
median(us_counties$cases_per_100000)
## [1] 9343
set.seed(1)
inside <- TRUE
trial <- 0
n <- 12
while(inside == TRUE){
percentile_ci <-
us_counties %>%
# slice_sample(n = 50, replace = TRUE) %>%
specify(response = cases_per_100000) %>%
generate(reps = n, type = "bootstrap") %>%
calculate(stat = "mean") %>%
get_confidence_interval(level = 0.95, type = "percentile")
if(x_bar < percentile_ci[1] | percentile_ci[2] < x_bar){inside = FALSE}
trial <- trial + 1
}
trial
## [1] 7
1 / trial
## [1] 0.143