library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(DT)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(skimr)
library(infer) # bootstrapping

options(digits = 3, scipen = 999)

2

(a)

load("G:/My Drive/homework/Cooper P/US_COVID19_21_03_23.RData")

us_counties %>% glimpse()
## Rows: 3,067
## Columns: 13
## $ date             <date> 2021-03-22, 2021-03-22, 2021-03-22, 2021-03-22, 2021~
## $ county           <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "B~
## $ state            <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama"~
## $ fips             <int> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017,~
## $ cases            <int> 6517, 20361, 2213, 2529, 6387, 1194, 2072, 14165, 346~
## $ deaths           <int> 98, 296, 54, 58, 130, 39, 66, 303, 114, 42, 107, 24, ~
## $ new_cases        <dbl> 4, 14, 1, 0, 4, 0, 0, 3, 3, 0, 1, 1, 1, 0, 0, 5, 0, 1~
## $ new_deaths       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ cases_last_week  <dbl> 46, 151, 18, 30, 34, 1, 6, 60, 12, 3, 32, 7, 9, 3, 10~
## $ deaths_last_week <dbl> 3, 2, 1, 0, 1, 0, 0, 2, 2, 0, 1, 1, 1, 0, 0, 2, 0, 0,~
## $ county_num       <int> 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29~
## $ pop_2019         <int> 55869, 223234, 24686, 22394, 57826, 10101, 19448, 113~
## $ region           <chr> "South", "South", "South", "South", "South", "South",~
us_counties %>% datatable(options = list(pageLength = 5))
us_counties %>% distinct(date)
## # A tibble: 1 x 1
##   date      
##   <date>    
## 1 2021-03-22
us_counties %<>% mutate(cases_per_100000 = cases / pop_2019 * 100000, .after = cases)

x_bar <- mean(us_counties$cases_per_100000)

us_counties %>% skim_without_charts()
Data summary
Name Piped data
Number of rows 3067
Number of columns 14
_______________________
Column type frequency:
character 3
Date 1
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
county 0 1 3 33 0 1804 0
state 0 1 4 20 0 50 0
region 0 1 4 9 0 4 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2021-03-22 2021-03-22 2021-03-22 1

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
fips 0 1 30592.57 15253.70 1001 18150 30009 46028.0 56045
cases 0 1 9237.85 34283.08 1 984 2373 6024.5 1215330
cases_per_100000 0 1 9336.92 3021.02 261 7574 9343 11052.5 35044
deaths 0 1 162.03 618.55 0 17 45 105.5 22834
new_cases 0 1 16.17 64.70 -36 0 1 7.5 1137
new_deaths 0 1 0.19 1.39 -15 0 0 0.0 38
cases_last_week 0 1 115.55 381.27 -118 5 18 65.0 7535
deaths_last_week 0 1 2.26 9.96 -7 0 0 2.0 358
county_num 0 1 104.47 108.63 1 35 79 135.0 840
pop_2019 0 1 102622.29 329881.08 86 10787 25619 67536.5 10039107
us_counties %>%
  ggplot(aes(cases_per_100000)) +
  geom_histogram(bins = sqrt(3067))

us_counties %>%
  ggplot(aes(cases_per_100000)) +
  geom_density()

us_counties %>%
  ggplot(aes(y = cases_per_100000)) +
  geom_boxplot() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

(b)

county_sample <-
  us_counties %>%
  select(state, county, cases_per_100000) %>%
  slice_sample(n = 50)

county_sample %>% glimpse()
## Rows: 50
## Columns: 3
## $ state            <chr> "Ohio", "Mississippi", "Wisconsin", "Mississippi", "S~
## $ county           <chr> "Sandusky", "Madison", "Adams", "Grenada", "Walworth"~
## $ cases_per_100000 <dbl> 8363, 9146, 8769, 12145, 13634, 13799, 7861, 9172, 11~
county_sample %>% datatable(options = list(pageLength = 5))
county_sample %>% skim_without_charts()
Data summary
Name Piped data
Number of rows 50
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
state 0 1 4 14 0 28 0
county 0 1 4 25 0 49 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
cases_per_100000 0 1 9551 3052 2550 7654 9164 12090 14788

(c)

county_sample %>%
  ggplot(aes(cases_per_100000)) +
  geom_histogram() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

county_sample %>%
  ggplot(aes(cases_per_100000)) +
  geom_density() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

county_sample %>%
  ggplot(aes(y = cases_per_100000)) +
  geom_boxplot() +
  geom_hline(yintercept = mean(county_sample$cases_per_100000), color = "red")+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

county_sample %>%
  ggplot(aes(sample = cases_per_100000)) +
  geom_qq() +
  geom_qq_line()

(d)

bootstrap1 <- 
  county_sample %>%
  rep_sample_n(size = 50, reps = 1, replace = TRUE)

bootstrap1 %>%
  group_by(county) %>%
  summarize(Count = NROW(county)) %>%
  arrange(desc(Count))
## # A tibble: 33 x 2
##    county     Count
##    <chr>      <int>
##  1 Sandusky       4
##  2 Cumberland     3
##  3 Currituck      3
##  4 Keokuk         3
##  5 Adams          2
##  6 Benton         2
##  7 Codington      2
##  8 Fresno         2
##  9 Lincoln        2
## 10 Nevada         2
## # ... with 23 more rows
mean(bootstrap1$cases_per_100000)
## [1] 10052

(e)

bootstrap35 <- 
  county_sample %>%
  rep_sample_n(size = 50, reps = 35, replace = TRUE) %>%
  group_by(replicate) %>%
  summarise(mean_cases_per_100000 = mean(cases_per_100000))

bootstrap35 %>%
  ggplot(aes(mean_cases_per_100000)) +
  geom_histogram() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

bootstrap35 %>%
  ggplot(aes(mean_cases_per_100000)) +
  geom_density() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

(f)

bootstrap1000 <- 
  county_sample %>%
  rep_sample_n(size = 50, reps = 1000, replace = TRUE) %>%
  group_by(replicate) %>%
  summarise(mean_cases_per_100000 = mean(cases_per_100000))

bootstrap1000 %>%
  ggplot(aes(mean_cases_per_100000)) +
  geom_histogram() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

bootstrap1000 %>%
  ggplot(aes(mean_cases_per_100000)) +
  geom_density() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

(g)

vignette("infer")
## starting httpd help server ... done
bootstrap1000 <-
  county_sample %>%
  specify(response = cases_per_100000) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

bootstrap1000 %>%
  visualise(bins = 30)

(i)

(percentile_ci <-
  bootstrap1000 %>% 
  get_confidence_interval(level = 0.95, type = "percentile")
 )
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    8707.   10383.
bootstrap1000 %>% 
  visualize(bins = 30) + 
  shade_confidence_interval(endpoints = percentile_ci) +
  geom_vline(xintercept = x_bar)

(k)

(standard_error_ci <-
  bootstrap1000 %>% 
  get_confidence_interval(type = "se",
                          level = 0.95,
                          point_estimate = x_bar))
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    8493.   10181.
bootstrap1000 %>%
  visualize() + 
  shade_confidence_interval(endpoints = standard_error_ci) +
  geom_vline(xintercept = x_bar)

(l)

percentile_ci
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    8707.   10383.
standard_error_ci
## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    8493.   10181.
x_bar
## [1] 9337
median(us_counties$cases_per_100000)
## [1] 9343

(m)

set.seed(1)

inside <- TRUE
trial <- 0
n <- 12

while(inside == TRUE){
  percentile_ci <-
    us_counties %>%
    # slice_sample(n = 50, replace = TRUE) %>%
    specify(response = cases_per_100000) %>%
    generate(reps = n, type = "bootstrap") %>%
    calculate(stat = "mean") %>% 
    get_confidence_interval(level = 0.95, type = "percentile")
  
  if(x_bar < percentile_ci[1] | percentile_ci[2] < x_bar){inside = FALSE}
  
  trial <- trial + 1
}

trial
## [1] 7
1 / trial
## [1] 0.143