library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(DT)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(skimr)
library(infer) # bootstrapping

options(digits = 3, scipen = 999)

2

(a)

load("G:/My Drive/homework/Cooper P/US_COVID19_21_03_23.RData")

us_counties %>% glimpse()

## Rows: 3,067
## Columns: 13
## $ date             <date> 2021-03-22, 2021-03-22, 2021-03-22, 2021-03-22, 2021~
## $ county           <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "B~
## $ state            <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama"~
## $ fips             <int> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017,~
## $ cases            <int> 6517, 20361, 2213, 2529, 6387, 1194, 2072, 14165, 346~
## $ deaths           <int> 98, 296, 54, 58, 130, 39, 66, 303, 114, 42, 107, 24, ~
## $ new_cases        <dbl> 4, 14, 1, 0, 4, 0, 0, 3, 3, 0, 1, 1, 1, 0, 0, 5, 0, 1~
## $ new_deaths       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ cases_last_week  <dbl> 46, 151, 18, 30, 34, 1, 6, 60, 12, 3, 32, 7, 9, 3, 10~
## $ deaths_last_week <dbl> 3, 2, 1, 0, 1, 0, 0, 2, 2, 0, 1, 1, 1, 0, 0, 2, 0, 0,~
## $ county_num       <int> 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29~
## $ pop_2019         <int> 55869, 223234, 24686, 22394, 57826, 10101, 19448, 113~
## $ region           <chr> "South", "South", "South", "South", "South", "South",~

us_counties %>% datatable(options = list(pageLength = 5))

us_counties %>% distinct(date)

## # A tibble: 1 x 1
##   date      
##   <date>    
## 1 2021-03-22

us_counties %<>% mutate(cases_per_100000 = cases / pop_2019 * 100000, .after = cases)

x_bar <- mean(us_counties$cases_per_100000)

us_counties %>% skim_without_charts()

Data summary
Name	Piped data
Number of rows	3067
Number of columns	14
_______________________
Column type frequency:
character	3
Date	1
numeric	10
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
county	1	3	33	1804
state	1	4	20	50
region	1	4	9	4

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
date	0	1	2021-03-22	2021-03-22	2021-03-22	1

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
fips	1	30592.57	15253.70	1001	18150	30009	46028.0	56045
cases	1	9237.85	34283.08	1	984	2373	6024.5	1215330
cases_per_100000	1	9336.92	3021.02	261	7574	9343	11052.5	35044
deaths	1	162.03	618.55	0	17	45	105.5	22834
new_cases	1	16.17	64.70	-36	0	1	7.5	1137
new_deaths	1	0.19	1.39	-15	0	0	0.0	38
cases_last_week	1	115.55	381.27	-118	5	18	65.0	7535
deaths_last_week	1	2.26	9.96	-7	0	0	2.0	358
county_num	1	104.47	108.63	1	35	79	135.0	840
pop_2019	1	102622.29	329881.08	86	10787	25619	67536.5	10039107

us_counties %>%
  ggplot(aes(cases_per_100000)) +
  geom_histogram(bins = sqrt(3067))

us_counties %>%
  ggplot(aes(cases_per_100000)) +
  geom_density()

us_counties %>%
  ggplot(aes(y = cases_per_100000)) +
  geom_boxplot() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

(b)

county_sample <-
  us_counties %>%
  select(state, county, cases_per_100000) %>%
  slice_sample(n = 50)

county_sample %>% glimpse()

## Rows: 50
## Columns: 3
## $ state            <chr> "Ohio", "Mississippi", "Wisconsin", "Mississippi", "S~
## $ county           <chr> "Sandusky", "Madison", "Adams", "Grenada", "Walworth"~
## $ cases_per_100000 <dbl> 8363, 9146, 8769, 12145, 13634, 13799, 7861, 9172, 11~

county_sample %>% datatable(options = list(pageLength = 5))

county_sample %>% skim_without_charts()

Data summary
Name	Piped data
Number of rows	50
Number of columns	3
_______________________
Column type frequency:
character	2
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
state	0	1	4	14	0	28	0
county	0	1	4	25	0	49	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
cases_per_100000	0	1	9551	3052	2550	7654	9164	12090	14788

(c)

county_sample %>%
  ggplot(aes(cases_per_100000)) +
  geom_histogram() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

county_sample %>%
  ggplot(aes(cases_per_100000)) +
  geom_density() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

county_sample %>%
  ggplot(aes(y = cases_per_100000)) +
  geom_boxplot() +
  geom_hline(yintercept = mean(county_sample$cases_per_100000), color = "red")+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

county_sample %>%
  ggplot(aes(sample = cases_per_100000)) +
  geom_qq() +
  geom_qq_line()

(d)

bootstrap1 <- 
  county_sample %>%
  rep_sample_n(size = 50, reps = 1, replace = TRUE)

bootstrap1 %>%
  group_by(county) %>%
  summarize(Count = NROW(county)) %>%
  arrange(desc(Count))

## # A tibble: 33 x 2
##    county     Count
##    <chr>      <int>
##  1 Sandusky       4
##  2 Cumberland     3
##  3 Currituck      3
##  4 Keokuk         3
##  5 Adams          2
##  6 Benton         2
##  7 Codington      2
##  8 Fresno         2
##  9 Lincoln        2
## 10 Nevada         2
## # ... with 23 more rows

mean(bootstrap1$cases_per_100000)

## [1] 10052

(e)

bootstrap35 <- 
  county_sample %>%
  rep_sample_n(size = 50, reps = 35, replace = TRUE) %>%
  group_by(replicate) %>%
  summarise(mean_cases_per_100000 = mean(cases_per_100000))

bootstrap35 %>%
  ggplot(aes(mean_cases_per_100000)) +
  geom_histogram() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

bootstrap35 %>%
  ggplot(aes(mean_cases_per_100000)) +
  geom_density() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

(f)

bootstrap1000 <- 
  county_sample %>%
  rep_sample_n(size = 50, reps = 1000, replace = TRUE) %>%
  group_by(replicate) %>%
  summarise(mean_cases_per_100000 = mean(cases_per_100000))

bootstrap1000 %>%
  ggplot(aes(mean_cases_per_100000)) +
  geom_histogram() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

bootstrap1000 %>%
  ggplot(aes(mean_cases_per_100000)) +
  geom_density() +
  geom_vline(xintercept = mean(county_sample$cases_per_100000), color = "red") +
  geom_vline(xintercept = median(county_sample$cases_per_100000), color = "blue")

(g)

vignette("infer")

## starting httpd help server ... done

bootstrap1000 <-
  county_sample %>%
  specify(response = cases_per_100000) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

bootstrap1000 %>%
  visualise(bins = 30)

(h)

https://moderndive.com/8-confidence-intervals.html#percentile-method-infer

(i)

(percentile_ci <-
  bootstrap1000 %>% 
  get_confidence_interval(level = 0.95, type = "percentile")
 )

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    8707.   10383.

bootstrap1000 %>% 
  visualize(bins = 30) + 
  shade_confidence_interval(endpoints = percentile_ci) +
  geom_vline(xintercept = x_bar)

(j)

https://moderndive.com/8-confidence-intervals.html#infer-se

(k)

(standard_error_ci <-
  bootstrap1000 %>% 
  get_confidence_interval(type = "se",
                          level = 0.95,
                          point_estimate = x_bar))

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    8493.   10181.

bootstrap1000 %>%
  visualize() + 
  shade_confidence_interval(endpoints = standard_error_ci) +
  geom_vline(xintercept = x_bar)

(l)

percentile_ci

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    8707.   10383.

standard_error_ci

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    8493.   10181.

x_bar

## [1] 9337

median(us_counties$cases_per_100000)

## [1] 9343

(m)

set.seed(1)

inside <- TRUE
trial <- 0
n <- 12

while(inside == TRUE){
  percentile_ci <-
    us_counties %>%
    # slice_sample(n = 50, replace = TRUE) %>%
    specify(response = cases_per_100000) %>%
    generate(reps = n, type = "bootstrap") %>%
    calculate(stat = "mean") %>% 
    get_confidence_interval(level = 0.95, type = "percentile")
  
  if(x_bar < percentile_ci[1] | percentile_ci[2] < x_bar){inside = FALSE}
  
  trial <- trial + 1
}

trial

## [1] 7

1 / trial

## [1] 0.143

03.30.2021 Cooper P:HW8

Nathan at GrandValleyTutor.com

5/9/2021

2

(a)

(b)

(c)

(d)

(e)

(f)

(g)

(h)

(i)

(j)

(k)

(l)

(m)