library( dplyr )
library( ggplot2 )
library( gridExtra )
library( infer )
library( NHANES )
Introduction to Ideas of Inference
Making inferential claims (as opposed to descriptive) about the data
Statistical Inference: The process of making claims about a population based on information from a sample.
Vocabulary:
- Null Hypothesis (\(H_0\)): the claim that is not interesting, e.g. there is no difference between the two sample groups
- Alternative Hypothesis (\(H_A\)): the claim corresponding to the research hypothesis
- Goal: show that the data are inconsistent with the null hypothesis, supporting the alternative.
- Example 1: Comparing the speed of two different subspecies of cheetah
- \(H_0\): Asian and African cheetahs run at the same average speed
- \(H_A\): there is a statistically significant difference between the average running pace of Asian and African cheetahs
- Example 2: From a sample, researchers would like to claim that Candidate X will win an election
- \(H_0\): Candidate X will get half the votes
- \(H_A\): Candidate X will get more than half the votes
Randomized Distributions
Understanding the Null Distribution
Generating a distribution of the statistic from the null population gives information about whether the observed data are inconsistent with the null hypothesis.
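To make the idea concrete before turning to real data, here is a minimal base-R sketch on made-up data (the fake data frame and its values are hypothetical): shuffle the group labels many times and recompute the statistic each time to build up a null distribution.
set.seed(1)
# Hypothetical data: two groups with the same true success rate, so the null holds
fake <- data.frame(group   = rep(c("A", "B"), each = 50),
                   outcome = rbinom(100, 1, 0.5))
# Permute the group labels and recompute the difference in proportions each time
null_stats <- replicate(1000, {
  shuffled <- sample(fake$group)
  mean(fake$outcome[shuffled == "A"]) - mean(fake$outcome[shuffled == "B"])
})
hist(null_stats)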
Repeat the sampling and calculations enough times to generate a distribution:
colnames( NHANES )
## [1] "ID" "SurveyYr" "Gender" "Age"
## [5] "AgeDecade" "AgeMonths" "Race1" "Race3"
## [9] "Education" "MaritalStatus" "HHIncome" "HHIncomeMid"
## [13] "Poverty" "HomeRooms" "HomeOwn" "Work"
## [17] "Weight" "Length" "HeadCirc" "Height"
## [21] "BMI" "BMICatUnder20yrs" "BMI_WHO" "Pulse"
## [25] "BPSysAve" "BPDiaAve" "BPSys1" "BPDia1"
## [29] "BPSys2" "BPDia2" "BPSys3" "BPDia3"
## [33] "Testosterone" "DirectChol" "TotChol" "UrineVol1"
## [37] "UrineFlow1" "UrineVol2" "UrineFlow2" "Diabetes"
## [41] "DiabetesAge" "HealthGen" "DaysPhysHlthBad" "DaysMentHlthBad"
## [45] "LittleInterest" "Depressed" "nPregnancies" "nBabies"
## [49] "Age1stBaby" "SleepHrsNight" "SleepTrouble" "PhysActive"
## [53] "PhysActiveDays" "TVHrsDay" "CompHrsDay" "TVHrsDayChild"
## [57] "CompHrsDayChild" "Alcohol12PlusYr" "AlcoholDay" "AlcoholYear"
## [61] "SmokeNow" "Smoke100" "Smoke100n" "SmokeAge"
## [65] "Marijuana" "AgeFirstMarij" "RegularMarij" "AgeRegMarij"
## [69] "HardDrugs" "SexEver" "SexAge" "SexNumPartnLife"
## [73] "SexNumPartYear" "SameSex" "SexOrientation" "PregnantNow"
# Create bar plot for Home Ownership by Gender
ggplot(NHANES, aes(x = Gender, fill = HomeOwn)) +
# Set the position to fill
geom_bar(position = 'fill') +
ylab("Relative frequencies")
# Density plot of SleepHrsNight colored by SleepTrouble
ggplot(NHANES, aes(x = SleepHrsNight, color = SleepTrouble)) +
# Adjust by 2
geom_density(adjust = 2) +
# Facet by HealthGen
facet_wrap(~ HealthGen)
## Warning: Removed 2245 rows containing non-finite values (stat_density).
Investigate the relationship between gender and home ownership
Calculate the original observed statistic
homes <- NHANES %>%
# Select Gender and HomeOwn
select(Gender, HomeOwn) %>%
# Filter for HomeOwn equal to "Own" or "Rent"
filter(HomeOwn %in% c("Own", "Rent"))
glimpse( homes )
## Rows: 9,712
## Columns: 2
## $ Gender <fct> male, male, male, male, female, male, male, female, female, f…
## $ HomeOwn <fct> Own, Own, Own, Own, Rent, Rent, Own, Own, Own, Own, Own, Rent…
Find the observed difference in the proportion of home ownership between the genders
diff_orig <- homes %>%
# Group by gender
group_by(Gender) %>%
# Summarize proportion of homeowners
summarize(prop_own = mean( HomeOwn == 'Own')) %>%
# Summarize difference in proportion of homeowners
summarize(obs_diff_prop = diff(prop_own)) # male - female
# See the result
diff_orig
## # A tibble: 1 x 1
## obs_diff_prop
## <dbl>
## 1 -0.00783
Randomize the data under the null model of independence
# Specify variables
homeown_perm <- homes %>%
specify(HomeOwn ~ Gender, success = "Own") %>%
hypothesize(null = "independence") %>%
generate(reps = 100, type = "permute") %>%
calculate(stat = 'diff in props', order = c( 'male','female' ))
# Print results to console
homeown_perm
## # A tibble: 100 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.00247
## 2 2 0.00412
## 3 3 -0.00700
## 4 4 0.00123
## 5 5 -0.00618
## 6 6 0.00823
## 7 7 0.000821
## 8 8 -0.0157
## 9 9 -0.000415
## 10 10 -0.00330
## # … with 90 more rows
# Dotplot of 100 permuted differences in proportions
ggplot(homeown_perm, aes(x = stat)) +
geom_dotplot(binwidth = 0.001)
Try permuting 1000x and visualize the distribution as a density plot:
# Perform 1000 permutations
homeown_perm <- homes %>%
# Specify HomeOwn vs. Gender, with "Own" as success
specify(HomeOwn ~ Gender, success = "Own") %>%
# Use a null hypothesis of independence
hypothesize(null = 'independence') %>%
# Generate 1000 repetitions (by permutation)
generate(reps = 1000, type = "permute") %>%
# Calculate the difference in proportions (male then female)
calculate(stat = 'diff in props', order = c('male','female'))
# Density plot of 1000 permuted differences in proportions
ggplot(homeown_perm, aes(x = stat)) +
geom_density()
The distribution is approximately normal and centered near -0.01 (close to zero, as expected when the null hypothesis is true), but what can we conclude from this finding?
Four Steps of Inference:
- specify: will specify the response and explanatory variables
- hypothesize: will declare the null hypothesis
- generate: will generate resamples, permutations, or simulations
- calculate: will calculate summary statistics
Using the Randomization Distribution
Compare the observed statistic against the null distribution to quantify how different the observed value is from what the null hypothesis would produce.
glimpse( homeown_perm )
## Rows: 1,000
## Columns: 2
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ stat <dbl> 0.006587140, 0.026769349, -0.002474260, -0.002886141, 0.011…
# Plot permuted differences, diff_perm
ggplot(homeown_perm, aes(x = stat)) +
# Add a density layer
geom_density() +
# Add a vline layer with intercept diff_orig
geom_vline(aes(xintercept = diff_orig$obs_diff_prop), color = "red")
# Compare permuted differences to observed difference
homeown_perm %>%
mutate( diff_orig = diff_orig$obs_diff_prop ) %>%
summarize(n_perm_le_obs = sum(stat <= diff_orig))
## # A tibble: 1 x 1
## n_perm_le_obs
## <int>
## 1 211
Here we fail to reject the null hypothesis: there is no evidence that our data are inconsistent with the null hypothesis that home ownership rates do not differ between the genders.
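Equivalently, the comparison can be summarized as a permutation p-value with infer's get_p_value() (a sketch reusing the homeown_perm and diff_orig objects from above; direction = "both" because the alternative is two-sided):
homeown_perm %>%
  get_p_value(obs_stat = diff_orig$obs_diff_prop, direction = "both")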
Completing a Randomization Test: Gender Discrimination
Here we will complete a full hypothesis test
Data from the study "Influence of sex role stereotypes on personnel decisions" (Rosen and Jerdee, 1974)
\(H_0\): gender and promotion are unrelated variables
\(H_A\): men are more likely to be promoted
disc <- data.frame( promote = c( rep( 'promoted', 35 ), rep( 'not_promoted', 13 )),
sex = c( rep( 'male', 21 ), rep( 'female', 14 ),
rep( 'male', 3), rep( 'female', 10 )))
disc %>%
group_by( sex ) %>%
summarize( promoted_prop = mean( promote == 'promoted' ))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## sex promoted_prop
## <chr> <dbl>
## 1 female 0.583
## 2 male 0.875
Is it plausible to observe such a difference in proportions if the null hypothesis is true? How big does the difference need to be?
# Create a contingency table summarizing the data
disc %>%
# Count the rows by sex, promote
count( sex, promote)
## sex promote n
## 1 female not_promoted 10
## 2 female promoted 14
## 3 male not_promoted 3
## 4 male promoted 21
# Find proportion of each sex who were promoted
disc %>%
# Group by sex
group_by( sex ) %>%
# Calculate proportion promoted summary stat
summarize( promoted_prop = mean( promote == 'promoted'))
## # A tibble: 2 x 2
## sex promoted_prop
## <chr> <dbl>
## 1 female 0.583
## 2 male 0.875
The difference in proportions is about 0.29 (0.875 - 0.583)
# Replicate the entire data frame, permuting the promote variable
disc_perm <- disc %>%
specify(promote ~ sex, success = "promoted") %>%
hypothesize(null = "independence") %>%
generate(reps = 5, type = "permute")
disc_perm %>%
# Group by replicate
group_by( replicate ) %>%
# Count per group
count()
## # A tibble: 5 x 2
## # Groups: replicate [5]
## replicate n
## <int> <int>
## 1 1 48
## 2 2 48
## 3 3 48
## 4 4 48
## 5 5 48
disc_perm %>%
# Calculate difference in proportion, male then female
calculate( stat = 'diff in props', order = c( 'male','female'))
## # A tibble: 5 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.125
## 2 2 -0.125
## 3 3 -0.125
## 4 4 -0.458
## 5 5 -0.0417
calculate the observed difference:
# Calculate the observed difference in promotion rate
diff_orig <- disc %>%
# Group by sex
group_by(sex) %>%
# Summarize to calculate fraction promoted
summarize(prop_prom = mean(promote == 'promoted')) %>%
# Summarize to calculate difference
summarize(stat = diff(prop_prom)) %>%
pull()
## `summarise()` ungrouping output (override with `.groups` argument)
# See the result
diff_orig
## [1] 0.2916667
create a randomized distribution of the null statistic
# Create data frame of permuted differences in promotion rates
disc_perm <- disc %>%
# Specify promote vs. sex
specify(promote ~ sex, success = "promoted") %>%
# Set null hypothesis as independence
hypothesize(null = "independence") %>%
# Generate 1000 permutations
generate(reps = 1000, type = "permute") %>%
# Calculate difference in proportions
calculate(stat = "diff in props", order = c("male", "female"))
glimpse( disc_perm )
## Rows: 1,000
## Columns: 2
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ stat <dbl> 0.04166667, -0.12500000, 0.12500000, -0.04166667, 0.2916666…
Visualize the simulation
# Using permutation data, plot stat
ggplot(disc_perm, aes(x = stat)) +
# Add a histogram layer
geom_histogram(binwidth = 0.01) +
# Add a vertical line at diff_orig
geom_vline(aes(xintercept = diff_orig), color = "red")
Distribution of Statistics
We are interested in whether the observed statistic is different from the values obtained by shuffling. We can use either of the following statistics:
- Difference in proportions: \(\hat{p}_M - \hat{p}_F\)
- Ratio: \(\frac{\hat{p}_M}{\hat{p}_F}\)
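For example, the observed ratio can be computed with the same dplyr pattern used for the difference (a sketch; the permutation pipeline would be analogous, with a ratio in place of diff()):
disc %>%
  group_by(sex) %>%
  summarize(prop_prom = mean(promote == 'promoted')) %>%
  # Ratio of the male promotion rate to the female promotion rate
  summarize(stat_ratio = prop_prom[sex == 'male'] / prop_prom[sex == 'female'])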
Calculating quantiles
disc_perm %>%
summarize( q.05 = quantile( stat, p = 0.05),
q.95 = quantile( stat, p = 0.95))
## # A tibble: 1 x 2
## q.05 q.95
## <dbl> <dbl>
## 1 -0.208 0.208
The observed statistic (about 0.29) is greater than the 95th percentile of the null distribution, supporting the idea that the observed data are not consistent with the bulk of the null distribution.
Why 0.05?
“It is a common practice to judge a result significant, if it is of such a magnitude that it would have been produced by chance not more frequently than once in twenty trials. This is an arbitrary, but convenient, level of significance for the practical investigator, but it does not mean that he allows himself to be deceived one in every twenty experiments. The test of significance only tells him what to ignore, namely all experiments in which significant results are not obtained. He should only claim that a phenomenon is experimentally demonstrable when he knows how to design an experiment so that it will rarely fail to give a significant result. Consequently, isolated significant results which he does not know how to reproduce are left in suspense pending further investigation.” (R.A. Fisher)
0.05 is about the same odds as flipping 5 heads in a row with a coin.
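For reference, the arithmetic behind the coin-flip analogy (0.05 falls between these two probabilities):
0.5^4  # probability of 4 heads in a row: 0.0625
0.5^5  # probability of 5 heads in a row: 0.03125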
Degree of skepticism: the smaller the cutoff, the more skeptical we are of the observed result
A cutoff level is subjective. Only significant results from well designed studies should lead to further investigation.
How does sample size affect the randomization null distribution?
disc_small <- data.frame( promote = c( rep( 'promoted', 12 ), rep( 'not_promoted', 4 )),
sex = c( rep( 'male', 7 ), rep( 'female', 5 ),
rep( 'male', 1), rep( 'female', 3 )))
disc_big <- data.frame( promote = c( rep( 'promoted', 350 ), rep( 'not_promoted', 130 )),
sex = c( rep( 'male', 210 ), rep( 'female', 140 ),
rep( 'male', 30), rep( 'female', 100 )))
# Tabulate the small dataset
disc_small %>%
# Select sex and promote
count(sex, promote)
## sex promote n
## 1 female not_promoted 3
## 2 female promoted 5
## 3 male not_promoted 1
## 4 male promoted 7
# Do the same for disc_big
disc_big %>%
# Select sex and promote
count(sex, promote)
## sex promote n
## 1 female not_promoted 100
## 2 female promoted 140
## 3 male not_promoted 30
## 4 male promoted 210
# Create data frame of permuted differences in promotion rates
disc_perm_small <- disc_small %>%
# Specify promote vs. sex
specify(promote ~ sex, success = "promoted") %>%
# Set null hypothesis as independence
hypothesize(null = "independence") %>%
# Generate 1000 permutations
generate(reps = 1000, type = "permute") %>%
# Calculate difference in proportions
calculate(stat = "diff in props", order = c("male", "female"))
# Create data frame of permuted differences in promotion rates
disc_perm_big <- disc_big %>%
# Specify promote vs. sex
specify(promote ~ sex, success = "promoted") %>%
# Set null hypothesis as independence
hypothesize(null = "independence") %>%
# Generate 1000 permutations
generate(reps = 1000, type = "permute") %>%
# Calculate difference in proportions
calculate(stat = "diff in props", order = c("male", "female"))
calculate the observed difference:
# Calculate the observed difference in promotion rate
diff_orig_small <- disc_small %>%
# Group by sex
group_by(sex) %>%
# Summarize to calculate fraction promoted
summarize(prop_prom = mean(promote == 'promoted')) %>%
# Summarize to calculate difference
summarize(stat = diff(prop_prom)) %>%
pull()
## `summarise()` ungrouping output (override with `.groups` argument)
diff_orig_big <- disc_big %>%
# Group by sex
group_by(sex) %>%
# Summarize to calculate fraction promoted
summarize(prop_prom = mean(promote == 'promoted')) %>%
# Summarize to calculate difference
summarize(stat = diff(prop_prom)) %>%
pull()
## `summarise()` ungrouping output (override with `.groups` argument)
visualize
# Using disc_perm_small, plot stat
sm <- ggplot(disc_perm_small, aes(x = stat)) +
# Add a histogram layer with binwidth 0.01
geom_histogram(binwidth = 0.01) +
# Add a vline layer, crossing x-axis at diff_orig_small
geom_vline(aes(xintercept = diff_orig_small), color = "red")
# Swap the dataset to disc_perm_big
bg <- ggplot(disc_perm_big, aes(x = stat)) +
geom_histogram(binwidth = 0.01) +
# Change the x-axis intercept to diff_orig_big
geom_vline(aes(xintercept = diff_orig_big), color = "red")
grid.arrange( sm, bg, ncol = 2 )
calc_upper_quantiles <- function(dataset) {
dataset %>%
summarize(
q.90 = quantile(stat, p = 0.90),
q.95 = quantile(stat, p = 0.95),
q.99 = quantile(stat, p = 0.99)
)
}
# Calculate the quantiles associated with the small dataset
calc_upper_quantiles(disc_perm_small)
## # A tibble: 1 x 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.25 0.25 0.5
# Calculate the quantiles associated with the big dataset
calc_upper_quantiles(disc_perm_big)
## # A tibble: 1 x 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.0583 0.0667 0.100
What is a p-value?
p-value: the probability of observing data as or more extreme than what we actually got given that the null hypothesis is true.
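In code, this is just the proportion of permuted statistics at least as extreme as the observed one; a sketch using the small-dataset objects from above (get_p_value() below computes the same quantity):
disc_perm_small %>%
  summarize(p_value = mean(stat >= diff_orig_small))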
Visualize and Calculate p-values:
# Visualize and calculate the p-value for the small dataset
disc_perm_small %>%
get_p_value(obs_stat = diff_orig_small, direction = "greater")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.299
disc_perm_small %>%
visualize(obs_stat = diff_orig_small, direction = "greater")
## Warning: `visualize()` should no longer be used to plot a p-value. Arguments
## `obs_stat`, `obs_stat_color`, `pvalue_fill`, and `direction` are deprecated. Use
## `shade_p_value()` instead.
# Visualize and calculate the p-value for the big dataset
disc_perm_big %>%
get_p_value(obs_stat = diff_orig_big, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
disc_perm_big %>%
visualize(obs_stat = diff_orig_big, direction = "greater")
## Warning: `visualize()` should no longer be used to plot a p-value. Arguments
## `obs_stat`, `obs_stat_color`, `pvalue_fill`, and `direction` are deprecated. Use
## `shade_p_value()` instead.
The observed result is quite consistent with the null distribution for the small dataset (p-value = 0.299) and extremely unusual for the large dataset (p-value ≈ 0)
Does the difference in promotion rates still appear to be statistically significant for this new data?
disc_new <- data.frame( promote = c( rep( 'promoted', 35 ), rep( 'not_promoted', 13 )),
sex = c( rep( 'male', 18 ), rep( 'female', 17 ),
rep( 'male', 6), rep( 'female', 7 )))
# Tabulate the new data
disc_new %>%
count( sex, promote )
## sex promote n
## 1 female not_promoted 7
## 2 female promoted 17
## 3 male not_promoted 6
## 4 male promoted 18
# Create data frame of permuted differences in promotion rates
disc_perm_new <- disc_new %>%
# Specify promote vs. sex
specify(promote ~ sex, success = "promoted") %>%
# Set null hypothesis as independence
hypothesize(null = "independence") %>%
# Generate 1000 permutations
generate(reps = 1000, type = "permute") %>%
# Calculate difference in proportions
calculate(stat = "diff in props", order = c("male", "female"))
diff_orig_new <- disc_new %>%
# Group by sex
group_by(sex) %>%
# Summarize to calculate fraction promoted
summarize(prop_prom = mean(promote == 'promoted')) %>%
# Summarize to calculate difference
summarize(stat = diff(prop_prom)) %>%
pull()
## `summarise()` ungrouping output (override with `.groups` argument)
# Plot the distribution of the new permuted differences
ggplot(disc_perm_new, aes(x = stat)) +
geom_histogram() +
geom_vline(aes(xintercept = diff_orig_new), color = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
find the p-value for the new dataset:
# Find the p-value from the new data
disc_perm_new %>%
summarize(p_value = mean(diff_orig_new <= stat))
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.509
Hypothesis Testing Errors: Opportunity Cost
Example: Opportunity Cost
The Hypothesis:
- \(H_0\): Reminding students will have no impact on their spending decisions
- \(H_A\): Reminding students will reduce the chance they will continue with a purchase
opportunity <- data.frame( group = c( rep( 'control', 75 ), rep( 'treatment', 75 )),
decision = c( rep( 'buyDVD', 56 ), rep( 'nobuyDVD', 19 ),
rep( 'buyDVD', 41), rep( 'nobuyDVD', 34 )))
# Tabulate the data
opportunity %>%
count(decision, group)
## decision group n
## 1 buyDVD control 56
## 2 buyDVD treatment 41
## 3 nobuyDVD control 19
## 4 nobuyDVD treatment 34
# Find the proportion who bought the DVD in each group
opportunity %>%
group_by(group) %>%
summarize(buy_prop = mean(decision == 'buyDVD' ))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## group buy_prop
## <chr> <dbl>
## 1 control 0.747
## 2 treatment 0.547
About 75% of the control group bought the DVD, while about 55% of the treatment group made the purchase.
# Plot group, filled by decision
ggplot(opportunity, aes(x = group, fill = decision)) +
# Add a bar layer, with position "fill"
geom_bar( position = 'fill')
calculate the observed difference
# Calculate the observed difference in purchase rate
diff_obs <- opportunity %>%
# Group by group
group_by( group ) %>%
# Calculate proportion deciding to buy a DVD
summarise(prop_buy = mean(decision == 'buyDVD')) %>%
# Calculate difference between groups
summarise(stat = diff(prop_buy)) %>%
pull()
## `summarise()` ungrouping output (override with `.groups` argument)
# Create data frame of permuted differences in purchase rates
opp_perm <- opportunity %>%
# Specify decision vs. group, where success is buying a DVD
specify(decision ~ group, success = 'buyDVD') %>%
# Set the null hypothesis to independence
hypothesize(null = 'independence') %>%
# Generate 1000 reps of type permute
generate(reps = 1000, type = 'permute') %>%
# Calculate the summary stat difference in proportions
calculate(stat = 'diff in props', order = c("treatment", "control"))
# Using the permutation data, plot stat
ggplot(opp_perm, aes(x = stat)) +
# Add a histogram layer with binwidth 0.005
geom_histogram(binwidth = 0.005) +
# Add a vline layer with intercept diff_obs
geom_vline(aes(xintercept = diff_obs), color = "red")
# Visualize the statistic
opp_perm %>%
visualize(obs_stat = diff_obs, direction = "less")
## Warning: `visualize()` should no longer be used to plot a p-value. Arguments
## `obs_stat`, `obs_stat_color`, `pvalue_fill`, and `direction` are deprecated. Use
## `shade_p_value()` instead.
# Calculate the p-value using `get_p_value`
opp_perm %>%
get_p_value(obs_stat = diff_obs, direction = "less")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.007
# Calculate the p-value using `summarize`
opp_perm %>%
summarize(p_value = mean( stat <= diff_obs ))
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.007
We can confidently say the different messaging caused the students to change their buying habits, since they were randomly assigned to treatment and control groups.
Errors and their Consequences
Errors in Hypothesis Testing
- Type 1 Error: rejecting the null hypothesis when it is true
- Type 2 Error: failing to reject the null hypothesis when it is false
Causal Inference:
- The study was randomized
- Nothing systematically different about participants in the treatment or control groups
- Therefore: any difference in the buying rates is due to the experimental treatment.
Confidence Intervals
Parameters and Confidence Intervals
Confidence intervals are used when the research question seeks to arrive at an estimate.
A confidence interval is a range of numbers that (hopefully) captures the true parameter. The goal of creating a confidence interval is to calculate a range of plausible values for the parameter of interest.
Bootstrapping
Hypothesis Testing
- How do samples from the null population vary?
- Statistic: proportion of successes in sample \(\longrightarrow \hat{p}\)
- Parameter: proportion of successes in the population \(\longrightarrow p\)
How do \(p\) and \(\hat{p}\) vary?
Bootstrapping is a method that allows us to estimate how far the statistic tends to be from the parameter: resample, with replacement, from the sample the statistic was derived from.
Standard Error: describes how the statistic varies around the parameter. Bootstrapping provides an approximation of the standard error.
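A minimal base-R sketch of the idea, on a made-up vector x: resample with replacement from the observed sample and use the spread of the recomputed statistics as an estimate of the standard error.
set.seed(2)
x <- rbinom(30, 1, 0.6)    # hypothetical sample of 30 yes (1) / no (0) votes
boot_stats <- replicate(1000, mean(sample(x, replace = TRUE)))
sd(boot_stats)             # bootstrap estimate of the standard error of p-hat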
Variability of \(\hat{p}\) from the sample (bootstrapping)
all_polls <- readRDS( 'all_polls.rds' )
#glimpse( all_polls )
ex1_props <- all_polls %>%
group_by( poll ) %>%
summarize( stat = mean( vote == 'yes' ))
## `summarise()` ungrouping output (override with `.groups` argument)
one_poll <- all_polls %>%
filter( poll == 1 ) %>%
select( vote )
ex2_props <- one_poll %>%
specify( response = vote, success = 'yes' ) %>%
generate( reps = 1000, type = 'bootstrap' ) %>%
summarize( stat = mean(vote == 'yes' ))
## `summarise()` ungrouping output (override with `.groups` argument)
#glimpse( ex2_props )
#calculate the variability of p-hat
ex1_props %>%
summarise( variability = sd( stat ))
## # A tibble: 1 x 1
## variability
## <dbl>
## 1 0.0868
ex2_props %>%
summarize( sd( stat ) )
## # A tibble: 1 x 1
## `sd(stat)`
## <dbl>
## 1 0.0821
# Combine data from both experiments
both_ex_props <- bind_rows(ex1_props, ex2_props, .id = "experiment")
# Using both_ex_props, plot stat colored by experiment
ggplot(both_ex_props, aes(stat, color = experiment)) +
# Add a density layer with bandwidth 0.1
geom_density(bw = 0.1)
Variability in \(\hat{p}\)
How far are the data from the parameter? The variability of the \(\hat{p}\) statistics gives a measure of how far apart any given \(\hat{p}\) and the parameter are expected to be.
Empirical Rule: approximately 95% of samples will produce \(\hat{p}\)s that are within 2 SE of the center of the distribution.
Check that this holds:
# Proportion of yes votes by poll
props <- all_polls %>%
group_by(poll) %>%
summarize(prop_yes = mean(vote == "yes"))
## `summarise()` ungrouping output (override with `.groups` argument)
# The true population proportion of yes votes
true_prop_yes <- 0.6
# Proportion of polls within 2SE
props %>%
# Add column: is prop_yes in 2SE of 0.6
mutate(is_in_conf_int = abs(prop_yes - true_prop_yes) < 2 * sd(prop_yes)) %>%
# Calculate proportion in conf int
summarize(prop_in_conf_int = mean(is_in_conf_int))
## # A tibble: 1 x 1
## prop_in_conf_int
## <dbl>
## 1 0.966
Bootstrapping t-confidence interval.
- You can measure the variability associated with \(\hat{p}\) by resampling from the original sample.
- Once you know the variability of \(\hat{p}\) you can use it as a way to measure how far away the true proportion is.
one_poll_boot <- one_poll %>%
specify(response = vote, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop")
p_hat <- one_poll %>%
# Calculate proportion of yes votes
summarize(stat = mean( vote == 'yes' )) %>%
pull()
# Create an interval of plausible values
one_poll_boot %>%
summarize(
# Lower bound is p_hat minus 2 std errs
lower = p_hat - 2*sd( stat ),
# Upper bound is p_hat plus 2 std errs
upper = p_hat + 2*sd( stat )
)
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.529 0.871
# Manually calculate a 95% percentile interval
one_poll_boot %>%
summarize(
lower = quantile(stat, p = 0.025),
upper = quantile(stat, p = 0.975)
)
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.533 0.867
# Calculate the same interval, more conveniently
percentile_ci <- one_poll_boot %>%
get_confidence_interval( level = 0.95)
percentile_ci
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.533 0.867
# A helper function for the SE-based interval (p_hat plus or minus 2 SE) computed above
calc_t_conf_int <- function(resampled_dataset) {
resampled_dataset %>%
summarize(
lower = p_hat - 2 * sd(stat),
upper = p_hat + 2 * sd(stat)
)
}
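# The helper above is not called in these notes; applying it to the bootstrap
# distribution (it uses p_hat from the enclosing environment) would look like:
one_poll_boot %>%
  calc_t_conf_int()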
one_poll_boot %>%
# Visualize in-between the endpoints given by percentile_ci
visualize( endpoints = percentile_ci, direction = 'between' )
## Warning: `visualize()` should no longer be used to plot a confidence interval.
## Arguments `endpoints`, `endpoints_color`, and `ci_fill` are deprecated. Use
## `shade_confidence_interval()` instead.
Interpreting CIs and technical conditions
Motivation for Confidence Intervals
- Goal is to estimate the parameter when all we know is the statistic
- We never know whether the interval calculated from the sample we collected actually captures the true parameter
- Can only say: we are 95% confident that the true proportion lies within the bounds of the CI
Technical Conditions that must be met:
- Sampling distribution is reasonably symmetric and bell-shaped
- Sample size is reasonably large
- Variability of resampled proportions
Sample Size Effects