Samples and Populations

library(pacman)
p_load(mdsr, nycflights13, tidyverse)

Recall the flights dataframe.

flights

Get all of the flights to SF and take a sample of 25 of them.

Note that I did not set a seed for the sample, so your answers will differ from what is in the book.

SF <- flights %>% 
  filter(dest == "SFO", !is.na(arr_delay))
SF
  
Sample25 <- SF %>%
  sample_n(size = 25)
Sample25
Sample25 %>% favstats(~ arr_delay, data = .)
SF %>% favstats(~ arr_delay, data = .)
Sample25 %>% qdata(~arr_delay, p = 0.98, data = .)
       p quantile 
    0.98   158.12 
SF %>% tally(~ arr_delay < 90, data = ., format = "proportion")
arr_delay < 90
      TRUE      FALSE 
0.95141577 0.04858423 
SF %>% qdata(~ arr_delay, p = 0.98, data = .)
       p quantile 
    0.98   153.00 

Sample Statistics

The sampling distribution.

n <- 25
SF %>% sample_n(size=n, replace = FALSE) %>%
  summarize(mean(arr_delay))
SF %>% sample_n(size=n, replace = FALSE) %>%
  summarize(mean(arr_delay))
Trials <- do(500)*
  mean( ~ arr_delay, data = sample_n(SF, size = n, replace = TRUE))
Trials
Trials %>% favstats(~ mean, data = .)

Confidence intervals

mean(~ mean, data = Trials) + 2*sd(~ mean, data = Trials)*c(-1,1)
[1] -17.19053  22.00797

Using a larger sample size.

n <- 100
Trials_100 <- do(500)*
  mean( ~ arr_delay, data = sample_n(SF, size = n, replace = TRUE))
Trials_100

Plots to compare the sampling distributions with different sample sizes.

rbind(Trials %>% mutate(n=25), Trials_100 %>% mutate(n=100)) %>%
  ggplot(aes(x = mean)) +
  geom_histogram(bins = 30) +
  facet_grid(~ n) +
  xlab("Sample mean")

Boostrap

Small <- SF %>% sample_n(size = 3, replace = FALSE)
Small
Small <- SF %>% sample_n(size = 3, replace = TRUE)
Small
n <- 200
Orig_sample <- SF %>% sample_n(size = n, replace = FALSE)
Orig_sample
Orig_sample %>% sample_n(size=n, replace = TRUE) %>%
  summarize(mean(arr_delay))
Boostrap_trials <- do(500)*
  mean(~ arr_delay, data = sample_n(Orig_sample, size = n, replace = TRUE))
       
Boostrap_trials %>% favstats(~ mean, data = .)
Trials_200 <- do(500)*
  mean(~ arr_delay, data = sample_n(SF, size = n, replace = FALSE))
       
Trials_200 %>% favstats(~ mean, data = .)
Orig_sample %>% qdata( ~arr_delay, p = 0.98, data = .)
       p quantile 
    0.98   187.18 
Bootstrap_trials <- do(500)*
  qdata(~arr_delay, p = 0.98, data = sample_n(Orig_sample, size = n, replace = TRUE))
Bootstrap_trials %>% favstats(~ quantile, data = .)
Bootstrap_trials <- do(10000)*
  qdata(~arr_delay, p = 0.98, data = sample_n(Orig_sample, size = n, replace = TRUE))
Bootstrap_trials %>% favstats(~ quantile, data = .)

outliers

SF %>% filter(arr_delay >= 420) %>%
  select(month, day, dep_delay, arr_delay, carrier)
SF %>% mutate(long_delay = arr_delay > 60) %>%
  tally(~ long_delay | month, data = .)
          month
long_delay    1    2    3    4    5    6    7    8    9   10   11   12
     TRUE    29   21   61  112   65  209  226   96   65   36   51   66
     FALSE  856  741  812  993 1128  980  966 1159 1124 1177 1107 1093
SF %>% mutate(long_delay = arr_delay > 60) %>%
  tally(~ long_delay | carrier, data = .)
          carrier
long_delay   AA   B6   DL   UA   VX
     TRUE   148   86   91  492  220
     FALSE 1250  934 1757 6236 1959
LS0tCnRpdGxlOiAiQ2hhcHRlciA3IFN0YXRpc3RpY2FsIEZvdW5kYXRpb25zIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIyBTYW1wbGVzIGFuZCBQb3B1bGF0aW9ucwoKYGBge3J9CmxpYnJhcnkocGFjbWFuKQoKcF9sb2FkKG1kc3IsIG55Y2ZsaWdodHMxMywgdGlkeXZlcnNlKQpgYGAKClJlY2FsbCB0aGUgKmZsaWdodHMqIGRhdGFmcmFtZS4KCmBgYHtyfQpmbGlnaHRzCmBgYAoKR2V0IGFsbCBvZiB0aGUgZmxpZ2h0cyB0byBTRiBhbmQgdGFrZSBhIHNhbXBsZSBvZiAyNSBvZiB0aGVtLgoKTm90ZSB0aGF0IEkgZGlkIG5vdCBzZXQgYSBzZWVkIGZvciB0aGUgc2FtcGxlLCBzbyB5b3VyIGFuc3dlcnMgd2lsbCBkaWZmZXIgZnJvbSB3aGF0IGlzIGluIHRoZSBib29rLgoKYGBge3J9ClNGIDwtIGZsaWdodHMgJT4lIAogIGZpbHRlcihkZXN0ID09ICJTRk8iLCAhaXMubmEoYXJyX2RlbGF5KSkKU0YKICAKU2FtcGxlMjUgPC0gU0YgJT4lCiAgc2FtcGxlX24oc2l6ZSA9IDI1KQpTYW1wbGUyNQpgYGAKCmBgYHtyfQpTYW1wbGUyNSAlPiUgZmF2c3RhdHMofiBhcnJfZGVsYXksIGRhdGEgPSAuKQpgYGAKCmBgYHtyfQpTRiAlPiUgZmF2c3RhdHMofiBhcnJfZGVsYXksIGRhdGEgPSAuKQpgYGAKCmBgYHtyfQpTYW1wbGUyNSAlPiUgcWRhdGEofmFycl9kZWxheSwgcCA9IDAuOTgsIGRhdGEgPSAuKQpgYGAKCmBgYHtyfQpTRiAlPiUgdGFsbHkofiBhcnJfZGVsYXkgPCA5MCwgZGF0YSA9IC4sIGZvcm1hdCA9ICJwcm9wb3J0aW9uIikKYGBgCgoKYGBge3J9ClNGICU+JSBxZGF0YSh+IGFycl9kZWxheSwgcCA9IDAuOTgsIGRhdGEgPSAuKQpgYGAKCiMjIFNhbXBsZSBTdGF0aXN0aWNzCgpUaGUgc2FtcGxpbmcgZGlzdHJpYnV0aW9uLgoKYGBge3J9Cm4gPC0gMjUKU0YgJT4lIHNhbXBsZV9uKHNpemU9biwgcmVwbGFjZSA9IEZBTFNFKSAlPiUKICBzdW1tYXJpemUobWVhbihhcnJfZGVsYXkpKQoKCmBgYAoKYGBge3J9ClNGICU+JSBzYW1wbGVfbihzaXplPW4sIHJlcGxhY2UgPSBGQUxTRSkgJT4lCiAgc3VtbWFyaXplKG1lYW4oYXJyX2RlbGF5KSkKYGBgCgpgYGB7cn0KVHJpYWxzIDwtIGRvKDUwMCkqCiAgbWVhbiggfiBhcnJfZGVsYXksIGRhdGEgPSBzYW1wbGVfbihTRiwgc2l6ZSA9IG4sIHJlcGxhY2UgPSBUUlVFKSkKVHJpYWxzCmBgYAoKYGBge3J9ClRyaWFscyAlPiUgZmF2c3RhdHMofiBtZWFuLCBkYXRhID0gLikKYGBgCgpDb25maWRlbmNlIGludGVydmFscwoKYGBge3J9Cm1lYW4ofiBtZWFuLCBkYXRhID0gVHJpYWxzKSArIDIqc2QofiBtZWFuLCBkYXRhID0gVHJpYWxzKSpjKC0xLDEpCmBgYAoKVXNpbmcgYSBsYXJnZXIgc2FtcGxlIHNpemUuCgpgYGB7cn0KbiA8LSAxMDAKVHJpYWxzXzEwMCA8LSBkbyg1MDApKgogIG1lYW4oIH4gYXJyX2RlbGF5LCBkYXRhID0gc2FtcGxlX24oU0YsIHNpemUgPSBuLCByZXBsYWNlID0gVFJVRSkpClRyaWFsc18xMDAKYGBgCgpQbG90cyB0byBjb21wYXJlIHRoZSBzYW1wbGluZyBkaXN0cmlidXRpb25zIHdpdGggZGlmZmVyZW50IHNhbXBsZSBzaXplcy4KCmBgYHtyfQpyYmluZChUcmlhbHMgJT4lIG11dGF0ZShuPTI1KSwgVHJpYWxzXzEwMCAlPiUgbXV0YXRlKG49MTAwKSkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gbWVhbikpICsKICBnZW9tX2hpc3RvZ3JhbShiaW5zID0gMzApICsKICBmYWNldF9ncmlkKH4gbikgKwogIHhsYWIoIlNhbXBsZSBtZWFuIikKCmBgYAoKIyMgQm9vc3RyYXAKCmBgYHtyfQpTbWFsbCA8LSBTRiAlPiUgc2FtcGxlX24oc2l6ZSA9IDMsIHJlcGxhY2UgPSBGQUxTRSkKU21hbGwKYGBgCgpgYGB7cn0KU21hbGwgPC0gU0YgJT4lIHNhbXBsZV9uKHNpemUgPSAzLCByZXBsYWNlID0gVFJVRSkKU21hbGwKYGBgCgpgYGB7cn0KbiA8LSAyMDAKT3JpZ19zYW1wbGUgPC0gU0YgJT4lIHNhbXBsZV9uKHNpemUgPSBuLCByZXBsYWNlID0gRkFMU0UpCk9yaWdfc2FtcGxlCmBgYAoKYGBge3J9Ck9yaWdfc2FtcGxlICU+JSBzYW1wbGVfbihzaXplPW4sIHJlcGxhY2UgPSBUUlVFKSAlPiUKICBzdW1tYXJpemUobWVhbihhcnJfZGVsYXkpKQpgYGAKCgpgYGB7cn0KQm9vc3RyYXBfdHJpYWxzIDwtIGRvKDUwMCkqCiAgbWVhbih+IGFycl9kZWxheSwgZGF0YSA9IHNhbXBsZV9uKE9yaWdfc2FtcGxlLCBzaXplID0gbiwgcmVwbGFjZSA9IFRSVUUpKQogICAgICAgCkJvb3N0cmFwX3RyaWFscyAlPiUgZmF2c3RhdHMofiBtZWFuLCBkYXRhID0gLikKCmBgYAoKYGBge3J9ClRyaWFsc18yMDAgPC0gZG8oNTAwKSoKICBtZWFuKH4gYXJyX2RlbGF5LCBkYXRhID0gc2FtcGxlX24oU0YsIHNpemUgPSBuLCByZXBsYWNlID0gRkFMU0UpKQogICAgICAgClRyaWFsc18yMDAgJT4lIGZhdnN0YXRzKH4gbWVhbiwgZGF0YSA9IC4pCmBgYAoKCmBgYHtyfQpPcmlnX3NhbXBsZSAlPiUgcWRhdGEoIH5hcnJfZGVsYXksIHAgPSAwLjk4LCBkYXRhID0gLikKYGBgCgpgYGB7cn0KQm9vdHN0cmFwX3RyaWFscyA8LSBkbyg1MDApKgogIHFkYXRhKH5hcnJfZGVsYXksIHAgPSAwLjk4LCBkYXRhID0gc2FtcGxlX24oT3JpZ19zYW1wbGUsIHNpemUgPSBuLCByZXBsYWNlID0gVFJVRSkpCgpCb290c3RyYXBfdHJpYWxzICU+JSBmYXZzdGF0cyh+IHF1YW50aWxlLCBkYXRhID0gLikKYGBgCgoKYGBge3J9CkJvb3RzdHJhcF90cmlhbHMgPC0gZG8oMTAwMDApKgogIHFkYXRhKH5hcnJfZGVsYXksIHAgPSAwLjk4LCBkYXRhID0gc2FtcGxlX24oT3JpZ19zYW1wbGUsIHNpemUgPSBuLCByZXBsYWNlID0gVFJVRSkpCgpCb290c3RyYXBfdHJpYWxzICU+JSBmYXZzdGF0cyh+IHF1YW50aWxlLCBkYXRhID0gLikKYGBgCgojIyBvdXRsaWVycwoKYGBge3J9ClNGICU+JSBmaWx0ZXIoYXJyX2RlbGF5ID49IDQyMCkgJT4lCiAgc2VsZWN0KG1vbnRoLCBkYXksIGRlcF9kZWxheSwgYXJyX2RlbGF5LCBjYXJyaWVyKQpgYGAKCmBgYHtyfQpTRiAlPiUgbXV0YXRlKGxvbmdfZGVsYXkgPSBhcnJfZGVsYXkgPiA2MCkgJT4lCiAgdGFsbHkofiBsb25nX2RlbGF5IHwgbW9udGgsIGRhdGEgPSAuKQpgYGAKCmBgYHtyfQpTRiAlPiUgbXV0YXRlKGxvbmdfZGVsYXkgPSBhcnJfZGVsYXkgPiA2MCkgJT4lCiAgdGFsbHkofiBsb25nX2RlbGF5IHwgY2FycmllciwgZGF0YSA9IC4pCmBgYAoK