Samples and Populations
library(pacman)
p_load(mdsr, nycflights13, tidyverse)
Recall the flights dataframe.
flights
Get all of the flights to SF and take a sample of 25 of them.
Note that I did not set a seed for the sample, so your answers will differ from what is in the book.
SF <- flights %>%
filter(dest == "SFO", !is.na(arr_delay))
SF
Sample25 <- SF %>%
sample_n(size = 25)
Sample25
Sample25 %>% favstats(~ arr_delay, data = .)
SF %>% favstats(~ arr_delay, data = .)
Sample25 %>% qdata(~arr_delay, p = 0.98, data = .)
p quantile
0.98 158.12
SF %>% tally(~ arr_delay < 90, data = ., format = "proportion")
arr_delay < 90
TRUE FALSE
0.95141577 0.04858423
SF %>% qdata(~ arr_delay, p = 0.98, data = .)
p quantile
0.98 153.00
Sample Statistics
The sampling distribution.
n <- 25
SF %>% sample_n(size=n, replace = FALSE) %>%
summarize(mean(arr_delay))
SF %>% sample_n(size=n, replace = FALSE) %>%
summarize(mean(arr_delay))
Trials <- do(500)*
mean( ~ arr_delay, data = sample_n(SF, size = n, replace = TRUE))
Trials
Trials %>% favstats(~ mean, data = .)
Confidence intervals
mean(~ mean, data = Trials) + 2*sd(~ mean, data = Trials)*c(-1,1)
[1] -17.19053 22.00797
Using a larger sample size.
n <- 100
Trials_100 <- do(500)*
mean( ~ arr_delay, data = sample_n(SF, size = n, replace = TRUE))
Trials_100
Plots to compare the sampling distributions with different sample sizes.
rbind(Trials %>% mutate(n=25), Trials_100 %>% mutate(n=100)) %>%
ggplot(aes(x = mean)) +
geom_histogram(bins = 30) +
facet_grid(~ n) +
xlab("Sample mean")

Boostrap
Small <- SF %>% sample_n(size = 3, replace = FALSE)
Small
Small <- SF %>% sample_n(size = 3, replace = TRUE)
Small
n <- 200
Orig_sample <- SF %>% sample_n(size = n, replace = FALSE)
Orig_sample
Orig_sample %>% sample_n(size=n, replace = TRUE) %>%
summarize(mean(arr_delay))
Boostrap_trials <- do(500)*
mean(~ arr_delay, data = sample_n(Orig_sample, size = n, replace = TRUE))
Boostrap_trials %>% favstats(~ mean, data = .)
Trials_200 <- do(500)*
mean(~ arr_delay, data = sample_n(SF, size = n, replace = FALSE))
Trials_200 %>% favstats(~ mean, data = .)
Orig_sample %>% qdata( ~arr_delay, p = 0.98, data = .)
p quantile
0.98 187.18
Bootstrap_trials <- do(500)*
qdata(~arr_delay, p = 0.98, data = sample_n(Orig_sample, size = n, replace = TRUE))
Bootstrap_trials %>% favstats(~ quantile, data = .)
Bootstrap_trials <- do(10000)*
qdata(~arr_delay, p = 0.98, data = sample_n(Orig_sample, size = n, replace = TRUE))
Bootstrap_trials %>% favstats(~ quantile, data = .)
outliers
SF %>% filter(arr_delay >= 420) %>%
select(month, day, dep_delay, arr_delay, carrier)
SF %>% mutate(long_delay = arr_delay > 60) %>%
tally(~ long_delay | month, data = .)
month
long_delay 1 2 3 4 5 6 7 8 9 10 11 12
TRUE 29 21 61 112 65 209 226 96 65 36 51 66
FALSE 856 741 812 993 1128 980 966 1159 1124 1177 1107 1093
SF %>% mutate(long_delay = arr_delay > 60) %>%
tally(~ long_delay | carrier, data = .)
carrier
long_delay AA B6 DL UA VX
TRUE 148 86 91 492 220
FALSE 1250 934 1757 6236 1959
LS0tCnRpdGxlOiAiQ2hhcHRlciA3IFN0YXRpc3RpY2FsIEZvdW5kYXRpb25zIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIyBTYW1wbGVzIGFuZCBQb3B1bGF0aW9ucwoKYGBge3J9CmxpYnJhcnkocGFjbWFuKQoKcF9sb2FkKG1kc3IsIG55Y2ZsaWdodHMxMywgdGlkeXZlcnNlKQpgYGAKClJlY2FsbCB0aGUgKmZsaWdodHMqIGRhdGFmcmFtZS4KCmBgYHtyfQpmbGlnaHRzCmBgYAoKR2V0IGFsbCBvZiB0aGUgZmxpZ2h0cyB0byBTRiBhbmQgdGFrZSBhIHNhbXBsZSBvZiAyNSBvZiB0aGVtLgoKTm90ZSB0aGF0IEkgZGlkIG5vdCBzZXQgYSBzZWVkIGZvciB0aGUgc2FtcGxlLCBzbyB5b3VyIGFuc3dlcnMgd2lsbCBkaWZmZXIgZnJvbSB3aGF0IGlzIGluIHRoZSBib29rLgoKYGBge3J9ClNGIDwtIGZsaWdodHMgJT4lIAogIGZpbHRlcihkZXN0ID09ICJTRk8iLCAhaXMubmEoYXJyX2RlbGF5KSkKU0YKICAKU2FtcGxlMjUgPC0gU0YgJT4lCiAgc2FtcGxlX24oc2l6ZSA9IDI1KQpTYW1wbGUyNQpgYGAKCmBgYHtyfQpTYW1wbGUyNSAlPiUgZmF2c3RhdHMofiBhcnJfZGVsYXksIGRhdGEgPSAuKQpgYGAKCmBgYHtyfQpTRiAlPiUgZmF2c3RhdHMofiBhcnJfZGVsYXksIGRhdGEgPSAuKQpgYGAKCmBgYHtyfQpTYW1wbGUyNSAlPiUgcWRhdGEofmFycl9kZWxheSwgcCA9IDAuOTgsIGRhdGEgPSAuKQpgYGAKCmBgYHtyfQpTRiAlPiUgdGFsbHkofiBhcnJfZGVsYXkgPCA5MCwgZGF0YSA9IC4sIGZvcm1hdCA9ICJwcm9wb3J0aW9uIikKYGBgCgoKYGBge3J9ClNGICU+JSBxZGF0YSh+IGFycl9kZWxheSwgcCA9IDAuOTgsIGRhdGEgPSAuKQpgYGAKCiMjIFNhbXBsZSBTdGF0aXN0aWNzCgpUaGUgc2FtcGxpbmcgZGlzdHJpYnV0aW9uLgoKYGBge3J9Cm4gPC0gMjUKU0YgJT4lIHNhbXBsZV9uKHNpemU9biwgcmVwbGFjZSA9IEZBTFNFKSAlPiUKICBzdW1tYXJpemUobWVhbihhcnJfZGVsYXkpKQoKCmBgYAoKYGBge3J9ClNGICU+JSBzYW1wbGVfbihzaXplPW4sIHJlcGxhY2UgPSBGQUxTRSkgJT4lCiAgc3VtbWFyaXplKG1lYW4oYXJyX2RlbGF5KSkKYGBgCgpgYGB7cn0KVHJpYWxzIDwtIGRvKDUwMCkqCiAgbWVhbiggfiBhcnJfZGVsYXksIGRhdGEgPSBzYW1wbGVfbihTRiwgc2l6ZSA9IG4sIHJlcGxhY2UgPSBUUlVFKSkKVHJpYWxzCmBgYAoKYGBge3J9ClRyaWFscyAlPiUgZmF2c3RhdHMofiBtZWFuLCBkYXRhID0gLikKYGBgCgpDb25maWRlbmNlIGludGVydmFscwoKYGBge3J9Cm1lYW4ofiBtZWFuLCBkYXRhID0gVHJpYWxzKSArIDIqc2QofiBtZWFuLCBkYXRhID0gVHJpYWxzKSpjKC0xLDEpCmBgYAoKVXNpbmcgYSBsYXJnZXIgc2FtcGxlIHNpemUuCgpgYGB7cn0KbiA8LSAxMDAKVHJpYWxzXzEwMCA8LSBkbyg1MDApKgogIG1lYW4oIH4gYXJyX2RlbGF5LCBkYXRhID0gc2FtcGxlX24oU0YsIHNpemUgPSBuLCByZXBsYWNlID0gVFJVRSkpClRyaWFsc18xMDAKYGBgCgpQbG90cyB0byBjb21wYXJlIHRoZSBzYW1wbGluZyBkaXN0cmlidXRpb25zIHdpdGggZGlmZmVyZW50IHNhbXBsZSBzaXplcy4KCmBgYHtyfQpyYmluZChUcmlhbHMgJT4lIG11dGF0ZShuPTI1KSwgVHJpYWxzXzEwMCAlPiUgbXV0YXRlKG49MTAwKSkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gbWVhbikpICsKICBnZW9tX2hpc3RvZ3JhbShiaW5zID0gMzApICsKICBmYWNldF9ncmlkKH4gbikgKwogIHhsYWIoIlNhbXBsZSBtZWFuIikKCmBgYAoKIyMgQm9vc3RyYXAKCmBgYHtyfQpTbWFsbCA8LSBTRiAlPiUgc2FtcGxlX24oc2l6ZSA9IDMsIHJlcGxhY2UgPSBGQUxTRSkKU21hbGwKYGBgCgpgYGB7cn0KU21hbGwgPC0gU0YgJT4lIHNhbXBsZV9uKHNpemUgPSAzLCByZXBsYWNlID0gVFJVRSkKU21hbGwKYGBgCgpgYGB7cn0KbiA8LSAyMDAKT3JpZ19zYW1wbGUgPC0gU0YgJT4lIHNhbXBsZV9uKHNpemUgPSBuLCByZXBsYWNlID0gRkFMU0UpCk9yaWdfc2FtcGxlCmBgYAoKYGBge3J9Ck9yaWdfc2FtcGxlICU+JSBzYW1wbGVfbihzaXplPW4sIHJlcGxhY2UgPSBUUlVFKSAlPiUKICBzdW1tYXJpemUobWVhbihhcnJfZGVsYXkpKQpgYGAKCgpgYGB7cn0KQm9vc3RyYXBfdHJpYWxzIDwtIGRvKDUwMCkqCiAgbWVhbih+IGFycl9kZWxheSwgZGF0YSA9IHNhbXBsZV9uKE9yaWdfc2FtcGxlLCBzaXplID0gbiwgcmVwbGFjZSA9IFRSVUUpKQogICAgICAgCkJvb3N0cmFwX3RyaWFscyAlPiUgZmF2c3RhdHMofiBtZWFuLCBkYXRhID0gLikKCmBgYAoKYGBge3J9ClRyaWFsc18yMDAgPC0gZG8oNTAwKSoKICBtZWFuKH4gYXJyX2RlbGF5LCBkYXRhID0gc2FtcGxlX24oU0YsIHNpemUgPSBuLCByZXBsYWNlID0gRkFMU0UpKQogICAgICAgClRyaWFsc18yMDAgJT4lIGZhdnN0YXRzKH4gbWVhbiwgZGF0YSA9IC4pCmBgYAoKCmBgYHtyfQpPcmlnX3NhbXBsZSAlPiUgcWRhdGEoIH5hcnJfZGVsYXksIHAgPSAwLjk4LCBkYXRhID0gLikKYGBgCgpgYGB7cn0KQm9vdHN0cmFwX3RyaWFscyA8LSBkbyg1MDApKgogIHFkYXRhKH5hcnJfZGVsYXksIHAgPSAwLjk4LCBkYXRhID0gc2FtcGxlX24oT3JpZ19zYW1wbGUsIHNpemUgPSBuLCByZXBsYWNlID0gVFJVRSkpCgpCb290c3RyYXBfdHJpYWxzICU+JSBmYXZzdGF0cyh+IHF1YW50aWxlLCBkYXRhID0gLikKYGBgCgoKYGBge3J9CkJvb3RzdHJhcF90cmlhbHMgPC0gZG8oMTAwMDApKgogIHFkYXRhKH5hcnJfZGVsYXksIHAgPSAwLjk4LCBkYXRhID0gc2FtcGxlX24oT3JpZ19zYW1wbGUsIHNpemUgPSBuLCByZXBsYWNlID0gVFJVRSkpCgpCb290c3RyYXBfdHJpYWxzICU+JSBmYXZzdGF0cyh+IHF1YW50aWxlLCBkYXRhID0gLikKYGBgCgojIyBvdXRsaWVycwoKYGBge3J9ClNGICU+JSBmaWx0ZXIoYXJyX2RlbGF5ID49IDQyMCkgJT4lCiAgc2VsZWN0KG1vbnRoLCBkYXksIGRlcF9kZWxheSwgYXJyX2RlbGF5LCBjYXJyaWVyKQpgYGAKCmBgYHtyfQpTRiAlPiUgbXV0YXRlKGxvbmdfZGVsYXkgPSBhcnJfZGVsYXkgPiA2MCkgJT4lCiAgdGFsbHkofiBsb25nX2RlbGF5IHwgbW9udGgsIGRhdGEgPSAuKQpgYGAKCmBgYHtyfQpTRiAlPiUgbXV0YXRlKGxvbmdfZGVsYXkgPSBhcnJfZGVsYXkgPiA2MCkgJT4lCiAgdGFsbHkofiBsb25nX2RlbGF5IHwgY2FycmllciwgZGF0YSA9IC4pCmBgYAoK