library(dplyr)
library(ggplot2)
library(NHANES)
library(oilabs) # provides rep_sample_n()
source('create_datasets.R')
# Packages are loaded: NHANES, dplyr, ggplot2
# What are the variables in the NHANES dataset?
names(NHANES)
## [1] "ID" "SurveyYr" "Gender"
## [4] "Age" "AgeDecade" "AgeMonths"
## [7] "Race1" "Race3" "Education"
## [10] "MaritalStatus" "HHIncome" "HHIncomeMid"
## [13] "Poverty" "HomeRooms" "HomeOwn"
## [16] "Work" "Weight" "Length"
## [19] "HeadCirc" "Height" "BMI"
## [22] "BMICatUnder20yrs" "BMI_WHO" "Pulse"
## [25] "BPSysAve" "BPDiaAve" "BPSys1"
## [28] "BPDia1" "BPSys2" "BPDia2"
## [31] "BPSys3" "BPDia3" "Testosterone"
## [34] "DirectChol" "TotChol" "UrineVol1"
## [37] "UrineFlow1" "UrineVol2" "UrineFlow2"
## [40] "Diabetes" "DiabetesAge" "HealthGen"
## [43] "DaysPhysHlthBad" "DaysMentHlthBad" "LittleInterest"
## [46] "Depressed" "nPregnancies" "nBabies"
## [49] "Age1stBaby" "SleepHrsNight" "SleepTrouble"
## [52] "PhysActive" "PhysActiveDays" "TVHrsDay"
## [55] "CompHrsDay" "TVHrsDayChild" "CompHrsDayChild"
## [58] "Alcohol12PlusYr" "AlcoholDay" "AlcoholYear"
## [61] "SmokeNow" "Smoke100" "Smoke100n"
## [64] "SmokeAge" "Marijuana" "AgeFirstMarij"
## [67] "RegularMarij" "AgeRegMarij" "HardDrugs"
## [70] "SexEver" "SexAge" "SexNumPartnLife"
## [73] "SexNumPartYear" "SameSex" "SexOrientation"
## [76] "PregnantNow"
glimpse(NHANES)
## Observations: 10,000
## Variables: 76
## $ ID <int> 51624, 51624, 51624, 51625, 51630, 51638, 516...
## $ SurveyYr <fctr> 2009_10, 2009_10, 2009_10, 2009_10, 2009_10,...
## $ Gender <fctr> male, male, male, male, female, male, male, ...
## $ Age <int> 34, 34, 34, 4, 49, 9, 8, 45, 45, 45, 66, 58, ...
## $ AgeDecade <fctr> 30-39, 30-39, 30-39, 0-9, 40-49, 0-9, ...
## $ AgeMonths <int> 409, 409, 409, 49, 596, 115, 101, 541, 541, 5...
## $ Race1 <fctr> White, White, White, Other, White, White, Wh...
## $ Race3 <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Education <fctr> High School, High School, High School, NA, S...
## $ MaritalStatus <fctr> Married, Married, Married, NA, LivePartner, ...
## $ HHIncome <fctr> 25000-34999, 25000-34999, 25000-34999, 20000...
## $ HHIncomeMid <int> 30000, 30000, 30000, 22500, 40000, 87500, 600...
## $ Poverty <dbl> 1.36, 1.36, 1.36, 1.07, 1.91, 1.84, 2.33, 5.0...
## $ HomeRooms <int> 6, 6, 6, 9, 5, 6, 7, 6, 6, 6, 5, 10, 6, 10, 1...
## $ HomeOwn <fctr> Own, Own, Own, Own, Rent, Rent, Own, Own, Ow...
## $ Work <fctr> NotWorking, NotWorking, NotWorking, NA, NotW...
## $ Weight <dbl> 87.4, 87.4, 87.4, 17.0, 86.7, 29.8, 35.2, 75....
## $ Length <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ HeadCirc <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Height <dbl> 164.7, 164.7, 164.7, 105.4, 168.4, 133.1, 130...
## $ BMI <dbl> 32.22, 32.22, 32.22, 15.30, 30.57, 16.82, 20....
## $ BMICatUnder20yrs <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ BMI_WHO <fctr> 30.0_plus, 30.0_plus, 30.0_plus, 12.0_18.5, ...
## $ Pulse <int> 70, 70, 70, NA, 86, 82, 72, 62, 62, 62, 60, 6...
## $ BPSysAve <int> 113, 113, 113, NA, 112, 86, 107, 118, 118, 11...
## $ BPDiaAve <int> 85, 85, 85, NA, 75, 47, 37, 64, 64, 64, 63, 7...
## $ BPSys1 <int> 114, 114, 114, NA, 118, 84, 114, 106, 106, 10...
## $ BPDia1 <int> 88, 88, 88, NA, 82, 50, 46, 62, 62, 62, 64, 7...
## $ BPSys2 <int> 114, 114, 114, NA, 108, 84, 108, 118, 118, 11...
## $ BPDia2 <int> 88, 88, 88, NA, 74, 50, 36, 68, 68, 68, 62, 7...
## $ BPSys3 <int> 112, 112, 112, NA, 116, 88, 106, 118, 118, 11...
## $ BPDia3 <int> 82, 82, 82, NA, 76, 44, 38, 60, 60, 60, 64, 7...
## $ Testosterone <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ DirectChol <dbl> 1.29, 1.29, 1.29, NA, 1.16, 1.34, 1.55, 2.12,...
## $ TotChol <dbl> 3.49, 3.49, 3.49, NA, 6.70, 4.86, 4.09, 5.82,...
## $ UrineVol1 <int> 352, 352, 352, NA, 77, 123, 238, 106, 106, 10...
## $ UrineFlow1 <dbl> NA, NA, NA, NA, 0.094, 1.538, 1.322, 1.116, 1...
## $ UrineVol2 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ UrineFlow2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Diabetes <fctr> No, No, No, No, No, No, No, No, No, No, No, ...
## $ DiabetesAge <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ HealthGen <fctr> Good, Good, Good, NA, Good, NA, NA, Vgood, V...
## $ DaysPhysHlthBad <int> 0, 0, 0, NA, 0, NA, NA, 0, 0, 0, 10, 0, 4, NA...
## $ DaysMentHlthBad <int> 15, 15, 15, NA, 10, NA, NA, 3, 3, 3, 0, 0, 0,...
## $ LittleInterest <fctr> Most, Most, Most, NA, Several, NA, NA, None,...
## $ Depressed <fctr> Several, Several, Several, NA, Several, NA, ...
## $ nPregnancies <int> NA, NA, NA, NA, 2, NA, NA, 1, 1, 1, NA, NA, N...
## $ nBabies <int> NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA...
## $ Age1stBaby <int> NA, NA, NA, NA, 27, NA, NA, NA, NA, NA, NA, N...
## $ SleepHrsNight <int> 4, 4, 4, NA, 8, NA, NA, 8, 8, 8, 7, 5, 4, NA,...
## $ SleepTrouble <fctr> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, ...
## $ PhysActive <fctr> No, No, No, NA, No, NA, NA, Yes, Yes, Yes, Y...
## $ PhysActiveDays <int> NA, NA, NA, NA, NA, NA, NA, 5, 5, 5, 7, 5, 1,...
## $ TVHrsDay <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ CompHrsDay <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ TVHrsDayChild <int> NA, NA, NA, 4, NA, 5, 1, NA, NA, NA, NA, NA, ...
## $ CompHrsDayChild <int> NA, NA, NA, 1, NA, 0, 6, NA, NA, NA, NA, NA, ...
## $ Alcohol12PlusYr <fctr> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Ye...
## $ AlcoholDay <int> NA, NA, NA, NA, 2, NA, NA, 3, 3, 3, 1, 2, 6, ...
## $ AlcoholYear <int> 0, 0, 0, NA, 20, NA, NA, 52, 52, 52, 100, 104...
## $ SmokeNow <fctr> No, No, No, NA, Yes, NA, NA, NA, NA, NA, No,...
## $ Smoke100 <fctr> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, ...
## $ Smoke100n <fctr> Smoker, Smoker, Smoker, NA, Smoker, NA, NA, ...
## $ SmokeAge <int> 18, 18, 18, NA, 38, NA, NA, NA, NA, NA, 13, N...
## $ Marijuana <fctr> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Ye...
## $ AgeFirstMarij <int> 17, 17, 17, NA, 18, NA, NA, 13, 13, 13, NA, 1...
## $ RegularMarij <fctr> No, No, No, NA, No, NA, NA, No, No, No, NA, ...
## $ AgeRegMarij <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2...
## $ HardDrugs <fctr> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, ...
## $ SexEver <fctr> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Ye...
## $ SexAge <int> 16, 16, 16, NA, 12, NA, NA, 13, 13, 13, 17, 2...
## $ SexNumPartnLife <int> 8, 8, 8, NA, 10, NA, NA, 20, 20, 20, 15, 7, 1...
## $ SexNumPartYear <int> 1, 1, 1, NA, 1, NA, NA, 0, 0, 0, NA, 1, 1, NA...
## $ SameSex <fctr> No, No, No, NA, Yes, NA, NA, Yes, Yes, Yes, ...
## $ SexOrientation <fctr> Heterosexual, Heterosexual, Heterosexual, NA...
## $ PregnantNow <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# Create bar plot for Home Ownership by Gender
ggplot(NHANES, aes(x = Gender, fill = HomeOwn)) +
  geom_bar(position = "fill") +
  ylab("Relative frequencies")
# Density for SleepHrsNight colored by SleepTrouble, faceted by HealthGen
ggplot(NHANES, aes(x = SleepHrsNight, col = SleepTrouble)) +
  geom_density(adjust = 2) +
  facet_wrap(~ HealthGen)
We will randomly permute the observations and calculate the difference in proportions that could arise from the null distribution.
# Subset the data: homes
homes <- NHANES %>%
  select(Gender, HomeOwn) %>%
  filter(HomeOwn %in% c("Own", "Rent"))
# Perform one permutation
homes %>%
  mutate(HomeOwn_perm = sample(HomeOwn)) %>%
  group_by(Gender) %>%
  summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
            prop_own = mean(HomeOwn == "Own")) %>%
  summarize(diff_perm = diff(prop_own_perm),
            diff_orig = diff(prop_own))
## # A tibble: 1 x 2
## diff_perm diff_orig
## <dbl> <dbl>
## 1 0.0004089131 -0.007828723
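The workhorse in that pipeline is sample(): called with just a vector, it returns a random permutation of its elements, which is exactly what shuffling the response means. A minimal illustration:
# sample() with no other arguments shuffles its input
sample(c("Own", "Own", "Rent", "Rent"))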
# Perform 10 permutations
homeown_perm <- homes %>%
  rep_sample_n(size = nrow(homes), reps = 10) %>%
  mutate(HomeOwn_perm = sample(HomeOwn)) %>%
  group_by(replicate, Gender) %>%
  summarize(prop_own_perm = mean(HomeOwn_perm == 'Own'),
            prop_own = mean(HomeOwn == 'Own')) %>%
  summarize(diff_perm = diff(prop_own_perm),
            diff_orig = diff(prop_own)) # male - female
# Print differences to console
homeown_perm
## # A tibble: 10 x 3
## replicate diff_perm diff_orig
## <int> <dbl> <dbl>
## 1 1 -2.968670e-06 -0.007828723
## 2 2 6.999022e-03 -0.007828723
## 3 3 -2.968670e-06 -0.007828723
## 4 4 -8.267323e-04 -0.007828723
## 5 5 3.292086e-03 -0.007828723
## 6 6 1.644559e-03 -0.007828723
## 7 7 6.175258e-03 -0.007828723
## 8 8 1.317725e-02 -0.007828723
## 9 9 -2.886141e-03 -0.007828723
## 10 10 -1.606636e-02 -0.007828723
# Dotplot of 10 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
  geom_dotplot(binwidth = .001)
# Perform 100 permutations
homeown_perm <- homes %>%
  rep_sample_n(size = nrow(homes), reps = 100) %>%
  mutate(HomeOwn_perm = sample(HomeOwn)) %>%
  group_by(replicate, Gender) %>%
  summarize(prop_own_perm = mean(HomeOwn_perm == 'Own'),
            prop_own = mean(HomeOwn == 'Own')) %>%
  summarize(diff_perm = diff(prop_own_perm),
            diff_orig = diff(prop_own)) # male - female
glimpse(homeown_perm)
## Observations: 100
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> 0.0065871402, -0.0016504959, -0.0103000138, -0.00288...
## $ diff_orig <dbl> -0.007828723, -0.007828723, -0.007828723, -0.0078287...
# Dotplot of 100 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
  geom_dotplot(binwidth = .001)
Using 1000 permutations
# Perform 1000 permutations
homeown_perm <- homes %>%
  rep_sample_n(size = nrow(homes), reps = 1000) %>%
  mutate(HomeOwn_perm = sample(HomeOwn)) %>%
  group_by(replicate, Gender) %>%
  summarize(prop_own_perm = mean(HomeOwn_perm == 'Own'),
            prop_own = mean(HomeOwn == 'Own')) %>%
  summarize(diff_perm = diff(prop_own_perm),
            diff_orig = diff(prop_own))
glimpse(homeown_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> 0.0078227856, -0.0103000138, -0.0016504959, -0.00494...
## $ diff_orig <dbl> -0.007828723, -0.007828723, -0.007828723, -0.0078287...
# Density plot of 1000 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
  geom_dotplot(binwidth = .000824, alpha = .6) +
  geom_density(color = "blue")
I could not quite figure out how to get the dots and the density to line up perfectly, but this is close enough and shows the density fits the distribution of the resampled statistic.
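One workaround that lines them up by construction is to draw a histogram in density units, so geom_density() overlays on the same scale; a sketch:
# Histogram scaled to density units, so the density curve overlays directly
ggplot(homeown_perm, aes(x = diff_perm)) +
  geom_histogram(aes(y = ..density..), binwidth = .001, alpha = .6) +
  geom_density(color = "blue")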
glimpse(homeown_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> 0.0078227856, -0.0103000138, -0.0016504959, -0.00494...
## $ diff_orig <dbl> -0.007828723, -0.007828723, -0.007828723, -0.0078287...
# Plot permuted differences
ggplot(homeown_perm, aes(x = diff_perm)) +
  geom_dotplot(binwidth = 0.0008) +
  geom_density(col = "lightblue", size = 2) +
  geom_vline(aes(xintercept = diff_orig), col = 'red')
# Compare permuted differences to observed difference
homeown_perm %>%
  summarize(sum(diff_orig >= diff_perm))
## # A tibble: 1 x 1
## `sum(diff_orig >= diff_perm)`
## <int>
## 1 211
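Dividing that count by the number of replicates converts it into a proportion, the empirical one-sided p-value; here that is 211/1000 = 0.211:
# Proportion of permuted differences at or below the observed difference
homeown_perm %>%
  summarize(p_value = mean(diff_orig >= diff_perm))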
Here we want the proportion of women who were promoted, as opposed to the proportion of promoted individuals who were women.
The data loaded here are slightly different from those used in the class: this is a larger sample (the class had just 24 of each gender), so the variability of the sampling distribution will be tighter in this notebook.
# disc has been created
# Create a contingency table summarizing the data
table(disc)
##               sex
## promote        female male
##   not_promoted     10    3
##   promoted         14   21
glimpse(disc)
## Observations: 48
## Variables: 2
## $ promote <fctr> promoted, promoted, promoted, promoted, promoted, pro...
## $ sex <fctr> male, male, male, male, male, male, male, male, male,...
# Find proportion of each sex who were promoted
disc %>%
  group_by(sex) %>%
  summarize(promoted_prop = mean(promote == 'promoted'))
## # A tibble: 2 x 2
## sex promoted_prop
## <fctr> <dbl>
## 1 female 0.5833333
## 2 male 0.8750000
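For contrast, the proportion of promoted individuals who were female, i.e. conditioning the other way around, would group by promote instead; a sketch:
# P(female | promotion status), not the quantity of interest here
disc %>%
  group_by(promote) %>%
  summarize(prop_female = mean(sex == 'female'))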
# Sample the entire data frame 5 times
disc %>%
  rep_sample_n(size = nrow(disc), reps = 5)
## # A tibble: 240 x 3
## # Groups: replicate [5]
## replicate promote sex
## * <int> <fctr> <fctr>
## 1 1 promoted male
## 2 1 promoted male
## 3 1 promoted male
## 4 1 not_promoted female
## 5 1 not_promoted female
## 6 1 not_promoted male
## 7 1 promoted male
## 8 1 not_promoted male
## 9 1 not_promoted female
## 10 1 promoted male
## # ... with 230 more rows
# Shuffle the promote variable within replicate
disc %>%
  rep_sample_n(size = nrow(disc), reps = 5) %>%
  mutate(prom_perm = sample(promote))
## # A tibble: 240 x 4
## # Groups: replicate [5]
## replicate promote sex prom_perm
## <int> <fctr> <fctr> <fctr>
## 1 1 not_promoted male promoted
## 2 1 not_promoted male promoted
## 3 1 not_promoted female promoted
## 4 1 promoted male not_promoted
## 5 1 not_promoted female promoted
## 6 1 not_promoted male promoted
## 7 1 promoted male promoted
## 8 1 promoted male not_promoted
## 9 1 not_promoted female promoted
## 10 1 promoted male promoted
## # ... with 230 more rows
# Find the proportion of promoted in each replicate and sex
disc %>%
  rep_sample_n(size = nrow(disc), reps = 5) %>%
  mutate(prom_perm = sample(promote)) %>%
  group_by(replicate, sex) %>%
  summarize(prop_prom_perm = mean(prom_perm == 'promoted'),
            prop_prom = mean(promote == 'promoted'))
## # A tibble: 10 x 4
## # Groups: replicate [?]
## replicate sex prop_prom_perm prop_prom
## <int> <fctr> <dbl> <dbl>
## 1 1 female 0.7500000 0.5833333
## 2 1 male 0.7083333 0.8750000
## 3 2 female 0.6666667 0.5833333
## 4 2 male 0.7916667 0.8750000
## 5 3 female 0.6666667 0.5833333
## 6 3 male 0.7916667 0.8750000
## 7 4 female 0.7500000 0.5833333
## 8 4 male 0.7083333 0.8750000
## 9 5 female 0.7083333 0.5833333
## 10 5 male 0.7500000 0.8750000
# Difference in promotion rates between the sexes, within each replicate
disc %>%
  rep_sample_n(size = nrow(disc), reps = 5) %>%
  mutate(prom_perm = sample(promote)) %>%
  group_by(replicate, sex) %>%
  summarize(prop_prom_perm = mean(prom_perm == 'promoted'),
            prop_prom = mean(promote == 'promoted')) %>%
  summarize(diff_perm = diff(prop_prom_perm),
            diff_orig = diff(prop_prom)) # male - female
## # A tibble: 5 x 3
## replicate diff_perm diff_orig
## <int> <dbl> <dbl>
## 1 1 -0.04166667 0.2916667
## 2 2 0.04166667 0.2916667
## 3 3 0.20833333 0.2916667
## 4 4 0.04166667 0.2916667
## 5 5 0.04166667 0.2916667
# Create a data frame of differences in promotion rates
glimpse(disc)
## Observations: 48
## Variables: 2
## $ promote <fctr> promoted, promoted, promoted, promoted, promoted, pro...
## $ sex <fctr> male, male, male, male, male, male, male, male, male,...
disc_perm <- disc %>%
  rep_sample_n(size = nrow(disc), reps = 1000) %>%
  mutate(prom_perm = sample(promote)) %>%
  group_by(replicate, sex) %>%
  summarize(prop_prom_perm = mean(prom_perm == "promoted"),
            prop_prom = mean(promote == "promoted")) %>%
  summarize(diff_perm = diff(prop_prom_perm),
            diff_orig = diff(prop_prom)) # male - female
glimpse(disc_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> 0.04166667, 0.29166667, 0.12500000, 0.04166667, 0.12...
## $ diff_orig <dbl> 0.2916667, 0.2916667, 0.2916667, 0.2916667, 0.291666...
# Histogram of permuted differences
ggplot(disc_perm, aes(x = diff_perm)) +
  geom_histogram(binwidth = 0.01) +
  geom_vline(aes(xintercept = diff_orig), col = 'red')
# Find the 0.90, 0.95, and 0.99 quantiles of diff_perm
disc_perm %>%
  summarize(q.90 = quantile(diff_perm, p = 0.90),
            q.95 = quantile(diff_perm, p = 0.95),
            q.99 = quantile(diff_perm, p = 0.99))
## # A tibble: 1 x 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.125 0.2083333 0.2916667
# Find the 0.01, 0.05, and 0.10 quantiles of diff_perm
disc_perm %>%
  summarize(q.01 = quantile(diff_perm, p = 0.01),
            q.05 = quantile(diff_perm, p = 0.05),
            q.10 = quantile(diff_perm, p = 0.10))
## # A tibble: 1 x 3
## q.01 q.05 q.10
## <dbl> <dbl> <dbl>
## 1 -0.2916667 -0.2083333 -0.125
… It is common practice to judge a result significant, if it is of such a magnitude that it would have been produced by chance no more frequently than once in twenty trials. This is an arbitrary, but convenient, level of significance for the practical investigator, but it does not mean that he allows himself to be deceived once in every twenty experiments. The test of significance only tells him what to ignore, namely all experiments in which significant results are not obtained. He should only claim that a phenomenon is experimentally demonstrable when he knows how to design an experiment so that it will rarely fail to give a significant result. Consequently, isolated significant results which he does not know how to reproduce are left in suspense pending further investigation. - RA Fisher (1929)
disc_small <- readRDS('data/disc_small.rds')
disc_big <- readRDS('data/disc_big.rds')
# Tabulate the small and big data frames
disc_small %>%
  select(sex, promote) %>%
  table()
##         promote
## sex      not_promoted promoted
##   female            3        5
##   male              1        7
disc_big %>%
  select(sex, promote) %>%
  table()
##         promote
## sex      not_promoted promoted
##   female          100      140
##   male             30      210
disc_small_perm <- disc_small %>%
  rep_sample_n(size = nrow(disc_small), reps = 1000) %>%
  mutate(prom_perm = sample(promote)) %>%
  group_by(replicate, sex) %>%
  summarize(prop_prom_perm = mean(prom_perm == "promoted"),
            prop_prom = mean(promote == "promoted")) %>%
  summarize(diff_perm = diff(prop_prom_perm),
            diff_orig = diff(prop_prom)) # male - female
disc_big_perm <- disc_big %>%
  rep_sample_n(size = nrow(disc_big), reps = 1000) %>%
  mutate(prom_perm = sample(promote)) %>%
  group_by(replicate, sex) %>%
  summarize(prop_prom_perm = mean(prom_perm == "promoted"),
            prop_prom = mean(promote == "promoted")) %>%
  summarize(diff_perm = diff(prop_prom_perm),
            diff_orig = diff(prop_prom)) # male - female
glimpse(disc_small_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> -0.25, -0.25, 0.25, -0.25, 0.25, 0.00, -0.25, 0.00, ...
## $ diff_orig <dbl> 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25...
glimpse(disc_big_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> -0.016666667, -0.041666667, 0.058333333, 0.008333333...
## $ diff_orig <dbl> 0.2916667, 0.2916667, 0.2916667, 0.2916667, 0.291666...
# Plot the distributions of permuted differences
ggplot(disc_small_perm, aes(x = diff_perm)) +
  geom_histogram(binwidth = 0.01) +
  geom_vline(aes(xintercept = diff_orig), col = 'red')
ggplot(disc_big_perm, aes(x = diff_perm)) +
  geom_histogram(binwidth = 0.01) +
  geom_vline(aes(xintercept = diff_orig), col = 'red')
# Recall the quantiles associated with the original dataset
disc_perm %>%
  summarize(q.90 = quantile(diff_perm, p = 0.90),
            q.95 = quantile(diff_perm, p = 0.95),
            q.99 = quantile(diff_perm, p = 0.99))
## # A tibble: 1 x 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.125 0.2083333 0.2916667
# Calculate the quantiles associated with the small dataset
disc_small_perm %>%
  summarize(q.90 = quantile(diff_perm, p = 0.90),
            q.95 = quantile(diff_perm, p = 0.95),
            q.99 = quantile(diff_perm, p = 0.99))
## # A tibble: 1 x 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.25 0.25 0.5
# Calculate the quantiles associated with the big dataset
disc_big_perm %>%
  summarize(q.90 = quantile(diff_perm, p = 0.90),
            q.95 = quantile(diff_perm, p = 0.95),
            q.99 = quantile(diff_perm, p = 0.99))
## # A tibble: 1 x 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.05833333 0.06666667 0.09175
# Calculate the p-value for the original dataset
disc_perm %>%
  summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 x 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.026
# Calculate the p-value for the small dataset
disc_small_perm %>%
  summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 x 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.284
# Calculate the p-value for the big dataset
disc_big_perm %>%
  summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 x 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0
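A permutation p-value of exactly 0 only means that none of the 1000 shuffles produced a difference as extreme as the observed one. A common correction (my addition, not part of the course) is to add one to both the count and the number of replicates so the estimate is never exactly zero:
# (k + 1) / (reps + 1) never reports exactly zero
disc_big_perm %>%
  summarize(p_value = (sum(diff_orig <= diff_perm) + 1) / (n() + 1))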
disc_new <- readRDS('data/disc_new.rds')
# Recall the original data
disc %>%
  select(sex, promote) %>%
  table()
##         promote
## sex      not_promoted promoted
##   female           10       14
##   male              3       21
# Tabulate the new data
disc_new %>%
  select(sex, promote) %>%
  table()
##         promote
## sex      not_promoted promoted
##   female            7       17
##   male              6       18
disc_new_perm <- disc_new %>%
  rep_sample_n(size = nrow(disc_new), reps = 1000) %>%
  mutate(prom_perm = sample(promote)) %>%
  group_by(replicate, sex) %>%
  summarize(prop_prom_perm = mean(prom_perm == "promoted"),
            prop_prom = mean(promote == "promoted")) %>%
  summarize(diff_perm = diff(prop_prom_perm),
            diff_orig = diff(prop_prom)) # male - female
glimpse(disc_new_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> -0.12500000, 0.29166667, 0.04166667, 0.20833333, 0.0...
## $ diff_orig <dbl> 0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.04...
# Plot the distribution of the original permuted differences
ggplot(disc_perm, aes(x = diff_perm)) +
  geom_histogram() +
  geom_vline(aes(xintercept = diff_orig), col = 'red')
# Plot the distribution of the new permuted differences
ggplot(disc_new_perm, aes(x = diff_perm)) +
  geom_histogram() +
  geom_vline(aes(xintercept = diff_orig), col = 'red')
# Find the p-value from the original data
disc_perm %>%
  summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 x 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.026
# Find the p-value from the new data
disc_new_perm %>%
  summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 x 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.499
# Calculate the two-sided p-value
glimpse(disc_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> 0.04166667, 0.29166667, 0.12500000, 0.04166667, 0.12...
## $ diff_orig <dbl> 0.2916667, 0.2916667, 0.2916667, 0.2916667, 0.291666...
disc_perm %>%
  summarize(mean(diff_orig <= diff_perm) * 2)
## # A tibble: 1 x 1
## `mean(diff_orig <= diff_perm) * 2`
## <dbl>
## 1 0.052
The study
In the opportunity cost study, 150 students were randomly assigned to a control or a treatment group and asked whether they would buy a DVD; the treatment group was additionally reminded that the money could be spent on something else.
Hypotheses
H0: reminding students that they can save the money has no effect on the proportion who buy the DVD. HA: the reminder reduces the proportion who buy the DVD.
glimpse(opportunity)
## Observations: 150
## Variables: 2
## $ decision <fctr> buyDVD, buyDVD, buyDVD, buyDVD, buyDVD, buyDVD, buyD...
## $ group <fctr> control, control, control, control, control, control...
# Tabulate the data
opportunity %>%
  select(decision, group) %>%
  table()
##           group
## decision   control treatment
##   buyDVD        56        41
##   nobuyDVD      19        34
# Find the proportion who bought the DVD in each group
opportunity %>%
  group_by(group) %>%
  summarize(buy_prop = mean(decision == 'buyDVD'))
## # A tibble: 2 x 2
## group buy_prop
## <fctr> <dbl>
## 1 control 0.7466667
## 2 treatment 0.5466667
# Create a barplot
ggplot(opportunity, aes(x = group, fill = decision)) +
  geom_bar(position = "fill")
# Data frame of differences in purchase rates after permuting
opp_perm <- opportunity %>%
  rep_sample_n(size = nrow(opportunity), reps = 1000) %>%
  mutate(dec_perm = sample(decision)) %>%
  group_by(replicate, group) %>%
  summarize(prop_buy_perm = mean(dec_perm == "buyDVD"),
            prop_buy = mean(decision == "buyDVD")) %>%
  summarize(diff_perm = diff(prop_buy_perm),
            diff_orig = diff(prop_buy)) # treatment - control
glimpse(opp_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> -0.04000000, -0.06666667, -0.04000000, -0.09333333, ...
## $ diff_orig <dbl> -0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2...
# Histogram of permuted differences
ggplot(opp_perm, aes(x = diff_perm)) +
  geom_histogram(binwidth = .005) +
  geom_vline(aes(xintercept = diff_orig), col = 'red')
glimpse(opp_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> -0.04000000, -0.06666667, -0.04000000, -0.09333333, ...
## $ diff_orig <dbl> -0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2...
# Calculate the p-value
opp_perm %>%
  summarize(mean(diff_perm <= diff_orig))
## # A tibble: 1 x 1
## `mean(diff_perm <= diff_orig)`
## <dbl>
## 1 0.017
But what if we are wrong…
There are some cases where you really don't want to be wrong in one of the two ways, so you are extra careful to avoid that error. A good comparison is the judicial system: a Type I error convicts an innocent defendant, while a Type II error lets a guilty one go free.
glimpse(opp_perm)
## Observations: 1,000
## Variables: 3
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ diff_perm <dbl> -0.04000000, -0.06666667, -0.04000000, -0.09333333, ...
## $ diff_orig <dbl> -0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2...
# Calculate the two-sided p-value
opp_perm %>%
  summarize(mean(diff_perm <= diff_orig) * 2)
## # A tibble: 1 x 1
## `mean(diff_perm <= diff_orig) * 2`
## <dbl>
## 1 0.034
To investigate how much estimates of a population proportion change from sample to sample, you will set up two sampling experiments.
In the first experiment, you will simulate repeated samples from a population. In the second, you will choose a single sample from the first experiment and repeatedly resample from that sample, a method called bootstrapping. More specifically:
- Experiment 1: take many polls of 30 voters each from the population and compute \(\widehat{p}\) for each poll.
- Experiment 2: take one poll of 30 voters, then resample 30 votes from it (with replacement) many times, computing \(\widehat{p}^*\) for each resample.
It’s important to realize that the first experiment relies on knowing the population and is typically impossible in practice. The second relies only on the sample of data and is therefore easy to implement for any statistic. Fortunately, as you will see, the variability in \(\widehat{p}\), or the proportion of “successes” in a sample, is approximately the same whether we sample from the population or resample from a sample.
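At its core, one bootstrap resample just draws n observations from the sample with replacement and recomputes the statistic. A minimal base-R sketch (the 0.6 is an assumed true proportion, mirroring the population value noted later in this notebook):
n <- 30
votes <- rbinom(n, 1, 0.6)                     # a stand-in sample of 30 votes
mean(sample(votes, size = n, replace = TRUE))  # one bootstrapped p-hat*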
load('data/all_polls.RData')
glimpse(all_polls)
## Observations: 30,000
## Variables: 2
## $ poll <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ vote <int> 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, ...
# Select one poll from which to resample: one_poll
one_poll <- all_polls %>%
  filter(poll == 1) %>%
  select(vote)
glimpse(one_poll)
## Observations: 30
## Variables: 1
## $ vote <int> 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, ...
# Generate 1000 resamples of one_poll: one_poll_boot_30
one_poll_boot_30 <- one_poll %>%
  rep_sample_n(size = 30, replace = TRUE, reps = 1000)
glimpse(one_poll_boot_30)
## Observations: 30,000
## Variables: 2
## $ replicate <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ vote <int> 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1...
# Compute p-hat for each poll: ex1_props
ex1_props <- all_polls %>%
  group_by(poll) %>%
  summarize(prop_yes = mean(vote))
glimpse(ex1_props)
## Observations: 1,000
## Variables: 2
## $ poll <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ prop_yes <dbl> 0.7000000, 0.6666667, 0.6333333, 0.6333333, 0.4000000...
# Compute p-hat* for each resampled poll: ex2_props
ex2_props <- one_poll_boot_30 %>%
  summarize(prop_yes = mean(vote))
glimpse(ex2_props)
## Observations: 1,000
## Variables: 2
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ prop_yes <dbl> 0.5333333, 0.6333333, 0.7333333, 0.8333333, 0.633333...
# Compare variability of p-hat and p-hat*
ex1_props %>% summarize(sd(prop_yes))
## # A tibble: 1 x 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08683128
ex2_props %>% summarize(sd(prop_yes))
## # A tibble: 1 x 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08274726
In the previous exercise, the resamples (with replacement) were the same size as the original dataset. You originally polled 30 people, then you repeatedly resampled 30 votes from the original dataset (with replacement).
What if the original dataset was 30 observations, but you chose to resample only 3 individuals with replacement? Alternatively, what if you chose to resample 300 individuals with replacement? Let’s call these Experiment 3 and Experiment 4, respectively.
Would the variability in these resampled \(\widehat{p}^∗\) values still be a good proxy for the variability of the sampled \(\widehat{p}\) values taken from repeated samples from the population?
# Resample from one_poll with n = 3: one_poll_boot_3
one_poll_boot_3 <- one_poll %>%
  rep_sample_n(size = 3, replace = TRUE, reps = 1000)
# Resample from one_poll with n = 300: one_poll_boot_300
one_poll_boot_300 <- one_poll %>%
  rep_sample_n(size = 300, replace = TRUE, reps = 1000)
# Compute p-hat* for each resampled poll: ex3_props
ex3_props <- one_poll_boot_3 %>%
  summarize(prop_yes = mean(vote))
# Compute p-hat* for each resampled poll: ex4_props
ex4_props <- one_poll_boot_300 %>%
  summarize(prop_yes = mean(vote))
# Compare variability of p-hat* for n = 3 vs. n = 300
ex3_props %>% summarize(sd(prop_yes))
## # A tibble: 1 x 1
## `sd(prop_yes)`
## <dbl>
## 1 0.2563397
ex4_props %>% summarize(sd(prop_yes))
## # A tibble: 1 x 1
## `sd(prop_yes)`
## <dbl>
## 1 0.02625099
These standard deviations are way off from the original value of 0.0868; neither is a good estimate of the variability of \(\widehat{p}\).
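A quick back-of-the-envelope check shows why, assuming the usual formula \(SE = \sqrt{p(1-p)/n}\) with the observed \(\widehat{p}\) of 0.7 plugged in: the SE scales with the resample size, not the original sample size.
p <- 0.7                   # p-hat from the original poll
sqrt(p * (1 - p) / 3)      # ~0.265, close to the n = 3 bootstrap SD
sqrt(p * (1 - p) / 30)     # ~0.084, close to the n = 30 SDs above
sqrt(p * (1 - p) / 300)    # ~0.026, close to the n = 300 bootstrap SD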
In order to compare the variability of the sampled \(\widehat{p}\) and resampled \(\widehat{p}^*\) values in the previous exercises, it is valuable to visualize their distributions. To recall, the exercises walked through four different experiments:
- Experiment 1: 1000 samples of size 30 taken directly from the population
- Experiment 2: 1000 resamples of size 30 from a single sample
- Experiment 3: 1000 resamples of size 3 from a single sample
- Experiment 4: 1000 resamples of size 300 from a single sample
# Recall the variability of sample proportions
ex1_props %>% summarize(sd(prop_yes))
## # A tibble: 1 x 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08683128
ex2_props %>% summarize(sd(prop_yes))
## # A tibble: 1 x 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08274726
ex3_props %>% summarize(sd(prop_yes))
## # A tibble: 1 x 1
## `sd(prop_yes)`
## <dbl>
## 1 0.2563397
ex4_props %>% summarize(sd(prop_yes))
## # A tibble: 1 x 1
## `sd(prop_yes)`
## <dbl>
## 1 0.02625099
# Create smoothed density curves for all four experiments
ggplot() +
  geom_density(data = ex1_props, aes(x = prop_yes), col = "black", bw = .1) +
  geom_density(data = ex2_props, aes(x = prop_yes), col = "green", bw = .1) +
  geom_density(data = ex3_props, aes(x = prop_yes), col = "red", bw = .1) +
  geom_density(data = ex4_props, aes(x = prop_yes), col = "blue", bw = .1)
One nice property is that if the variability of the sample proportion (called the standard error, or SE) is known, then approximately 95% of \(\widehat{p}\) values (from different samples) will be within 2SE of the true population proportion.
To check whether that holds in the situation at hand, let’s go back to the polls generated by taking many samples from the same population.
# Compute proportion of votes for Candidate X: props
props <- all_polls %>%
  group_by(poll) %>%
  summarize(prop_yes = mean(vote))
glimpse(props)
## Observations: 1,000
## Variables: 2
## $ poll <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ prop_yes <dbl> 0.7000000, 0.6666667, 0.6333333, 0.6333333, 0.4000000...
# Proportion of polls within 2SE
props %>%
  mutate(lower = mean(prop_yes) - 2 * sd(prop_yes),
         upper = mean(prop_yes) + 2 * sd(prop_yes),
         in_CI = prop_yes > lower & prop_yes < upper) %>%
  summarize(mean(in_CI))
## # A tibble: 1 x 1
## `mean(in_CI)`
## <dbl>
## 1 0.966
Yep, here about 97% of the \(\widehat{p}\) values are within 2SE of the true population parameter.
The previous exercises told you two things: (1) the sample proportion \(\widehat{p}\) varies from sample to sample, and (2) roughly 95% of those \(\widehat{p}\) values fall within 2SE of the true population proportion.
Note that the rate of closeness (here 95%) refers to how often a sample is chosen so that it is close to the population parameter. You won’t ever know if a particular dataset is close to the parameter or far from it, but you do know that over your lifetime, 95% of the samples you collect should give you estimates that are within 2SE of the true population parameter.
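As a quick check of that interpretation (assuming the true proportion of 0.6 used to generate these polls, noted later in this notebook), we can measure how often \(\widehat{p}\) lands within 2SE of the actual parameter rather than of the grand mean:
# Coverage measured against the true parameter (0.6) instead of mean(prop_yes)
props %>%
  summarize(coverage = mean(abs(prop_yes - 0.6) < 2 * sd(prop_yes)))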
# Again, set the one sample that was collected
one_poll <- all_polls %>%
  filter(poll == 1) %>%
  select(vote)
# Compute p-hat from one_poll: p_hat
p_hat <- mean(one_poll$vote)
# Bootstrap to find the SE of p-hat: one_poll_boot
one_poll_boot <- one_poll %>%
  rep_sample_n(size = 30, replace = TRUE, reps = 1000) %>%
  summarize(prop_yes_boot = mean(vote))
glimpse(one_poll_boot)
## Observations: 1,000
## Variables: 2
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ prop_yes_boot <dbl> 0.7666667, 0.6000000, 0.7333333, 0.6333333, 0.56...
# Create an interval of plausible values
one_poll_boot %>%
  summarize(lower = p_hat - 2 * sd(prop_yes_boot),
            upper = p_hat + 2 * sd(prop_yes_boot))
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.5303574 0.8696426
This does contain the population parameter of 0.6. Nice.
The main idea in the previous exercise was that the distance between the original sample \(\widehat{p}\) and the resampled (or bootstrapped) \(\widehat{p}^*\) values gives a measure for how far the original \(\widehat{p}\) is from the true population proportion.
The same variability can be measured through a different mechanism. As before, if \(\widehat{p}\) is sufficiently close to the true parameter, then the resampled (bootstrapped) \(\widehat{p}^*\) values will vary in such a way that they overlap with the true parameter.
Instead of using ±2SE as a way to measure the middle 95% of the sampled \(\widehat{p}\) values, you can find the middle of the resampled \(\widehat{p}^*\) values by removing the upper and lower 2.5%. Note that this second method of constructing bootstrap intervals also gives an intuitive way for making 90% or 99% confidence intervals.
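For instance, a 90% bootstrap percentile interval simply trims 5% from each tail; a quick sketch using the same bootstrapped proportions:
# Middle 90% of the p-hat* values
one_poll_boot %>%
  summarize(q05_prop = quantile(prop_yes_boot, p = .05),
            q95_prop = quantile(prop_yes_boot, p = .95))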
glimpse(one_poll_boot)
## Observations: 1,000
## Variables: 2
## $ replicate <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ prop_yes_boot <dbl> 0.7666667, 0.6000000, 0.7333333, 0.6333333, 0.56...
# Find the 2.5% and 97.5% quantiles of the bootstrapped p-hat* values
one_poll_boot %>%
  summarize(q025_prop = quantile(prop_yes_boot, p = .025),
            q975_prop = quantile(prop_yes_boot, p = .975))
## # A tibble: 1 x 2
## q025_prop q975_prop
## <dbl> <dbl>
## 1 0.5325 0.8666667
p_hat
## [1] 0.7
# Bootstrap t-confidence interval for comparison
one_poll_boot %>%
  summarize(lower = p_hat - 2 * sd(prop_yes_boot),
            upper = p_hat + 2 * sd(prop_yes_boot))
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.5303574 0.8696426
This interval also contains the true population parameter of 0.6, and the percentile method gives us much more flexibility to create confidence intervals at any level.
We used two different ways to find the bootstrap variability in the sample data:
- the empirical rule, \(\widehat{p} \pm 2 \cdot SE\)
- the natural variability of the resampled \(\widehat{p}^*\) values (the percentile method)
These two methods give slightly different intervals, but they are consistent because they are based on the same bootstrapped sampling distribution.
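Both calculations are easy to wrap up in one place. Here is a small helper (hypothetical, not from the course) that returns both intervals from a vector of bootstrapped proportions:
# Hypothetical helper: empirical-rule and percentile intervals side by side
boot_intervals <- function(boot_props, p_hat, level = 0.95) {
  alpha <- 1 - level
  z <- qnorm(1 - alpha / 2)  # ~2 at the 95% level, matching the empirical rule
  list(t_interval = p_hat + c(-1, 1) * z * sd(boot_props),
       percentile = quantile(boot_props, probs = c(alpha / 2, 1 - alpha / 2)))
}
boot_intervals(one_poll_boot$prop_yes_boot, p_hat)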
We don't really know how far our sample statistic (the percentage of people voting for candidate X) is from the true value in the population. We do, however, know the variability of the sampling distribution that we bootstrapped, and we have shown that this variability is similar to that of running multiple polls on the true population. So the true value is likely within that distance of the sample statistic, and we can say…
We are 95% confident that the true proportion of people planning to vote for candidate X is between 0.530 and 0.870 (or, by the percentile method, between 0.533 and 0.867).
Both of these methods should work provided the technical conditions hold: the data come from a random sample that is representative of the population, and the bootstrap distribution is reasonably symmetric and bell-shaped.
Both of the methods used here will appear in later courses, along with more advanced methods for finding intervals for other parameters.
# Recall the bootstrap t-confidence interval
p_hat <- mean(one_poll$vote)
one_poll_boot %>%
  summarize(lower = p_hat - 2 * sd(prop_yes_boot),
            upper = p_hat + 2 * sd(prop_yes_boot))
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.5303574 0.8696426
# Collect a sample of 30 observations from the population
one_poll <- as.tbl(data.frame(vote = rbinom(30, 1, .6)))
# Resample the data using samples of size 300 (an incorrect strategy!)
one_poll_boot_300 <- one_poll %>%
  rep_sample_n(size = 300, replace = TRUE, reps = 1000) %>%
  summarize(prop_yes_boot = mean(vote))
# Find the endpoints of the bootstrap t-confidence interval
one_poll_boot_300 %>%
  summarize(lower = p_hat - 2 * sd(prop_yes_boot),
            upper = p_hat + 2 * sd(prop_yes_boot))
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.6452294 0.7547706
# Resample the data using samples of size 3 (an incorrect strategy!)
one_poll_boot_3 <- one_poll %>%
  rep_sample_n(size = 3, replace = TRUE, reps = 1000) %>%
  summarize(prop_yes_boot = mean(vote))
# Find the endpoints of the bootstrap t-confidence interval
one_poll_boot_3 %>%
  summarize(lower = p_hat - 2 * sd(prop_yes_boot),
            upper = p_hat + 2 * sd(prop_yes_boot))
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.1505962 1.249404
One additional element that changes the width of the confidence interval is the true parameter value.
When the true parameter is close to 0.5, the standard error of \(\widehat{p}\) is larger than when the true parameter is closer to 0 or 1. Because the standard error controls the width of a bootstrap t-confidence interval, the interval below (built from a population with a true proportion of 0.8) will be narrower.
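The claim follows from the formula for the standard error of a proportion, \(SE = \sqrt{p(1-p)/n}\), which peaks at \(p = 0.5\); a quick check:
n <- 30
sqrt(0.5 * 0.5 / n)   # ~0.091: the widest SE, at p = 0.5
sqrt(0.8 * 0.2 / n)   # ~0.073: narrower, for the p = 0.8 population used below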
# Collect 30 observations from a population with true proportion of 0.8
one_poll <- as.tbl(data.frame(vote = rbinom(n = 30, size = 1, prob = 0.8)))
# Compute p-hat of new sample: p_hat
p_hat <- mean(one_poll$vote)
# Resample the 30 observations (with replacement)
one_poll_boot <- one_poll %>%
  rep_sample_n(size = 30, replace = TRUE, reps = 1000) %>%
  summarize(prop_yes_boot = mean(vote))
# Calculate the bootstrap t-confidence interval
one_poll_boot %>%
  summarize(lower = p_hat - 2 * sd(prop_yes_boot),
            upper = p_hat + 2 * sd(prop_yes_boot))
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.6127027 0.9206307
# Calculate a 95% bootstrap percentile interval
one_poll_boot %>%
  summarize(q025_prop = quantile(prop_yes_boot, p = .025),
            q975_prop = quantile(prop_yes_boot, p = .975))
## # A tibble: 1 x 2
## q025_prop q975_prop
## <dbl> <dbl>
## 1 0.6 0.9
# Calculate a 99% bootstrap percentile interval
one_poll_boot %>%
  summarize(q005_prop = quantile(prop_yes_boot, p = .005),
            q995_prop = quantile(prop_yes_boot, p = .995))
## # A tibble: 1 x 2
## q005_prop q995_prop
## <dbl> <dbl>
## 1 0.5333333 0.9335
# Calculate a 90% bootstrap percentile interval
one_poll_boot %>%
  summarize(q05_prop = quantile(prop_yes_boot, p = .05),
            q95_prop = quantile(prop_yes_boot, p = .95))
## # A tibble: 1 x 2
## q05_prop q95_prop
## <dbl> <dbl>
## 1 0.6333333 0.9