library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(infer)
# Simulated population of 100,000 responses: 80,000 "Benefits" and
# 20,000 "Doesn't benefit".
set.seed(06151982)
global_monitor <- tibble(
  scientist_work = rep(
    c("Benefits", "Doesn't benefit"),
    times = c(80000, 20000)
  )
)

# Horizontal bar chart of the population responses.
global_monitor %>%
  ggplot(aes(x = scientist_work)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Do you believe that the work scientists do benefit people like you?",
    x = "",
    y = ""
  )

# Count of each response and its share of the population (p).
count(global_monitor, scientist_work) %>%
  mutate(p = n / sum(n))
## # A tibble: 2 x 3
## scientist_work n p
## <chr> <int> <dbl>
## 1 Benefits 80000 0.8
## 2 Doesn't benefit 20000 0.2
# Sample 1: draw 50 rows at random from the population (seeded).
set.seed(06151983)
samp1 <- sample_n(global_monitor, 50)

# Horizontal bar chart of the responses in sample 1.
samp1 %>%
  ggplot(aes(x = scientist_work)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Sample1: Do you believe that the work scientists do benefit people like you?",
    x = "",
    y = ""
  )

# Counts and within-sample proportions, with the proportion column named
# after the sample.
count(samp1, scientist_work) %>%
  mutate(Sample1 = n / sum(n))
## # A tibble: 2 x 3
## scientist_work n Sample1
## <chr> <int> <dbl>
## 1 Benefits 38 0.76
## 2 Doesn't benefit 12 0.24

# The same table again, with the proportion column named p_hat.
count(samp1, scientist_work) %>%
  mutate(p_hat = n / sum(n))
## # A tibble: 2 x 3
## scientist_work n p_hat
## <chr> <int> <dbl>
## 1 Benefits 38 0.76
## 2 Doesn't benefit 12 0.24
# Sample 2: a second draw of 50 rows, using a different seed.
set.seed(06151984)
samp2 <- sample_n(global_monitor, 50)

# Horizontal bar chart of the responses in sample 2.
samp2 %>%
  ggplot(aes(x = scientist_work)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Sample 2: Do you believe that the work scientists do benefit people like you?",
    x = "",
    y = ""
  )

# Counts and within-sample proportions for sample 2.
count(samp2, scientist_work) %>%
  mutate(Sample2 = n / sum(n))
## # A tibble: 2 x 3
## scientist_work n Sample2
## <chr> <int> <dbl>
## 1 Benefits 43 0.86
## 2 Doesn't benefit 7 0.14
# Sample 3: draw 500 rows at random from the population (seeded).
set.seed(06151985)
samp3 <- sample_n(global_monitor, 500)

# Horizontal bar chart of the responses in sample 3.
samp3 %>%
  ggplot(aes(x = scientist_work)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Sample 3: Do you believe that the work scientists do benefit people like you?",
    x = "",
    y = ""
  )

# Counts and within-sample proportions for sample 3.
count(samp3, scientist_work) %>%
  mutate(Sample3 = n / sum(n))
## # A tibble: 2 x 3
## scientist_work n Sample3
## <chr> <int> <dbl>
## 1 Benefits 415 0.83
## 2 Doesn't benefit 85 0.17

# Sample 4: draw 1000 rows at random. The seed is reset immediately before
# the draw, so the sample does not depend on the statements above.
set.seed(06151986)
samp4 <- sample_n(global_monitor, 1000)

# Horizontal bar chart of the responses in sample 4.
samp4 %>%
  ggplot(aes(x = scientist_work)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Sample 4: Do you believe that the work scientists do benefit people like you?",
    x = "",
    y = ""
  )

# Counts and within-sample proportions for sample 4.
count(samp4, scientist_work) %>%
  mutate(Sample4 = n / sum(n))
## # A tibble: 2 x 3
## scientist_work n Sample4
## <chr> <int> <dbl>
## 1 Benefits 801 0.801
## 2 Doesn't benefit 199 0.199
# Sampling distribution of p_hat ("Doesn't benefit") from 15000 samples of
# size 50, drawn with replacement from the population.
#
# The proportion is computed once per replicate with summarise() instead of
# count() followed by filter(): count() emits no row for a response that does
# not occur in a replicate, so a sample with zero "Doesn't benefit" answers
# (p_hat = 0) would silently vanish from the distribution. The result keeps
# the columns replicate, scientist_work, n, p_hat and is returned ungrouped.
set.seed(06151987)
sample_props50 <- global_monitor %>%
  rep_sample_n(size = 50, reps = 15000, replace = TRUE) %>%
  summarise(
    n = sum(scientist_work == "Doesn't benefit"),
    p_hat = n / length(scientist_work),
    .groups = "drop"
  ) %>%
  mutate(scientist_work = "Doesn't benefit") %>%
  select(replicate, scientist_work, n, p_hat)

# Histogram of the 15000 sample proportions.
ggplot(data = sample_props50, aes(x = p_hat)) +
  geom_histogram(binwidth = 0.02) +
  labs(
    x = "p_hat (Doesn't benefit)",
    title = "Sampling distribution of p_hat",
    subtitle = "Sample size = 50, Number of samples = 15000"
  )

# Five-number summary and mean of each column.
summary(sample_props50)
## replicate scientist_work n p_hat
## Min. : 1 Length:15000 Min. : 1.000 Min. :0.0200
## 1st Qu.: 3751 Class :character 1st Qu.: 8.000 1st Qu.:0.1600
## Median : 7500 Mode :character Median :10.000 Median :0.2000
## Mean : 7500 Mean : 9.974 Mean :0.1995
## 3rd Qu.:11250 3rd Qu.:12.000 3rd Qu.:0.2400
## Max. :15000 Max. :23.000 Max. :0.4600

# Normal Q-Q line of the sample proportions.
ggplot(data = sample_props50, aes(sample = p_hat)) +
  geom_line(stat = "qq")
##### Exercise 5: #####
# A smaller sampling distribution: 25 samples of size 10, drawn with
# replacement. The result has one row per sample (25 observations) and
# 4 variables: replicate, scientist_work, n and p_hat. Each observation is one
# sample, with the count and proportion of its respondents who think the work
# scientists do does not benefit them.
#
# The proportion is computed per replicate with summarise() rather than
# count() followed by filter(): count() emits no row for a response that does
# not occur in a replicate, so samples in which nobody answered
# "Doesn't benefit" were dropped and only 23 of the 25 samples remained.
# Those samples are now kept with n = 0 and p_hat = 0.
set.seed(061519888)
sample_props_small <- global_monitor %>%
  rep_sample_n(size = 10, reps = 25, replace = TRUE) %>%
  summarise(
    n = sum(scientist_work == "Doesn't benefit"),
    p_hat = n / length(scientist_work),
    .groups = "drop"
  ) %>%
  mutate(scientist_work = "Doesn't benefit") %>%
  select(replicate, scientist_work, n, p_hat)

# Histogram of the 25 sample proportions.
ggplot(data = sample_props_small, aes(x = p_hat)) +
  geom_histogram(binwidth = .04) +
  labs(
    x = "p_hat (Doesn't benefit)",
    title = "Sampling distribution of p_hat",
    subtitle = "Sample size = 10, Number of samples = 25"
  )
# Sample 7: draw 15 rows at random from the population (seeded).
set.seed(06151989)
samp7 <- sample_n(global_monitor, 15)

# Horizontal bar chart of the responses in sample 7.
samp7 %>%
  ggplot(aes(x = scientist_work)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Sample7: Do you believe that the work scientists do benefit people like you?",
    x = "",
    y = ""
  )

# Counts and within-sample proportions for sample 7.
count(samp7, scientist_work) %>%
  mutate(Sample7 = n / sum(n))
## # A tibble: 2 x 3
## scientist_work n Sample7
## <chr> <int> <dbl>
## 1 Benefits 13 0.867
## 2 Doesn't benefit 2 0.133
# Sampling distribution of p_hat ("Benefits") from 2000 samples of size 15,
# drawn with replacement from the population.
#
# The proportion is computed once per replicate with summarise() instead of
# count() followed by filter(): count() emits no row for a response that does
# not occur in a replicate, so a sample with zero "Benefits" answers would
# silently vanish rather than appear with p_hat = 0. The result keeps the
# columns replicate, scientist_work, n, p_hat and is returned ungrouped.
set.seed(06151990)
sample_props15 <- global_monitor %>%
  rep_sample_n(size = 15, reps = 2000, replace = TRUE) %>%
  summarise(
    n = sum(scientist_work == "Benefits"),
    p_hat = n / length(scientist_work),
    .groups = "drop"
  ) %>%
  mutate(scientist_work = "Benefits") %>%
  select(replicate, scientist_work, n, p_hat)

# Histogram of the 2000 sample proportions.
ggplot(data = sample_props15, aes(x = p_hat)) +
  geom_histogram(binwidth = 0.06) +
  labs(
    x = "p_hat (Benefits)",
    title = "Sampling distribution of p_hat",
    subtitle = "Sample size = 15, Number of samples = 2000"
  )

# Normal Q-Q line of the sample proportions.
ggplot(data = sample_props15, aes(sample = p_hat)) +
  geom_line(stat = "qq")

# Five-number summary and mean of each column.
summary(sample_props15)
## replicate scientist_work n p_hat
## Min. : 1.0 Length:2000 Min. : 7 Min. :0.4667
## 1st Qu.: 500.8 Class :character 1st Qu.:11 1st Qu.:0.7333
## Median :1000.5 Mode :character Median :12 Median :0.8000
## Mean :1000.5 Mean :12 Mean :0.7998
## 3rd Qu.:1500.2 3rd Qu.:13 3rd Qu.:0.8667
## Max. :2000.0 Max. :15 Max. :1.0000
# Sampling distribution of p_hat ("Benefits") from 2000 samples of size 150,
# drawn with replacement from the population.
#
# The proportion is computed once per replicate with summarise() instead of
# count() followed by filter(): count() emits no row for a response that does
# not occur in a replicate, so a sample with zero "Benefits" answers would
# silently vanish rather than appear with p_hat = 0. The result keeps the
# columns replicate, scientist_work, n, p_hat and is returned ungrouped.
set.seed(06151991)
sample_props150 <- global_monitor %>%
  rep_sample_n(size = 150, reps = 2000, replace = TRUE) %>%
  summarise(
    n = sum(scientist_work == "Benefits"),
    p_hat = n / length(scientist_work),
    .groups = "drop"
  ) %>%
  mutate(scientist_work = "Benefits") %>%
  select(replicate, scientist_work, n, p_hat)

# Histogram of the 2000 sample proportions.
ggplot(data = sample_props150, aes(x = p_hat)) +
  geom_histogram(binwidth = 0.02) +
  labs(
    x = "p_hat (Benefits)",
    title = "Sampling distribution of p_hat",
    subtitle = "Sample size = 150, Number of samples = 2000"
  )

# Normal Q-Q line of the sample proportions.
ggplot(data = sample_props150, aes(sample = p_hat)) +
  geom_line(stat = "qq")

# Five-number summary and mean of each column.
summary(sample_props150)
## replicate scientist_work n p_hat
## Min. : 1.0 Length:2000 Min. :102.0 Min. :0.680
## 1st Qu.: 500.8 Class :character 1st Qu.:117.0 1st Qu.:0.780
## Median :1000.5 Mode :character Median :120.0 Median :0.800
## Mean :1000.5 Mean :120.1 Mean :0.801
## 3rd Qu.:1500.2 3rd Qu.:123.0 3rd Qu.:0.820
## Max. :2000.0 Max. :135.0 Max. :0.900