This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Warning: package 'openintro' was built under R version 4.1.2
## Loading required package: airports
## Warning: package 'airports' was built under R version 4.1.2
## Loading required package: cherryblossom
## Warning: package 'cherryblossom' was built under R version 4.1.2
## Loading required package: usdata
## Warning: package 'usdata' was built under R version 4.1.2
library(infer)
## Warning: package 'infer' was built under R version 4.1.2
global_monitor <- tibble(
scientist_work = c(rep("Benefits", 80000), rep("Doesn't benefit", 20000))
)
global_monitor
## # A tibble: 100,000 x 1
## scientist_work
## <chr>
## 1 Benefits
## 2 Benefits
## 3 Benefits
## 4 Benefits
## 5 Benefits
## 6 Benefits
## 7 Benefits
## 8 Benefits
## 9 Benefits
## 10 Benefits
## # ... with 99,990 more rows
ggplot(global_monitor, aes(x = scientist_work)) +
geom_bar() +
labs(
x = "", y = "",
title = "Do you believe that the work scientists do benefit people like you?"
) +
coord_flip()
global_monitor %>%
count(scientist_work) %>%
mutate(p = n /sum(n))
## # A tibble: 2 x 3
## scientist_work n p
## <chr> <int> <dbl>
## 1 Benefits 80000 0.8
## 2 Doesn't benefit 20000 0.2
samp1 <- global_monitor %>%
sample_n(50)
samp1
## # A tibble: 50 x 1
## scientist_work
## <chr>
## 1 Benefits
## 2 Benefits
## 3 Benefits
## 4 Benefits
## 5 Benefits
## 6 Doesn't benefit
## 7 Benefits
## 8 Benefits
## 9 Benefits
## 10 Doesn't benefit
## # ... with 40 more rows
samp1 %>%
count(scientist_work) %>%
mutate(p_hat = n /sum(n))
## # A tibble: 2 x 3
## scientist_work n p_hat
## <chr> <int> <dbl>
## 1 Benefits 41 0.82
## 2 Doesn't benefit 9 0.18
ggplot(samp1, aes(x = scientist_work)) +
geom_bar(fill = "blue") +
labs(
x = "", y = "",
title = "Do you believe that the work scientists do benefit people like you?"
) +
coord_flip()
Exercise 2: # Would you expect the sample proportion to match the sample proportion of another student’s sample? # Why, or why not? If the answer is no, would you expect the proportions to be somewhat different # or very different?
# I would expect the sample proportion to not match the the sample proportion of another student’s sample.
# Because each time we take random sample from the population which is a large pool of 100000.
# This will bring a different sample data each time.
Exercise 3: # Take a second sample, also of size 50, and call it samp2. How does the sample proportion of samp2 # compare with that of samp1? Suppose we took two more samples, one of size 100 and one of size 1000. # Which would you think would provide a more accurate estimate of the population proportion?
samp2 <- global_monitor %>%
sample_n(50)
samp2 %>%
count(scientist_work) %>%
mutate(p_hat = n /sum(n))
## # A tibble: 2 x 3
## scientist_work n p_hat
## <chr> <int> <dbl>
## 1 Benefits 41 0.82
## 2 Doesn't benefit 9 0.18
ggplot(samp2, aes(x = scientist_work)) +
geom_bar(fill = "purple") +
labs(
x = "", y = "",
title = "Do you believe that the work scientists do benefit people like you?"
) +
coord_flip()
# The data is a bit different in samp2, with more people in group ‘doesn’t benefit’.
# Also, the sample size is so small it also has a large impact on the p_hat.
# sample 100
samp3 <- global_monitor %>%
sample_n(100)
samp3 %>%
count(scientist_work) %>%
mutate(p3 = n /sum(n))
## # A tibble: 2 x 3
## scientist_work n p3
## <chr> <int> <dbl>
## 1 Benefits 86 0.86
## 2 Doesn't benefit 14 0.14
ggplot(samp3, aes(x = scientist_work)) +
geom_bar(position = position_dodge(), width = 0.5, fill = "blue" ) +
coord_flip() +
ggtitle("Do you believe that the work scientists do benefit sample 0f 100?") +
xlab("") + ylab("")
# sample 1000
samp4 <- global_monitor %>%
sample_n(1000)
samp4 %>%
count(scientist_work) %>%
mutate(p4 = n /sum(n))
## # A tibble: 2 x 3
## scientist_work n p4
## <chr> <int> <dbl>
## 1 Benefits 806 0.806
## 2 Doesn't benefit 194 0.194
ggplot(samp4, aes(x = scientist_work)) +
geom_bar(position = position_dodge(), width = 0.5, fill = "green" ) +
coord_flip() +
ggtitle("Do you believe that the work scientists do benefit sample 0f 10000?") +
xlab("") + ylab("")
# Everytime we take a different size sample, we get different proportion of results. The distribution of sample # proportions, called the sampling distribution (of the proportion), can help us understand this variability.
# In this lab, because you have access to the population, you can build up the sampling distribution for the
# sample proportion by repeating the above steps many times.
sample_props50 <- global_monitor %>%
rep_sample_n(size = 50, reps = 15000, replace = TRUE) %>%
count(scientist_work) %>%
mutate(p_hat = n /sum(n)) %>%
filter(scientist_work == "Doesn't benefit")
sample_props50
## # A tibble: 14,998 x 4
## # Groups: replicate [14,998]
## replicate scientist_work n p_hat
## <int> <chr> <int> <dbl>
## 1 1 Doesn't benefit 14 0.28
## 2 2 Doesn't benefit 11 0.22
## 3 3 Doesn't benefit 10 0.2
## 4 4 Doesn't benefit 8 0.16
## 5 5 Doesn't benefit 11 0.22
## 6 6 Doesn't benefit 5 0.1
## 7 7 Doesn't benefit 5 0.1
## 8 8 Doesn't benefit 15 0.3
## 9 9 Doesn't benefit 12 0.24
## 10 10 Doesn't benefit 8 0.16
## # ... with 14,988 more rows
ggplot(data = sample_props50, aes(x = p_hat)) +
geom_histogram(binwidth = 0.02, fill = "blue") +
labs(
x = "p_hat (Doesn't benefit)",
title = "Sampling distribution of p_hat",
subtitle = "Sample size = 50, Number of samples = 15000"
)
Exercise 4: # How many elements are there in sample_props50? Describe the sampling distribution, # and be sure to specifically note its center. Make sure to include a plot # of the distribution in your answer.
# There are 4 elements in sample_props50.
# 15000 samples were taken. This plot looks mostly normally distributed, with a slight skew to the right.
# The center seems to peak around .2 what looks consistent with the population distribution.
rep_sample_n <- global_monitor %>% rep_sample_n(size = 10, reps = 25, replace = TRUE) %>%
count(scientist_work) %>%
mutate(p_hat = n /sum(n)) %>%
filter(scientist_work == "Doesn't benefit")
rep_sample_n
## # A tibble: 23 x 4
## # Groups: replicate [23]
## replicate scientist_work n p_hat
## <int> <chr> <int> <dbl>
## 1 1 Doesn't benefit 3 0.3
## 2 2 Doesn't benefit 2 0.2
## 3 3 Doesn't benefit 3 0.3
## 4 4 Doesn't benefit 1 0.1
## 5 5 Doesn't benefit 1 0.1
## 6 6 Doesn't benefit 3 0.3
## 7 7 Doesn't benefit 1 0.1
## 8 8 Doesn't benefit 1 0.1
## 9 9 Doesn't benefit 3 0.3
## 10 10 Doesn't benefit 2 0.2
## # ... with 13 more rows
ggplot(data = rep_sample_n, aes(x = p_hat)) +
geom_histogram(binwidth = 0.02, fill = "purple")
# Sample size and the sampling distribution: # we used the rep_sample_n function: to compute a sampling distribution, specifically, # the sampling distribution of the proportions from samples of 50 people.
Exercise 6: # Use the app below to create sampling distributions of proportions of Doesn’t benefit from samples of size 10, # 50, and 100. Use 5,000 simulations. What does each observation in the sampling distribution represent? # How does the mean, standard error, and shape of the sampling distribution change as the sample size increases? # How (if at all) do these values change if you increase the number of simulations? (You do not need to include # plots in your answer.)
sample_app10 <- global_monitor %>% rep_sample_n(size = 10, reps = 5000, replace = TRUE) %>%
count(scientist_work) %>%
mutate(p_hat = n /sum(n)) %>%
filter(scientist_work == "Doesn't benefit")
sample_app10
## # A tibble: 4,469 x 4
## # Groups: replicate [4,469]
## replicate scientist_work n p_hat
## <int> <chr> <int> <dbl>
## 1 1 Doesn't benefit 3 0.3
## 2 2 Doesn't benefit 1 0.1
## 3 3 Doesn't benefit 5 0.5
## 4 4 Doesn't benefit 4 0.4
## 5 5 Doesn't benefit 3 0.3
## 6 6 Doesn't benefit 1 0.1
## 7 7 Doesn't benefit 1 0.1
## 8 8 Doesn't benefit 3 0.3
## 9 9 Doesn't benefit 2 0.2
## 10 11 Doesn't benefit 3 0.3
## # ... with 4,459 more rows
ggplot(data = sample_app10 , aes(x = p_hat)) +
geom_histogram(binwidth = 0.02, fill = "blue") +
labs(
x = "p_hat (Does not benefit)",
title = "Sampling distribution of Sampling distribution of people who believe scientist work does not Benefit their lives",
subtitle = "Sample size = 10, Number of samples = 5000"
)
# Smaller the sample size , the spread in the distribution response will get wider. The sampling distribution
# shows what could be a right skewed distribution. As sample size increase, the spread in the distribution
# response will get narrowed , and the sampling distribution shows a distribution centered at 20%
# which is the true estimate of the oginal population
summary(sample_app10) # mean 22.35%
## replicate scientist_work n p_hat
## Min. : 1 Length:4469 Min. :1.000 Min. :0.1000
## 1st Qu.:1236 Class :character 1st Qu.:1.000 1st Qu.:0.1000
## Median :2486 Mode :character Median :2.000 Median :0.2000
## Mean :2492 Mean :2.251 Mean :0.2251
## 3rd Qu.:3746 3rd Qu.:3.000 3rd Qu.:0.3000
## Max. :5000 Max. :8.000 Max. :0.8000
sample_app50 <- global_monitor %>%
rep_sample_n(size = 50, reps = 5000, replace = TRUE) %>%
count(scientist_work) %>%
mutate(p_hat = n /sum(n)) %>%
filter(scientist_work == "Does not benefit")
# ggplot(data = sample_props_small, aes(x = p_hat)) +
# geom_histogram(binwidth = 0.02)
ggplot(data = sample_app50 , aes(x = p_hat)) +
geom_histogram(binwidth = 0.02, fill = "blue") +
labs(
x = "p_hat (Does not benefit)",
title = "Sampling distribution of Sampling distribution of people who believe scientist work does not Benefit their lives",
subtitle = "Sample size = 50, Number of samples = 5000"
)
summary(sample_app50) # mean = 19.96%
## replicate scientist_work n p_hat
## Min. : NA Length:0 Min. : NA Min. : NA
## 1st Qu.: NA Class :character 1st Qu.: NA 1st Qu.: NA
## Median : NA Mode :character Median : NA Median : NA
## Mean :NaN Mean :NaN Mean :NaN
## 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
## Max. : NA Max. : NA Max. : NA
samp15 <- global_monitor %>%
sample_n(15)
samp15 %>%
count(scientist_work) %>%
mutate(p15 = n /sum(n))
## # A tibble: 2 x 3
## scientist_work n p15
## <chr> <int> <dbl>
## 1 Benefits 11 0.733
## 2 Doesn't benefit 4 0.267
ggplot(samp15, aes(x = scientist_work)) +
geom_bar(position = position_dodge(), width = 0.5, fill = "blue" ) +
coord_flip() +
ggtitle("Do you believe that the work scientists do benefit people like you from sample 0f 15?") +
xlab("Element of study") + ylab("sample15")
Exercise 8: # Since you have access to the population, simulate the sampling distribution of proportion of those who # think the work scientists do enchances their lives for samples of size 15 by taking 2000 samples from # the population of size 15 and computing 2000 sample proportions. Store these proportions in as sample_props15. # Plot the data, then describe the shape of this sampling distribution. Based on this sampling distribution, # what would you guess the true proportion of those who think the work scientists do enchances their lives to be? # Finally, calculate and report the population proportion.
sample_props15 <- global_monitor %>%
rep_sample_n(size = 15, reps = 2000, replace = TRUE) %>%
count(scientist_work) %>%
mutate(p_hat = n /sum(n)) %>%
filter(scientist_work == "Benefits")
# ggplot(data = sample_props_small, aes(x = p_hat)) +
# geom_histogram(binwidth = 0.02)
ggplot(data = sample_props15 , aes(x = p_hat)) +
geom_histogram(binwidth = 0.02, fill = "blue") +
labs(
x = "p_hat (Benefits)",
title = "Sampling distribution of people who believe scientist work Benefits their lives",
subtitle = "Sample size = 15, Number of samples = 2000"
)
summary(sample_props15)
## replicate scientist_work n p_hat
## Min. : 1.0 Length:2000 Min. : 6.00 Min. :0.4000
## 1st Qu.: 500.8 Class :character 1st Qu.:11.00 1st Qu.:0.7333
## Median :1000.5 Mode :character Median :12.00 Median :0.8000
## Mean :1000.5 Mean :11.99 Mean :0.7997
## 3rd Qu.:1500.2 3rd Qu.:13.00 3rd Qu.:0.8667
## Max. :2000.0 Max. :15.00 Max. :1.0000
sample_props150 <- global_monitor %>%
rep_sample_n(size = 150, reps = 2000, replace = TRUE) %>%
count(scientist_work) %>%
mutate(p_hat = n /sum(n)) %>%
filter(scientist_work == "Benefits")
# ggplot(data = sample_props_small, aes(x = p_hat)) +
# geom_histogram(binwidth = 0.02)
ggplot(data = sample_props150 , aes(x = p_hat)) +
geom_histogram(binwidth = 0.02, fill = "blue") +
labs(
x = "p_hat (Benefits)",
title = "Sampling distribution of people who believe scientist work Benefits their lives",
subtitle = "Sample size = 150, Number of samples = 2000"
)
summary(sample_props150)
## replicate scientist_work n p_hat
## Min. : 1.0 Length:2000 Min. :104.0 Min. :0.6933
## 1st Qu.: 500.8 Class :character 1st Qu.:117.0 1st Qu.:0.7800
## Median :1000.5 Mode :character Median :120.0 Median :0.8000
## Mean :1000.5 Mean :119.9 Mean :0.7991
## 3rd Qu.:1500.2 3rd Qu.:123.0 3rd Qu.:0.8200
## Max. :2000.0 Max. :136.0 Max. :0.9067
# Pratice to take a sampling distribution with a small spread.
# This also mean getting a sample of large size.