## Warning: package 'tidyverse' was built under R version 4.0.5
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.4
## Warning: package 'forcats' was built under R version 4.0.5
## Warning: package 'moderndive' was built under R version 4.0.5
## Rows: 463
## Columns: 4
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,~
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4.5, 4.~
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333, 3.333,~
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, 40, 40~
## # A tibble: 5 x 4
## ID score bty_avg age
## <int> <dbl> <dbl> <int>
## 1 400 4.4 2.83 57
## 2 252 4.4 3.17 52
## 3 332 3.9 2.33 64
## 4 56 4.6 5.5 47
## 5 69 4.2 4.83 42
| Name | Piped data |
| Number of rows | 463 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ID | 0 | 1 | 232.00 | 133.80 | 1.00 | 116.50 | 232.00 | 347.5 | 463.00 | ▇▇▇▇▇ |
| score | 0 | 1 | 4.17 | 0.54 | 2.30 | 3.80 | 4.30 | 4.6 | 5.00 | ▁▁▅▇▇ |
| bty_avg | 0 | 1 | 4.42 | 1.53 | 1.67 | 3.17 | 4.33 | 5.5 | 8.17 | ▃▇▇▃▂ |
| age | 0 | 1 | 48.37 | 9.80 | 29.00 | 42.00 | 48.00 | 57.0 | 73.00 | ▅▆▇▆▁ |
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.187
## # A tibble: 1 x 1
## correlation
## <dbl>
## 1 0.187
ggplot(evals_ch5, aes(x = bty_avg, y = score)) +
geom_point() +
labs(x = "Beauty Score", y = "Teaching Score",
title = "Relationship between teaching and beauty scores") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
#Build a model
score_model <- lm(score ~ bty_avg, data = evals_ch5)
# Get regression table
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
# Fit regression model:
score_model <- lm(score ~ bty_avg, data = evals_ch5)
# Get regression table:
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
score_model <- lm(score ~ bty_avg, data = evals_ch5)
get_regression_points(score_model)
## # A tibble: 463 x 5
## ID score bty_avg score_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486
## 2 2 4.1 5 4.21 -0.114
## 3 3 3.9 5 4.21 -0.314
## 4 4 4.8 5 4.21 0.586
## 5 5 4.6 3 4.08 0.52
## 6 6 4.3 3 4.08 0.22
## 7 7 2.8 3 4.08 -1.28
## 8 8 4.1 3.33 4.10 -0.002
## 9 9 3.4 3.33 4.10 -0.702
## 10 10 4.5 3.17 4.09 0.409
## # ... with 453 more rows
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.0.5
gapminder2007 <- gapminder %>%
filter(year == 2007) %>%
select(country, lifeExp, continent, gdpPercap)
glimpse(gapminder2007)
## Rows: 142
## Columns: 4
## $ country <fct> "Afghanistan", "Albania", "Algeria", "Angola", "Argentina", ~
## $ lifeExp <dbl> 43.828, 76.423, 72.301, 42.731, 75.320, 81.235, 79.829, 75.6~
## $ continent <fct> Asia, Europe, Africa, Africa, Americas, Oceania, Europe, Asi~
## $ gdpPercap <dbl> 974.5803, 5937.0295, 6223.3675, 4797.2313, 12779.3796, 34435~
gapminder2007 %>% sample_n(size = 5)
## # A tibble: 5 x 4
## country lifeExp continent gdpPercap
## <fct> <dbl> <fct> <dbl>
## 1 Sao Tome and Principe 65.5 Africa 1598.
## 2 Colombia 72.9 Americas 7007.
## 3 Equatorial Guinea 51.6 Africa 12154.
## 4 Mozambique 42.1 Africa 824.
## 5 Eritrea 58.0 Africa 641.
gapminder2007 %>% skim()
| Name | Piped data |
| Number of rows | 142 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| factor | 2 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| country | 0 | 1 | FALSE | 142 | Afg: 1, Alb: 1, Alg: 1, Ang: 1 |
| continent | 0 | 1 | FALSE | 5 | Afr: 52, Asi: 33, Eur: 30, Ame: 25 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| lifeExp | 0 | 1 | 67.01 | 12.07 | 39.61 | 57.16 | 71.94 | 76.41 | 82.60 | ▂▃▃▆▇ |
| gdpPercap | 0 | 1 | 11680.07 | 12859.94 | 277.55 | 1624.84 | 6124.37 | 18008.84 | 49357.19 | ▇▂▁▂▁ |
ggplot(gapminder2007, aes(x = lifeExp)) +
geom_histogram(binwidth = 5, color = "white") +
labs(x = "Life expectancy",
y = "Number of countries",
title = "Histogram of distribution of worldwide life expectancies") +
facet_wrap(~ continent, nrow = 2)
ggplot(gapminder2007, aes(x = continent, y = lifeExp)) +
geom_boxplot() +
labs(x = "Continent", y = "Life expectancy",
title = "Life expectancy by continent")
lifeExp_model <- lm(data = gapminder2007, lifeExp ~ continent)
get_regression_summaries(lifeExp_model)
## # A tibble: 1 x 9
## r_squared adj_r_squared mse rmse sigma statistic p_value df nobs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.635 0.625 52.8 7.26 7.40 59.7 0 4 142
get_regression_table(lifeExp_model)
## # A tibble: 5 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 54.8 1.02 53.4 0 52.8 56.8
## 2 continent: Americas 18.8 1.8 10.4 0 15.2 22.4
## 3 continent: Asia 15.9 1.65 9.68 0 12.7 19.2
## 4 continent: Europe 22.8 1.70 13.5 0 19.5 26.2
## 5 continent: Oceania 25.9 5.33 4.86 0 15.4 36.4
get_regression_points(lifeExp_model)
## # A tibble: 142 x 5
## ID lifeExp continent lifeExp_hat residual
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 43.8 Asia 70.7 -26.9
## 2 2 76.4 Europe 77.6 -1.23
## 3 3 72.3 Africa 54.8 17.5
## 4 4 42.7 Africa 54.8 -12.1
## 5 5 75.3 Americas 73.6 1.71
## 6 6 81.2 Oceania 80.7 0.516
## 7 7 79.8 Europe 77.6 2.18
## 8 8 75.6 Asia 70.7 4.91
## 9 9 64.1 Asia 70.7 -6.67
## 10 10 79.4 Europe 77.6 1.79
## # ... with 132 more rows
gdpPerCap_model <- lm(data = gapminder2007, gdpPercap ~ continent)
get_regression_table(gdpPerCap_model)
## # A tibble: 5 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3089. 1373. 2.25 0.026 375. 5804.
## 2 continent: Americas 7914. 2409. 3.28 0.001 3150. 12678.
## 3 continent: Asia 9384. 2203. 4.26 0 5027. 13741.
## 4 continent: Europe 21965. 2270. 9.68 0 17478. 26453.
## 5 continent: Oceania 26721. 7133. 3.75 0 12616. 40826.
get_regression_points(gdpPerCap_model, ID = "country")
## # A tibble: 142 x 5
## country gdpPercap continent gdpPercap_hat residual
## <fct> <dbl> <fct> <dbl> <dbl>
## 1 Afghanistan 975. Asia 12473. -11498.
## 2 Albania 5937. Europe 25054. -19117.
## 3 Algeria 6223. Africa 3089. 3134.
## 4 Angola 4797. Africa 3089. 1708.
## 5 Argentina 12779. Americas 11003. 1776.
## 6 Australia 34435. Oceania 29810. 4625.
## 7 Austria 36126. Europe 25054. 11072.
## 8 Bahrain 29796. Asia 12473. 17323.
## 9 Bangladesh 1391. Asia 12473. -11082.
## 10 Belgium 33693. Europe 25054. 8638.
## # ... with 132 more rows
get_regression_summaries(gdpPerCap_model)
## # A tibble: 1 x 9
## r_squared adj_r_squared mse rmse sigma statistic p_value df nobs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.424 0.407 94538944. 9723. 9899. 25.2 0 4 142
library(moderndive)
library(dplyr)
library(ggplot2)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
evals_ch6 <- evals %>% select(ID, score, age, gender)
glimpse(evals_ch6)
## Rows: 463
## Columns: 4
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ~
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4.5, 4.6~
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, 40, 40,~
## $ gender <fct> female, female, female, female, male, male, male, male, male, f~
evals_ch6 %>% sample_n(size = 5)
## # A tibble: 5 x 4
## ID score age gender
## <int> <dbl> <int> <fct>
## 1 301 4.4 43 female
## 2 105 4.8 46 female
## 3 442 3.6 61 male
## 4 78 3.6 49 male
## 5 323 4.6 52 female
evals_ch6 %>% skim()
| Name | Piped data |
| Number of rows | 463 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| gender | 0 | 1 | FALSE | 2 | mal: 268, fem: 195 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ID | 0 | 1 | 232.00 | 133.80 | 1.0 | 116.5 | 232.0 | 347.5 | 463 | ▇▇▇▇▇ |
| score | 0 | 1 | 4.17 | 0.54 | 2.3 | 3.8 | 4.3 | 4.6 | 5 | ▁▁▅▇▇ |
| age | 0 | 1 | 48.37 | 9.80 | 29.0 | 42.0 | 48.0 | 57.0 | 73 | ▅▆▇▆▁ |
evals_ch6 %>% get_correlation(formula = score ~ age)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 -0.107
theme_set(theme_light())
ggplot(evals_ch6, aes(x = age, y = score, color = gender)) +
geom_point() +
labs(x = "Age", y = "Teaching Score", color = "Gender") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
score_model_interaction <- lm(data = evals_ch6, score ~ age*gender)
# Get regression table
get_regression_table(score_model_interaction)
## # A tibble: 4 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.88 0.205 23.8 0 4.48 5.29
## 2 age -0.018 0.004 -3.92 0 -0.026 -0.009
## 3 gender: male -0.446 0.265 -1.68 0.094 -0.968 0.076
## 4 age:gendermale 0.014 0.006 2.45 0.015 0.003 0.024
library(ISLR)
credit_ch6 <- Credit %>% as_tibble() %>%
select(ID, debt = Balance, credit_limit = Limit,
income = Income, credit_rating = Rating, age = Age)
glimpse(credit_ch6)
## Rows: 400
## Columns: 6
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1~
## $ debt <int> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, 1407~
## $ credit_limit <int> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300, 68~
## $ income <dbl> 14.891, 106.025, 104.593, 148.924, 55.882, 80.180, 20.99~
## $ credit_rating <int> 283, 483, 514, 681, 357, 569, 259, 512, 266, 491, 589, 1~
## $ age <int> 34, 82, 71, 36, 68, 77, 37, 87, 66, 41, 30, 64, 57, 49, ~
credit_ch6 %>% sample_n(size = 5)
## # A tibble: 5 x 6
## ID debt credit_limit income credit_rating age
## <int> <int> <int> <dbl> <int> <int>
## 1 36 419 2558 23.4 220 49
## 2 247 199 3211 19.6 265 59
## 3 353 583 7140 104. 507 41
## 4 269 0 1349 26.0 142 82
## 5 274 1255 4706 16.8 353 48
credit_ch6 %>% select(debt, credit_limit, income) %>% skim()
| Name | Piped data |
| Number of rows | 400 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| debt | 0 | 1 | 520.02 | 459.76 | 0.00 | 68.75 | 459.50 | 863.00 | 1999.00 | ▇▅▃▂▁ |
| credit_limit | 0 | 1 | 4735.60 | 2308.20 | 855.00 | 3088.00 | 4622.50 | 5872.75 | 13913.00 | ▆▇▃▁▁ |
| income | 0 | 1 | 45.22 | 35.24 | 10.35 | 21.01 | 33.12 | 57.47 | 186.63 | ▇▂▁▁▁ |
credit_ch6 %>% get_correlation(debt ~ credit_limit)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.862
credit_ch6 %>% get_correlation(debt ~ income)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.464
credit_ch6 %>%
select(debt, credit_limit, income) %>%
cor()
## debt credit_limit income
## debt 1.0000000 0.8616973 0.4636565
## credit_limit 0.8616973 1.0000000 0.7920883
## income 0.4636565 0.7920883 1.0000000
A lot of you might have asked yourselves: “Why would I force the lines to have parallel slopes (as seen in the right-hand plot) when they clearly have different slopes (as seen in the left-hand plot)?”.
The answer lies in a philosophical principle known as “Occam’s Razor.” It states that, “all other things being equal, simpler solutions are more likely to be correct than complex ones.” When viewed in a modeling framework, Occam’s Razor can be restated as, “all other things being equal, simpler models are to be preferred over complex ones.” In other words, we should only favor the more complex model if the additional complexity is warranted.
tactile_prop_red
## # A tibble: 33 x 4
## group replicate red_balls prop_red
## <chr> <int> <int> <dbl>
## 1 Ilyas, Yohan 1 21 0.42
## 2 Morgan, Terrance 2 17 0.34
## 3 Martin, Thomas 3 21 0.42
## 4 Clark, Frank 4 21 0.42
## 5 Riddhi, Karina 5 18 0.36
## 6 Andrew, Tyler 6 19 0.38
## 7 Julia 7 19 0.38
## 8 Rachel, Lauren 8 11 0.22
## 9 Daniel, Caroline 9 15 0.3
## 10 Josh, Maeve 10 17 0.34
## # ... with 23 more rows
ggplot(tactile_prop_red, aes(x = prop_red)) +
geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
labs(x = "Proportion of 50 balls that were red",
title = "Distribution of 33 proportions red")
# Segment 1: sample size = 25 ------------------------------
# 1.a) Virtually use shovel 1000 times
virtual_samples_25 <- bowl %>%
rep_sample_n(size = 25, reps = 1000)
# 1.b) Compute resulting 1000 replicates of proportion red
virtual_prop_red_25 <- virtual_samples_25 %>%
group_by(replicate) %>%
summarize(red = sum(color == "red")) %>%
mutate(prop_red = red / 25)
# 1.c) Plot distribution via a histogram
ggplot(virtual_prop_red_25, aes(x = prop_red)) +
geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
labs(x = "Proportion of 25 balls that were red", title = "25")
The purpose of this sampling activity was to develop an understanding of two key concepts relating to sampling:
Understanding the effect of sampling variation. Understanding the effect of sample size on sampling variation.
library(infer)
library(tidyverse)
library(ISLR)
library(moderndive)
ggplot(pennies_sample, aes(x = year)) +
geom_histogram(binwidth = 10, color = "white")
pennies_resample <- tibble(
year = c(1976, 1962, 1976, 1983, 2017, 2015, 2015, 1962, 2016, 1976,
2006, 1997, 1988, 2015, 2015, 1988, 2016, 1978, 1979, 1997,
1974, 2013, 1978, 2015, 2008, 1982, 1986, 1979, 1981, 2004,
2000, 1995, 1999, 2006, 1979, 2015, 1979, 1998, 1981, 2015,
2000, 1999, 1988, 2017, 1992, 1997, 1990, 1988, 2006, 2000)
)
ggplot(pennies_resample, aes(x = year)) +
geom_histogram(binwidth = 10, color = "white") +
labs(title = "Resample of 50 pennies")
ggplot(pennies_sample, aes(x = year)) +
geom_histogram(binwidth = 10, color = "white") +
labs(title = "Original sample of 50 pennies")
ggplot(pennies_resample, aes(x = year)) +
geom_histogram(binwidth = 10, color = "white") +
labs(title = "Resample of 50 pennies")
ggplot(pennies_sample, aes(x = year)) +
geom_histogram(binwidth = 10, color = "white") +
labs(title = "Original sample of 50 pennies")
resampled_means <- pennies_resamples %>%
group_by(name) %>%
summarize(mean_year = mean(year))
resampled_means
## # A tibble: 35 x 2
## name mean_year
## <chr> <dbl>
## 1 Arianna 1992.
## 2 Artemis 1996.
## 3 Bea 1996.
## 4 Camryn 1997.
## 5 Cassandra 1991.
## 6 Cindy 1995.
## 7 Claire 1996.
## 8 Dahlia 1998.
## 9 Dan 1994.
## 10 Eindra 1994.
## # ... with 25 more rows
ggplot(resampled_means, aes(x = mean_year)) +
geom_histogram(binwidth = 1, color = "white", boundary = 1990) +
labs(x = "Sampled mean year")
virtual_resample <- pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE)
# Repeat resampling 1000 times
virtual_resamples <- pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE, reps = 1000)
# Compute 1000 sample means
virtual_resampled_means <- virtual_resamples %>%
group_by(replicate) %>%
summarize(mean_year = mean(year))
virtual_resampled_means <- pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE, reps = 1000) %>%
group_by(replicate) %>%
summarize(mean_year = mean(year))
virtual_resampled_means
## # A tibble: 1,000 x 2
## replicate mean_year
## <int> <dbl>
## 1 1 1998.
## 2 2 1993.
## 3 3 1997.
## 4 4 1994.
## 5 5 1993.
## 6 6 1997.
## 7 7 1991.
## 8 8 1995.
## 9 9 1995.
## 10 10 1996.
## # ... with 990 more rows
ggplot(virtual_resampled_means, aes(x = mean_year)) +
geom_histogram(binwidth = 1, color = "white", boundary = 1990) +
labs(x = "sample mean")
library(infer)
pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE, reps = 1000)
## # A tibble: 50,000 x 3
## # Groups: replicate [1,000]
## replicate ID year
## <int> <int> <dbl>
## 1 1 3 2017
## 2 1 25 1979
## 3 1 26 1979
## 4 1 2 1986
## 5 1 4 1988
## 6 1 31 2013
## 7 1 9 2004
## 8 1 41 1992
## 9 1 7 2008
## 10 1 16 2015
## # ... with 49,990 more rows
pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE, reps = 1000) %>%
group_by(replicate) %>%
summarize(mean_year = mean(year))
## # A tibble: 1,000 x 2
## replicate mean_year
## <int> <dbl>
## 1 1 1995.
## 2 2 1993.
## 3 3 1993.
## 4 4 1995.
## 5 5 1998.
## 6 6 1991.
## 7 7 1998.
## 8 8 1993.
## 9 9 1996.
## 10 10 1997.
## # ... with 990 more rows
pennies_sample %>%
summarize(stat = mean(year))
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 1995.
pennies_sample %>%
specify(response = year) %>%
calculate(stat = "mean")
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 1995.
bootstrap_distribution <- pennies_sample %>%
specify(response = year) %>%
generate(reps = 1000, type = "bootstrap")%>%
calculate(stat = "mean")
visualize(bootstrap_distribution)
percentile_ci <- bootstrap_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
visualize(bootstrap_distribution) + shade_confidence_interval(percentile_ci)
sample_1_bootstrap <- bowl_sample_1 %>%
specify(response = color, success = "red") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop")
percentile_ci_1 <- sample_1_bootstrap %>%
get_confidence_interval(level = 0.95, type = "percentile")
percentile_ci
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 1992. 2000.
sample_1_bootstrap %>%
visualize(bins = 15) +
shade_confidence_interval(endpoints = percentile_ci_1) +
geom_vline(xintercept = 0.42, linetype = "dashed")
bootstrap_distribution_yawning <- mythbusters_yawn %>%
specify(formula = yawn ~ group, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in props", order = c("seed", "control"))
bootstrap_distribution_yawning
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.167
## 2 2 0.232
## 3 3 -0.173
## 4 4 0.0635
## 5 5 0.135
## 6 6 0.185
## 7 7 0.361
## 8 8 0.286
## 9 9 0.143
## 10 10 0.132
## # ... with 990 more rows
visualize(bootstrap_distribution_yawning) +
geom_vline(xintercept = 0)
bootstrap_distribution_yawning %>%
get_confidence_interval(type = "percentile", level = 0.95)
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 -0.205 0.290
obs_diff_in_props <- mythbusters_yawn %>%
specify(formula = yawn ~ group, success = "yes") %>%
# generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in props", order = c("seed", "control"))
obs_diff_in_props
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.0441
library(tidyverse)
library(infer)
library(moderndive)
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.0.5
library(ggplot2movies)
## Warning: package 'ggplot2movies' was built under R version 4.0.3
promotions %>%
sample_n(size = 6) %>%
arrange(id)
## # A tibble: 6 x 3
## id decision gender
## <int> <fct> <fct>
## 1 10 promoted male
## 2 13 promoted male
## 3 14 promoted male
## 4 17 promoted male
## 5 26 promoted female
## 6 28 promoted female
bootstrap_distribution <- promotions %>%
specify(gender ~ decision, success = "male") %>%
generate(reps = 1000) %>%
calculate(stat = "diff in props", order = c("promoted", "not"))
## Setting `type = "bootstrap"` in `generate()`.
percentile_c <- bootstrap_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
visualise(bootstrap_distribution) + shade_confidence_interval(endpoints = percentile_c)
ggplot(promotions, aes(x = gender, fill = decision)) +
geom_bar()
promotions %>%
group_by(gender, decision) %>%
tally()
## # A tibble: 4 x 3
## # Groups: gender [2]
## gender decision n
## <fct> <fct> <int>
## 1 male not 3
## 2 male promoted 21
## 3 female not 10
## 4 female promoted 14
promotions %>%
specify(formula = decision ~ gender, success = "promoted")
## Response: decision (factor)
## Explanatory: gender (factor)
## # A tibble: 48 x 2
## decision gender
## <fct> <fct>
## 1 promoted male
## 2 promoted male
## 3 promoted male
## 4 promoted male
## 5 promoted male
## 6 promoted male
## 7 promoted male
## 8 promoted male
## 9 promoted male
## 10 promoted male
## # ... with 38 more rows
promotions %>%
specify(formula = decision ~ gender, success = "promoted") %>%
hypothesize(null = "independence")
## Response: decision (factor)
## Explanatory: gender (factor)
## Null Hypothesis: independence
## # A tibble: 48 x 2
## decision gender
## <fct> <fct>
## 1 promoted male
## 2 promoted male
## 3 promoted male
## 4 promoted male
## 5 promoted male
## 6 promoted male
## 7 promoted male
## 8 promoted male
## 9 promoted male
## 10 promoted male
## # ... with 38 more rows
null_distribution <- promotions %>%
specify(formula = decision ~ gender, success = "promoted") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in props", order = c("male", "female"))
null_distribution
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.0417
## 2 2 0.0417
## 3 3 -0.292
## 4 4 0.125
## 5 5 -0.125
## 6 6 0.208
## 7 7 0.125
## 8 8 -0.0417
## 9 9 -0.0417
## 10 10 -0.125
## # ... with 990 more rows
obs_diff_prop <- promotions %>%
specify(decision ~ gender, success = "promoted") %>%
calculate(stat = "diff in props", order = c("male", "female"))
obs_diff_prop
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.292
visualize(null_distribution, bins = 10)
visualize(null_distribution, bins = 10) +
shade_p_value(obs_stat = obs_diff_prop, direction = "right")
ggplot(data = movies_sample, aes(x = genre, y = rating)) +
geom_boxplot() +
labs(y = "IMDb rating")
movies_sample %>%
group_by(genre) %>%
summarize(n = n(), mean_rating = mean(rating), std_dev = sd(rating))
## # A tibble: 2 x 4
## genre n mean_rating std_dev
## <chr> <int> <dbl> <dbl>
## 1 Action 32 5.28 1.36
## 2 Romance 36 6.32 1.61
movies_sample %>%
specify(formula = rating ~ genre) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
View()
null_distribution_movies <- movies_sample %>%
specify(formula = rating ~ genre) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("Action", "Romance"))
null_distribution_movies
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 -0.0792
## 2 2 -0.528
## 3 3 0.688
## 4 4 0.169
## 5 5 0.269
## 6 6 -0.0142
## 7 7 0.122
## 8 8 -0.481
## 9 9 -0.510
## 10 10 0.364
## # ... with 990 more rows
obs_diff_means <- movies_sample %>%
specify(formula = rating ~ genre) %>%
calculate(stat = "diff in means", order = c("Action", "Romance"))
obs_diff_means
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 -1.05
visualize(null_distribution_movies, bins = 10) +
shade_p_value(obs_stat = obs_diff_means, direction = "both")
library(tidyverse)
library(moderndive)
library(infer)
evals_ch5 <- evals %>%
select(ID, score, bty_avg, age)
glimpse(evals_ch5)
## Rows: 463
## Columns: 4
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,~
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4.5, 4.~
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333, 3.333,~
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, 40, 40~
ggplot(evals_ch5,
aes(x = bty_avg, y = score)) +
geom_point() +
labs(x = "Beauty Score",
y = "Teaching Score",
title = "Relationship between teaching and beauty scores") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Fit regression model:
score_model <- lm(score ~ bty_avg, data = evals_ch5)
# Get regression table:
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
bootstrap_distn_slope <- evals_ch5 %>%
specify(formula = score ~ bty_avg) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "slope")
bootstrap_distn_slope
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.0667
## 2 2 0.0790
## 3 3 0.0964
## 4 4 0.0621
## 5 5 0.0624
## 6 6 0.0310
## 7 7 0.0623
## 8 8 0.0757
## 9 9 0.0997
## 10 10 0.0946
## # ... with 990 more rows
percentile_ci <- bootstrap_distn_slope %>%
get_confidence_interval(type = "percentile", level = 0.95)
percentile_ci
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0353 0.0990
null_distn_slope <- evals %>%
specify(score ~ bty_avg) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "slope")