Load packages

## Warning: package 'tidyverse' was built under R version 4.0.5

## Warning: package 'ggplot2' was built under R version 4.0.5

## Warning: package 'tibble' was built under R version 4.0.4

## Warning: package 'tidyr' was built under R version 4.0.4

## Warning: package 'readr' was built under R version 4.0.5

## Warning: package 'dplyr' was built under R version 4.0.4

## Warning: package 'forcats' was built under R version 4.0.5

## Warning: package 'moderndive' was built under R version 4.0.5

## Rows: 463
## Columns: 4
## $ ID      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,~
## $ score   <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4.5, 4.~
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333, 3.333,~
## $ age     <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, 40, 40~

## # A tibble: 5 x 4
##      ID score bty_avg   age
##   <int> <dbl>   <dbl> <int>
## 1   400   4.4    2.83    57
## 2   252   4.4    3.17    52
## 3   332   3.9    2.33    64
## 4    56   4.6    5.5     47
## 5    69   4.2    4.83    42

Data summary
Name	Piped data
Number of rows	463
Number of columns	4
_______________________
Column type frequency:
numeric	4
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ID	1	232.00	133.80	1.00	116.50	232.00	347.5	463.00	▇▇▇▇▇
score	1	4.17	0.54	2.30	3.80	4.30	4.6	5.00	▁▁▅▇▇
bty_avg	1	4.42	1.53	1.67	3.17	4.33	5.5	8.17	▃▇▇▃▂
age	1	48.37	9.80	29.00	42.00	48.00	57.0	73.00	▅▆▇▆▁

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.187

## # A tibble: 1 x 1
##   correlation
##         <dbl>
## 1       0.187

Data visualization

ggplot(evals_ch5, aes(x = bty_avg, y = score)) +
  geom_point() +
  labs(x = "Beauty Score", y = "Teaching Score",
       title = "Relationship between teaching and beauty scores") +  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

Model a linear regression

#Build a model
score_model <- lm(score ~ bty_avg, data = evals_ch5)

# Get regression table

get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    3.88      0.076     51.0        0    3.73     4.03 
## 2 bty_avg      0.067     0.016      4.09       0    0.035    0.099

Get coef of models

# Fit regression model:
score_model <- lm(score ~ bty_avg, data = evals_ch5)
# Get regression table:
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    3.88      0.076     51.0        0    3.73     4.03 
## 2 bty_avg      0.067     0.016      4.09       0    0.035    0.099

Get y_hat and residual

score_model <- lm(score ~ bty_avg, data = evals_ch5)
get_regression_points(score_model)

## # A tibble: 463 x 5
##       ID score bty_avg score_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1   4.7    5         4.21    0.486
##  2     2   4.1    5         4.21   -0.114
##  3     3   3.9    5         4.21   -0.314
##  4     4   4.8    5         4.21    0.586
##  5     5   4.6    3         4.08    0.52 
##  6     6   4.3    3         4.08    0.22 
##  7     7   2.8    3         4.08   -1.28 
##  8     8   4.1    3.33      4.10   -0.002
##  9     9   3.4    3.33      4.10   -0.702
## 10    10   4.5    3.17      4.09    0.409
## # ... with 453 more rows

library(gapminder)

## Warning: package 'gapminder' was built under R version 4.0.5

gapminder2007 <- gapminder %>%
               filter(year == 2007) %>%
               select(country, lifeExp, continent, gdpPercap)

glimpse(gapminder2007)

## Rows: 142
## Columns: 4
## $ country   <fct> "Afghanistan", "Albania", "Algeria", "Angola", "Argentina", ~
## $ lifeExp   <dbl> 43.828, 76.423, 72.301, 42.731, 75.320, 81.235, 79.829, 75.6~
## $ continent <fct> Asia, Europe, Africa, Africa, Americas, Oceania, Europe, Asi~
## $ gdpPercap <dbl> 974.5803, 5937.0295, 6223.3675, 4797.2313, 12779.3796, 34435~

gapminder2007 %>% sample_n(size = 5)

## # A tibble: 5 x 4
##   country               lifeExp continent gdpPercap
##   <fct>                   <dbl> <fct>         <dbl>
## 1 Sao Tome and Principe    65.5 Africa        1598.
## 2 Colombia                 72.9 Americas      7007.
## 3 Equatorial Guinea        51.6 Africa       12154.
## 4 Mozambique               42.1 Africa         824.
## 5 Eritrea                  58.0 Africa         641.

gapminder2007 %>% skim()

Data summary
Name	Piped data
Number of rows	142
Number of columns	4
_______________________
Column type frequency:
factor	2
numeric	2
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
country	0	1	FALSE	142	Afg: 1, Alb: 1, Alg: 1, Ang: 1
continent	0	1	FALSE	5	Afr: 52, Asi: 33, Eur: 30, Ame: 25

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
lifeExp	0	1	67.01	12.07	39.61	57.16	71.94	76.41	82.60	▂▃▃▆▇
gdpPercap	0	1	11680.07	12859.94	277.55	1624.84	6124.37	18008.84	49357.19	▇▂▁▂▁

ggplot(gapminder2007, aes(x = lifeExp)) +
  geom_histogram(binwidth = 5, color = "white") +
  labs(x = "Life expectancy", 
       y = "Number of countries",
       title = "Histogram of distribution of worldwide life expectancies") +
  facet_wrap(~ continent, nrow = 2)

ggplot(gapminder2007, aes(x = continent, y = lifeExp)) +
  geom_boxplot() +
  labs(x = "Continent", y = "Life expectancy",
       title = "Life expectancy by continent")

lifeExp_model <- lm(data = gapminder2007, lifeExp ~ continent)

get_regression_summaries(lifeExp_model)

## # A tibble: 1 x 9
##   r_squared adj_r_squared   mse  rmse sigma statistic p_value    df  nobs
##       <dbl>         <dbl> <dbl> <dbl> <dbl>     <dbl>   <dbl> <dbl> <dbl>
## 1     0.635         0.625  52.8  7.26  7.40      59.7       0     4   142

get_regression_table(lifeExp_model)

## # A tibble: 5 x 7
##   term                estimate std_error statistic p_value lower_ci upper_ci
##   <chr>                  <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept               54.8      1.02     53.4        0     52.8     56.8
## 2 continent: Americas     18.8      1.8      10.4        0     15.2     22.4
## 3 continent: Asia         15.9      1.65      9.68       0     12.7     19.2
## 4 continent: Europe       22.8      1.70     13.5        0     19.5     26.2
## 5 continent: Oceania      25.9      5.33      4.86       0     15.4     36.4

get_regression_points(lifeExp_model)

## # A tibble: 142 x 5
##       ID lifeExp continent lifeExp_hat residual
##    <int>   <dbl> <fct>           <dbl>    <dbl>
##  1     1    43.8 Asia             70.7  -26.9  
##  2     2    76.4 Europe           77.6   -1.23 
##  3     3    72.3 Africa           54.8   17.5  
##  4     4    42.7 Africa           54.8  -12.1  
##  5     5    75.3 Americas         73.6    1.71 
##  6     6    81.2 Oceania          80.7    0.516
##  7     7    79.8 Europe           77.6    2.18 
##  8     8    75.6 Asia             70.7    4.91 
##  9     9    64.1 Asia             70.7   -6.67 
## 10    10    79.4 Europe           77.6    1.79 
## # ... with 132 more rows

gdpPerCap_model <- lm(data = gapminder2007, gdpPercap ~ continent)

get_regression_table(gdpPerCap_model)

## # A tibble: 5 x 7
##   term                estimate std_error statistic p_value lower_ci upper_ci
##   <chr>                  <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept              3089.     1373.      2.25   0.026     375.    5804.
## 2 continent: Americas    7914.     2409.      3.28   0.001    3150.   12678.
## 3 continent: Asia        9384.     2203.      4.26   0        5027.   13741.
## 4 continent: Europe     21965.     2270.      9.68   0       17478.   26453.
## 5 continent: Oceania    26721.     7133.      3.75   0       12616.   40826.

get_regression_points(gdpPerCap_model, ID = "country")

## # A tibble: 142 x 5
##    country     gdpPercap continent gdpPercap_hat residual
##    <fct>           <dbl> <fct>             <dbl>    <dbl>
##  1 Afghanistan      975. Asia             12473.  -11498.
##  2 Albania         5937. Europe           25054.  -19117.
##  3 Algeria         6223. Africa            3089.    3134.
##  4 Angola          4797. Africa            3089.    1708.
##  5 Argentina      12779. Americas         11003.    1776.
##  6 Australia      34435. Oceania          29810.    4625.
##  7 Austria        36126. Europe           25054.   11072.
##  8 Bahrain        29796. Asia             12473.   17323.
##  9 Bangladesh      1391. Asia             12473.  -11082.
## 10 Belgium        33693. Europe           25054.    8638.
## # ... with 132 more rows

get_regression_summaries(gdpPerCap_model)

## # A tibble: 1 x 9
##   r_squared adj_r_squared       mse  rmse sigma statistic p_value    df  nobs
##       <dbl>         <dbl>     <dbl> <dbl> <dbl>     <dbl>   <dbl> <dbl> <dbl>
## 1     0.424         0.407 94538944. 9723. 9899.      25.2       0     4   142

library(moderndive)
library(dplyr)
library(ggplot2)
library(ISLR)

## Warning: package 'ISLR' was built under R version 4.0.5

evals_ch6 <- evals %>% select(ID, score, age, gender)


glimpse(evals_ch6)

## Rows: 463
## Columns: 4
## $ ID     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ~
## $ score  <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4.5, 4.6~
## $ age    <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, 40, 40,~
## $ gender <fct> female, female, female, female, male, male, male, male, male, f~

evals_ch6 %>% sample_n(size = 5)

## # A tibble: 5 x 4
##      ID score   age gender
##   <int> <dbl> <int> <fct> 
## 1   301   4.4    43 female
## 2   105   4.8    46 female
## 3   442   3.6    61 male  
## 4    78   3.6    49 male  
## 5   323   4.6    52 female

evals_ch6 %>% skim()

Data summary
Name	Piped data
Number of rows	463
Number of columns	4
_______________________
Column type frequency:
factor	1
numeric	3
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
gender	0	1	FALSE	2	mal: 268, fem: 195

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ID	1	232.00	133.80	1.0	116.5	232.0	347.5	463	▇▇▇▇▇
score	1	4.17	0.54	2.3	3.8	4.3	4.6	5	▁▁▅▇▇
age	1	48.37	9.80	29.0	42.0	48.0	57.0	73	▅▆▇▆▁

evals_ch6 %>% get_correlation(formula = score ~ age)

## # A tibble: 1 x 1
##      cor
##    <dbl>
## 1 -0.107

theme_set(theme_light())

ggplot(evals_ch6, aes(x = age, y = score, color = gender)) +
  geom_point() +
  labs(x = "Age", y = "Teaching Score", color = "Gender") +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

score_model_interaction <- lm(data = evals_ch6, score ~ age*gender)
 
# Get regression table

get_regression_table(score_model_interaction)

## # A tibble: 4 x 7
##   term           estimate std_error statistic p_value lower_ci upper_ci
##   <chr>             <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept         4.88      0.205     23.8    0        4.48     5.29 
## 2 age              -0.018     0.004     -3.92   0       -0.026   -0.009
## 3 gender: male     -0.446     0.265     -1.68   0.094   -0.968    0.076
## 4 age:gendermale    0.014     0.006      2.45   0.015    0.003    0.024

ISLR

library(ISLR)

credit_ch6 <- Credit %>% as_tibble() %>%
  select(ID, debt = Balance, credit_limit = Limit, 
         income = Income, credit_rating = Rating, age = Age)


glimpse(credit_ch6)

## Rows: 400
## Columns: 6
## $ ID            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1~
## $ debt          <int> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, 1407~
## $ credit_limit  <int> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300, 68~
## $ income        <dbl> 14.891, 106.025, 104.593, 148.924, 55.882, 80.180, 20.99~
## $ credit_rating <int> 283, 483, 514, 681, 357, 569, 259, 512, 266, 491, 589, 1~
## $ age           <int> 34, 82, 71, 36, 68, 77, 37, 87, 66, 41, 30, 64, 57, 49, ~

credit_ch6 %>% sample_n(size = 5)

## # A tibble: 5 x 6
##      ID  debt credit_limit income credit_rating   age
##   <int> <int>        <int>  <dbl>         <int> <int>
## 1    36   419         2558   23.4           220    49
## 2   247   199         3211   19.6           265    59
## 3   353   583         7140  104.            507    41
## 4   269     0         1349   26.0           142    82
## 5   274  1255         4706   16.8           353    48

credit_ch6 %>% select(debt, credit_limit, income) %>% skim()

Data summary
Name	Piped data
Number of rows	400
Number of columns	3
_______________________
Column type frequency:
numeric	3
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
debt	1	520.02	459.76	0.00	68.75	459.50	863.00	1999.00	▇▅▃▂▁
credit_limit	1	4735.60	2308.20	855.00	3088.00	4622.50	5872.75	13913.00	▆▇▃▁▁
income	1	45.22	35.24	10.35	21.01	33.12	57.47	186.63	▇▂▁▁▁

credit_ch6 %>% get_correlation(debt ~ credit_limit)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.862

credit_ch6 %>% get_correlation(debt ~ income)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.464

credit_ch6 %>% 
  select(debt, credit_limit, income) %>% 
  cor()

##                   debt credit_limit    income
## debt         1.0000000    0.8616973 0.4636565
## credit_limit 0.8616973    1.0000000 0.7920883
## income       0.4636565    0.7920883 1.0000000

A lot of you might have asked yourselves: “Why would I force the lines to have parallel slopes (as seen in the right-hand plot) when they clearly have different slopes (as seen in the left-hand plot)?”.

The answer lies in a philosophical principle known as “Occam’s Razor.” It states that, “all other things being equal, simpler solutions are more likely to be correct than complex ones.” When viewed in a modeling framework, Occam’s Razor can be restated as, “all other things being equal, simpler models are to be preferred over complex ones.” In other words, we should only favor the more complex model if the additional complexity is warranted.

tactile_prop_red

## # A tibble: 33 x 4
##    group            replicate red_balls prop_red
##    <chr>                <int>     <int>    <dbl>
##  1 Ilyas, Yohan             1        21     0.42
##  2 Morgan, Terrance         2        17     0.34
##  3 Martin, Thomas           3        21     0.42
##  4 Clark, Frank             4        21     0.42
##  5 Riddhi, Karina           5        18     0.36
##  6 Andrew, Tyler            6        19     0.38
##  7 Julia                    7        19     0.38
##  8 Rachel, Lauren           8        11     0.22
##  9 Daniel, Caroline         9        15     0.3 
## 10 Josh, Maeve             10        17     0.34
## # ... with 23 more rows

ggplot(tactile_prop_red, aes(x = prop_red)) +
  geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
  labs(x = "Proportion of 50 balls that were red", 
       title = "Distribution of 33 proportions red")

# Segment 1: sample size = 25 ------------------------------
# 1.a) Virtually use shovel 1000 times
virtual_samples_25 <- bowl %>% 
  rep_sample_n(size = 25, reps = 1000)

# 1.b) Compute resulting 1000 replicates of proportion red
virtual_prop_red_25 <- virtual_samples_25 %>% 
  group_by(replicate) %>% 
  summarize(red = sum(color == "red")) %>% 
  mutate(prop_red = red / 25)

# 1.c) Plot distribution via a histogram
ggplot(virtual_prop_red_25, aes(x = prop_red)) +
  geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
  labs(x = "Proportion of 25 balls that were red", title = "25")

The purpose of this sampling activity was to develop an understanding of two key concepts relating to sampling:

Understanding the effect of sampling variation. Understanding the effect of sample size on sampling variation.

library(infer)
library(tidyverse)
library(ISLR)
library(moderndive)


ggplot(pennies_sample, aes(x = year)) +
  geom_histogram(binwidth = 10, color = "white")

pennies_resample <- tibble(
  year = c(1976, 1962, 1976, 1983, 2017, 2015, 2015, 1962, 2016, 1976, 
           2006, 1997, 1988, 2015, 2015, 1988, 2016, 1978, 1979, 1997, 
           1974, 2013, 1978, 2015, 2008, 1982, 1986, 1979, 1981, 2004, 
           2000, 1995, 1999, 2006, 1979, 2015, 1979, 1998, 1981, 2015, 
           2000, 1999, 1988, 2017, 1992, 1997, 1990, 1988, 2006, 2000)
)


ggplot(pennies_resample, aes(x = year)) +
  geom_histogram(binwidth = 10, color = "white") +
  labs(title = "Resample of 50 pennies")

ggplot(pennies_sample, aes(x = year)) +
  geom_histogram(binwidth = 10, color = "white") +
  labs(title = "Original sample of 50 pennies")

ggplot(pennies_resample, aes(x = year)) +
  geom_histogram(binwidth = 10, color = "white") +
  labs(title = "Resample of 50 pennies")

ggplot(pennies_sample, aes(x = year)) +
  geom_histogram(binwidth = 10, color = "white") +
  labs(title = "Original sample of 50 pennies")

resampled_means <- pennies_resamples %>% 
  group_by(name) %>% 
  summarize(mean_year = mean(year))

resampled_means

## # A tibble: 35 x 2
##    name      mean_year
##    <chr>         <dbl>
##  1 Arianna       1992.
##  2 Artemis       1996.
##  3 Bea           1996.
##  4 Camryn        1997.
##  5 Cassandra     1991.
##  6 Cindy         1995.
##  7 Claire        1996.
##  8 Dahlia        1998.
##  9 Dan           1994.
## 10 Eindra        1994.
## # ... with 25 more rows

ggplot(resampled_means, aes(x = mean_year)) +
  geom_histogram(binwidth = 1, color = "white", boundary = 1990) +
  labs(x = "Sampled mean year")

virtual_resample <- pennies_sample %>% 
  rep_sample_n(size = 50, replace = TRUE)

# Repeat resampling 1000 times
virtual_resamples <- pennies_sample %>% 
  rep_sample_n(size = 50, replace = TRUE, reps = 1000)

# Compute 1000 sample means
virtual_resampled_means <- virtual_resamples %>% 
  group_by(replicate) %>% 
  summarize(mean_year = mean(year))


virtual_resampled_means <- pennies_sample %>% 
  rep_sample_n(size = 50, replace = TRUE, reps = 1000) %>% 
  group_by(replicate) %>% 
  summarize(mean_year = mean(year))
virtual_resampled_means

## # A tibble: 1,000 x 2
##    replicate mean_year
##        <int>     <dbl>
##  1         1     1998.
##  2         2     1993.
##  3         3     1997.
##  4         4     1994.
##  5         5     1993.
##  6         6     1997.
##  7         7     1991.
##  8         8     1995.
##  9         9     1995.
## 10        10     1996.
## # ... with 990 more rows

ggplot(virtual_resampled_means, aes(x = mean_year)) +
  geom_histogram(binwidth = 1, color = "white", boundary = 1990) +
  labs(x = "sample mean")

library(infer)

pennies_sample %>% 
  rep_sample_n(size = 50, replace = TRUE, reps = 1000)

## # A tibble: 50,000 x 3
## # Groups:   replicate [1,000]
##    replicate    ID  year
##        <int> <int> <dbl>
##  1         1     3  2017
##  2         1    25  1979
##  3         1    26  1979
##  4         1     2  1986
##  5         1     4  1988
##  6         1    31  2013
##  7         1     9  2004
##  8         1    41  1992
##  9         1     7  2008
## 10         1    16  2015
## # ... with 49,990 more rows

pennies_sample %>% 
  rep_sample_n(size = 50, replace = TRUE, reps = 1000) %>% 
  group_by(replicate) %>% 
  summarize(mean_year = mean(year))

## # A tibble: 1,000 x 2
##    replicate mean_year
##        <int>     <dbl>
##  1         1     1995.
##  2         2     1993.
##  3         3     1993.
##  4         4     1995.
##  5         5     1998.
##  6         6     1991.
##  7         7     1998.
##  8         8     1993.
##  9         9     1996.
## 10        10     1997.
## # ... with 990 more rows

pennies_sample %>% 
  summarize(stat = mean(year))

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 1995.

pennies_sample %>% 
  specify(response = year) %>% 
  calculate(stat = "mean")

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 1995.

bootstrap_distribution <- pennies_sample %>% 
  specify(response = year) %>%
  generate(reps = 1000, type = "bootstrap")%>%
  calculate(stat = "mean")

visualize(bootstrap_distribution)

percentile_ci <- bootstrap_distribution %>% 
  get_confidence_interval(level = 0.95, type = "percentile")

visualize(bootstrap_distribution) + shade_confidence_interval(percentile_ci)

sample_1_bootstrap <- bowl_sample_1 %>% 
  specify(response = color, success = "red") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop")

percentile_ci_1 <- sample_1_bootstrap %>% 
  get_confidence_interval(level = 0.95, type = "percentile")
percentile_ci

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    1992.    2000.

sample_1_bootstrap %>% 
  visualize(bins = 15) + 
  shade_confidence_interval(endpoints = percentile_ci_1) +
  geom_vline(xintercept = 0.42, linetype = "dashed")

bootstrap_distribution_yawning <- mythbusters_yawn %>% 
  specify(formula = yawn ~ group, success = "yes") %>% 
  generate(reps = 1000, type = "bootstrap") %>% 
  calculate(stat = "diff in props", order = c("seed", "control"))

bootstrap_distribution_yawning

## # A tibble: 1,000 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1  0.167 
##  2         2  0.232 
##  3         3 -0.173 
##  4         4  0.0635
##  5         5  0.135 
##  6         6  0.185 
##  7         7  0.361 
##  8         8  0.286 
##  9         9  0.143 
## 10        10  0.132 
## # ... with 990 more rows

visualize(bootstrap_distribution_yawning) +
  geom_vline(xintercept = 0)

bootstrap_distribution_yawning %>% 
  get_confidence_interval(type = "percentile", level = 0.95)

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   -0.205    0.290

obs_diff_in_props <- mythbusters_yawn %>% 
  specify(formula = yawn ~ group, success = "yes") %>% 
  # generate(reps = 1000, type = "bootstrap") %>% 
  calculate(stat = "diff in props", order = c("seed", "control"))

obs_diff_in_props

## # A tibble: 1 x 1
##     stat
##    <dbl>
## 1 0.0441

Load packages

library(tidyverse)
library(infer)
library(moderndive)
library(nycflights13)

## Warning: package 'nycflights13' was built under R version 4.0.5

library(ggplot2movies)

## Warning: package 'ggplot2movies' was built under R version 4.0.3

Run data sample

promotions %>% 
  sample_n(size = 6) %>% 
  arrange(id)

## # A tibble: 6 x 3
##      id decision gender
##   <int> <fct>    <fct> 
## 1    10 promoted male  
## 2    13 promoted male  
## 3    14 promoted male  
## 4    17 promoted male  
## 5    26 promoted female
## 6    28 promoted female

Use bootstrap resampling to check the difference between male and female in term of decision

bootstrap_distribution <- promotions %>% 
  specify(gender ~ decision, success = "male") %>% 
  generate(reps = 1000) %>% 
  calculate(stat = "diff in props", order = c("promoted", "not"))

## Setting `type = "bootstrap"` in `generate()`.

percentile_c <- bootstrap_distribution %>% 
  get_confidence_interval(level = 0.95, type = "percentile")

visualise(bootstrap_distribution) + shade_confidence_interval(endpoints = percentile_c)

ggplot(promotions, aes(x = gender, fill = decision)) +
  geom_bar()

promotions %>% 
  group_by(gender, decision) %>% 
  tally()

## # A tibble: 4 x 3
## # Groups:   gender [2]
##   gender decision     n
##   <fct>  <fct>    <int>
## 1 male   not          3
## 2 male   promoted    21
## 3 female not         10
## 4 female promoted    14

promotions %>% 
  specify(formula = decision ~ gender, success = "promoted")

## Response: decision (factor)
## Explanatory: gender (factor)
## # A tibble: 48 x 2
##    decision gender
##    <fct>    <fct> 
##  1 promoted male  
##  2 promoted male  
##  3 promoted male  
##  4 promoted male  
##  5 promoted male  
##  6 promoted male  
##  7 promoted male  
##  8 promoted male  
##  9 promoted male  
## 10 promoted male  
## # ... with 38 more rows

promotions %>% 
  specify(formula = decision ~ gender, success = "promoted") %>% 
  hypothesize(null = "independence")

## Response: decision (factor)
## Explanatory: gender (factor)
## Null Hypothesis: independence
## # A tibble: 48 x 2
##    decision gender
##    <fct>    <fct> 
##  1 promoted male  
##  2 promoted male  
##  3 promoted male  
##  4 promoted male  
##  5 promoted male  
##  6 promoted male  
##  7 promoted male  
##  8 promoted male  
##  9 promoted male  
## 10 promoted male  
## # ... with 38 more rows

null_distribution <- promotions %>% 
  specify(formula = decision ~ gender, success = "promoted") %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "diff in props", order = c("male", "female"))

null_distribution

## # A tibble: 1,000 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1  0.0417
##  2         2  0.0417
##  3         3 -0.292 
##  4         4  0.125 
##  5         5 -0.125 
##  6         6  0.208 
##  7         7  0.125 
##  8         8 -0.0417
##  9         9 -0.0417
## 10        10 -0.125 
## # ... with 990 more rows

obs_diff_prop <- promotions %>% 
  specify(decision ~ gender, success = "promoted") %>% 
  calculate(stat = "diff in props", order = c("male", "female"))
obs_diff_prop

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.292

visualize(null_distribution, bins = 10)

visualize(null_distribution, bins = 10) + 
  shade_p_value(obs_stat = obs_diff_prop, direction = "right")

ggplot(data = movies_sample, aes(x = genre, y = rating)) +
  geom_boxplot() +
  labs(y = "IMDb rating")

movies_sample %>% 
  group_by(genre) %>% 
  summarize(n = n(), mean_rating = mean(rating), std_dev = sd(rating))

## # A tibble: 2 x 4
##   genre       n mean_rating std_dev
##   <chr>   <int>       <dbl>   <dbl>
## 1 Action     32        5.28    1.36
## 2 Romance    36        6.32    1.61

movies_sample %>% 
  specify(formula = rating ~ genre) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 1000, type = "permute") %>% 
  View()

null_distribution_movies <- movies_sample %>% 
  specify(formula = rating ~ genre) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "diff in means", order = c("Action", "Romance"))

null_distribution_movies

## # A tibble: 1,000 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.0792
##  2         2 -0.528 
##  3         3  0.688 
##  4         4  0.169 
##  5         5  0.269 
##  6         6 -0.0142
##  7         7  0.122 
##  8         8 -0.481 
##  9         9 -0.510 
## 10        10  0.364 
## # ... with 990 more rows

obs_diff_means <- movies_sample %>% 
  specify(formula = rating ~ genre) %>% 
  calculate(stat = "diff in means", order = c("Action", "Romance"))
obs_diff_means

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 -1.05

visualize(null_distribution_movies, bins = 10) + 
  shade_p_value(obs_stat = obs_diff_means, direction = "both")

library(tidyverse)
library(moderndive)
library(infer)

evals_ch5 <- evals %>%
  select(ID, score, bty_avg, age)
glimpse(evals_ch5)

## Rows: 463
## Columns: 4
## $ ID      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,~
## $ score   <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4.5, 4.~
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333, 3.333,~
## $ age     <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, 40, 40~

ggplot(evals_ch5, 
       aes(x = bty_avg, y = score)) +
  geom_point() +
  labs(x = "Beauty Score", 
       y = "Teaching Score",
       title = "Relationship between teaching and beauty scores") +  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

# Fit regression model:
score_model <- lm(score ~ bty_avg, data = evals_ch5)
# Get regression table:
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    3.88      0.076     51.0        0    3.73     4.03 
## 2 bty_avg      0.067     0.016      4.09       0    0.035    0.099

bootstrap_distn_slope <- evals_ch5 %>% 
  specify(formula = score ~ bty_avg) %>%
  generate(reps = 1000, type = "bootstrap") %>% 
  calculate(stat = "slope")
bootstrap_distn_slope

## # A tibble: 1,000 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 0.0667
##  2         2 0.0790
##  3         3 0.0964
##  4         4 0.0621
##  5         5 0.0624
##  6         6 0.0310
##  7         7 0.0623
##  8         8 0.0757
##  9         9 0.0997
## 10        10 0.0946
## # ... with 990 more rows

percentile_ci <- bootstrap_distn_slope %>% 
  get_confidence_interval(type = "percentile", level = 0.95)
percentile_ci

## # A tibble: 1 x 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   0.0353   0.0990

null_distn_slope <- evals %>% 
  specify(score ~ bty_avg) %>%
  hypothesize(null = "independence") %>% 
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "slope")

code_testing

Nguyen_LSCM

8/8/2021

Load packages

Data visualization

Model a linear regression

Get coef of models

Get y_hat and residual

ISLR

Load packages

Run data sample

Use bootstrap resampling to check the difference between male and female in term of decision