Data Science and statistics with RStudio

Ch 5 Simple Linear Regression

Esther Chen

Needed Package

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(moderndive)

Subpackages

  • ggplot2 for data visualization

  • dplyr for data wrangling

  • tidyr for converting data to “tidy” format

  • readr for importing spreadsheet data into R

  • As well as the more advanced purrr, tibble, stringr, and forcats packages

Moderndive Package

Always checking your data

data(package="moderndive")
# summary(un_member_states_2024)
# ?un_member_states_2024
# head(un_member_states_2024)
# glimpse(un_member_states_2024)

# in_rec <- un_member_states_2024 |> 
#   filter(country=="Denmark")
# in_rec

Un_memeber_states_2024

  1. iso: An identification variable used to distinguish between the 181 countries in the filtered dataset.

  2. fert_rate: A numerical variable representing the country’s fertility rate in 2022 corresponding to the expected number of children born per woman in child-bearing years. This is the outcome variable y of interest.

  3. life_exp: A numerical variable representing the country’s average life expectancy in 2022 in years. This is the primary explanatory variable x of interest.

  4. obes_rate: A numerical variable representing the country’s obesity rate in 2016.

Select key columns

UN_data_ch5 <- un_member_states_2024 |>
  select(iso, 
         life_exp = life_expectancy_2022, 
         fert_rate = fertility_rate_2022, 
         obes_rate = obesity_rate_2016)

# anyDuplicated(UN_data_ch5$iso)
# select() already can rename the select columns directly

Selected table

UN_data_ch5
# A tibble: 193 × 4
   iso   life_exp fert_rate obes_rate
   <chr>    <dbl>     <dbl>     <dbl>
 1 AFG       53.6       4.3       5.5
 2 ALB       79.5       1.4      21.7
 3 DZA       78.0       2.7      27.4
 4 AND       83.4      NA        25.6
 5 AGO       62.1       5         8.2
 6 ATG       77.8       1.6      18.9
 7 ARG       78.3       1.9      28.3
 8 ARM       76.1       1.6      20.2
 9 AUS       83.1       1.6      29  
10 AUT       82.3       1.5      20.1
# ℹ 183 more rows
# check if has na data
# anyNA(UN_data_ch5)

How to get ride of NA data?

na.omit()

UN_data_ch5 <- un_member_states_2024 |>
  select(iso, 
         life_exp = life_expectancy_2022, 
         fert_rate = fertility_rate_2022, 
         obes_rate = obesity_rate_2016)|>
  na.omit()

glimpse(UN_data_ch5)
Rows: 181
Columns: 4
$ iso       <chr> "AFG", "ALB", "DZA", "AGO", "ATG", "ARG", "ARM", "AUS", "AUT…
$ life_exp  <dbl> 53.65, 79.47, 78.03, 62.11, 77.80, 78.31, 76.13, 83.09, 82.2…
$ fert_rate <dbl> 4.3, 1.4, 2.7, 5.0, 1.6, 1.9, 1.6, 1.6, 1.5, 1.6, 1.4, 1.8, …
$ obes_rate <dbl> 5.5, 21.7, 27.4, 8.2, 18.9, 28.3, 20.2, 29.0, 20.1, 19.9, 31…

Random Sample

slice_sample()

# assume from 193 countries we want to have 5 random sample

UN_data_ch5 |>
  slice_sample(n = 5)
# A tibble: 5 × 4
  iso   life_exp fert_rate obes_rate
  <chr>    <dbl>     <dbl>     <dbl>
1 COL       74.9       1.7      22.3
2 PER       68.9       2.1      19.7
3 CAN       83.8       1.5      29.4
4 AUS       83.1       1.6      29  
5 LKA       78         1.9       5.2
# UN_data_ch5 |>
# sample_n(size = 5)

Summarize mean and mdeian

One way we can do:

summarize ()

UN_data_ch5 |>
  summarize(mean_life_exp = mean(life_exp), 
            mean_fert_rate = mean(fert_rate),
            median_life_exp = median(life_exp), 
            median_fert_rate = median(fert_rate))
# A tibble: 1 × 4
  mean_life_exp mean_fert_rate median_life_exp median_fert_rate
          <dbl>          <dbl>           <dbl>            <dbl>
1          73.6           2.50            75.1                2

Summary in easier way with moderndive

tidy_summary ()

UN_data_ch5 |> 
  select(fert_rate, life_exp) |> 
  tidy_summary()
# A tibble: 2 × 11
  column        n group type      min    Q1  mean median    Q3   max    sd
  <chr>     <int> <chr> <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
1 fert_rate   181 <NA>  numeric   1.1   1.6  2.50    2     3.2   6.6  1.15
2 life_exp    181 <NA>  numeric  53.6  69.4 73.6    75.1  78.3  86.4  6.80
# Also can try this way
# UN_data_ch5 |> 
#   tidy_summary(columns = c(fert_rate, life_exp))

Correlation coefficient

get_correlation()

cor()

UN_data_ch5 |> 
  get_correlation(formula = fert_rate ~ life_exp)
# A tibble: 1 × 1
     cor
   <dbl>
1 -0.812
# please also try
# UN_data_ch5 |> 
#   summarize(correlation = cor(fert_rate, life_exp))

Scatter plot

ggplot(UN_data_ch5, 
       aes(x = life_exp, y = fert_rate)) +
  geom_point(alpha = 0.5) +
  labs(x = "Life Expectancy", y = "Fertility Rate")

Scatter plot style

R for data science 2e, section 9, can see different point shape.

Scatter plot style

ggplot(UN_data_ch5, 
       aes(x = life_exp, y = fert_rate)) +
  geom_point(color="lawngreen", fill="snow", shape=23, size=2) +
  labs(x = "Life Expectancy", y = "Fertility Rate")

Best fitting line

geom_smooth(method = “lm”, se = FALSE)

ggplot(UN_data_ch5, aes(x = life_exp, y = fert_rate)) +
  geom_point(alpha = 0.1) +
  labs(x = "Life Expectancy", 
    y = "Fertility Rate",
    title = "Relationship of life expectancy and fertility rate") +
  geom_smooth(method = "lm", se = TRUE)
# the band represent confidence interval (default is 95% in ggplot2)

Best fitting line

lm(y~x, dataframe) get the regression line

coef() get the regression model coeffiencts

# Fit regression model:
demographics_model <- lm(fert_rate ~ life_exp, data = UN_data_ch5)

# Get regression coefficients:
coef(demographics_model)
(Intercept)    life_exp 
 12.5992926  -0.1372879 

How to get residuals

get_regression_points ( )

# 𝑦-y_hat

regression_points <- get_regression_points(demographics_model)
regression_points
# A tibble: 181 × 5
      ID fert_rate life_exp fert_rate_hat residual
   <int>     <dbl>    <dbl>         <dbl>    <dbl>
 1     1       4.3     53.6          5.23   -0.934
 2     2       1.4     79.5          1.69   -0.289
 3     3       2.7     78.0          1.89    0.813
 4     4       5       62.1          4.07    0.928
 5     5       1.6     77.8          1.92   -0.318
 6     6       1.9     78.3          1.85    0.052
 7     7       1.6     76.1          2.15   -0.548
 8     8       1.6     83.1          1.19    0.408
 9     9       1.5     82.3          1.30    0.195
10    10       1.6     74.2          2.42   -0.819
# ℹ 171 more rows

Exercise 1

Follow this data table (UN_data_ch5)

  1. do the scatter plot of obes_rate and life_exp

  2. can you find the regression model?

  3. how you like to explain these 2 set of numbers?

# A tibble: 1 × 1
    cor
  <dbl>
1 0.493
ggplot(UN_data_ch5, 
       aes(x = life_exp, y = obes_rate)) +
  geom_point(alpha = 0.5) +
  labs(x = "Life Expectancy", y = "Obesity Rate")+
  geom_smooth(method = "lm", se = TRUE)
test_model <- lm(obes_rate~ life_exp, data = UN_data_ch5)
coef(test_model)
(Intercept)    life_exp 
-33.9428026   0.7235592 

Lets find out other factor!

Start up another data set!

Difference between continents?

Ch5.2

  • Differences between continents

  • Differences within continents

Exercise 2 (gapminder2022 table)

Continue with un_member_states_2024 as major data set.

  • name a table gapminder2022
  • columns need to select from un table
  • select: country, life_expectancy_2022, continent, gdp_per_capita
  • rename: life_expectancy_2022 as life_exp
  • Remove NA data
Rows: 188
Columns: 4
$ country        <chr> "Afghanistan", "Albania", "Algeria", "Andorra", "Angola…
$ life_exp       <dbl> 53.65, 79.47, 78.03, 83.42, 62.11, 77.80, 78.31, 76.13,…
$ continent      <fct> Asia, Europe, Africa, Europe, Africa, North America, So…
$ gdp_per_capita <dbl> 355.7778, 6810.1140, 4342.6380, 41992.7728, 3000.4442, …

Exercise 2 code

gapminder2022 <- un_member_states_2024 |>
  select(country, life_exp = life_expectancy_2022, continent, gdp_per_capita) |> 
  na.omit()

Exercise 3 (summary table)

Follow gapminder 2022 you just created at exercise 1

  • select life_exp and continent
  • use tidy_summary () for summary table
# A tibble: 7 × 11
  column        n group         type    min    Q1  mean median    Q3   max    sd
  <chr>     <int> <chr>         <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
1 life_exp    188 <NA>          nume…  53.6  69.4  73.8   75.2  78.4  89.6  6.93
2 continent    52 Africa        fact…  NA    NA    NA     NA    NA    NA   NA   
3 continent    44 Asia          fact…  NA    NA    NA     NA    NA    NA   NA   
4 continent    43 Europe        fact…  NA    NA    NA     NA    NA    NA   NA   
5 continent    23 North America fact…  NA    NA    NA     NA    NA    NA   NA   
6 continent    14 Oceania       fact…  NA    NA    NA     NA    NA    NA   NA   
7 continent    12 South America fact…  NA    NA    NA     NA    NA    NA   NA   

Exercise 3 code

gapminder2022 |> 
  select(life_exp, continent) |> 
  tidy_summary()
# A tibble: 7 × 11
  column        n group         type    min    Q1  mean median    Q3   max    sd
  <chr>     <int> <chr>         <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
1 life_exp    188 <NA>          nume…  53.6  69.4  73.8   75.2  78.4  89.6  6.93
2 continent    52 Africa        fact…  NA    NA    NA     NA    NA    NA   NA   
3 continent    44 Asia          fact…  NA    NA    NA     NA    NA    NA   NA   
4 continent    43 Europe        fact…  NA    NA    NA     NA    NA    NA   NA   
5 continent    23 North America fact…  NA    NA    NA     NA    NA    NA   NA   
6 continent    14 Oceania       fact…  NA    NA    NA     NA    NA    NA   NA   
7 continent    12 South America fact…  NA    NA    NA     NA    NA    NA   NA   

Gapminder Table

country: An identification variable of type character/text used to distinguish the 142 countries in the dataset.

life_exp: A numerical variable of that country’s life expectancy at birth. This is the outcome variable y of interest.

continent: A categorical variable with five levels. Here “levels” correspond to the possible categories: Africa, Asia, Americas, Europe, and Oceania. This is the explanatory variable x of interest.

gdp_per_capita: A numerical variable of that country’s GDP per capita in US inflation-adjusted dollars

Histogram catch up

Left-skewed distribution?

ggplot(gapminder2022, aes(x = life_exp)) +
  geom_histogram(binwidth = 5, color = "white") +
  labs(x = "Life expectancy", 
       y = "Number of countries",
       title = "Histogram of distribution of worldwide life expectancies")

How about separated by continental?

recap with facet_wrap()

ggplot(gapminder2022, aes(x = life_exp)) +
  geom_histogram(binwidth = 5, color = "white") +
  labs(x = "Life expectancy", 
       y = "Number of countries",
       title = "Histogram of distribution of worldwide life expectancies") +
  facet_wrap(~ continent, nrow = 2)

Is there any outliers?

Recall boxplot

ggplot(gapminder2022, aes(x = continent, y = life_exp)) +
  geom_boxplot() +
  labs(x = "Continent", y = "Life expectancy",
       title = "Life expectancy by continent")

Catgory data with scatter plot

ggplot(gapminder2022, aes(y = life_exp, x = continent)) +
  geom_point(alpha = 0.1) +
  labs(x = "continent", 
    y = "Life expectancy",
    title = "Life expectancy estimation by continent")

Is there any outliers?

Recall boxplot and mark outlier

Mean and median

life_exp_by_continent <- gapminder2022 |>
  group_by(continent) |>
  summarize(median = median(life_exp), mean = mean(life_exp))
life_exp_by_continent
# A tibble: 6 × 3
  continent     median  mean
  <fct>          <dbl> <dbl>
1 Africa          66.1  66.3
2 Asia            75.4  74.9
3 Europe          81.5  79.9
4 North America   76.1  76.3
5 Oceania         74.6  74.4
6 South America   75.4  75.2

Set the baseline to compare

R default for the baseline is according to alphabet

life_exp_model <- lm(life_exp ~ continent, data = gapminder2022)
coef(life_exp_model)
           (Intercept)          continentAsia        continentEurope 
             66.309808               8.639965              13.597867 
continentNorth America       continentOceania continentSouth America 
              9.985410               8.106621               8.917692 

How to read

  • intercept corresponds to the mean life expectancy of countries in Africa of 66.31 years.

  • continentAsia the mean life expectancy of countries in Asia is 66.31 + 8.64 = 74.95

  • continentEurope the mean life expectancy of countries in Europe is 66.31 + 13.6 = 79.91

  • continentNorth the mean life expectancy of countries in North America is 66.31 + 9.98 = 76.29

  • continentOceania the mean life expectancy of countries in Oceania is 66.31 + 8.11 = 74.42

  • continentSouth America the mean life expectancy of countries in South America is 66.31 + 8.92 = 75.23

Regression model apply on country

get_regression_points(life_exp_model, ID = "country")
# A tibble: 188 × 5
   country             life_exp continent     life_exp_hat residual
   <chr>                  <dbl> <fct>                <dbl>    <dbl>
 1 Afghanistan             53.6 Asia                  75.0  -21.3  
 2 Albania                 79.5 Europe                79.9   -0.438
 3 Algeria                 78.0 Africa                66.3   11.7  
 4 Andorra                 83.4 Europe                79.9    3.51 
 5 Angola                  62.1 Africa                66.3   -4.2  
 6 Antigua and Barbuda     77.8 North America         76.3    1.50 
 7 Argentina               78.3 South America         75.2    3.08 
 8 Armenia                 76.1 Asia                  75.0    1.18 
 9 Australia               83.1 Oceania               74.4    8.67 
10 Austria                 82.3 Europe                79.9    2.36 
# ℹ 178 more rows

Regression model apply on country

country_apply <- get_regression_points(life_exp_model, ID = "country")

Asia_model <- country_apply |> 
  filter(continent=="Asia")
              
Asia_model
# A tibble: 44 × 5
   country     life_exp continent life_exp_hat residual
   <chr>          <dbl> <fct>            <dbl>    <dbl>
 1 Afghanistan     53.6 Asia              75.0   -21.3 
 2 Armenia         76.1 Asia              75.0     1.18
 3 Azerbaijan      74.2 Asia              75.0    -0.8 
 4 Bahrain         79.9 Asia              75.0     4.95
 5 Bangladesh      74.7 Asia              75.0    -0.25
 6 Bhutan          72.3 Asia              75.0    -2.64
 7 Brunei          80.6 Asia              75.0     5.64
 8 Cambodia        70.6 Asia              75.0    -4.3 
 9 Cyprus          79.7 Asia              75.0     4.79
10 Georgia         77.5 Asia              75.0     2.55
# ℹ 34 more rows

Exercise 4

Follow on previous logic, can you find out the linear model for gdp_per_capital and continent? Can you explain?

life_gdp_model <- lm(gdp_per_capita ~ continent, data = gapminder2022)
coef(life_gdp_model)
           (Intercept)          continentAsia        continentEurope 
              2637.100              13014.284              43061.350 
continentNorth America       continentOceania continentSouth America 
             13713.476              10031.048               8083.508 
get_regression_points(life_gdp_model, ID = "country")
# A tibble: 188 × 5
   country             gdp_per_capita continent     gdp_per_capita_hat residual
   <chr>                        <dbl> <fct>                      <dbl>    <dbl>
 1 Afghanistan                   356. Asia                      15651.  -15296.
 2 Albania                      6810. Europe                    45698.  -38888.
 3 Algeria                      4343. Africa                     2637.    1706.
 4 Andorra                     41993. Europe                    45698.   -3706.
 5 Angola                       3000. Africa                     2637.     363.
 6 Antigua and Barbuda         19920. North America             16351.    3569.
 7 Argentina                   13651. South America             10721.    2930.
 8 Armenia                      7018. Asia                      15651.   -8633.
 9 Australia                   65100. Oceania                   12668.   52432.
10 Austria                     52085. Europe                    45698.    6386.
# ℹ 178 more rows