Data Science and statistics with RStudio

Ch 6 Multiple Linear Regression

Esther Chen

Needed Package

library(tidyverse)
library(moderndive)
library(ISLR2)
library(gridExtra)

data(package = "ISLR2")
# go to cran to search ISLR2

6.1.1 Data analysis

factor () converts a categorical variable into a factor, allowing for ordered levels and proper handling in analysis, visualization, and modeling.

UN_data_ch6 <- un_member_states_2024 |>
  select(country, 
         life_expectancy_2022, 
         fertility_rate_2022, 
         income_group_2024)|>
  na.omit()|>
  rename(life_exp = life_expectancy_2022, 
         fert_rate = fertility_rate_2022, 
         income = income_group_2024)|>
  mutate(income = factor(income, 
                         levels = c("Low income", "Lower middle income", 
                                    "Upper middle income", "High income")))

Back to previous class

gapminder2022 <- un_member_states_2024 |>
  select(country, life_exp = life_expectancy_2022, continent, gdp_per_capita) |> 
  na.omit()


gapminder2022_new <- gapminder2022 |>
  mutate(new_cont = factor(continent, levels = 
                             c("Europe", "Asia", "North America", 
                              "South America", "Oceania", "Africa")))

new_exp_model <- lm(life_exp ~ new_cont, data = gapminder2022_new)
coef(new_exp_model)

          (Intercept)          new_contAsia new_contNorth America 
            79.907674             -4.957902             -3.612457 
new_contSouth America       new_contOceania        new_contAfrica 
            -4.680174             -5.491246            -13.597867

Un_data_ch6

Check the income already changed

glimpse(UN_data_ch6)

Rows: 182
Columns: 4
$ country   <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Antigua and …
$ life_exp  <dbl> 53.65, 79.47, 78.03, 62.11, 77.80, 78.31, 76.13, 83.09, 82.2…
$ fert_rate <dbl> 4.3, 1.4, 2.7, 5.0, 1.6, 1.9, 1.6, 1.6, 1.5, 1.6, 1.4, 1.8, …
$ income    <fct> Low income, Upper middle income, Lower middle income, Lower …

Sample

UN_data_ch6 |> 
  slice_sample(n = 10)

# A tibble: 10 × 4
   country             life_exp fert_rate income             
   <chr>                  <dbl>     <dbl> <fct>              
 1 Uruguay                 78.4       1.5 High income        
 2 Vietnam                 75.5       1.9 Lower middle income
 3 North Macedonia         76.8       1.4 Upper middle income
 4 Antigua and Barbuda     77.8       1.6 High income        
 5 Ecuador                 78         2   Upper middle income
 6 San Marino              83.9       1.2 High income        
 7 Philippines             70.1       2.7 Lower middle income
 8 Sierra Leone            58.8       3.7 Low income         
 9 United Kingdom          81.9       1.6 High income        
10 Belize                  75.8       2   Upper middle income

  # sample_n(size = 10)

Summary

UN_data_ch6 |> 
  select(life_exp, fert_rate, income) |> 
  tidy_summary()

# A tibble: 6 × 11
  column        n group         type    min    Q1  mean median    Q3   max    sd
  <chr>     <int> <chr>         <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
1 life_exp    182 <NA>          nume…  53.6  69.4 73.7    75.2  78.4  86.4  6.86
2 fert_rate   182 <NA>          nume…   0.9   1.6  2.49    2     3.2   6.6  1.16
3 income       25 Low income    fact…  NA    NA   NA      NA    NA    NA   NA   
4 income       52 Lower middle… fact…  NA    NA   NA      NA    NA    NA   NA   
5 income       49 Upper middle… fact…  NA    NA   NA      NA    NA    NA   NA   
6 income       56 High income   fact…  NA    NA   NA      NA    NA    NA   NA

Get correlation

UN_data_ch6 |> 
  get_correlation(formula = fert_rate ~ life_exp)

# A tibble: 1 × 1
     cor
   <dbl>
1 -0.815

Single Line

single <- ggplot(UN_data_ch6, aes(x = life_exp, y = fert_rate)) +
  geom_point(aes(color = income)) +  
  labs(x = "Life Expectancy", y = "Fertility Rate", color = "Income group") +
  geom_smooth(method = "lm", se = FALSE, color = "black")  +
  theme_gray() +
  theme(legend.position = "none")

Single Line

Multi lines

multi <- ggplot(UN_data_ch6, aes(x = life_exp, y = fert_rate, color = income)) +
  geom_point() +
  labs(x = "Life Expectancy", y = "Fertility Rate", color = "Income group") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_gray()+ 
  theme(legend.text = element_text(size = 8),   
  legend.title = element_text(size = 10),  
  legend.key.size = unit(0.3, "cm"),         
  legend.spacing.y = unit(0.1, "cm"))

Multi lines

Compare

legend <- gtable::gtable_filter(ggplotGrob(multi), "guide-box")

multi <- multi + theme(legend.position = "none")
single_grob <- ggplotGrob(single)
multi_grob <- ggplotGrob(multi)
grid.arrange(
  arrangeGrob(single_grob, multi_grob, ncol = 2),  # Arrange two plots
  legend,   # Place legend below
  ncol = 1,  # One column layout
  heights = c(4, 1)  # Give more space to the plots than the legend
)

The average by income group

Low income, is the “baseline” group.

Similarly, the average fertility rate for Upper middle income member states is 4.28 + -2.25 = 2.03.

one_factor_model <- lm(fert_rate ~ income, data = UN_data_ch6)
coef(one_factor_model)

              (Intercept) incomeLower middle income incomeUpper middle income 
                 4.280000                 -1.299231                 -2.247347 
        incomeHigh income 
                -2.653214

Interaction Effect

Observe how the intercept and the slope are different for a High income member state when compared to the baseline Low income member state.

Mutiple linear regression

y ~ x1 + x2 + x1x2
we can also write y ~ x1 * x2 as the * sign accounts for both

# Fit regression model and get the coefficients of the model
model_int <- lm(fert_rate ~ life_exp * income, data = UN_data_ch6)
# coef(model_int)

coef_table <- as.data.frame(coef(model_int))
arrange(coef_table)

                                   coef(model_int)
(Intercept)                            11.91826460
life_exp                               -0.11848075
incomeLower middle income              -1.50420617
incomeUpper middle income              -1.89291757
incomeHigh income                      -6.57953685
life_exp:incomeLower middle income      0.01265027
life_exp:incomeUpper middle income      0.01117481
life_exp:incomeHigh income              0.07221696

How to read?

Income Low? Income high?

                                   coef(model_int)
(Intercept)                            11.91826460
life_exp                               -0.11848075
incomeLower middle income              -1.50420617
incomeUpper middle income              -1.89291757
incomeHigh income                      -6.57953685
life_exp:incomeLower middle income      0.01265027
life_exp:incomeUpper middle income      0.01117481
life_exp:incomeHigh income              0.07221696

6.1.3 Model without interactions

geom_parallel_slopes()

multi_wt <- ggplot(UN_data_ch6, aes(x = life_exp, y = fert_rate, color = income)) +
  geom_point() +
  labs(x = "Life expectancy", y = "Fertility rate", color = "Income group") +
  geom_parallel_slopes(se = FALSE)+
  theme_gray() +
  theme(legend.position = "none")

Compare with and without

Residual table

regression_points <- get_regression_points(model_int)
regression_points

# A tibble: 182 × 6
      ID fert_rate life_exp income              fert_rate_hat residual
   <int>     <dbl>    <dbl> <fct>                       <dbl>    <dbl>
 1     1       4.3     53.6 Low income                   5.56   -1.26 
 2     2       1.4     79.5 Upper middle income          1.50   -0.098
 3     3       2.7     78.0 Lower middle income          2.16    0.544
 4     4       5       62.1 Lower middle income          3.84    1.16 
 5     5       1.6     77.8 High income                  1.74   -0.139
 6     6       1.9     78.3 Upper middle income          1.62    0.278
 7     7       1.6     76.1 Upper middle income          1.86   -0.256
 8     8       1.6     83.1 High income                  1.50    0.105
 9     9       1.5     82.3 High income                  1.53   -0.033
10    10       1.6     74.2 Upper middle income          2.07   -0.469
# ℹ 172 more rows

Exercise 1

use palmerpenguins package, find out “penguins” data
create scatter plot with 3 linear lines according to species

Exercise 1 code

library(palmerpenguins)

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
  geom_point() +
  geom_smooth(method = "lm")

# Just one line
# ggplot(
#   data = penguins,
#   mapping = aes(x = flipper_length_mm, y = body_mass_g)
# ) +
#   geom_point(mapping = aes(color = species)) +
#   geom_smooth(method = "lm")

Credit data in ISLR2

6.2 Two numerical explanatory variables

as_tibble() convert this data frame to be a tibble

library(ISLR2)

credit_ch6 <- Credit |> 
  as_tibble() |> 
  select(debt = Balance, credit_limit = Limit, 
         income = Income, credit_rating = Rating, age = Age)

glimpse(credit_ch6)

Rows: 400
Columns: 5
$ debt          <dbl> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, 1407…
$ credit_limit  <dbl> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300, 68…
$ income        <dbl> 14.891, 106.025, 104.593, 148.924, 55.882, 80.180, 20.99…
$ credit_rating <dbl> 283, 483, 514, 681, 357, 569, 259, 512, 266, 491, 589, 1…
$ age           <dbl> 34, 82, 71, 36, 68, 77, 37, 87, 66, 41, 30, 64, 57, 49, …

Credit data summary

credit_ch6 |>
  select(debt, credit_limit, income) |>
  tidy_summary()

# A tibble: 3 × 11
  column           n group type    min     Q1   mean median     Q3    max     sd
  <chr>        <int> <chr> <chr> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 debt           400 <NA>  nume…   0     68.8  520.   460.   863    1999   460. 
2 credit_limit   400 <NA>  nume… 855   3088   4736.  4622.  5873.  13913  2308. 
3 income         400 <NA>  nume…  10.4   21.0   45.2   33.1   57.5   187.   35.2

Debt and 2 numerical variable

plot3 <- ggplot(credit_ch6, aes(x = credit_limit, y = debt)) +
  geom_point() +
  labs(x = "Credit limit (in $)", y = "Credit card debt (in $)", 
       title = "Debt and credit limit") +
  geom_smooth(method = "lm", se = FALSE)

plot4 <- ggplot(credit_ch6, aes(x = income, y = debt)) +
  geom_point() +
  labs(x = "Income (in $1000)", y = "Credit card debt (in $)", 
       title = "Debt and income") +
  geom_smooth(method = "lm", se = FALSE)

Debt and 2 numerical variable

Get correlation

credit_ch6 |> 
  get_correlation(debt ~ credit_limit)

# A tibble: 1 × 1
    cor
  <dbl>
1 0.862

credit_ch6 |> 
  get_correlation(debt ~ income)

# A tibble: 1 × 1
    cor
  <dbl>
1 0.464

Get correlation

using the select() verb and command cor() we can find all correlations simultaneously by returning a correlation matrix

credit_ch6 |>
  select(debt, credit_limit, income) |> 
  cor()

                  debt credit_limit    income
debt         1.0000000    0.8616973 0.4636565
credit_limit 0.8616973    1.0000000 0.7920883
income       0.4636565    0.7920883 1.0000000

Correlation stay the same with scale one side

credit_ch6 |> 
  get_correlation(debt ~ 1000 * income)

# A tibble: 1 × 1
    cor
  <dbl>
1 0.464

Regression model without interaction

We start with a model with no interactions for the two numerical explanatory variables

y ~ x1 + x2

debt_model <- lm(debt ~ credit_limit + income, data = credit_ch6)
coef(debt_model)

 (Intercept) credit_limit       income 
-385.1792604    0.2643216   -7.6633230

Residual table

get_regression_points(debt_model)

# A tibble: 400 × 6
      ID  debt credit_limit income debt_hat residual
   <int> <dbl>        <dbl>  <dbl>    <dbl>    <dbl>
 1     1   333         3606   14.9     454.   -121. 
 2     2   903         6645  106.      559.    344. 
 3     3   580         7075  105.      683.   -103. 
 4     4   964         9504  149.      986.    -21.7
 5     5   331         4897   55.9     481.   -150. 
 6     6  1151         8047   80.2    1127.     23.6
 7     7   203         3388   21.0     349.   -146. 
 8     8   872         7114   71.4     948.    -76.0
 9     9   279         3300   15.1     371.    -92.2
10    10  1350         6819   71.1     873.    477. 
# ℹ 390 more rows

Regression model with interaction

y ~ x1 * x2

debt_int_model <- lm(debt ~ credit_limit * income, data = credit_ch6)
coef(debt_int_model)

        (Intercept)        credit_limit              income credit_limit:income 
      -3.070868e+02        2.523858e-01       -9.785183e+00        2.671507e-04

get_regression_points(debt_int_model)

# A tibble: 400 × 6
      ID  debt credit_limit income debt_hat residual
   <int> <dbl>        <dbl>  <dbl>    <dbl>    <dbl>
 1     1   333         3606   14.9     472.   -139. 
 2     2   903         6645  106.      521.    382. 
 3     3   580         7075  105.      653.    -72.8
 4     4   964         9504  149.     1012.    -48.5
 5     5   331         4897   55.9     455.   -124. 
 6     6  1151         8047   80.2    1112.     39.3
 7     7   203         3388   21.0     362.   -159. 
 8     8   872         7114   71.4     925.    -53.4
 9     9   279         3300   15.1     391.   -112. 
10    10  1350         6819   71.1     848.    502. 
# ℹ 390 more rows

Simple Model

# Fit regression model and get the coefficients of the model
simple_model <- lm(debt ~ income, data = credit_ch6)
coef(simple_model)

(Intercept)      income 
 246.514751    6.048363

Exercise 2

continue with penguin data set
can you show the correlation between flipper length, bill length and body mass?
flipper length, bill length which is the key for body mass?
can you show the regression model?
do you think gender impact the body mass?

Exercise 2 Code

pg <- penguins |> 
  na.omit()
  
pg_cor <- pg |>
  select(flipper_length_mm, body_mass_g, bill_length_mm) |>
  cor ()
pg_cor

                  flipper_length_mm body_mass_g bill_length_mm
flipper_length_mm         1.0000000   0.8729789      0.6530956
body_mass_g               0.8729789   1.0000000      0.5894511
bill_length_mm            0.6530956   0.5894511      1.0000000

pg_mass_model <- lm(body_mass_g ~ flipper_length_mm + bill_length_mm, data=pg)
coef(pg_mass_model)

      (Intercept) flipper_length_mm    bill_length_mm 
     -5836.298732         48.889692          4.958601

get_regression_points(pg_mass_model)

# A tibble: 333 × 6
      ID body_mass_g flipper_length_mm bill_length_mm body_mass_g_hat residual
   <int>       <int>             <int>          <dbl>           <dbl>    <dbl>
 1     1        3750               181           39.1           3207.   543.  
 2     2        3800               186           39.5           3453.   347.  
 3     3        3250               195           40.3           3897.  -647.  
 4     4        3450               193           36.7           3781.  -331.  
 5     5        3650               190           39.3           3648.     2.38
 6     6        3625               181           38.9           3206.   419.  
 7     7        4675               195           39.2           3892.   783.  
 8     8        3200               182           41.1           3265.   -65.4 
 9     9        3800               191           38.6           3693.   107.  
10    10        4400               198           34.6           4015.   385.  
# ℹ 323 more rows

# pg_mass_int_model <- lm(body_mass_g ~ flipper_length_mm * bill_length_mm, data=pg)
# coef(pg_mass_int_model )

pg_spe_model <- lm(body_mass_g ~ species, data=pg)
coef(pg_spe_model)

     (Intercept) speciesChinstrap    speciesGentoo 
      3706.16438         26.92385       1386.27259

pg_sex_model <- lm(body_mass_g ~ sex, data=pg)
coef(pg_sex_model)

(Intercept)     sexmale 
  3862.2727    683.4118