Data Science and statistics with RStudio

Ch 6 Multiple Linear Regression

Esther Chen

Needed Package

library(tidyverse)
library(moderndive)
library(ISLR2)
library(gridExtra)

data(package = "ISLR2")
# go to cran to search ISLR2

6.1.1 Data analysis

factor () converts a categorical variable into a factor, allowing for ordered levels and proper handling in analysis, visualization, and modeling.

UN_data_ch6 <- un_member_states_2024 |>
  select(country, 
         life_expectancy_2022, 
         fertility_rate_2022, 
         income_group_2024)|>
  na.omit()|>
  rename(life_exp = life_expectancy_2022, 
         fert_rate = fertility_rate_2022, 
         income = income_group_2024)|>
  mutate(income = factor(income, 
                         levels = c("Low income", "Lower middle income", 
                                    "Upper middle income", "High income")))

Back to previous class

gapminder2022 <- un_member_states_2024 |>
  select(country, life_exp = life_expectancy_2022, continent, gdp_per_capita) |> 
  na.omit()


gapminder2022_new <- gapminder2022 |>
  mutate(new_cont = factor(continent, levels = 
                             c("Europe", "Asia", "North America", 
                              "South America", "Oceania", "Africa")))

new_exp_model <- lm(life_exp ~ new_cont, data = gapminder2022_new)
coef(new_exp_model)
          (Intercept)          new_contAsia new_contNorth America 
            79.907674             -4.957902             -3.612457 
new_contSouth America       new_contOceania        new_contAfrica 
            -4.680174             -5.491246            -13.597867 

Un_data_ch6

Check the income already changed

glimpse(UN_data_ch6)
Rows: 182
Columns: 4
$ country   <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Antigua and …
$ life_exp  <dbl> 53.65, 79.47, 78.03, 62.11, 77.80, 78.31, 76.13, 83.09, 82.2…
$ fert_rate <dbl> 4.3, 1.4, 2.7, 5.0, 1.6, 1.9, 1.6, 1.6, 1.5, 1.6, 1.4, 1.8, …
$ income    <fct> Low income, Upper middle income, Lower middle income, Lower …

Sample

UN_data_ch6 |> 
  slice_sample(n = 10)
# A tibble: 10 × 4
   country             life_exp fert_rate income             
   <chr>                  <dbl>     <dbl> <fct>              
 1 Uruguay                 78.4       1.5 High income        
 2 Vietnam                 75.5       1.9 Lower middle income
 3 North Macedonia         76.8       1.4 Upper middle income
 4 Antigua and Barbuda     77.8       1.6 High income        
 5 Ecuador                 78         2   Upper middle income
 6 San Marino              83.9       1.2 High income        
 7 Philippines             70.1       2.7 Lower middle income
 8 Sierra Leone            58.8       3.7 Low income         
 9 United Kingdom          81.9       1.6 High income        
10 Belize                  75.8       2   Upper middle income
  # sample_n(size = 10)

Summary

UN_data_ch6 |> 
  select(life_exp, fert_rate, income) |> 
  tidy_summary()
# A tibble: 6 × 11
  column        n group         type    min    Q1  mean median    Q3   max    sd
  <chr>     <int> <chr>         <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
1 life_exp    182 <NA>          nume…  53.6  69.4 73.7    75.2  78.4  86.4  6.86
2 fert_rate   182 <NA>          nume…   0.9   1.6  2.49    2     3.2   6.6  1.16
3 income       25 Low income    fact…  NA    NA   NA      NA    NA    NA   NA   
4 income       52 Lower middle… fact…  NA    NA   NA      NA    NA    NA   NA   
5 income       49 Upper middle… fact…  NA    NA   NA      NA    NA    NA   NA   
6 income       56 High income   fact…  NA    NA   NA      NA    NA    NA   NA   

Get correlation

UN_data_ch6 |> 
  get_correlation(formula = fert_rate ~ life_exp)
# A tibble: 1 × 1
     cor
   <dbl>
1 -0.815

Single Line

single <- ggplot(UN_data_ch6, aes(x = life_exp, y = fert_rate)) +
  geom_point(aes(color = income)) +  
  labs(x = "Life Expectancy", y = "Fertility Rate", color = "Income group") +
  geom_smooth(method = "lm", se = FALSE, color = "black")  +
  theme_gray() +
  theme(legend.position = "none")

Single Line

Multi lines

multi <- ggplot(UN_data_ch6, aes(x = life_exp, y = fert_rate, color = income)) +
  geom_point() +
  labs(x = "Life Expectancy", y = "Fertility Rate", color = "Income group") +
  geom_smooth(method = "lm", se = FALSE) +
  theme_gray()+ 
  theme(legend.text = element_text(size = 8),   
  legend.title = element_text(size = 10),  
  legend.key.size = unit(0.3, "cm"),         
  legend.spacing.y = unit(0.1, "cm")) 

Multi lines

Compare

legend <- gtable::gtable_filter(ggplotGrob(multi), "guide-box")

multi <- multi + theme(legend.position = "none")
single_grob <- ggplotGrob(single)
multi_grob <- ggplotGrob(multi)
grid.arrange(
  arrangeGrob(single_grob, multi_grob, ncol = 2),  # Arrange two plots
  legend,   # Place legend below
  ncol = 1,  # One column layout
  heights = c(4, 1)  # Give more space to the plots than the legend
)

The average by income group

Low income, is the “baseline” group.

Similarly, the average fertility rate for Upper middle income member states is 4.28 + -2.25 = 2.03.

one_factor_model <- lm(fert_rate ~ income, data = UN_data_ch6)
coef(one_factor_model)
              (Intercept) incomeLower middle income incomeUpper middle income 
                 4.280000                 -1.299231                 -2.247347 
        incomeHigh income 
                -2.653214 

Interaction Effect

Observe how the intercept and the slope are different for a High income member state when compared to the baseline Low income member state.

Mutiple linear regression

  • y ~ x1 + x2 + x1x2

  • we can also write y ~ x1 * x2 as the * sign accounts for both

# Fit regression model and get the coefficients of the model
model_int <- lm(fert_rate ~ life_exp * income, data = UN_data_ch6)
# coef(model_int)

coef_table <- as.data.frame(coef(model_int))
arrange(coef_table)
                                   coef(model_int)
(Intercept)                            11.91826460
life_exp                               -0.11848075
incomeLower middle income              -1.50420617
incomeUpper middle income              -1.89291757
incomeHigh income                      -6.57953685
life_exp:incomeLower middle income      0.01265027
life_exp:incomeUpper middle income      0.01117481
life_exp:incomeHigh income              0.07221696

How to read?

Income Low? Income high?

                                   coef(model_int)
(Intercept)                            11.91826460
life_exp                               -0.11848075
incomeLower middle income              -1.50420617
incomeUpper middle income              -1.89291757
incomeHigh income                      -6.57953685
life_exp:incomeLower middle income      0.01265027
life_exp:incomeUpper middle income      0.01117481
life_exp:incomeHigh income              0.07221696

6.1.3 Model without interactions

geom_parallel_slopes()

multi_wt <- ggplot(UN_data_ch6, aes(x = life_exp, y = fert_rate, color = income)) +
  geom_point() +
  labs(x = "Life expectancy", y = "Fertility rate", color = "Income group") +
  geom_parallel_slopes(se = FALSE)+
  theme_gray() +
  theme(legend.position = "none")

Compare with and without

Residual table

regression_points <- get_regression_points(model_int)
regression_points
# A tibble: 182 × 6
      ID fert_rate life_exp income              fert_rate_hat residual
   <int>     <dbl>    <dbl> <fct>                       <dbl>    <dbl>
 1     1       4.3     53.6 Low income                   5.56   -1.26 
 2     2       1.4     79.5 Upper middle income          1.50   -0.098
 3     3       2.7     78.0 Lower middle income          2.16    0.544
 4     4       5       62.1 Lower middle income          3.84    1.16 
 5     5       1.6     77.8 High income                  1.74   -0.139
 6     6       1.9     78.3 Upper middle income          1.62    0.278
 7     7       1.6     76.1 Upper middle income          1.86   -0.256
 8     8       1.6     83.1 High income                  1.50    0.105
 9     9       1.5     82.3 High income                  1.53   -0.033
10    10       1.6     74.2 Upper middle income          2.07   -0.469
# ℹ 172 more rows

Exercise 1

  • use palmerpenguins package, find out “penguins” data

  • create scatter plot with 3 linear lines according to species

Exercise 1 code

library(palmerpenguins)

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
  geom_point() +
  geom_smooth(method = "lm")
# Just one line
# ggplot(
#   data = penguins,
#   mapping = aes(x = flipper_length_mm, y = body_mass_g)
# ) +
#   geom_point(mapping = aes(color = species)) +
#   geom_smooth(method = "lm")

Credit data in ISLR2

6.2 Two numerical explanatory variables

as_tibble() convert this data frame to be a tibble

library(ISLR2)

credit_ch6 <- Credit |> 
  as_tibble() |> 
  select(debt = Balance, credit_limit = Limit, 
         income = Income, credit_rating = Rating, age = Age)

glimpse(credit_ch6)
Rows: 400
Columns: 5
$ debt          <dbl> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, 1407…
$ credit_limit  <dbl> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300, 68…
$ income        <dbl> 14.891, 106.025, 104.593, 148.924, 55.882, 80.180, 20.99…
$ credit_rating <dbl> 283, 483, 514, 681, 357, 569, 259, 512, 266, 491, 589, 1…
$ age           <dbl> 34, 82, 71, 36, 68, 77, 37, 87, 66, 41, 30, 64, 57, 49, …

Credit data summary

credit_ch6 |>
  select(debt, credit_limit, income) |>
  tidy_summary()
# A tibble: 3 × 11
  column           n group type    min     Q1   mean median     Q3    max     sd
  <chr>        <int> <chr> <chr> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 debt           400 <NA>  nume…   0     68.8  520.   460.   863    1999   460. 
2 credit_limit   400 <NA>  nume… 855   3088   4736.  4622.  5873.  13913  2308. 
3 income         400 <NA>  nume…  10.4   21.0   45.2   33.1   57.5   187.   35.2

Debt and 2 numerical variable

plot3 <- ggplot(credit_ch6, aes(x = credit_limit, y = debt)) +
  geom_point() +
  labs(x = "Credit limit (in $)", y = "Credit card debt (in $)", 
       title = "Debt and credit limit") +
  geom_smooth(method = "lm", se = FALSE)

plot4 <- ggplot(credit_ch6, aes(x = income, y = debt)) +
  geom_point() +
  labs(x = "Income (in $1000)", y = "Credit card debt (in $)", 
       title = "Debt and income") +
  geom_smooth(method = "lm", se = FALSE)

Debt and 2 numerical variable

Get correlation

credit_ch6 |> 
  get_correlation(debt ~ credit_limit)
# A tibble: 1 × 1
    cor
  <dbl>
1 0.862
credit_ch6 |> 
  get_correlation(debt ~ income)
# A tibble: 1 × 1
    cor
  <dbl>
1 0.464

Get correlation

using the select() verb and command cor() we can find all correlations simultaneously by returning a correlation matrix

credit_ch6 |>
  select(debt, credit_limit, income) |> 
  cor()
                  debt credit_limit    income
debt         1.0000000    0.8616973 0.4636565
credit_limit 0.8616973    1.0000000 0.7920883
income       0.4636565    0.7920883 1.0000000

Correlation stay the same with scale one side

credit_ch6 |> 
  get_correlation(debt ~ 1000 * income)
# A tibble: 1 × 1
    cor
  <dbl>
1 0.464

Regression model without interaction

We start with a model with no interactions for the two numerical explanatory variables

y ~ x1 + x2

debt_model <- lm(debt ~ credit_limit + income, data = credit_ch6)
coef(debt_model)
 (Intercept) credit_limit       income 
-385.1792604    0.2643216   -7.6633230 

Residual table

get_regression_points(debt_model)
# A tibble: 400 × 6
      ID  debt credit_limit income debt_hat residual
   <int> <dbl>        <dbl>  <dbl>    <dbl>    <dbl>
 1     1   333         3606   14.9     454.   -121. 
 2     2   903         6645  106.      559.    344. 
 3     3   580         7075  105.      683.   -103. 
 4     4   964         9504  149.      986.    -21.7
 5     5   331         4897   55.9     481.   -150. 
 6     6  1151         8047   80.2    1127.     23.6
 7     7   203         3388   21.0     349.   -146. 
 8     8   872         7114   71.4     948.    -76.0
 9     9   279         3300   15.1     371.    -92.2
10    10  1350         6819   71.1     873.    477. 
# ℹ 390 more rows

Regression model with interaction

y ~ x1 * x2

debt_int_model <- lm(debt ~ credit_limit * income, data = credit_ch6)
coef(debt_int_model)
        (Intercept)        credit_limit              income credit_limit:income 
      -3.070868e+02        2.523858e-01       -9.785183e+00        2.671507e-04 
get_regression_points(debt_int_model)
# A tibble: 400 × 6
      ID  debt credit_limit income debt_hat residual
   <int> <dbl>        <dbl>  <dbl>    <dbl>    <dbl>
 1     1   333         3606   14.9     472.   -139. 
 2     2   903         6645  106.      521.    382. 
 3     3   580         7075  105.      653.    -72.8
 4     4   964         9504  149.     1012.    -48.5
 5     5   331         4897   55.9     455.   -124. 
 6     6  1151         8047   80.2    1112.     39.3
 7     7   203         3388   21.0     362.   -159. 
 8     8   872         7114   71.4     925.    -53.4
 9     9   279         3300   15.1     391.   -112. 
10    10  1350         6819   71.1     848.    502. 
# ℹ 390 more rows

Simple Model

# Fit regression model and get the coefficients of the model
simple_model <- lm(debt ~ income, data = credit_ch6)
coef(simple_model)
(Intercept)      income 
 246.514751    6.048363 

Exercise 2

Exercise 2

  • continue with penguin data set

  • can you show the correlation between flipper length, bill length and body mass?

  • flipper length, bill length which is the key for body mass?

  • can you show the regression model?

  • do you think gender impact the body mass?

Exercise 2 Code

pg <- penguins |> 
  na.omit()
  
pg_cor <- pg |>
  select(flipper_length_mm, body_mass_g, bill_length_mm) |>
  cor ()
pg_cor
                  flipper_length_mm body_mass_g bill_length_mm
flipper_length_mm         1.0000000   0.8729789      0.6530956
body_mass_g               0.8729789   1.0000000      0.5894511
bill_length_mm            0.6530956   0.5894511      1.0000000
pg_mass_model <- lm(body_mass_g ~ flipper_length_mm + bill_length_mm, data=pg)
coef(pg_mass_model)
      (Intercept) flipper_length_mm    bill_length_mm 
     -5836.298732         48.889692          4.958601 
get_regression_points(pg_mass_model)
# A tibble: 333 × 6
      ID body_mass_g flipper_length_mm bill_length_mm body_mass_g_hat residual
   <int>       <int>             <int>          <dbl>           <dbl>    <dbl>
 1     1        3750               181           39.1           3207.   543.  
 2     2        3800               186           39.5           3453.   347.  
 3     3        3250               195           40.3           3897.  -647.  
 4     4        3450               193           36.7           3781.  -331.  
 5     5        3650               190           39.3           3648.     2.38
 6     6        3625               181           38.9           3206.   419.  
 7     7        4675               195           39.2           3892.   783.  
 8     8        3200               182           41.1           3265.   -65.4 
 9     9        3800               191           38.6           3693.   107.  
10    10        4400               198           34.6           4015.   385.  
# ℹ 323 more rows
# pg_mass_int_model <- lm(body_mass_g ~ flipper_length_mm * bill_length_mm, data=pg)
# coef(pg_mass_int_model )

pg_spe_model <- lm(body_mass_g ~ species, data=pg)
coef(pg_spe_model)
     (Intercept) speciesChinstrap    speciesGentoo 
      3706.16438         26.92385       1386.27259 
pg_sex_model <- lm(body_mass_g ~ sex, data=pg)
coef(pg_sex_model)
(Intercept)     sexmale 
  3862.2727    683.4118