Ch 6 Multiple Linear Regression
factor () converts a categorical variable into a factor, allowing for ordered levels and proper handling in analysis, visualization, and modeling.
UN_data_ch6 <- un_member_states_2024 |>
select(country,
life_expectancy_2022,
fertility_rate_2022,
income_group_2024)|>
na.omit()|>
rename(life_exp = life_expectancy_2022,
fert_rate = fertility_rate_2022,
income = income_group_2024)|>
mutate(income = factor(income,
levels = c("Low income", "Lower middle income",
"Upper middle income", "High income")))gapminder2022 <- un_member_states_2024 |>
select(country, life_exp = life_expectancy_2022, continent, gdp_per_capita) |>
na.omit()
gapminder2022_new <- gapminder2022 |>
mutate(new_cont = factor(continent, levels =
c("Europe", "Asia", "North America",
"South America", "Oceania", "Africa")))
new_exp_model <- lm(life_exp ~ new_cont, data = gapminder2022_new)
coef(new_exp_model) (Intercept) new_contAsia new_contNorth America
79.907674 -4.957902 -3.612457
new_contSouth America new_contOceania new_contAfrica
-4.680174 -5.491246 -13.597867
Check the income already changed
Rows: 182
Columns: 4
$ country <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Antigua and …
$ life_exp <dbl> 53.65, 79.47, 78.03, 62.11, 77.80, 78.31, 76.13, 83.09, 82.2…
$ fert_rate <dbl> 4.3, 1.4, 2.7, 5.0, 1.6, 1.9, 1.6, 1.6, 1.5, 1.6, 1.4, 1.8, …
$ income <fct> Low income, Upper middle income, Lower middle income, Lower …
# A tibble: 10 × 4
country life_exp fert_rate income
<chr> <dbl> <dbl> <fct>
1 Uruguay 78.4 1.5 High income
2 Vietnam 75.5 1.9 Lower middle income
3 North Macedonia 76.8 1.4 Upper middle income
4 Antigua and Barbuda 77.8 1.6 High income
5 Ecuador 78 2 Upper middle income
6 San Marino 83.9 1.2 High income
7 Philippines 70.1 2.7 Lower middle income
8 Sierra Leone 58.8 3.7 Low income
9 United Kingdom 81.9 1.6 High income
10 Belize 75.8 2 Upper middle income
# A tibble: 6 × 11
column n group type min Q1 mean median Q3 max sd
<chr> <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 life_exp 182 <NA> nume… 53.6 69.4 73.7 75.2 78.4 86.4 6.86
2 fert_rate 182 <NA> nume… 0.9 1.6 2.49 2 3.2 6.6 1.16
3 income 25 Low income fact… NA NA NA NA NA NA NA
4 income 52 Lower middle… fact… NA NA NA NA NA NA NA
5 income 49 Upper middle… fact… NA NA NA NA NA NA NA
6 income 56 High income fact… NA NA NA NA NA NA NA
multi <- ggplot(UN_data_ch6, aes(x = life_exp, y = fert_rate, color = income)) +
geom_point() +
labs(x = "Life Expectancy", y = "Fertility Rate", color = "Income group") +
geom_smooth(method = "lm", se = FALSE) +
theme_gray()+
theme(legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
legend.key.size = unit(0.3, "cm"),
legend.spacing.y = unit(0.1, "cm")) legend <- gtable::gtable_filter(ggplotGrob(multi), "guide-box")
multi <- multi + theme(legend.position = "none")
single_grob <- ggplotGrob(single)
multi_grob <- ggplotGrob(multi)
grid.arrange(
arrangeGrob(single_grob, multi_grob, ncol = 2), # Arrange two plots
legend, # Place legend below
ncol = 1, # One column layout
heights = c(4, 1) # Give more space to the plots than the legend
)Low income, is the “baseline” group.
Similarly, the average fertility rate for Upper middle income member states is 4.28 + -2.25 = 2.03.
Observe how the intercept and the slope are different for a High income member state when compared to the baseline Low income member state.
y ~ x1 + x2 + x1x2
we can also write y ~ x1 * x2 as the * sign accounts for both
# Fit regression model and get the coefficients of the model
model_int <- lm(fert_rate ~ life_exp * income, data = UN_data_ch6)
# coef(model_int)
coef_table <- as.data.frame(coef(model_int))
arrange(coef_table) coef(model_int)
(Intercept) 11.91826460
life_exp -0.11848075
incomeLower middle income -1.50420617
incomeUpper middle income -1.89291757
incomeHigh income -6.57953685
life_exp:incomeLower middle income 0.01265027
life_exp:incomeUpper middle income 0.01117481
life_exp:incomeHigh income 0.07221696
Income Low? Income high?
coef(model_int)
(Intercept) 11.91826460
life_exp -0.11848075
incomeLower middle income -1.50420617
incomeUpper middle income -1.89291757
incomeHigh income -6.57953685
life_exp:incomeLower middle income 0.01265027
life_exp:incomeUpper middle income 0.01117481
life_exp:incomeHigh income 0.07221696
geom_parallel_slopes()
# A tibble: 182 × 6
ID fert_rate life_exp income fert_rate_hat residual
<int> <dbl> <dbl> <fct> <dbl> <dbl>
1 1 4.3 53.6 Low income 5.56 -1.26
2 2 1.4 79.5 Upper middle income 1.50 -0.098
3 3 2.7 78.0 Lower middle income 2.16 0.544
4 4 5 62.1 Lower middle income 3.84 1.16
5 5 1.6 77.8 High income 1.74 -0.139
6 6 1.9 78.3 Upper middle income 1.62 0.278
7 7 1.6 76.1 Upper middle income 1.86 -0.256
8 8 1.6 83.1 High income 1.50 0.105
9 9 1.5 82.3 High income 1.53 -0.033
10 10 1.6 74.2 Upper middle income 2.07 -0.469
# ℹ 172 more rows
use palmerpenguins package, find out “penguins” data
create scatter plot with 3 linear lines according to species
as_tibble() convert this data frame to be a tibble
library(ISLR2)
credit_ch6 <- Credit |>
as_tibble() |>
select(debt = Balance, credit_limit = Limit,
income = Income, credit_rating = Rating, age = Age)
glimpse(credit_ch6)Rows: 400
Columns: 5
$ debt <dbl> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, 1407…
$ credit_limit <dbl> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300, 68…
$ income <dbl> 14.891, 106.025, 104.593, 148.924, 55.882, 80.180, 20.99…
$ credit_rating <dbl> 283, 483, 514, 681, 357, 569, 259, 512, 266, 491, 589, 1…
$ age <dbl> 34, 82, 71, 36, 68, 77, 37, 87, 66, 41, 30, 64, 57, 49, …
# A tibble: 3 × 11
column n group type min Q1 mean median Q3 max sd
<chr> <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 debt 400 <NA> nume… 0 68.8 520. 460. 863 1999 460.
2 credit_limit 400 <NA> nume… 855 3088 4736. 4622. 5873. 13913 2308.
3 income 400 <NA> nume… 10.4 21.0 45.2 33.1 57.5 187. 35.2
plot3 <- ggplot(credit_ch6, aes(x = credit_limit, y = debt)) +
geom_point() +
labs(x = "Credit limit (in $)", y = "Credit card debt (in $)",
title = "Debt and credit limit") +
geom_smooth(method = "lm", se = FALSE)
plot4 <- ggplot(credit_ch6, aes(x = income, y = debt)) +
geom_point() +
labs(x = "Income (in $1000)", y = "Credit card debt (in $)",
title = "Debt and income") +
geom_smooth(method = "lm", se = FALSE)using the select() verb and command cor() we can find all correlations simultaneously by returning a correlation matrix
We start with a model with no interactions for the two numerical explanatory variables
y ~ x1 + x2
# A tibble: 400 × 6
ID debt credit_limit income debt_hat residual
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 333 3606 14.9 454. -121.
2 2 903 6645 106. 559. 344.
3 3 580 7075 105. 683. -103.
4 4 964 9504 149. 986. -21.7
5 5 331 4897 55.9 481. -150.
6 6 1151 8047 80.2 1127. 23.6
7 7 203 3388 21.0 349. -146.
8 8 872 7114 71.4 948. -76.0
9 9 279 3300 15.1 371. -92.2
10 10 1350 6819 71.1 873. 477.
# ℹ 390 more rows
y ~ x1 * x2
(Intercept) credit_limit income credit_limit:income
-3.070868e+02 2.523858e-01 -9.785183e+00 2.671507e-04
# A tibble: 400 × 6
ID debt credit_limit income debt_hat residual
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 333 3606 14.9 472. -139.
2 2 903 6645 106. 521. 382.
3 3 580 7075 105. 653. -72.8
4 4 964 9504 149. 1012. -48.5
5 5 331 4897 55.9 455. -124.
6 6 1151 8047 80.2 1112. 39.3
7 7 203 3388 21.0 362. -159.
8 8 872 7114 71.4 925. -53.4
9 9 279 3300 15.1 391. -112.
10 10 1350 6819 71.1 848. 502.
# ℹ 390 more rows
continue with penguin data set
can you show the correlation between flipper length, bill length and body mass?
flipper length, bill length which is the key for body mass?
can you show the regression model?
do you think gender impact the body mass?
pg <- penguins |>
na.omit()
pg_cor <- pg |>
select(flipper_length_mm, body_mass_g, bill_length_mm) |>
cor ()
pg_cor flipper_length_mm body_mass_g bill_length_mm
flipper_length_mm 1.0000000 0.8729789 0.6530956
body_mass_g 0.8729789 1.0000000 0.5894511
bill_length_mm 0.6530956 0.5894511 1.0000000
(Intercept) flipper_length_mm bill_length_mm
-5836.298732 48.889692 4.958601
# A tibble: 333 × 6
ID body_mass_g flipper_length_mm bill_length_mm body_mass_g_hat residual
<int> <int> <int> <dbl> <dbl> <dbl>
1 1 3750 181 39.1 3207. 543.
2 2 3800 186 39.5 3453. 347.
3 3 3250 195 40.3 3897. -647.
4 4 3450 193 36.7 3781. -331.
5 5 3650 190 39.3 3648. 2.38
6 6 3625 181 38.9 3206. 419.
7 7 4675 195 39.2 3892. 783.
8 8 3200 182 41.1 3265. -65.4
9 9 3800 191 38.6 3693. 107.
10 10 4400 198 34.6 4015. 385.
# ℹ 323 more rows
# pg_mass_int_model <- lm(body_mass_g ~ flipper_length_mm * bill_length_mm, data=pg)
# coef(pg_mass_int_model )
pg_spe_model <- lm(body_mass_g ~ species, data=pg)
coef(pg_spe_model) (Intercept) speciesChinstrap speciesGentoo
3706.16438 26.92385 1386.27259
(Intercept) sexmale
3862.2727 683.4118