Data Science and statistics with RStudio

Ch 5 Simple Linear Regression

Esther Chen

Needed Package

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(moderndive)

Subpackages

ggplot2 for data visualization
dplyr for data wrangling
tidyr for converting data to “tidy” format
readr for importing spreadsheet data into R
As well as the more advanced purrr, tibble, stringr, and forcats packages

Moderndive Package

Always checking your data

data(package="moderndive")
# summary(un_member_states_2024)
# ?un_member_states_2024
# head(un_member_states_2024)
# glimpse(un_member_states_2024)

# in_rec <- un_member_states_2024 |> 
#   filter(country=="Denmark")
# in_rec

Un_memeber_states_2024

iso: An identification variable used to distinguish between the 181 countries in the filtered dataset.
fert_rate: A numerical variable representing the country’s fertility rate in 2022 corresponding to the expected number of children born per woman in child-bearing years. This is the outcome variable y of interest.
life_exp: A numerical variable representing the country’s average life expectancy in 2022 in years. This is the primary explanatory variable x of interest.
obes_rate: A numerical variable representing the country’s obesity rate in 2016.

Select key columns

UN_data_ch5 <- un_member_states_2024 |>
  select(iso, 
         life_exp = life_expectancy_2022, 
         fert_rate = fertility_rate_2022, 
         obes_rate = obesity_rate_2016)

# anyDuplicated(UN_data_ch5$iso)
# select() already can rename the select columns directly

Selected table

UN_data_ch5

# A tibble: 193 × 4
   iso   life_exp fert_rate obes_rate
   <chr>    <dbl>     <dbl>     <dbl>
 1 AFG       53.6       4.3       5.5
 2 ALB       79.5       1.4      21.7
 3 DZA       78.0       2.7      27.4
 4 AND       83.4      NA        25.6
 5 AGO       62.1       5         8.2
 6 ATG       77.8       1.6      18.9
 7 ARG       78.3       1.9      28.3
 8 ARM       76.1       1.6      20.2
 9 AUS       83.1       1.6      29  
10 AUT       82.3       1.5      20.1
# ℹ 183 more rows

# check if has na data
# anyNA(UN_data_ch5)

How to get ride of NA data?

na.omit()

UN_data_ch5 <- un_member_states_2024 |>
  select(iso, 
         life_exp = life_expectancy_2022, 
         fert_rate = fertility_rate_2022, 
         obes_rate = obesity_rate_2016)|>
  na.omit()

glimpse(UN_data_ch5)

Rows: 181
Columns: 4
$ iso       <chr> "AFG", "ALB", "DZA", "AGO", "ATG", "ARG", "ARM", "AUS", "AUT…
$ life_exp  <dbl> 53.65, 79.47, 78.03, 62.11, 77.80, 78.31, 76.13, 83.09, 82.2…
$ fert_rate <dbl> 4.3, 1.4, 2.7, 5.0, 1.6, 1.9, 1.6, 1.6, 1.5, 1.6, 1.4, 1.8, …
$ obes_rate <dbl> 5.5, 21.7, 27.4, 8.2, 18.9, 28.3, 20.2, 29.0, 20.1, 19.9, 31…

Random Sample

slice_sample()

# assume from 193 countries we want to have 5 random sample

UN_data_ch5 |>
  slice_sample(n = 5)

# A tibble: 5 × 4
  iso   life_exp fert_rate obes_rate
  <chr>    <dbl>     <dbl>     <dbl>
1 COL       74.9       1.7      22.3
2 PER       68.9       2.1      19.7
3 CAN       83.8       1.5      29.4
4 AUS       83.1       1.6      29  
5 LKA       78         1.9       5.2

# UN_data_ch5 |>
# sample_n(size = 5)

Summarize mean and mdeian

One way we can do:

summarize ()

UN_data_ch5 |>
  summarize(mean_life_exp = mean(life_exp), 
            mean_fert_rate = mean(fert_rate),
            median_life_exp = median(life_exp), 
            median_fert_rate = median(fert_rate))

# A tibble: 1 × 4
  mean_life_exp mean_fert_rate median_life_exp median_fert_rate
          <dbl>          <dbl>           <dbl>            <dbl>
1          73.6           2.50            75.1                2

Summary in easier way with moderndive

tidy_summary ()

UN_data_ch5 |> 
  select(fert_rate, life_exp) |> 
  tidy_summary()

# A tibble: 2 × 11
  column        n group type      min    Q1  mean median    Q3   max    sd
  <chr>     <int> <chr> <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
1 fert_rate   181 <NA>  numeric   1.1   1.6  2.50    2     3.2   6.6  1.15
2 life_exp    181 <NA>  numeric  53.6  69.4 73.6    75.1  78.3  86.4  6.80

# Also can try this way
# UN_data_ch5 |> 
#   tidy_summary(columns = c(fert_rate, life_exp))

Correlation coefficient

get_correlation()

cor()

UN_data_ch5 |> 
  get_correlation(formula = fert_rate ~ life_exp)

# A tibble: 1 × 1
     cor
   <dbl>
1 -0.812

# please also try
# UN_data_ch5 |> 
#   summarize(correlation = cor(fert_rate, life_exp))

Scatter plot

ggplot(UN_data_ch5, 
       aes(x = life_exp, y = fert_rate)) +
  geom_point(alpha = 0.5) +
  labs(x = "Life Expectancy", y = "Fertility Rate")

Scatter plot style

R for data science 2e, section 9, can see different point shape.

Scatter plot style

ggplot(UN_data_ch5, 
       aes(x = life_exp, y = fert_rate)) +
  geom_point(color="lawngreen", fill="snow", shape=23, size=2) +
  labs(x = "Life Expectancy", y = "Fertility Rate")

Best fitting line

geom_smooth(method = “lm”, se = FALSE)

ggplot(UN_data_ch5, aes(x = life_exp, y = fert_rate)) +
  geom_point(alpha = 0.1) +
  labs(x = "Life Expectancy", 
    y = "Fertility Rate",
    title = "Relationship of life expectancy and fertility rate") +
  geom_smooth(method = "lm", se = TRUE)

# the band represent confidence interval (default is 95% in ggplot2)

Best fitting line

lm(y~x, dataframe) get the regression line

coef() get the regression model coeffiencts

# Fit regression model:
demographics_model <- lm(fert_rate ~ life_exp, data = UN_data_ch5)

# Get regression coefficients:
coef(demographics_model)

(Intercept)    life_exp 
 12.5992926  -0.1372879

How to get residuals

get_regression_points ( )

# 𝑦-y_hat

regression_points <- get_regression_points(demographics_model)
regression_points

# A tibble: 181 × 5
      ID fert_rate life_exp fert_rate_hat residual
   <int>     <dbl>    <dbl>         <dbl>    <dbl>
 1     1       4.3     53.6          5.23   -0.934
 2     2       1.4     79.5          1.69   -0.289
 3     3       2.7     78.0          1.89    0.813
 4     4       5       62.1          4.07    0.928
 5     5       1.6     77.8          1.92   -0.318
 6     6       1.9     78.3          1.85    0.052
 7     7       1.6     76.1          2.15   -0.548
 8     8       1.6     83.1          1.19    0.408
 9     9       1.5     82.3          1.30    0.195
10    10       1.6     74.2          2.42   -0.819
# ℹ 171 more rows

Exercise 1

Follow this data table (UN_data_ch5)

do the scatter plot of obes_rate and life_exp
can you find the regression model?
how you like to explain these 2 set of numbers?

# A tibble: 1 × 1
    cor
  <dbl>
1 0.493

ggplot(UN_data_ch5, 
       aes(x = life_exp, y = obes_rate)) +
  geom_point(alpha = 0.5) +
  labs(x = "Life Expectancy", y = "Obesity Rate")+
  geom_smooth(method = "lm", se = TRUE)

test_model <- lm(obes_rate~ life_exp, data = UN_data_ch5)
coef(test_model)

(Intercept)    life_exp 
-33.9428026   0.7235592

Lets find out other factor!

Start up another data set!

Difference between continents?

Ch5.2

Differences between continents
Differences within continents

Exercise 2 (gapminder2022 table)

Continue with un_member_states_2024 as major data set.

name a table gapminder2022
columns need to select from un table
select: country, life_expectancy_2022, continent, gdp_per_capita
rename: life_expectancy_2022 as life_exp
Remove NA data

Rows: 188
Columns: 4
$ country        <chr> "Afghanistan", "Albania", "Algeria", "Andorra", "Angola…
$ life_exp       <dbl> 53.65, 79.47, 78.03, 83.42, 62.11, 77.80, 78.31, 76.13,…
$ continent      <fct> Asia, Europe, Africa, Europe, Africa, North America, So…
$ gdp_per_capita <dbl> 355.7778, 6810.1140, 4342.6380, 41992.7728, 3000.4442, …

Exercise 2 code

gapminder2022 <- un_member_states_2024 |>
  select(country, life_exp = life_expectancy_2022, continent, gdp_per_capita) |> 
  na.omit()

Exercise 3 (summary table)

Follow gapminder 2022 you just created at exercise 1

select life_exp and continent
use tidy_summary () for summary table

# A tibble: 7 × 11
  column        n group         type    min    Q1  mean median    Q3   max    sd
  <chr>     <int> <chr>         <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
1 life_exp    188 <NA>          nume…  53.6  69.4  73.8   75.2  78.4  89.6  6.93
2 continent    52 Africa        fact…  NA    NA    NA     NA    NA    NA   NA   
3 continent    44 Asia          fact…  NA    NA    NA     NA    NA    NA   NA   
4 continent    43 Europe        fact…  NA    NA    NA     NA    NA    NA   NA   
5 continent    23 North America fact…  NA    NA    NA     NA    NA    NA   NA   
6 continent    14 Oceania       fact…  NA    NA    NA     NA    NA    NA   NA   
7 continent    12 South America fact…  NA    NA    NA     NA    NA    NA   NA

Exercise 3 code

gapminder2022 |> 
  select(life_exp, continent) |> 
  tidy_summary()

# A tibble: 7 × 11
  column        n group         type    min    Q1  mean median    Q3   max    sd
  <chr>     <int> <chr>         <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
1 life_exp    188 <NA>          nume…  53.6  69.4  73.8   75.2  78.4  89.6  6.93
2 continent    52 Africa        fact…  NA    NA    NA     NA    NA    NA   NA   
3 continent    44 Asia          fact…  NA    NA    NA     NA    NA    NA   NA   
4 continent    43 Europe        fact…  NA    NA    NA     NA    NA    NA   NA   
5 continent    23 North America fact…  NA    NA    NA     NA    NA    NA   NA   
6 continent    14 Oceania       fact…  NA    NA    NA     NA    NA    NA   NA   
7 continent    12 South America fact…  NA    NA    NA     NA    NA    NA   NA

Gapminder Table

country: An identification variable of type character/text used to distinguish the 142 countries in the dataset.

life_exp: A numerical variable of that country’s life expectancy at birth. This is the outcome variable y of interest.

continent: A categorical variable with five levels. Here “levels” correspond to the possible categories: Africa, Asia, Americas, Europe, and Oceania. This is the explanatory variable x of interest.

gdp_per_capita: A numerical variable of that country’s GDP per capita in US inflation-adjusted dollars

Histogram catch up

Left-skewed distribution?

ggplot(gapminder2022, aes(x = life_exp)) +
  geom_histogram(binwidth = 5, color = "white") +
  labs(x = "Life expectancy", 
       y = "Number of countries",
       title = "Histogram of distribution of worldwide life expectancies")

How about separated by continental?

recap with facet_wrap()

ggplot(gapminder2022, aes(x = life_exp)) +
  geom_histogram(binwidth = 5, color = "white") +
  labs(x = "Life expectancy", 
       y = "Number of countries",
       title = "Histogram of distribution of worldwide life expectancies") +
  facet_wrap(~ continent, nrow = 2)

Is there any outliers?

Recall boxplot

ggplot(gapminder2022, aes(x = continent, y = life_exp)) +
  geom_boxplot() +
  labs(x = "Continent", y = "Life expectancy",
       title = "Life expectancy by continent")

Catgory data with scatter plot

ggplot(gapminder2022, aes(y = life_exp, x = continent)) +
  geom_point(alpha = 0.1) +
  labs(x = "continent", 
    y = "Life expectancy",
    title = "Life expectancy estimation by continent")

Is there any outliers?

Recall boxplot and mark outlier

Mean and median

life_exp_by_continent <- gapminder2022 |>
  group_by(continent) |>
  summarize(median = median(life_exp), mean = mean(life_exp))
life_exp_by_continent

# A tibble: 6 × 3
  continent     median  mean
  <fct>          <dbl> <dbl>
1 Africa          66.1  66.3
2 Asia            75.4  74.9
3 Europe          81.5  79.9
4 North America   76.1  76.3
5 Oceania         74.6  74.4
6 South America   75.4  75.2

Set the baseline to compare

R default for the baseline is according to alphabet

life_exp_model <- lm(life_exp ~ continent, data = gapminder2022)
coef(life_exp_model)

           (Intercept)          continentAsia        continentEurope 
             66.309808               8.639965              13.597867 
continentNorth America       continentOceania continentSouth America 
              9.985410               8.106621               8.917692

How to read

intercept corresponds to the mean life expectancy of countries in Africa of 66.31 years.
continentAsia the mean life expectancy of countries in Asia is 66.31 + 8.64 = 74.95
continentEurope the mean life expectancy of countries in Europe is 66.31 + 13.6 = 79.91
continentNorth the mean life expectancy of countries in North America is 66.31 + 9.98 = 76.29
continentOceania the mean life expectancy of countries in Oceania is 66.31 + 8.11 = 74.42
continentSouth America the mean life expectancy of countries in South America is 66.31 + 8.92 = 75.23

Regression model apply on country

get_regression_points(life_exp_model, ID = "country")

# A tibble: 188 × 5
   country             life_exp continent     life_exp_hat residual
   <chr>                  <dbl> <fct>                <dbl>    <dbl>
 1 Afghanistan             53.6 Asia                  75.0  -21.3  
 2 Albania                 79.5 Europe                79.9   -0.438
 3 Algeria                 78.0 Africa                66.3   11.7  
 4 Andorra                 83.4 Europe                79.9    3.51 
 5 Angola                  62.1 Africa                66.3   -4.2  
 6 Antigua and Barbuda     77.8 North America         76.3    1.50 
 7 Argentina               78.3 South America         75.2    3.08 
 8 Armenia                 76.1 Asia                  75.0    1.18 
 9 Australia               83.1 Oceania               74.4    8.67 
10 Austria                 82.3 Europe                79.9    2.36 
# ℹ 178 more rows

Regression model apply on country

country_apply <- get_regression_points(life_exp_model, ID = "country")

Asia_model <- country_apply |> 
  filter(continent=="Asia")
              
Asia_model

# A tibble: 44 × 5
   country     life_exp continent life_exp_hat residual
   <chr>          <dbl> <fct>            <dbl>    <dbl>
 1 Afghanistan     53.6 Asia              75.0   -21.3 
 2 Armenia         76.1 Asia              75.0     1.18
 3 Azerbaijan      74.2 Asia              75.0    -0.8 
 4 Bahrain         79.9 Asia              75.0     4.95
 5 Bangladesh      74.7 Asia              75.0    -0.25
 6 Bhutan          72.3 Asia              75.0    -2.64
 7 Brunei          80.6 Asia              75.0     5.64
 8 Cambodia        70.6 Asia              75.0    -4.3 
 9 Cyprus          79.7 Asia              75.0     4.79
10 Georgia         77.5 Asia              75.0     2.55
# ℹ 34 more rows

Exercise 4

Follow on previous logic, can you find out the linear model for gdp_per_capital and continent? Can you explain?

life_gdp_model <- lm(gdp_per_capita ~ continent, data = gapminder2022)
coef(life_gdp_model)

           (Intercept)          continentAsia        continentEurope 
              2637.100              13014.284              43061.350 
continentNorth America       continentOceania continentSouth America 
             13713.476              10031.048               8083.508

get_regression_points(life_gdp_model, ID = "country")

# A tibble: 188 × 5
   country             gdp_per_capita continent     gdp_per_capita_hat residual
   <chr>                        <dbl> <fct>                      <dbl>    <dbl>
 1 Afghanistan                   356. Asia                      15651.  -15296.
 2 Albania                      6810. Europe                    45698.  -38888.
 3 Algeria                      4343. Africa                     2637.    1706.
 4 Andorra                     41993. Europe                    45698.   -3706.
 5 Angola                       3000. Africa                     2637.     363.
 6 Antigua and Barbuda         19920. North America             16351.    3569.
 7 Argentina                   13651. South America             10721.    2930.
 8 Armenia                      7018. Asia                      15651.   -8633.
 9 Australia                   65100. Oceania                   12668.   52432.
10 Austria                     52085. Europe                    45698.    6386.
# ℹ 178 more rows