Regression Examples

Author

Kementari Whitcher

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(palmerpenguins)

Lecture example - cars

speed <- c(30,50,40,55,30,25,60,25,50,55)
mileage <- c(28,25,25,23,30,32,21,35,26,25)
car_data <- data.frame(speed, mileage)
ggplot(
  data = car_data,
  mapping = aes(
    x = speed,
    y = mileage
  )
) + 
  geom_point()

Correlation Coefficient (Pearson)

cor(mileage,speed,method = "pearson")
[1] -0.9103694

Covariance

cov(mileage,speed,method="pearson")
[1] -52.77778

Palmer Penguins Example

Note that the following code uses the use “complete.obs” so that cases with NA values will be deleted.

penguins2 <- na.omit(penguins)
glimpse(penguins2)
Rows: 333
Columns: 8
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
$ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18…
$ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
$ sex               <fct> male, female, female, female, male, female, male, fe…
$ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
cor(penguins2$bill_length_mm, penguins2$body_mass_g,  method = "pearson")
[1] 0.5894511

Simple Linear Regression

my_model=lm(bill_length_mm ~ flipper_length_mm, data=penguins2)
summary(my_model)

Call:
lm(formula = bill_length_mm ~ flipper_length_mm, data = penguins2)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.6367 -2.6981 -0.5788  2.0663 19.0953 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -7.21856    3.27175  -2.206    0.028 *  
flipper_length_mm  0.25482    0.01624  15.691   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.148 on 331 degrees of freedom
Multiple R-squared:  0.4265,    Adjusted R-squared:  0.4248 
F-statistic: 246.2 on 1 and 331 DF,  p-value: < 2.2e-16
model1.stdres = rstandard(my_model)
penguins3 <- data.frame(penguins2, model1.stdres)
glimpse(penguins3)
Rows: 333
Columns: 9
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
$ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18…
$ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
$ sex               <fct> male, female, female, female, male, female, male, fe…
$ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
$ model1.stdres     <dbl> 0.047300379, -0.164202271, -0.524679069, -1.27138199…
ggplot(
  data = penguins3,
  mapping = aes(
    x = flipper_length_mm,
    y = model1.stdres
  )
) + 
  geom_point()

To make the Normal Probability Plot

qqnorm(model1.stdres,
       ylab="Standardized Residuals",
       xlab="Normal Scores")
qqline(model1.stdres)

Multiple Regression

my_model2=lm(bill_length_mm ~ flipper_length_mm + body_mass_g, data=penguins2)
summary(my_model2)

Call:
lm(formula = bill_length_mm ~ flipper_length_mm + body_mass_g, 
    data = penguins2)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.8251 -2.6432 -0.7281  2.0229 18.8227 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -3.9812079  4.7215540  -0.843    0.400    
flipper_length_mm  0.2271747  0.0333014   6.822 4.31e-11 ***
body_mass_g        0.0005513  0.0005797   0.951    0.342    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.148 on 330 degrees of freedom
Multiple R-squared:  0.4281,    Adjusted R-squared:  0.4246 
F-statistic: 123.5 on 2 and 330 DF,  p-value: < 2.2e-16
model2.stdres = rstandard(my_model2)
penguins4 <- data.frame(penguins2, model2.stdres)
glimpse(penguins4)
Rows: 333
Columns: 9
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
$ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18…
$ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
$ sex               <fct> male, female, female, female, male, female, male, fe…
$ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
$ model2.stdres     <dbl> -0.02546035, -0.21023477, -0.43888796, -1.22512757, …
ggplot(
  data = penguins4,
  mapping = aes(
    x = flipper_length_mm,
    y = model2.stdres
  )
) + 
  geom_point()

qqnorm(model2.stdres,
       ylab="Standardized Residuals",
       xlab="Normal Scores")
qqline(model2.stdres)