Regression

library(tidyverse)
library(openintro)
library(palmerpenguins)
library(janitor)
library(interpretCI)
library(BSDA)
library(pwr)

# Consistent Randomization
set.seed(1)

# Remove scientific notation
options(scipen = 999)

Simple linear regression

Lets explore the relationship between the bill length of a penguin and

penguins_tidy <- penguins %>% drop_na()

head(penguins_tidy)

## # A tibble: 6 x 8
##   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
## 1 Adelie  Torgersen           39.1          18.7               181        3750
## 2 Adelie  Torgersen           39.5          17.4               186        3800
## 3 Adelie  Torgersen           40.3          18                 195        3250
## 4 Adelie  Torgersen           36.7          19.3               193        3450
## 5 Adelie  Torgersen           39.3          20.6               190        3650
## 6 Adelie  Torgersen           38.9          17.8               181        3625
## # i 2 more variables: sex <fct>, year <int>

flipper_length_mm = -7.21 + 0.25 * bill_length_mm

https://towardsdatascience.com/understanding-linear-regression-output-in-r-7a9cbda948b3

mod_linear <- lm(penguins_tidy$bill_length_mm ~ penguins_tidy$flipper_length_mm)

summary(mod_linear)

## 
## Call:
## lm(formula = penguins_tidy$bill_length_mm ~ penguins_tidy$flipper_length_mm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6367 -2.6981 -0.5788  2.0663 19.0953 
## 
## Coefficients:
##                                 Estimate Std. Error t value            Pr(>|t|)
## (Intercept)                     -7.21856    3.27175  -2.206               0.028
## penguins_tidy$flipper_length_mm  0.25482    0.01624  15.691 <0.0000000000000002
##                                    
## (Intercept)                     *  
## penguins_tidy$flipper_length_mm ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.148 on 331 degrees of freedom
## Multiple R-squared:  0.4265, Adjusted R-squared:  0.4248 
## F-statistic: 246.2 on 1 and 331 DF,  p-value: < 0.00000000000000022

plot(penguins_tidy$flipper_length_mm, penguins_tidy$bill_length_mm)

Check residuals

These should be evenly scattered about showing no clear pattern
We need constant zero mean and variation

res <- resid(mod_linear)

plot(fitted(mod_linear), res)
abline(0,0)

Check for residual normal distribution

ggplot(data.frame(res), aes(sample = res)) +
stat_qq() +
stat_qq_line() +
  labs(title = "Normal Probability Plot")

Wasn’t in the course but you can use Shapiro-Wilk test for normality

A test for the null hypothesis that the data are not normally distributed
A p-value of > 0.05 means that there is weak or no evidence against the null

st_test <- shapiro.test(res)

ifelse(shapiro.test(res)[["p.value"]] > 0.05, 
       "Normally Distributed", 
       "NOT Normally Distributed")

## [1] "NOT Normally Distributed"

Heavier penguins I’d imagine have longer flippers….

Looks like flipper_length_mm = 137 + 0.152 * body_mass_g

mod_linear_2 <- lm(penguins_tidy$flipper_length_mm ~ penguins_tidy$body_mass_g)

mod_linear_2

## 
## Call:
## lm(formula = penguins_tidy$flipper_length_mm ~ penguins_tidy$body_mass_g)
## 
## Coefficients:
##               (Intercept)  penguins_tidy$body_mass_g  
##                  137.0396                     0.0152

summary(mod_linear_2)

## 
## Call:
## lm(formula = penguins_tidy$flipper_length_mm ~ penguins_tidy$body_mass_g)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.698  -4.983   1.056   5.101  13.933 
## 
## Coefficients:
##                              Estimate  Std. Error t value            Pr(>|t|)
## (Intercept)               137.0396209   1.9987694   68.56 <0.0000000000000002
## penguins_tidy$body_mass_g   0.0151953   0.0004667   32.56 <0.0000000000000002
##                              
## (Intercept)               ***
## penguins_tidy$body_mass_g ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.847 on 331 degrees of freedom
## Multiple R-squared:  0.7621, Adjusted R-squared:  0.7614 
## F-statistic:  1060 on 1 and 331 DF,  p-value: < 0.00000000000000022

plot(penguins_tidy$body_mass_g, penguins_tidy$flipper_length_mm)

res_2 <- resid(mod_linear_2)

plot(fitted(mod_linear), res_2)
abline(0,0)

ggplot(data.frame(res_2), aes(sample = res_2)) +
stat_qq() +
stat_qq_line() +
  labs(title = "Normal Probability Plot")

ifelse(shapiro.test(res_2)[["p.value"]] > 0.05, 
       "Normally Distributed", 
       "NOT Normally Distributed")

## [1] "NOT Normally Distributed"

Multiple Regression

x1 = Flipper length and x2 = bill length with y = body mass
y = -5836.299 + (48.890 * flipper_length) + (4.959 * bill_length)

mod_multiple_reg <- lm(body_mass_g ~ flipper_length_mm + bill_length_mm, data = penguins_tidy)

mod_multiple_reg

## 
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm + bill_length_mm, 
##     data = penguins_tidy)
## 
## Coefficients:
##       (Intercept)  flipper_length_mm     bill_length_mm  
##         -5836.299             48.890              4.959

summary(mod_multiple_reg)

## 
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm + bill_length_mm, 
##     data = penguins_tidy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1083.08  -282.65   -17.18   242.95  1293.24 
## 
## Coefficients:
##                    Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)       -5836.299    312.604 -18.670 <0.0000000000000002 ***
## flipper_length_mm    48.890      2.034  24.034 <0.0000000000000002 ***
## bill_length_mm        4.959      5.214   0.951               0.342    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 393.4 on 330 degrees of freedom
## Multiple R-squared:  0.7627, Adjusted R-squared:  0.7613 
## F-statistic: 530.4 on 2 and 330 DF,  p-value: < 0.00000000000000022

res_multiple_reg <- resid(mod_multiple_reg)

plot(fitted(mod_linear), res_multiple_reg)
abline(0,0)

ggplot(data.frame(res_multiple_reg), aes(sample = res_multiple_reg)) +
stat_qq() +
stat_qq_line() +
  labs(title = "Normal Probability Plot")

ifelse(shapiro.test(res_multiple_reg)[["p.value"]] > 0.05, 
       "Residuals are Normally Distributed", 
       "Residuals are NOT Normally Distributed")

## [1] "Residuals are Normally Distributed"

#penguins_tidy

new_obs <- data.frame(flipper_length_mm = 100, bill_length_mm = 400)

# Point prediction
predict(mod_multiple_reg, newdata = new_obs)

##        1 
## 1036.111

# Confidence interval
predict(mod_multiple_reg, newdata = new_obs, interval = "confidence")

##        fit       lwr      upr
## 1 1036.111 -2891.101 4963.323

# Prediction interval
predict(mod_multiple_reg, 
        newdata = new_obs, 
        interval = "prediction",
        level = 0.95)

##        fit       lwr      upr
## 1 1036.111 -2966.625 5038.847

Regression

2023-06-05

Simple linear regression

Check residuals

Check for residual normal distribution

Wasn’t in the course but you can use Shapiro-Wilk test for normality

Heavier penguins I’d imagine have longer flippers….

Multiple Regression