library(tidyverse)
library(openintro)
library(palmerpenguins)
library(janitor)
library(interpretCI)
library(BSDA)
library(pwr)
# Consistent Randomization
set.seed(1)
# Remove scientific notation
options(scipen = 999)
penguins_tidy <- penguins %>% drop_na()
head(penguins_tidy)
## # A tibble: 6 x 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## # i 2 more variables: sex <fct>, year <int>
flipper_length_mm = -7.21 + 0.25 * bill_length_mm
https://towardsdatascience.com/understanding-linear-regression-output-in-r-7a9cbda948b3
mod_linear <- lm(penguins_tidy$bill_length_mm ~ penguins_tidy$flipper_length_mm)
summary(mod_linear)
##
## Call:
## lm(formula = penguins_tidy$bill_length_mm ~ penguins_tidy$flipper_length_mm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6367 -2.6981 -0.5788 2.0663 19.0953
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.21856 3.27175 -2.206 0.028
## penguins_tidy$flipper_length_mm 0.25482 0.01624 15.691 <0.0000000000000002
##
## (Intercept) *
## penguins_tidy$flipper_length_mm ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.148 on 331 degrees of freedom
## Multiple R-squared: 0.4265, Adjusted R-squared: 0.4248
## F-statistic: 246.2 on 1 and 331 DF, p-value: < 0.00000000000000022
plot(penguins_tidy$flipper_length_mm, penguins_tidy$bill_length_mm)
res <- resid(mod_linear)
plot(fitted(mod_linear), res)
abline(0,0)
ggplot(data.frame(res), aes(sample = res)) +
stat_qq() +
stat_qq_line() +
labs(title = "Normal Probability Plot")
st_test <- shapiro.test(res)
ifelse(shapiro.test(res)[["p.value"]] > 0.05,
"Normally Distributed",
"NOT Normally Distributed")
## [1] "NOT Normally Distributed"
mod_linear_2 <- lm(penguins_tidy$flipper_length_mm ~ penguins_tidy$body_mass_g)
mod_linear_2
##
## Call:
## lm(formula = penguins_tidy$flipper_length_mm ~ penguins_tidy$body_mass_g)
##
## Coefficients:
## (Intercept) penguins_tidy$body_mass_g
## 137.0396 0.0152
summary(mod_linear_2)
##
## Call:
## lm(formula = penguins_tidy$flipper_length_mm ~ penguins_tidy$body_mass_g)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.698 -4.983 1.056 5.101 13.933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 137.0396209 1.9987694 68.56 <0.0000000000000002
## penguins_tidy$body_mass_g 0.0151953 0.0004667 32.56 <0.0000000000000002
##
## (Intercept) ***
## penguins_tidy$body_mass_g ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.847 on 331 degrees of freedom
## Multiple R-squared: 0.7621, Adjusted R-squared: 0.7614
## F-statistic: 1060 on 1 and 331 DF, p-value: < 0.00000000000000022
plot(penguins_tidy$body_mass_g, penguins_tidy$flipper_length_mm)
res_2 <- resid(mod_linear_2)
plot(fitted(mod_linear), res_2)
abline(0,0)
ggplot(data.frame(res_2), aes(sample = res_2)) +
stat_qq() +
stat_qq_line() +
labs(title = "Normal Probability Plot")
ifelse(shapiro.test(res_2)[["p.value"]] > 0.05,
"Normally Distributed",
"NOT Normally Distributed")
## [1] "NOT Normally Distributed"
mod_multiple_reg <- lm(body_mass_g ~ flipper_length_mm + bill_length_mm, data = penguins_tidy)
mod_multiple_reg
##
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm + bill_length_mm,
## data = penguins_tidy)
##
## Coefficients:
## (Intercept) flipper_length_mm bill_length_mm
## -5836.299 48.890 4.959
summary(mod_multiple_reg)
##
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm + bill_length_mm,
## data = penguins_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1083.08 -282.65 -17.18 242.95 1293.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5836.299 312.604 -18.670 <0.0000000000000002 ***
## flipper_length_mm 48.890 2.034 24.034 <0.0000000000000002 ***
## bill_length_mm 4.959 5.214 0.951 0.342
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 393.4 on 330 degrees of freedom
## Multiple R-squared: 0.7627, Adjusted R-squared: 0.7613
## F-statistic: 530.4 on 2 and 330 DF, p-value: < 0.00000000000000022
res_multiple_reg <- resid(mod_multiple_reg)
plot(fitted(mod_linear), res_multiple_reg)
abline(0,0)
ggplot(data.frame(res_multiple_reg), aes(sample = res_multiple_reg)) +
stat_qq() +
stat_qq_line() +
labs(title = "Normal Probability Plot")
ifelse(shapiro.test(res_multiple_reg)[["p.value"]] > 0.05,
"Residuals are Normally Distributed",
"Residuals are NOT Normally Distributed")
## [1] "Residuals are Normally Distributed"
#penguins_tidy
new_obs <- data.frame(flipper_length_mm = 100, bill_length_mm = 400)
# Point prediction
predict(mod_multiple_reg, newdata = new_obs)
## 1
## 1036.111
# Confidence interval
predict(mod_multiple_reg, newdata = new_obs, interval = "confidence")
## fit lwr upr
## 1 1036.111 -2891.101 4963.323
# Prediction interval
predict(mod_multiple_reg,
newdata = new_obs,
interval = "prediction",
level = 0.95)
## fit lwr upr
## 1 1036.111 -2966.625 5038.847