Core functions from broom package: glance() (model fit), tidy() (model parameters), and augment() (data values).
Also: visdat::vis_dat(), rsample::initial_split, car::vif(), yarstick::conf_mat(), GGally::ggpairs() and interactions::interact_plot().
head(penguins)
summary(penguins)
nrow(penguins)
visdat::vis_dat(penguins)
penguins <- drop_na(penguins)
nrow(penguins)
## [1] 333
penguins %>%
pairs()
## GGally::ggpairs(penguins %>% select(bill_length_mm, bill_depth_mm, year))
pairs(penguins %>% select(bill_length_mm, bill_depth_mm, year))
penguins %>%
select_if(is.numeric) %>%
cor()
Objective: minimize residual sum of squares
lmfit <- lm(body_mass_g ~ flipper_length_mm + sex, penguins)
summary(lmfit)
##
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm + sex, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -910.28 -243.89 -2.94 238.85 1067.73
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5410.300 285.798 -18.931 < 2e-16 ***
## flipper_length_mm 46.982 1.441 32.598 < 2e-16 ***
## sexmale 347.850 40.342 8.623 2.78e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 355.9 on 330 degrees of freedom
## Multiple R-squared: 0.8058, Adjusted R-squared: 0.8047
## F-statistic: 684.8 on 2 and 330 DF, p-value: < 2.2e-16
Does the model explain much of the variance? Is the F-statistic significant?
glance(lmfit) %>%
kable()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.8058374 | 0.8046607 | 355.8829 | 684.8033 | 0 | 2 | -2427.242 | 4862.484 | 4877.717 | 41795374 | 330 | 333 |
lmparms <- tidy(lmfit, conf.int = TRUE)
kable(lmparms)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | -5410.30022 | 285.797694 | -18.930524 | 0 | -5972.51535 | -4848.08510 |
| flipper_length_mm | 46.98218 | 1.441256 | 32.598066 | 0 | 44.14697 | 49.81738 |
| sexmale | 347.85025 | 40.341557 | 8.622628 | 0 | 268.49120 | 427.20930 |
include CIs with conf.int = T, conf.level = X
p-value for each independent variable
Non-standardized estimates (coefficients, betas):
weights that minimize the sum of the square of the errors.
useful when you want to interpret the effect that a one unit change on a predictor variable has on a response variable
how much the mean of the dependent variable changes on average given a one-unit shift in the independent variable while holding other variables in the model constant
Choose B0 and B1 to minimize RSS
SE = SD of the coefficiens, SE depends on both predictor and outcome variables
Standardized estimates (coefficients, betas):
scale())standard error of estimate
tscore = B/SE(B); df = sample_size - num parameters
CI = B + (t_crit*SE)
Plot params:
ggplot(lmparms, aes(estimate, term, xmin = conf.low, xmax = conf.high)) +
geom_point() +
geom_vline(xintercept = 0, lty = 4) +
geom_linerange() +
coord_flip()
lm.augment <- augment(lmfit, interval = "confidence")
lm.augment
## # A tibble: 333 × 11
## body_mass_g flipper_length_mm sex .fitted .lower .upper .resid .hat
## <int> <int> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3750 181 male 3441. 3356. 3527. 309. 0.0150
## 2 3800 186 female 3328. 3265. 3392. 472. 0.00818
## 3 3250 195 female 3751. 3696. 3806. -501. 0.00615
## 4 3450 193 female 3657. 3601. 3713. -207. 0.00637
## 5 3650 190 male 3864. 3796. 3932. -214. 0.00940
## 6 3625 181 female 3093. 3022. 3165. 532. 0.0105
## 7 4675 195 male 4099. 4039. 4159. 576. 0.00743
## 8 3200 182 female 3140. 3071. 3210. 59.5 0.00993
## 9 3800 191 male 3911. 3845. 3977. -111. 0.00894
## 10 4400 198 male 4240. 4183. 4297. 160. 0.00665
## # … with 323 more rows, and 3 more variables: .sigma <dbl>, .cooksd <dbl>,
## # .std.resid <dbl>
Normliv
= errors between the line and the data are assumed to be drawn from a nearly-normal distribution
ggplot(lm.augment,aes(sample=.resid)) +
geom_qq() +
geom_qq_line() +
ggtitle('Normal data')
= y should be linearly related to x
ggplot(lm.augment, aes(y=.resid, x = .fitted)) +
geom_point() +
geom_smooth(method = "lm")
= Assume that data are approximately equally variable at all ranges of x and y
ggplot(lm.augment, aes(y=.std.resid, x = .fitted)) +
geom_point() +
geom_smooth(method = "lm")
= Data points very far away from the rest can exert undue influence on the model parameters
= predictors are independent from each other
#cor(penguins %>% select(bill_length_mm, flipper_length_mm), use = "pairwise.complete.obs")
car::vif(lmfit)
## flipper_length_mm sex
## 1.069646 1.069646
Goal: maximum liklihood (least squares is special case)
glmfit <- glm(sex ~ flipper_length_mm + bill_length_mm, penguins,
family = "binomial")
glmfit
##
## Call: glm(formula = sex ~ flipper_length_mm + bill_length_mm, family = "binomial",
## data = penguins)
##
## Coefficients:
## (Intercept) flipper_length_mm bill_length_mm
## -7.005988 0.007737 0.124375
##
## Degrees of Freedom: 332 Total (i.e. Null); 330 Residual
## Null Deviance: 461.6
## Residual Deviance: 419.9 AIC: 425.9
summary(glmfit)
##
## Call:
## glm(formula = sex ~ flipper_length_mm + bill_length_mm, family = "binomial",
## data = penguins)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8924 -1.1412 0.5765 0.9822 1.6976
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.005988 1.727620 -4.055 5.01e-05 ***
## flipper_length_mm 0.007737 0.011081 0.698 0.485
## bill_length_mm 0.124375 0.029528 4.212 2.53e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 461.61 on 332 degrees of freedom
## Residual deviance: 419.94 on 330 degrees of freedom
## AIC: 425.94
##
## Number of Fisher Scoring iterations: 4
tidy(glmfit, conf.int = TRUE)
## # A tibble: 3 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -7.01 1.73 -4.06 0.0000501 -10.5 -3.67
## 2 flipper_length_mm 0.00774 0.0111 0.698 0.485 -0.0141 0.0295
## 3 bill_length_mm 0.124 0.0295 4.21 0.0000253 0.0679 0.184
include CIs with conf.int = T, conf.level = X
t = B/SE(B)
nobs = n observations
sigma = Estimated standard error of the residuals
Betas are in log odds space
log odds to odds ratio [p/(1-p)]-> exp() (e.g. “odds ratio of 2 implies 2x more likely”)
penguins %>%
mutate(prob = ifelse(sex == "male", 1, 0)) %>%
ggplot(aes(body_mass_g, prob)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "glm", method.args = list(family = "binomial"))
#http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/
model_vals <- augment(glmfit, type.predict = "response") %>%
mutate(predicted = case_when(.fitted > .5 ~ "male",
TRUE ~ "female"))
Confusion matrix:
yardstick::conf_mat(model_vals, sex, predicted) #true, predicted
## Truth
## Prediction female male
## female 89 69
## male 76 99
levels(penguins$sex)
## [1] "female" "male"
penguins <- penguins %>%
mutate(sex = relevel(sex, ref = "male"))
predict(model, new_data_with_predictors) (for GLM, add type = "response)
#probabilities <- model %>% predict(test.data, type = "response")
Split into training vs test? * To evaluate your models out-of-sample on new data.
#rsample
penguins_split <- initial_split(penguins, prop = .8)
train_data <- training(penguins_split)
test_data <- testing(penguins_split)
compare using RMSE (sqrt(mean(lmfit$residuals^2))).
anova() - Is the more complex model better?
lmfit <- lm(body_mass_g ~ flipper_length_mm + sex, penguins)
lmfit2 <- lm(body_mass_g ~ flipper_length_mm + sex + island, penguins)
anova(lmfit2, lmfit)
## Analysis of Variance Table
##
## Model 1: body_mass_g ~ flipper_length_mm + sex + island
## Model 2: body_mass_g ~ flipper_length_mm + sex
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 328 35758209
## 2 330 41795374 -2 -6037165 27.689 7.737e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lmfit, lmfit2)
## Analysis of Variance Table
##
## Model 1: body_mass_g ~ flipper_length_mm + sex
## Model 2: body_mass_g ~ flipper_length_mm + sex + island
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 330 41795374
## 2 328 35758209 2 6037165 27.689 7.737e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmfit_mod <- lm(body_mass_g ~ island * flipper_length_mm, penguins)
summary(lmfit_mod)
##
## Call:
## lm(formula = body_mass_g ~ island * flipper_length_mm, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -985.13 -248.93 -20.27 222.90 1192.46
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5520.278 429.553 -12.851 < 2e-16 ***
## islandDream 3743.156 975.595 3.837 0.00015 ***
## islandTorgersen 2845.594 1742.520 1.633 0.10342
## flipper_length_mm 48.862 2.045 23.892 < 2e-16 ***
## islandDream:flipper_length_mm -20.413 4.971 -4.106 5.08e-05 ***
## islandTorgersen:flipper_length_mm -15.535 9.047 -1.717 0.08689 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 371.8 on 327 degrees of freedom
## Multiple R-squared: 0.79, Adjusted R-squared: 0.7868
## F-statistic: 246.1 on 5 and 327 DF, p-value: < 2.2e-16
interactions::interact_plot(lmfit_mod, pred = flipper_length_mm, modx = island)
lmfit_mod2 <- lm(body_mass_g ~ bill_depth_mm * flipper_length_mm, penguins)
summary(lmfit_mod2)
##
## Call:
## lm(formula = body_mass_g ~ bill_depth_mm * flipper_length_mm,
## data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -915.78 -253.91 -33.51 223.53 1032.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -37536.190 4692.218 -8.000 2.15e-14 ***
## bill_depth_mm 1854.806 276.375 6.711 8.46e-11 ***
## flipper_length_mm 203.368 22.879 8.889 < 2e-16 ***
## bill_depth_mm:flipper_length_mm -9.018 1.357 -6.646 1.25e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.2 on 329 degrees of freedom
## Multiple R-squared: 0.7916, Adjusted R-squared: 0.7897
## F-statistic: 416.7 on 3 and 329 DF, p-value: < 2.2e-16
interactions::interact_plot(lmfit_mod2, pred = flipper_length_mm, modx = bill_depth_mm)
interactions::interact_plot(lmfit_mod2, pred = bill_depth_mm, modx = flipper_length_mm)
polynomial: model <- lm(y ~ poly(x,3))
intercept_mod <- lm(body_mass_g ~ 1, penguins)
#no_intercept_mod <- lm(y ~ x - 1)