library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(modelr)
library(broom)
## 
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
## 
##     bootstrap
# Import wages here

# Fallback in case you cannot load wages: use the heights data from modelr
wages <- heights %>%
  filter(income > 0) %>%
  mutate(marital = as.character(marital),
         sex = as.character(sex))

Your Turn 1

Your Turn 2

Fit the model on the slide and then examine the output. What does it look like?

mod_e <- lm(log(income) ~ education, data = wages)
# A fitted regression: an intercept and an education coefficient, plus the usual summary statistics once we inspect it
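
One way to examine the output (a minimal sketch using base R; summary() and coef() are standard tools, not something the slide requires):

# Full regression summary: coefficients, standard errors, t statistics, R-squared
summary(mod_e)

# Just the estimated intercept and education slope
coef(mod_e)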

Your Turn 3

Use a pipe to model log(income) against height. Then use broom and dplyr functions to extract:

  1. The coefficient estimates and their related statistics
  2. The adj.r.squared and p.value for the overall model
mod_h <- wages %>% lm(log(income) ~ height, data = .)
mod_h %>%
  tidy()
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)   6.98     0.237        29.4 4.13e-176
## 2 height        0.0520   0.00352      14.8 2.44e- 48
mod_h %>%
  glance() %>%
  select(adj.r.squared, p.value)
## # A tibble: 1 × 2
##   adj.r.squared  p.value
##           <dbl>    <dbl>
## 1        0.0396 2.44e-48
mod_h %>%
  tidy() %>%
  filter(p.value < 0.05)
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)   6.98     0.237        29.4 4.13e-176
## 2 height        0.0520   0.00352      14.8 2.44e- 48
mod_e %>%
  tidy() %>%
  filter(p.value < 0.05)
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)    8.56    0.0733      117.  0        
## 2 education      0.142   0.00530      26.7 8.41e-148

Your Turn 4

Model log(income) against education and height and sex. Can you interpret the coefficients?

mod_ehs <- wages %>%
  lm(log(income) ~ education + height + sex, data = .)
mod_ehs %>%
  tidy()
## # A tibble: 4 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)  7.79      0.307       25.3  9.41e-134
## 2 education    0.148     0.00520     28.5  5.16e-166
## 3 height       0.00673   0.00479      1.40 1.61e-  1
## 4 sexmale      0.462     0.0389      11.9  5.02e- 32
# Holding the other predictors fixed: income rises with education, the height effect is small and not statistically significant here (p ≈ 0.16), and the positive sexmale coefficient means men report higher incomes than women with the same education and height.
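
Because the response is log(income), each coefficient can be read as an approximate percent change in income per one-unit change in its predictor, holding the others fixed. A minimal sketch of that conversion with broom and dplyr (the pct_change column name is my own, not part of the exercise):

mod_ehs %>%
  tidy() %>%
  mutate(pct_change = exp(estimate) - 1)
# For education, exp(0.148) - 1 is about 0.16: roughly 16% higher income
# per additional year of education, with height and sex held fixed.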

Your Turn 5

Add + geom_smooth(method = lm) to the code below. What happens?

wages %>%
  ggplot(aes(x = height, y = log(income))) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Adds the fitted line from a linear model of the plotted variables (with its confidence band) on top of the points
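
Note that geom_smooth(method = lm) fits its own model to the plotted variables, here log(income) ~ height, so its line should match the predictions from mod_h in Your Turn 3. A sketch that overlays those predictions as a check (assumes mod_h is still in the environment):

wages %>%
  add_predictions(mod_h) %>%
  ggplot(aes(x = height, y = log(income))) +
  geom_point(alpha = 0.1) +
  geom_line(aes(y = pred), color = "blue")  # mod_h predictions, same fit as geom_smooth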

Your Turn 6

Use add_predictions to make the plot below.

# Make plot here
wages %>%
  add_predictions(mod_ehs) %>%
  ggplot(mapping = aes(x = height, y = pred, color = sex)) +
  geom_line() +
  facet_wrap(~ education)
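
A variation on the same idea: predict on an evenly spaced grid built with modelr::data_grid() rather than on the raw rows, which gives one smooth prediction line per group. This is a sketch, not part of the exercise; the grid of 20 heights via seq_range() is an arbitrary choice.

wages %>%
  data_grid(height = seq_range(height, 20), education, sex) %>%  # all combinations of grid values
  add_predictions(mod_ehs) %>%
  ggplot(aes(x = height, y = pred, color = sex)) +
  geom_line() +
  facet_wrap(~ education)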


Take Aways