modelr training

The purpose of this notebook is to illustrate how the modelr package can be used to model a continuous response variable using linear regression.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
library(ggplot2)
library(modelr)
sim3
## # A tibble: 120 x 5
## x1 x2 rep y sd
## <int> <fct> <int> <dbl> <dbl>
## 1 1 a 1 -0.571 2
## 2 1 a 2 1.18 2
## 3 1 a 3 2.24 2
## 4 1 b 1 7.44 2
## 5 1 b 2 8.52 2
## 6 1 b 3 7.72 2
## 7 1 c 1 6.51 2
## 8 1 c 2 5.79 2
## 9 1 c 3 6.07 2
## 10 1 d 1 2.11 2
## # ... with 110 more rows
glimpse(sim3)
## Observations: 120
## Variables: 5
## $ x1 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2...
## $ x2 <fct> a, a, a, b, b, b, c, c, c, d, d, d, a, a, a, b, b, b, c, c...
## $ rep <int> 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2...
## $ y <dbl> -0.5707363, 1.1841503, 2.2373204, 7.4366963, 8.5182934, 7....
## $ sd <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
First visualise the distribution of the response variable, then visualise how it relates to each explanatory variable.
ggplot(data = sim3, aes(x = y)) +
  geom_histogram()
ggplot(data = sim3, aes(x = x1, y = y)) +
  geom_point()
ggplot(data = sim3, aes(x = x2, y = y)) +
  geom_boxplot()
ggplot(data = sim3, aes(x = x1, y = y, color = x2)) +
  geom_point()
mod_lm_x1 <- lm(formula = y ~ x1, data = sim3)
mod_lm_x1
##
## Call:
## lm(formula = y ~ x1, data = sim3)
##
## Coefficients:
## (Intercept) x1
## 4.3849 -0.1967
mod_lm_x2 <- lm(formula = y ~ x2, data = sim3)
mod_lm_x2
##
## Call:
## lm(formula = y ~ x2, data = sim3)
##
## Coefficients:
## (Intercept) x2b x2c x2d
## 0.7896 2.8878 4.8057 2.3596
mod_lm_x1_x2 <- lm(formula = y ~ x1 + x2, data = sim3)
mod_lm_x1_x2
##
## Call:
## lm(formula = y ~ x1 + x2, data = sim3)
##
## Coefficients:
## (Intercept) x1 x2b x2c x2d
## 1.8717 -0.1967 2.8878 4.8057 2.3596
mod_lm_x1_x2_x1x2 <- lm(formula = y ~ x1 * x2, data = sim3) #x1 * x2 expands to x1 + x2 + x1:x2
mod_lm_x1_x2_x1x2
##
## Call:
## lm(formula = y ~ x1 * x2, data = sim3)
##
## Coefficients:
## (Intercept) x1 x2b x2c x2d
## 1.30124 -0.09302 7.06938 4.43090 0.83455
## x1:x2b x1:x2c x1:x2d
## -0.76029 0.06815 0.27728
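As an aside that is not part of the original output, a nested-model F-test with base R's anova() is one way to check whether the interaction terms are worth keeping; a minimal sketch, assuming the models fitted above:
#does adding the x1:x2 interaction improve significantly on the additive model?
anova(mod_lm_x1_x2, mod_lm_x1_x2_x1x2)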
sim3 %>%
  add_predictions(mod_lm_x1) #for just one model
## # A tibble: 120 x 6
## x1 x2 rep y sd pred
## <int> <fct> <int> <dbl> <dbl> <dbl>
## 1 1 a 1 -0.571 2 4.19
## 2 1 a 2 1.18 2 4.19
## 3 1 a 3 2.24 2 4.19
## 4 1 b 1 7.44 2 4.19
## 5 1 b 2 8.52 2 4.19
## 6 1 b 3 7.72 2 4.19
## 7 1 c 1 6.51 2 4.19
## 8 1 c 2 5.79 2 4.19
## 9 1 c 3 6.07 2 4.19
## 10 1 d 1 2.11 2 4.19
## # ... with 110 more rows
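modelr also provides add_residuals() as the single-model counterpart for residuals; a minimal sketch (output not shown), adding a resid column to sim3:
sim3 %>%
  add_residuals(mod_lm_x1) #for just one model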
sim3_pred <- sim3 %>%
  gather_predictions(mod_lm_x1, mod_lm_x2, mod_lm_x1_x2, mod_lm_x1_x2_x1x2) #for many models
sim3_pred
## # A tibble: 480 x 7
## model x1 x2 rep y sd pred
## <chr> <int> <fct> <int> <dbl> <dbl> <dbl>
## 1 mod_lm_x1 1 a 1 -0.571 2 4.19
## 2 mod_lm_x1 1 a 2 1.18 2 4.19
## 3 mod_lm_x1 1 a 3 2.24 2 4.19
## 4 mod_lm_x1 1 b 1 7.44 2 4.19
## 5 mod_lm_x1 1 b 2 8.52 2 4.19
## 6 mod_lm_x1 1 b 3 7.72 2 4.19
## 7 mod_lm_x1 1 c 1 6.51 2 4.19
## 8 mod_lm_x1 1 c 2 5.79 2 4.19
## 9 mod_lm_x1 1 c 3 6.07 2 4.19
## 10 mod_lm_x1 1 d 1 2.11 2 4.19
## # ... with 470 more rows
sim3_resid <- sim3 %>%
  gather_residuals(mod_lm_x1, mod_lm_x2, mod_lm_x1_x2, mod_lm_x1_x2_x1x2)
sim3_resid
## # A tibble: 480 x 7
## model x1 x2 rep y sd resid
## <chr> <int> <fct> <int> <dbl> <dbl> <dbl>
## 1 mod_lm_x1 1 a 1 -0.571 2 -4.76
## 2 mod_lm_x1 1 a 2 1.18 2 -3.00
## 3 mod_lm_x1 1 a 3 2.24 2 -1.95
## 4 mod_lm_x1 1 b 1 7.44 2 3.25
## 5 mod_lm_x1 1 b 2 8.52 2 4.33
## 6 mod_lm_x1 1 b 3 7.72 2 3.54
## 7 mod_lm_x1 1 c 1 6.51 2 2.32
## 8 mod_lm_x1 1 c 2 5.79 2 1.60
## 9 mod_lm_x1 1 c 3 6.07 2 1.88
## 10 mod_lm_x1 1 d 1 2.11 2 -2.08
## # ... with 470 more rows
ggplot(sim3_pred, aes(x = x1, y = y, color = x2)) +
  geom_point() +
  geom_line(aes(y = pred), size = 1) +
  facet_wrap(~model)
ggplot(data = sim3_resid, aes(x = x1, y = resid, color = x2)) +
  geom_point() +
  facet_grid(model ~ x2)
Adjusted R-squared can be used to compare the models, since it penalises predictors that do not improve the fit.
summary(mod_lm_x1)
##
## Call:
## lm(formula = y ~ x1, data = sim3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7589 -1.9213 -0.1823 2.0902 4.5755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.38495 0.45283 9.684 < 2e-16 ***
## x1 -0.19674 0.07298 -2.696 0.00805 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.296 on 118 degrees of freedom
## Multiple R-squared: 0.05801, Adjusted R-squared: 0.05003
## F-statistic: 7.267 on 1 and 118 DF, p-value: 0.008048
summary(mod_lm_x2)
##
## Call:
## lm(formula = y ~ x2, data = sim3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3527 -0.7792 0.0437 0.7227 4.8409
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7896 0.2977 2.652 0.00911 **
## x2b 2.8878 0.4210 6.859 3.58e-10 ***
## x2c 4.8057 0.4210 11.415 < 2e-16 ***
## x2d 2.3596 0.4210 5.604 1.43e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.631 on 116 degrees of freedom
## Multiple R-squared: 0.533, Adjusted R-squared: 0.521
## F-statistic: 44.14 on 3 and 116 DF, p-value: < 2.2e-16
summary(mod_lm_x1_x2)
##
## Call:
## lm(formula = y ~ x1 + x2, data = sim3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4674 -0.8524 -0.0729 0.7886 4.3005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.87167 0.38738 4.832 4.22e-06 ***
## x1 -0.19674 0.04871 -4.039 9.72e-05 ***
## x2b 2.88781 0.39571 7.298 4.07e-11 ***
## x2c 4.80574 0.39571 12.145 < 2e-16 ***
## x2d 2.35959 0.39571 5.963 2.79e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.533 on 115 degrees of freedom
## Multiple R-squared: 0.5911, Adjusted R-squared: 0.5768
## F-statistic: 41.55 on 4 and 115 DF, p-value: < 2.2e-16
summary(mod_lm_x1_x2_x1x2)
##
## Call:
## lm(formula = y ~ x1 * x2, data = sim3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.87634 -0.67655 0.04837 0.69963 2.58607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.30124 0.40400 3.221 0.00167 **
## x1 -0.09302 0.06511 -1.429 0.15587
## x2b 7.06938 0.57134 12.373 < 2e-16 ***
## x2c 4.43090 0.57134 7.755 4.41e-12 ***
## x2d 0.83455 0.57134 1.461 0.14690
## x1:x2b -0.76029 0.09208 -8.257 3.30e-13 ***
## x1:x2c 0.06815 0.09208 0.740 0.46076
## x1:x2d 0.27728 0.09208 3.011 0.00322 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.024 on 112 degrees of freedom
## Multiple R-squared: 0.8221, Adjusted R-squared: 0.811
## F-statistic: 73.93 on 7 and 112 DF, p-value: < 2.2e-16
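The same comparison can be made more compactly with modelr's summary metrics. A minimal sketch (not run in the original notebook) computes RMSE and R-squared for each model on sim3; models is a helper list introduced here purely for illustration:
models <- list(mod_lm_x1 = mod_lm_x1,
               mod_lm_x2 = mod_lm_x2,
               mod_lm_x1_x2 = mod_lm_x1_x2,
               mod_lm_x1_x2_x1x2 = mod_lm_x1_x2_x1x2)
#RMSE and R-squared for each candidate model, evaluated on sim3 itself
sapply(models, function(m) c(rmse = rmse(m, sim3), rsquare = rsquare(m, sim3)))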
If the dataset has sufficient observations and you are fitting a complex model, you should train your preferred model on only 80% of the data, and then confirm that it generalises using the remaining 20% set aside from the dataset. This guards against overfitting, where the noise is modelled as well as the signal.
sim3 <- sim3 %>%
  mutate(id = row_number()) %>%
  select(id, everything())
sim3_train <- sim3 %>%
  sample_frac(0.8) %>%
  arrange(id) #dataset to train model
sim3_train
## # A tibble: 96 x 6
## id x1 x2 rep y sd
## <int> <int> <fct> <int> <dbl> <dbl>
## 1 2 1 a 2 1.18 2
## 2 4 1 b 1 7.44 2
## 3 5 1 b 2 8.52 2
## 4 7 1 c 1 6.51 2
## 5 8 1 c 2 5.79 2
## 6 11 1 d 2 4.12 2
## 7 12 1 d 3 3.50 2
## 8 14 2 a 2 1.36 2
## 9 16 2 b 1 7.51 2
## 10 17 2 b 2 3.79 2
## # ... with 86 more rows
sim3_confirm <- anti_join(sim3, sim3_train) #dataset to confirm model
## Joining, by = c("id", "x1", "x2", "rep", "y", "sd")
## Warning: Column `y` has different attributes on LHS and RHS of join
sim3_confirm
## # A tibble: 24 x 6
## id x1 x2 rep y sd
## <int> <int> <fct> <int> <dbl> <dbl>
## 1 1 1 a 1 -0.571 2
## 2 3 1 a 3 2.24 2
## 3 6 1 b 3 7.72 2
## 4 9 1 c 3 6.07 2
## 5 10 1 d 1 2.11 2
## 6 13 2 a 1 1.55 2
## 7 15 2 a 3 0.252 2
## 8 23 2 d 2 3.45 2
## 9 27 3 a 3 0.346 2
## 10 35 3 d 2 0.844 2
## # ... with 14 more rows
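To close the loop, a minimal sketch (not part of the original output) refits the preferred interaction model on the training set and compares its RMSE on the training and confirmation sets; mod_train is a name introduced here for illustration. A much larger error on sim3_confirm than on sim3_train would suggest overfitting.
mod_train <- lm(formula = y ~ x1 * x2, data = sim3_train)
rmse(mod_train, sim3_train) #error on the data used to fit the model
rmse(mod_train, sim3_confirm) #error on the set-aside data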