1. Purpose.

The purpose of this notebook is to illustrate how the modelr package can be used alongside lm() to model a continuous response variable with linear regression.

2. Load libraries.

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
library(ggplot2)
library(modelr)

3. View data.

sim3
## # A tibble: 120 x 5
##       x1 x2      rep      y    sd
##    <int> <fct> <int>  <dbl> <dbl>
##  1     1 a         1 -0.571     2
##  2     1 a         2  1.18      2
##  3     1 a         3  2.24      2
##  4     1 b         1  7.44      2
##  5     1 b         2  8.52      2
##  6     1 b         3  7.72      2
##  7     1 c         1  6.51      2
##  8     1 c         2  5.79      2
##  9     1 c         3  6.07      2
## 10     1 d         1  2.11      2
## # ... with 110 more rows
glimpse(sim3)
## Observations: 120
## Variables: 5
## $ x1  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2...
## $ x2  <fct> a, a, a, b, b, b, c, c, c, d, d, d, a, a, a, b, b, b, c, c...
## $ rep <int> 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2...
## $ y   <dbl> -0.5707363, 1.1841503, 2.2373204, 7.4366963, 8.5182934, 7....
## $ sd  <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...

4. Visualise observations.

First visualise the distribution of the response variable, then visualise how the response relates to each explanatory variable, both individually and in combination.

ggplot(data = sim3, aes(x = y)) +
  geom_histogram() 

ggplot(data = sim3, aes(x = x1, y = y)) +
  geom_point() 

ggplot(data = sim3, aes(x = x2, y = y)) +
  geom_boxplot()

ggplot(data = sim3, aes(x = x1, y = y, color = x2)) +
  geom_point() 

5. Fit model(s).

mod_lm_x1 <- lm(formula = y ~ x1, data = sim3)
mod_lm_x1
## 
## Call:
## lm(formula = y ~ x1, data = sim3)
## 
## Coefficients:
## (Intercept)           x1  
##      4.3849      -0.1967
mod_lm_x2 <- lm(formula = y ~ x2, data = sim3)
mod_lm_x2
## 
## Call:
## lm(formula = y ~ x2, data = sim3)
## 
## Coefficients:
## (Intercept)          x2b          x2c          x2d  
##      0.7896       2.8878       4.8057       2.3596
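
Because x2 is a factor, R encodes it with treatment contrasts: level a is the baseline, so the intercept is the mean of y for level a, and the x2b, x2c and x2d coefficients are differences from that baseline. One way to inspect the encoding is with model_matrix() from modelr:

model_matrix(sim3, y ~ x2) # one indicator column per non-baseline level of x2
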
mod_lm_x1_x2 <- lm(formula = y ~ x1  + x2, data = sim3)
mod_lm_x1_x2
## 
## Call:
## lm(formula = y ~ x1 + x2, data = sim3)
## 
## Coefficients:
## (Intercept)           x1          x2b          x2c          x2d  
##      1.8717      -0.1967       2.8878       4.8057       2.3596
mod_lm_x1_x2_x1x2 <- lm(formula = y ~ x1 * x2, data = sim3)
mod_lm_x1_x2_x1x2
## 
## Call:
## lm(formula = y ~ x1 * x2, data = sim3)
## 
## Coefficients:
## (Intercept)           x1          x2b          x2c          x2d  
##     1.30124     -0.09302      7.06938      4.43090      0.83455  
##      x1:x2b       x1:x2c       x1:x2d  
##    -0.76029      0.06815      0.27728
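
In R's formula syntax, x1 * x2 is shorthand for the main effects plus their interaction, so the last model could equivalently have been specified as:

lm(formula = y ~ x1 + x2 + x1:x2, data = sim3) # same fit as y ~ x1 * x2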

6. Calculate predictions and residuals.

sim3 %>%
  add_predictions(mod_lm_x1) # for just one model
## # A tibble: 120 x 6
##       x1 x2      rep      y    sd  pred
##    <int> <fct> <int>  <dbl> <dbl> <dbl>
##  1     1 a         1 -0.571     2  4.19
##  2     1 a         2  1.18      2  4.19
##  3     1 a         3  2.24      2  4.19
##  4     1 b         1  7.44      2  4.19
##  5     1 b         2  8.52      2  4.19
##  6     1 b         3  7.72      2  4.19
##  7     1 c         1  6.51      2  4.19
##  8     1 c         2  5.79      2  4.19
##  9     1 c         3  6.07      2  4.19
## 10     1 d         1  2.11      2  4.19
## # ... with 110 more rows
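
Residuals for a single model can be attached in the same way with add_residuals():

sim3 %>%
  add_residuals(mod_lm_x1) # adds a resid column for just this model
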
sim3_pred <- sim3 %>%
  gather_predictions(mod_lm_x1, mod_lm_x2, mod_lm_x1_x2, mod_lm_x1_x2_x1x2) # for many models at once

sim3_pred
## # A tibble: 480 x 7
##    model        x1 x2      rep      y    sd  pred
##    <chr>     <int> <fct> <int>  <dbl> <dbl> <dbl>
##  1 mod_lm_x1     1 a         1 -0.571     2  4.19
##  2 mod_lm_x1     1 a         2  1.18      2  4.19
##  3 mod_lm_x1     1 a         3  2.24      2  4.19
##  4 mod_lm_x1     1 b         1  7.44      2  4.19
##  5 mod_lm_x1     1 b         2  8.52      2  4.19
##  6 mod_lm_x1     1 b         3  7.72      2  4.19
##  7 mod_lm_x1     1 c         1  6.51      2  4.19
##  8 mod_lm_x1     1 c         2  5.79      2  4.19
##  9 mod_lm_x1     1 c         3  6.07      2  4.19
## 10 mod_lm_x1     1 d         1  2.11      2  4.19
## # ... with 470 more rows
sim3_resid <- sim3 %>%
  gather_residuals(mod_lm_x1, mod_lm_x2, mod_lm_x1_x2, mod_lm_x1_x2_x1x2)

sim3_resid
## # A tibble: 480 x 7
##    model        x1 x2      rep      y    sd resid
##    <chr>     <int> <fct> <int>  <dbl> <dbl> <dbl>
##  1 mod_lm_x1     1 a         1 -0.571     2 -4.76
##  2 mod_lm_x1     1 a         2  1.18      2 -3.00
##  3 mod_lm_x1     1 a         3  2.24      2 -1.95
##  4 mod_lm_x1     1 b         1  7.44      2  3.25
##  5 mod_lm_x1     1 b         2  8.52      2  4.33
##  6 mod_lm_x1     1 b         3  7.72      2  3.54
##  7 mod_lm_x1     1 c         1  6.51      2  2.32
##  8 mod_lm_x1     1 c         2  5.79      2  1.60
##  9 mod_lm_x1     1 c         3  6.07      2  1.88
## 10 mod_lm_x1     1 d         1  2.11      2 -2.08
## # ... with 470 more rows

7. Visualise predictions.

ggplot(sim3_pred, aes(x = x1, y = y, color = x2)) +
  geom_point() +
  geom_line(aes(y = pred), size = 1) +
  facet_wrap(~model)
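
Because sim3 contains replicate observations at each x1 value, the prediction lines above are drawn through repeated points. An alternative is to generate an evenly spaced grid of the predictors with data_grid() and predict on that instead; a minimal sketch (sim3_grid is just an illustrative name):

sim3_grid <- sim3 %>%
  data_grid(x1, x2) %>% # every combination of x1 and x2
  gather_predictions(mod_lm_x1, mod_lm_x2, mod_lm_x1_x2, mod_lm_x1_x2_x1x2)

ggplot(data = sim3, aes(x = x1, y = y, color = x2)) +
  geom_point() +
  geom_line(data = sim3_grid, aes(y = pred), size = 1) +
  facet_wrap(~model)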

8. Visualise residuals.

ggplot(data = sim3_resid, aes(x = x1, y = resid, color = x2)) + 
  geom_point() + 
  facet_grid(model ~ x2)
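
A horizontal reference line at zero residual makes systematic patterns easier to spot; modelr provides geom_ref_line() for exactly this:

ggplot(data = sim3_resid, aes(x = x1, y = resid, color = x2)) +
  geom_ref_line(h = 0) + # reference line at zero residual
  geom_point() +
  facet_grid(model ~ x2)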

9. View summary statistics, particularly adjusted R-squared.

summary(mod_lm_x1)
## 
## Call:
## lm(formula = y ~ x1, data = sim3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7589 -1.9213 -0.1823  2.0902  4.5755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.38495    0.45283   9.684  < 2e-16 ***
## x1          -0.19674    0.07298  -2.696  0.00805 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.296 on 118 degrees of freedom
## Multiple R-squared:  0.05801,    Adjusted R-squared:  0.05003 
## F-statistic: 7.267 on 1 and 118 DF,  p-value: 0.008048
summary(mod_lm_x2)
## 
## Call:
## lm(formula = y ~ x2, data = sim3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3527 -0.7792  0.0437  0.7227  4.8409 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.7896     0.2977   2.652  0.00911 ** 
## x2b           2.8878     0.4210   6.859 3.58e-10 ***
## x2c           4.8057     0.4210  11.415  < 2e-16 ***
## x2d           2.3596     0.4210   5.604 1.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.631 on 116 degrees of freedom
## Multiple R-squared:  0.533,  Adjusted R-squared:  0.521 
## F-statistic: 44.14 on 3 and 116 DF,  p-value: < 2.2e-16
summary(mod_lm_x1_x2)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = sim3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4674 -0.8524 -0.0729  0.7886  4.3005 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.87167    0.38738   4.832 4.22e-06 ***
## x1          -0.19674    0.04871  -4.039 9.72e-05 ***
## x2b          2.88781    0.39571   7.298 4.07e-11 ***
## x2c          4.80574    0.39571  12.145  < 2e-16 ***
## x2d          2.35959    0.39571   5.963 2.79e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.533 on 115 degrees of freedom
## Multiple R-squared:  0.5911, Adjusted R-squared:  0.5768 
## F-statistic: 41.55 on 4 and 115 DF,  p-value: < 2.2e-16
summary(mod_lm_x1_x2_x1x2)
## 
## Call:
## lm(formula = y ~ x1 * x2, data = sim3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.87634 -0.67655  0.04837  0.69963  2.58607 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.30124    0.40400   3.221  0.00167 ** 
## x1          -0.09302    0.06511  -1.429  0.15587    
## x2b          7.06938    0.57134  12.373  < 2e-16 ***
## x2c          4.43090    0.57134   7.755 4.41e-12 ***
## x2d          0.83455    0.57134   1.461  0.14690    
## x1:x2b      -0.76029    0.09208  -8.257 3.30e-13 ***
## x1:x2c       0.06815    0.09208   0.740  0.46076    
## x1:x2d       0.27728    0.09208   3.011  0.00322 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.024 on 112 degrees of freedom
## Multiple R-squared:  0.8221, Adjusted R-squared:  0.811 
## F-statistic: 73.93 on 7 and 112 DF,  p-value: < 2.2e-16
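
To compare the models side by side, the adjusted R-squared can be extracted from each summary; a minimal sketch (adj_r_sq is just an illustrative helper):

adj_r_sq <- function(model) summary(model)$adj.r.squared

c(mod_lm_x1         = adj_r_sq(mod_lm_x1),
  mod_lm_x2         = adj_r_sq(mod_lm_x2),
  mod_lm_x1_x2      = adj_r_sq(mod_lm_x1_x2),
  mod_lm_x1_x2_x1x2 = adj_r_sq(mod_lm_x1_x2_x1x2))

The interaction model has the highest adjusted R-squared (0.811), consistent with the prediction and residual plots above.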

10. If appropriate, complete the model development process using train and confirm datasets.

If the dataset has sufficient observations and you are fitting a complex model, train your preferred model on only 80% of the data, then confirm it is appropriate on the remaining 20% that was set aside. This guards against overfitting, where the noise is modelled as well as the signal.
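
Because sample_frac() draws a random sample, it is worth setting a seed first so the split below is reproducible (the seed value itself is arbitrary):

set.seed(1234) # any fixed seed makes the 80/20 split reproducible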

sim3 <- sim3 %>% 
  mutate(id = row_number()) %>% 
  select(id, everything())

sim3_train <- sim3 %>% 
  sample_frac(0.8) %>% 
  arrange(id) # dataset used to train the model

sim3_train
## # A tibble: 96 x 6
##       id    x1 x2      rep     y    sd
##    <int> <int> <fct> <int> <dbl> <dbl>
##  1     2     1 a         2  1.18     2
##  2     4     1 b         1  7.44     2
##  3     5     1 b         2  8.52     2
##  4     7     1 c         1  6.51     2
##  5     8     1 c         2  5.79     2
##  6    11     1 d         2  4.12     2
##  7    12     1 d         3  3.50     2
##  8    14     2 a         2  1.36     2
##  9    16     2 b         1  7.51     2
## 10    17     2 b         2  3.79     2
## # ... with 86 more rows
sim3_confirm <- anti_join(sim3, sim3_train) # dataset used to confirm the model
## Joining, by = c("id", "x1", "x2", "rep", "y", "sd")
## Warning: Column `y` has different attributes on LHS and RHS of join
sim3_confirm
## # A tibble: 24 x 6
##       id    x1 x2      rep      y    sd
##    <int> <int> <fct> <int>  <dbl> <dbl>
##  1     1     1 a         1 -0.571     2
##  2     3     1 a         3  2.24      2
##  3     6     1 b         3  7.72      2
##  4     9     1 c         3  6.07      2
##  5    10     1 d         1  2.11      2
##  6    13     2 a         1  1.55      2
##  7    15     2 a         3  0.252     2
##  8    23     2 d         2  3.45      2
##  9    27     3 a         3  0.346     2
## 10    35     3 d         2  0.844     2
## # ... with 14 more rows
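
The preferred model can then be refitted on the training data and checked against the set-aside data, for example by comparing root-mean-square errors with modelr's rmse(); a minimal sketch (mod_train is just an illustrative name):

mod_train <- lm(formula = y ~ x1 * x2, data = sim3_train)

rmse(mod_train, sim3_train)   # error on the data the model was trained on
rmse(mod_train, sim3_confirm) # error on the set-aside data

If the two errors are similar, the model is not overfitting badly; a much larger error on the confirm dataset would suggest the model has fitted noise as well as signal.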