library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts -------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(modelr)

Load “sim1” data from modelr

sim1 <- sim1

sim1

## # A tibble: 30 x 2
##        x     y
##    <int> <dbl>
##  1     1  4.20
##  2     1  7.51
##  3     1  2.13
##  4     2  8.99
##  5     2 10.2 
##  6     2 11.3 
##  7     3  7.36
##  8     3 10.5 
##  9     3 10.5 
## 10     4 12.4 
## # ... with 20 more rows

Linear x y relationship

ggplot(sim1, aes(x,y)) +
  geom_point() +
  geom_smooth(method =  "lm", se = F)

A model for this relationship

sim1_mod <- lm(y ~ x, data = sim1)

sim1_mod

## 
## Call:
## lm(formula = y ~ x, data = sim1)
## 
## Coefficients:
## (Intercept)            x  
##       4.221        2.052

We can add predictions to this df. The y value where the x intersects with the fit line

sim1_pred <-
  sim1 %>% 
  add_predictions(sim1_mod)

sim1_pred

## # A tibble: 30 x 3
##        x     y  pred
##    <int> <dbl> <dbl>
##  1     1  4.20  6.27
##  2     1  7.51  6.27
##  3     1  2.13  6.27
##  4     2  8.99  8.32
##  5     2 10.2   8.32
##  6     2 11.3   8.32
##  7     3  7.36 10.4 
##  8     3 10.5  10.4 
##  9     3 10.5  10.4 
## 10     4 12.4  12.4 
## # ... with 20 more rows

We can also add the residuals. The distances between the points and the fit line

sim1_resid <-
  sim1 %>% 
  add_residuals(sim1_mod)

sim1_resid

## # A tibble: 30 x 3
##        x     y    resid
##    <int> <dbl>    <dbl>
##  1     1  4.20 -2.07   
##  2     1  7.51  1.24   
##  3     1  2.13 -4.15   
##  4     2  8.99  0.665  
##  5     2 10.2   1.92   
##  6     2 11.3   2.97   
##  7     3  7.36 -3.02   
##  8     3 10.5   0.130  
##  9     3 10.5   0.136  
## 10     4 12.4   0.00763
## # ... with 20 more rows

We can plot the residuals. Here as a frequency plot How far away are the predictions from the observed values? The mean will also be zero

ggplot(sim1_resid, aes(resid)) +
  geom_freqpoly(binwidth = 0.5)

Can also plot as points. The distribution looks random which is good

ggplot(sim1_resid, aes(x, resid)) +
  geom_ref_line(h = 0) +
  geom_point()

Sim2 is a categorical

sim2 <-sim2
sim2

## # A tibble: 40 x 2
##    x          y
##    <chr>  <dbl>
##  1 a      1.94 
##  2 a      1.18 
##  3 a      1.24 
##  4 a      2.62 
##  5 a      1.11 
##  6 a      0.866
##  7 a     -0.910
##  8 a      0.721
##  9 a      0.687
## 10 a      2.07 
## # ... with 30 more rows

Graph sim2. The category x appears to predict the value y

ggplot(sim2, aes(x, y)) + 
  geom_point()

We can create a model for this

mod2 <- lm(y ~ x, data = sim2) #y is dependent on our category

grid <- sim2 %>% 
  data_grid(x) %>% #to visualise a model, it is very useful to be able to generate an evenly spaced grid of points from the data. Without this line add_predicitons would output a value for every point. This groups the categories into one line
  add_predictions(mod2) #add predicitons column
grid

## # A tibble: 4 x 2
##   x      pred
##   <chr> <dbl>
## 1 a      1.15
## 2 b      8.12
## 3 c      6.13
## 4 d      1.91

We can visualise these predictors

ggplot(sim2, aes(x)) + 
  geom_point(aes(y = y)) +
  geom_point(data = grid, aes(y = pred), colour = "red", size = 4)

Sim3 has both continous and categorical columns

sim3 <- sim3
sim3

## # A tibble: 120 x 5
##       x1 x2      rep      y    sd
##    <int> <fct> <int>  <dbl> <dbl>
##  1     1 a         1 -0.571     2
##  2     1 a         2  1.18      2
##  3     1 a         3  2.24      2
##  4     1 b         1  7.44      2
##  5     1 b         2  8.52      2
##  6     1 b         3  7.72      2
##  7     1 c         1  6.51      2
##  8     1 c         2  5.79      2
##  9     1 c         3  6.07      2
## 10     1 d         1  2.11      2
## # ... with 110 more rows

Visualise this

ggplot(sim3, aes(x1, y)) + 
  geom_point(aes(colour = x2))

There are two possible models you could fit to this data:

mod1 <- lm(y ~ x1 + x2, data = sim3)
mod2 <- lm(y ~ x1 * x2, data = sim3)

We can see the predicted values for each model side by side:

grid <- sim3 %>% 
  data_grid(x1, x2) %>% 
  spread_predictions(mod1, mod2)
grid

## # A tibble: 40 x 4
##       x1 x2     mod1  mod2
##    <int> <fct> <dbl> <dbl>
##  1     1 a      1.67  1.21
##  2     1 b      4.56  7.52
##  3     1 c      6.48  5.71
##  4     1 d      4.03  2.32
##  5     2 a      1.48  1.12
##  6     2 b      4.37  6.66
##  7     2 c      6.28  5.68
##  8     2 d      3.84  2.50
##  9     3 a      1.28  1.02
## 10     3 b      4.17  5.81
## # ... with 30 more rows

Or stacked:

grid_pred <- sim3 %>% 
  data_grid(x1, x2) %>% 
  gather_predictions(mod1, mod2)
grid_pred

## # A tibble: 80 x 4
##    model    x1 x2     pred
##    <chr> <int> <fct> <dbl>
##  1 mod1      1 a      1.67
##  2 mod1      1 b      4.56
##  3 mod1      1 c      6.48
##  4 mod1      1 d      4.03
##  5 mod1      2 a      1.48
##  6 mod1      2 b      4.37
##  7 mod1      2 c      6.28
##  8 mod1      2 d      3.84
##  9 mod1      3 a      1.28
## 10 mod1      3 b      4.17
## # ... with 70 more rows

We can visualise the results for both models on one plot using facetting. Note that the model that uses + has the same slope for each line, but different intercepts. The model that uses * has a different slope and intercept for each line:

ggplot(sim3, aes(x1, y, colour = x2)) + 
  geom_point() + 
  geom_line(data = grid_pred, aes(y = pred)) + 
  facet_wrap(~ model)

We can plot the residuals of each model

sim3

## # A tibble: 120 x 5
##       x1 x2      rep      y    sd
##    <int> <fct> <int>  <dbl> <dbl>
##  1     1 a         1 -0.571     2
##  2     1 a         2  1.18      2
##  3     1 a         3  2.24      2
##  4     1 b         1  7.44      2
##  5     1 b         2  8.52      2
##  6     1 b         3  7.72      2
##  7     1 c         1  6.51      2
##  8     1 c         2  5.79      2
##  9     1 c         3  6.07      2
## 10     1 d         1  2.11      2
## # ... with 110 more rows

sim3_resid <- sim3 %>% 
  gather_residuals(mod1, mod2)
  
  sim3_resid

## # A tibble: 240 x 7
##    model    x1 x2      rep      y    sd   resid
##    <chr> <int> <fct> <int>  <dbl> <dbl>   <dbl>
##  1 mod1      1 a         1 -0.571     2 -2.25  
##  2 mod1      1 a         2  1.18      2 -0.491 
##  3 mod1      1 a         3  2.24      2  0.562 
##  4 mod1      1 b         1  7.44      2  2.87  
##  5 mod1      1 b         2  8.52      2  3.96  
##  6 mod1      1 b         3  7.72      2  3.16  
##  7 mod1      1 c         1  6.51      2  0.0261
##  8 mod1      1 c         2  5.79      2 -0.691 
##  9 mod1      1 c         3  6.07      2 -0.408 
## 10 mod1      1 d         1  2.11      2 -1.92  
## # ... with 230 more rows

There is little obvious pattern in the residuals for mod2. The residuals for mod1 show that the model has clearly missed some pattern in b, and less so, but still present is pattern in c, and d.

ggplot(sim3_resid, aes(x1, resid, colour = x2)) + 
  geom_point() + 
  facet_grid(model ~ x2)

Sim4 has two continuous variables

sim4

## # A tibble: 300 x 4
##       x1     x2   rep       y
##    <dbl>  <dbl> <int>   <dbl>
##  1    -1 -1         1  4.25  
##  2    -1 -1         2  1.21  
##  3    -1 -1         3  0.353 
##  4    -1 -0.778     1 -0.0467
##  5    -1 -0.778     2  4.64  
##  6    -1 -0.778     3  1.38  
##  7    -1 -0.556     1  0.975 
##  8    -1 -0.556     2  2.50  
##  9    -1 -0.556     3  2.70  
## 10    -1 -0.333     1  0.558 
## # ... with 290 more rows

Make two models as before

mod1 <- lm(y ~ x1 + x2, data = sim4)
mod2 <- lm(y ~ x1 * x2, data = sim4)

Get prdicted values in wide format :

grid2 <- sim4 %>% 
  data_grid(x1, x2) %>% 
  spread_predictions(mod1, mod2)
grid2

## # A tibble: 100 x 4
##       x1     x2   mod1   mod2
##    <dbl>  <dbl>  <dbl>  <dbl>
##  1    -1 -1      0.996  1.95 
##  2    -1 -0.778  0.378  1.12 
##  3    -1 -0.556 -0.240  0.289
##  4    -1 -0.333 -0.859 -0.541
##  5    -1 -0.111 -1.48  -1.37 
##  6    -1  0.111 -2.10  -2.20 
##  7    -1  0.333 -2.71  -3.03 
##  8    -1  0.556 -3.33  -3.86 
##  9    -1  0.778 -3.95  -4.69 
## 10    -1  1     -4.57  -5.52 
## # ... with 90 more rows

Predicted values in long format:

grid <- sim4 %>% 
  data_grid(x1, x2) %>% 
  gather_predictions(mod1, mod2)
grid

## # A tibble: 200 x 4
##    model    x1     x2   pred
##    <chr> <dbl>  <dbl>  <dbl>
##  1 mod1     -1 -1      0.996
##  2 mod1     -1 -0.778  0.378
##  3 mod1     -1 -0.556 -0.240
##  4 mod1     -1 -0.333 -0.859
##  5 mod1     -1 -0.111 -1.48 
##  6 mod1     -1  0.111 -2.10 
##  7 mod1     -1  0.333 -2.71 
##  8 mod1     -1  0.556 -3.33 
##  9 mod1     -1  0.778 -3.95 
## 10 mod1     -1  1     -4.57 
## # ... with 190 more rows

Graph these values in 3D plot

ggplot(grid, aes(x1, x2)) + 
  geom_tile(aes(fill = pred)) + 
  facet_wrap(~ model)

That doesn’t suggest that the models are very different! But that’s partly an illusion: our eyes and brains are not very good at accurately comparing shades of colour. Instead of looking at the surface from the top, we could look at it from either side, showing multiple slices:

ggplot(grid, aes(x1, pred, colour = x2, group = x2)) + #x1 versus predicted value by model
  geom_line() +
  facet_wrap(~ model)

ggplot(grid, aes(x2, pred, colour = x1, group = x1)) + #x2 versus predicted value by model
  geom_line() +
  facet_wrap(~ model)

This shows you that interaction between two continuous variables works basically the same way as for a categorical and continuous variable. An interaction says that there’s not a fixed offset: you need to consider both values of x1 and x2 simultaneously in order to predict y

Create non linear distribution

sim5 <- tibble(
  x = seq(0, 3.5 * pi, length = 50),
  y = 4 * sin(x) + rnorm(length(x))
)

ggplot(sim5, aes(x, y)) +
  geom_point()

Fit 5 models to this using polynomial:

mod1 <- lm(y ~ poly(x, 1), data = sim5)
mod2 <- lm(y ~ poly(x, 2), data = sim5)
mod3 <- lm(y ~ poly(x, 3), data = sim5)
mod4 <- lm(y ~ poly(x, 4), data = sim5)
mod5 <- lm(y ~ poly(x, 5), data = sim5)

Plot these models

grid <- sim5 %>% 
  data_grid(x = seq_range(x, n = 50, expand = 0.1)) %>% 
  gather_predictions(mod1, mod2, mod3, mod4, mod5, .pred = "y")

ggplot(sim5, aes(x, y)) + 
  geom_point() +
  geom_line(data = grid, colour = "red") +
  facet_wrap(~ model)

Use splines instead of polynomial

library(splines)

mod1 <- lm(y ~ ns(x, 1), data = sim5)
mod2 <- lm(y ~ ns(x, 2), data = sim5)
mod3 <- lm(y ~ ns(x, 3), data = sim5)
mod4 <- lm(y ~ ns(x, 4), data = sim5)
mod5 <- lm(y ~ ns(x, 5), data = sim5)

Plot these models

grid <- sim5 %>% 
  data_grid(x = seq_range(x, n = 50, expand = 0.1)) %>% 
  gather_predictions(mod1, mod2, mod3, mod4, mod5, .pred = "y")

ggplot(sim5, aes(x, y)) + 
  geom_point() +
  geom_line(data = grid, colour = "red") +
  facet_wrap(~ model)

Let’s look at model outputs of our very first set of data

Load “sim1” data from modelr

sim1 <- sim1

sim1

## # A tibble: 30 x 2
##        x     y
##    <int> <dbl>
##  1     1  4.20
##  2     1  7.51
##  3     1  2.13
##  4     2  8.99
##  5     2 10.2 
##  6     2 11.3 
##  7     3  7.36
##  8     3 10.5 
##  9     3 10.5 
## 10     4 12.4 
## # ... with 20 more rows

Linear x y relationship

ggplot(sim1, aes(x,y)) +
  geom_point() +
  geom_smooth(method =  "lm", se = F)

A model for this relationship

sim1_mod <- lm(y ~ x, data = sim1)

sim1_mod #gives x and our intercept from our model

## 
## Call:
## lm(formula = y ~ x, data = sim1)
## 
## Coefficients:
## (Intercept)            x  
##       4.221        2.052

summary(sim1_mod) #gives the estimate (as before), as well as std error, t value, and P value

## 
## Call:
## lm(formula = y ~ x, data = sim1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1469 -1.5197  0.1331  1.4670  4.6516 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.2208     0.8688   4.858 4.09e-05 ***
## x             2.0515     0.1400  14.651 1.17e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.203 on 28 degrees of freedom
## Multiple R-squared:  0.8846, Adjusted R-squared:  0.8805 
## F-statistic: 214.7 on 1 and 28 DF,  p-value: 1.173e-14

Instead of base R above, we can use broom

library(broom)

## 
## Attaching package: 'broom'

## The following object is masked from 'package:modelr':
## 
##     bootstrap

broom::tidy(sim1_mod) #gives a tidy version of summary

## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     4.22     0.869      4.86 4.09e- 5
## 2 x               2.05     0.140     14.7  1.17e-14

broom::augment(sim1_mod) # is like a tidy version of data_grid with fit, residuals, and others in tidy format

## # A tibble: 30 x 9
##        y     x .fitted .se.fit   .resid   .hat .sigma    .cooksd .std.resid
##    <dbl> <int>   <dbl>   <dbl>    <dbl>  <dbl>  <dbl>      <dbl>      <dbl>
##  1  4.20     1    6.27   0.748 -2.07    0.115    2.20    6.51e-2   -1.00   
##  2  7.51     1    6.27   0.748  1.24    0.115    2.23    2.32e-2    0.598  
##  3  2.13     1    6.27   0.748 -4.15    0.115    2.08    2.61e-1   -2.00   
##  4  8.99     2    8.32   0.634  0.665   0.0828   2.24    4.49e-3    0.315  
##  5 10.2      2    8.32   0.634  1.92    0.0828   2.21    3.74e-2    0.910  
##  6 11.3      2    8.32   0.634  2.97    0.0828   2.16    8.97e-2    1.41   
##  7  7.36     3   10.4    0.533 -3.02    0.0586   2.16    6.21e-2   -1.41   
##  8 10.5      3   10.4    0.533  0.130   0.0586   2.24    1.15e-4    0.0608 
##  9 10.5      3   10.4    0.533  0.136   0.0586   2.24    1.26e-4    0.0637 
## 10 12.4      4   12.4    0.454  0.00763 0.0424   2.24    2.78e-7    0.00354
## # ... with 20 more rows

broom::glance(sim1_mod) #provides best fit line data like the r2 value

## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.885         0.880  2.20      215. 1.17e-14     2  -65.2  136.  141.
## # ... with 2 more variables: deviance <dbl>, df.residual <int>

Jtools is another package for similar analysis

library(jtools)


summ(sim1_mod) # like summary/tidy but in a visually appealing way (not "tidy" though)

Observations	30
Dependent variable	y
Type	OLS linear regression

F(1,28)	214.66
R²	0.88
Adj. R²	0.88

	Est.	S.E.	t val.	p
(Intercept)	4.22	0.87	4.86	0.00
x	2.05	0.14	14.65	0.00
Standard errors: OLS

We can use effect plot (from jtools) to fit instead of an x y scatter. The same function can be used for categories

effect_plot(sim1_mod, pred = x, interval = TRUE, plot.points = TRUE)

Can also use jtools plot summs. 95% CIs are shown by default

library(sandwich)
library(ggstance)

## 
## Attaching package: 'ggstance'

## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh

plot_summs(sim1_mod)

Can plot a normal distribution and custom CI

plot_summs(sim1_mod, plot.distributions = TRUE, inner_ci_level = .9)

Model

Load “sim1” data from modelr

Sim2 is a categorical

Sim3 has both continous and categorical columns

Sim4 has two continuous variables

Fit 5 models to this using polynomial:

Let’s look at model outputs of our very first set of data

Load “sim1” data from modelr

Linear x y relationship

Instead of base R above, we can use broom

Jtools is another package for similar analysis