Visualising Residuals

Tutorial from BLOGR

First, let's look at the default options R provides for visualising the residuals of a fitted model:

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
fit <- lm(mpg ~ hp, data = mtcars)
summary(fit)
## 
## Call:
## lm(formula = mpg ~ hp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7121 -2.1122 -0.8854  1.5819  8.2360 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.09886    1.63392  18.421  < 2e-16 ***
## hp          -0.06823    0.01012  -6.742 1.79e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.863 on 30 degrees of freedom
## Multiple R-squared:  0.6024, Adjusted R-squared:  0.5892 
## F-statistic: 45.46 on 1 and 30 DF,  p-value: 1.788e-07
par(mfrow = c(2, 2))  # Split the plotting panel into a 2 x 2 grid
plot(fit)  # Plot the model information
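# The four default panels: Residuals vs Fitted, Normal Q-Q,
# Scale-Location, and Residuals vs Leverage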

General Approach

The general approach behind each of the examples that we’ll cover below is to:

  1. Fit a regression model to predict a variable (Y).
  2. Obtain the predicted and residual values associated with each observation on (Y).
  3. Plot the actual and predicted values of (Y) so that they are distinguishable, but connected.
  4. Use the residuals to make an aesthetic adjustment (e.g. red colour when the residual is very high) to highlight points that are poorly predicted by the model.

Linear Regression

Step 1: fit the model

d <- mtcars
fit <- lm(mpg ~ hp, data = d)

Step 2: obtain predicted and residual values

d$predicted <- predict(fit)   # Save the predicted values
d$residuals <- residuals(fit) # Save the residual values

# Quick look at the actual, predicted, and residual values
library(dplyr)
d %>% select(mpg, predicted, residuals) %>% head()
##                    mpg predicted  residuals
## Mazda RX4         21.0  22.59375 -1.5937500
## Mazda RX4 Wag     21.0  22.59375 -1.5937500
## Datsun 710        22.8  23.75363 -0.9536307
## Hornet 4 Drive    21.4  22.59375 -1.1937500
## Hornet Sportabout 18.7  18.15891  0.5410881
## Valiant           18.1  22.93489 -4.8348913
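
As a quick sanity check (an aside, not part of the original tutorial), a residual is just the actual value minus the predicted value:

# Residual = actual - predicted, so this should return TRUE
all.equal(unname(d$mpg - d$predicted), unname(d$residuals))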

Step 3: plot the actual and predicted values

library(ggplot2)
ggplot(d, aes(x = hp, y = mpg)) +  # Set up canvas with outcome variable on y-axis
  geom_point()  # Plot the actual points

ggplot(d, aes(x = hp, y = mpg)) +
  geom_point() +
  geom_point(aes(y = predicted), shape = 1)  # Add the predicted values

Next, we connect the actual data points to their corresponding predicted values using geom_segment():

ggplot(d, aes(x = hp, y = mpg)) +
  geom_segment(aes(xend = hp, yend = predicted)) +
  geom_point() +
  geom_point(aes(y = predicted), shape = 1)

Next, a few finishing touches:

  • Clean up the overall look with theme_bw().
  • Fade out the connection lines by adjusting their alpha.
  • Add the regression slope with geom_smooth():

library(ggplot2)
ggplot(d, aes(x = hp, y = mpg)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +  # Plot regression slope
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +  # alpha to fade lines
  geom_point() +
  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()  # Add theme for cleaner look
## `geom_smooth()` using formula 'y ~ x'

Step 4: use residuals to adjust

Finally, we want to make an adjustment to highlight the size of the residual. There are MANY options. To make comparisons easy, I’ll make adjustments to the actual values, but you could just as easily apply these, or other changes, to the predicted values. Here are a few examples building on the previous plot:

# ALPHA
# Changing alpha of actual values based on absolute value of residuals
ggplot(d, aes(x = hp, y = mpg)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +

  # > Alpha adjustments made here...
  geom_point(aes(alpha = abs(residuals))) +  # Alpha mapped to abs(residuals)
  guides(alpha = FALSE) +  # Alpha legend removed
  # <

  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# COLOR
# High residuals (in absolute terms) made more red on actual values.
ggplot(d, aes(x = hp, y = mpg)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +

  # > Color adjustments made here...
  geom_point(aes(color = abs(residuals))) + # Color mapped to abs(residuals)
  scale_color_continuous(low = "black", high = "red") +  # Colors to use here
  guides(color = FALSE) +  # Color legend removed
  # <

  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# SIZE AND COLOR
# Same coloring as above, size corresponding as well
ggplot(d, aes(x = hp, y = mpg)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +

  # > Color AND size adjustments made here...
  geom_point(aes(color = abs(residuals), size = abs(residuals))) + # size also mapped
  scale_color_continuous(low = "black", high = "red") +
  guides(color = FALSE, size = FALSE) +  # Size legend also removed
  # <

  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# COLOR UNDER/OVER
# Color mapped to residual with sign taken into account.
# i.e., whether actual value is greater or less than predicted
ggplot(d, aes(x = hp, y = mpg)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +

  # > Color adjustments made here...
  geom_point(aes(color = residuals)) +  # Color mapped here
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +  # Colors to use here
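  # (midpoint defaults to 0, so actual > predicted shows red, actual < predicted shows blue)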
  guides(color = FALSE) +
  # <

  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

Multiple Regression

Let’s crank up the complexity and get into multiple regression, where we regress one variable on two or more others. For this example, we’ll regress Miles per gallon (mpg) on horsepower (hp), weight (wt), and displacement (disp).

# Select out data of interest:
d <- mtcars %>% select(mpg, hp, wt, disp)

# Fit the model
fit <- lm(mpg ~ hp + wt + disp, data = d)

# Obtain predicted and residual values
d$predicted <- predict(fit)
d$residuals <- residuals(fit)

head(d)
##                    mpg  hp    wt disp predicted  residuals
## Mazda RX4         21.0 110 2.620  160  23.57003 -2.5700299
## Mazda RX4 Wag     21.0 110 2.875  160  22.60080 -1.6008028
## Datsun 710        22.8  93 2.320  108  25.28868 -2.4886829
## Hornet 4 Drive    21.4 110 3.215  258  21.21667  0.1833269
## Hornet Sportabout 18.7 175 3.440  360  18.24072  0.4592780
## Valiant           18.1 105 3.460  225  20.47216 -2.3721590

Let’s create a relevant plot using ONE of our predictors, horsepower (hp). Again, we’ll start by plotting the actual and predicted values. In this case, plotting the regression slope is a little more complicated, so we’ll exclude it to keep the focus on the points.

ggplot(d, aes(x = hp, y = mpg)) +
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +  # Lines to connect points
  geom_point() +  # Points of actual values
  geom_point(aes(y = predicted), shape = 1) +  # Points of predicted values
  theme_bw()
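
If you did want a line showing the model’s trend in hp, one option (a sketch of my own, not part of the original tutorial) is to predict over a grid of hp values while holding wt and disp at their means:

# Hypothetical helper data: a grid of hp values with wt and disp fixed at their means
new_d <- data.frame(hp   = seq(min(d$hp), max(d$hp), length.out = 100),
                    wt   = mean(d$wt),
                    disp = mean(d$disp))
new_d$predicted <- predict(fit, newdata = new_d)

ggplot(d, aes(x = hp, y = mpg)) +
  geom_line(data = new_d, aes(y = predicted), color = "lightgrey") +  # Partial trend for hp
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +
  geom_point() +
  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()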

Again, we can make all sorts of adjustments using the residual values. Let’s apply the same changes as in the last plot above: red for actual values that are greater than their predicted values, and blue for those that are less:

ggplot(d, aes(x = hp, y = mpg)) +
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +
  geom_point(aes(color = residuals)) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  guides(color = FALSE) +
  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()

Plotting multiple predictors at once

To visualise this, we’ll make use of one of my favourite tricks: using the tidyr package to gather() our independent variable columns, and then using facet_*() in our ggplot to split them into separate panels.

d %>% 
  gather(key = "iv", value = "x", -mpg, -predicted, -residuals) %>%  # Get data into shape
  ggplot(aes(x = x, y = mpg)) +  # Note use of `x` here and next line
  geom_segment(aes(xend = x, yend = predicted), alpha = .2) +
  geom_point(aes(color = residuals)) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  guides(color = FALSE) +
  geom_point(aes(y = predicted), shape = 1) +
  facet_grid(~ iv, scales = "free_x") +  # Split panels here by `iv`
  theme_bw()

Let’s try this out with another data set. We’ll use the iris data set, and regress Sepal.Width on all of the other continuous variables (as an aside, thanks to Hadley Wickham for the suggestion to drop categorical variables for plotting):

d <- iris %>% select(-Species)

# Fit the model
fit <- lm(Sepal.Width ~ ., data = d)  # Use d so that Species is excluded

# Obtain predicted and residual values
d$predicted <- predict(fit)
d$residuals <- residuals(fit)

# Create plot
d %>% 
  gather(key = "iv", value = "x", -Sepal.Width, -predicted, -residuals) %>%
  ggplot(aes(x = x, y = Sepal.Width)) +
  geom_segment(aes(xend = x, yend = predicted), alpha = .2) +
  geom_point(aes(color = residuals)) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  guides(color = FALSE) +
  geom_point(aes(y = predicted), shape = 1) +
  facet_grid(~ iv, scales = "free_x") +
  theme_bw()

We can now see how the actual and predicted values compare across our predictor variables. In case you’d forgotten, the coloured points are the actual data, and the open circles are the predicted values. With this in mind, we can see, as expected, that there is less variability in the predicted values than in the actual values.

Logistic Regression

To round this post off, let’s extend our approach to logistic regression. It’s going to require the same basic workflow, but we will need to extract predicted and residual values on the response (probability) scale. Here’s an example predicting V/S (vs), which is 0 or 1, with hp:

# Step 1: Fit the data
d <- mtcars
fit <- glm(vs ~ hp, family = binomial(), data = d)

# Step 2: Obtain predicted and residuals
d$predicted <- predict(fit, type = "response")
d$residuals <- residuals(fit, type = "response")
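# type = "response" puts these on the probability scale (the defaults are the
# log-odds scale for predict() and deviance residuals for residuals())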

# Steps 3 and 4: plot the results
ggplot(d, aes(x = hp, y = vs)) +
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +
  geom_point(aes(color = residuals)) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  guides(color = FALSE) +
  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()

If we only want to flag cases that would be scored as the incorrect category, we can do something like the following (with some help from the dplyr function, filter()):

ggplot(d, aes(x = hp, y = vs)) +
  geom_segment(aes(xend = hp, yend = predicted), alpha = .2) +
  geom_point() +

  # > This plots a large red circle on each misclassified point
  geom_point(data = d %>% filter(vs != round(predicted)),
             color = "red", size = 2) +
  # <

  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  guides(color = FALSE) +
  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()
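
To get a quick count of the flagged cases (a small aside, not in the original tutorial):

# Number of cars whose rounded predicted probability disagrees with vs
sum(d$vs != round(d$predicted))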

Bonus: using broom

After receiving the same helpful suggestion from aurelien ginolhac and Ilya Kashnitsky (my thanks to both of them), this section will briefly cover how to implement the augment() function from the broom package for Step 2 of the above.

The broom package helps to “convert statistical analysis objects from R into tidy data frames”. In our case, augment() will convert the fitted regression model into a dataframe with the predicted (fitted) and residual values already available.

A complete example using augment() is shown below. However, there are a couple of important differences to note about the data returned by augment() compared to the earlier approach:

  • The predicted and residual values are named .fitted and .resid, respectively.
  • There are many additional variables that we gain access to. These need to be dropped if you wish to implement the gather() and facet_*() combination described earlier (see the sketch at the end of this post).

library(broom)

# Steps 1 and 2
d <- lm(mpg ~ hp, data = mtcars) %>% 
       augment()

head(d)
## # A tibble: 6 x 9
##   .rownames           mpg    hp .fitted .resid .std.resid   .hat .sigma  .cooksd
##   <chr>             <dbl> <dbl>   <dbl>  <dbl>      <dbl>  <dbl>  <dbl>    <dbl>
## 1 Mazda RX4          21     110    22.6 -1.59      -0.421 0.0405   3.92 0.00374 
## 2 Mazda RX4 Wag      21     110    22.6 -1.59      -0.421 0.0405   3.92 0.00374 
## 3 Datsun 710         22.8    93    23.8 -0.954     -0.253 0.0510   3.92 0.00173 
## 4 Hornet 4 Drive     21.4   110    22.6 -1.19      -0.315 0.0405   3.92 0.00210 
## 5 Hornet Sportabout  18.7   175    18.2  0.541      0.143 0.0368   3.93 0.000389
## 6 Valiant            18.1   105    22.9 -4.83      -1.28  0.0432   3.82 0.0369
# Steps 3 and 4
ggplot(d, aes(x = hp, y = mpg)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
  geom_segment(aes(xend = hp, yend = .fitted), alpha = .2) +  # Note `.fitted`
  geom_point(aes(alpha = abs(.resid))) +  # Note `.resid`
  guides(alpha = FALSE) +
  geom_point(aes(y = .fitted), shape = 1) +  # Note `.fitted`
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
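
As a final aside (a sketch of my own building on the earlier multiple regression example, not part of the original tutorial), augment() slots straight into the gather() and facet_*() combination once the extra columns are dropped:

# Refit the earlier multiple regression and pipe augment() into the facet trick
lm(mpg ~ hp + wt + disp, data = mtcars) %>%
  augment() %>%
  select(mpg, hp, wt, disp, .fitted, .resid) %>%  # Drop the extra columns
  gather(key = "iv", value = "x", -mpg, -.fitted, -.resid) %>%
  ggplot(aes(x = x, y = mpg)) +
  geom_segment(aes(xend = x, yend = .fitted), alpha = .2) +
  geom_point(aes(color = .resid)) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  guides(color = FALSE) +
  geom_point(aes(y = .fitted), shape = 1) +
  facet_grid(~ iv, scales = "free_x") +
  theme_bw()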