Residual Analysis: Valid and Invalid Regression Models - Anscombe’s Four Data Sets

# Clear the workspace
  rm(list = ls()) # Clear all files from your environment
# gc()            # Clear unused memory
  cat("\f")       # Clear the console
  graphics.off()  # Clear all graphs

Raw Data

x1 <- c(10,8,13,9,11,14,6,4,12,7,5)
x2 <- x1
x3 <- x1 
x4 <- c(rep(x = 8, times=7), 19, rep(x = 8, times=3 ))

y1 <- c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68)
y2 <- c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)
y3 <- c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42,5.73)
y4 <- c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5, 5.56, 7.91, 6.89)

df <- as.data.frame(cbind(y1,y2,y3,y4,x1,x2,x3,x4))

Linear Regressions

Run the linear regressions

mod1 <- lm(data = df, 
           formula = y1 ~ x1)

mod2 <- lm(data = df, 
           formula = y2 ~ x2)

mod3 <- lm(data = df, 
           formula = y3 ~ x3)

mod4 <- lm(data = df, 
           formula = y4 ~ x4)

Present the linear regressions

library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
stargazer(mod1, mod2, mod3, mod4, type = "text")

====================================================================
                                       Dependent variable:          
                             ---------------------------------------
                                y1        y2        y3        y4    
                                (1)       (2)       (3)       (4)   
--------------------------------------------------------------------
x1                           0.500***                               
                              (0.118)                               
                                                                    
x2                                     0.500***                     
                                        (0.118)                     
                                                                    
x3                                               0.500***           
                                                  (0.118)           
                                                                    
x4                                                         0.500*** 
                                                            (0.118) 
                                                                    
Constant                      3.000**   3.001**   3.002**   3.002** 
                              (1.125)   (1.125)   (1.124)   (1.124) 
                                                                    
--------------------------------------------------------------------
Observations                    11        11        11        11    
R2                             0.667     0.666     0.666     0.667  
Adjusted R2                    0.629     0.629     0.629     0.630  
Residual Std. Error (df = 9)   1.237     1.237     1.236     1.236  
F Statistic (df = 1; 9)      17.990*** 17.966*** 17.972*** 18.003***
====================================================================
Note:                                    *p<0.1; **p<0.05; ***p<0.01

Residual Analysis

Model 1

summary(mod1)

Call:
lm(formula = y1 ~ x1, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.92127 -0.45577 -0.04136  0.70941  1.83882 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)   3.0001     1.1247   2.667  0.02573 * 
x1            0.5001     0.1179   4.241  0.00217 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.237 on 9 degrees of freedom
Multiple R-squared:  0.6665,    Adjusted R-squared:  0.6295 
F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217
# Create a scatter plot of the raw data
plot(x = x1, 
     y = y1, 
     main = "Scatter Plot with Best-Fit Line", 
     xlab = "X-axis", 
     ylab = "Y-axis", 
     pch = 19)

# Add the best-fit line to the plot
abline(mod1, col = "red")

plot(mod1)

df$fitted_y1 <- fitted(mod1)

Model 2

summary(mod2)

Call:
lm(formula = y2 ~ x2, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.9009 -0.7609  0.1291  0.9491  1.2691 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)    3.001      1.125   2.667  0.02576 * 
x2             0.500      0.118   4.239  0.00218 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.237 on 9 degrees of freedom
Multiple R-squared:  0.6662,    Adjusted R-squared:  0.6292 
F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002179
# Create a scatter plot of the raw data
plot(x = x2, y = y2, 
     main = "Scatter Plot with Best-Fit Line", 
     xlab = "X-axis", 
     ylab = "Y-axis", 
     pch = 19)

# Add the best-fit line to the plot
abline(mod2, col = "red")

plot(mod2)

df$fitted_y2 <- fitted(mod2)

Try a different specification:

mod2_modified <- lm(data = df, formula = y2 ~ x2 + I(x2^2))
summary(mod2_modified)

Call:
lm(formula = y2 ~ x2 + I(x2^2), data = df)

Residuals:
       Min         1Q     Median         3Q        Max 
-0.0013287 -0.0011888 -0.0006294  0.0008741  0.0023776 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -5.9957343  0.0043299   -1385   <2e-16 ***
x2           2.7808392  0.0010401    2674   <2e-16 ***
I(x2^2)     -0.1267133  0.0000571   -2219   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.001672 on 8 degrees of freedom
Multiple R-squared:      1, Adjusted R-squared:      1 
F-statistic: 7.378e+06 on 2 and 8 DF,  p-value: < 2.2e-16
# # Create a scatter plot of the raw data
# plot(x = x2, y = y2,
#      main = "Scatter Plot with Best-Fit Line",
#      xlab = "X-axis",
#      ylab = "Y-axis",
#      pch = 19)
# 
# # Add the best-fit line to the plot
# abline(mod2_modified, col = "red")


plot(mod2_modified)

Model 3

summary(mod3)

Call:
lm(formula = y3 ~ x3, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.1586 -0.6146 -0.2303  0.1540  3.2411 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)   3.0025     1.1245   2.670  0.02562 * 
x3            0.4997     0.1179   4.239  0.00218 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.236 on 9 degrees of freedom
Multiple R-squared:  0.6663,    Adjusted R-squared:  0.6292 
F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002176
# Create a scatter plot of the raw data
plot(x = x3, y = y3, 
     main = "Scatter Plot with Best-Fit Line", 
     xlab = "X-axis", 
     ylab = "Y-axis", 
     pch = 19)

# Add the best-fit line to the plot
abline(mod3, col = "red")

plot(mod3)

df$fitted_y3 <- fitted(mod3)

Model 4

summary(mod4)

Call:
lm(formula = y4 ~ x4, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.751 -0.831  0.000  0.809  1.839 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)   3.0017     1.1239   2.671  0.02559 * 
x4            0.4999     0.1178   4.243  0.00216 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.236 on 9 degrees of freedom
Multiple R-squared:  0.6667,    Adjusted R-squared:  0.6297 
F-statistic:    18 on 1 and 9 DF,  p-value: 0.002165
# Create a scatter plot of the raw data
plot(x = x4, y = y4, 
     main = "Scatter Plot with Best-Fit Line", 
     xlab = "X-axis", 
     ylab = "Y-axis", 
     pch = 19)

# Add the best-fit line to the plot
abline(mod4, col = "red")

plot(mod4)
Warning: not plotting observations with leverage one:
  8

df$fitted_y4 <- fitted(mod4)

Presentation

Second, I learned about a simple but powerful way to visualize your residuals from non-base R commands that may be well worth a look.

Video - https://www.youtube.com/watch?v=Bi8sHIo3s1Y

easystats: Quickly investigate model performance: https://www.r-bloggers.com/2021/07/easystats-quickly-investigate-model-performance/