Explorations in statistics: regression

Douglas Curran-Everett
Explorations in statistics: regression
Advan Physiol Educ 2011 35:347-352;
doi:10.1152/advan.00051.2011

#
# -- This code analyzes Anscombe quartet ---------------------------------------
#
par ( mfrow = c ( 1, 1 ) )                                 #  1 graphic on 1 screen


# -- Define population parameters, sample numbers, and plotting ranges ---------
#
PopMean    <-    0                                         #  population mean
PopSD      <-    1                                         #  population standard deviation
nObs       <-   10                                         #  number of observations in each sample

b0         <-    0                                         #  coefficients that define true relationship between Y and X
b1         <-    1
b2         <-   -0.075

PlotFactor <-  100                                         #  scaling factor for horizontal axes

xMin       <-    0 * PlotFactor                            #  x minimum for data loop
xMax       <-   10 * PlotFactor                            #  x maximum for data loop
#
xPlotMin   <-    0                                         #  x minimum for plotting
xPlotMax   <-   10                                         #  x maximum for plotting
yPlotMin   <-    0                                         #  y minimum for plotting
yPlotMax   <-   10                                         #  y maximum for plotting


# -- Regression: an Overview :: Figure 1 ---------------------------------------                   # -- Figure 1: first line
#

# -- Generate 2nd-order polynomial ---------------------------------------------
#
PlotPoints <- 1000
Poly_Data  <- matrix ( nrow = PlotPoints, ncol = 2 )

for ( j in xMin:xMax )
    { x                 <- j / PlotFactor
      Poly_Data [ j,1 ] <- x
      y                 <- b0 + ( b1 * x ) + ( b2 * x**2 )
      Poly_Data [ j,2 ] <- y
      }

# -- Plot polynomials ----------------------------------------------------------
#
plot ( Poly_Data [ ,1 ], Poly_Data [ ,2 ],
       frame.plot =  FALSE,
       main       = "Polynomial models",
       xlab       = "Predictor variable x",
       ylab       = "Response variable y",
       xlim       =  c ( xPlotMin, xPlotMax ),
       ylim       =  c ( yPlotMin, yPlotMax ),
       type       = "l"
       )
abline ( a = b0, b = b1, col = "black" )
text ( 6.50, 8.60, pos = 4, 'Equation 1' )
text ( 6.50, 3.75, pos = 4, 'Equation 2' )

plot of chunk unnamed-chunk-1


PolynomialModelsPlot <- recordPlot ( )                                                             # -- Figure 1: last line


# -- Regression: an Overview :: Figure 3 ---------------------------------------                   # -- Figure 3: first line
#
First_Order_Data  <- matrix ( nrow = nObs, ncol = 2 )

for ( k in 1:nObs )
    { First_Order_Data [ k,1 ] <- k
      Random_Error <- round ( rnorm ( 1, mean = PopMean, sd = PopSD ), 3 )
      First_Order_Data [ k,2 ] <- b0 + ( b1 * First_Order_Data [ k,1 ] ) + Random_Error
      }

yPlotMin <- floor   ( min ( First_Order_Data [ ,2 ] ) )    #  minimum y for plotting
yPlotMax <- ceiling ( max ( First_Order_Data [ ,2 ] ) )    #  maximum y for plotting

lm.First_Order <- lm ( First_Order_Data [ ,2 ] ~ First_Order_Data [ ,1 ] )

anova   ( lm.First_Order )

## Analysis of Variance Table
## 
## Response: First_Order_Data[, 2]
##                       Df Sum Sq Mean Sq F value Pr(>F)    
## First_Order_Data[, 1]  1   88.8    88.8      54  8e-05 ***
## Residuals              8   13.1     1.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary ( lm.First_Order )

## 
## Call:
## lm(formula = First_Order_Data[, 2] ~ First_Order_Data[, 1])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5111 -0.5466  0.0967  0.9797  1.3766 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -0.118      0.876   -0.14      0.9    
## First_Order_Data[, 1]    1.037      0.141    7.35    8e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.28 on 8 degrees of freedom
## Multiple R-squared:  0.871,  Adjusted R-squared:  0.855 
## F-statistic:   54 on 1 and 8 DF,  p-value: 8e-05


plot ( First_Order_Data [ ,1 ], First_Order_Data [ ,2 ],   #  plot true and estimated relationships
       frame.plot =  FALSE,
       main       =  paste ( "Sample estimates: b0 =", b0_hat, "and b1 = ", b1_hat ),
       xlab       = "Predictor variable x",
       ylab       = "Response variable y",
       xlim       =  c ( xPlotMin, xPlotMax ),
       ylim       =  c ( yPlotMin, yPlotMax ),
       pch        = 19,
       )

## Error: object 'b0_hat' not found

abline ( a = b0, b = b1, col = "black"  )                  # -- true relationship
abline ( lm.First_Order, col = "gray50" )                  # -- estimated relationship

plot of chunk unnamed-chunk-1


First_Order_Observations_Plot <- recordPlot ( )                                                    # -- Figure 3: last line


# -- A Classic Example in Regression :: Figure 5 -------------------------------                   # -- Figure 5: first line
#

# -- Drug A: read in data ------------------------------------------------------
#
TheData_A <- matrix ( nrow = 11, ncol = 2 )

TheData_A [  1, ] <- c ( 10,  8.04 )
TheData_A [  2, ] <- c (  8,  6.95 )
TheData_A [  3, ] <- c ( 13,  7.58 )
TheData_A [  4, ] <- c (  9,  8.81 )
TheData_A [  5, ] <- c ( 11,  8.33 )
TheData_A [  6, ] <- c ( 14,  9.96 )
TheData_A [  7, ] <- c (  6,  7.24 )
TheData_A [  8, ] <- c (  4,  4.26 )
TheData_A [  9, ] <- c ( 12, 10.84 )
TheData_A [ 10, ] <- c (  7,  4.82 )
TheData_A [ 11, ] <- c (  5,  5.68 )

TheData_A

##       [,1]  [,2]
##  [1,]   10  8.04
##  [2,]    8  6.95
##  [3,]   13  7.58
##  [4,]    9  8.81
##  [5,]   11  8.33
##  [6,]   14  9.96
##  [7,]    6  7.24
##  [8,]    4  4.26
##  [9,]   12 10.84
## [10,]    7  4.82
## [11,]    5  5.68


lm.Anscombe_A <- lm ( TheData_A [ ,2 ] ~ TheData_A [ ,1 ] )
summary ( lm.Anscombe_A )

## 
## Call:
## lm(formula = TheData_A[, 2] ~ TheData_A[, 1])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9213 -0.4558 -0.0414  0.7094  1.8388 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       3.000      1.125    2.67   0.0257 * 
## TheData_A[, 1]    0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.24 on 9 degrees of freedom
## Multiple R-squared:  0.667,  Adjusted R-squared:  0.629 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00217



# -- Drug B: read in data ------------------------------------------------------
#
TheData_B <- matrix ( nrow = 11, ncol = 2 )

TheData_B [  1, ] <- c ( 10,  9.14 )
TheData_B [  2, ] <- c (  8,  8.14 )
TheData_B [  3, ] <- c ( 13,  8.74 )
TheData_B [  4, ] <- c (  9,  8.77 )
TheData_B [  5, ] <- c ( 11,  9.26 )
TheData_B [  6, ] <- c ( 14,  8.10 )
TheData_B [  7, ] <- c (  6,  6.13 )
TheData_B [  8, ] <- c (  4,  3.10 )
TheData_B [  9, ] <- c ( 12,  9.13 )
TheData_B [ 10, ] <- c (  7,  7.26 )
TheData_B [ 11, ] <- c (  5,  4.74 )

TheData_B

##       [,1] [,2]
##  [1,]   10 9.14
##  [2,]    8 8.14
##  [3,]   13 8.74
##  [4,]    9 8.77
##  [5,]   11 9.26
##  [6,]   14 8.10
##  [7,]    6 6.13
##  [8,]    4 3.10
##  [9,]   12 9.13
## [10,]    7 7.26
## [11,]    5 4.74


lm.Anscombe_B_linear <- lm ( TheData_B [ ,2 ] ~ TheData_B [ ,1 ] )
summary ( lm.Anscombe_B_linear )

## 
## Call:
## lm(formula = TheData_B[, 2] ~ TheData_B[, 1])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.901 -0.761  0.129  0.949  1.269 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       3.001      1.125    2.67   0.0258 * 
## TheData_B[, 1]    0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.24 on 9 degrees of freedom
## Multiple R-squared:  0.666,  Adjusted R-squared:  0.629 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00218



# -- Scatterplots and first-order regression -----------------------------------
#
par ( mfrow = c ( 2, 2 ) )                                 #  4 graphics on 1 screen


# -- Drug A --------------------------------------------------------------------
#
plot ( TheData_A [ ,1 ], TheData_A [ ,2 ],
       pch = 19,
       col = "black",
       xlab = "Drug A concentration",
       xlim =  c ( 4, 14 ),
       ylab = "Response",
       ylim =  c ( 3, 11 ),
       main = "Drug A",
       frame.plot = FALSE,
       )
abline ( lm.Anscombe_A )


# -- Drug B --------------------------------------------------------------------
#
plot ( TheData_B [ ,1 ], TheData_B [ ,2 ],
       pch = 19,
       col = "black",
       xlab = "Drug B concentration",
       xlim =  c ( 4, 14 ),
       ylab = "Response",
       ylim =  c ( 3, 11 ),
       main = "Drug B",
       frame.plot = FALSE,
       )
abline ( lm.Anscombe_B_linear )

Anscombe_A_B_Plot <- recordPlot ( )                                                                # -- Figure 5: last line


# -- A Classic Example in Regression :: Figure 7 -------------------------------                   # -- Figure 7: first line
#
par ( mfrow = c ( 2, 2 ) )

plot of chunk unnamed-chunk-1


# -- Drug A: residual plots ----------------------------------------------------
#
plot ( fitted ( lm.Anscombe_A ), resid ( lm.Anscombe_A ),
       pch = 19,
       col = "black",
       xlab = "yhat: predicted response",
       ylab = "Residuals",
       ylim =  c ( -2, 2 ),
       main = "Residuals vs Predicted response",
       frame.plot = FALSE,
       )
abline ( h = 0 )

plot ( TheData_A [ ,1 ], resid ( lm.Anscombe_A ),
       pch = 19,
       col = "black",
       xlab = "x: drug A concentration",
       xlim =  c ( 4, 14 ),
       ylab = "Residuals",
       ylim =  c ( -2, 2 ),
       main = "Residuals vs Drug A",
       frame.plot = FALSE,
       )
abline ( h = 0 )

Drug_A_Residuals_Plot <- recordPlot ( )                                                            # -- Figure 7: last line


# -- A Classic Example in Regression :: Figure 8 -------------------------------                   # -- Figure 8: first line
#
par ( mfrow = c ( 3, 3 ) )                                 #  6 graphics on 1 screen

plot of chunk unnamed-chunk-1



# -- Drug B: 2nd-order model ---------------------------------------------------
#
lm.Anscombe_B_2nd_Order <- lm ( TheData_B [ ,2 ] ~ TheData_B [ ,1 ] + I ( TheData_B [ ,1 ]^2 ) )
#
anova   ( lm.Anscombe_B_2nd_Order )

## Analysis of Variance Table
## 
## Response: TheData_B[, 2]
##                     Df Sum Sq Mean Sq F value Pr(>F)    
## TheData_B[, 1]       1   27.5    27.5 9831250 <2e-16 ***
## I(TheData_B[, 1]^2)  1   13.8    13.8 4925016 <2e-16 ***
## Residuals            8    0.0     0.0                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary ( lm.Anscombe_B_2nd_Order )

## 
## Call:
## lm(formula = TheData_B[, 2] ~ TheData_B[, 1] + I(TheData_B[, 
##     1]^2))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.001329 -0.001189 -0.000629  0.000874  0.002378 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -6.00e+00   4.33e-03   -1385   <2e-16 ***
## TheData_B[, 1]       2.78e+00   1.04e-03    2674   <2e-16 ***
## I(TheData_B[, 1]^2) -1.27e-01   5.71e-05   -2219   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00167 on 8 degrees of freedom
## Multiple R-squared:     1,   Adjusted R-squared:     1 
## F-statistic: 7.38e+06 on 2 and 8 DF,  p-value: <2e-16


B_Second_Order_Fitted <- predict ( lm.Anscombe_B_2nd_Order )
B_Data_Predicted      <- cbind ( TheData_B, B_Second_Order_Fitted )
DataDimensions        <- dim ( B_Data_Predicted )
nCol                  <- DataDimensions[2]

TheData_B_Sorted <- B_Data_Predicted [ order ( B_Data_Predicted [ ,1 ] ), 1:nCol ]


# -- Scatterplots: responses and residuals -------------------------------------
#
plot ( TheData_A [ ,1 ], resid ( lm.Anscombe_A ),
       pch = 19,
       col = "black",
       xlab = "x: drug A concentration",
       xlim =  c ( 4, 14 ),
       ylab = "Residuals",
       ylim =  c ( -2, 2 ),
       main = "Residuals vs Drug",
       frame.plot = FALSE,
       )
abline ( h = 0, col = "gray50" )

plot ( TheData_B [ ,1 ], resid ( lm.Anscombe_B_linear ),
       frame.plot = FALSE,
       main = "B 1st-order: residuals vs X",
       xlab = "x: drug B concentration",
       xlim =  c ( 4, 14 ),
       ylab = "Residuals",
       ylim =  c ( -2, 2 ),
       pch = 19,
       col = "black",
       )
abline ( h = 0, col = "gray50" )

plot ( TheData_B [ ,1 ], resid ( lm.Anscombe_B_2nd_Order ),
       frame.plot =  FALSE,
       main = "B 2nd-order: residuals vs X",
       xlab = "x: drug B concentration",
       xlim =  c ( 4, 14 ),
       ylab = "Residuals",
       ylim =  c ( -2, 2 ),
       pch  = 19,
       col  = "black",
       )
abline ( h = 0, col = "gray50" )

plot ( TheData_A [ ,1 ], TheData_A [ ,2 ],
       pch = 19,
       col = "black",
       xlab = "Drug A concentration",
       xlim =  c ( 4, 14 ),
       ylab = "Response",
       ylim =  c ( 3, 11 ),
       main = "Drug A",
       frame.plot = FALSE,
       )
abline ( lm.Anscombe_A, col = "gray50" )

plot ( TheData_B [ ,1 ], TheData_B [ ,2 ],
       frame.plot = FALSE,
       main = "Drug B: 1st-order model",
       xlab = "Drug B concentration",
       xlim =  c ( 4, 14 ),
       ylab = "Response",
       ylim =  c ( 3, 11 ),
       pch = 19,
       col = "black",
       )
abline ( lm.Anscombe_B_linear, col = "gray50" )

plot ( TheData_B [ ,1 ], TheData_B [ ,2 ],
       frame.plot =  FALSE,
       main = "Drug B: 2nd-order model",
       xlab = "Drug concentration",
       xlim =  c ( 4, 14 ),
       ylab = "Response",
       ylim =  c ( 3, 11 ),
       pch  = 19,
       col  = "black",
       )
matlines ( TheData_B_Sorted [ ,1 ], TheData_B_Sorted [ ,3 ],
           type = "l", col = "gray50"
           )

Drug_A_B_Residuals_Plot <- recordPlot ( )                                                          # -- Figure 8: last line


# -- Practical Considerations :: Figure 9 --------------------------------------                   # -- Figure 9: first line
#
par ( mfrow = c ( 2, 2 ) )

plot of chunk unnamed-chunk-1



# -- Generate data for X without and with error --------------------------------
#

# -- Get variance of x values --------------------------------------------------
#
X_Error_Data  <- matrix ( nrow = nObs, ncol = 3 )

for ( k in 1:nObs )
    { X_Error_Data [ k,1 ] <- k
      }
XErrorSD <- sqrt ( var ( X_Error_Data [ ,1 ] ) )


# -- Use x variance to define variance of error in X ---------------------------
#
for ( k in 1:nObs )
    { Y_Random_Error <- round ( rnorm ( 1, mean = PopMean, sd = PopSD ), 3 )
      X_Error_Data [ k,2 ] <- b0 + ( b1 * X_Error_Data [ k,1 ] ) + Y_Random_Error

      X_Random_Error <- round ( rnorm ( 1, mean = PopMean, sd = XErrorSD ), 3 )
      X_Error_Data [ k,3 ] <- k + X_Random_Error
      }
X_Error_Data

##       [,1]   [,2]   [,3]
##  [1,]    1 -0.140  3.468
##  [2,]    2  2.357  1.013
##  [3,]    3  1.906 11.921
##  [4,]    4  6.236  0.435
##  [5,]    5  5.266  3.589
##  [6,]    6  4.863  0.753
##  [7,]    7  6.328 12.779
##  [8,]    8  6.883  4.496
##  [9,]    9  8.773 12.192
## [10,]   10  9.390 13.167



# -- Define minima and maxima for axes -----------------------------------------
#
xPlotMin <- min     (  0, floor   ( min ( X_Error_Data [ ,3 ] ) ) )
xPlotMax <- max     ( 10, ceiling ( max ( X_Error_Data [ ,3 ] ) ) )

yPlotMin <- floor   ( min ( X_Error_Data [ ,2 ] ) )
yPlotMax <- ceiling ( max ( X_Error_Data [ ,2 ] ) )

Axis_Min <- min     ( xPlotMin, yPlotMin )
Axis_Max <- max     ( xPlotMax, yPlotMax )


# -- Regression: no error in X -------------------------------------------------
#
lm.First_Order <- lm ( X_Error_Data [ ,2 ] ~ X_Error_Data [ ,1 ] )
summary ( lm.First_Order )

## 
## Call:
## lm(formula = X_Error_Data[, 2] ~ X_Error_Data[, 1])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.087 -0.760 -0.153  0.423  2.463 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.00487    0.75795    0.01        1    
## X_Error_Data[, 1]  0.94206    0.12215    7.71  5.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.11 on 8 degrees of freedom
## Multiple R-squared:  0.881,  Adjusted R-squared:  0.867 
## F-statistic: 59.5 on 1 and 8 DF,  p-value: 5.68e-05

#
b1_hat <- round ( lm.First_Order$coefficients [ 2 ], 3 )


# -- Regression: error in X ----------------------------------------------------
#
lm.First_Order_X_Error <- lm ( X_Error_Data [ ,2 ] ~ X_Error_Data [ ,3 ] )
summary ( lm.First_Order_X_Error )

## 
## Call:
## lm(formula = X_Error_Data[, 2] ~ X_Error_Data[, 3])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.657 -1.279  0.845  2.221  2.645 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           3.72       1.48    2.52    0.036 *
## X_Error_Data[, 3]     0.23       0.18    1.28    0.237  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.94 on 8 degrees of freedom
## Multiple R-squared:  0.17,   Adjusted R-squared:  0.0659 
## F-statistic: 1.63 on 1 and 8 DF,  p-value: 0.237

#
b1_hat_x_error <- round ( lm.First_Order_X_Error$coefficients [ 2 ], 3 )


# -- Plot observations and regression lines ------------------------------------
#
plot ( X_Error_Data [ ,1 ], X_Error_Data [ ,2 ],           #  no error in x
       frame.plot =  FALSE,
       main       =  paste ( "Without error in x: b1 = ", b1_hat ),
       xlab       = "Predictor variable x",
       ylab       = "Response variable y",
       xlim       =  c ( Axis_Min, Axis_Max ),
       ylim       =  c ( Axis_Min, Axis_Max ),
       pch        = 19,
       col        = "gray50"
       )
abline ( lm.First_Order, col = "gray50" )

plot ( X_Error_Data [ ,3 ], X_Error_Data [ ,2 ],           #  error in x
       frame.plot =  FALSE,
       main       =  paste ( "With error in x: b1 = ", b1_hat_x_error ),
       xlab       = "Predictor variable x",
       ylab       = "Response variable y",
       xlim       =  c ( Axis_Min, Axis_Max ),
       ylim       =  c ( Axis_Min, Axis_Max ),
       pch        = 19
       )
abline ( lm.First_Order_X_Error, col = "black" )

X_Error_Regression_Plot <- recordPlot ( )                                                          # -- Figure 9: last line


# -- Replay each data graphic --------------------------------------------------
#
replayPlot ( PolynomialModelsPlot          )               #  Figure 1

plot of chunk unnamed-chunk-1

replayPlot ( First_Order_Observations_Plot )               #  Figure 3

plot of chunk unnamed-chunk-1

replayPlot ( Anscombe_A_B_Plot             )               #  Figure 5
replayPlot ( Drug_A_Residuals_Plot         )               #  Figure 7
replayPlot ( Drug_A_B_Residuals_Plot       )               #  Figure 8
replayPlot ( X_Error_Regression_Plot       )               #  Figure 9

plot of chunk unnamed-chunk-1