Douglas Curran-Everett
Explorations in statistics: regression
Advan Physiol Educ 2011 35:347-352;
doi:10.1152/advan.00051.2011
#
# -- This code analyzes Anscombe quartet ---------------------------------------
#
par ( mfrow = c ( 1, 1 ) ) # 1 graphic on 1 screen
# -- Define population parameters, sample numbers, and plotting ranges ---------
#
PopMean <- 0 # population mean
PopSD <- 1 # population standard deviation
nObs <- 10 # number of observations in each sample
b0 <- 0 # coefficients that define true relationship between Y and X
b1 <- 1
b2 <- -0.075
PlotFactor <- 100 # scaling factor for horizontal axes
xMin <- 0 * PlotFactor # x minimum for data loop
xMax <- 10 * PlotFactor # x maximum for data loop
#
xPlotMin <- 0 # x minimum for plotting
xPlotMax <- 10 # x maximum for plotting
yPlotMin <- 0 # y minimum for plotting
yPlotMax <- 10 # y maximum for plotting
# -- Regression: an Overview :: Figure 1 --------------------------------------- # -- Figure 1: first line
#
# -- Generate 2nd-order polynomial ---------------------------------------------
#
PlotPoints <- 1000
Poly_Data <- matrix ( nrow = PlotPoints, ncol = 2 )
for ( j in xMin:xMax )
{ x <- j / PlotFactor
Poly_Data [ j,1 ] <- x
y <- b0 + ( b1 * x ) + ( b2 * x**2 )
Poly_Data [ j,2 ] <- y
}
# -- Plot polynomials ----------------------------------------------------------
#
plot ( Poly_Data [ ,1 ], Poly_Data [ ,2 ],
frame.plot = FALSE,
main = "Polynomial models",
xlab = "Predictor variable x",
ylab = "Response variable y",
xlim = c ( xPlotMin, xPlotMax ),
ylim = c ( yPlotMin, yPlotMax ),
type = "l"
)
abline ( a = b0, b = b1, col = "black" )
text ( 6.50, 8.60, pos = 4, 'Equation 1' )
text ( 6.50, 3.75, pos = 4, 'Equation 2' )
PolynomialModelsPlot <- recordPlot ( ) # -- Figure 1: last line
# -- Regression: an Overview :: Figure 3 --------------------------------------- # -- Figure 3: first line
#
First_Order_Data <- matrix ( nrow = nObs, ncol = 2 )
for ( k in 1:nObs )
{ First_Order_Data [ k,1 ] <- k
Random_Error <- round ( rnorm ( 1, mean = PopMean, sd = PopSD ), 3 )
First_Order_Data [ k,2 ] <- b0 + ( b1 * First_Order_Data [ k,1 ] ) + Random_Error
}
yPlotMin <- floor ( min ( First_Order_Data [ ,2 ] ) ) # minimum y for plotting
yPlotMax <- ceiling ( max ( First_Order_Data [ ,2 ] ) ) # maximum y for plotting
lm.First_Order <- lm ( First_Order_Data [ ,2 ] ~ First_Order_Data [ ,1 ] )
anova ( lm.First_Order )
## Analysis of Variance Table
##
## Response: First_Order_Data[, 2]
## Df Sum Sq Mean Sq F value Pr(>F)
## First_Order_Data[, 1] 1 88.8 88.8 54 8e-05 ***
## Residuals 8 13.1 1.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary ( lm.First_Order )
##
## Call:
## lm(formula = First_Order_Data[, 2] ~ First_Order_Data[, 1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5111 -0.5466 0.0967 0.9797 1.3766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.118 0.876 -0.14 0.9
## First_Order_Data[, 1] 1.037 0.141 7.35 8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.28 on 8 degrees of freedom
## Multiple R-squared: 0.871, Adjusted R-squared: 0.855
## F-statistic: 54 on 1 and 8 DF, p-value: 8e-05
plot ( First_Order_Data [ ,1 ], First_Order_Data [ ,2 ], # plot true and estimated relationships
frame.plot = FALSE,
main = paste ( "Sample estimates: b0 =", b0_hat, "and b1 = ", b1_hat ),
xlab = "Predictor variable x",
ylab = "Response variable y",
xlim = c ( xPlotMin, xPlotMax ),
ylim = c ( yPlotMin, yPlotMax ),
pch = 19,
)
## Error: object 'b0_hat' not found
abline ( a = b0, b = b1, col = "black" ) # -- true relationship
abline ( lm.First_Order, col = "gray50" ) # -- estimated relationship
First_Order_Observations_Plot <- recordPlot ( ) # -- Figure 3: last line
# -- A Classic Example in Regression :: Figure 5 ------------------------------- # -- Figure 5: first line
#
# -- Drug A: read in data ------------------------------------------------------
#
TheData_A <- matrix ( nrow = 11, ncol = 2 )
TheData_A [ 1, ] <- c ( 10, 8.04 )
TheData_A [ 2, ] <- c ( 8, 6.95 )
TheData_A [ 3, ] <- c ( 13, 7.58 )
TheData_A [ 4, ] <- c ( 9, 8.81 )
TheData_A [ 5, ] <- c ( 11, 8.33 )
TheData_A [ 6, ] <- c ( 14, 9.96 )
TheData_A [ 7, ] <- c ( 6, 7.24 )
TheData_A [ 8, ] <- c ( 4, 4.26 )
TheData_A [ 9, ] <- c ( 12, 10.84 )
TheData_A [ 10, ] <- c ( 7, 4.82 )
TheData_A [ 11, ] <- c ( 5, 5.68 )
TheData_A
## [,1] [,2]
## [1,] 10 8.04
## [2,] 8 6.95
## [3,] 13 7.58
## [4,] 9 8.81
## [5,] 11 8.33
## [6,] 14 9.96
## [7,] 6 7.24
## [8,] 4 4.26
## [9,] 12 10.84
## [10,] 7 4.82
## [11,] 5 5.68
lm.Anscombe_A <- lm ( TheData_A [ ,2 ] ~ TheData_A [ ,1 ] )
summary ( lm.Anscombe_A )
##
## Call:
## lm(formula = TheData_A[, 2] ~ TheData_A[, 1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9213 -0.4558 -0.0414 0.7094 1.8388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.000 1.125 2.67 0.0257 *
## TheData_A[, 1] 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.24 on 9 degrees of freedom
## Multiple R-squared: 0.667, Adjusted R-squared: 0.629
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00217
# -- Drug B: read in data ------------------------------------------------------
#
TheData_B <- matrix ( nrow = 11, ncol = 2 )
TheData_B [ 1, ] <- c ( 10, 9.14 )
TheData_B [ 2, ] <- c ( 8, 8.14 )
TheData_B [ 3, ] <- c ( 13, 8.74 )
TheData_B [ 4, ] <- c ( 9, 8.77 )
TheData_B [ 5, ] <- c ( 11, 9.26 )
TheData_B [ 6, ] <- c ( 14, 8.10 )
TheData_B [ 7, ] <- c ( 6, 6.13 )
TheData_B [ 8, ] <- c ( 4, 3.10 )
TheData_B [ 9, ] <- c ( 12, 9.13 )
TheData_B [ 10, ] <- c ( 7, 7.26 )
TheData_B [ 11, ] <- c ( 5, 4.74 )
TheData_B
## [,1] [,2]
## [1,] 10 9.14
## [2,] 8 8.14
## [3,] 13 8.74
## [4,] 9 8.77
## [5,] 11 9.26
## [6,] 14 8.10
## [7,] 6 6.13
## [8,] 4 3.10
## [9,] 12 9.13
## [10,] 7 7.26
## [11,] 5 4.74
lm.Anscombe_B_linear <- lm ( TheData_B [ ,2 ] ~ TheData_B [ ,1 ] )
summary ( lm.Anscombe_B_linear )
##
## Call:
## lm(formula = TheData_B[, 2] ~ TheData_B[, 1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.901 -0.761 0.129 0.949 1.269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001 1.125 2.67 0.0258 *
## TheData_B[, 1] 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.24 on 9 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.629
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00218
# -- Scatterplots and first-order regression -----------------------------------
#
par ( mfrow = c ( 2, 2 ) ) # 4 graphics on 1 screen
# -- Drug A --------------------------------------------------------------------
#
plot ( TheData_A [ ,1 ], TheData_A [ ,2 ],
pch = 19,
col = "black",
xlab = "Drug A concentration",
xlim = c ( 4, 14 ),
ylab = "Response",
ylim = c ( 3, 11 ),
main = "Drug A",
frame.plot = FALSE,
)
abline ( lm.Anscombe_A )
# -- Drug B --------------------------------------------------------------------
#
plot ( TheData_B [ ,1 ], TheData_B [ ,2 ],
pch = 19,
col = "black",
xlab = "Drug B concentration",
xlim = c ( 4, 14 ),
ylab = "Response",
ylim = c ( 3, 11 ),
main = "Drug B",
frame.plot = FALSE,
)
abline ( lm.Anscombe_B_linear )
Anscombe_A_B_Plot <- recordPlot ( ) # -- Figure 5: last line
# -- A Classic Example in Regression :: Figure 7 ------------------------------- # -- Figure 7: first line
#
par ( mfrow = c ( 2, 2 ) )
# -- Drug A: residual plots ----------------------------------------------------
#
plot ( fitted ( lm.Anscombe_A ), resid ( lm.Anscombe_A ),
pch = 19,
col = "black",
xlab = "yhat: predicted response",
ylab = "Residuals",
ylim = c ( -2, 2 ),
main = "Residuals vs Predicted response",
frame.plot = FALSE,
)
abline ( h = 0 )
plot ( TheData_A [ ,1 ], resid ( lm.Anscombe_A ),
pch = 19,
col = "black",
xlab = "x: drug A concentration",
xlim = c ( 4, 14 ),
ylab = "Residuals",
ylim = c ( -2, 2 ),
main = "Residuals vs Drug A",
frame.plot = FALSE,
)
abline ( h = 0 )
Drug_A_Residuals_Plot <- recordPlot ( ) # -- Figure 7: last line
# -- A Classic Example in Regression :: Figure 8 ------------------------------- # -- Figure 8: first line
#
par ( mfrow = c ( 3, 3 ) ) # 6 graphics on 1 screen
# -- Drug B: 2nd-order model ---------------------------------------------------
#
lm.Anscombe_B_2nd_Order <- lm ( TheData_B [ ,2 ] ~ TheData_B [ ,1 ] + I ( TheData_B [ ,1 ]^2 ) )
#
anova ( lm.Anscombe_B_2nd_Order )
## Analysis of Variance Table
##
## Response: TheData_B[, 2]
## Df Sum Sq Mean Sq F value Pr(>F)
## TheData_B[, 1] 1 27.5 27.5 9831250 <2e-16 ***
## I(TheData_B[, 1]^2) 1 13.8 13.8 4925016 <2e-16 ***
## Residuals 8 0.0 0.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary ( lm.Anscombe_B_2nd_Order )
##
## Call:
## lm(formula = TheData_B[, 2] ~ TheData_B[, 1] + I(TheData_B[,
## 1]^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.001329 -0.001189 -0.000629 0.000874 0.002378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.00e+00 4.33e-03 -1385 <2e-16 ***
## TheData_B[, 1] 2.78e+00 1.04e-03 2674 <2e-16 ***
## I(TheData_B[, 1]^2) -1.27e-01 5.71e-05 -2219 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00167 on 8 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 7.38e+06 on 2 and 8 DF, p-value: <2e-16
B_Second_Order_Fitted <- predict ( lm.Anscombe_B_2nd_Order )
B_Data_Predicted <- cbind ( TheData_B, B_Second_Order_Fitted )
DataDimensions <- dim ( B_Data_Predicted )
nCol <- DataDimensions[2]
TheData_B_Sorted <- B_Data_Predicted [ order ( B_Data_Predicted [ ,1 ] ), 1:nCol ]
# -- Scatterplots: responses and residuals -------------------------------------
#
plot ( TheData_A [ ,1 ], resid ( lm.Anscombe_A ),
pch = 19,
col = "black",
xlab = "x: drug A concentration",
xlim = c ( 4, 14 ),
ylab = "Residuals",
ylim = c ( -2, 2 ),
main = "Residuals vs Drug",
frame.plot = FALSE,
)
abline ( h = 0, col = "gray50" )
plot ( TheData_B [ ,1 ], resid ( lm.Anscombe_B_linear ),
frame.plot = FALSE,
main = "B 1st-order: residuals vs X",
xlab = "x: drug B concentration",
xlim = c ( 4, 14 ),
ylab = "Residuals",
ylim = c ( -2, 2 ),
pch = 19,
col = "black",
)
abline ( h = 0, col = "gray50" )
plot ( TheData_B [ ,1 ], resid ( lm.Anscombe_B_2nd_Order ),
frame.plot = FALSE,
main = "B 2nd-order: residuals vs X",
xlab = "x: drug B concentration",
xlim = c ( 4, 14 ),
ylab = "Residuals",
ylim = c ( -2, 2 ),
pch = 19,
col = "black",
)
abline ( h = 0, col = "gray50" )
plot ( TheData_A [ ,1 ], TheData_A [ ,2 ],
pch = 19,
col = "black",
xlab = "Drug A concentration",
xlim = c ( 4, 14 ),
ylab = "Response",
ylim = c ( 3, 11 ),
main = "Drug A",
frame.plot = FALSE,
)
abline ( lm.Anscombe_A, col = "gray50" )
plot ( TheData_B [ ,1 ], TheData_B [ ,2 ],
frame.plot = FALSE,
main = "Drug B: 1st-order model",
xlab = "Drug B concentration",
xlim = c ( 4, 14 ),
ylab = "Response",
ylim = c ( 3, 11 ),
pch = 19,
col = "black",
)
abline ( lm.Anscombe_B_linear, col = "gray50" )
plot ( TheData_B [ ,1 ], TheData_B [ ,2 ],
frame.plot = FALSE,
main = "Drug B: 2nd-order model",
xlab = "Drug concentration",
xlim = c ( 4, 14 ),
ylab = "Response",
ylim = c ( 3, 11 ),
pch = 19,
col = "black",
)
matlines ( TheData_B_Sorted [ ,1 ], TheData_B_Sorted [ ,3 ],
type = "l", col = "gray50"
)
Drug_A_B_Residuals_Plot <- recordPlot ( ) # -- Figure 8: last line
# -- Practical Considerations :: Figure 9 -------------------------------------- # -- Figure 9: first line
#
par ( mfrow = c ( 2, 2 ) )
# -- Generate data for X without and with error --------------------------------
#
# -- Get variance of x values --------------------------------------------------
#
X_Error_Data <- matrix ( nrow = nObs, ncol = 3 )
for ( k in 1:nObs )
{ X_Error_Data [ k,1 ] <- k
}
XErrorSD <- sqrt ( var ( X_Error_Data [ ,1 ] ) )
# -- Use x variance to define variance of error in X ---------------------------
#
for ( k in 1:nObs )
{ Y_Random_Error <- round ( rnorm ( 1, mean = PopMean, sd = PopSD ), 3 )
X_Error_Data [ k,2 ] <- b0 + ( b1 * X_Error_Data [ k,1 ] ) + Y_Random_Error
X_Random_Error <- round ( rnorm ( 1, mean = PopMean, sd = XErrorSD ), 3 )
X_Error_Data [ k,3 ] <- k + X_Random_Error
}
X_Error_Data
## [,1] [,2] [,3]
## [1,] 1 -0.140 3.468
## [2,] 2 2.357 1.013
## [3,] 3 1.906 11.921
## [4,] 4 6.236 0.435
## [5,] 5 5.266 3.589
## [6,] 6 4.863 0.753
## [7,] 7 6.328 12.779
## [8,] 8 6.883 4.496
## [9,] 9 8.773 12.192
## [10,] 10 9.390 13.167
# -- Define minima and maxima for axes -----------------------------------------
#
xPlotMin <- min ( 0, floor ( min ( X_Error_Data [ ,3 ] ) ) )
xPlotMax <- max ( 10, ceiling ( max ( X_Error_Data [ ,3 ] ) ) )
yPlotMin <- floor ( min ( X_Error_Data [ ,2 ] ) )
yPlotMax <- ceiling ( max ( X_Error_Data [ ,2 ] ) )
Axis_Min <- min ( xPlotMin, yPlotMin )
Axis_Max <- max ( xPlotMax, yPlotMax )
# -- Regression: no error in X -------------------------------------------------
#
lm.First_Order <- lm ( X_Error_Data [ ,2 ] ~ X_Error_Data [ ,1 ] )
summary ( lm.First_Order )
##
## Call:
## lm(formula = X_Error_Data[, 2] ~ X_Error_Data[, 1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.087 -0.760 -0.153 0.423 2.463
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.00487 0.75795 0.01 1
## X_Error_Data[, 1] 0.94206 0.12215 7.71 5.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.11 on 8 degrees of freedom
## Multiple R-squared: 0.881, Adjusted R-squared: 0.867
## F-statistic: 59.5 on 1 and 8 DF, p-value: 5.68e-05
#
b1_hat <- round ( lm.First_Order$coefficients [ 2 ], 3 )
# -- Regression: error in X ----------------------------------------------------
#
lm.First_Order_X_Error <- lm ( X_Error_Data [ ,2 ] ~ X_Error_Data [ ,3 ] )
summary ( lm.First_Order_X_Error )
##
## Call:
## lm(formula = X_Error_Data[, 2] ~ X_Error_Data[, 3])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.657 -1.279 0.845 2.221 2.645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.72 1.48 2.52 0.036 *
## X_Error_Data[, 3] 0.23 0.18 1.28 0.237
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.94 on 8 degrees of freedom
## Multiple R-squared: 0.17, Adjusted R-squared: 0.0659
## F-statistic: 1.63 on 1 and 8 DF, p-value: 0.237
#
b1_hat_x_error <- round ( lm.First_Order_X_Error$coefficients [ 2 ], 3 )
# -- Plot observations and regression lines ------------------------------------
#
plot ( X_Error_Data [ ,1 ], X_Error_Data [ ,2 ], # no error in x
frame.plot = FALSE,
main = paste ( "Without error in x: b1 = ", b1_hat ),
xlab = "Predictor variable x",
ylab = "Response variable y",
xlim = c ( Axis_Min, Axis_Max ),
ylim = c ( Axis_Min, Axis_Max ),
pch = 19,
col = "gray50"
)
abline ( lm.First_Order, col = "gray50" )
plot ( X_Error_Data [ ,3 ], X_Error_Data [ ,2 ], # error in x
frame.plot = FALSE,
main = paste ( "With error in x: b1 = ", b1_hat_x_error ),
xlab = "Predictor variable x",
ylab = "Response variable y",
xlim = c ( Axis_Min, Axis_Max ),
ylim = c ( Axis_Min, Axis_Max ),
pch = 19
)
abline ( lm.First_Order_X_Error, col = "black" )
X_Error_Regression_Plot <- recordPlot ( ) # -- Figure 9: last line
# -- Replay each data graphic --------------------------------------------------
#
replayPlot ( PolynomialModelsPlot ) # Figure 1
replayPlot ( First_Order_Observations_Plot ) # Figure 3
replayPlot ( Anscombe_A_B_Plot ) # Figure 5
replayPlot ( Drug_A_Residuals_Plot ) # Figure 7
replayPlot ( Drug_A_B_Residuals_Plot ) # Figure 8
replayPlot ( X_Error_Regression_Plot ) # Figure 9