Douglas Curran-Everett
Explorations in statistics: correlation
Advan Physiol Educ 2010 34:186-191;
doi:10.1152/advan.00068.2010
# -- Advances_Statistics_Code_Corr.R: R code for Explorations in Statistics ----
#
library ( MASS ) # load the MASS package
par ( mfrow = c ( 3, 3 ) ) # 9 graphics on 1 screen
par ( las = 1 ) # make axis numbers horizontal
PopMean <- 0 # mean of X and Y
PopSD <- 1 # SD of X and Y
nCorr <- 9 # number of population correlations
nStatistics <- 10 # number of statistics for each population correlation
nObs <- 100 # number of observations in each sample
xMin <- PopMean - ( 4 * PopSD ) # minima and maxima for X and Y
yMin <- xMin
#
xMax <- PopMean + ( 4 * PopSD )
yMax <- xMax
SampleStats <- matrix ( nrow = nCorr, ncol = nStatistics ) # matrix for sample statistics
# -- The Simulation: Observations and Sample Statistics :: Figure 4 ------------
#
for ( i in 1:nCorr ) # -- Figure 4: first line
{ Pop_Corr <- - ( i / 10 ) # population correlation coefficient
SampleStats [ i, 1 ] <- Pop_Corr
xy_Cov <- Pop_Corr * PopSD * PopSD
Cov_Matrix <- matrix ( c ( PopSD, xy_Cov, xy_Cov, PopSD ), 2, 2 )
R <- round ( mvrnorm ( n = nObs, rep ( 0, 2 ), Cov_Matrix ), 3 ) # sample from bivariate normal distribution
x <- R [ , 1 ]
y <- R [ , 2 ]
SampleStats [ i, 2 ] <- round ( mean ( x ), 4 ) # average x
SampleStats [ i, 3 ] <- round ( sd ( x ), 4 ) # SD x
SampleStats [ i, 4 ] <- round ( mean ( y ), 4 ) # average y
SampleStats [ i, 5 ] <- round ( sd ( y ), 4 ) # SD y
correlation <- cor.test ( x, y ) # correlation: x and y
r <- round ( correlation$estimate, 2 ) # sample correlation
P <- round ( correlation$p.value, 4 ) # P value for sample r
SampleStats [ i, 6 ] <- round ( correlation$estimate, 4 ) # sample correlation
SampleStats [ i, 7 ] <- round ( correlation$p.value, 4 ) # P value for sample r
xy_Regression <- lm ( y ~ x ) # estimate fitted regression equation
#
coefficients <- coef ( xy_Regression )
model_statistics <- summary ( xy_Regression )
SampleStats [ i, 8 ] <- round ( coefficients [ 1 ], 4 ) # intercept
SampleStats [ i, 9 ] <- round ( coefficients [ 2 ], 4 ) # slope
SampleStats [ i,10 ] <- round ( model_statistics$r.squared, 4 ) # R^2: R squared
plot ( x, y, # scatterplot
main = paste ( "r =", r, "... P =", P ),
xlab = ' ',
xlim = c ( xMin, xMax ),
ylab = ' ',
ylim = c ( yMin, yMax ),
pch = 19,
axes = FALSE,
frame.plot = FALSE
)
abline ( h = 0, v = 0, col = 'gray' ) # add axes at x = 0 and y = 0
text ( xMin + ( 0.25 * PopSD ), yMax - ( 0.075 * PopSD ), pos = 4, Pop_Corr )
}
CorrelationPlots <- recordPlot ( ) # -- Figure 4: last line
# rho Ave_x SD_x Ave_y SD_y r P Intrcpt Slope R_Sqrd
#
SampleStats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] -0.1 0.1572 1.0769 -0.0576 0.9925 -0.1199 0.2348 -0.0402 -0.1105
## [2,] -0.2 0.0007 0.9976 -0.1251 0.9725 -0.0535 0.5967 -0.1250 -0.0522
## [3,] -0.3 -0.0014 0.9226 0.0959 1.0148 -0.3840 0.0001 0.0953 -0.4223
## [4,] -0.4 -0.0658 0.9142 -0.0421 1.0193 -0.3657 0.0002 -0.0689 -0.4077
## [5,] -0.5 0.0165 0.8891 0.0779 0.9465 -0.4654 0.0000 0.0860 -0.4954
## [6,] -0.6 0.1700 1.0929 -0.0488 0.9722 -0.5820 0.0000 0.0393 -0.5177
## [7,] -0.7 -0.0919 0.9500 0.0472 0.9853 -0.6611 0.0000 -0.0158 -0.6856
## [8,] -0.8 0.1301 0.9489 -0.0963 1.0825 -0.8262 0.0000 0.0263 -0.9425
## [9,] -0.9 0.0084 1.0597 0.0311 1.0382 -0.9240 0.0000 0.0387 -0.9053
## [,10]
## [1,] 0.0144
## [2,] 0.0029
## [3,] 0.1474
## [4,] 0.1337
## [5,] 0.2166
## [6,] 0.3388
## [7,] 0.4370
## [8,] 0.6826
## [9,] 0.8537
# -- Correlation :: Figure 6 ---------------------------------------------------
#
par ( mfrow = c ( 2, 2 ) ) # -- Figure 6: first line
x_P25 <- quantile ( x, probs = 0.25 ) # 25th percentile of x
x_P75 <- quantile ( x, probs = 0.75 ) # 75th percentile of x
Data_Subset <- subset ( R, x_P25 <= R [ , 1 ] & R [ , 1 ] <= x_P75 ) # select middle 50% x values
#
sd ( Data_Subset [ , 1 ] ) # SD of restricted x values
## [1] 0.4237
sd ( Data_Subset [ , 2 ] ) # SD of restricted y values
## [1] 0.5692
sub_Regression <- lm ( Data_Subset [ , 2 ] ~ Data_Subset [ , 1 ] ) # estimate fitted regression equation
summary ( sub_Regression )
##
## Call:
## lm(formula = Data_Subset[, 2] ~ Data_Subset[, 1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8599 -0.2764 0.0697 0.2900 0.6760
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0606 0.0563 1.08 0.29
## Data_Subset[, 1] -0.9828 0.1322 -7.44 1.6e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.392 on 48 degrees of freedom
## Multiple R-squared: 0.535, Adjusted R-squared: 0.526
## F-statistic: 55.3 on 1 and 48 DF, p-value: 1.59e-09
correlation <- cor.test ( Data_Subset [ , 1 ], Data_Subset [ , 2 ] )
sub_r <- round ( correlation$estimate, 2 ) # sample correlation
sub_P <- round ( correlation$p.value, 4 ) # P value for sample r
plot ( x, y, # scatterplot of original observations
main = paste ( "r =", r, "... P =", P ),
xlab = ' ',
xlim = c ( xMin, xMax ),
ylab = ' ',
ylim = c ( yMin, yMax ),
pch = 19,
axes = FALSE,
frame.plot = FALSE
)
abline ( h = 0, v = 0, col = 'gray' ) # add axes at x = 0 and y = 0
abline ( v = x_P25, col = 'gray' ) # draw line at 25th percentile of x
abline ( v = x_P75, col = 'gray' ) # draw line at 75th percentile of x
plot ( Data_Subset [ , 1 ], Data_Subset [ , 2 ], # scatterplot of subset of observations
main = paste ( "r =", sub_r, "... P =", sub_P ),
xlab = ' ',
xlim = c ( xMin, xMax ),
ylab = ' ',
ylim = c ( yMin, yMax ),
pch = 19,
axes = FALSE,
frame.plot = FALSE
)
abline ( h = 0, v = 0, col = 'gray' ) # add axes at x = 0 and y = 0
abline ( v = x_P25, col = 'gray' ) # draw line at 25th percentile of x
abline ( v = x_P75, col = 'gray' ) # draw line at 75th percentile of x
#
Decreased_Variability_Plot <- recordPlot ( ) # -- Figure 6: last line
# -- Interpretation :: Figure 9 ------------------------------------------------
#
par ( mfrow = c ( 2, 2 ) ) # -- Figure 9: first line
TheData <- matrix ( nrow = 11, ncol = 2 )
for ( i in 1:2 )
{ TheData [ , 1 ] <- c ( 71, 68, 66, 67, 70, 71, 70, 73, 72, 65, 66 )
TheData [ , 2 ] <- c ( 69, 64, 65, 63, 65, 62, 65, 64, 66, 59, 62 )
if ( i == 2 ) TheData [ 10, 2 ] <- 69 # change a single observation
correlation_2 <- cor.test ( TheData [ , 1 ], TheData [ , 2 ] )
#
r_wz <- round ( correlation_2$estimate, 2 )
P_wz <- round ( correlation_2$p.value, 2 )
wz_Regression <- lm ( TheData [ , 2 ] ~ TheData [ , 1 ] ) # estimate fitted regression equation
summary ( wz_Regression )
plot ( TheData [ , 1 ], TheData [ , 2 ], # scatterplot
main = paste ( "r =", r_wz, "( P =",P_wz,")" ),
xlab = ' ',
xlim = c ( 65, 75 ),
ylab = ' ',
ylim = c ( 59, 69 ),
pch = 19,
frame.plot = FALSE
)
abline ( wz_Regression ) # add fitted regression line
}
#
SinglePointPlot <- recordPlot ( ) # -- Figure 9: last line
# -- Interpretation :: Figure 10 -----------------------------------------------
#
par ( mfrow = c ( 2, 2 ) ) # -- Figure 10: first line
nObs_0 <- 1000 # number of observations
Ratio <- matrix ( nrow = nObs_0, ncol = 3 ) # matrix for data
# -- Generate data for initial value, change, and final value ------------------
#
for ( i in 1:nObs_0 )
{ Initial_Value <- rnorm ( 1, mean = PopMean, sd = PopSD ) # initial value
Ratio [ i, 1 ] <- Initial_Value
Change <- rnorm ( 1, mean = PopMean, sd = PopSD ) # change
Ratio [ i, 2 ] <- Change
Final_Value <- Initial_Value + Change # final value
Ratio [ i, 3 ] <- Final_Value
}
# -- Scatterplots and correlations ---------------------------------------------
#
plot ( Ratio [ , 1 ], Ratio [ , 2 ], # scatterplot: change vs initial value
main = paste ( "Change vs initial value" ),
xlab = 'Initial value',
ylab = 'Change',
pch = 19,
frame.plot = FALSE
)
cor.test ( Ratio [ , 1 ], Ratio [ , 2 ] ) # correlation: change and initial value
##
## Pearson's product-moment correlation
##
## data: Ratio[, 1] and Ratio[, 2]
## t = 0.6441, df = 998, p-value = 0.5196
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.04166 0.08227
## sample estimates:
## cor
## 0.02039
plot ( Ratio [ , 1 ], Ratio [ , 3 ], # scatterplot: final vs initial value
main = paste ( "Final value vs initial value" ),
xlab = 'Initial value',
ylab = 'Final value',
pch = 19,
frame.plot = FALSE
)
cor.test ( Ratio [ , 3 ], Ratio [ , 1 ] ) # correlation: final and initial value
##
## Pearson's product-moment correlation
##
## data: Ratio[, 3] and Ratio[, 1]
## t = 32.91, df = 998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6903 0.7499
## sample estimates:
## cor
## 0.7215
#
SpuriousCorrelationPlot <- recordPlot ( ) # -- Figure 10: last line
# -- Replay each data graphic --------------------------------------------------
#
replayPlot ( CorrelationPlots )
replayPlot ( SinglePointPlot )
replayPlot ( Decreased_Variability_Plot )
replayPlot ( SpuriousCorrelationPlot )