Explorations in statistics: correlation

Douglas Curran-Everett
Explorations in statistics: correlation Advan Physiol Educ 2010 34:186-191;
doi:10.1152/advan.00068.2010

# -- Advances_Statistics_Code_Corr.R: R code for Explorations in Statistics ----
#
library ( MASS               )                                             #  load the MASS package
par     ( mfrow = c ( 3, 3 ) )                                             #  9 graphics on 1 screen
par     ( las   = 1          )                                             #  make axis numbers horizontal

PopMean     <-   0                                                         #  mean of X and Y
PopSD       <-   1                                                         #  SD of X and Y

nCorr       <-   9                                                         #  number of population correlations
nStatistics <-  10                                                         #  number of statistics for each population correlation
nObs        <- 100                                                         #  number of observations in each sample

xMin        <- PopMean - ( 4 * PopSD )                                     #  minima and maxima for X and Y
yMin        <- xMin
#
xMax        <- PopMean + ( 4 * PopSD )
yMax        <- xMax

SampleStats <- matrix ( nrow = nCorr, ncol = nStatistics )                 #  matrix for sample statistics


# -- The Simulation: Observations and Sample Statistics :: Figure 4 ------------
#
for ( i in 1:nCorr )                                                                               # -- Figure 4: first line
    { Pop_Corr <-  - ( i / 10 )                                            #  population correlation coefficient

      SampleStats [ i, 1 ] <- Pop_Corr

      xy_Cov     <- Pop_Corr * PopSD * PopSD
      Cov_Matrix <- matrix ( c ( PopSD, xy_Cov, xy_Cov, PopSD ), 2, 2 )

      R <- round ( mvrnorm ( n = nObs, rep ( 0, 2 ), Cov_Matrix ), 3 )     # sample from bivariate normal distribution

      x  <- R [ , 1 ]
      y  <- R [ , 2 ]

      SampleStats [ i, 2 ] <- round ( mean ( x ), 4 )                      #  average x
      SampleStats [ i, 3 ] <- round ( sd   ( x ), 4 )                      #  SD x

      SampleStats [ i, 4 ] <- round ( mean ( y ), 4 )                      #  average y
      SampleStats [ i, 5 ] <- round ( sd   ( y ), 4 )                      #  SD y

      correlation <- cor.test ( x, y )                                     #  correlation: x and y

      r <- round ( correlation$estimate, 2 )                               #  sample correlation
      P <- round ( correlation$p.value,  4 )                               #  P value for sample r

      SampleStats [ i, 6 ] <- round ( correlation$estimate, 4 )            #  sample correlation
      SampleStats [ i, 7 ] <- round ( correlation$p.value,  4 )            #  P value for sample r

      xy_Regression    <- lm ( y ~ x )                                     #  estimate fitted regression equation
      #
      coefficients     <- coef    ( xy_Regression )
      model_statistics <- summary ( xy_Regression )

      SampleStats [ i, 8 ] <- round ( coefficients [ 1 ], 4 )              #  intercept
      SampleStats [ i, 9 ] <- round ( coefficients [ 2 ], 4 )              #  slope

      SampleStats [ i,10 ] <- round ( model_statistics$r.squared, 4 )      #  R^2: R squared

      plot ( x, y,                                                         #  scatterplot
             main       =  paste ( "r =", r, "... P =", P ),
             xlab       = ' ',
             xlim       =  c ( xMin, xMax ),
             ylab       = ' ',
             ylim       =  c ( yMin, yMax ),
             pch        = 19,
             axes       = FALSE,
             frame.plot = FALSE
             )
      abline ( h = 0, v = 0, col = 'gray' )                                #  add axes at x = 0 and y = 0
      text ( xMin + ( 0.25 * PopSD ), yMax - ( 0.075 * PopSD ), pos = 4, Pop_Corr )
      }

plot of chunk unnamed-chunk-1

CorrelationPlots <- recordPlot ( )                                                                 # -- Figure 4: last line


#    rho   Ave_x   SD_x   Ave_y   SD_y       r      P Intrcpt   Slope R_Sqrd
#
SampleStats
##       [,1]    [,2]   [,3]    [,4]   [,5]    [,6]   [,7]    [,8]    [,9]
##  [1,] -0.1  0.1572 1.0769 -0.0576 0.9925 -0.1199 0.2348 -0.0402 -0.1105
##  [2,] -0.2  0.0007 0.9976 -0.1251 0.9725 -0.0535 0.5967 -0.1250 -0.0522
##  [3,] -0.3 -0.0014 0.9226  0.0959 1.0148 -0.3840 0.0001  0.0953 -0.4223
##  [4,] -0.4 -0.0658 0.9142 -0.0421 1.0193 -0.3657 0.0002 -0.0689 -0.4077
##  [5,] -0.5  0.0165 0.8891  0.0779 0.9465 -0.4654 0.0000  0.0860 -0.4954
##  [6,] -0.6  0.1700 1.0929 -0.0488 0.9722 -0.5820 0.0000  0.0393 -0.5177
##  [7,] -0.7 -0.0919 0.9500  0.0472 0.9853 -0.6611 0.0000 -0.0158 -0.6856
##  [8,] -0.8  0.1301 0.9489 -0.0963 1.0825 -0.8262 0.0000  0.0263 -0.9425
##  [9,] -0.9  0.0084 1.0597  0.0311 1.0382 -0.9240 0.0000  0.0387 -0.9053
##        [,10]
##  [1,] 0.0144
##  [2,] 0.0029
##  [3,] 0.1474
##  [4,] 0.1337
##  [5,] 0.2166
##  [6,] 0.3388
##  [7,] 0.4370
##  [8,] 0.6826
##  [9,] 0.8537


# -- Correlation :: Figure 6 ---------------------------------------------------
#
par ( mfrow = c ( 2, 2 ) )                                                                         # -- Figure 6: first line

x_P25 <- quantile ( x, probs = 0.25 )                                      #  25th percentile of x
x_P75 <- quantile ( x, probs = 0.75 )                                      #  75th percentile of x

Data_Subset <- subset ( R, x_P25 <= R [ , 1 ] & R [ , 1 ] <= x_P75 )       #  select middle 50% x values
#
sd ( Data_Subset [ , 1 ] )                                                 #  SD of restricted x values
## [1] 0.4237
sd ( Data_Subset [ , 2 ] )                                                 #  SD of restricted y values
## [1] 0.5692

sub_Regression <- lm ( Data_Subset [ , 2 ] ~ Data_Subset [ , 1 ] )         #  estimate fitted regression equation
summary ( sub_Regression )
## 
## Call:
## lm(formula = Data_Subset[, 2] ~ Data_Subset[, 1])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8599 -0.2764  0.0697  0.2900  0.6760 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.0606     0.0563    1.08     0.29    
## Data_Subset[, 1]  -0.9828     0.1322   -7.44  1.6e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.392 on 48 degrees of freedom
## Multiple R-squared:  0.535,  Adjusted R-squared:  0.526 
## F-statistic: 55.3 on 1 and 48 DF,  p-value: 1.59e-09

correlation <- cor.test ( Data_Subset [ , 1 ], Data_Subset [ , 2 ] )

sub_r <- round ( correlation$estimate, 2 )                                 #  sample correlation
sub_P <- round ( correlation$p.value,  4 )                                 #  P value for sample r

plot ( x, y,                                                               #  scatterplot of original observations
       main       =  paste ( "r =", r, "... P =", P ),
       xlab       = ' ',
       xlim       =  c ( xMin, xMax ),
       ylab       = ' ',
       ylim       =  c ( yMin, yMax ),
       pch        = 19,
       axes       = FALSE,
       frame.plot = FALSE
       )
abline ( h = 0, v = 0, col = 'gray' )                                      #  add axes at x = 0 and y = 0
abline ( v = x_P25,    col = 'gray' )                                      #  draw line at 25th percentile of x
abline ( v = x_P75,    col = 'gray' )                                      #  draw line at 75th percentile of x

plot ( Data_Subset [ , 1 ], Data_Subset [ , 2 ],                           #  scatterplot of subset of observations
       main       =  paste ( "r =", sub_r, "... P =", sub_P ),
       xlab       = ' ',
       xlim       =  c ( xMin, xMax ),
       ylab       = ' ',
       ylim       =  c ( yMin, yMax ),
       pch        = 19,
       axes       = FALSE,
       frame.plot = FALSE
       )
abline ( h = 0, v = 0, col = 'gray' )                                      #  add axes at x = 0 and y = 0
abline ( v = x_P25,    col = 'gray' )                                      #  draw line at 25th percentile of x
abline ( v = x_P75,    col = 'gray' )                                      #  draw line at 75th percentile of x
#
Decreased_Variability_Plot <- recordPlot ( )                                                       # -- Figure 6: last line


# -- Interpretation :: Figure 9 ------------------------------------------------
#
par ( mfrow = c ( 2, 2 ) )                                                                         # -- Figure 9: first line

plot of chunk unnamed-chunk-1


TheData <- matrix ( nrow = 11, ncol = 2 )
for ( i in 1:2 )
    { TheData [ , 1 ] <- c ( 71, 68, 66, 67, 70, 71, 70, 73, 72, 65, 66 )
      TheData [ , 2 ] <- c ( 69, 64, 65, 63, 65, 62, 65, 64, 66, 59, 62 )

      if ( i == 2 ) TheData [ 10, 2 ] <- 69                                #  change a single observation

      correlation_2 <- cor.test ( TheData [ , 1 ], TheData [ , 2 ] )
      #
      r_wz <- round ( correlation_2$estimate, 2 )
      P_wz <- round ( correlation_2$p.value,  2 )

      wz_Regression <- lm ( TheData [ , 2 ] ~ TheData [ , 1 ] )            #  estimate fitted regression equation
      summary ( wz_Regression )

      plot ( TheData [ , 1 ], TheData [ , 2 ],                             #  scatterplot
             main       =  paste ( "r =", r_wz, "( P =",P_wz,")" ),
             xlab       = ' ',
             xlim       =  c ( 65, 75 ),
             ylab       = ' ',
             ylim       =  c ( 59, 69 ),
             pch        = 19,
             frame.plot = FALSE
             )
      abline ( wz_Regression )                                             #  add fitted regression line
      }
#
SinglePointPlot <- recordPlot ( )                                                                  # -- Figure 9: last line


# -- Interpretation :: Figure 10 -----------------------------------------------
#
par ( mfrow = c ( 2, 2 ) )                                                                         # -- Figure 10: first line

plot of chunk unnamed-chunk-1


nObs_0      <- 1000                                                        #  number of observations
Ratio       <- matrix ( nrow = nObs_0, ncol = 3 )                          #  matrix for data


# -- Generate data for initial value, change, and final value ------------------
#
for ( i in 1:nObs_0 )
    { Initial_Value  <- rnorm ( 1, mean = PopMean, sd = PopSD )            #  initial value
      Ratio [ i, 1 ] <- Initial_Value

      Change         <- rnorm ( 1, mean = PopMean, sd = PopSD )            #  change
      Ratio [ i, 2 ] <- Change

      Final_Value    <- Initial_Value + Change                             #  final value
      Ratio [ i, 3 ] <- Final_Value
      }


# -- Scatterplots and correlations ---------------------------------------------
#
plot ( Ratio [ , 1 ], Ratio [ , 2 ],                                       #  scatterplot: change vs initial value
       main       =  paste ( "Change vs initial value" ),
       xlab       = 'Initial value',
       ylab       = 'Change',
       pch        =  19,
       frame.plot =  FALSE
       )
cor.test ( Ratio [ , 1 ], Ratio [ , 2 ] )                                  #  correlation: change and initial value
## 
##  Pearson's product-moment correlation
## 
## data:  Ratio[, 1] and Ratio[, 2]
## t = 0.6441, df = 998, p-value = 0.5196
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04166  0.08227
## sample estimates:
##     cor 
## 0.02039

plot ( Ratio [ , 1 ], Ratio [ , 3 ],                                       #  scatterplot: final vs initial value
       main       =  paste ( "Final value vs initial value" ),
       xlab       = 'Initial value',
       ylab       = 'Final value',
       pch        =  19,
       frame.plot = FALSE
       )
cor.test ( Ratio [ , 3 ], Ratio [ , 1 ] )                                  #  correlation: final and initial value
## 
##  Pearson's product-moment correlation
## 
## data:  Ratio[, 3] and Ratio[, 1]
## t = 32.91, df = 998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6903 0.7499
## sample estimates:
##    cor 
## 0.7215
#
SpuriousCorrelationPlot <- recordPlot ( )                                                          # -- Figure 10: last line


# -- Replay each data graphic --------------------------------------------------
#
replayPlot ( CorrelationPlots           )

plot of chunk unnamed-chunk-1

replayPlot ( SinglePointPlot            )
replayPlot ( Decreased_Variability_Plot )
replayPlot ( SpuriousCorrelationPlot    )

plot of chunk unnamed-chunk-1