Explorations in statistics: standard deviations and standard errors

Douglas Curran-Everett
Explorations in statistics: standard deviations and standard errors
Advan Physiol Educ 2008 32:203-208;
doi:10.1152/advan.90123.2008

#
# -- Advances_Statistics_Code.R: R code for Explorations in Statistics ---------


# -- Define population parameters, sample numbers, and plotting ranges ---------
#
PopMean      <-    0                                         #  population mean
PopSD        <-    1                                         #  population standard deviation
PopVar       <-    PopSD**2                                  #  population variance
nObs         <-    9                                         #  number of observations in each sample
SE           <- PopSD / sqrt ( nObs )                        #  SD of theoretical distribution of sample mean
nSamples     <- 1000                                         #  number of random samples
nCIs         <-  100                                         #  number of confidence intervals to plot
nStatistics  <-    7                                         #  number of sample statistics: sample, mean, SD, SE, t, lower CI bound, upper CI bound

Alpha        <-    0.10                                      #  critical significance level
tCoeff       <- qt ( Alpha/2, nObs-1, lower.tail = FALSE )   #  t coefficient for confidence intervals

nSampleSizes <-    4                                         #  number of sample sizes for CLT example
nObs1        <-    4                                         #  sample size 1
nObs2        <-    2 * nObs1                                 #  sample size 2
nObs3        <-    2 * nObs2                                 #  sample size 3
nObs4        <-    2 * nObs3                                 #  sample size 4

PlotFactor   <- 1000                                         #  scaling factor for horizontal axes
ybarPlotMin  <- PlotFactor *  -6                             #  sample mean: minimum for plotting
ybarPlotMax  <- PlotFactor *   6                             #  sample mean: maximum for plotting
tPlotMin     <- PlotFactor * -10                             #  t: minimum for plotting
tPlotMax     <- PlotFactor *  10                             #  t: maximum for plotting


# -- The Simulation: Observations and Sample Statistics ------------------------
#
SampleStats <- matrix ( nrow = nSamples, ncol = nStatistics )
for ( i in 1:nSamples )
    { TheData <- round ( rnorm ( nObs, mean = PopMean, sd = PopSD ), 3 )
      if ( i <=    2 ) print ( TheData )                     #  print observations for samples 1-2
      if ( i == 1000 ) print ( TheData )                     #  print observations for sample 1000

      Sample_Number      <- i                                #  column 1: sample number
      SampleStats [i, 1] <- Sample_Number

      Sample_Mean        <- mean  ( TheData        )         #  column 2: sample mean
      SampleStats [i, 2] <- round ( Sample_Mean, 3 )

      Sample_SD          <- sd    ( TheData      )           #  column 3: standard deviation
      SampleStats [i, 3] <- round ( Sample_SD, 3 )

      Sample_SE          <- Sample_SD / sqrt ( nObs )        #  column 4: standard error
      SampleStats [i, 4] <- round ( Sample_SE, 3 )

      t_Observed         <- Sample_Mean / Sample_SE          #  column 5: observed value of t
      SampleStats [i, 5] <- round ( t_Observed, 3 )

      Allowance          <- tCoeff * Sample_SE               #  allowance for conf int

      Lower_CI_Bound     <- Sample_Mean - Allowance          #  column 6: lower bound of conf int
      SampleStats [i, 6] <- round ( Lower_CI_Bound, 3 )

      Upper_CI_Bound     <- Sample_Mean + Allowance          #  column 7: upper bound of conf int
      SampleStats [i, 7] <- round ( Upper_CI_Bound, 3 )
      }

## [1]  0.533 -0.847  2.049 -0.297  1.321 -1.037 -0.551 -0.184  1.427
## [1]  0.153 -0.276 -2.104 -0.266 -0.037 -1.144 -0.700 -0.064  0.013
## [1]  1.433  0.847 -0.653  0.049  0.856 -0.389  0.168 -0.395  1.258



# -- Print statistics from initial 100 and final 5 random samples --------------
#
#  Sample   Mean    SD    SE      t    LCI    UCI
SampleStats [   1:nCIs,     ]                                #  print initial 100 samples

##        [,1]   [,2]  [,3]  [,4]   [,5]   [,6]   [,7]
##   [1,]    1  0.268 1.109 0.370  0.726 -0.419  0.955
##   [2,]    2 -0.492 0.727 0.242 -2.029 -0.942 -0.041
##   [3,]    3  0.215 1.060 0.353  0.610 -0.442  0.872
##   [4,]    4 -0.026 0.718 0.239 -0.107 -0.471  0.419
##   [5,]    5  0.378 1.134 0.378  1.000 -0.325  1.081
##   [6,]    6  0.026 1.407 0.469  0.056 -0.846  0.898
##   [7,]    7  0.698 1.005 0.335  2.084  0.075  1.321
##   [8,]    8 -0.069 0.847 0.282 -0.243 -0.594  0.457
##   [9,]    9 -0.170 1.051 0.350 -0.484 -0.821  0.482
##  [10,]   10 -0.028 1.239 0.413 -0.067 -0.795  0.740
##  [11,]   11  0.200 0.901 0.300  0.667 -0.358  0.759
##  [12,]   12  0.663 1.363 0.454  1.459 -0.182  1.508
##  [13,]   13  0.866 0.901 0.300  2.885  0.308  1.424
##  [14,]   14  0.230 0.784 0.261  0.882 -0.255  0.716
##  [15,]   15 -0.220 1.258 0.419 -0.524 -1.000  0.560
##  [16,]   16  0.082 1.381 0.460  0.177 -0.774  0.937
##  [17,]   17  0.438 0.779 0.260  1.686 -0.045  0.920
##  [18,]   18 -0.064 1.117 0.372 -0.171 -0.756  0.629
##  [19,]   19 -0.329 0.964 0.321 -1.025 -0.927  0.268
##  [20,]   20 -0.190 1.011 0.337 -0.563 -0.817  0.437
##  [21,]   21 -0.016 0.938 0.313 -0.050 -0.597  0.566
##  [22,]   22  0.153 0.709 0.236  0.648 -0.286  0.592
##  [23,]   23 -0.347 1.088 0.363 -0.958 -1.022  0.327
##  [24,]   24 -0.051 1.462 0.487 -0.105 -0.958  0.855
##  [25,]   25 -0.149 1.065 0.355 -0.421 -0.810  0.511
##  [26,]   26 -0.133 0.758 0.253 -0.525 -0.603  0.337
##  [27,]   27  0.502 0.933 0.311  1.615 -0.076  1.080
##  [28,]   28  0.137 0.780 0.260  0.529 -0.346  0.621
##  [29,]   29 -0.470 1.004 0.335 -1.406 -1.092  0.152
##  [30,]   30 -0.455 1.193 0.398 -1.143 -1.194  0.285
##  [31,]   31 -0.234 1.378 0.459 -0.509 -1.088  0.620
##  [32,]   32  0.067 0.973 0.324  0.206 -0.537  0.670
##  [33,]   33  0.031 0.636 0.212  0.145 -0.364  0.425
##  [34,]   34 -0.107 1.298 0.433 -0.247 -0.912  0.698
##  [35,]   35  0.361 0.709 0.236  1.527 -0.079  0.801
##  [36,]   36 -0.350 0.988 0.329 -1.062 -0.962  0.263
##  [37,]   37 -0.459 1.352 0.451 -1.018 -1.297  0.379
##  [38,]   38 -0.202 1.033 0.344 -0.587 -0.842  0.438
##  [39,]   39 -0.151 0.757 0.252 -0.599 -0.620  0.318
##  [40,]   40 -0.140 0.887 0.296 -0.473 -0.690  0.410
##  [41,]   41 -0.170 1.119 0.373 -0.457 -0.864  0.523
##  [42,]   42  0.214 1.163 0.388  0.552 -0.507  0.934
##  [43,]   43  0.657 1.011 0.337  1.950  0.030  1.284
##  [44,]   44  0.529 0.879 0.293  1.804 -0.016  1.073
##  [45,]   45 -0.296 1.099 0.366 -0.808 -0.977  0.385
##  [46,]   46 -0.497 0.781 0.260 -1.911 -0.981 -0.013
##  [47,]   47 -0.085 1.172 0.391 -0.218 -0.811  0.641
##  [48,]   48 -0.278 0.970 0.323 -0.859 -0.879  0.324
##  [49,]   49  0.091 1.226 0.409  0.223 -0.669  0.851
##  [50,]   50 -0.036 0.775 0.258 -0.139 -0.516  0.444
##  [51,]   51  0.042 0.971 0.324  0.130 -0.560  0.644
##  [52,]   52 -0.323 1.081 0.360 -0.895 -0.993  0.347
##  [53,]   53  0.514 0.994 0.331  1.552 -0.102  1.130
##  [54,]   54 -0.136 0.682 0.227 -0.597 -0.559  0.287
##  [55,]   55  0.256 1.164 0.388  0.659 -0.466  0.977
##  [56,]   56 -0.248 0.994 0.331 -0.748 -0.864  0.368
##  [57,]   57  0.230 0.941 0.314  0.735 -0.353  0.813
##  [58,]   58 -0.057 0.741 0.247 -0.231 -0.516  0.402
##  [59,]   59  0.565 1.251 0.417  1.356 -0.210  1.341
##  [60,]   60  0.200 0.627 0.209  0.955 -0.189  0.588
##  [61,]   61  0.044 0.717 0.239  0.183 -0.401  0.488
##  [62,]   62 -0.155 1.401 0.467 -0.333 -1.024  0.713
##  [63,]   63 -0.323 0.921 0.307 -1.054 -0.894  0.247
##  [64,]   64 -0.325 0.920 0.307 -1.061 -0.895  0.245
##  [65,]   65 -0.002 0.911 0.304 -0.007 -0.567  0.563
##  [66,]   66  0.085 1.139 0.380  0.225 -0.621  0.791
##  [67,]   67  0.495 1.659 0.553  0.895 -0.533  1.523
##  [68,]   68 -0.250 1.521 0.507 -0.493 -1.193  0.693
##  [69,]   69 -0.108 1.030 0.343 -0.315 -0.746  0.530
##  [70,]   70  0.407 0.833 0.278  1.464 -0.110  0.923
##  [71,]   71 -0.246 0.783 0.261 -0.942 -0.731  0.239
##  [72,]   72  0.698 1.190 0.397  1.760 -0.039  1.436
##  [73,]   73  0.711 1.020 0.340  2.090  0.078  1.343
##  [74,]   74 -0.248 0.892 0.297 -0.832 -0.801  0.306
##  [75,]   75 -0.366 1.045 0.348 -1.051 -1.014  0.282
##  [76,]   76  0.164 0.979 0.326  0.502 -0.443  0.770
##  [77,]   77  0.301 0.652 0.217  1.385 -0.103  0.705
##  [78,]   78 -0.125 0.970 0.323 -0.387 -0.727  0.476
##  [79,]   79  0.575 1.167 0.389  1.478 -0.149  1.298
##  [80,]   80 -0.192 1.092 0.364 -0.528 -0.869  0.485
##  [81,]   81 -0.465 0.535 0.178 -2.607 -0.796 -0.133
##  [82,]   82 -0.272 0.675 0.225 -1.208 -0.691  0.147
##  [83,]   83 -0.542 1.365 0.455 -1.191 -1.389  0.304
##  [84,]   84 -0.196 0.873 0.291 -0.672 -0.737  0.346
##  [85,]   85 -0.073 0.676 0.225 -0.324 -0.492  0.346
##  [86,]   86 -0.443 0.908 0.303 -1.464 -1.006  0.120
##  [87,]   87  0.289 1.192 0.397  0.728 -0.449  1.028
##  [88,]   88 -0.344 0.835 0.278 -1.238 -0.862  0.173
##  [89,]   89  0.424 0.811 0.270  1.568 -0.079  0.926
##  [90,]   90  0.104 0.794 0.265  0.391 -0.389  0.596
##  [91,]   91  0.791 0.918 0.306  2.584  0.222  1.360
##  [92,]   92  0.589 0.952 0.317  1.857 -0.001  1.179
##  [93,]   93 -0.211 0.888 0.296 -0.713 -0.761  0.339
##  [94,]   94  0.211 1.141 0.380  0.554 -0.497  0.918
##  [95,]   95 -0.054 0.853 0.284 -0.191 -0.583  0.474
##  [96,]   96  0.137 0.556 0.185  0.736 -0.208  0.481
##  [97,]   97  0.342 0.579 0.193  1.770 -0.017  0.701
##  [98,]   98 -0.311 1.022 0.341 -0.913 -0.945  0.323
##  [99,]   99 -0.069 1.089 0.363 -0.191 -0.745  0.606
## [100,]  100  0.047 0.514 0.171  0.276 -0.271  0.366

SampleStats [ 996:nSamples, ]                                #  print final 5 samples

##      [,1]   [,2]  [,3]  [,4]   [,5]   [,6]  [,7]
## [1,]  996  0.079 1.264 0.421  0.188 -0.704 0.863
## [2,]  997 -0.216 0.750 0.250 -0.863 -0.681 0.249
## [3,]  998  0.212 1.114 0.371  0.571 -0.478 0.902
## [4,]  999  0.052 1.044 0.348  0.148 -0.596 0.699
## [5,] 1000  0.353 0.769 0.256  1.376 -0.124 0.829



# -- Variability: Standard Deviation :: Figure 3 -------------------------------                    # -- Figure 3: first line
#
max_ySD <- max ( SampleStats [, 3] )                         #  maximum for vertical axis
#
SDPlotMax  <- 3000
PDF_S_Data <- matrix ( nrow = SDPlotMax, ncol = 1 )
xs         <- c ( 1:SDPlotMax ) / PlotFactor
#
for ( j in 1:SDPlotMax )                                     #  calculate theoretical pdf of sample SD
    { s              <- j / PlotFactor
      Temp1          <-  2 * ( (( nObs-1 ) / ( 2*PopVar ))**( (nObs-1)/2 ) )
      Temp2          <- ( s**( nObs-2 ) ) / ( gamma ( (nObs-1)/2) )
      Temp3          <- exp ( (-(nObs-1)*(s**2) ) / ( 2*PopVar ) )
      pdf_s          <- Temp1 * Temp2 * Temp3
      PDF_S_Data [j] <- pdf_s
      }

hist ( SampleStats [, 3],                                    #  plot empirical distribution
       axes =  FALSE,
       ylim =  c ( 0, max_ySD ),
       freq =  FALSE,
       main =  paste ( "Distribution of the sample SD: n =", nObs ),
       xlab = "Sample standard deviation",
       ylab = " "
       )
axis  ( 1 )
lines ( xs, PDF_S_Data [,], col = "gray50" )                 #  plot theoretical distribution

plot of chunk unnamed-chunk-1

Sample_SD_Graphic <- recordPlot ( )                          #  store data graphic                  # -- Figure 3: last line


# -- Get average SD of all 1000 random samples ---------------------------------
#
round ( mean ( SampleStats [, 3] ), 3 )                      #  Ave ( sample SDs )

## [1] 0.991



# -- Variability: Standard Deviation :: Figure 4 -------------------------------                    # -- Figure 4: first line
#
xy       <- c ( ybarPlotMin:ybarPlotMax ) / PlotFactor       #  generate theoretical distribution
N_0_1    <- dnorm ( xy, mean = PopMean, sd = PopSD )         #  N ( 0, 1 )
N_0_2    <- dnorm ( xy, mean = PopMean, sd = PopSD * 2 )     #  N ( 0, 2 )

N_x_Min  <- -6 * PopSD                                       #  minimum x for normal distribution
N_x_Max  <-  6 * PopSD                                       #  maximum x for normal distribution
N_y_Max  <- max ( N_0_1, N_0_2 )                             #  maximum y for normal distribution

plot ( 0, 0,                                                 #  plot normal distributions
       axes       =  FALSE,
       frame.plot =  FALSE,
       main       = "Standard deviation: impact on variability",
       xlab       = " ",
       ylab       = " ",
       xlim       =  c ( N_x_Min, N_x_Max ),
       ylim       =  c ( 0, N_y_Max ),
       type       = "n"
       )
axis  ( 1 )
lines ( xy, N_0_1, col = "black"  )                          #  N ( 0, 1 ): SD = 1 ... the actual population
lines ( xy, N_0_2, col = "gray50" )                          #  N ( 0, 2 ): SD = 2

plot of chunk unnamed-chunk-1

Normal_Distributions <- recordPlot ( )                       #  store data graphic                  # -- Figure 4: last line


# -- Uncertainty: Standard Error of the Mean :: Figure 5 -----------------------                    # -- Figure 5: first line
#
par ( mfrow = c ( 1, 1 ) )                                   #  1 graphic on 1 screen

max_ybar <- 1.4                                              #  maximum for vertical axis
xy       <- c ( ybarPlotMin:ybarPlotMax ) / PlotFactor       #  generate theoretical distribution
yy       <- dnorm ( xy, mean = PopMean, sd = SE )
#
hist ( SampleStats [, 2],                                    #  plot empirical distribution
       axes =  FALSE,
       ylim =  c ( 0, max_ybar ),
       freq =  FALSE,
       main =  paste ( "Distribution of the sample mean: n =", nObs ),
       xlab = "Sample mean",
       ylab = " "
       )
axis  ( 1 )
lines ( xy, yy, ylim =  c ( 0, max_ybar ), col = "gray50" )  #  plot theoretical distribution

plot of chunk unnamed-chunk-1

Sample_Mean_Graphic <- recordPlot ( )                        #  store data graphic                  # -- Figure 5: last line


# -- Get average and SD of all 1000 sample means -------------------------------
#
round ( mean ( SampleStats [, 2] ), 3 )                      #  Ave ( sample means )

## [1] -0.008

round ( sd   ( SampleStats [, 2] ), 3 )                      #  SD  ( sample means )

## [1] 0.327



# -- Uncertainty: Standard Error of the Mean :: generate the samples -----------                    # -- Figure 6: first line
#
par ( mfrow = c ( 2, 2 ) )                                   #  4 graphics on 1 screen

CLT_Data <- matrix ( nrow = nSamples, ncol = nSampleSizes )
for ( i in 1:nSamples )
    { TheData1 <- round ( rnorm ( nObs1, mean = PopMean, sd = PopSD ), 4 )

      CLT_Data [i, 1] <- mean ( TheData1 )                   #  sample size 1 mean

      TheData2 <- round ( rnorm ( nObs2, mean = PopMean, sd = PopSD ), 4 )
      CLT_Data [i, 2] <- mean ( TheData2 )                   #  sample size 2 mean

      TheData3 <- round ( rnorm ( nObs3, mean = PopMean, sd = PopSD ), 4 )
      CLT_Data [i, 3] <- mean ( TheData3 )                   #  sample size 3 mean

      TheData4 <- round ( rnorm ( nObs4, mean = PopMean, sd = PopSD ), 4 )
      CLT_Data [i, 4] <- mean ( TheData4 )                   #  sample size 4 mean
      }


# -- Figure 6: get minimum and maximum for plotting ----------------------------
#
CLT_Min  <- trunc ( min ( CLT_Data [, 1] ) ) - 1             #  minimum x
CLT_Max  <- trunc ( max ( CLT_Data [, 1] ) ) + 1             #  maximum x
CLT_yMax <- 2.25                                             #  maximum y


# -- Figure 6: get average and variance for all sample sizes -------------------
#
Mean_1 <- round ( mean ( CLT_Data [, 1] ), 3 )               #  sample size 1
Var_1  <- round ( var  ( CLT_Data [, 1] ), 2 )

Mean_2 <- round ( mean ( CLT_Data [, 2] ), 3 )               #  sample size 2
Var_2  <- round ( var  ( CLT_Data [, 2] ), 2 )

Mean_3 <- round ( mean ( CLT_Data [, 3] ), 3 )               #  sample size 3
Var_3  <- round ( var  ( CLT_Data [, 3] ), 2 )

Mean_4 <- round ( mean ( CLT_Data [, 4] ), 3 )               #  sample size 4
Var_4  <- round ( var  ( CLT_Data [, 4] ), 2 )

print ( Mean_1 )

## [1] 0.002

print ( Mean_2 )

## [1] -0.003

print ( Mean_3 )

## [1] -0.006

print ( Mean_4 )

## [1] -0.001



# -- Uncertainty: Standard Error of the Mean :: Figure 6 -----------------------
#
hist ( CLT_Data [, 1],                                       #  sample size 1
       axes =  FALSE,
       xlim =  c ( CLT_Min, CLT_Max ),
       breaks = 15,
       ylim =  c ( 0, CLT_yMax ),
       freq =  FALSE,
       main = paste ( "A.  n = ", nObs1, ": Var { means } =", Var_1 ),
       xlab = " ",
       ylab = " "
       )
axis  ( 1 )

hist ( CLT_Data [, 2],                                       #  sample size 2
       axes =  FALSE,
       xlim =  c ( CLT_Min, CLT_Max ),
       ylim =  c ( 0, CLT_yMax ),
       freq =  FALSE,
       main = paste ( "B.  n = ", nObs2, ": Var { means } =", Var_2 ),
       xlab = " ",
       ylab = " "
       )
axis  ( 1 )

hist ( CLT_Data [, 3],                                       #  sample size 3
       axes =  FALSE,
       xlim =  c ( CLT_Min, CLT_Max ),
       ylim =  c ( 0, CLT_yMax ),
       freq =  FALSE,
       main = paste ( "C.  n = ", nObs3, ": Var { means } =", Var_3 ),
       xlab = "Sample mean",
       ylab = " "
       )
axis  ( 1 )

hist ( CLT_Data [, 4],                                       #  sample size 4
       axes =  FALSE,
       xlim =  c ( CLT_Min, CLT_Max ),
       ylim =  c ( 0, CLT_yMax ),
       freq =  FALSE,
       main = paste ( "D.  n = ", nObs4, ": Var { means } =", Var_4 ),
       xlab = "Sample mean",
       ylab = " "
       )
axis  ( 1 )

plot of chunk unnamed-chunk-1


CLT_Graphic <- recordPlot ( )                                #  store data graphic                  # -- Figure 6: last line


# -- Replay each data graphic --------------------------------------------------
#
replayPlot ( Sample_SD_Graphic    )

plot of chunk unnamed-chunk-1

replayPlot ( Normal_Distributions )

plot of chunk unnamed-chunk-1

replayPlot ( Sample_Mean_Graphic  )

plot of chunk unnamed-chunk-1

replayPlot ( CLT_Graphic          )

plot of chunk unnamed-chunk-1