Douglas Curran-Everett
Explorations in statistics: standard deviations and standard errors
Advan Physiol Educ 2008 32:203-208;
doi:10.1152/advan.90123.2008
#
# -- Advances_Statistics_Code.R: R code for Explorations in Statistics ---------
# -- Define population parameters, sample numbers, and plotting ranges ---------
#
PopMean <- 0 # population mean
PopSD <- 1 # population standard deviation
PopVar <- PopSD**2 # population variance
nObs <- 9 # number of observations in each sample
SE <- PopSD / sqrt ( nObs ) # SD of theoretical distribution of sample mean
nSamples <- 1000 # number of random samples
nCIs <- 100 # number of confidence intervals to plot
nStatistics <- 7 # number of sample statistics: sample, mean, SD, SE, t, lower CI bound, upper CI bound
Alpha <- 0.10 # critical significance level
tCoeff <- qt ( Alpha/2, nObs-1, lower.tail = FALSE ) # t coefficient for confidence intervals
nSampleSizes <- 4 # number of sample sizes for CLT example
nObs1 <- 4 # sample size 1
nObs2 <- 2 * nObs1 # sample size 2
nObs3 <- 2 * nObs2 # sample size 3
nObs4 <- 2 * nObs3 # sample size 4
PlotFactor <- 1000 # scaling factor for horizontal axes
ybarPlotMin <- PlotFactor * -6 # sample mean: minimum for plotting
ybarPlotMax <- PlotFactor * 6 # sample mean: maximum for plotting
tPlotMin <- PlotFactor * -10 # t: minimum for plotting
tPlotMax <- PlotFactor * 10 # t: maximum for plotting
# -- The Simulation: Observations and Sample Statistics ------------------------
#
SampleStats <- matrix ( nrow = nSamples, ncol = nStatistics )
for ( i in 1:nSamples )
{ TheData <- round ( rnorm ( nObs, mean = PopMean, sd = PopSD ), 3 )
if ( i <= 2 ) print ( TheData ) # print observations for samples 1-2
if ( i == 1000 ) print ( TheData ) # print observations for sample 1000
Sample_Number <- i # column 1: sample number
SampleStats [i, 1] <- Sample_Number
Sample_Mean <- mean ( TheData ) # column 2: sample mean
SampleStats [i, 2] <- round ( Sample_Mean, 3 )
Sample_SD <- sd ( TheData ) # column 3: standard deviation
SampleStats [i, 3] <- round ( Sample_SD, 3 )
Sample_SE <- Sample_SD / sqrt ( nObs ) # column 4: standard error
SampleStats [i, 4] <- round ( Sample_SE, 3 )
t_Observed <- Sample_Mean / Sample_SE # column 5: observed value of t
SampleStats [i, 5] <- round ( t_Observed, 3 )
Allowance <- tCoeff * Sample_SE # allowance for conf int
Lower_CI_Bound <- Sample_Mean - Allowance # column 6: lower bound of conf int
SampleStats [i, 6] <- round ( Lower_CI_Bound, 3 )
Upper_CI_Bound <- Sample_Mean + Allowance # column 7: upper bound of conf int
SampleStats [i, 7] <- round ( Upper_CI_Bound, 3 )
}
## [1] 0.533 -0.847 2.049 -0.297 1.321 -1.037 -0.551 -0.184 1.427
## [1] 0.153 -0.276 -2.104 -0.266 -0.037 -1.144 -0.700 -0.064 0.013
## [1] 1.433 0.847 -0.653 0.049 0.856 -0.389 0.168 -0.395 1.258
# -- Print statistics from initial 100 and final 5 random samples --------------
#
# Sample Mean SD SE t LCI UCI
SampleStats [ 1:nCIs, ] # print initial 100 samples
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1 0.268 1.109 0.370 0.726 -0.419 0.955
## [2,] 2 -0.492 0.727 0.242 -2.029 -0.942 -0.041
## [3,] 3 0.215 1.060 0.353 0.610 -0.442 0.872
## [4,] 4 -0.026 0.718 0.239 -0.107 -0.471 0.419
## [5,] 5 0.378 1.134 0.378 1.000 -0.325 1.081
## [6,] 6 0.026 1.407 0.469 0.056 -0.846 0.898
## [7,] 7 0.698 1.005 0.335 2.084 0.075 1.321
## [8,] 8 -0.069 0.847 0.282 -0.243 -0.594 0.457
## [9,] 9 -0.170 1.051 0.350 -0.484 -0.821 0.482
## [10,] 10 -0.028 1.239 0.413 -0.067 -0.795 0.740
## [11,] 11 0.200 0.901 0.300 0.667 -0.358 0.759
## [12,] 12 0.663 1.363 0.454 1.459 -0.182 1.508
## [13,] 13 0.866 0.901 0.300 2.885 0.308 1.424
## [14,] 14 0.230 0.784 0.261 0.882 -0.255 0.716
## [15,] 15 -0.220 1.258 0.419 -0.524 -1.000 0.560
## [16,] 16 0.082 1.381 0.460 0.177 -0.774 0.937
## [17,] 17 0.438 0.779 0.260 1.686 -0.045 0.920
## [18,] 18 -0.064 1.117 0.372 -0.171 -0.756 0.629
## [19,] 19 -0.329 0.964 0.321 -1.025 -0.927 0.268
## [20,] 20 -0.190 1.011 0.337 -0.563 -0.817 0.437
## [21,] 21 -0.016 0.938 0.313 -0.050 -0.597 0.566
## [22,] 22 0.153 0.709 0.236 0.648 -0.286 0.592
## [23,] 23 -0.347 1.088 0.363 -0.958 -1.022 0.327
## [24,] 24 -0.051 1.462 0.487 -0.105 -0.958 0.855
## [25,] 25 -0.149 1.065 0.355 -0.421 -0.810 0.511
## [26,] 26 -0.133 0.758 0.253 -0.525 -0.603 0.337
## [27,] 27 0.502 0.933 0.311 1.615 -0.076 1.080
## [28,] 28 0.137 0.780 0.260 0.529 -0.346 0.621
## [29,] 29 -0.470 1.004 0.335 -1.406 -1.092 0.152
## [30,] 30 -0.455 1.193 0.398 -1.143 -1.194 0.285
## [31,] 31 -0.234 1.378 0.459 -0.509 -1.088 0.620
## [32,] 32 0.067 0.973 0.324 0.206 -0.537 0.670
## [33,] 33 0.031 0.636 0.212 0.145 -0.364 0.425
## [34,] 34 -0.107 1.298 0.433 -0.247 -0.912 0.698
## [35,] 35 0.361 0.709 0.236 1.527 -0.079 0.801
## [36,] 36 -0.350 0.988 0.329 -1.062 -0.962 0.263
## [37,] 37 -0.459 1.352 0.451 -1.018 -1.297 0.379
## [38,] 38 -0.202 1.033 0.344 -0.587 -0.842 0.438
## [39,] 39 -0.151 0.757 0.252 -0.599 -0.620 0.318
## [40,] 40 -0.140 0.887 0.296 -0.473 -0.690 0.410
## [41,] 41 -0.170 1.119 0.373 -0.457 -0.864 0.523
## [42,] 42 0.214 1.163 0.388 0.552 -0.507 0.934
## [43,] 43 0.657 1.011 0.337 1.950 0.030 1.284
## [44,] 44 0.529 0.879 0.293 1.804 -0.016 1.073
## [45,] 45 -0.296 1.099 0.366 -0.808 -0.977 0.385
## [46,] 46 -0.497 0.781 0.260 -1.911 -0.981 -0.013
## [47,] 47 -0.085 1.172 0.391 -0.218 -0.811 0.641
## [48,] 48 -0.278 0.970 0.323 -0.859 -0.879 0.324
## [49,] 49 0.091 1.226 0.409 0.223 -0.669 0.851
## [50,] 50 -0.036 0.775 0.258 -0.139 -0.516 0.444
## [51,] 51 0.042 0.971 0.324 0.130 -0.560 0.644
## [52,] 52 -0.323 1.081 0.360 -0.895 -0.993 0.347
## [53,] 53 0.514 0.994 0.331 1.552 -0.102 1.130
## [54,] 54 -0.136 0.682 0.227 -0.597 -0.559 0.287
## [55,] 55 0.256 1.164 0.388 0.659 -0.466 0.977
## [56,] 56 -0.248 0.994 0.331 -0.748 -0.864 0.368
## [57,] 57 0.230 0.941 0.314 0.735 -0.353 0.813
## [58,] 58 -0.057 0.741 0.247 -0.231 -0.516 0.402
## [59,] 59 0.565 1.251 0.417 1.356 -0.210 1.341
## [60,] 60 0.200 0.627 0.209 0.955 -0.189 0.588
## [61,] 61 0.044 0.717 0.239 0.183 -0.401 0.488
## [62,] 62 -0.155 1.401 0.467 -0.333 -1.024 0.713
## [63,] 63 -0.323 0.921 0.307 -1.054 -0.894 0.247
## [64,] 64 -0.325 0.920 0.307 -1.061 -0.895 0.245
## [65,] 65 -0.002 0.911 0.304 -0.007 -0.567 0.563
## [66,] 66 0.085 1.139 0.380 0.225 -0.621 0.791
## [67,] 67 0.495 1.659 0.553 0.895 -0.533 1.523
## [68,] 68 -0.250 1.521 0.507 -0.493 -1.193 0.693
## [69,] 69 -0.108 1.030 0.343 -0.315 -0.746 0.530
## [70,] 70 0.407 0.833 0.278 1.464 -0.110 0.923
## [71,] 71 -0.246 0.783 0.261 -0.942 -0.731 0.239
## [72,] 72 0.698 1.190 0.397 1.760 -0.039 1.436
## [73,] 73 0.711 1.020 0.340 2.090 0.078 1.343
## [74,] 74 -0.248 0.892 0.297 -0.832 -0.801 0.306
## [75,] 75 -0.366 1.045 0.348 -1.051 -1.014 0.282
## [76,] 76 0.164 0.979 0.326 0.502 -0.443 0.770
## [77,] 77 0.301 0.652 0.217 1.385 -0.103 0.705
## [78,] 78 -0.125 0.970 0.323 -0.387 -0.727 0.476
## [79,] 79 0.575 1.167 0.389 1.478 -0.149 1.298
## [80,] 80 -0.192 1.092 0.364 -0.528 -0.869 0.485
## [81,] 81 -0.465 0.535 0.178 -2.607 -0.796 -0.133
## [82,] 82 -0.272 0.675 0.225 -1.208 -0.691 0.147
## [83,] 83 -0.542 1.365 0.455 -1.191 -1.389 0.304
## [84,] 84 -0.196 0.873 0.291 -0.672 -0.737 0.346
## [85,] 85 -0.073 0.676 0.225 -0.324 -0.492 0.346
## [86,] 86 -0.443 0.908 0.303 -1.464 -1.006 0.120
## [87,] 87 0.289 1.192 0.397 0.728 -0.449 1.028
## [88,] 88 -0.344 0.835 0.278 -1.238 -0.862 0.173
## [89,] 89 0.424 0.811 0.270 1.568 -0.079 0.926
## [90,] 90 0.104 0.794 0.265 0.391 -0.389 0.596
## [91,] 91 0.791 0.918 0.306 2.584 0.222 1.360
## [92,] 92 0.589 0.952 0.317 1.857 -0.001 1.179
## [93,] 93 -0.211 0.888 0.296 -0.713 -0.761 0.339
## [94,] 94 0.211 1.141 0.380 0.554 -0.497 0.918
## [95,] 95 -0.054 0.853 0.284 -0.191 -0.583 0.474
## [96,] 96 0.137 0.556 0.185 0.736 -0.208 0.481
## [97,] 97 0.342 0.579 0.193 1.770 -0.017 0.701
## [98,] 98 -0.311 1.022 0.341 -0.913 -0.945 0.323
## [99,] 99 -0.069 1.089 0.363 -0.191 -0.745 0.606
## [100,] 100 0.047 0.514 0.171 0.276 -0.271 0.366
SampleStats [ 996:nSamples, ] # print final 5 samples
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 996 0.079 1.264 0.421 0.188 -0.704 0.863
## [2,] 997 -0.216 0.750 0.250 -0.863 -0.681 0.249
## [3,] 998 0.212 1.114 0.371 0.571 -0.478 0.902
## [4,] 999 0.052 1.044 0.348 0.148 -0.596 0.699
## [5,] 1000 0.353 0.769 0.256 1.376 -0.124 0.829
# -- Variability: Standard Deviation :: Figure 3 ------------------------------- # -- Figure 3: first line
#
max_ySD <- max ( SampleStats [, 3] ) # maximum for vertical axis
#
SDPlotMax <- 3000
PDF_S_Data <- matrix ( nrow = SDPlotMax, ncol = 1 )
xs <- c ( 1:SDPlotMax ) / PlotFactor
#
for ( j in 1:SDPlotMax ) # calculate theoretical pdf of sample SD
{ s <- j / PlotFactor
Temp1 <- 2 * ( (( nObs-1 ) / ( 2*PopVar ))**( (nObs-1)/2 ) )
Temp2 <- ( s**( nObs-2 ) ) / ( gamma ( (nObs-1)/2) )
Temp3 <- exp ( (-(nObs-1)*(s**2) ) / ( 2*PopVar ) )
pdf_s <- Temp1 * Temp2 * Temp3
PDF_S_Data [j] <- pdf_s
}
hist ( SampleStats [, 3], # plot empirical distribution
axes = FALSE,
ylim = c ( 0, max_ySD ),
freq = FALSE,
main = paste ( "Distribution of the sample SD: n =", nObs ),
xlab = "Sample standard deviation",
ylab = " "
)
axis ( 1 )
lines ( xs, PDF_S_Data [,], col = "gray50" ) # plot theoretical distribution
Sample_SD_Graphic <- recordPlot ( ) # store data graphic # -- Figure 3: last line
# -- Get average SD of all 1000 random samples ---------------------------------
#
round ( mean ( SampleStats [, 3] ), 3 ) # Ave ( sample SDs )
## [1] 0.991
# -- Variability: Standard Deviation :: Figure 4 ------------------------------- # -- Figure 4: first line
#
xy <- c ( ybarPlotMin:ybarPlotMax ) / PlotFactor # generate theoretical distribution
N_0_1 <- dnorm ( xy, mean = PopMean, sd = PopSD ) # N ( 0, 1 )
N_0_2 <- dnorm ( xy, mean = PopMean, sd = PopSD * 2 ) # N ( 0, 2 )
N_x_Min <- -6 * PopSD # minimum x for normal distribution
N_x_Max <- 6 * PopSD # maximum x for normal distribution
N_y_Max <- max ( N_0_1, N_0_2 ) # maximum y for normal distribution
plot ( 0, 0, # plot normal distributions
axes = FALSE,
frame.plot = FALSE,
main = "Standard deviation: impact on variability",
xlab = " ",
ylab = " ",
xlim = c ( N_x_Min, N_x_Max ),
ylim = c ( 0, N_y_Max ),
type = "n"
)
axis ( 1 )
lines ( xy, N_0_1, col = "black" ) # N ( 0, 1 ): SD = 1 ... the actual population
lines ( xy, N_0_2, col = "gray50" ) # N ( 0, 2 ): SD = 2
Normal_Distributions <- recordPlot ( ) # store data graphic # -- Figure 4: last line
# -- Uncertainty: Standard Error of the Mean :: Figure 5 ----------------------- # -- Figure 5: first line
#
par ( mfrow = c ( 1, 1 ) ) # 1 graphic on 1 screen
max_ybar <- 1.4 # maximum for vertical axis
xy <- c ( ybarPlotMin:ybarPlotMax ) / PlotFactor # generate theoretical distribution
yy <- dnorm ( xy, mean = PopMean, sd = SE )
#
hist ( SampleStats [, 2], # plot empirical distribution
axes = FALSE,
ylim = c ( 0, max_ybar ),
freq = FALSE,
main = paste ( "Distribution of the sample mean: n =", nObs ),
xlab = "Sample mean",
ylab = " "
)
axis ( 1 )
lines ( xy, yy, ylim = c ( 0, max_ybar ), col = "gray50" ) # plot theoretical distribution
Sample_Mean_Graphic <- recordPlot ( ) # store data graphic # -- Figure 5: last line
# -- Get average and SD of all 1000 sample means -------------------------------
#
round ( mean ( SampleStats [, 2] ), 3 ) # Ave ( sample means )
## [1] -0.008
round ( sd ( SampleStats [, 2] ), 3 ) # SD ( sample means )
## [1] 0.327
# -- Uncertainty: Standard Error of the Mean :: generate the samples ----------- # -- Figure 6: first line
#
par ( mfrow = c ( 2, 2 ) ) # 4 graphics on 1 screen
CLT_Data <- matrix ( nrow = nSamples, ncol = nSampleSizes )
for ( i in 1:nSamples )
{ TheData1 <- round ( rnorm ( nObs1, mean = PopMean, sd = PopSD ), 4 )
CLT_Data [i, 1] <- mean ( TheData1 ) # sample size 1 mean
TheData2 <- round ( rnorm ( nObs2, mean = PopMean, sd = PopSD ), 4 )
CLT_Data [i, 2] <- mean ( TheData2 ) # sample size 2 mean
TheData3 <- round ( rnorm ( nObs3, mean = PopMean, sd = PopSD ), 4 )
CLT_Data [i, 3] <- mean ( TheData3 ) # sample size 3 mean
TheData4 <- round ( rnorm ( nObs4, mean = PopMean, sd = PopSD ), 4 )
CLT_Data [i, 4] <- mean ( TheData4 ) # sample size 4 mean
}
# -- Figure 6: get minimum and maximum for plotting ----------------------------
#
CLT_Min <- trunc ( min ( CLT_Data [, 1] ) ) - 1 # minimum x
CLT_Max <- trunc ( max ( CLT_Data [, 1] ) ) + 1 # maximum x
CLT_yMax <- 2.25 # maximum y
# -- Figure 6: get average and variance for all sample sizes -------------------
#
Mean_1 <- round ( mean ( CLT_Data [, 1] ), 3 ) # sample size 1
Var_1 <- round ( var ( CLT_Data [, 1] ), 2 )
Mean_2 <- round ( mean ( CLT_Data [, 2] ), 3 ) # sample size 2
Var_2 <- round ( var ( CLT_Data [, 2] ), 2 )
Mean_3 <- round ( mean ( CLT_Data [, 3] ), 3 ) # sample size 3
Var_3 <- round ( var ( CLT_Data [, 3] ), 2 )
Mean_4 <- round ( mean ( CLT_Data [, 4] ), 3 ) # sample size 4
Var_4 <- round ( var ( CLT_Data [, 4] ), 2 )
print ( Mean_1 )
## [1] 0.002
print ( Mean_2 )
## [1] -0.003
print ( Mean_3 )
## [1] -0.006
print ( Mean_4 )
## [1] -0.001
# -- Uncertainty: Standard Error of the Mean :: Figure 6 -----------------------
#
hist ( CLT_Data [, 1], # sample size 1
axes = FALSE,
xlim = c ( CLT_Min, CLT_Max ),
breaks = 15,
ylim = c ( 0, CLT_yMax ),
freq = FALSE,
main = paste ( "A. n = ", nObs1, ": Var { means } =", Var_1 ),
xlab = " ",
ylab = " "
)
axis ( 1 )
hist ( CLT_Data [, 2], # sample size 2
axes = FALSE,
xlim = c ( CLT_Min, CLT_Max ),
ylim = c ( 0, CLT_yMax ),
freq = FALSE,
main = paste ( "B. n = ", nObs2, ": Var { means } =", Var_2 ),
xlab = " ",
ylab = " "
)
axis ( 1 )
hist ( CLT_Data [, 3], # sample size 3
axes = FALSE,
xlim = c ( CLT_Min, CLT_Max ),
ylim = c ( 0, CLT_yMax ),
freq = FALSE,
main = paste ( "C. n = ", nObs3, ": Var { means } =", Var_3 ),
xlab = "Sample mean",
ylab = " "
)
axis ( 1 )
hist ( CLT_Data [, 4], # sample size 4
axes = FALSE,
xlim = c ( CLT_Min, CLT_Max ),
ylim = c ( 0, CLT_yMax ),
freq = FALSE,
main = paste ( "D. n = ", nObs4, ": Var { means } =", Var_4 ),
xlab = "Sample mean",
ylab = " "
)
axis ( 1 )
CLT_Graphic <- recordPlot ( ) # store data graphic # -- Figure 6: last line
# -- Replay each data graphic --------------------------------------------------
#
replayPlot ( Sample_SD_Graphic )
replayPlot ( Normal_Distributions )
replayPlot ( Sample_Mean_Graphic )
replayPlot ( CLT_Graphic )