Douglas Curran-Everett
Explorations in statistics: permutation methods
Advan Physiol Educ 2012 36:181-187;
doi:10.1152/advan.00072.2012
#
# -- Advances_Statistics_Code_Perm.R: R code for Explorations in Statistics ----
# -- Load extra packages and define parameters ---------------------------------
#
library ( beeswarm )
## Error: there is no package called 'beeswarm'
library ( coin )
## Loading required package: survival
## Loading required package: splines
library ( gtools )
par ( mfrow = c ( 1, 1 ) ) # 1 graphic on 1 screen
par ( las = 1 ) # make axis numbers horizontal
PermNum <- 10000 # number of permutations for correlation example
# -- A More Realistic Two-Sample Example :: Figure 1 --------------------------- # -- Figure 1: first line
# -- Read in the data ----------------------------------------------------------
#
TheData <- matrix ( nrow = 12, ncol = 2 )
#
TheData [ 1, ] <- c ( 1, 5.42 ) # fish group observations
TheData [ 2, ] <- c ( 1, 5.86 )
TheData [ 3, ] <- c ( 1, 6.16 )
TheData [ 4, ] <- c ( 1, 6.55 )
TheData [ 5, ] <- c ( 1, 6.80 )
TheData [ 6, ] <- c ( 1, 7.00 )
TheData [ 7, ] <- c ( 1, 7.11 )
TheData [ 8, ] <- c ( 2, 6.51 ) # meat group observations
TheData [ 9, ] <- c ( 2, 7.56 )
TheData [ 10, ] <- c ( 2, 7.61 )
TheData [ 11, ] <- c ( 2, 7.84 )
TheData [ 12, ] <- c ( 2, 11.50 )
#
TheData <- data.frame ( TheData )
colnames ( TheData ) <- c ( 'Group', 'Y' ) # assign column names
TheData$Group <- factor ( TheData$Group ) # define Group as a categorical variable
attach ( TheData )
# -- Plot the data -------------------------------------------------------------
#
beeswarm ( Y ~ Group,
ylim = c ( floor ( min ( Y ) ), ceiling ( max ( Y ) ) ),
ylab = 'Plasma cholesterol, mmol/L',
xlab = '',
labels = c ( 'Fish', 'Meat' ),
pch = 19,
bty = 'n'
)
## Error: could not find function "beeswarm"
#
lines ( x = c ( 0.85, 1.15 ), y = rep ( mean ( Y [ Group == 1 ] ), 2 ) )
## Error: plot.new has not been called yet
lines ( x = c ( 1.85, 2.15 ), y = rep ( mean ( Y [ Group == 2 ] ), 2 ) )
## Error: plot.new has not been called yet
Ludbrook_Data <- recordPlot ( ) # store data graphic # -- Figure 1: last line
# -- Analyze the data using a 2-sample t test ----------------------------------
#
n_y1 <- length ( Y [ Group == 1 ] ) # fish group: number of observations
y.Mean.1 <- round ( mean ( Y [ Group == 1 ] ), 2 ) # mean
y.Var.1 <- round ( var ( Y [ Group == 1 ] ), 3 ) # variance
n_y2 <- length ( Y [ Group == 2 ] ) # meat group: number of observations
y.Mean.2 <- round ( mean ( Y [ Group == 2 ] ), 2 ) # mean
y.Var.2 <- round ( var ( Y [ Group == 2 ] ), 3 ) # variance
SE_y2_y1 <- round ( sqrt ( ( y.Var.2 / n_y2 ) + ( y.Var.1 / n_y1 ) ), 4 )
#
t <- round ( ( y.Mean.2 - y.Mean.1 ) / SE_y2_y1, 4 )
print ( t )
## [1] 2.017
t.test ( Y ~ Group, # t test using R function t.test
alternative = c ( 'two.sided' ),
var.equal = FALSE,
)
##
## Welch Two Sample t-test
##
## data: Y by Group
## t = -2.017, df = 4.618, p-value = 0.1045
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.1291 0.5497
## sample estimates:
## mean in group 1 mean in group 2
## 6.414 8.204
# -- A More Realistic Two-Sample Example :: Figure 3 --------------------------- # -- Figure 3: first line
#
y1 <- TheData [ 1: 7, 2 ] # fish group observations
y2 <- TheData [ 8:12, 2 ] # meat group observations
obs.sample.statistic <- mean ( y2 ) - mean ( y1 ) # observed difference between sample means
# -- Calculate number of possible permutations ---------------------------------
#
n_total <- n_y1 + n_y2 # total number of observations
#
Number_Permutations <- choose ( n_total, n_y1 ) # possible number of permuations
# -- For each permutation calculate the difference in sample means -------------
#
All.Combinations <- c ( )
SampleStatistic_Matrix <- c ( )
All.Combinations <- combinations ( n_total, n_y1, TheData [ ,2 ], repeats.allowed = FALSE )
for ( k in 1:Number_Permutations )
{ y1_sum <- sum ( All.Combinations [ k, ] )
SampleStatistic_Matrix [ k ] <- ( ( sum ( TheData [ ,2 ] ) - y1_sum ) / n_y2 ) - ( y1_sum / n_y1 )
}
#
SampleStatistic_Min <- floor ( min ( SampleStatistic_Matrix [ ] ) )
SampleStatistic_Max <- ceiling ( max ( SampleStatistic_Matrix [ ] ) )
# -- Permutations for which sample statistic is at least as great as observed --
#
Extreme_Sample_Statistics <- c ( )
Combination_Index <- c ( )
k <- 0
for ( i in 1:Number_Permutations )
{ if ( abs ( SampleStatistic_Matrix [ i ] ) >= abs ( obs.sample.statistic ) )
{ k <- k + 1
Combination_Index [ k ] <- i
Extreme_Sample_Statistics [ k ] <- SampleStatistic_Matrix [ i ]
}
}
print ( Combination_Index )
## [1] 1 2 3 7 22 57 792
#
for ( i in 1:k )
( print ( All.Combinations [ Combination_Index [ i ], ] )
)
## [1] 5.42 5.86 6.16 6.51 6.55 6.80 7.00
## [1] 5.42 5.86 6.16 6.51 6.55 6.80 7.11
## [1] 5.42 5.86 6.16 6.51 6.55 6.80 7.56
## [1] 5.42 5.86 6.16 6.51 6.55 7.00 7.11
## [1] 5.42 5.86 6.16 6.51 6.80 7.00 7.11
## [1] 5.42 5.86 6.16 6.55 6.80 7.00 7.11
## [1] 6.80 7.00 7.11 7.56 7.61 7.84 11.50
#
print ( Extreme_Sample_Statistics )
## [1] 1.995 1.958 1.803 1.889 1.803 1.790 -1.817
# -- Proportion of differences at least as extreme as observed value -----------
#
p.value.diff <- round ( mean ( abs ( SampleStatistic_Matrix ) >= abs ( obs.sample.statistic ) ), 4 )
p.value.diff
## [1] 0.0088
# -- Permutation distribution of differences between sample means --------------
#
hist ( SampleStatistic_Matrix,
axes = FALSE,
main = paste ( Number_Permutations, 'combinations:', p.value.diff ),
xlab = '2-sample difference',
ylab = '',
xlim = c ( SampleStatistic_Min, SampleStatistic_Max ),
breaks = 20
)
axis ( 1 )
segments ( obs.sample.statistic, 0, obs.sample.statistic, 100, lwd = 2, col = 'gray50' )
segments ( -obs.sample.statistic, 0, -obs.sample.statistic, 100, lwd = 2, col = 'gray50' )
Ludbrook_Permutation_Distribution <- recordPlot ( ) # store data graphic # -- Figure 3: last line
# -- An Example in Correlation :: Figure 5 ------------------------------------- # -- Figure 5: first line
#
x <- c ( 71, 68, 66, 67, 70, 71, 70, 73, 72, 65, 66 ) # x observations
y <- c ( 69, 64, 65, 63, 65, 62, 65, 64, 66, 59, 62 ) # y observations
#-- Estimate Pearson correlation coefficient for actual observations -----------
#
obs.corr <- cor ( x, y, method = 'pearson' )
round ( obs.corr, 3 )
## [1] 0.558
#-- Fix x and permute y --------------------------------------------------------
#
Corr_Matrix <- c ( )
for ( i in 1:PermNum )
{ perm.y <- permute ( y )
Corr_Matrix [ i ] <- round ( cor ( x, perm.y ), 3 )
if ( i <= 3 ) print ( perm.y )
}
## [1] 64 62 62 63 59 65 69 66 65 64 65
## [1] 64 69 65 65 66 62 65 64 63 59 62
## [1] 62 63 64 65 65 64 65 69 66 62 59
#
Corr_Matrix [ 1:3 ]
## [1] 0.286 0.172 0.658
#-- Identify correlations at least as extreme as observed value ----------------
#
Extreme_Sample_Statistics <- c ( )
k <- 0
for ( i in 1:PermNum )
{ if ( abs ( Corr_Matrix [ i ] ) >= abs ( obs.corr ) )
{ k <- k + 1;
Extreme_Sample_Statistics [ k ] <- Corr_Matrix [ i ]
}
}
#
length ( Extreme_Sample_Statistics ) # number of correlations at least as extreme as observed value
## [1] 681
# -- Proportion of correlations at least as extreme as observed value ----------
#
p.value.corr <- round ( mean ( abs ( Corr_Matrix ) >= abs ( obs.corr ) ), 3 )
p.value.corr
## [1] 0.068
# -- Permutation distribution of correlations ----------------------------------
#
hist ( Corr_Matrix,
axes = FALSE,
main = paste ( PermNum, 'permuted correlations:', p.value.corr ),
xlab = 'Correlation',
ylab = '',
xlim = c ( -1, +1 ),
breaks = 40
)
axis ( 1 )
#
segments ( obs.corr, 0, obs.corr, 1000, lwd = 2, col = 'gray50' )
segments ( -obs.corr, 0, -obs.corr, 1000, lwd = 2, col = 'gray50' )
Correlation_Permutation_Distribution <- recordPlot ( ) # store data graphic # -- Figure 5: last line
# -- Replay each data graphic --------------------------------------------------
#
replayPlot ( Ludbrook_Data ) # Figure 1
## Warning: display list redraw incomplete
replayPlot ( Ludbrook_Permutation_Distribution ) # Figure 3
## Warning: display list redraw incomplete
replayPlot ( Correlation_Permutation_Distribution ) # Figure 5
## Warning: display list redraw incomplete