Explorations in statistics: permutation methods

Douglas Curran-Everett
Explorations in statistics: permutation methods
Advan Physiol Educ 2012 36:181-187;
doi:10.1152/advan.00072.2012

#
# -- Advances_Statistics_Code_Perm.R: R code for Explorations in Statistics ----


# -- Load extra packages and define parameters ---------------------------------
#
library ( beeswarm )
## Error: there is no package called 'beeswarm'
library ( coin     )
## Loading required package: survival
## Loading required package: splines
library ( gtools   )

par     ( mfrow = c ( 1, 1 ) )                               #  1 graphic on 1 screen
par     ( las   = 1          )                               #  make axis numbers horizontal

PermNum <- 10000                                             #  number of permutations for correlation example


# -- A More Realistic Two-Sample Example :: Figure 1 ---------------------------              # -- Figure 1: first line

# -- Read in the data ----------------------------------------------------------
#
TheData <- matrix ( nrow = 12, ncol = 2 )
#
TheData [  1, ] <- c ( 1,  5.42 )                            #  fish group observations
TheData [  2, ] <- c ( 1,  5.86 )
TheData [  3, ] <- c ( 1,  6.16 )
TheData [  4, ] <- c ( 1,  6.55 )
TheData [  5, ] <- c ( 1,  6.80 )
TheData [  6, ] <- c ( 1,  7.00 )
TheData [  7, ] <- c ( 1,  7.11 )
TheData [  8, ] <- c ( 2,  6.51 )                            #  meat group observations
TheData [  9, ] <- c ( 2,  7.56 )
TheData [ 10, ] <- c ( 2,  7.61 )
TheData [ 11, ] <- c ( 2,  7.84 )
TheData [ 12, ] <- c ( 2, 11.50 )
#
TheData <- data.frame ( TheData )

colnames ( TheData ) <- c ( 'Group', 'Y' )                   #  assign column names
TheData$Group        <- factor ( TheData$Group )             #  define Group as a categorical variable

attach ( TheData )


# -- Plot the data -------------------------------------------------------------
#
beeswarm ( Y ~ Group,
           ylim   =  c ( floor ( min ( Y ) ), ceiling ( max ( Y ) ) ),
           ylab   = 'Plasma cholesterol, mmol/L',
           xlab   = '',
           labels = c ( 'Fish', 'Meat' ),
           pch    = 19,
           bty    = 'n'
           )
## Error: could not find function "beeswarm"
#
lines ( x = c ( 0.85, 1.15 ), y = rep ( mean ( Y [ Group == 1 ] ), 2 ) )
## Error: plot.new has not been called yet
lines ( x = c ( 1.85, 2.15 ), y = rep ( mean ( Y [ Group == 2 ] ), 2 ) )
## Error: plot.new has not been called yet

Ludbrook_Data <- recordPlot ( )                              #  store data graphic            # -- Figure 1: last line


# -- Analyze the data using a 2-sample t test ----------------------------------
#
n_y1     <- length ( Y [ Group == 1 ] )                      #  fish group: number of observations
y.Mean.1 <- round ( mean ( Y [ Group == 1 ] ), 2 )           #              mean
y.Var.1  <- round ( var  ( Y [ Group == 1 ] ), 3 )           #              variance

n_y2     <- length ( Y [ Group == 2 ] )                      #  meat group: number of observations
y.Mean.2 <- round ( mean ( Y [ Group == 2 ] ), 2 )           #              mean
y.Var.2  <- round ( var  ( Y [ Group == 2 ] ), 3 )           #              variance

SE_y2_y1 <- round ( sqrt ( ( y.Var.2 / n_y2 ) + ( y.Var.1 / n_y1 ) ), 4 )
#
t        <- round ( ( y.Mean.2 - y.Mean.1 ) / SE_y2_y1, 4 )
print ( t )
## [1] 2.017

t.test ( Y ~ Group,                                          #  t test using R function t.test
         alternative = c ( 'two.sided' ),
         var.equal   = FALSE,
         )
## 
##  Welch Two Sample t-test
## 
## data:  Y by Group
## t = -2.017, df = 4.618, p-value = 0.1045
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.1291  0.5497
## sample estimates:
## mean in group 1 mean in group 2 
##           6.414           8.204


# -- A More Realistic Two-Sample Example :: Figure 3 ---------------------------              # -- Figure 3: first line
#
y1 <- TheData [ 1: 7, 2 ]                                    #  fish group observations
y2 <- TheData [ 8:12, 2 ]                                    #  meat group observations

obs.sample.statistic <- mean ( y2 ) - mean ( y1 )            #  observed difference between sample means


# -- Calculate number of possible permutations ---------------------------------
#
n_total             <- n_y1 + n_y2                           #  total number of observations
#
Number_Permutations <- choose ( n_total, n_y1 )              #  possible number of permuations


# -- For each permutation calculate the difference in sample means -------------
#
All.Combinations       <- c ( )
SampleStatistic_Matrix <- c ( )

All.Combinations       <- combinations ( n_total, n_y1, TheData [ ,2 ], repeats.allowed = FALSE )

for ( k in 1:Number_Permutations )
    { y1_sum                       <- sum ( All.Combinations [ k, ] )
      SampleStatistic_Matrix [ k ] <- ( ( sum ( TheData [ ,2 ] ) - y1_sum ) / n_y2 ) - ( y1_sum / n_y1 )
      }
#
SampleStatistic_Min <- floor   ( min ( SampleStatistic_Matrix [ ] ) )
SampleStatistic_Max <- ceiling ( max ( SampleStatistic_Matrix [ ] ) )


# -- Permutations for which sample statistic is at least as great as observed --
#
Extreme_Sample_Statistics <- c ( )
Combination_Index         <- c ( )

k <- 0
for ( i in 1:Number_Permutations )
    { if ( abs ( SampleStatistic_Matrix [ i ] ) >= abs ( obs.sample.statistic ) )
         { k                               <- k + 1
           Combination_Index [ k ]         <- i
           Extreme_Sample_Statistics [ k ] <- SampleStatistic_Matrix [ i ]
           }
      }

print ( Combination_Index )
## [1]   1   2   3   7  22  57 792
#
for ( i in 1:k )
    ( print ( All.Combinations [ Combination_Index [ i ], ] )
      )
## [1] 5.42 5.86 6.16 6.51 6.55 6.80 7.00
## [1] 5.42 5.86 6.16 6.51 6.55 6.80 7.11
## [1] 5.42 5.86 6.16 6.51 6.55 6.80 7.56
## [1] 5.42 5.86 6.16 6.51 6.55 7.00 7.11
## [1] 5.42 5.86 6.16 6.51 6.80 7.00 7.11
## [1] 5.42 5.86 6.16 6.55 6.80 7.00 7.11
## [1]  6.80  7.00  7.11  7.56  7.61  7.84 11.50
#
print ( Extreme_Sample_Statistics )
## [1]  1.995  1.958  1.803  1.889  1.803  1.790 -1.817


# -- Proportion of differences at least as extreme as observed value -----------
#
p.value.diff <- round ( mean ( abs ( SampleStatistic_Matrix ) >= abs ( obs.sample.statistic ) ), 4 )
p.value.diff
## [1] 0.0088


# -- Permutation distribution of differences between sample means --------------
#
hist ( SampleStatistic_Matrix,
       axes   =  FALSE,
       main   =  paste ( Number_Permutations, 'combinations:', p.value.diff ),
       xlab   = '2-sample difference',
       ylab   = '',
       xlim   = c ( SampleStatistic_Min, SampleStatistic_Max ),
       breaks = 20
       )
axis  ( 1 )
segments (  obs.sample.statistic, 0,  obs.sample.statistic, 100, lwd = 2, col = 'gray50' )
segments ( -obs.sample.statistic, 0, -obs.sample.statistic, 100, lwd = 2, col = 'gray50' )

Ludbrook_Permutation_Distribution <- recordPlot ( )          #  store data graphic            # -- Figure 3: last line


# -- An Example in Correlation :: Figure 5 -------------------------------------              # -- Figure 5: first line
#
x <- c ( 71, 68, 66, 67, 70, 71, 70, 73, 72, 65, 66 )        #  x observations
y <- c ( 69, 64, 65, 63, 65, 62, 65, 64, 66, 59, 62 )        #  y observations


#-- Estimate Pearson correlation coefficient for actual observations -----------
#
obs.corr <- cor ( x, y, method = 'pearson' )
round ( obs.corr, 3 )
## [1] 0.558


#-- Fix x and permute y --------------------------------------------------------
#
Corr_Matrix <- c ( )

for ( i in 1:PermNum )
    { perm.y            <- permute ( y )
      Corr_Matrix [ i ] <- round ( cor ( x, perm.y ), 3 )
      if ( i <= 3 ) print ( perm.y )
      }
##  [1] 64 62 62 63 59 65 69 66 65 64 65
##  [1] 64 69 65 65 66 62 65 64 63 59 62
##  [1] 62 63 64 65 65 64 65 69 66 62 59
#
Corr_Matrix [ 1:3 ]
## [1] 0.286 0.172 0.658


#-- Identify correlations at least as extreme as observed value ----------------
#
Extreme_Sample_Statistics <- c ( )

k <- 0
for ( i in 1:PermNum )
    { if ( abs ( Corr_Matrix [ i ] ) >= abs ( obs.corr ) )
         { k                               <- k + 1;
           Extreme_Sample_Statistics [ k ] <- Corr_Matrix [ i ]
           }
      }
#
length ( Extreme_Sample_Statistics )                         #  number of correlations at least as extreme as observed value
## [1] 681


# -- Proportion of correlations at least as extreme as observed value ----------
#
p.value.corr <- round ( mean ( abs ( Corr_Matrix ) >= abs ( obs.corr ) ), 3 )
p.value.corr
## [1] 0.068


# -- Permutation distribution of correlations ----------------------------------
#
hist ( Corr_Matrix,
       axes   =  FALSE,
       main   =  paste ( PermNum, 'permuted correlations:', p.value.corr ),
       xlab   = 'Correlation',
       ylab   = '',
       xlim   =  c ( -1, +1 ),
       breaks = 40
       )
axis ( 1 )
#
segments (  obs.corr, 0,  obs.corr, 1000, lwd = 2, col = 'gray50' )
segments ( -obs.corr, 0, -obs.corr, 1000, lwd = 2, col = 'gray50' )

Correlation_Permutation_Distribution <- recordPlot ( )       #  store data graphic            # -- Figure 5: last line


# -- Replay each data graphic --------------------------------------------------
#
replayPlot ( Ludbrook_Data                        )          #  Figure 1
## Warning: display list redraw incomplete
replayPlot ( Ludbrook_Permutation_Distribution    )          #  Figure 3
## Warning: display list redraw incomplete
replayPlot ( Correlation_Permutation_Distribution )          #  Figure 5
## Warning: display list redraw incomplete