PERMANOVA and Bootstrap

Cheat sheet - Simple tutorial

How to do a Permutation test (PERMANOVA) and Bootstrap

(R. Schwamborn, 2017)

I. PERMANOVA

II. Bootstrap

I. Learn how to perform a PERMANOVA (Anderson, 2001)

PERMANOVA (simple example with 2 samples)

am.A <- 1:5 # sample A

am.B <- 11:15 # sample B

# Difference between the means of samples A and B
diff.AB <- mean(am.A)-mean(am.B)
diff.AB    
## [1] -10
## I.0. Basic Question: Is the observed difference significant?

# standard test: Student's t-test

t.test(am.A, am.B) # p-value = 8.488e-06*** (p <0.001)
## 
##  Welch Two Sample t-test
## 
## data:  am.A and am.B
## t = -10, df = 8, p-value = 8.488e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -12.306004  -7.693996
## sample estimates:
## mean of x mean of y 
##         3        13
# non-parametric test: PERMANOVA

## I.1. Create a Dataframe for analysis
Data.2 <- data.frame(A = am.A, B = am.B)
Data.3 <- stack(Data.2)

#View(Data.3)

names(Data.3) # "values" "ind"   
## [1] "values" "ind"
names(Data.3) <- c("values", "grupo" )   
names(Data.3) # "values" "grupo"   
## [1] "values" "grupo"
t.test(Data.3$values ~Data.3$grupo) # p-value = 8.488e-06
## 
##  Welch Two Sample t-test
## 
## data:  Data.3$values by Data.3$grupo
## t = -10, df = 8, p-value = 8.488e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -12.306004  -7.693996
## sample estimates:
## mean in group A mean in group B 
##               3              13
## I.2. Take a random sample with values all mixed up,
## but factors ("A", "B") unchanged
sample (Data.3$values,10,  replace = FALSE)
##  [1] 13  3 11 14 15  1  2  4  5 12
## I.3. Take such a mixed up random sample, 
## then create a new data frame with this sample
Data.sam1 <-Data.3 
Data.sam1$values <- sample (Data.3$values,10, replace = FALSE)
##View(Data.sam1)
sam.A <- Data.sam1$values[1:5]
sam.B <- Data.sam1$values[6:10]
t.test(sam.A, sam.B) # p-value = n.s.
## 
##  Welch Two Sample t-test
## 
## data:  sam.A and sam.B
## t = 0.67135, df = 7.6582, p-value = 0.5217
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.908252 10.708252
## sample estimates:
## mean of x mean of y 
##       9.2       6.8
## I.4. calculate the difference between "A" and "B" in the "mixed up" random sample 
diff.AB <- mean(sam.A)-mean(sam.B)
diff.AB   # difference between "A" and "B" in a "mixed up" random sample 
## [1] 2.4
## I.5. Create a function that does all this. 
## It takes a "mixed up" random sample  
## and then calculates the difference between "A" and "B"
## new function "sam_diff.fun1()"

# As Function

sam_diff.fun1 <- function (DF) {
  
  # take a "mixed up" sample: permute the values, keep the group labels
  Data.sam1 <- DF
  colnames(Data.sam1)[1] <- "values"
  Data.sam1$values <- sample(Data.sam1$values, length(Data.sam1$values),
                             replace = FALSE)
  ##View(Data.sam1)
  sam.A <- Data.sam1$values[1:5]
  sam.B <- Data.sam1$values[6:10]
  
  # difference between the permuted group means
  diff.AB <- mean(sam.A) - mean(sam.B)
  diff.AB
  
} # END OF FUNCTION

sam_diff.fun1(Data.3)
## [1] 1.2
## I.6. Apply this function many times (10,000 times)
# Each time, you get a different sample 
# and a different value for the difference between means in the sample. 
# By this, you will get 10,000 values of the difference between means. 
# The distribution of these 10,000 values is the permutation distribution, 
# an approximation of the null distribution. 
# Analyse: Is your original observed difference within the null distribution?

# Loop to obtain the null distribution

Nperm <- 10000 # Number of permutations
diff.vec <-  rep(0,Nperm)

for (i in 1:Nperm) {  
  
diff.sam <-  sam_diff.fun1(Data.3)
  
diff.vec[i] <- diff.sam

} # END OF LOOP

## I.7. Analyse the distribution obtained by permutation
# i.e., analyse the shape of the distribution obtained by repeated mixing up and sampling, 
# and determine where your original observed difference is located within the null distribution.

# Analyze Loop results

summary(diff.vec)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -10.00000  -2.40000   0.00000   0.01448   2.40000  10.00000
hist.graph1 <- hist(diff.vec, breaks = seq(-20, 20, length.out = 201))

hist.graph1$counts
##   [1]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##  [18]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##  [35]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  36   0
##  [52]   0   0   0   0   0   0   0   0   0   0  35   0  77   0 113   0 157
##  [69]   0 188   0 161   0 135   0  77   0  88   0  89   0 199   0 318   0
##  [86] 467   0 532   0 607   0 580   0 457   0 339   0 238   0 170   0 247
## [103]   0 363   0 470   0 543   0 588   0 612   0 462   0 306   0 187   0
## [120]  91   0  67   0  82   0 114   0 174   0 198   0 168   0  96   0  81
## [137]   0  39   0   0   0   0   0   0   0   0   0   0   0  49   0   0   0
## [154]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [171]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [188]   0   0   0   0   0   0   0   0   0   0   0   0   0
# create a nice graph
hist.graph1 <- hist(diff.vec, breaks = seq(-20, 20, length.out = 201),
                    prob = TRUE,
     main = "Null Distribution of the diff. in means")
abline(v = -10, col = "green")
xydens <- density(diff.vec, from = -20, to = 20, n = 201)
xydens$y <- xydens$y*2 # rescale: only every other 0.2-wide bin can be occupied
lines(xydens, col = grey(0.3), lty = 2)              
text(-11.4, 0.3, "obs.", col = "green", cex = 0.8)

## now calculate "p" and the 95% Conf. Interval
# calculate the 95% Conf. Interval
quantile(diff.vec, c(0.025, 0.975)) # 95% Conf.Interval:-6.8 to 6.8
##  2.5% 97.5% 
##  -6.8   6.8
abline(v = -6.8,col = "grey"); abline(v = 6.8,col = "grey");
text(3, 0.32, "95% Conf. int.", col = "grey", cex = 0.8)

#  calculate "p"  based on counts (counting number of samples)
df.hist1 <- data.frame(hist.graph1$density,round(hist.graph1$mids,2), 
                       hist.graph1$counts) 
#View(df.hist1)
names(df.hist1)<-  c("density", "mids", "counts")
#View(df.hist1)

df.hist1.10 <- subset(df.hist1,df.hist1$mids > -10.2) 
df.hist1.10 <- subset(df.hist1.10,df.hist1.10$mids < -9.8) 
#View(df.hist1.10)

sum(df.hist1.10$counts) # 36 permutation runs out of 10,000 gave "diff = -10"
## [1] 36
sum(df.hist1.10$counts)/Nperm # p = 0.0036** (p<0.01)
## [1] 0.0036
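#  As a cross-check: the conventional two-sided permutation p-value counts 
#  ALL permuted differences at least as extreme as the observed one 
#  (a minimal sketch, using the diff.vec computed above)
mean(abs(diff.vec) >= abs(-10)) # should be close to 2/choose(10,5) = 0.0079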
#  calculate "p"  based on the kernel density

xydens$x <- round(xydens$x,2)
p_vals <-  xydens$y[xydens$x == (-10.0)]  # p = 0.0058** (p<0.01)
mean (p_vals)
## [1] 0.005797957
## I.8. Use packages "coin" and "vegan" (function "adonis()") for PERMANOVA
# very fast and simple

library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.4-3

# ?adonis
# ?adonis2
adonis(Data.3$values~ Data.3$grupo, method = "euclidean") 
## 
## Call:
## adonis(formula = Data.3$values ~ Data.3$grupo, method = "euclidean") 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##              Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)   
## Data.3$grupo  1       250   250.0     100 0.92593   0.01 **
## Residuals     8        20     2.5         0.07407          
## Total         9       270                 1.00000          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p-value = 0.01 **  (p<0.01)
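# Note: in newer versions of vegan, adonis() is deprecated in favour of
# adonis2(); an equivalent call (a sketch, not run here) would be:
# adonis2(Data.3$values ~ Data.3$grupo, method = "euclidean")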

library(coin)
## Loading required package: survival
# ?coin
# ?independence_test

independence_test(Data.3$values~ Data.3$grupo) 
## 
##  Asymptotic General Independence Test
## 
## data:  Data.3$values by Data.3$grupo (A, B)
## Z = -2.8868, p-value = 0.003892
## alternative hypothesis: two.sided
# p-value = 0.0039**  (p<0.01)


# II. Learning how to Bootstrap
### BOOTSTRAP simple examples
### 
# Example II.1. Bootstrap the mean of a sample
# Objectives:
# Obtain 95% Conf. Interval and st.errors for the mean of a sample. 
# Obtain the "p" value that the mean of a sample is greater than zero. 


## II.1  Bootstrap the mean of a sample
#
# Basic principle:
# The Bootstrap creates a long list of new random samples taken from
# the original data.
# The frequency distribution of the Bootstrap results is analyzed 
# and compared to the original data.
#
# Basic procedure:
# Use the function "sample()" with "replace = TRUE", 
# this will create many repeated values. 
# Bootstrap sample size = original sample size.
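# A minimal sketch of this resampling step (with a hypothetical toy vector):
toy <- c(1, 2, 3, 4, 5)
sample(toy, size = length(toy), replace = TRUE) # e.g. 2 5 5 1 3 - values may repeat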


## II.1.a Bootstrap the mean of a LARGE sample. 
#         Large sample:  n = 30

# test.sam <- rnorm (30,3,2)
# test.sam <- round(test.sam,3)
test.sam <- c(6.166, 3.010 ,4.832, 3.511,  5.910, -0.545,  4.334, 4.861, -0.107, 2.194, 1.890 ,3.827,
       1.567,  3.392,  1.057,  1.382,  1.168,  6.794,  4.414,  1.726 ,-0.109,  1.407,  2.003,  3.155,
       3.026, 3.361,  3.527,  5.224,  3.169,  3.962)
length(test.sam) #30
## [1] 30
media.teste <-mean(test.sam) #mean
media.teste
## [1] 3.0036
median(test.sam) # median = 3.16
## [1] 3.162
sd(test.sam) # st.dev = 1.87
## [1] 1.872134
shapiro.test(test.sam) # p > 0.05, normal distrib. 
## 
##  Shapiro-Wilk normality test
## 
## data:  test.sam
## W = 0.98164, p-value = 0.8672
hist(test.sam, breaks = c(seq(-2,8, by = 2)) , main = "Original data, n=30")

# Calculate the parametric 95% Conf. Int., using the "t" distribution

# begin by calculating the parametric standard error of the mean
 n = length(test.sam)
 St.Error  = sd(test.sam)/sqrt(n)
 St.Error    # Standard error of the mean = 0.342
## [1] 0.3418033
 # Calculate the parametric 95% Conf. Int. using the "t" distribution and the St. Error 
# mean - t(0.975,df) *St.Error ; mean + t(0.975,df) *St.Error
 qt(0.975,29) # 2.05  , VALUE OF THE "t" distrib., approx. 2
## [1] 2.04523
 # mean -  2*St.Error , mean +  2 *St.Error
 mean(test.sam) -  2*St.Error ; mean(test.sam) +  2 *St.Error
## [1] 2.319993
## [1] 3.687207
 # Param 95% Conf.int. (distrib.: t): 2.32 < mean < 3.69
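 # cross-check: t.test() reports the parametric 95% Conf. Int. directly
 t.test(test.sam)$conf.int # should be close to the interval above (uses t = 2.045, not 2)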
 
# NOW BOOTSTRAP
# Bootstrap st.error and 95% Conf.int
# Principle: sampling with replacement (replace = TRUE), 
# permits repetition of the same values

# sample(test.sam,  size =n, replace = TRUE)
 
 
Boot.runs = 10000 # number of Bootstrap runs
n = length(test.sam) # sample size (30)


boot.samples = matrix(sample(test.sam, size = Boot.runs * n, 
                             replace = TRUE),
                      Boot.runs, n)
# generates a matrix with 10000 rows (one bootstrap sample per row) and 30 columns

dim(boot.samples) # number of rows and columns
## [1] 10000    30
#View(as.data.frame(boot.samples))

boot.statistics = apply(boot.samples, 1, mean)
# calculates the mean for each row (i.e., for each sample)
# gives a vector with 10000 means

mean(boot.statistics) # overall mean of means: 2.999, 
## [1] 2.998966
median(boot.statistics) # overall median of means: 3.0,
## [1] 3.0041
hist(boot.statistics) # histogram

#Kolmogorov-Smirnov test for normality
test.distrib <- rnorm(10000, mean = mean(boot.statistics), sd = sd(boot.statistics))
ks.test(test.distrib, boot.statistics) # p-value>0.05 , it's normal
## Warning in ks.test(test.distrib, boot.statistics): p-value will be
## approximate in the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  test.distrib and boot.statistics
## D = 0.012, p-value = 0.4676
## alternative hypothesis: two-sided
# Histogram of the means of the samples obtained with Bootstrap,  
# i.e., histogram of the Bootstrap ("posterior") distribution, 
# which approximates the sampling distribution of the mean.
hist(boot.statistics,prob = T, col= grey(0.8), 
     main = "Posterior Distrib., N = 10,000/30",xlim = c(1.5, 4.5),ylim = c(0, 4)) # histogram
text(mean(boot.statistics), 3.95, "mean", cex = 0.7)
abline(v = mean(boot.statistics), lty = 4, col = "purple")
lines(density(rnorm(10000, mean = mean(boot.statistics), sd = sd(boot.statistics))
), col = "blue", lty= 2)
text(4, 1.55, "Bootstrap dist.",cex =0.7,col = "blue")

s.error.boot.teste.1 = sd(boot.statistics) # st.dev. = 0.339
s.error.boot.teste.1 
## [1] 0.338719
# St. dev. of bootstrap distrib. = st. error of the mean !
# Param St.Error, Standard error of the mean = 0.342
abline(v = 3+ sd(boot.statistics), col= "grey"); abline(v =  3-sd(boot.statistics), col= "grey")  
text(4, 3.95, "grey: st.dev.=S.E.",cex =0.7,col = "grey")

# Parametric Bootstrap (uses the "t" value * the bootstrap st.error)
me = (round(s.error.boot.teste.1,4))*2  # "t" value, often approximated as 2 
round(media.teste, 1) + c(-1, 1) * me
## [1] 2.3226 3.6774
# 95% Conf.Int. (t distrib.): 2.32 < mean = 3 < 3.68  # parametric Bootstrap (uses t approx. 2)
abline(v = 2.33, col= "purple"); abline(v = 3.67, col= "purple")  
text(2.33, 1.5, "par.(t) boot. conf.int.",cex =0.6,col = "purple")

# Non-parametr. ordinary Bootstrap (no assumptions, uses quantiles):
quantile(boot.statistics, c(0.025, 0.975)) # correct 95% quantiles OK!
##     2.5%    97.5% 
## 2.329297 3.655268
# 2.5%   97.5% 
# 2.329   3.655
# 95% Conf.Int.:  2.33 < mean < 3.66 # non-parametr. Bootstrap (no assumptions)
abline(v = 2.329, col= "green"); abline(v = 3.655, col= "green")  
text(2.329, 2.5, "n.-par. boot. conf.int.",cex =0.6,col = "green")

abline(v = 2.32, col= "orange"); abline(v = 3.69, col= "orange")  
#  Param 95% Conf.int. (distrib.: t): 2.32 < mean < 3.69
text(2.32, 3.5, "param. (t) conf.int.",cex =0.6,col = "orange")

## Now calculate probability "p" that mean = ZERO
 dens <- density(test.distrib) # Kernel density distrib.
 dens.df <- data.frame( x=  dens$x, y=  dens$y)
 plot(dens.df$x, dens.df$y, xlim = c(0, max(dens.df$x) ))                      

 # View(dens.df)
 min(dens.df$y) # 9.4e-06*** = minimum value of the density (extreme tail)
## [1] 9.362449e-06
 # p < 9.4e-06***   # prob. that mean = ZERO, p<0.001***
 
 mean(boot.statistics <= 0) # proportion of bootstrap means at or below zero

## [1] 0
 # 0 of 10,000 bootstrap means fall at or below zero:
 # p < 1/Boot.runs = 1e-04 # prob. that mean = ZERO, p<0.001***
 

# large samples (e.g. N > 16): Common Bootstrap OK, no need for Bayesian Bootstrap.
# Small samples (e.g. N < 16): maybe use Bayesian Bootstrap?
# Very small samples (e.g. N < 8): use Bayesian Bootstrap!

### Bayesian Bootstrap with "bayesboot()"
## video tutorial: Bayesian Bootstrap, https://www.youtube.com/watch?v=WMAgzr99PKE
 # Rubin, D.B. (1981). "The Bayesian Bootstrap". The Annals of Statistics, 9(1), p. 130-134. 
 # Paper (Rubin, 1981): https://projecteuclid.org/download/pdf_1/euclid.aos/1176345338
 # package "bayesboot"
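 
 # Under the hood, Rubin's Bayesian Bootstrap draws random Dirichlet(1,...,1)
 # weights and computes the weighted statistic. A minimal sketch of this scheme
 # (NOT the actual bayesboot() internals; bb.mean.sketch is a made-up helper):
 bb.mean.sketch <- function(x) {
   w <- rexp(length(x), rate = 1)  # Dirichlet(1,...,1) weights, generated as
   w <- w / sum(w)                 # normalized standard exponentials
   sum(w * x)                      # weighted mean = one posterior draw
 }
 bb.draws <- replicate(4000, bb.mean.sketch(test.sam))
 quantile(bb.draws, c(0.025, 0.975)) # should be close to the bayesboot() interval below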
 
 library(bayesboot)

# ?bayesboot

boot.obj1 <- bayesboot(test.sam, mean)

boot.obj1
## The first 10 draws (out of 4000) from the Bayesian bootstrap:
## 
##          V1
## 1  2.467263
## 2  3.071117
## 3  2.863152
## 4  3.356218
## 5  2.865813
## 6  3.070229
## 7  3.448393
## 8  2.867503
## 9  3.312115
## 10 3.002879
## .. ...
## 
## Use summary() to produce a summary of the posterior distribution.
summary(boot.obj1) # 95% Conf.int: 2.33 < median=3.00 < 3.69
## Bayesian bootstrap
## 
## Number of posterior draws: 4000 
## 
## Summary of the posterior (with 95% Highest Density Intervals):
##  statistic     mean        sd  hdi.low hdi.high
##         V1 3.008992 0.3402521 2.317517 3.673202
## 
## Quantiles:
##  statistic    q2.5%    q25%   median     q75%   q97.5%
##         V1 2.330311 2.78149 3.004157 3.238448 3.694705
## 
## Call:
##  bayesboot(data = test.sam, statistic = mean)
# statistic    q2.5%    q25%   median     q75%   q97.5%
# V1        2.330311 2.78149 3.004157 3.238448 3.694705


plot(boot.obj1) 

#  plots posterior distrib. and 95% Highest Density Intervals

### Common Bootstrap with "boot()"
# use package "boot()" - a bit more complicated... needs indices 
# http://www.mayin.org/ajayshah/KB/R/documents/boot.html

library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:survival':
## 
##     aml
## The following object is masked from 'package:lattice':
## 
##     melanoma
media.indice.fun <- function(x, d) {
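  # statistic for boot(): mean of the values selected by the index vector d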
  return(mean(x[d]))
}

indices <- 1:length(test.sam)

plot(test.sam ~indices) # OK

media.indice.fun(test.sam, indices[1:20]) # OK
## [1] 3.06915
media.indice.fun(test.sam, c(1:20)) # OK
## [1] 3.06915
media.indice.fun(test.sam, 1:20) # OK
## [1] 3.06915
# ?boot

boot.obj2 <-  boot(test.sam, media.indice.fun, 9999) 
# ordinary non-parametric bootstrap; the statistic must accept an index

# ?boot.ci

boot.ci(boot.obj2, type = "basic")  # gives 95% Conf.int.
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot.obj2, type = "basic")
## 
## Intervals : 
## Level      Basic         
## 95%   ( 2.353,  3.672 )  
## Calculations and Intervals on Original Scale
# 2.353,  3.672 

# 95% Conf.int: 2.35 < mean=3.00 < 3.67 # using package "boot", common non-parametr. Bootstr.


### Compare Methods (n=30, normal distr.):
# All methods are OK (see figure). 
# All Conf. Intervals are very similar (within about 1%) 
# for this large sample of normally distributed data.

#
## This was a test with large N (n = 30)
##
### Now let's test these Bootstrap methods with a small N (n = 8)

## II.1.b Bootstrap the mean of a SMALL sample. 
#         SMALL sample:  n = 8


test.sam.sm <- c(3,4,3,5,6,7,6,5)
length(test.sam.sm ) # n = 8
## [1] 8
# Param conf. int. with "t" distrib.
n.sm = length(test.sam.sm)
# param Conf.int
St.Error.sm  = sd(test.sam.sm)/sqrt(n.sm)
St.Error.sm    # = 0.515
## [1] 0.5153882
# Param 95% Conf.int.:
# mean - t(0.975,df) *St.Error , mean + t(0.975,df) *St.Error
t.value.sm <- qt(0.975, n.sm - 1) # 2.36, value of the "t" distrib. for df = 7
# mean -  t.value.sm*St.Error , mean +  t.value.sm *St.Error
mean(test.sam.sm) - t.value.sm *St.Error.sm ; mean(test.sam.sm) +  t.value.sm *St.Error.sm
## [1] 3.656301
## [1] 6.093699
# Param 95% Conf.int. (distrib.: t): 3.66 < mean=4.88 < 6.09

boot.obj.sm <- bayesboot(test.sam.sm, mean, R = 10000)
boot.obj.sm
## The first 10 draws (out of 10000) from the Bayesian bootstrap:
## 
##         V1
## 1  4.82125
## 2  4.85975
## 3  4.94150
## 4  4.27125
## 5  4.84325
## 6  4.86375
## 7  4.37800
## 8  4.21000
## 9  5.07600
## 10 5.86675
## .. ...
## 
## Use summary() to produce a summary of the posterior distribution.
plot(boot.obj.sm)

summary(boot.obj.sm) # 95% Conf.int: 3.97 < median=4.88 < 5.76
## Bayesian bootstrap
## 
## Number of posterior draws: 10000 
## 
## Summary of the posterior (with 95% Highest Density Intervals):
##  statistic     mean        sd hdi.low hdi.high
##         V1 4.868089 0.4569996  3.9825  5.77875
## 
## Quantiles:
##  statistic    q2.5%    q25%  median     q75%   q97.5%
##         V1 3.961469 4.56275 4.87475 5.184312 5.759756
## 
## Call:
##  bayesboot(data = test.sam.sm, statistic = mean, R = 10000)
#statistic    q2.5%     q25%   median     q75%   q97.5%
#  V1         3.97      4.57   4.88      5.19     5.76
# 95% Conf.int: 3.97 < median=4.88 < 5.76 # Bayes. bootstr. ("bayesboot()")

# 95% HDI: 3.98 < median=4.88 < 5.78
#Summary of the posterior (with 95% Highest Density Intervals):
#  statistic    mean        sd      hdi.low    hdi.high
#V1            4.87          0.46    3.98     5.78

boot.obj.sm2 <-  boot(test.sam.sm, media.indice.fun, 9999) 
# ordinary non-par. bootstrap,requires index
boot.ci(boot.obj.sm2, type = "basic")  # gives 95% Conf.int.
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot.obj.sm2, type = "basic")
## 
## Intervals : 
## Level      Basic         
## 95%   ( 4.000,  5.875 )  
## Calculations and Intervals on Original Scale
#Level      Basic         
#95%   ( 4.000,  5.875 ) 
# 95% Conf.int: 4.00 < median=4.88 < 5.88 # ordinary bootstr. ("boot()")

# Bootstrap with a simple script  (Larget, 2014, modified, http://www.stat.wisc.edu/~larget/stat302/chap3.pdf)
Boot.runs = 10000 # number of Bootstrap runs
boot.samples = matrix(sample(test.sam.sm, size = Boot.runs * n.sm, 
                             replace = TRUE),
                      Boot.runs, n.sm)
boot.statistics = apply(boot.samples, 1, mean)# 
median(boot.statistics) # median
## [1] 4.875
hist(boot.statistics) # histogram

#Kolmogorov-Smirnov test for normality
test.distrib <- rnorm(10000, mean = mean(boot.statistics), sd = sd(boot.statistics))
ks.test(test.distrib, boot.statistics) # p-value<0.05, not normally distrib.
## Warning in ks.test(test.distrib, boot.statistics): p-value will be
## approximate in the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  test.distrib and boot.statistics
## D = 0.0557, p-value = 6.717e-14
## alternative hypothesis: two-sided
# Histogram of the  Bootstrap results
hist(boot.statistics,prob = T, col= grey(0.8), 
     main = "Posterior Distrib., N = 10,000/8",ylim = c(0, 4)) # histogram
text(mean(boot.statistics), 3.95, "mean", cex = 0.7)
abline(v = (mean(test.sam.sm)), lty = 4, col = "purple")
lines(density(boot.statistics), col = "blue", lty= 2)
lines(density(test.distrib),  col = "red")
text(4, 2.75, "red: normal dist.",cex =0.7,col = "red")
text(4, 1.55, "blue: Bootstrap dist.",cex =0.7,col = "blue")

s.error.boot.teste.1 = sd(boot.statistics) # st.dev. = 0.48 
s.error.boot.teste.1 
## [1] 0.4762067
# Non-parametr. Bootstrap (no assumptions, uses quantiles):
quantile(boot.statistics, c(0.025, 0.975)) # correct 95% quantiles OK!
##  2.5% 97.5% 
##  4.00  5.75
# 2.5% 97.5% 
# 4.00  5.75 

# 95% Conf.int: 4.00 < median=4.88 < 5.75 # ordinary bootstr. script

### Compare Methods, small sample (n=8, not normally distrib.):
# 95% Conf.int: 3.66 < 4.88 < 6.09 # Param. (distrib.: t)
# 95% Conf.int: 3.97 < 4.88 < 5.76 # Bayes. bootstr. ("bayesboot()")
# 95% HDI:      3.98 < 4.87 < 5.78 # Bayes. bootstr. ("bayesboot()")
# 95% HDI = Highest Density Interval
# 95% Conf.int: 4.00 < 4.88 < 5.75 # ordinary bootstr. script
# 95% Conf.int: 4.00 < 4.88 < 5.88 # ordinary bootstr. ("boot()")
# All Bootstraps are very similar (within 5%). 
# The parametric (t) interval is the widest here; percentile-type Bootstrap 
# intervals tend to be somewhat too narrow for small samples.

### Plot Comparison of Methods, small sample (n= 8)
plot(boot.obj.sm, main = "N= 8")
abline(v = c(3.66,6.09), col= "red")
text(5.4, 0.55,"param.(t)", cex = 0.7, col = "red")
abline(v = c(4.00,5.88), col= "blue")
text(5.75, 0.6, "boot()", cex = 0.7, col = "blue")
abline(v = c(4.00,5.75), col= "purple")
text(5.67, 0.5, "script", cex = 0.7, col = "purple")
abline(v = c(3.98,5.78), col= "orange")
text(4.2, 0.5, "HDI. bayesb.", cex = 0.7, col = "orange")
abline(v = c(3.97,5.76), col= "grey")
text(4.2, 0.58, "C.I. bayesb.", cex = 0.7, col = "grey")

##
### Now let's test the Bootstrap methods with a very small N (n = 4)

## II.1.c Bootstrap the mean of a VERY SMALL sample. 
#         VERY SMALL sample:  n = 4

test.sam.sm <- c(2,3,3,4)
length(test.sam.sm ) # n = 4
## [1] 4
# Param conf. int. with "t" distrib.
n.sm = length(test.sam.sm)
# param Conf.int
St.Error.sm  = sd(test.sam.sm)/sqrt(n.sm)
St.Error.sm    # = 0.408
## [1] 0.4082483
# Param 95% Conf.int.:
# mean - t(0.975,df) *St.Error , mean + t(0.975,df) *St.Error
t.value.sm <- qt(0.975, n.sm - 1) # 3.18, value of the "t" distrib. for df = 3
# mean -  t.value.sm*St.Error , mean +  t.value.sm *St.Error
mean(test.sam.sm) - t.value.sm *St.Error.sm ; mean(test.sam.sm) +  t.value.sm *St.Error.sm
## [1] 1.700772
## [1] 4.299228
# Param 95% Conf.int. (distrib.: t): 1.70 < mean=3 < 4.30

boot.obj.sm <- bayesboot(test.sam.sm, mean, R = 10000)
boot.obj.sm
## The first 10 draws (out of 10000) from the Bayesian bootstrap:
## 
##         V1
## 1  2.95375
## 2  3.11500
## 3  2.65500
## 4  2.78850
## 5  3.03350
## 6  2.53500
## 7  2.88275
## 8  2.97800
## 9  2.51275
## 10 3.08950
## .. ...
## 
## Use summary() to produce a summary of the posterior distribution.
plot(boot.obj.sm)

summary(boot.obj.sm) # 95% Conf.int: 2.37 < median=3 < 3.63
## Bayesian bootstrap
## 
## Number of posterior draws: 10000 
## 
## Summary of the posterior (with 95% Highest Density Intervals):
##  statistic     mean        sd hdi.low hdi.high
##         V1 2.996386 0.3143268 2.38725  3.64425
## 
## Quantiles:
##  statistic    q2.5%  q25%  median     q75%   q97.5%
##         V1 2.369937 2.791 2.99825 3.201062 3.632256
## 
## Call:
##  bayesboot(data = test.sam.sm, statistic = mean, R = 10000)
#statistic    q2.5%  q25%  median     q75%   q97.5%
# V1        2.369937 2.791 2.99825 3.201062 3.632256
# 95% Conf.int: 2.37 < median=3 < 3.63 # Bayes. bootstr. ("bayesboot()")

# 95% HDI: 2.39 < mean=3 < 3.64
#Summary of the posterior (with 95% Highest Density Intervals):
#  statistic    mean        sd      hdi.low    hdi.high
#V1            2.9964     0.3143    2.38725    3.64425

boot.obj.sm2 <-  boot(test.sam.sm, media.indice.fun, 9999) 
# ordinary non-par. bootstrap,requires index
boot.ci(boot.obj.sm2, type = "basic")  # gives 95% Conf.int.
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot.obj.sm2, type = "basic")
## 
## Intervals : 
## Level      Basic         
## 95%   ( 2.25,  3.75 )  
## Calculations and Intervals on Original Scale
#Level      Basic         
#95%   ( 2.25,  3.75 )
# 95% Conf.int: 2.25 < median=3 < 3.75 # ordinary bootstr. ("boot()")

# Bootstrap with a simple script  (Larget, 2014, modified, http://www.stat.wisc.edu/~larget/stat302/chap3.pdf)
Boot.runs = 10000 
   
# Runs a bootstrap using sample()
boot.samples = matrix(sample(test.sam.sm, size = Boot.runs * n.sm, 
                             replace = TRUE),
                      Boot.runs, n.sm)
boot.statistics = apply(boot.samples, 1, mean) # calculates the mean for each row
median(boot.statistics) # median
## [1] 3
hist(boot.statistics) # histogram

#Kolmogorov-Smirnov test for normality
test.distrib <- rnorm(10000, mean = mean(boot.statistics), sd = sd(boot.statistics))
ks.test(test.distrib, boot.statistics) # p-value < 0.05, not normally distributed
## Warning in ks.test(test.distrib, boot.statistics): p-value will be
## approximate in the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  test.distrib and boot.statistics
## D = 0.1449, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Histogram of the Bootstrap distrib.
hist(boot.statistics,prob = T, col= grey(0.8), 
     main = "Posterior Distrib., N = 10,000/4",ylim = c(0, 4)) # histogram
text(mean(boot.statistics), 3.95, "mean", cex = 0.7)
abline(v = (mean(test.sam.sm)), lty = 4, col = "purple")
lines(density(boot.statistics), col = "blue", lty= 2)
lines(density(test.distrib),  col = "red")
text(4, 2.75, "red: normal dist.",cex =0.7,col = "red")
text(4, 1.55, "blue: Bootstrap dist.",cex =0.7,col = "blue")

s.error.boot.teste.1 = sd(boot.statistics) # st.dev. = 0.35 
s.error.boot.teste.1 
## [1] 0.3540818
# Non-parametr. Bootstrap (no assumptions, uses quantiles):
quantile(boot.statistics, c(0.025, 0.975)) # correct 95% quantiles OK!
##  2.5% 97.5% 
##  2.25  3.75
# 2.5% 97.5% 
# 2.25 3.75 
# 95% Conf.int: 2.25 < median=3 < 3.75 # ordinary boots. (script)

### Compare Methods, very small sample (n=4, not normally distrib.):
# 95% Conf.int.: 1.70 < mean=3 < 4.30 # Param. (distrib.: t)
# 95% Conf.int: 2.37 < median=3 < 3.63 # Bayes. bootstr. ("bayesboot()")
# 95% HDI:      2.39 < mean=3 < 3.64  # Bayes. bootstr. ("bayesboot()")
# 95% HDI = Highest Density Interval
# 95% Conf.int: 2.25 < median=3 < 3.75 # ordinary boots. (script)
# 95% Conf.int: 2.25 < median=3 < 3.75 # ordinary bootstr. ("boot()")
# The parametric (t) interval is again the widest; percentile-type Bootstrap 
# intervals tend to be somewhat too narrow for very small samples.
# All Bootstraps are very similar (within 5%). 
# "boot()" and this script give identical intervals here!



### Plot Comparison of Methods, very small sample N= 4
plot(boot.obj.sm, main = "Posterior Distribution, N=4")
abline(v = c(1.70,4.30), col= "red")
text(3.13, 0.55,"param.(t)", cex = 0.7, col = "red")
abline(v = c(2.25,3.75), col= "blue")
text(3.75, 0.6, "boot()", cex = 0.7, col = "blue")
abline(v = c(2.25,3.75), col= "purple")
text(3.75, 0.5, "script", cex = 0.7, col = "purple")
abline(v = c(2.39,3.64), col= "orange")
text(2.35, 0.5, "HDI. bayesb.", cex = 0.7, col = "orange")
abline(v = c(2.37,3.63), col= "grey")
text(2.36, 0.58, "C.I. bayesb.", cex = 0.7, col = "grey")

### Example II.2. Bootstrap the slope of a linear model
# Objectives:
# Obtain 95% Conf. Intervals and st.errors for the slope. 
# Estimate the probability "p" that the slope is equal to zero. 


# Bootstrap example with linear regression
# calculate the "p" of the linear model by Bootstrapping
# and calculate Bootstrap 95% confidence interval of the slope

  x <- seq(1,20) # n = 20 numbers (1,...,20)
     # x
  a = 10; b = 5 # a: intercept, b: slope
  
  y <- a + b*x 
  
    # y
  
  plot(y ~ x)

  # Rand = rnorm(20, 0, sd = 15)
  Rand =  c(4.511, 10.20932,15.7684,5.44541, 7.40814,
     6.59113,  6.8693, -22.10925, 14.97867, -8.50671,
     27.60960, 0.44229, 17.20696, 5.53542, -0.09821,
     11.04133,-6.610431, 18.8136, 12.93610, 27.8513)
  
  yr <- y + Rand
  
  plot(yr ~ x)
  
  model.1 <- lm(yr ~ x)
  
  summary(model.1)
## 
## Call:
## lm(formula = yr ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.8921  -4.7467   0.8192   7.8392  19.6126 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13.5450     5.5417   2.444    0.025 *  
## x             5.4047     0.4626  11.683 7.75e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.93 on 18 degrees of freedom
## Multiple R-squared:  0.8835, Adjusted R-squared:  0.877 
## F-statistic: 136.5 on 1 and 18 DF,  p-value: 7.755e-10
  # param. "p"-value: 7.7e-10***, R-squared:  0.88 
  # slope: 5.40, st. error of the slope: 0.46 
  
  # param. 95% conf. int of the slope: 4.5 < slope=5.4 < 6.3
   5.4 - 1.96 *0.46 ; 1.96 *0.46 +5.4 
## [1] 4.4984
## [1] 6.3016
  # Summary table:
  # Estimate Std. Error t value Pr(>|t|)    
  #(Intercept)  13.54     5.54   2.4    0.025 *  
  #  x             5.40     0.46  11.68 7.75e-10 ***
  
   (R.squ.1 <- (cor.test(yr ,x)$estim[[1]])^2) # R² = 0.88
## [1] 0.8834915
  plot(yr ~ x)
  abline(model.1, col = "blue")

# Now Bootstrap
# TAKE A SAMPLE with pairs of X,Y. 
# Use "replace = TRUE", with repeated pairs.

  df.1 <- data.frame(x, yr)
  #View(df.1)
  
  index <- 1:length(yr)
  # ?sample
sample(index, length(yr), replace = TRUE) # samples 20 index values
##  [1] 20  4  2  5 20 11  8  7  6 11  5 13  2 10 15  3 16  6 18 17
    index.sam <- sample(index, length(yr), replace = TRUE) 
    # a sample of 20 index values, n=20
    df.sam <- df.1
    df.sam$x <- df.1$x[index.sam]
    df.sam$yr <- df.1$yr[index.sam]
  
    #View(df.sam)
    plot(df.sam$yr ~  df.sam$x)
    model.sam <- lm(df.sam$yr ~  df.sam$x) 
    abline(model.sam, col = "blue")
    summary(model.sam)
## 
## Call:
## lm(formula = df.sam$yr ~ df.sam$x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.055  -3.674  -1.274   6.833  20.259 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.5289     3.7402   5.221 5.77e-05 ***
## df.sam$x      4.8020     0.4092  11.735 7.22e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.429 on 18 degrees of freedom
## Multiple R-squared:  0.8844, Adjusted R-squared:  0.878 
## F-statistic: 137.7 on 1 and 18 DF,  p-value: 7.221e-10
    model.sam$coef[2] # slope of the linear model (based on the sample)
## df.sam$x 
## 4.801994
    # as function
    
    slope.sam.fun1 <- function (X, Y) {

      df.1 <- data.frame(X, Y)  

      # resample whole (X, Y) pairs with replacement, via the index
      index <- 1:length(Y)
      index.sam <- sample(index, length(Y), replace = TRUE) 
      # a sample of 20 index values, n=20
      df.sam <- df.1
      df.sam$X <- df.1$X[index.sam]
      df.sam$Y <- df.1$Y[index.sam]
      
      # refit the linear model on the resampled pairs
      model.sam <- lm(df.sam$Y ~  df.sam$X) 
      model.sam$coef[2] # slope of the linear model (based on the sample)
    } # END OF FUNCTION
    
    slope.sam.fun1(x,yr)
## df.sam$X 
## 5.440905
    # As LOOP  
    
    # 4000 BOOTSTRAP RUNS 
    # careful, may take a moment (4000 linear-model fits)
    
    boot.runs <- 4000
    
    boot.slope.vec <- 1:boot.runs
    
    for (i in 1:boot.runs) {  
      
      slope.sam <-  slope.sam.fun1(x,yr)
      boot.slope.vec[i] <- slope.sam
      
    }

    #boot.slope.vec # output slope vector for posterior distrib.
    
    hist(boot.slope.vec)

    median(boot.slope.vec) # overall median: 5.4
## [1] 5.404659
    #Kolmogorov-Smirnov test for normality
    test.distrib <- rnorm(10000, mean = mean(boot.slope.vec), sd = sd(boot.slope.vec))
    ks.test(test.distrib, boot.slope.vec) # p-value < 0.05, not normally distributed
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  test.distrib and boot.slope.vec
## D = 0.03085, p-value = 0.008693
## alternative hypothesis: two-sided
    # histogram of the Bootstrap distribution
    hist(boot.slope.vec, col= grey(0.8), prob= TRUE,
         main = "Posterior Distribution, slope, n=20",ylim = c(0, 4)) # histogram
    text(mean(boot.slope.vec), 3.95, "mean", cex = 0.7)
    abline(v = (mean(boot.slope.vec)), lty = 4, col = "purple")
    lines(density(boot.slope.vec), col = "blue", lty= 2)
    lines(density(test.distrib),  col = "red")
    text(4.5, 2.75, "red: normal dist.",cex =0.7,col = "red")
    text(4.5, 1.55, "blue: Bootstrap dist.",cex =0.7,col = "blue")

    s.error.boot.teste.1 = sd(boot.slope.vec) # st.dev. = 0.39 
    s.error.boot.teste.1 
## [1] 0.3923248
    # Non-parametr. Bootstrap (no assumptions, uses quantiles):
    quantile(boot.slope.vec, c(0.025, 0.975)) # correct 95% quantiles OK!
##     2.5%    97.5% 
## 4.636483 6.201019
    # 2.5%   97.5% 
    # 4.636  6.201 
    # 95% Conf.int: 4.64 < median=5.40 < 6.20 # ordinary boots. (script)
    # param. 95% conf. int of the slope: 4.5 < slope=5.4 < 6.3
    # very similar conf. intervals...
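    # As a cross-check, the same pairs Bootstrap can be run with package "boot"
    # (a minimal sketch; slope.fun is a made-up helper, not part of the original script):
    slope.fun <- function(data, d) coef(lm(yr ~ x, data = data[d, ]))[2]
    boot.slope.obj <- boot(data.frame(x, yr), slope.fun, R = 4000)
    boot.ci(boot.slope.obj, type = "perc") # should be close to the quantiles above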
  
    ### Calculate "p" of the model (i.e., "p" of the slope)
    
       # Calculate "p" from the Kernel density of the Posterior
        plot(density(boot.slope.vec)$x, density(boot.slope.vec)$y,
             xlim = c(0, max(boot.slope.vec)))

            min(density(boot.slope.vec)$y) # p < 1.8e-05***
## [1] 1.799905e-05
       normalized.density <- density(boot.slope.vec)$y/(sum(density(boot.slope.vec)$y))
       plot(density(boot.slope.vec)$x,normalized.density,
                        xlim = c(0, max(boot.slope.vec)))

      sum(normalized.density) # = 1      
## [1] 1
      min(normalized.density) # p < 1.42e-07***, p<0.001***
## [1] 1.419971e-07
       # Calculate "p" from actual counts in the Posterior
      hist.1 <- hist(boot.slope.vec, col= grey(0.8), prob= TRUE,
           main = "Posterior Distribution, slope, n=20",
           ylim = c(0, 4), xlim = c(0, max(boot.slope.vec))) # histogram

      normalized.counts <- hist.1$counts/(sum(hist.1$counts))
      sum(normalized.counts)# = 1 
## [1] 1
      min(normalized.counts)# p< 0.00025***, p<0.001***
## [1] 0.00025
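      # A simpler count-based check: the share of Bootstrap slopes at or below zero.
      # With all 4000 resampled slopes positive, this returns 0, i.e. p < 1/boot.runs:
      mean(boot.slope.vec <= 0) # 0 here, i.e. p < 1/4000 = 2.5e-04***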
      # plot x, yr with 95% confidence envelope (package "ggplot2")
      
      library (ggplot2)
      
      df <-  data.frame(x,yr)
     
      ggplot(df, aes(x=x, y=yr))+
        geom_point()+
        geom_smooth(method=lm, se=TRUE)

      # ?geom_smooth
      
      
      # References
      #
      # Anderson, M.J. 2001. A new method for non-parametric multivariate analysis of variance. Austral Ecology, 26: 32-46.
      #
      # Larget, B. 2014. Bootstrap Examples. www.stat.wisc.edu.
      # http://www.stat.wisc.edu/~larget/stat302/chap3.pdf
      #
      # Rubin, D.B. 1981. The Bayesian Bootstrap. The Annals of Statistics, 9(1): 130-134.
      # https://projecteuclid.org/download/pdf_1/euclid.aos/1176345338
      
      
                  #### End of Script ###