PERMANOVA (simple example with 2 samples)
am.A <- 1:5 # sample A
am.B <- 11:15 # sample B
# Difference between the means of samples A and B
diff.AB <- mean(am.A)-mean(am.B)
diff.AB
## [1] -10
## I.0. Basic Question: Is the observed difference significant?
# standard test: Student's t-test
t.test(am.A, am.B) # p-value = 8.488e-06*** (p <0.001)
##
## Welch Two Sample t-test
##
## data: am.A and am.B
## t = -10, df = 8, p-value = 8.488e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -12.306004 -7.693996
## sample estimates:
## mean of x mean of y
## 3 13
# non-parametric test: PERMANOVA
## I.1. Create a Dataframe for analysis
Data.2 <- data.frame(A = am.A, B = am.B)
Data.3 <- stack(Data.2)
#View(Data.3)
names(Data.3) # "values" "ind"
## [1] "values" "ind"
names(Data.3) <- c("values", "grupo" )
names(Data.3) # "values" "grupo"
## [1] "values" "grupo"
t.test(Data.3$values ~Data.3$grupo) # p-value = 8.488e-06
##
## Welch Two Sample t-test
##
## data: Data.3$values by Data.3$grupo
## t = -10, df = 8, p-value = 8.488e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -12.306004 -7.693996
## sample estimates:
## mean in group A mean in group B
## 3 13
## I.2. Take a random sample with values all mixed up,
## but factors ("A", "B") unchanged
sample (Data.3$values,10, replace = FALSE)
## [1] 13 3 11 14 15 1 2 4 5 12
## I.3. Take such a mixed up random sample,
## then create a new data frame with this sample
Data.sam1 <-Data.3
Data.sam1$values <- sample (Data.3$values,10, replace = FALSE)
##View(Data.sam1)
sam.A <- Data.sam1$values[1:5]
sam.B <- Data.sam1$values[6:10]
t.test(sam.A, sam.B) # p-value = n.s.
##
## Welch Two Sample t-test
##
## data: sam.A and sam.B
## t = 0.67135, df = 7.6582, p-value = 0.5217
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.908252 10.708252
## sample estimates:
## mean of x mean of y
## 9.2 6.8
## I.4. Calculate the difference between "A" and "B" in the "mixed up" random sample
diff.AB <- mean(sam.A)-mean(sam.B)
diff.AB # difference between "A" and "B" in a "mixed up" random sample
## [1] 2.4
## I.5. Create a function that does all of this:
## it takes a "mixed up" random sample
## and then calculates the difference between "A" and "B".
## new function "sam_diff.fun1()"
# As Function
sam_diff.fun1 <- function (DF) {
# shuffle the values; the group labels ("A", "B") stay in place
Data.sam1 <- DF
colnames(Data.sam1)[1] <- "values"
Data.sam1$values <- sample(Data.sam1$values, length(Data.sam1$values),
replace = FALSE)
sam.A <- Data.sam1$values[1:5]
sam.B <- Data.sam1$values[6:10]
diff.AB <- mean(sam.A)-mean(sam.B)
diff.AB
} # END OF FUNCTION
sam_diff.fun1(Data.3)
## [1] 1.2
## I.6. Apply this function many times (10,000 times)
# Each time, you get a different permutation
# and a different value for the difference between the means.
# This yields 10,000 values of the difference between means;
# their distribution is the permutation distribution,
# an approximation of the null distribution.
# Analyse: Is your originally observed difference within the null distribution?
# Loop to obtain the null distribution
Nperm <- 10000 # Number of permutations
diff.vec <- rep(0,Nperm)
for (i in 1:Nperm) {
diff.sam <- sam_diff.fun1(Data.3)
diff.vec[i] <- diff.sam
} # END OF LOOP
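# The same null distribution can be built more compactly with replicate()
# (a sketch; "diff.vec2" is just an illustrative name - the result matches
# diff.vec up to random variation):
diff.vec2 <- replicate(Nperm, {
v <- sample(Data.3$values) # shuffle the values, labels stay fixed
mean(v[1:5]) - mean(v[6:10]) # "A" positions minus "B" positions
})
summary(diff.vec2)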
## I.7. Analyse the distribution obtained by permutation,
# i.e., analyse the shape of the distribution obtained by repeatedly shuffling and re-splitting,
# and determine where your originally observed difference is located within the null distribution.
# Analyze Loop results
summary(diff.vec)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -10.00000 -2.40000 0.00000 0.01448 2.40000 10.00000
hist.graph1 <- hist(diff.vec, breaks = seq(-20, 20, length.out = 201))

hist.graph1$counts
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [18] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [35] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 36 0
## [52] 0 0 0 0 0 0 0 0 0 0 35 0 77 0 113 0 157
## [69] 0 188 0 161 0 135 0 77 0 88 0 89 0 199 0 318 0
## [86] 467 0 532 0 607 0 580 0 457 0 339 0 238 0 170 0 247
## [103] 0 363 0 470 0 543 0 588 0 612 0 462 0 306 0 187 0
## [120] 91 0 67 0 82 0 114 0 174 0 198 0 168 0 96 0 81
## [137] 0 39 0 0 0 0 0 0 0 0 0 0 0 49 0 0 0
## [154] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [171] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [188] 0 0 0 0 0 0 0 0 0 0 0 0 0
# create a nice graph
hist.graph1 <- hist(diff.vec, breaks = seq(-20, 20, length.out = 201),
prob = TRUE,
main = "Null Distribution of the diff. in means")
abline(v = -10, col = "green")
xydens <- density(diff.vec, from = -20, to = 20, n = 201)
xydens$y <- xydens$y*2 # rescale: the diffs fall on a 0.4 grid, so only every other 0.2-wide bin is occupied
lines(xydens, col = grey(0.3), lty = 2)
text(-11.4, 0.3, "obs.", col = "green", cex = 0.8)
## now calculate "p" and 95% Conf.Interval
# calculate the 95% Conf.Interval
quantile(diff.vec, c(0.025, 0.975)) # 95% Conf.Interval:-6.8 to 6.8
## 2.5% 97.5%
## -6.8 6.8
abline(v = -6.8,col = "grey"); abline(v = 6.8,col = "grey");
text(3, 0.32, "95% Conf. int.", col = "grey", cex = 0.8)
# calculate "p" based on counts (counting number of samples)
df.hist1 <- data.frame(hist.graph1$density,round(hist.graph1$mids,2),
hist.graph1$counts)
#View(df.hist1)
names(df.hist1)<- c("density", "mids", "counts")
#View(df.hist1)
df.hist1.10 <- subset(df.hist1,df.hist1$mids > -10.2)
df.hist1.10 <- subset(df.hist1.10,df.hist1.10$mids < -9.8)
#View(df.hist1.10)
sum(df.hist1.10$counts) # 36 permutation runs out of 10,000 gave "diff = -10"
## [1] 36
sum(df.hist1.10$counts)/Nperm # p = 0.0036** (p<0.01), one-tailed
## [1] 0.0036
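# A more direct way to get the same permutation p-value (a sketch, using the
# diff.vec computed above): count how often the permuted difference is at
# least as extreme as the observed -10.
mean(diff.vec <= -10) # one-tailed, ~0.0036 here
mean(abs(diff.vec) >= 10) # two-tailed version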
# calculate "p" based on the kernel density
xydens$x <- round(xydens$x,2)
p_vals <- xydens$y[xydens$x == (-10.0)] # p approx. 0.0058** (p<0.01)
mean (p_vals)
## [1] 0.005797957
## I.8. Use the packages "vegan" (function "adonis()") and "coin" for PERMANOVA
# very fast and simple
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.4-3

# ?adonis
# ?adonis2
adonis(Data.3$values~ Data.3$grupo,method = "euclidean")
##
## Call:
## adonis(formula = Data.3$values ~ Data.3$grupo, method = "euclidean")
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Data.3$grupo 1 250 250.0 100 0.92593 0.01 **
## Residuals 8 20 2.5 0.07407
## Total 9 270 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p-value = 0.01** (p <= 0.01, based on 999 permutations)
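# Note: recent vegan versions deprecate adonis() in favour of adonis2();
# an equivalent call would be (a sketch for newer vegan, not run here):
# adonis2(values ~ grupo, data = Data.3, method = "euclidean")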
library(coin)
## Loading required package: survival
# ?coin
# ?independence_test
independence_test(Data.3$values~ Data.3$grupo)
##
## Asymptotic General Independence Test
##
## data: Data.3$values by Data.3$grupo (A, B)
## Z = -2.8868, p-value = 0.003892
## alternative hypothesis: two.sided
# p-value = 0.0039** (p<0.01)
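# By default, independence_test() uses the asymptotic null distribution;
# a resampling-based version can be requested via its "distribution" argument
# (a sketch, not run here; the resampling-count argument is named B or
# nresample, depending on the coin version):
# independence_test(values ~ grupo, data = Data.3,
#                   distribution = approximate(B = 10000))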
# II. Learning how to Bootstrap
### BOOTSTRAP simple examples
###
# Example II.1. Bootstrap the mean of a sample
# Objectives:
# Obtain 95% Conf. Interval and st.errors for the mean of a sample.
# Obtain the "p" value for the null hypothesis that the mean of the sample is zero.
## II.1 Bootstrap the mean of a sample
#
# Basic principle:
# The Bootstrap creates a long list of new random samples taken from
# the original data.
# The frequency distribution of the Bootstrap results is analyzed
# and compared to the original data.
#
# Basic procedure:
# Use the function "sample()" with "replace = TRUE",
# this will create many repeated values.
# Bootstrap sample size = original sample size.
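# A minimal illustration of sampling with replacement (toy data):
sample(c(2, 4, 6, 8), 4, replace = TRUE) # values may repeat, e.g. 6 6 2 8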
## II.1.a Bootstrap the mean of a LARGE sample.
# Large sample: n = 30
# test.sam <- rnorm (30,3,2)
# test.sam <- round(test.sam,3)
test.sam <- c(6.166, 3.010 ,4.832, 3.511, 5.910, -0.545, 4.334, 4.861, -0.107, 2.194, 1.890 ,3.827,
1.567, 3.392, 1.057, 1.382, 1.168, 6.794, 4.414, 1.726 ,-0.109, 1.407, 2.003, 3.155,
3.026, 3.361, 3.527, 5.224, 3.169, 3.962)
length(test.sam) #30
## [1] 30
media.teste <-mean(test.sam) #mean
media.teste
## [1] 3.0036
median(test.sam) # median = 3.16
## [1] 3.162
sd(test.sam) # st.dev = 1.87
## [1] 1.872134
shapiro.test(test.sam) # p > 0.05, normal distrib.
##
## Shapiro-Wilk normality test
##
## data: test.sam
## W = 0.98164, p-value = 0.8672
hist(test.sam, breaks = c(seq(-2,8, by = 2)) , main = "Original data, n=30")

# Calculate the Parametric 95% Conf.int., using the "t" distribution
# begin by calculating the parametric standard error of the mean
n = length(test.sam)
St.Error = sd(test.sam)/sqrt(n)
St.Error # Standard error of the mean = 0.342
## [1] 0.3418033
# Calculate the Parametric 95% Conf.int. using the "t" distribution and the St. Error
# mean - t(0.95,df) *St.Error ; mean + t(0.95,df) *St.Error
qt(0.975,29) # 2.05 , VALUE OF THE "t" distrib., approx. 2
## [1] 2.04523
# mean - 2*St.Error , mean + 2 *St.Error
mean(test.sam) - 2*St.Error ; mean(test.sam) + 2 *St.Error
## [1] 2.319993
## [1] 3.687207
# Param 95% Conf.int. (distrib.: t): 2.32 < mean < 3.69
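# Cross-check: t.test() gives the same parametric interval directly
# (it uses the exact t value 2.045 instead of the rounded 2):
t.test(test.sam)$conf.int # approx. 2.30 to 3.70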
# NOW BOOTSTRAP
# Bootstrap st.error and 95% Conf.int
# Principle: sampling with replacement (replace = TRUE),
# permits repetition of the same values
# sample(test.sam, size =n, replace = TRUE)
Boot.runs = 10000 # number of Bootstrap runs
n = length(test.sam) # sample size (30)
boot.samples = matrix(sample(test.sam, size = Boot.runs * n,
replace = TRUE),
Boot.runs, n)
# generates a matrix with 10,000 rows and 30 columns (one row per bootstrap sample)
dim(boot.samples) # number of rows and columns
## [1] 10000 30
#View(as.data.frame(boot.samples))
boot.statistics = apply(boot.samples, 1, mean)
# calculates the mean for each row (i.e., for each sample)
# gives a vector with 10000 means
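# rowMeans() computes the same vector of 10,000 means, just faster:
# boot.statistics <- rowMeans(boot.samples)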
mean(boot.statistics) # overall mean of means: 2.999,
## [1] 2.998966
median(boot.statistics) # overall median of means: 3.0,
## [1] 3.0041
hist(boot.statistics) # histogram

#Kolmogorov-Smirnov test for normality
test.distrib <- rnorm(10000, mean = mean(boot.statistics), sd = sd(boot.statistics))
ks.test(test.distrib, boot.statistics) # p-value>0.05 , it's normal
## Warning in ks.test(test.distrib, boot.statistics): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: test.distrib and boot.statistics
## D = 0.012, p-value = 0.4676
## alternative hypothesis: two-sided
# Histogram of the means of the samples obtained with the Bootstrap,
# i.e., histogram of the Bootstrap distribution,
# i.e., the "posterior" distrib., which approximates the sampling distribution of the mean.
hist(boot.statistics,prob = T, col= grey(0.8),
main = "Posterior Distrib., N = 10,000/30",xlim = c(1.5, 4.5),ylim = c(0, 4)) # histogram
text(mean(boot.statistics), 3.95, "mean", cex = 0.7)
abline(v = mean(boot.statistics), lty = 4, col = "purple")
lines(density(boot.statistics), col = "blue", lty= 2) # kernel density of the Bootstrap means
text(4, 1.55, "Bootstrap dist.",cex =0.7,col = "blue")
s.error.boot.teste.1 = sd(boot.statistics) # st.dev. = 0.339
s.error.boot.teste.1
## [1] 0.338719
# St. dev. of bootstrap distrib. = st. error of the mean !
# Param St.Error, Standard error of the mean = 0.342
abline(v = 3+ sd(boot.statistics), col= "grey"); abline(v = 3-sd(boot.statistics), col= "grey")
text(4, 3.95, "grey: st.dev.=S.E.",cex =0.7,col = "grey")
# Parametric Bootstrap (uses the "t" value * the boots. st.error)
me = (round(s.error.boot.teste.1,4))*2 # often used: 2
round(media.teste, 1) + c(-1, 1) * me
## [1] 2.3226 3.6774
# 95% Conf.Int. (t distrib.): 2.32 < mean=3 < 3.68 # parametr. Bootstrap (uses 2, approx. the t value)
abline(v = 2.33, col= "purple"); abline(v = 3.67, col= "purple")
text(2.33, 1.5, "par.(t) boot. conf.int.",cex =0.6,col = "purple")
# Non-parametr. ordinary Bootstrap (no assumptions, uses quantiles):
quantile(boot.statistics, c(0.025, 0.975)) # correct 95% quantiles OK!
## 2.5% 97.5%
## 2.329297 3.655268
# 2.5% 97.5%
# 2.329 3.655
# 95% Conf.Int.: 2.33 < mean < 3.66 # non-parametr. Bootstrap (no assumptions)
abline(v = 2.329, col= "green"); abline(v = 3.655, col= "green")
text(2.329, 2.5, "n.-par. boot. conf.int.",cex =0.6,col = "green")
abline(v = 2.32, col= "orange"); abline(v = 3.69, col= "orange")
# Param 95% Conf.int. (distrib.: t): 2.32 < mean < 3.69
text(2.32, 3.5, "param. (t) conf.int.",cex =0.6,col = "orange")

## Now calculate probability "p" that mean = ZERO
dens <- density(test.distrib) # Kernel density distrib.
dens.df <- data.frame( x= dens$x, y= dens$y)
plot(dens.df$x, dens.df$y, xlim = c(0, max(dens.df$x) ))

# View(dens.df)
min(dens.df$y) # 9.4e-06*** = minimum value of the density (at the lower end of dens.df$x)
## [1] 9.362449e-06
# p < 9.4e-06*** # prob. that mean = ZERO, p<0.001***
sum(boot.statistics <= 0)/Boot.runs # fraction of Bootstrap means <= 0
## [1] 0
# no Bootstrap mean was <= 0, so p < 1/Boot.runs = 1e-04 # prob. that mean = ZERO, p<0.001***
# large samples (e.g. N > 16): Common Bootstrap OK, no need for Bayesian Bootstrap.
# Small samples (e.g. N < 16): maybe use Bayesian Bootstrap?
# Very small samples (e.g. N < 8): use Bayesian Bootstrap!
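# The idea behind the Bayesian Bootstrap (Rubin, 1981) in a few lines:
# instead of resampling observations, draw random Dirichlet(1,...,1) weights
# over the observations and form weighted means. (A sketch of the principle,
# not bayesboot()'s exact implementation; "bb.means" is an illustrative name.)
bb.means <- replicate(4000, {
w <- diff(c(0, sort(runif(length(test.sam) - 1)), 1)) # Dirichlet(1,...,1) weights
sum(w * test.sam) # weighted mean of the original data
})
quantile(bb.means, c(0.025, 0.975)) # close to the bayesboot() interval below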
### Bayesian Bootstrap with "bayesboot()"
## video tutorial: Bayesian Bootstrap, https://www.youtube.com/watch?v=WMAgzr99PKE
# Rubin, D.B. (1981). "The Bayesian Bootstrap". The Annals of Statistics, 9(1), p. 130-134.
# Paper (Rubin, 1981): https://projecteuclid.org/download/pdf_1/euclid.aos/1176345338
# package "bayesboot"
library(bayesboot)
# ?bayesboot
boot.obj1 <- bayesboot(test.sam, mean)
boot.obj1
## The first 10 draws (out of 4000) from the Bayesian bootstrap:
##
## V1
## 1 2.467263
## 2 3.071117
## 3 2.863152
## 4 3.356218
## 5 2.865813
## 6 3.070229
## 7 3.448393
## 8 2.867503
## 9 3.312115
## 10 3.002879
## .. ...
##
## Use summary() to produce a summary of the posterior distribution.
summary(boot.obj1) # 95% Conf.int: 2.33 < median=3.00 < 3.69
## Bayesian bootstrap
##
## Number of posterior draws: 4000
##
## Summary of the posterior (with 95% Highest Density Intervals):
## statistic mean sd hdi.low hdi.high
## V1 3.008992 0.3402521 2.317517 3.673202
##
## Quantiles:
## statistic q2.5% q25% median q75% q97.5%
## V1 2.330311 2.78149 3.004157 3.238448 3.694705
##
## Call:
## bayesboot(data = test.sam, statistic = mean)
plot(boot.obj1)

# plots posterior distrib. and 95% Highest Density Intervals
### Common Bootstrap with "boot()"
# use the package "boot" - a bit more complicated... it needs indices
# http://www.mayin.org/ajayshah/KB/R/documents/boot.html
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:survival':
##
## aml
## The following object is masked from 'package:lattice':
##
## melanoma
media.indice.fun <- function(x, d) {
return(mean(x[d]))
}
indices <- 1:length(test.sam)
plot(test.sam ~indices) # OK

media.indice.fun(test.sam, indices[1:20]) # OK
## [1] 3.06915
media.indice.fun(test.sam, c(1:20)) # OK
## [1] 3.06915
media.indice.fun(test.sam, 1:20) # OK
## [1] 3.06915
# ?boot
boot.obj2 <- boot(test.sam, media.indice.fun, 9999)
# ordinary non-par. bootstrap, requires an index
# ?boot.ci
boot.ci(boot.obj2, type = "basic") # gives 95% Conf.int.
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot.obj2, type = "basic")
##
## Intervals :
## Level Basic
## 95% ( 2.353, 3.672 )
## Calculations and Intervals on Original Scale
# 95% ( 2.353, 3.672 )
# 95% Conf.int: 2.35 < mean=3.00 < 3.67 # package "boot", common non-parametr. Bootstrap
### Compare Methods (n=30, normal distr.):
# All methods are OK (see figure).
# All conf. intervals are very similar (within about 1%)
# for this large, normally distributed sample.
#
## This was a test with large N (n = 30)
##
### Now let's test these Bootstrap methods with a small N (n = 8)
## II.1.b Bootstrap the mean of a SMALL sample.
# SMALL sample: n = 8
test.sam.sm <- c(3,4,3,5,6,7,6,5)
length(test.sam.sm ) # n = 8
## [1] 8
# Param conf. int. with "t" distrib.
n.sm = length(test.sam.sm)
# param Conf.int
St.Error.sm = sd(test.sam.sm)/sqrt(n.sm)
St.Error.sm # = 0.515
## [1] 0.5153882
# Param 95% Conf.int.:
# mean - t(0.95,df) *St.Error , mean + t(0.95,df) *St.Error
t.value.sm <- qt(0.975, n.sm - 1) # 2.36, value of the "t" distrib. for df = 7 (n = 8)
# mean - 2.36*St.Error , mean + 2.36*St.Error
mean(test.sam.sm) - t.value.sm *St.Error.sm ; mean(test.sam.sm) + t.value.sm *St.Error.sm
## [1] 3.656301
## [1] 6.093699
# Param 95% Conf.int. (distrib.: t): 3.66 < mean=4.88 < 6.09
boot.obj.sm <- bayesboot(test.sam.sm, mean, R = 10000)
boot.obj.sm
## The first 10 draws (out of 10000) from the Bayesian bootstrap:
##
## V1
## 1 4.82125
## 2 4.85975
## 3 4.94150
## 4 4.27125
## 5 4.84325
## 6 4.86375
## 7 4.37800
## 8 4.21000
## 9 5.07600
## 10 5.86675
## .. ...
##
## Use summary() to produce a summary of the posterior distribution.
plot(boot.obj.sm)

summary(boot.obj.sm) # 95% Conf.int: 3.96 < median=4.87 < 5.76
## Bayesian bootstrap
##
## Number of posterior draws: 10000
##
## Summary of the posterior (with 95% Highest Density Intervals):
## statistic mean sd hdi.low hdi.high
## V1 4.868089 0.4569996 3.9825 5.77875
##
## Quantiles:
## statistic q2.5% q25% median q75% q97.5%
## V1 3.961469 4.56275 4.87475 5.184312 5.759756
##
## Call:
## bayesboot(data = test.sam.sm, statistic = mean, R = 10000)
# statistic q2.5% q25% median q75% q97.5%
# V1 3.96 4.56 4.87 5.18 5.76
# 95% Conf.int: 3.96 < median=4.87 < 5.76 # Bayes. bootstr. ("bayesboot()")
# 95% HDI: 3.98 < median=4.87 < 5.78
#Summary of the posterior (with 95% Highest Density Intervals):
# statistic mean sd hdi.low hdi.high
#V1 4.87 0.46 3.98 5.78
boot.obj.sm2 <- boot(test.sam.sm, media.indice.fun, 9999)
# ordinary non-par. bootstrap, requires an index
boot.ci(boot.obj.sm2, type = "basic") # gives 95% Conf.int.
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot.obj.sm2, type = "basic")
##
## Intervals :
## Level Basic
## 95% ( 4.000, 5.875 )
## Calculations and Intervals on Original Scale
# Level Basic
# 95% ( 4.000, 5.875 )
# 95% Conf.int: 4.00 < median=4.88 < 5.88 # ordinary bootstr. ("boot()")
# Bootstrap with a simple script (Larget, 2014, modified, http://www.stat.wisc.edu/~larget/stat302/chap3.pdf)
Boot.runs = 10000 # number of Bootstrap runs
boot.samples = matrix(sample(test.sam.sm, size = Boot.runs * n.sm,
replace = TRUE),
Boot.runs, n.sm)
boot.statistics = apply(boot.samples, 1, mean)#
median(boot.statistics) # median
## [1] 4.875
hist(boot.statistics) # histogram

#Kolmogorov-Smirnov test for normality
test.distrib <- rnorm(10000, mean = mean(boot.statistics), sd = sd(boot.statistics))
ks.test(test.distrib, boot.statistics) # p-value<0.05, not normally distrib.
## Warning in ks.test(test.distrib, boot.statistics): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: test.distrib and boot.statistics
## D = 0.0557, p-value = 6.717e-14
## alternative hypothesis: two-sided
# Histogram of the Bootstrap results
hist(boot.statistics,prob = T, col= grey(0.8),
main = "Posterior Distrib., N = 10,000/8",ylim = c(0, 4)) # histogram
text(mean(boot.statistics), 3.95, "mean", cex = 0.7)
abline(v = (mean(test.sam.sm)), lty = 4, col = "purple")
lines(density(boot.statistics), col = "blue", lty= 2)
lines(density(test.distrib), col = "red")
text(4, 2.75, "red: normal dist.",cex =0.7,col = "red")
text(4, 1.55, "blue: Bootstrap dist.",cex =0.7,col = "blue")

s.error.boot.teste.1 = sd(boot.statistics) # st.dev. = 0.476
s.error.boot.teste.1
## [1] 0.4762067
# Non-parametr. Bootstrap (no assumptions, uses quantiles):
quantile(boot.statistics, c(0.025, 0.975)) # correct 95% quantiles OK!
## 2.5% 97.5%
## 4.00 5.75
# 2.5% 97.5%
# 4.00 5.75
# 95% Conf.int: 4.00 < median=4.88 < 5.75 # ordinary bootstr. script
### Compare Methods, small sample (n=8, not normally distr.):
# 95% Conf.int: 3.66 < 4.88 < 6.09 # Param. (distrib.: t)
# 95% Conf.int: 3.96 < 4.88 < 5.76 # Bayes. bootstr. ("bayesboot()")
# 95% HDI: 3.98 < 4.87 < 5.78 # Bayes. bootstr. ("bayesboot()")
# 95% HDI = Highest Density Interval
# 95% Conf.int: 4.00 < 4.88 < 5.75 # ordinary bootstr. script
# 95% Conf.int: 4.00 < 4.88 < 5.88 # ordinary bootstr. ("boot()")
# The parametric (t) interval is the widest here; all Bootstraps are
# very similar (within about 5%) and somewhat narrower - typical for small samples.
### Plot Comparison of Methods, small sample (n= 8)
plot(boot.obj.sm, main = "N= 8")
abline(v = c(3.66,6.09), col= "red")
text(5.4, 0.55,"param.(t)", cex = 0.7, col = "red")
abline(v = c(4.00,5.88), col= "blue")
text(5.75, 0.6, "boot()", cex = 0.7, col = "blue")
abline(v = c(4.00,5.75), col= "purple")
text(5.67, 0.5, "script", cex = 0.7, col = "purple")
abline(v = c(3.98,5.78), col= "orange")
text(4.2, 0.5, "HDI bayesb.", cex = 0.7, col = "orange")
abline(v = c(3.96,5.76), col= "grey")
text(4.2, 0.58, "C.I. bayesb.", cex = 0.7, col = "grey")

##
### Now let's test the Bootstrap methods with a very small N (n = 4)
## II.1.c Bootstrap the mean of a VERY SMALL sample.
# VERY SMALL sample: n = 4
test.sam.sm <- c(2,3,3,4)
length(test.sam.sm ) # n = 4
## [1] 4
# Param conf. int. with "t" distrib.
n.sm = length(test.sam.sm)
# param Conf.int
St.Error.sm = sd(test.sam.sm)/sqrt(n.sm)
St.Error.sm # = 0.408
## [1] 0.4082483
# Param 95% Conf.int.:
# mean - t(0.95,df) *St.Error , mean + t(0.95,df) *St.Error
t.value.sm <- qt(0.975, n.sm - 1) # 3.18, value of the "t" distrib. for df = 3 (n = 4)
# mean - 3.18*St.Error , mean + 3.18*St.Error
mean(test.sam.sm) - t.value.sm *St.Error.sm ; mean(test.sam.sm) + t.value.sm *St.Error.sm
## [1] 1.700779
## [1] 4.299221
# Param 95% Conf.int. (distrib.: t): 1.70 < mean=3 < 4.30
boot.obj.sm <- bayesboot(test.sam.sm, mean, R = 10000)
boot.obj.sm
## The first 10 draws (out of 10000) from the Bayesian bootstrap:
##
## V1
## 1 2.95375
## 2 3.11500
## 3 2.65500
## 4 2.78850
## 5 3.03350
## 6 2.53500
## 7 2.88275
## 8 2.97800
## 9 2.51275
## 10 3.08950
## .. ...
##
## Use summary() to produce a summary of the posterior distribution.
plot(boot.obj.sm)

summary(boot.obj.sm) # 95% Conf.int: 2.37 < median=3.00 < 3.63
## Bayesian bootstrap
##
## Number of posterior draws: 10000
##
## Summary of the posterior (with 95% Highest Density Intervals):
## statistic mean sd hdi.low hdi.high
## V1 2.996386 0.3143268 2.38725 3.64425
##
## Quantiles:
## statistic q2.5% q25% median q75% q97.5%
## V1 2.369937 2.791 2.99825 3.201062 3.632256
##
## Call:
## bayesboot(data = test.sam.sm, statistic = mean, R = 10000)
# statistic q2.5% q25% median q75% q97.5%
# V1 2.37 2.79 3.00 3.20 3.63
# 95% Conf.int: 2.37 < median=3 < 3.63 # Bayes. bootstr. ("bayesboot()")
# 95% HDI: 2.39 < median=3 < 3.64
# Summary of the posterior (with 95% Highest Density Intervals):
# statistic mean sd hdi.low hdi.high
# V1 3.00 0.31 2.39 3.64
boot.obj.sm2 <- boot(test.sam.sm, media.indice.fun, 9999)
# ordinary non-par. bootstrap, requires an index
boot.ci(boot.obj.sm2, type = "basic") # gives 95% Conf.int.
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 9999 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot.obj.sm2, type = "basic")
##
## Intervals :
## Level Basic
## 95% ( 2.25, 3.75 )
## Calculations and Intervals on Original Scale
#Level Basic
#95% ( 2.25, 3.75 )
# 95% Conf.int: 2.25 < median=3 < 3.75 # ordinary bootstr. ("boot()")
# Bootstrap with a simple script (Larget, 2014, modified, http://www.stat.wisc.edu/~larget/stat302/chap3.pdf)
Boot.runs = 10000
# Runs a bootstrap using sample()
boot.samples = matrix(sample(test.sam.sm, size = Boot.runs * n.sm,
replace = TRUE),
Boot.runs, n.sm)
boot.statistics = apply(boot.samples, 1, mean) # calculates the mean for each row
median(boot.statistics) # median
## [1] 3
hist(boot.statistics) # histogram

#Kolmogorov-Smirnov test for normality
test.distrib <- rnorm(10000, mean = mean(boot.statistics), sd = sd(boot.statistics))
ks.test(test.distrib, boot.statistics) # p-value < 0.05, not normally distributed
## Warning in ks.test(test.distrib, boot.statistics): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: test.distrib and boot.statistics
## D = 0.1449, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Histogram of the Bootstrap distrib.
hist(boot.statistics,prob = T, col= grey(0.8),
main = "Posterior Distrib., N = 10,000/4",ylim = c(0, 4)) # histogram
text(mean(boot.statistics), 3.95, "mean", cex = 0.7)
abline(v = (mean(test.sam.sm)), lty = 4, col = "purple")
lines(density(boot.statistics), col = "blue", lty= 2)
lines(density(test.distrib), col = "red")
text(4, 2.75, "red: normal dist.",cex =0.7,col = "red")
text(4, 1.55, "blue: Bootstrap dist.",cex =0.7,col = "blue")

s.error.boot.teste.1 = sd(boot.statistics) # st.dev. = 0.354
s.error.boot.teste.1
## [1] 0.3540818
# Non-parametr. Bootstrap (no assumptions, uses quantiles):
quantile(boot.statistics, c(0.025, 0.975)) # correct 95% quantiles OK!
## 2.5% 97.5%
## 2.25 3.75
# 2.5% 97.5%
# 2.25 3.75
# 95% Conf.int: 2.25 < median=3 < 3.75 # ordinary boots. (script)
### Compare Methods, very small sample (n=4, not normally distr.):
# 95% Conf.int.: 1.70 < mean=3 < 4.30 # Param. (distrib.: t)
# 95% Conf.int: 2.37 < median=3 < 3.63 # Bayes. bootstr. ("bayesboot()")
# 95% HDI: 2.39 < median=3 < 3.64 # Bayes. bootstr. ("bayesboot()")
# 95% HDI = Highest Density Interval
# 95% Conf.int: 2.25 < median=3 < 3.75 # ordinary bootstr. (script)
# 95% Conf.int: 2.25 < median=3 < 3.75 # ordinary bootstr. ("boot()")
# The parametric (t) interval is by far the widest; all Bootstraps are very similar,
# but for very small samples the ordinary Bootstrap tends to be too narrow -
# one reason to prefer the Bayesian Bootstrap.
# "boot()" and this script give identical intervals here!
### Plot Comparison of Methods, very small sample N= 4
plot(boot.obj.sm, main = "Posterior Distribution, N=4")
abline(v = c(1.70,4.30), col= "red")
text(3.13, 0.55,"param.(t)", cex = 0.7, col = "red")
abline(v = c(2.25,3.75), col= "blue")
text(3.75, 0.6, "boot()", cex = 0.7, col = "blue")
abline(v = c(2.25,3.75), col= "purple")
text(3.75, 0.5, "script", cex = 0.7, col = "purple")
abline(v = c(2.39,3.64), col= "orange")
text(2.35, 0.5, "HDI bayesb.", cex = 0.7, col = "orange")
abline(v = c(2.37,3.63), col= "grey")
text(2.36, 0.58, "C.I. bayesb.", cex = 0.7, col = "grey")

### Example II.2. Bootstrap the slope of a linear model
# Objectives:
# Obtain 95% Conf. Intervals and st.errors for the slope.
# Estimate the probability "p" that the slope is equal to zero.
# Bootstrap example with linear regression
# calculate the "p" of the linear model by Bootstrapping
# and calculate Bootstrap 95% confidence interval of the slope
x <- seq(1,20) # n = 20 numbers (1,...,20)
# x
a = 10; b = 5 # a: intercept, b: slope
y <- a + b*x
# y
plot(y ~ x)

# Rand = rnorm(20, 0, sd = 15)
Rand = c(4.511, 10.20932,15.7684,5.44541, 7.40814,
6.59113, 6.8693, -22.10925, 14.97867, -8.50671,
27.60960, 0.44229, 17.20696, 5.53542, -0.09821,
11.04133,-6.610431, 18.8136, 12.93610, 27.8513)
yr <- y + Rand
plot(yr ~ x)
model.1 <- lm(yr ~ x)
summary(model.1)
##
## Call:
## lm(formula = yr ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.8921 -4.7467 0.8192 7.8392 19.6126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.5450 5.5417 2.444 0.025 *
## x 5.4047 0.4626 11.683 7.75e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.93 on 18 degrees of freedom
## Multiple R-squared: 0.8835, Adjusted R-squared: 0.877
## F-statistic: 136.5 on 1 and 18 DF, p-value: 7.755e-10
# param. "p"-value: 7.7e-10***, R-squared: 0.88
# slope: 5.40, st. error of the slope: 0.46
# param. 95% conf. int. of the slope: 4.50 < slope=5.40 < 6.30
5.4 - 1.96 *0.46 ; 1.96 *0.46 +5.4
## [1] 4.4984
## [1] 6.3016
# Summary table:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 13.54 5.54 2.4 0.025 *
# x 5.40 0.46 11.68 7.75e-10 ***
(R.squ.1 <- (cor.test(yr ,x)$estim[[1]])^2) # R² = 0.88
## [1] 0.8834915
plot(yr ~ x)
abline(model.1, col = "blue")

# Now Bootstrap
# TAKE A SAMPLE of pairs of X,Y.
# Use "replace = TRUE", so pairs may repeat.
df.1 <- data.frame(x, yr)
#View(df.1)
index <- 1:length(yr)
# ?sample
sample(index, length(yr), replace = TRUE) # samples 20 index values
## [1] 20 4 2 5 20 11 8 7 6 11 5 13 2 10 15 3 16 6 18 17
index.sam <- sample(index, length(yr), replace = TRUE)
# a sample of 20 index values, n=20
df.sam <- df.1
df.sam$x <- df.1$x[index.sam]
df.sam$yr <- df.1$yr[index.sam]
#View(df.sam)
plot(df.sam$yr ~ df.sam$x)
model.sam <- lm(df.sam$yr ~ df.sam$x)
abline(model.sam, col = "blue")
summary(model.sam)
##
## Call:
## lm(formula = df.sam$yr ~ df.sam$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.055 -3.674 -1.274 6.833 20.259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.5289 3.7402 5.221 5.77e-05 ***
## df.sam$x 4.8020 0.4092 11.735 7.22e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.429 on 18 degrees of freedom
## Multiple R-squared: 0.8844, Adjusted R-squared: 0.878
## F-statistic: 137.7 on 1 and 18 DF, p-value: 7.221e-10
model.sam$coef[2] # slope of the linear model (based on the sample)
## df.sam$x
## 4.801994
# as function
slope.sam.fun1 <-function (X,Y) {
# resample the pairs (X, Y) with replacement
index.sam <- sample(1:length(Y), length(Y), replace = TRUE)
X.sam <- X[index.sam]
Y.sam <- Y[index.sam]
model.sam <- lm(Y.sam ~ X.sam)
model.sam$coef[2] # slope of the linear model (based on the resampled pairs)
} # END OF FUNCTION
slope.sam.fun1(x,yr)
## X.sam
## 5.440905
# As LOOP
# 4000 BOOTSTRAP RUNS
# careful, takes time... may take several minutes
boot.runs <- 4000
boot.slope.vec <- 1:boot.runs
for (i in 1:boot.runs) {
slope.sam <- slope.sam.fun1(x,yr)
boot.slope.vec[i] <- slope.sam
}
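# Cross-check with boot(): the same pairs bootstrap for the slope
# (a sketch; "slope.indice.fun" is an illustrative helper in the style
# of media.indice.fun above):
slope.indice.fun <- function(data, d) {
model.d <- lm(yr ~ x, data = data[d, ])
model.d$coef[2] # slope of the resampled model
}
boot.slope.obj <- boot(df.1, slope.indice.fun, 4000)
boot.ci(boot.slope.obj, type = "perc") # similar to the quantiles below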

#boot.slope.vec # output slope vector for posterior distrib.
hist(boot.slope.vec)

median(boot.slope.vec) # overall median: 5.40,
## [1] 5.404659
#Kolmogorov-Smirnov test for normality
test.distrib <- rnorm(10000, mean = mean(boot.slope.vec), sd = sd(boot.slope.vec))
ks.test(test.distrib, boot.slope.vec) # p-value < 0.05, not normal
##
## Two-sample Kolmogorov-Smirnov test
##
## data: test.distrib and boot.slope.vec
## D = 0.03085, p-value = 0.008693
## alternative hypothesis: two-sided
# histogram of the Bootstrap distribution
hist(boot.slope.vec, col= grey(0.8), prob= TRUE,
main = "Posterior Distribution, slope, n=20",ylim = c(0, 4)) # histogram
text(mean(boot.slope.vec), 3.95, "mean", cex = 0.7)
abline(v = (mean(boot.slope.vec)), lty = 4, col = "purple")
lines(density(boot.slope.vec), col = "blue", lty= 2)
lines(density(test.distrib), col = "red")
text(4.5, 2.75, "red: normal dist.",cex =0.7,col = "red")
text(4.5, 1.55, "blue: Bootstrap dist.",cex =0.7,col = "blue")

s.error.boot.teste.1 = sd(boot.slope.vec) # st.dev. = 0.39
s.error.boot.teste.1
## [1] 0.3923248
# Non-parametr. Bootstrap (no assumptions, uses quantiles):
quantile(boot.slope.vec, c(0.025, 0.975)) # correct 95% quantiles OK!
## 2.5% 97.5%
## 4.636483 6.201019
# 2.5% 97.5%
# 4.64 6.20
# 95% Conf.int: 4.64 < median=5.40 < 6.20 # ordinary bootstr. (script)
# param. 95% conf. int. of the slope: 4.50 < slope=5.40 < 6.30
# very similar conf. intervals...
### Calculate "p" of the model (i.e., "p" of the slope)
# Calculate "p" from the Kernel density of the Posterior
plot(density(boot.slope.vec)$x, density(boot.slope.vec)$y,
xlim = c(0, max(boot.slope.vec)))

min(density(boot.slope.vec)$y) # p < 1.8e-05***
## [1] 1.799905e-05
normalized.density <- density(boot.slope.vec)$y/(sum(density(boot.slope.vec)$y))
plot(density(boot.slope.vec)$x,normalized.density,
xlim = c(0, max(boot.slope.vec)))

sum(normalized.density) # = 1
## [1] 1
min(normalized.density) # p < 1.42e-07***, p<0.001***
## [1] 1.419971e-07
# Calculate "p" from actual counts in the Posterior
hist.1 <- hist(boot.slope.vec, col= grey(0.8), prob= TRUE,
main = "Posterior Distribution, slope, n=20",
ylim = c(0, 4), xlim = c(0, max(boot.slope.vec))) # histogram

normalized.counts <- hist.1$counts/(sum(hist.1$counts))
sum(normalized.counts)# = 1
## [1] 1
min(normalized.counts)# p< 0.00025***, p<0.001***
## [1] 0.00025
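# Simpler direct estimate from the Bootstrap runs (a sketch):
mean(boot.slope.vec <= 0) # = 0 here: no run gave a slope <= 0
# so p < 1/boot.runs = 2.5e-04, p<0.001***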
# plot x, yr with 95% confidence envelope (package "ggplot2")
library (ggplot2)
df <- data.frame(x,yr)
ggplot(df, aes(x=x, y=yr))+
geom_point()+
geom_smooth(method=lm, se=TRUE)

# ?geom_smooth
# References
#
# Anderson, M.J. (2001). A new method for non-parametric multivariate analysis of variance. Austral Ecology 26: 32-46.
#
# Larget, B. (2014). Bootstrap Examples. http://www.stat.wisc.edu/~larget/stat302/chap3.pdf
#
# Rubin, D.B. (1981). The Bayesian Bootstrap. The Annals of Statistics 9(1): 130-134.
# https://projecteuclid.org/download/pdf_1/euclid.aos/1176345338
#### End of Script ###