knitr::opts_chunk$set(echo = TRUE)

Participants

  1. AKINGENEYE Genevieve 223010119
  2. ⁠Umutoniwase Mireille 223012675
  3. NDAYISABA Derrick 223018763
  4. BIZIMANA Castule. 223014418
  5. MUGENZI Constantin 223013503
  6. IYADUHANZE Olivier 223014487
  7. Kwizera Theogene 223017020
  8. Niyobyose Brancal 223014431

Bootstrap Methods

rm(list = ls())

What is the Bootstrap?

The bootstrap is a resampling technique used to approximate the sampling distribution of a statistic when the theoretical distribution is unknown or complicated, Resample with replacement from the observed data and recompute the statistic many times.

This sometimes happens when we have access on Sample Only

The Original Sample

set.seed(2025)
original_sample <- sample(10:50, 20, replace = T)
means <- c()

Generating Resamples With Replacement from the Original Sample

for(i in 1:1000){
  resample <- sample(original_sample, size = length(original_sample), replace = TRUE)
  means[i] <- mean(resample)
}
means
##    [1] 35.55 32.65 26.60 28.90 31.10 32.35 30.30 31.45 28.80 29.75 28.15 29.55
##   [13] 30.55 30.55 28.10 32.30 30.30 27.05 28.30 31.75 27.85 28.05 32.70 29.80
##   [25] 27.65 29.75 29.65 29.65 26.65 30.85 29.20 31.70 25.20 29.20 31.35 27.15
##   [37] 25.90 27.80 23.85 30.95 30.25 25.55 27.65 30.60 28.25 27.90 33.45 28.65
##   [49] 28.25 25.90 29.25 28.65 28.50 29.20 30.55 25.55 30.80 28.40 27.45 29.25
##   [61] 31.70 30.75 31.15 31.90 27.40 31.80 28.35 30.25 33.10 30.30 35.00 28.20
##   [73] 27.25 29.75 32.65 33.55 28.20 32.25 28.95 29.25 30.50 27.55 31.95 28.75
##   [85] 31.90 29.25 33.75 26.75 25.05 27.90 33.55 33.20 30.05 31.25 26.15 29.15
##   [97] 28.15 29.90 28.20 26.65 23.75 31.50 29.15 28.05 29.05 27.60 26.85 30.35
##  [109] 30.90 30.50 32.60 29.50 28.20 29.70 29.55 29.55 29.10 31.25 30.55 32.90
##  [121] 25.90 28.60 29.35 31.25 29.25 29.95 26.30 27.35 28.45 29.30 34.95 29.55
##  [133] 27.70 26.85 27.55 29.00 30.95 27.45 27.30 28.65 30.40 34.85 29.10 28.85
##  [145] 30.95 26.45 28.65 27.55 29.85 31.20 28.90 26.55 27.05 30.55 29.50 26.70
##  [157] 28.90 28.85 30.85 30.15 24.55 32.70 28.85 28.60 29.60 30.60 30.55 29.50
##  [169] 29.45 32.55 27.55 35.50 25.50 27.60 31.80 32.10 26.70 31.35 30.15 26.70
##  [181] 31.35 32.05 28.80 33.75 32.85 25.45 33.35 30.25 32.45 31.80 26.65 34.10
##  [193] 32.75 31.50 28.95 30.55 30.80 27.05 28.55 30.95 25.65 27.20 32.20 31.15
##  [205] 27.85 34.30 30.20 30.50 27.80 31.10 32.70 29.60 29.25 29.55 31.45 31.45
##  [217] 29.05 29.90 31.85 26.55 31.30 30.00 28.90 27.05 29.20 28.80 31.70 27.10
##  [229] 30.25 33.05 26.65 28.05 33.00 27.40 29.85 31.15 32.50 29.95 27.55 32.15
##  [241] 27.95 29.55 32.80 32.15 31.50 29.55 29.85 33.50 29.20 29.75 30.85 30.75
##  [253] 29.45 34.70 29.00 25.10 31.90 31.25 26.55 27.20 33.40 31.80 26.65 31.55
##  [265] 31.65 28.45 31.10 27.50 29.75 36.70 29.60 30.60 35.55 31.85 25.20 27.95
##  [277] 28.10 29.40 30.75 31.15 29.05 29.70 28.80 30.45 30.00 30.50 27.20 28.80
##  [289] 31.30 28.10 31.90 28.70 32.20 29.25 29.85 33.15 33.30 26.95 27.30 30.85
##  [301] 25.30 32.45 22.00 26.75 30.75 33.95 30.85 31.80 28.40 28.15 26.65 26.95
##  [313] 30.50 29.10 31.20 29.55 30.60 34.50 27.45 30.75 29.35 25.95 29.30 29.05
##  [325] 31.25 31.95 27.30 31.35 35.55 34.80 30.75 30.70 29.40 32.10 31.35 32.00
##  [337] 28.10 29.20 29.90 30.00 28.80 28.90 27.70 29.90 25.90 27.45 29.10 25.90
##  [349] 28.60 30.40 29.60 31.25 32.45 27.65 31.25 30.60 31.75 27.60 28.35 26.10
##  [361] 29.85 27.20 26.60 31.55 29.75 31.70 27.85 32.40 30.55 28.35 29.90 29.05
##  [373] 31.00 29.15 30.65 27.15 28.40 29.20 31.00 24.95 29.95 33.35 30.25 27.75
##  [385] 28.75 31.85 27.60 29.95 27.95 29.40 30.45 31.50 23.60 28.40 24.80 29.00
##  [397] 26.50 22.20 30.30 28.70 31.80 32.75 29.55 30.85 28.00 34.20 32.70 30.90
##  [409] 29.75 29.40 31.00 31.65 30.95 34.20 27.95 29.95 27.95 25.35 33.05 30.25
##  [421] 30.05 28.25 30.00 29.30 28.25 31.50 30.65 26.95 27.05 28.00 33.20 30.70
##  [433] 29.15 27.30 27.90 27.10 32.50 27.05 32.85 29.00 29.70 26.40 29.65 31.35
##  [445] 31.15 28.40 31.65 29.30 32.25 28.70 30.50 26.05 33.80 31.65 30.30 31.80
##  [457] 29.60 29.80 31.80 31.60 30.00 28.20 27.95 31.20 32.10 29.00 33.15 30.05
##  [469] 29.30 29.35 26.60 32.50 27.95 33.15 28.70 30.45 29.30 29.50 26.15 32.05
##  [481] 30.25 33.75 30.15 29.35 28.75 27.70 32.75 29.85 30.00 25.90 32.00 33.25
##  [493] 33.45 31.00 25.75 29.30 30.35 29.85 32.35 29.25 30.80 31.35 31.95 26.85
##  [505] 30.35 31.20 35.20 25.15 32.30 28.45 25.05 29.85 30.75 28.75 30.60 28.00
##  [517] 32.75 26.40 28.20 26.80 24.85 28.90 31.60 33.65 27.95 25.05 27.70 35.75
##  [529] 31.30 29.70 28.45 24.85 31.25 29.55 25.55 33.80 29.45 31.05 28.45 26.25
##  [541] 28.65 33.65 32.40 36.35 26.45 29.95 30.45 30.00 30.35 24.10 27.95 28.50
##  [553] 31.40 32.25 27.10 23.85 30.10 28.95 27.45 29.40 29.40 36.85 30.55 29.80
##  [565] 27.30 29.65 28.45 28.00 26.65 28.15 30.70 31.35 28.75 31.95 32.10 33.55
##  [577] 26.50 29.25 34.25 29.45 26.00 30.10 27.35 30.90 28.05 30.25 26.00 28.50
##  [589] 30.25 28.85 25.05 30.10 29.60 32.60 31.25 30.65 31.25 33.40 32.25 29.95
##  [601] 30.95 30.00 28.10 29.70 33.40 28.45 33.65 34.75 34.00 31.65 26.15 31.80
##  [613] 27.40 26.05 33.30 27.05 31.10 30.55 27.80 29.90 33.00 30.50 31.90 28.60
##  [625] 25.60 34.50 28.60 27.85 29.70 31.15 29.05 32.40 24.05 31.85 26.60 30.35
##  [637] 32.70 30.95 28.65 27.45 31.85 30.45 32.55 33.20 29.40 32.25 31.60 30.25
##  [649] 31.35 28.50 26.90 28.90 34.00 32.75 30.45 29.40 29.10 27.80 29.40 29.75
##  [661] 25.40 31.60 32.60 28.75 33.20 32.05 32.20 34.20 25.40 26.90 27.65 27.35
##  [673] 29.25 27.75 30.40 27.90 28.65 28.35 28.80 30.85 30.95 27.90 34.20 32.95
##  [685] 27.85 29.85 27.15 24.80 28.45 33.60 27.50 27.75 29.35 27.95 24.10 29.30
##  [697] 33.15 29.75 28.10 28.10 26.25 28.60 26.05 28.85 28.40 32.10 28.15 29.50
##  [709] 24.90 27.55 32.45 26.85 30.95 29.45 29.45 28.75 34.00 26.05 26.10 31.50
##  [721] 30.40 25.95 29.40 31.05 31.90 31.75 30.50 27.25 25.05 28.90 31.70 28.85
##  [733] 30.15 32.10 26.90 28.20 34.35 26.80 34.60 26.80 33.65 30.90 26.25 28.70
##  [745] 34.25 31.05 29.40 28.55 30.20 28.85 28.30 26.20 26.90 29.00 27.15 28.85
##  [757] 30.30 25.10 31.10 28.85 27.40 32.35 30.70 30.15 32.60 33.00 26.30 26.80
##  [769] 26.85 32.85 30.35 30.40 29.15 32.40 27.05 30.90 29.00 26.35 28.55 29.70
##  [781] 26.20 27.85 29.45 31.45 32.15 32.70 27.05 29.65 27.45 32.70 27.35 29.55
##  [793] 30.05 31.80 30.60 27.50 30.70 28.30 31.75 27.15 26.40 30.80 29.25 32.65
##  [805] 32.20 25.45 30.05 31.25 32.25 27.85 28.30 30.45 33.80 26.95 28.45 27.60
##  [817] 31.50 28.15 31.70 30.95 29.40 30.10 28.55 27.10 26.60 30.15 28.50 32.90
##  [829] 25.80 25.85 30.70 29.25 26.75 30.85 29.40 28.65 29.50 31.30 34.40 27.15
##  [841] 30.70 28.85 34.50 33.15 31.25 29.05 27.80 31.85 32.05 31.05 30.00 28.40
##  [853] 29.80 26.00 25.70 29.40 30.95 27.25 26.15 28.90 33.70 25.30 30.90 27.85
##  [865] 28.10 26.45 31.25 28.50 33.40 28.20 30.95 26.60 28.45 32.60 30.50 28.30
##  [877] 26.65 32.90 29.55 27.50 31.70 30.25 26.55 30.35 26.35 30.30 26.65 26.75
##  [889] 27.45 30.85 30.50 28.15 29.80 32.25 28.35 29.05 31.00 25.00 32.90 29.80
##  [901] 31.10 26.65 27.15 32.60 27.00 32.25 26.10 28.65 27.35 32.45 27.60 30.55
##  [913] 31.15 25.20 30.30 33.55 27.05 31.05 28.50 29.55 33.95 30.60 31.60 33.10
##  [925] 27.05 29.45 26.10 30.55 26.20 32.75 31.05 24.45 23.90 31.95 27.05 23.30
##  [937] 29.00 29.05 27.60 25.35 25.95 32.00 29.25 28.55 34.60 30.20 27.30 26.50
##  [949] 30.65 29.65 31.10 29.30 28.45 32.20 29.95 32.55 28.95 31.45 30.90 30.35
##  [961] 29.50 31.85 27.05 27.55 25.50 32.55 31.30 32.05 30.55 30.30 28.35 26.85
##  [973] 30.00 30.65 28.35 32.05 31.10 28.70 27.90 32.90 32.90 27.80 26.95 28.25
##  [985] 27.30 29.10 30.15 34.40 29.30 29.90 28.00 32.80 31.70 29.55 31.40 25.20
##  [997] 32.80 28.20 28.65 26.50

Plotting The Generated Resamples To see The Distribution

hist(means,probability = T,plot = T)
lines(density(means),lwd = 3, col = "blue")
abline(v = mean(means), lwd = 2, lty = 2, col = "red")

cat("Data we have is Normally Distributed Pupulation with mean ", mean(means),"And Standard Deivation ", sd(means))
## Data we have is Normally Distributed Pupulation with mean  29.62505 And Standard Deivation  2.387335

\[ \text{Means ~} N(29.62, 2.38) \]

Bootstrap Confidence Intervals

cat("Confidence Interval of The Generated Resamples is: \n")
## Confidence Interval of The Generated Resamples is:
quantile(means, probs = c(0.05, 0.95))
##      5%     95% 
## 25.8475 33.5500

The Confidence Interval Gives The Range of Population Mean

Basics of Markov Chain Monte Carlo (MCMC)

What is MCMC?

MCMC is a family of algorithms used to sample from complex probability distributions, especially Bayesian posterior distributions.

MCMC is called a Markov chain method because it generates samples where each new sample depends only on the current sample, forming a Markov chain whose stationary distribution is the target distribution.

\[ \textbf{The Distribution Functions Like: }\\ \pi(\theta \mid y) \propto L(y \mid \theta) \pi(\theta) \]

These complex Functions can be:

1. High Dimensionality (Too Many Variables):

In introductory statistics, you usually deal with one variable (univariate), like “height of adults.” You can easily draw this as a single curve on a 2D graph.

In real-world problems (especially Bayesian statistics), you might have a model with 50, 100, or 1,000 parameters.

Example:

\[ \text{Simple Function: } P(x) \\ \text{Complex Probability Dustribution: } P(x_1,x_2,x_3,...,x_{100}) \]

You cannot visualize 100 dimensions, and standard mathematical formulas often break down when you try to calculate the area under the curve (integration) for that many variables at once.

2. Heavy Dependency (The “Joint” Problem):

This is the bigger issue. In a complex distribution, the variables are not independent. If you change one variable, the probabilities for all the others change.

\[ \text{The } f((x_1,x_2,...,x_n)(y_1,y_2,...,y_n)) ≠ f(\bf{x})⨯f(\bf{y}) \\ \forall (\bf{x} = (x_1,x_2,...x_n)\, \&\,\textbf{y} = (y_1,y_2,...,y_n)) \]

Pure MCMC (Metropolis–Hastings)

target_density <- function(x) {
  exp(-0.5 * (x - 4)^2)
}

$$ \ f(x) = (*(x-)^2)\

= 4 , : ^2 = 1 $$

set.seed(123)

n_iter <- 15000
chain <- numeric(n_iter)

# Initial value
chain[1] <- 0

# Proposal standard deviation
proposal_sd <- 1

for (i in 2:n_iter) {
  
  # Step 1: propose a new value
  proposal <- rnorm(1, mean = chain[i - 1], sd = proposal_sd)
  
  # Step 2: acceptance probability
  alpha <- min(1, target_density(proposal) /
                  target_density(chain[i - 1]))
  
  # Step 3: accept or reject
  if (runif(1) < alpha) {
    chain[i] <- proposal
  } else {
    chain[i] <- chain[i - 1]
  }
}

4. Convergence Diagnostics

4.1. Trace Plot (Chain Mixing)

plot(chain, type = "l",
     main = "Trace Plot (Metropolis–Hastings)",
     xlab = "Iteration",
     ylab = "x")

4.2. Histogram vs True Distribution

burn_in <- 3000
chain_post <- chain[-(1:burn_in)]

ConfINt

cat("The confidence INterval of Population mean is: \n")
## The confidence INterval of Population mean is:
quantile(chain_post, probs = c(0.05, 0.95))
##       5%      95% 
## 2.185420 5.597064
hist(chain_post, probability = TRUE, breaks = 40,
     main = "MCMC Samples vs True Density",
     xlab = "x")

abline(v = mean(chain_post), col = "blue", lwd = 4, lty = 3)

curve(dnorm(x, mean = 4, sd = 1),
      col = "red", lwd = 2, add = TRUE)

# Add the legend
legend("topright",                          
       legend = c("Posterior Mean", "True Density"),
       col = c("blue", "red"),               
       lty = c(3, 1),                        
       lwd = c(4, 2))                        

$$ \

N(3.944, 1.01) $$

##Gibbs Sampling

What is Gibbs Sampling?

Gibbs sampling is a special case of Markov Chain Monte Carlo (MCMC) used to generate samples from a joint probability distribution when: The full conditional distributions of each variable are known and easy to sample from.

knitr::include_graphics("conditional.png")

Steps of Gibbs Sampling (Summary)

1. Start with a random guess for your variables (e.g. x=0, y=0).

2. Update x: Pick a new x based on the current y

3. Update y: Pick a new y based on the new x

4. Repeat this loop thousands of times. (until convergence)

5. Burn-in: Throw away the first few hundred samples (because your starting guess was probably wrong)

6. Use remaining samples for inference.

# 1. Setup
n_iter <- 20000
x_samples <- numeric(n_iter)
y_samples <- numeric(n_iter)

# Initial guesses
x_current <- 0
y_current <- 0

# Correlation (rho) - how strongly x and y are linked
rho <- 0.8 

# 2. Gibbs Sampling Loop
for(i in 1:n_iter){
  
  # Update X given the current Y
  # (We sample from a normal distribution conditional on y_current)
  x_current <- rnorm(1, mean = rho * y_current, sd = sqrt(1 - rho^2))
  x_samples[i] <- x_current
  
  # Update Y given the NEW X
  y_current <- rnorm(1, mean = rho * x_current, sd = sqrt(1 - rho^2))
  y_samples[i] <- y_current
}
# burn-in first 500 iterations
burn_in <- 1000
x_samples_burned <- x_samples[(burn_in+1):n_iter]
y_samples_burned <- y_samples[(burn_in+1):n_iter]

DataFrame of Twe Generated Sample

burned <- data.frame(X = x_samples_burned, y = y_samples_burned)
head(burned, n = 10)
# 3. Visualize
# You will see an oval shape, showing the correlation we simulated!
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(
  x = x_samples_burned,
  y = y_samples_burned,
  type = "scatter",
  mode = "line",
  marker = list(color = "blue", opacity = 0.4)
) %>%
  layout(
    title = "Correlation between Paired Data",
    xaxis = list(title = "X"),
    yaxis = list(title = "Y")
  )
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
# plot(x_samples_burned, y_samples_burned, 
#      main="Gibbs Sampling After Burn-in", 
#      pch=19, col=rgb(0,0,0,0.2))

This scatter plot illustrates the outcome of Gibbs sampling after burn-in. Each point represents a sample ( X, Y ) generated iteratively by alternately sampling X given Y and Y given X. The elliptical pattern shows the positive correlation between the two variables, and the dense central cluster reflects the equilibrium distribution reached after sufficient iterations. The spread along the diagonal captures the dependency structure imposed by the conditional sampling, demonstrating how Gibbs sampling explores the joint distribution efficiently.

You generally use Gibbs Sampling in Bayesian Statistics when you have a complex model with many parameters. You can’t solve the math for all parameters at once, but you can solve for one parameter if you assume all the others are fixed.