Question 1

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
# Set seed for reproducibility
set.seed(170)  # Changed seed for variability; I explain the effect of this function in the report

# Sample sizes
sample_sizes_vec <- c(10, 50, 100)

# Define logistic regression coefficients
intercept <- 0.1
coef_x1 <- 1.1
coef_x2 <- -0.9

# Function to simulate logistic regression data and return the outcome proportions
create_logistic_data <- function(sample_size) {
  # Step 1: Generate independent variables x1 and x2 from Uniform[-2, 2]
  x_var1 <- runif(sample_size, min = -2, max = 2)
  x_var2 <- runif(sample_size, min = -2, max = 2)
  
  # Step 2: Compute the linear predictor
  linear_predictor <- intercept + coef_x1 * x_var1 + coef_x2 * x_var2
  
  
  # Step 3: Calculate probabilities using the logistic function
  probabilities <- 1 / (1 + exp(-linear_predictor))
  
  # Step 4: Generate binary outcomes based on the probabilities
  binary_outcomes <- rbinom(sample_size, size = 1, prob = probabilities)
  
  # Step 5: Calculate proportions of outcomes
  proportion_ones <- mean(binary_outcomes)
  proportion_zeros <- 1 - proportion_ones
  
  return(c(Proportion_Zeros = proportion_zeros, Proportion_Ones = proportion_ones))
}

# Collect results for different sample sizes
data_summary <- sapply(sample_sizes_vec, create_logistic_data)

# Convert the results to a data frame
summary_df <- as.data.frame(t(data_summary))
colnames(summary_df) <- c("Proportion of Zeros", "Proportion of Ones")
summary_df$Sample_Size <- sample_sizes_vec

print(summary_df)
##   Proportion of Zeros Proportion of Ones Sample_Size
## 1                0.50               0.50          10
## 2                0.48               0.52          50
## 3                0.45               0.55         100
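
Because the intercept is positive and both covariates are symmetric around zero, the long-run proportion of ones should sit slightly above 0.5. A quick Monte Carlo check of this (a sketch only; big_N is an illustrative choice and this snippet was not run as part of the assignment output):

# Rough Monte Carlo check of the long-run proportion of ones (illustrative only)
big_N <- 1e6
x1_big <- runif(big_N, min = -2, max = 2)
x2_big <- runif(big_N, min = -2, max = 2)
p_big <- 1 / (1 + exp(-(intercept + coef_x1 * x1_big + coef_x2 * x2_big)))
mean(p_big)  # expected to be a little above 0.5
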
# Reshape data to long format for plotting
long_format <- melt(summary_df, id.vars = "Sample_Size", 
                    variable.name = "Outcome_Type", 
                    value.name = "Proportion")

# Create a grouped bar plot of the outcome proportions
bar_plot <- ggplot(long_format, aes(x = factor(Sample_Size), y = Proportion, fill = Outcome_Type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Proportion of Binary Outcomes by Sample Size",
       x = "Sample Size",
       y = "Proportion",
       fill = "Outcome Type") +
  scale_fill_manual(values = c("Proportion of Zeros" = "blue", "Proportion of Ones" = "salmon")) +
  theme_minimal() +
  theme(legend.position = "top")

grid.arrange(tableGrob(summary_df), bar_plot, nrow = 2)

Question 2

set.seed(170)

intercept <- 0.1
coef_x1 <- 1.1
coef_x2 <- -0.9


sample_sizes <- c(10, 50, 100)


# Simulate a logistic regression dataset of size N
create_data <- function(N) {

  x1 <- runif(N, min = -2, max = 2)
  x2 <- runif(N, min = -2, max = 2)

  linear_predictor <- intercept + coef_x1 * x1 + coef_x2 * x2
  
  probabilities <- 1 / (1 + exp(-linear_predictor))
  
  # Generate binary response based on probabilities
  response <- rbinom(N, 1, probabilities)
  
  # Return the dataset 
  return(data.frame(x1 = x1, x2 = x2, y = response))
}

# Generate data for each sample size
simulated_data <- lapply(sample_sizes, create_data)

# Parameters for the importance sampling process
num_samples <- 20000  
num_resamples <- 10000

# Log-likelihood of the logistic model for a given coefficient vector beta
compute_log_likelihood <- function(beta, data) {
  linear_predictor <- beta[1] + beta[2] * data$x1 + beta[3] * data$x2
  probabilities <- 1 / (1 + exp(-linear_predictor))
  
  log_likelihood_value <- sum(data$y * log(probabilities) + (1 - data$y) * log(1 - probabilities))
  return(log_likelihood_value)
}
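
When the linear predictor is large in magnitude, log(probabilities) or log(1 - probabilities) can underflow to -Inf. A numerically safer but equivalent version (a sketch using base R's plogis; not used for the results shown below):

compute_log_likelihood_stable <- function(beta, data) {
  linear_predictor <- beta[1] + beta[2] * data$x1 + beta[3] * data$x2
  # plogis(x, log.p = TRUE) = log(1 / (1 + exp(-x))), evaluated without underflow
  sum(data$y * plogis(linear_predictor, log.p = TRUE) +
      (1 - data$y) * plogis(-linear_predictor, log.p = TRUE))
}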

# Proposal distribution: Generate random samples for the betas from a multivariate normal
proposal_distribution <- function(N, sigma = 1.5) {
  # N samples for 3 betas (including intercept)
  matrix(rnorm(3 * N, mean = 0, sd = sigma), ncol = 3)
}

# Function to calculate the (unnormalised) importance weights, including the prior
compute_importance_weights <- function(beta, data) {
  # Log-posterior: log-likelihood + log-prior (assuming a standard Gaussian prior)
  log_posterior <- compute_log_likelihood(beta, data) - 0.5 * sum(beta^2)
  return(exp(log_posterior))
}
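
Exponentiating the log-posterior directly can underflow to zero for every sample when the log-posterior is very negative. A common remedy, shown here only as a sketch and not applied to the results below, is to keep the weights on the log scale and subtract the maximum before exponentiating. (In general the log of the proposal density would also be subtracted from the log weights when the proposal differs from the prior; the code above omits that term.)

normalize_log_weights <- function(log_w) {
  # Subtracting the maximum (log-sum-exp trick) leaves the normalized weights unchanged
  w <- exp(log_w - max(log_w))
  w / sum(w)
}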

importance_sampling <- function(data, N) {
  proposal_samples <- proposal_distribution(N)
  
  # Calculate importance weights for each sample
  weights <- sapply(1:N, function(i) compute_importance_weights(proposal_samples[i, ], data))
  
  normalized_weights <- weights / sum(weights)
  
  # Resampling step: draw indices with probability proportional to the weights
  resample_indices <- sample(1:N, num_resamples, replace = TRUE, prob = normalized_weights)
  resampled_beta <- proposal_samples[resample_indices, ]
  
  # Posterior means as weighted averages of the proposal samples
  posterior_means <- colSums(proposal_samples * normalized_weights)
  
  # Effective sample size (ESS)
  ESS <- 1 / sum(normalized_weights^2)
  
  # Return results as a list
  return(list(posterior_means = posterior_means, ESS = ESS, resampled_beta = resampled_beta))
}
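
The effective sample size ESS = 1 / sum(w²), computed from the normalized weights, ranges from 1 (one sample carries all the weight) to N (perfectly uniform weights). A tiny worked example with toy weights, not taken from the model, illustrates the two extremes:

# Illustration of the ESS formula with toy weights (not from the model)
w_uniform <- rep(1 / 4, 4)
1 / sum(w_uniform^2)      # 4: every sample contributes equally
w_degenerate <- c(1, 0, 0, 0)
1 / sum(w_degenerate^2)   # 1: a single sample dominates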

sampling_results <- lapply(simulated_data, function(data) importance_sampling(data, num_samples))

# Extract posterior means from the results
posterior_means_results <- lapply(sampling_results, function(result) result$posterior_means)
posterior_means_results
## [[1]]
## [1]  0.1378285  1.1646153 -0.1678666
## 
## [[2]]
## [1] -0.1419279  1.2025519 -1.2091650
## 
## [[3]]
## [1]  0.173361  1.086216 -0.587570
# Extract Effective Sample Size (ESS) from the results
ESS_results <- lapply(sampling_results, function(result) result$ESS)
ESS_results
## [[1]]
## [1] 1717.429
## 
## [[2]]
## [1] 352.3169
## 
## [[3]]
## [1] 126.3188
par(mfrow = c(3, 3))  # 3x3 grid: one row of histograms per sample size

# Loop through each sample size and plot histograms for resampled betas
for (i in 1:length(sample_sizes)) {
  resampled_betas <- sampling_results[[i]]$resampled_beta
  
  # Histogram for β0 (Intercept)
  hist(resampled_betas[, 1], main = paste("β0 (N =", sample_sizes[i], ")"), xlab = "β0", col = "yellow", breaks = 50)
  
  # Histogram for β1 (Coefficient for x1)
  hist(resampled_betas[, 2], main = paste("β1 (N =", sample_sizes[i], ")"), xlab = "β1", col = "lightblue", breaks = 50)
  
  # Histogram for β2 (Coefficient for x2)
  hist(resampled_betas[, 3], main = paste("β2 (N =", sample_sizes[i], ")"), xlab = "β2", col = "green", breaks = 50)
}

# Fit the logistic regression model using MLE (GLM) for each dataset
glm_models <- lapply(simulated_data, function(data) glm(y ~ x1 + x2, data = data, family = binomial))

# Extract coefficients from the GLM models
glm_coefficients <- lapply(glm_models, function(model) coef(model))
glm_coefficients
## [[1]]
## (Intercept)          x1          x2 
##   0.3494093   3.3801629  -1.7201246 
## 
## [[2]]
## (Intercept)          x1          x2 
##  -0.3704053   1.7094964  -1.7060462 
## 
## [[3]]
## (Intercept)          x1          x2 
##   0.2223994   1.1363509  -0.6265514
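
For the report, the importance-sampling posterior means and the GLM (MLE) estimates can be placed side by side; a minimal sketch (the object and column names are illustrative, and the snippet was not run for the output shown):

comparison_table <- data.frame(
  Sample_Size = sample_sizes,
  IS = do.call(rbind, posterior_means_results),
  GLM = do.call(rbind, glm_coefficients)
)
colnames(comparison_table) <- c("Sample_Size", "IS_b0", "IS_b1", "IS_b2",
                                "GLM_b0", "GLM_b1", "GLM_b2")
print(comparison_table)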

Question 3 (incorporating the provided comments)

set.seed(170)

intercept <- 0.1
coef_x1 <- 1.1
coef_x2 <- -0.9

additional_betas <- runif(6, min = -1, max = 1)
names(additional_betas) <- paste("beta_add", 1:6, sep = "_")
print("Generated additional beta values:")
## [1] "Generated additional beta values:"
print(additional_betas)
## beta_add_1 beta_add_2 beta_add_3 beta_add_4 beta_add_5 beta_add_6 
##  0.9683449 -0.5339030  0.5368513 -0.8417813  0.1257677 -0.8274445
sample_sizes <- c(10, 50, 100)

create_data <- function(N) {
  x1 <- runif(N, min = -2, max = 2)
  x2 <- runif(N, min = -2, max = 2)
  additional_covariates <- matrix(runif(N * 6, min = -2, max = 2), ncol = 6)
  colnames(additional_covariates) <- paste("x_add", 1:6, sep = "_")
  linear_predictor <- intercept + coef_x1 * x1 + coef_x2 * x2
  for (i in 1:6) {
    linear_predictor <- linear_predictor + additional_covariates[, i] * additional_betas[i]
  }
  probabilities <- 1 / (1 + exp(-linear_predictor))
  response <- rbinom(N, 1, probabilities)
  data <- data.frame(x1 = x1, x2 = x2, y = response, additional_covariates)
  return(data)
}

simulated_data <- lapply(sample_sizes, create_data)

num_samples <- 20000
num_resamples <- 10000

compute_log_likelihood <- function(beta, data) {
  linear_predictor <- beta[1] + beta[2] * data$x1 + beta[3] * data$x2
  for (i in 1:6) {
    linear_predictor <- linear_predictor + data[, paste("x_add", i, sep = "_")] * beta[3 + i]
  }
  probabilities <- 1 / (1 + exp(-linear_predictor))
  log_likelihood_value <- sum(data$y * log(probabilities) + (1 - data$y) * log(1 - probabilities))
  return(log_likelihood_value)
}
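
The loop over the six additional covariates can also be written as a single matrix product, which is easier to extend to other numbers of covariates; an equivalent sketch (not used for the results below):

# Equivalent vectorized linear predictor (illustrative alternative to the loop above)
compute_linear_predictor_vec <- function(beta, data) {
  X <- cbind(1, data$x1, data$x2,
             as.matrix(data[, paste("x_add", 1:6, sep = "_")]))
  as.vector(X %*% beta)
}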

proposal_distribution <- function(N, sigma = 1.5) {
  matrix(rnorm(9 * N, mean = 0, sd = sigma), ncol = 9)
}

compute_importance_weights <- function(beta, data) {
  log_likelihood <- compute_log_likelihood(beta, data)
  log_prior <- -0.5 * sum(beta^2)
  log_posterior <- log_likelihood + log_prior
  return(exp(log_posterior))
}

importance_sampling <- function(data, N) {
  proposal_samples <- proposal_distribution(N)
  weights <- sapply(1:N, function(i) compute_importance_weights(proposal_samples[i, ], data))
  # Guard against numerical underflow: replace NA/NaN weights with a tiny positive value
  if (any(is.na(weights)) | any(is.nan(weights))) {
    weights[is.na(weights) | is.nan(weights)] <- 1e-10
  }
  normalized_weights <- weights / sum(weights)
  resample_indices <- sample(1:N, num_resamples, replace = TRUE, prob = normalized_weights)
  resampled_beta <- proposal_samples[resample_indices, ]
  posterior_means <- colSums(proposal_samples * normalized_weights)
  ESS <- 1 / sum(normalized_weights^2)
  return(list(posterior_means = posterior_means, ESS = ESS, resampled_beta = resampled_beta))
}

sampling_results <- lapply(simulated_data, function(data) importance_sampling(data, num_samples))

posterior_means_results <- lapply(sampling_results, function(result) result$posterior_means)
ESS_results <- lapply(sampling_results, function(result) result$ESS)

print("Posterior means for each sample size:")
## [1] "Posterior means for each sample size:"
print(posterior_means_results)
## [[1]]
## [1]  0.31279720  0.64257649 -0.63977290  0.10788616 -0.70143446  0.29467828
## [7]  0.01030248 -0.39261481 -0.83282578
## 
## [[2]]
## [1] -0.007941247  1.540101108 -0.774561409  1.322760275 -0.241152351
## [6]  0.362213366 -0.611630032 -0.218248803 -0.778815135
## 
## [[3]]
## [1]  0.1952008  1.1087517 -1.0680444  0.3095585 -0.3575560 -0.1548563 -0.7879833
## [8]  0.1095985 -1.2033588
print("Effective Sample Size (ESS) for each sample size:")
## [1] "Effective Sample Size (ESS) for each sample size:"
print(ESS_results)
## [[1]]
## [1] 42.81159
## 
## [[2]]
## [1] 2.806032
## 
## [[3]]
## [1] 1.121614
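
With nine parameters the weights become very uneven, and the ESS collapses as the dataset gets larger; the relative efficiency ESS / number of proposal samples makes this easy to quantify (a small diagnostic sketch):

# Relative efficiency of the importance sampler (illustrative diagnostic)
relative_ESS <- sapply(ESS_results, function(ess) ess / num_samples)
round(relative_ESS, 5)
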
par(mfrow = c(3, 3))  # Set up a 3x3 plot grid
for (i in 1:length(sample_sizes)) {
  resampled_betas <- sampling_results[[i]]$resampled_beta
  hist(resampled_betas[, 1], main = paste("β0 (N =", sample_sizes[i], ")"), xlab = "β0", col = "yellow", breaks = 30)
  hist(resampled_betas[, 2], main = paste("β1 (N =", sample_sizes[i], ")"), xlab = "β1", col = "lightblue", breaks = 30)
  hist(resampled_betas[, 3], main = paste("β2 (N =", sample_sizes[i], ")"), xlab = "β2", col = "green", breaks = 30)
  for (j in 1:6) {
    hist(resampled_betas[, 3 + j], main = paste("β_add", j, " (N =", sample_sizes[i], ")"), xlab = paste("β_add", j, sep = ""), col = rainbow(6)[j], breaks = 30)
  }
}

glm_models <- lapply(simulated_data, function(data) glm(y ~ x1 + x2 + x_add_1 + x_add_2 + x_add_3 + x_add_4 + x_add_5 + x_add_6, data = data, family = binomial))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_coefficients <- lapply(glm_models, function(model) coef(model))
print("GLM coefficients:")
## [1] "GLM coefficients:"
print(glm_coefficients)
## [[1]]
## (Intercept)          x1          x2     x_add_1     x_add_2     x_add_3 
##    6.433204   14.798308   -1.828293  -12.216024  -10.451806    7.680328 
##     x_add_4     x_add_5     x_add_6 
##   -7.121071  -20.632866  -34.417421 
## 
## [[2]]
## (Intercept)          x1          x2     x_add_1     x_add_2     x_add_3 
##  -0.3287673   1.9150312  -0.6886903   1.6157019  -0.7749457   0.2304285 
##     x_add_4     x_add_5     x_add_6 
##  -0.9472955  -0.0231685  -0.8970437 
## 
## [[3]]
## (Intercept)          x1          x2     x_add_1     x_add_2     x_add_3 
##  -0.2661212   0.9181552  -1.2082422   0.7910228  -0.4107330   0.3579283 
##     x_add_4     x_add_5     x_add_6 
##  -1.1341144   0.4847570  -0.8623633

Question 4

set.seed(170)

intercept <- 0.1
coef_x1 <- 1.1
coef_x2 <- -0.9

additional_betas <- runif(6, min = -1, max = 1)
names(additional_betas) <- paste("beta_add", 1:6, sep = "_")
print("Generated additional beta values:")
## [1] "Generated additional beta values:"
print(additional_betas)
## beta_add_1 beta_add_2 beta_add_3 beta_add_4 beta_add_5 beta_add_6 
##  0.9683449 -0.5339030  0.5368513 -0.8417813  0.1257677 -0.8274445
sample_sizes <- c(10, 50, 100)

create_data <- function(N) {
  x1 <- runif(N, min = -2, max = 2)
  x2 <- runif(N, min = -2, max = 2)
  additional_covariates <- matrix(runif(N * 6, min = -2, max = 2), ncol = 6)
  colnames(additional_covariates) <- paste("x_add", 1:6, sep = "_")
  linear_predictor <- intercept + coef_x1 * x1 + coef_x2 * x2
  for (i in 1:6) {
    linear_predictor <- linear_predictor + additional_covariates[, i] * additional_betas[i]
  }
  probabilities <- 1 / (1 + exp(-linear_predictor))
  response <- rbinom(N, 1, probabilities)
  data <- data.frame(x1 = x1, x2 = x2, y = response, additional_covariates)
  return(data)
}

simulated_data <- lapply(sample_sizes, create_data)

num_samples <- 20000
num_resamples <- 10000

compute_log_likelihood <- function(beta, data) {
  linear_predictor <- beta[1] + beta[2] * data$x1 + beta[3] * data$x2
  for (i in 1:6) {
    linear_predictor <- linear_predictor + data[, paste("x_add", i, sep = "_")] * beta[3 + i]
  }
  probabilities <- 1 / (1 + exp(-linear_predictor))
  log_likelihood_value <- sum(data$y * log(probabilities) + (1 - data$y) * log(1 - probabilities))
  return(log_likelihood_value)
}

log_posterior <- function(beta, data) {
  log_likelihood <- compute_log_likelihood(beta, data)
  log_prior <- -0.5 * sum(beta^2)
  return(log_likelihood + log_prior)
}

optim_posterior_mode <- function(data) {
  init_values <- rep(0, 9)  # Initial guess (9 parameters)
  optim_result <- optim(init_values, function(beta) -log_posterior(beta, data), method = "BFGS")
  return(optim_result$par)
}

proposal_distribution <- function(N, mode, sigma = 1.5) {
  matrix(rnorm(9 * N, mean = rep(mode, N), sd = sigma), ncol = 9)
}
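
Here only the centre of the proposal is moved to the posterior mode; the spread is still a fixed sd = 1.5 in every direction. One could also scale the proposal with the curvature at the mode (a Laplace-style approximation). A minimal sketch, assuming MASS is available; it is not used for the results shown below:

laplace_proposal <- function(N, data) {
  fit <- optim(rep(0, 9), function(beta) -log_posterior(beta, data),
               method = "BFGS", hessian = TRUE)
  Sigma <- solve(fit$hessian)  # inverse Hessian of the negative log-posterior
  MASS::mvrnorm(N, mu = fit$par, Sigma = Sigma)
}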

compute_importance_weights <- function(beta, data) {
  log_posterior_val <- log_posterior(beta, data)
  return(exp(log_posterior_val))
}

importance_sampling <- function(data, N, mode) {
  proposal_samples <- proposal_distribution(N, mode)
  weights <- sapply(1:N, function(i) compute_importance_weights(proposal_samples[i, ], data))
  if (any(is.na(weights)) | any(is.nan(weights))) {
    weights[is.na(weights) | is.nan(weights)] <- 1e-10
  }
  normalized_weights <- weights / sum(weights)
  resample_indices <- sample(1:N, num_resamples, replace = TRUE, prob = normalized_weights)
  resampled_beta <- proposal_samples[resample_indices, ]
  posterior_means <- colSums(proposal_samples * normalized_weights)
  ESS <- 1 / sum(normalized_weights^2)
  return(list(posterior_means = posterior_means, ESS = ESS, resampled_beta = resampled_beta))
}

sampling_results <- lapply(simulated_data, function(data) {
  mode <- optim_posterior_mode(data)
  importance_sampling(data, num_samples, mode)
})

posterior_means_results <- lapply(sampling_results, function(result) result$posterior_means)
ESS_results <- lapply(sampling_results, function(result) result$ESS)

print("Posterior means for each sample size:")
## [1] "Posterior means for each sample size:"
print(posterior_means_results)
## [[1]]
## [1]  0.38652673  0.76954264 -0.65121677  0.07690843 -0.76312053  0.23692439
## [7] -0.03867812 -0.35960924 -0.82535648
## 
## [[2]]
## [1] -0.3538192  1.4441378 -0.3216473  1.3619656 -0.8968057  0.1372263 -1.2579517
## [8] -0.1657336 -1.3022357
## 
## [[3]]
## [1] -0.1803936  1.2135520 -0.9712732  0.1272283 -0.2980107  0.2428141 -0.7100506
## [8]  0.7004482 -0.9250665
print("Effective Sample Size (ESS) for each sample size:")
## [1] "Effective Sample Size (ESS) for each sample size:"
print(ESS_results)
## [[1]]
## [1] 76.51595
## 
## [[2]]
## [1] 4.530305
## 
## [[3]]
## [1] 1.748608
# Set up the plot grid for the histograms
par(mfrow = c(3, 3))  # 3x3 grid for histograms

# Loop through the results and plot histograms
for (i in 1:length(sample_sizes)) {
  resampled_betas <- sampling_results[[i]]$resampled_beta
  
  # Plot β0
  hist(resampled_betas[, 1], main = paste("β0 (N =", sample_sizes[i], ")"), 
       xlab = "β0", col = "yellow", breaks = 30, xlim = c(-3, 3))
  
  # Plot β1
  hist(resampled_betas[, 2], main = paste("β1 (N =", sample_sizes[i], ")"), 
       xlab = "β1", col = "lightblue", breaks = 30, xlim = c(-3, 3))
  
  # Plot β2
  hist(resampled_betas[, 3], main = paste("β2 (N =", sample_sizes[i], ")"), 
       xlab = "β2", col = "green", breaks = 30, xlim = c(-3, 3))
  
  # Plot additional betas
  for (j in 1:6) {
    hist(resampled_betas[, 3 + j], main = paste("β_add", j, "(N =", sample_sizes[i], ")"), 
         xlab = paste("β_add", j, sep = ""), col = rainbow(6)[j], breaks = 30, xlim = c(-3, 3))
  }
}

glm_models <- lapply(simulated_data, function(data) glm(y ~ x1 + x2 + x_add_1 + x_add_2 + x_add_3 + x_add_4 + x_add_5 + x_add_6, data = data, family = binomial))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_coefficients <- lapply(glm_models, function(model) coef(model))
print("GLM coefficients:")
## [1] "GLM coefficients:"
print(glm_coefficients)
## [[1]]
## (Intercept)          x1          x2     x_add_1     x_add_2     x_add_3 
##    6.433204   14.798308   -1.828293  -12.216024  -10.451806    7.680328 
##     x_add_4     x_add_5     x_add_6 
##   -7.121071  -20.632866  -34.417421 
## 
## [[2]]
## (Intercept)          x1          x2     x_add_1     x_add_2     x_add_3 
##  -0.3287673   1.9150312  -0.6886903   1.6157019  -0.7749457   0.2304285 
##     x_add_4     x_add_5     x_add_6 
##  -0.9472955  -0.0231685  -0.8970437 
## 
## [[3]]
## (Intercept)          x1          x2     x_add_1     x_add_2     x_add_3 
##  -0.2661212   0.9181552  -1.2082422   0.7910228  -0.4107330   0.3579283 
##     x_add_4     x_add_5     x_add_6 
##  -1.1341144   0.4847570  -0.8623633

Question 5: the Gaussian Mixture Model (GMM) as an example of a different approach that sets the covariance of the proposal in a smarter way than using a proposal with independent coordinates

if (!require("mclust")) install.packages("mclust", dependencies = TRUE)
## Loading required package: mclust
## Warning: package 'mclust' was built under R version 4.4.2
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
library(mclust)

# Generate synthetic data for the GMM
set.seed(170)

# Define parameters for the mixture model
mu1 <- c(0, 0)    # Mean of the first Gaussian component
mu2 <- c(5, 5)    # Mean of the second Gaussian component
mu3 <- c(-5, 5)   # Mean of the third Gaussian component

# Define covariance matrices for the components
sigma1 <- matrix(c(1, 0.5, 0.5, 1), ncol = 2)
sigma2 <- matrix(c(1, -0.7, -0.7, 1), ncol = 2)
sigma3 <- matrix(c(1, 0, 0, 1), ncol = 2)

# Sample size for each cluster
n1 <- 300
n2 <- 300
n3 <- 300

cluster1 <- MASS::mvrnorm(n1, mu1, sigma1)
cluster2 <- MASS::mvrnorm(n2, mu2, sigma2)
cluster3 <- MASS::mvrnorm(n3, mu3, sigma3)

data <- rbind(cluster1, cluster2, cluster3)
colnames(data) <- c("x", "y")

gmm_model <- Mclust(data, G = 3)  # G = 3 for 3 components

plot(data, pch = 16, xlab = "X", ylab = "Y", 
     main = "GMM Clustering Results", cex = 0.5,
     col = gmm_model$classification)  # Coloring the points based on cluster assignment

cat("Means of each component:\n")
## Means of each component:
print(gmm_model$parameters$mean)
##          [,1]     [,2]      [,3]
## x  0.03164929 4.922467 -5.082003
## y -0.03773969 5.032440  5.032510
cat("Covariances of each component:\n")
## Covariances of each component:
print(gmm_model$parameters$variance$sigma)
## , , 1
## 
##           x         y
## x 0.9679200 0.4498477
## y 0.4498477 1.0148943
## 
## , , 2
## 
##           x         y
## x  1.253173 -0.826058
## y -0.826058  1.166914
## 
## , , 3
## 
##            x          y
## x 0.88204370 0.05823425
## y 0.05823425 0.88812469
        cat("Mixing weights for each component:\n")
## Mixing weights for each component:
         print(gmm_model$parameters$pro)
## [1] 0.3333349 0.3333319 0.3333333
# Predict clusters for three new points; matrix() fills column-wise, so the
# points are (0, 5), (0, -5) and (5, 5)
new_data <- matrix(c(0, 0, 5, 5, -5, 5), ncol = 2)
predictions <- predict(gmm_model, newdata = new_data)
cat("Predictions for new data points:\n")
## Predictions for new data points:
print(predictions$classification)
## [1] 3 1 2
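
To connect this back to Questions 2-4, a fitted GMM could itself serve as an importance-sampling proposal with correlated coordinates: draw a component according to the mixing weights, sample from that component's multivariate normal, and evaluate the mixture density for the weight denominator. A minimal sketch, assuming the mvtnorm package is available; the function names are illustrative and the snippet is not part of the assignment output:

# Sample N points from the fitted mixture
sample_from_gmm <- function(N, fit) {
  comp <- sample(seq_along(fit$parameters$pro), N, replace = TRUE,
                 prob = fit$parameters$pro)
  t(sapply(comp, function(k) MASS::mvrnorm(1, fit$parameters$mean[, k],
                                           fit$parameters$variance$sigma[, , k])))
}

# Mixture density at a single point x: weighted sum of the component normal densities
gmm_density <- function(x, fit) {
  sum(sapply(seq_along(fit$parameters$pro), function(k)
    fit$parameters$pro[k] * mvtnorm::dmvnorm(x, mean = fit$parameters$mean[, k],
                                             sigma = fit$parameters$variance$sigma[, , k])))
}

proposal_draws <- sample_from_gmm(1000, gmm_model)
head(apply(proposal_draws, 1, gmm_density, fit = gmm_model))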