library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
# Set seed for reproducibility
set.seed(170) # Changed seed for variability; the effect of set.seed() is explained in the report
# Sample sizes
sample_sizes_vec <- c(10, 50, 100)
# Define logistic regression coefficients
intercept <- 0.1
coef_x1 <- 1.1
coef_x2 <- -0.9
# Function to simulate logistic regression data and summarise the outcome proportions
create_logistic_data <- function(sample_size) {
# Step 1: Generate independent variables x1 and x2 from Uniform[-2, 2]
x_var1 <- runif(sample_size, min = -2, max = 2)
x_var2 <- runif(sample_size, min = -2, max = 2)
# Step 2: Compute the linear predictor
linear_predictor <- intercept + coef_x1 * x_var1 + coef_x2 * x_var2
# Step 3: Calculate probabilities using the logistic function
probabilities <- 1 / (1 + exp(-linear_predictor))
# Step 4: Generate binary outcomes based on the probabilities
binary_outcomes <- rbinom(sample_size, size = 1, prob = probabilities)
# Step 5: Calculate proportions of outcomes
proportion_ones <- mean(binary_outcomes)
proportion_zeros <- 1 - proportion_ones
return(c(Proportion_Zeros = proportion_zeros, Proportion_Ones = proportion_ones))
}
# Collect results for different sample sizes
data_summary <- sapply(sample_sizes_vec, create_logistic_data)
# Convert the results into a data frame
summary_df <- as.data.frame(t(data_summary))
colnames(summary_df) <- c("Proportion of Zeros", "Proportion of Ones")
summary_df$Sample_Size <- sample_sizes_vec
print(summary_df)
## Proportion of Zeros Proportion of Ones Sample_Size
## 1 0.50 0.50 10
## 2 0.48 0.52 50
## 3 0.45 0.55 100
# Reshape data to long format for plotting
long_format <- melt(summary_df, id.vars = "Sample_Size",
variable.name = "Outcome_Type",
value.name = "Proportion")
# Create a bar plot
bar_plot <- ggplot(long_format, aes(x = factor(Sample_Size), y = Proportion, fill = Outcome_Type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Proportion of Binary Outcomes by Sample Size",
x = "Sample Size",
y = "Proportion",
fill = "Outcome Type") +
scale_fill_manual(values = c("Proportion of Zeros" = "blue", "Proportion of Ones" = "salmon")) +
theme_minimal() +
theme(legend.position = "top")
grid.arrange(tableGrob(summary_df), bar_plot, nrow = 2)
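# Hedged sketch (not part of the original analysis): the long-run proportion of ones
# implied by the data-generating process can be approximated by Monte Carlo, which
# helps explain why the empirical proportions stay close to 0.5 for all sample sizes.
mc_n <- 1e6
mc_x1 <- runif(mc_n, min = -2, max = 2)
mc_x2 <- runif(mc_n, min = -2, max = 2)
mean(plogis(intercept + coef_x1 * mc_x1 + coef_x2 * mc_x2)) # theoretical P(y = 1), slightly above 0.5 here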
set.seed(170)
intercept <- 0.1
coef_x1 <- 1.1
coef_x2 <- -0.9
sample_sizes <- c(10, 50, 100)
create_data <- function(N) {
x1 <- runif(N, min = -2, max = 2)
x2 <- runif(N, min = -2, max = 2)
linear_predictor <- intercept + coef_x1 * x1 + coef_x2 * x2
probabilities <- 1 / (1 + exp(-linear_predictor))
# Generate binary response based on probabilities
response <- rbinom(N, 1, probabilities)
# Return the dataset
return(data.frame(x1 = x1, x2 = x2, y = response))
}
# Generate data for each sample size
simulated_data <- lapply(sample_sizes, create_data)
# Parameters for the importance sampling process
num_samples <- 20000
num_resamples <- 10000
compute_log_likelihood <- function(beta, data) {
linear_predictor <- beta[1] + beta[2] * data$x1 + beta[3] * data$x2
probabilities <- 1 / (1 + exp(-linear_predictor))
log_likelihood_value <- sum(data$y * log(probabilities) + (1 - data$y) * log(1 - probabilities))
return(log_likelihood_value)
}
# Proposal distribution: Generate random samples for the betas from a multivariate normal
proposal_distribution <- function(N, sigma = 1.5) {
# N samples for 3 betas (including intercept)
matrix(rnorm(3 * N, mean = 0, sd = sigma), ncol = 3)
}
# Function to calculate the importance weights (including prior)
compute_importance_weights <- function(beta, data) {
# Log-posterior: log-likelihood + log-prior (assuming Gaussian prior)
log_posterior <- compute_log_likelihood(beta, data) - 0.5 * sum(beta^2) # Gaussian prior
return(exp(log_posterior))
}
importance_sampling <- function(data, N) {
proposal_samples <- proposal_distribution(N)
# Calculate importance weights for each sample
weights <- sapply(1:N, function(i) compute_importance_weights(proposal_samples[i, ], data))
normalized_weights <- weights / sum(weights)
# Resampling
resample_indices <- sample(1:N, num_resamples, replace = TRUE, prob = normalized_weights)
resampled_beta <- proposal_samples[resample_indices, ]
# Compute posterior means
posterior_means <- colSums(proposal_samples * normalized_weights)
ESS <- 1 / sum(normalized_weights^2)
# Return results as a list
return(list(posterior_means = posterior_means, ESS = ESS, resampled_beta = resampled_beta))
}
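# Hedged sketch (a hypothetical helper, not used above): exp(log_posterior) can underflow
# to zero when the log-posterior is very negative. Normalising the weights on the log
# scale with the log-sum-exp trick avoids this.
normalize_log_weights <- function(log_w) {
log_w <- log_w - max(log_w) # shift by the maximum so the largest weight is exp(0) = 1
w <- exp(log_w)
w / sum(w)
}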
sampling_results <- lapply(simulated_data, function(data) importance_sampling(data, num_samples))
# Extract posterior means from the results
posterior_means_results <- lapply(sampling_results, function(result) result$posterior_means)
posterior_means_results
## [[1]]
## [1] 0.1378285 1.1646153 -0.1678666
##
## [[2]]
## [1] -0.1419279 1.2025519 -1.2091650
##
## [[3]]
## [1] 0.173361 1.086216 -0.587570
# Extract Effective Sample Size (ESS) from the results
ESS_results <- lapply(sampling_results, function(result) result$ESS)
ESS_results
## [[1]]
## [1] 1717.429
##
## [[2]]
## [1] 352.3169
##
## [[3]]
## [1] 126.3188
par(mfrow = c(3, 3))
# Loop through each sample size and plot histograms for resampled betas
for (i in 1:length(sample_sizes)) {
resampled_betas <- sampling_results[[i]]$resampled_beta
# Histogram for β0 (Intercept)
hist(resampled_betas[, 1], main = paste("β0 (N =", sample_sizes[i], ")"), xlab = "β0", col = "yellow", breaks = 50)
# Histogram for β1 (Coefficient for x1)
hist(resampled_betas[, 2], main = paste("β1 (N =", sample_sizes[i], ")"), xlab = "β1", col = "lightblue", breaks = 50)
# Histogram for β2 (Coefficient for x2)
hist(resampled_betas[, 3], main = paste("β2 (N =", sample_sizes[i], ")"), xlab = "β2", col = "green", breaks = 50)
}
# Fit the logistic regression model using MLE (GLM) for each dataset
glm_models <- lapply(simulated_data, function(data) glm(y ~ x1 + x2, data = data, family = binomial))
# Extract coefficients from the GLM models
glm_coefficients <- lapply(glm_models, function(model) coef(model))
glm_coefficients
## [[1]]
## (Intercept) x1 x2
## 0.3494093 3.3801629 -1.7201246
##
## [[2]]
## (Intercept) x1 x2
## -0.3704053 1.7094964 -1.7060462
##
## [[3]]
## (Intercept) x1 x2
## 0.2223994 1.1363509 -0.6265514
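# Hedged sketch (illustrative only): put the importance-sampling posterior means and the
# GLM maximum-likelihood estimates side by side for each sample size.
comparison_tables <- Map(function(post_mean, mle) rbind(IS_posterior_mean = post_mean, GLM_MLE = mle),
posterior_means_results, glm_coefficients)
comparison_tables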
set.seed(170)
intercept <- 0.1
coef_x1 <- 1.1
coef_x2 <- -0.9
additional_betas <- runif(6, min = -1, max = 1)
names(additional_betas) <- paste("beta_add", 1:6, sep = "_")
print("Generated additional beta values:")
## [1] "Generated additional beta values:"
print(additional_betas)
## beta_add_1 beta_add_2 beta_add_3 beta_add_4 beta_add_5 beta_add_6
## 0.9683449 -0.5339030 0.5368513 -0.8417813 0.1257677 -0.8274445
sample_sizes <- c(10, 50, 100)
create_data <- function(N) {
x1 <- runif(N, min = -2, max = 2)
x2 <- runif(N, min = -2, max = 2)
additional_covariates <- matrix(runif(N * 6, min = -2, max = 2), ncol = 6)
colnames(additional_covariates) <- paste("x_add", 1:6, sep = "_")
linear_predictor <- intercept + coef_x1 * x1 + coef_x2 * x2
for (i in 1:6) {
linear_predictor <- linear_predictor + additional_covariates[, i] * additional_betas[i]
}
probabilities <- 1 / (1 + exp(-linear_predictor))
response <- rbinom(N, 1, probabilities)
data <- data.frame(x1 = x1, x2 = x2, y = response, additional_covariates)
return(data)
}
simulated_data <- lapply(sample_sizes, create_data)
num_samples <- 20000
num_resamples <- 10000
compute_log_likelihood <- function(beta, data) {
linear_predictor <- beta[1] + beta[2] * data$x1 + beta[3] * data$x2
for (i in 1:6) {
linear_predictor <- linear_predictor + data[, paste("x_add", i, sep = "_")] * beta[3 + i]
}
probabilities <- 1 / (1 + exp(-linear_predictor))
log_likelihood_value <- sum(data$y * log(probabilities) + (1 - data$y) * log(1 - probabilities))
return(log_likelihood_value)
}
proposal_distribution <- function(N, sigma = 1.5) {
matrix(rnorm(9 * N, mean = 0, sd = sigma), ncol = 9)
}
compute_importance_weights <- function(beta, data) {
log_likelihood <- compute_log_likelihood(beta, data)
log_prior <- -0.5 * sum(beta^2)
log_posterior <- log_likelihood + log_prior
return(exp(log_posterior))
}
importance_sampling <- function(data, N) {
proposal_samples <- proposal_distribution(N)
weights <- sapply(1:N, function(i) compute_importance_weights(proposal_samples[i, ], data))
if (any(is.na(weights)) | any(is.nan(weights))) {
weights[is.na(weights) | is.nan(weights)] <- 1e-10
}
normalized_weights <- weights / sum(weights)
resample_indices <- sample(1:N, num_resamples, replace = TRUE, prob = normalized_weights)
resampled_beta <- proposal_samples[resample_indices, ]
posterior_means <- colSums(proposal_samples * normalized_weights)
ESS <- 1 / sum(normalized_weights^2)
return(list(posterior_means = posterior_means, ESS = ESS, resampled_beta = resampled_beta))
}
sampling_results <- lapply(simulated_data, function(data) importance_sampling(data, num_samples))
posterior_means_results <- lapply(sampling_results, function(result) result$posterior_means)
ESS_results <- lapply(sampling_results, function(result) result$ESS)
print("Posterior means for each sample size:")
## [1] "Posterior means for each sample size:"
print(posterior_means_results)
## [[1]]
## [1] 0.31279720 0.64257649 -0.63977290 0.10788616 -0.70143446 0.29467828
## [7] 0.01030248 -0.39261481 -0.83282578
##
## [[2]]
## [1] -0.007941247 1.540101108 -0.774561409 1.322760275 -0.241152351
## [6] 0.362213366 -0.611630032 -0.218248803 -0.778815135
##
## [[3]]
## [1] 0.1952008 1.1087517 -1.0680444 0.3095585 -0.3575560 -0.1548563 -0.7879833
## [8] 0.1095985 -1.2033588
print("Effective Sample Size (ESS) for each sample size:")
## [1] "Effective Sample Size (ESS) for each sample size:"
print(ESS_results)
## [[1]]
## [1] 42.81159
##
## [[2]]
## [1] 2.806032
##
## [[3]]
## [1] 1.121614
par(mfrow = c(3, 3)) # Set up a 3x3 plot grid
for (i in 1:length(sample_sizes)) {
resampled_betas <- sampling_results[[i]]$resampled_beta
hist(resampled_betas[, 1], main = paste("β0 (N =", sample_sizes[i], ")"), xlab = "β0", col = "yellow", breaks = 30)
hist(resampled_betas[, 2], main = paste("β1 (N =", sample_sizes[i], ")"), xlab = "β1", col = "lightblue", breaks = 30)
hist(resampled_betas[, 3], main = paste("β2 (N =", sample_sizes[i], ")"), xlab = "β2", col = "green", breaks = 30)
for (j in 1:6) {
hist(resampled_betas[, 3 + j], main = paste("β_add", j, " (N =", sample_sizes[i], ")"), xlab = paste("β_add", j, sep = ""), col = rainbow(6)[j], breaks = 30)
}
}
glm_models <- lapply(simulated_data, function(data) glm(y ~ x1 + x2 + x_add_1 + x_add_2 + x_add_3 + x_add_4 + x_add_5 + x_add_6, data = data, family = binomial))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_coefficients <- lapply(glm_models, function(model) coef(model))
print("GLM coefficients:")
## [1] "GLM coefficients:"
print(glm_coefficients)
## [[1]]
## (Intercept) x1 x2 x_add_1 x_add_2 x_add_3
## 6.433204 14.798308 -1.828293 -12.216024 -10.451806 7.680328
## x_add_4 x_add_5 x_add_6
## -7.121071 -20.632866 -34.417421
##
## [[2]]
## (Intercept) x1 x2 x_add_1 x_add_2 x_add_3
## -0.3287673 1.9150312 -0.6886903 1.6157019 -0.7749457 0.2304285
## x_add_4 x_add_5 x_add_6
## -0.9472955 -0.0231685 -0.8970437
##
## [[3]]
## (Intercept) x1 x2 x_add_1 x_add_2 x_add_3
## -0.2661212 0.9181552 -1.2082422 0.7910228 -0.4107330 0.3579283
## x_add_4 x_add_5 x_add_6
## -1.1341144 0.4847570 -0.8623633
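# Note: for N = 10 the MLE coefficients are extreme (e.g. -34.4) and glm.fit warns that
# fitted probabilities of 0 or 1 occurred, which suggests (quasi-)complete separation:
# with 8 covariates and only 10 observations the MLE is poorly determined, whereas the
# Gaussian prior keeps the importance-sampling posterior means bounded.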
set.seed(170)
intercept <- 0.1
coef_x1 <- 1.1
coef_x2 <- -0.9
additional_betas <- runif(6, min = -1, max = 1)
names(additional_betas) <- paste("beta_add", 1:6, sep = "_")
print("Generated additional beta values:")
## [1] "Generated additional beta values:"
print(additional_betas)
## beta_add_1 beta_add_2 beta_add_3 beta_add_4 beta_add_5 beta_add_6
## 0.9683449 -0.5339030 0.5368513 -0.8417813 0.1257677 -0.8274445
sample_sizes <- c(10, 50, 100)
create_data <- function(N) {
x1 <- runif(N, min = -2, max = 2)
x2 <- runif(N, min = -2, max = 2)
additional_covariates <- matrix(runif(N * 6, min = -2, max = 2), ncol = 6)
colnames(additional_covariates) <- paste("x_add", 1:6, sep = "_")
linear_predictor <- intercept + coef_x1 * x1 + coef_x2 * x2
for (i in 1:6) {
linear_predictor <- linear_predictor + additional_covariates[, i] * additional_betas[i]
}
probabilities <- 1 / (1 + exp(-linear_predictor))
response <- rbinom(N, 1, probabilities)
data <- data.frame(x1 = x1, x2 = x2, y = response, additional_covariates)
return(data)
}
simulated_data <- lapply(sample_sizes, create_data)
num_samples <- 20000
num_resamples <- 10000
compute_log_likelihood <- function(beta, data) {
linear_predictor <- beta[1] + beta[2] * data$x1 + beta[3] * data$x2
for (i in 1:6) {
linear_predictor <- linear_predictor + data[, paste("x_add", i, sep = "_")] * beta[3 + i]
}
probabilities <- 1 / (1 + exp(-linear_predictor))
log_likelihood_value <- sum(data$y * log(probabilities) + (1 - data$y) * log(1 - probabilities))
return(log_likelihood_value)
}
log_posterior <- function(beta, data) {
log_likelihood <- compute_log_likelihood(beta, data)
log_prior <- -0.5 * sum(beta^2)
return(log_likelihood + log_prior)
}
optim_posterior_mode <- function(data) {
init_values <- rep(0, 9) # Initial guess (9 parameters)
optim_result <- optim(init_values, function(beta) -log_posterior(beta, data), method = "BFGS")
return(optim_result$par)
}
proposal_distribution <- function(N, mode, sigma = 1.5) {
# rep(mode, each = N) lines the mode values up with matrix()'s column-wise fill,
# so column j of the proposal matrix is centred at mode[j]
matrix(rnorm(9 * N, mean = rep(mode, each = N), sd = sigma), ncol = 9)
}
compute_importance_weights <- function(beta, data) {
log_posterior_val <- log_posterior(beta, data)
return(exp(log_posterior_val))
}
importance_sampling <- function(data, N, mode) {
proposal_samples <- proposal_distribution(N, mode)
weights <- sapply(1:N, function(i) compute_importance_weights(proposal_samples[i, ], data))
if (any(is.na(weights)) | any(is.nan(weights))) {
weights[is.na(weights) | is.nan(weights)] <- 1e-10
}
normalized_weights <- weights / sum(weights)
resample_indices <- sample(1:N, num_resamples, replace = TRUE, prob = normalized_weights)
resampled_beta <- proposal_samples[resample_indices, ]
posterior_means <- colSums(proposal_samples * normalized_weights)
ESS <- 1 / sum(normalized_weights^2)
return(list(posterior_means = posterior_means, ESS = ESS, resampled_beta = resampled_beta))
}
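# Hedged sketch (a hypothetical refinement, not used above): a Laplace-style proposal
# that also uses the curvature at the mode. optim(..., hessian = TRUE) returns the
# Hessian of the negative log-posterior; its inverse approximates the posterior
# covariance and can be used with MASS::mvrnorm, assuming the Hessian is positive
# definite. The importance weights would then also need the mvrnorm proposal density
# in the denominator.
laplace_proposal <- function(N, data) {
fit <- optim(rep(0, 9), function(beta) -log_posterior(beta, data), method = "BFGS", hessian = TRUE)
cov_approx <- solve(fit$hessian) # approximate posterior covariance at the mode
MASS::mvrnorm(N, mu = fit$par, Sigma = cov_approx)
}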
sampling_results <- lapply(simulated_data, function(data) {
mode <- optim_posterior_mode(data)
importance_sampling(data, num_samples, mode)
})
posterior_means_results <- lapply(sampling_results, function(result) result$posterior_means)
ESS_results <- lapply(sampling_results, function(result) result$ESS)
print("Posterior means for each sample size:")
## [1] "Posterior means for each sample size:"
print(posterior_means_results)
## [[1]]
## [1] 0.38652673 0.76954264 -0.65121677 0.07690843 -0.76312053 0.23692439
## [7] -0.03867812 -0.35960924 -0.82535648
##
## [[2]]
## [1] -0.3538192 1.4441378 -0.3216473 1.3619656 -0.8968057 0.1372263 -1.2579517
## [8] -0.1657336 -1.3022357
##
## [[3]]
## [1] -0.1803936 1.2135520 -0.9712732 0.1272283 -0.2980107 0.2428141 -0.7100506
## [8] 0.7004482 -0.9250665
print("Effective Sample Size (ESS) for each sample size:")
## [1] "Effective Sample Size (ESS) for each sample size:"
print(ESS_results)
## [[1]]
## [1] 76.51595
##
## [[2]]
## [1] 4.530305
##
## [[3]]
## [1] 1.748608
# Set up a 3x3 plot grid for the histograms
par(mfrow = c(3, 3))
# Loop through the results and plot histograms
for (i in 1:length(sample_sizes)) {
resampled_betas <- sampling_results[[i]]$resampled_beta
# Plot β0
hist(resampled_betas[, 1], main = paste("β0 (N =", sample_sizes[i], ")"),
xlab = "β0", col = "yellow", breaks = 30, xlim = c(-3, 3))
# Plot β1
hist(resampled_betas[, 2], main = paste("β1 (N =", sample_sizes[i], ")"),
xlab = "β1", col = "lightblue", breaks = 30, xlim = c(-3, 3))
# Plot β2
hist(resampled_betas[, 3], main = paste("β2 (N =", sample_sizes[i], ")"),
xlab = "β2", col = "green", breaks = 30, xlim = c(-3, 3))
# Plot additional betas
for (j in 1:6) {
hist(resampled_betas[, 3 + j], main = paste("β_add", j, "(N =", sample_sizes[i], ")"),
xlab = paste("β_add", j, sep = ""), col = rainbow(6)[j], breaks = 30, xlim = c(-3, 3))
}
}
glm_models <- lapply(simulated_data, function(data) glm(y ~ x1 + x2 + x_add_1 + x_add_2 + x_add_3 + x_add_4 + x_add_5 + x_add_6, data = data, family = binomial))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_coefficients <- lapply(glm_models, function(model) coef(model))
print("GLM coefficients:")
## [1] "GLM coefficients:"
print(glm_coefficients)
## [[1]]
## (Intercept) x1 x2 x_add_1 x_add_2 x_add_3
## 6.433204 14.798308 -1.828293 -12.216024 -10.451806 7.680328
## x_add_4 x_add_5 x_add_6
## -7.121071 -20.632866 -34.417421
##
## [[2]]
## (Intercept) x1 x2 x_add_1 x_add_2 x_add_3
## -0.3287673 1.9150312 -0.6886903 1.6157019 -0.7749457 0.2304285
## x_add_4 x_add_5 x_add_6
## -0.9472955 -0.0231685 -0.8970437
##
## [[3]]
## (Intercept) x1 x2 x_add_1 x_add_2 x_add_3
## -0.2661212 0.9181552 -1.2082422 0.7910228 -0.4107330 0.3579283
## x_add_4 x_add_5 x_add_6
## -1.1341144 0.4847570 -0.8623633
if (!require("mclust")) install.packages("mclust", dependencies = TRUE)
## Loading required package: mclust
## Warning: package 'mclust' was built under R version 4.4.2
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
library(mclust)
# Generate Synthetic Data for GMM
set.seed(170)
# Define parameters for the mixture model
mu1 <- c(0, 0) # Mean of the first Gaussian component
mu2 <- c(5, 5) # Mean of the second Gaussian component
mu3 <- c(-5, 5) # Mean of the third Gaussian component
# Define covariance matrices for the components
sigma1 <- matrix(c(1, 0.5, 0.5, 1), ncol = 2)
sigma2 <- matrix(c(1, -0.7, -0.7, 1), ncol = 2)
sigma3 <- matrix(c(1, 0, 0, 1), ncol = 2)
# Sample size for each cluster
n1 <- 300
n2 <- 300
n3 <- 300
cluster1 <- MASS::mvrnorm(n1, mu1, sigma1)
cluster2 <- MASS::mvrnorm(n2, mu2, sigma2)
cluster3 <- MASS::mvrnorm(n3, mu3, sigma3)
data <- rbind(cluster1, cluster2, cluster3)
colnames(data) <- c("x", "y")
gmm_model <- Mclust(data, G = 3) # G = 3 for 3 components
plot(data, pch = 16, xlab = "X", ylab = "Y",
main = "GMM Clustering Results", cex = 0.5,
col = gmm_model$classification) # Coloring the points based on cluster assignment
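# Hedged alternative: mclust's own plot method draws the classification together with the
# fitted component ellipses, which can be easier to read than a manual scatter plot.
plot(gmm_model, what = "classification")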
cat("Means of each component:\n")
## Means of each component:
print(gmm_model$parameters$mean)
## [,1] [,2] [,3]
## x 0.03164929 4.922467 -5.082003
## y -0.03773969 5.032440 5.032510
cat("Covariances of each component:\n")
## Covariances of each component:
print(gmm_model$parameters$variance$sigma)
## , , 1
##
## x y
## x 0.9679200 0.4498477
## y 0.4498477 1.0148943
##
## , , 2
##
## x y
## x 1.253173 -0.826058
## y -0.826058 1.166914
##
## , , 3
##
## x y
## x 0.88204370 0.05823425
## y 0.05823425 0.88812469
cat("Mixing weights for each component:\n")
## Mixing weights for each component:
print(gmm_model$parameters$pro)
## [1] 0.3333349 0.3333319 0.3333333
# Predict clusters for new data points
new_data <- matrix(c(0, 0, 5, 5, -5, 5), ncol = 2) # New points (0, 5), (0, -5), (5, 5); matrix() fills column-wise
predictions <- predict(gmm_model, newdata = new_data)
cat("Predictions for new data points:\n")
## Predictions for new data points:
print(predictions$classification)
## [1] 3 1 2
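# Hedged sketch (not part of the original analysis): instead of fixing G = 3, Mclust can
# compare several numbers of components and covariance structures and pick the best by BIC.
gmm_bic <- Mclust(data, G = 1:5)
summary(gmm_bic)
plot(gmm_bic, what = "BIC")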