# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Absolute path to the ab_samp.csv input file
file_path = "C:/Users/Nandan Hegde/OneDrive/Documents/MSU_Grad_Studies/STT810_class_assignments/ICA10/ab_samp.csv"
# Read the CSV into a data frame, then preview its first six rows
main_df = read.csv(file = file_path)
head(main_df, n = 6L)
## X sample data
## 1 1 1 10.169631
## 2 2 1 11.739846
## 3 3 1 11.227970
## 4 4 1 9.513133
## 5 5 1 9.177380
## 6 6 1 9.069717
# Observed difference in group means (sample 1 minus sample 2).
# The group masks are computed once and reused for every shuffle below:
# shuffling only permutes `data`, so the `sample` labels never change.
is_group_1 = main_df$sample == 1
is_group_2 = main_df$sample == 2
actual_diff = mean(main_df$data[is_group_1]) - mean(main_df$data[is_group_2])

# Target shuffling (permutation) simulation: repeatedly permute the data
# values across the fixed group labels and record the difference in means.
# No seed is set here, so results vary slightly from run to run.
n_simulations = 10000
simulated_diffs = numeric(n_simulations)
for (i in seq_len(n_simulations)) {
  # One random permutation of the data values per iteration
  shuffled = sample(main_df$data)
  simulated_diffs[i] = mean(shuffled[is_group_1]) - mean(shuffled[is_group_2])
}

# Two-sided p-value: share of shuffled differences whose magnitude is at
# least as large as the observed one
p_value = mean(abs(simulated_diffs) >= abs(actual_diff))

# Central 95% range (2.5% and 97.5% quantiles) of the shuffled differences.
# NOTE: this describes the simulated null distribution; despite the name it
# is not a confidence interval for actual_diff.
CI = quantile(simulated_diffs, c(0.025, 0.975))
# Report the results, each rounded to four decimal places.
# Observed difference in means:
diff_rounded = round(actual_diff, digits = 4)
print(paste("Actual difference in means:", diff_rounded))
## [1] "Actual difference in means: 1.0381"
# P-value from the shuffling simulation:
p_rounded = round(p_value, digits = 4)
print(paste("P-value from target shuffling:", p_rounded))
## [1] "P-value from target shuffling: 0.0117"
# Lower and upper bounds held in CI (quantiles of the shuffled differences):
ci_bounds = unname(round(CI, digits = 4))
print(paste("95% CI from simulations:", ci_bounds[1], "to", ci_bounds[2]))
## [1] "95% CI from simulations: -0.8158 to 0.8325"