# Load necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Location of the A/B sample CSV.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run as-is on a machine where this exact file exists.
file_path <- "C:/Users/Nandan Hegde/OneDrive/Documents/MSU_Grad_Studies/STT810_class_assignments/ICA10/ab_samp.csv"

# Read the CSV into a data frame and preview its first six rows.
main_df <- read.csv(file = file_path)
head(main_df, n = 6L)
##   X sample      data
## 1 1      1 10.169631
## 2 2      1 11.739846
## 3 3      1 11.227970
## 4 4      1  9.513133
## 5 5      1  9.177380
## 6 6      1  9.069717
# Permutation ("target shuffling") test for the difference in group means.
#
# Reads:  main_df$sample (group label, compared against 1 and 2) and
#         main_df$data (the measured values).
# Writes: actual_diff, n_simulations, simulated_diffs, p_value, CI.
#
# NOTE: no seed is set in this section, so the simulated values (and hence
# p_value and CI) vary slightly from run to run.

# Group membership never changes while shuffling (only the data values are
# permuted), so compute the two masks once instead of on every iteration.
in_group_1 <- main_df$sample == 1
in_group_2 <- main_df$sample == 2

# Observed difference in means: group 1 minus group 2.
actual_diff <- mean(main_df$data[in_group_1]) - mean(main_df$data[in_group_2])

# Target shuffling simulation
n_simulations <- 10000
simulated_diffs <- numeric(n_simulations)

for (i in seq_len(n_simulations)) {
  # Randomly reassign the observed values to the fixed group labels; this
  # breaks any real association between group and value.
  shuffled_values <- sample(main_df$data)

  # Difference in means under this random reassignment.
  simulated_diffs[i] <- mean(shuffled_values[in_group_1]) -
    mean(shuffled_values[in_group_2])
}

# Two-sided p-value: share of shuffled differences at least as large in
# absolute value as the observed one.
p_value <- mean(abs(simulated_diffs) >= abs(actual_diff))

# Central 95% range (2.5% and 97.5% quantiles) of the shuffled differences.
# This describes the null distribution, i.e. what differences arise by chance
# alone; it is not a confidence interval around actual_diff.
CI <- quantile(simulated_diffs, c(0.025, 0.975))

# Report the observed difference in means, rounded to 4 decimals.
diff_msg <- paste("Actual difference in means:", round(actual_diff, 4))
print(diff_msg)
## [1] "Actual difference in means: 1.0381"

# Report the shuffling-based p-value.
p_msg <- paste("P-value from target shuffling:", round(p_value, 4))
print(p_msg)
## [1] "P-value from target shuffling: 0.0117"

# Report the 2.5% and 97.5% quantiles of the simulated differences.
ci_bounds <- round(CI, 4)
ci_msg <- paste("95% CI from simulations:", ci_bounds[1], "to", ci_bounds[2])
print(ci_msg)
## [1] "95% CI from simulations: -0.8158 to 0.8325"