rm(list = ls())
library(ggplot2)
#Question 6
# Set parameters for the simulation
N <- 10               # Sample size 
num_iterations <- 100  # Number of t-statistics to generate 
population_mean <- 3.5 # Expected value for a fair die

# Create a vector to store t-statistics across all iterations
t_statistics <- numeric(num_iterations)

# Function to calculate the t-statistic for a given sample
# Arguments:
#  - sample: the random sample drawn from the population 
#  - population_mean: the expected value of the population
calculate_t_statistic <- function(sample, population_mean) {
  sample_mean <- mean(sample)            # Calculate sample mean
  sample_sd <- stats::sd(sample)         # Calculate sample standard deviation
  t_stat <- (sample_mean - population_mean) / (sample_sd / sqrt(length(sample)))  # Calculate t-statistic
  return(t_stat)                         # Return the calculated t-statistic
}

# Loop to generate random samples and calculate t-statistics
for (i in 1:num_iterations) {
  sample <- sample(1:6, N, replace = TRUE)  # Generate a random sample of N=10 from a fair die
  t_statistics[i] <- calculate_t_statistic(sample, population_mean)  # Store the t-statistic for this sample
}

# Print the t-statistics to observe the results
print(t_statistics)
##   [1]  0.0000000 -0.1617492  0.7929823  0.5454545  3.0317469 -0.9302605
##   [7]  0.0000000  0.1617492 -1.9166297 -0.1846372  0.6784005 -0.6246950
##  [13] -1.9904664 -1.5829346 -1.5000000 -0.2857143  0.0000000 -1.3125000
##  [19]  0.6246950  0.9230769  0.0000000 -0.5417363 -0.5062896  1.1696884
##  [25] -2.1083458 -1.0974794 -0.5052912  0.0000000  0.7058824  2.0952909
##  [31] -1.3833370  0.7316529 -0.7717436 -0.8385255  0.7905694  0.5052912
##  [37]  1.4746175 -0.8385255 -0.1780172 -0.2004459  0.3579300  1.2640515
##  [43] -2.3104621 -0.1920553 -0.7058824 -0.9923721 -1.3669836  0.6784005
##  [49]  0.1572427  1.7284979 -0.2924221 -2.1126085  1.1813423 -1.2677314
##  [55] -0.7694838  1.6770510  1.1180340 -0.7058824 -0.6246950  1.0974794
##  [61]  0.9682458 -1.4746175  0.3157895  0.5858501 -0.7717436 -0.2996257
##  [67]  0.0000000 -4.4736068  0.0000000  0.3864940 -0.1780172  0.4232074
##  [73] -0.3249182  0.8728716 -0.4036037  0.7604691 -2.6832816  0.0000000
##  [79] -1.1180340 -1.4288690  0.4232074  0.7929823  1.6928295 -0.1846372
##  [85] -0.6123724  0.0000000  0.2004459 -0.3348873  1.2677314  0.3249182
##  [91]  0.3249182 -0.3579300  0.5454545 -0.1492556  1.2996729 -0.6615814
##  [97]  0.5231144  0.6428571 -0.8728716 -0.2004459
# Question 7
# Plot a histogram to visualize the distribution of t-statistics
hist(t_statistics, breaks = 20, main = "Histogram of t-statistics", xlab = "t-statistic", col = "lightblue")

#  Question 8 

# Now we will repeat the process for larger sample sizes: 50, 100, 200, 500
sample_sizes <- c(50, 100, 200, 500)  # Different sample sizes to analyze
num_iterations <- 100                 # Number of t-statistics to generate per sample size
population_mean <- 3.5                # Population mean for a fair die

# Function to calculate t-statistics for different sample sizes
calculate_t_statistics <- function(N, num_iterations, population_mean) {
  t_statistics <- numeric(num_iterations)  # Initialize vector to store t-statistics
  
  # Loop to generate samples and calculate t-statistics
  for (i in 1:num_iterations) {
    sample <- sample(1:6, N, replace = TRUE)  # Generate a sample of size N
    sample_mean <- mean(sample)               # Calculate the sample mean
    sample_sd <- sd(sample)                   # Calculate the sample standard deviation
    t_statistics[i] <- (sample_mean - population_mean) / (sample_sd / sqrt(length(sample)))  # Calculate t-statistic
  }
  
  return(t_statistics)  # Return the t-statistics
}

# Store all t-statistics for different sample sizes in a list
all_t_statistics <- list()

# Generate and store t-statistics for each sample size
for (N in sample_sizes) {
  all_t_statistics[[as.character(N)]] <- calculate_t_statistics(N, num_iterations, population_mean)
}

# Prepare data for plotting cumulative density functions
plot_data <- data.frame()

# Convert the list of t-statistics into a data frame for ggplot
for (N in sample_sizes) {
  t_stats <- all_t_statistics[[as.character(N)]]
  plot_data <- rbind(plot_data, data.frame(t_stat = t_stats, sample_size = N))  # Combine data into one frame
}

# Plot the cumulative density of t-statistics for each sample size
ggplot(plot_data, aes(x = t_stat, color = factor(sample_size))) +
  geom_density(aes(y = ..scaled..), alpha = 0.5) +  # Density plot for each sample size
  stat_function(fun = dnorm, args = list(mean = 0, sd = 1), linetype = "dashed", color = "black") +  # Standard normal curve
  labs(title = "Density of t-Statistics for Different Sample Sizes",
       x = "t-Statistic",
       y = "Density",
       color = "Sample Size") +
  theme_minimal()
## Warning: The dot-dot notation (`..scaled..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(scaled)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.