rm(list = ls())
library(ggplot2)
#Question 6
# Set parameters for the simulation
N <- 10 # Sample size
num_iterations <- 100 # Number of t-statistics to generate
population_mean <- 3.5 # Expected value for a fair die
# Create a vector to store t-statistics across all iterations
t_statistics <- numeric(num_iterations)
# Function to calculate the t-statistic for a given sample
# Arguments:
# - sample: the random sample drawn from the population
# - population_mean: the expected value of the population
calculate_t_statistic <- function(sample, population_mean) {
sample_mean <- mean(sample) # Calculate sample mean
sample_sd <- stats::sd(sample) # Calculate sample standard deviation
t_stat <- (sample_mean - population_mean) / (sample_sd / sqrt(length(sample))) # Calculate t-statistic
return(t_stat) # Return the calculated t-statistic
}
# Loop to generate random samples and calculate t-statistics
for (i in 1:num_iterations) {
sample <- sample(1:6, N, replace = TRUE) # Generate a random sample of N=10 from a fair die
t_statistics[i] <- calculate_t_statistic(sample, population_mean) # Store the t-statistic for this sample
}
# Print the t-statistics to observe the results
print(t_statistics)
## [1] 0.0000000 -0.1617492 0.7929823 0.5454545 3.0317469 -0.9302605
## [7] 0.0000000 0.1617492 -1.9166297 -0.1846372 0.6784005 -0.6246950
## [13] -1.9904664 -1.5829346 -1.5000000 -0.2857143 0.0000000 -1.3125000
## [19] 0.6246950 0.9230769 0.0000000 -0.5417363 -0.5062896 1.1696884
## [25] -2.1083458 -1.0974794 -0.5052912 0.0000000 0.7058824 2.0952909
## [31] -1.3833370 0.7316529 -0.7717436 -0.8385255 0.7905694 0.5052912
## [37] 1.4746175 -0.8385255 -0.1780172 -0.2004459 0.3579300 1.2640515
## [43] -2.3104621 -0.1920553 -0.7058824 -0.9923721 -1.3669836 0.6784005
## [49] 0.1572427 1.7284979 -0.2924221 -2.1126085 1.1813423 -1.2677314
## [55] -0.7694838 1.6770510 1.1180340 -0.7058824 -0.6246950 1.0974794
## [61] 0.9682458 -1.4746175 0.3157895 0.5858501 -0.7717436 -0.2996257
## [67] 0.0000000 -4.4736068 0.0000000 0.3864940 -0.1780172 0.4232074
## [73] -0.3249182 0.8728716 -0.4036037 0.7604691 -2.6832816 0.0000000
## [79] -1.1180340 -1.4288690 0.4232074 0.7929823 1.6928295 -0.1846372
## [85] -0.6123724 0.0000000 0.2004459 -0.3348873 1.2677314 0.3249182
## [91] 0.3249182 -0.3579300 0.5454545 -0.1492556 1.2996729 -0.6615814
## [97] 0.5231144 0.6428571 -0.8728716 -0.2004459
# Question 7
# Plot a histogram to visualize the distribution of t-statistics
hist(t_statistics, breaks = 20, main = "Histogram of t-statistics", xlab = "t-statistic", col = "lightblue")

# Question 8
# Now we will repeat the process for larger sample sizes: 50, 100, 200, 500
sample_sizes <- c(50, 100, 200, 500) # Different sample sizes to analyze
num_iterations <- 100 # Number of t-statistics to generate per sample size
population_mean <- 3.5 # Population mean for a fair die
# Function to calculate t-statistics for different sample sizes
calculate_t_statistics <- function(N, num_iterations, population_mean) {
t_statistics <- numeric(num_iterations) # Initialize vector to store t-statistics
# Loop to generate samples and calculate t-statistics
for (i in 1:num_iterations) {
sample <- sample(1:6, N, replace = TRUE) # Generate a sample of size N
sample_mean <- mean(sample) # Calculate the sample mean
sample_sd <- sd(sample) # Calculate the sample standard deviation
t_statistics[i] <- (sample_mean - population_mean) / (sample_sd / sqrt(length(sample))) # Calculate t-statistic
}
return(t_statistics) # Return the t-statistics
}
# Store all t-statistics for different sample sizes in a list
all_t_statistics <- list()
# Generate and store t-statistics for each sample size
for (N in sample_sizes) {
all_t_statistics[[as.character(N)]] <- calculate_t_statistics(N, num_iterations, population_mean)
}
# Prepare data for plotting cumulative density functions
plot_data <- data.frame()
# Convert the list of t-statistics into a data frame for ggplot
for (N in sample_sizes) {
t_stats <- all_t_statistics[[as.character(N)]]
plot_data <- rbind(plot_data, data.frame(t_stat = t_stats, sample_size = N)) # Combine data into one frame
}
# Plot the cumulative density of t-statistics for each sample size
ggplot(plot_data, aes(x = t_stat, color = factor(sample_size))) +
geom_density(aes(y = ..scaled..), alpha = 0.5) + # Density plot for each sample size
stat_function(fun = dnorm, args = list(mean = 0, sd = 1), linetype = "dashed", color = "black") + # Standard normal curve
labs(title = "Density of t-Statistics for Different Sample Sizes",
x = "t-Statistic",
y = "Density",
color = "Sample Size") +
theme_minimal()
## Warning: The dot-dot notation (`..scaled..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(scaled)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
