library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

# Comparison table: one row per feature, one column per statistical test.
# Each vector below is a full column; position i in every vector describes the
# same feature (feature_names[i]).
feature_names <- c(
  "Test Type",
  "Purpose",
  "Distribution Requirements",
  "Sample Size",
  "Test Object",
  "Outlier Sensitivity",
  "Test Statistic",
  "Advantages",
  "Disadvantages"
)
wilcoxon_column <- c(
  "Non-parametric",
  "Compare distributions of two independent samples",
  "No specific distribution required",
  "Suitable for small samples",
  "Ranks of data",
  "Low",
  "W statistic",
  "Wide applicability,\nrobust to outliers",
  "Lower power than parametric tests"
)
t_test_column <- c(
  "Parametric",
  "Compare means of two independent samples",
  "Requires normal distribution",
  "Better for large samples\n(n > 30)",
  "Original data means",
  "High",
  "t statistic",
  "High statistical power,\neasy to interpret",
  "Strict normality assumption"
)
f_test_column <- c(
  "Parametric",
  "Compare variances of two samples",
  "Requires normal distribution",
  "No strict requirements",
  "Original data variances",
  "High",
  "F statistic",
  "Essential for variance homogeneity testing",
  "Limited to variance comparison"
)
comparison_df <- tibble(
  Feature = feature_names,
  Wilcoxon = wilcoxon_column,
  TTest = t_test_column,
  FTest = f_test_column
)
# Show the feature labels next to the Wilcoxon column (columns 1 and 2)
print(comparison_df[, c("Feature", "Wilcoxon")])
## # A tibble: 9 × 2
##   Feature                   Wilcoxon                                          
##   <chr>                     <chr>                                             
## 1 Test Type                 "Non-parametric"                                  
## 2 Purpose                   "Compare distributions of two independent samples"
## 3 Distribution Requirements "No specific distribution required"               
## 4 Sample Size               "Suitable for small samples"                      
## 5 Test Object               "Ranks of data"                                   
## 6 Outlier Sensitivity       "Low"                                             
## 7 Test Statistic            "W statistic"                                     
## 8 Advantages                "Wide applicability,\nrobust to outliers"         
## 9 Disadvantages             "Lower power than parametric tests"
# Show the feature labels next to the t-test column (columns 1 and 3)
print(comparison_df[, c("Feature", "TTest")])
## # A tibble: 9 × 2
##   Feature                   TTest                                       
##   <chr>                     <chr>                                       
## 1 Test Type                 "Parametric"                                
## 2 Purpose                   "Compare means of two independent samples"  
## 3 Distribution Requirements "Requires normal distribution"              
## 4 Sample Size               "Better for large samples\n(n > 30)"        
## 5 Test Object               "Original data means"                       
## 6 Outlier Sensitivity       "High"                                      
## 7 Test Statistic            "t statistic"                               
## 8 Advantages                "High statistical power,\neasy to interpret"
## 9 Disadvantages             "Strict normality assumption"
# Show the feature labels next to the F-test column (columns 1 and 4)
print(comparison_df[, c("Feature", "FTest")])
## # A tibble: 9 × 2
##   Feature                   FTest                                     
##   <chr>                     <chr>                                     
## 1 Test Type                 Parametric                                
## 2 Purpose                   Compare variances of two samples          
## 3 Distribution Requirements Requires normal distribution              
## 4 Sample Size               No strict requirements                    
## 5 Test Object               Original data variances                   
## 6 Outlier Sensitivity       High                                      
## 7 Test Statistic            F statistic                               
## 8 Advantages                Essential for variance homogeneity testing
## 9 Disadvantages             Limited to variance comparison
###########################################################
# Practical Examples
# Make the minimal ggplot2 theme the session default so every plot built
# afterwards shares the same look.
ggplot2::theme_set(ggplot2::theme_minimal())

# Fix the RNG seed so every run draws exactly the same samples
set.seed(123)

# Number of observations per group; large so the density curves come out smooth
n_samples <- 1000

# NOTE: each call below advances the RNG stream, so the order of the four
# draws must not change if results are to stay reproducible.

# Normally distributed samples (inputs to the t-test and the F-test)
group1_normal <- rnorm(n = n_samples, mean = 10, sd = 2)
group2_normal <- rnorm(n = n_samples, mean = 12, sd = 2)

# Right-skewed, log-normal samples (inputs to the Wilcoxon test)
group1_skewed <- rlnorm(n = n_samples, meanlog = 2, sdlog = 0.5)
group2_skewed <- rlnorm(n = n_samples, meanlog = 2.3, sdlog = 0.5)

# Long-format data frames for plotting: all of group 1 followed by all of
# group 2, with a factor column recording group membership.
group_column <- factor(rep(c("Group 1", "Group 2"), each = n_samples))

df_normal <- data.frame(
  value = c(group1_normal, group2_normal),
  group = group_column
)

df_skewed <- data.frame(
  value = c(group1_skewed, group2_skewed),
  group = group_column
)
# Create Distribution Plots
# 1. Wilcoxon Test Data Distribution
# Per-group means of the skewed samples; drawn as dashed vertical reference
# lines on top of the density curves.
skewed_group_means <- df_skewed %>%
  group_by(group) %>%
  summarize(mean_val = mean(value))

# Overlaid, semi-transparent density curves for the two skewed groups.
# The global theme was already set once near the top of the script, so it is
# not set again here.
wilcox_dist <- ggplot(df_skewed, aes(x = value, fill = group)) +
  geom_density(alpha = 0.5) +
  # Add mean lines
  geom_vline(
    data = skewed_group_means,
    aes(xintercept = mean_val, color = group),
    linetype = "dashed"
  ) +
  scale_fill_manual(values = c("#FF9999", "#66B2FF")) +
  scale_color_manual(values = c("#FF0000", "#0066CC")) +
  labs(
    title = "Skewed Data Distribution",
    subtitle = "Wilcoxon Rank-Sum Test Example",
    x = "Value",
    y = "Density"
  ) +
  theme(legend.position = "bottom")
wilcox_dist

# 2. T-Test Data Distribution
# Group-wise means of the normal samples, shown as dashed vertical markers
normal_group_means <- summarize(
  group_by(df_normal, group),
  mean_val = mean(value)
)

# Semi-transparent density curve per group with its mean marked
t_test_dist <- ggplot(data = df_normal, mapping = aes(x = value, fill = group)) +
  geom_density(alpha = 0.5) +
  geom_vline(
    data = normal_group_means,
    mapping = aes(xintercept = mean_val, color = group),
    linetype = "dashed"
  ) +
  scale_fill_manual(values = c("#99FF99", "#FF99FF")) +
  scale_color_manual(values = c("#00CC00", "#CC00CC")) +
  labs(
    title = "Normal Data Distribution",
    subtitle = "t-Test Example",
    x = "Value",
    y = "Density"
  ) +
  theme(legend.position = "bottom")
t_test_dist

# 3. F-Test Data Distribution with Variance Comparison
# Per-group mean minus/plus one sample standard deviation. Computed once and
# shared by both dotted-line layers below, instead of repeating the same
# group_by/summarize pipeline for each layer.
normal_sd_bounds <- df_normal %>%
  group_by(group) %>%
  summarize(
    mean_val = mean(value),
    sd_minus = mean_val - sd(value),
    sd_plus = mean_val + sd(value)
  )

# Density curve per group with dotted lines at mean - SD and mean + SD, so the
# spread of the two groups can be compared visually.
f_test_dist <- ggplot(df_normal, aes(x = value, fill = group)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("#FFB366", "#66FFB3")) +
  labs(title = "Variance Comparison",
       subtitle = "F-Test Example",
       x = "Value",
       y = "Density") +
  theme(legend.position = "bottom") +
  # Add SD ranges: lower bound (mean - SD) for each group
  geom_vline(data = normal_sd_bounds,
             aes(xintercept = sd_minus, color = group),
             linetype = "dotted") +
  # Upper bound (mean + SD) for each group
  geom_vline(data = normal_sd_bounds,
             aes(xintercept = sd_plus, color = group),
             linetype = "dotted") +
  scale_color_manual(values = c("#CC6600", "#00CC66"))
f_test_dist

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Lay the three density plots out side by side on one page (three columns)
distribution_plots <- list(wilcox_dist, t_test_dist, f_test_dist)
do.call(grid.arrange, c(distribution_plots, ncol = 3))

# Statistical Tests
# All three are two-sided, two-sample tests run with their default settings;
# the defaults that matter are spelled out for the reader.

# 1. Wilcoxon Rank-Sum Test on the skewed (log-normal) samples
wilcox_result <- wilcox.test(
  x = group1_skewed,
  y = group2_skewed,
  paired = FALSE
)

# 2. Independent t-Test on the normal samples
#    (var.equal = FALSE selects the Welch variant)
t_result <- t.test(
  x = group1_normal,
  y = group2_normal,
  paired = FALSE,
  var.equal = FALSE
)

# 3. F-Test for equality of the two normal samples' variances (ratio = 1)
f_result <- var.test(
  x = group1_normal,
  y = group2_normal,
  ratio = 1
)

# Create summary statistics function
#' Summarise the distribution of a numeric sample
#'
#' Skewness and excess kurtosis are computed as the third / fourth central
#' moment (divisor `n`) divided by the matching power of the sample standard
#' deviation (`sd()`, divisor `n - 1`).
#'
#' @param data A numeric vector.
#' @param na.rm If `TRUE`, missing values are dropped before anything is
#'   computed, so `n` counts only the non-missing observations. With the
#'   default `FALSE`, any `NA` in `data` propagates into the statistics.
#' @return A named numeric vector with elements `n`, `mean`, `median`, `sd`,
#'   `var`, `skewness` and `kurtosis` (excess kurtosis, i.e. minus 3).
get_distribution_stats <- function(data, na.rm = FALSE) {
  if (na.rm) {
    data <- data[!is.na(data)]
  }

  # Compute the centre and spread once; both are reused several times below.
  center <- mean(data)
  spread <- sd(data)
  deviations <- data - center

  c(
    n = length(data),
    mean = center,
    median = median(data),
    sd = spread,
    var = var(data),
    skewness = mean(deviations^3) / spread^3,
    kurtosis = mean(deviations^4) / spread^4 - 3
  )
}

# Print comprehensive results
cat("\n=== Distribution Statistics ===\n")

# Section headers and the samples they describe, paired by position
stats_headers <- c(
  "\nNormal Distribution - Group 1:\n",
  "\nNormal Distribution - Group 2:\n",
  "\nSkewed Distribution - Group 1:\n",
  "\nSkewed Distribution - Group 2:\n"
)
stats_samples <- list(
  group1_normal,
  group2_normal,
  group1_skewed,
  group2_skewed
)

# For each sample: print its header, then its summary rounded to 4 decimals
for (idx in seq_along(stats_headers)) {
  cat(stats_headers[[idx]])
  print(round(get_distribution_stats(stats_samples[[idx]]), 4))
}
## 
## === Distribution Statistics ===
## 
## Normal Distribution - Group 1:
##         n      mean    median        sd       var  skewness  kurtosis 
## 1000.0000   10.0323   10.0184    1.9834    3.9338    0.0652   -0.0801
## 
## Normal Distribution - Group 2:
##         n      mean    median        sd       var  skewness  kurtosis 
## 1000.0000   12.0849   12.1097    2.0193    4.0778   -0.0105   -0.0719
## 
## Skewed Distribution - Group 1:
##         n      mean    median        sd       var  skewness  kurtosis 
## 1000.0000    8.2518    7.2046    4.3540   18.9572    1.8115    6.0230
## 
## Skewed Distribution - Group 2:
##         n      mean    median        sd       var  skewness  kurtosis 
## 1000.0000   11.2104    9.9334    5.7786   33.3921    1.4369    2.9838
# Print test results
cat("\n=== Statistical Test Results ===\n")

# Numbered section headers and the fitted test objects, paired by position
test_headers <- c(
  "\n1. Wilcoxon Rank-Sum Test:\n",
  "\n2. Independent t-Test:\n",
  "\n3. F-Test:\n"
)
test_results <- list(wilcox_result, t_result, f_result)

# Print each header followed by the standard report for that test
for (idx in seq_along(test_headers)) {
  cat(test_headers[[idx]])
  print(test_results[[idx]])
}
## 
## === Statistical Test Results ===
## 
## 1. Wilcoxon Rank-Sum Test:
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  group1_skewed and group2_skewed
## W = 327631, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## 2. Independent t-Test:
## 
##  Welch Two Sample t-test
## 
## data:  group1_normal and group2_normal
## t = -22.933, df = 1997.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.228213 -1.877137
## sample estimates:
## mean of x mean of y 
##  10.03226  12.08493
## 
## 3. F-Test:
## 
##  F test to compare two variances
## 
## data:  group1_normal and group2_normal
## F = 0.9647, num df = 999, denom df = 999, p-value = 0.5702
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.8521211 1.0921596
## sample estimates:
## ratio of variances 
##          0.9647032
# Distribution Analysis
# Test for normality
cat("\n=== Normality Tests (Shapiro-Wilk) ===\n")
## 
## === Normality Tests (Shapiro-Wilk) ===
# Run the Shapiro-Wilk test exactly once per sample and keep the result
# objects, rather than re-running the test for every extracted field.
# The list is deliberately unnamed and USE.NAMES = FALSE is set below so the
# extracted columns carry no names and the data frame keeps its default
# integer row names.
shapiro_results <- lapply(
  list(group1_normal, group2_normal, group1_skewed, group2_skewed),
  shapiro.test
)

# One row per sample: label, W statistic and p-value of its normality test
normality_tests <- data.frame(
  Group = c("Normal Group 1", "Normal Group 2", "Skewed Group 1", "Skewed Group 2"),
  W_statistic = vapply(
    shapiro_results,
    function(res) res$statistic[[1]],
    numeric(1),
    USE.NAMES = FALSE
  ),
  p_value = vapply(
    shapiro_results,
    function(res) res$p.value,
    numeric(1),
    USE.NAMES = FALSE
  )
)
print(normality_tests)
##            Group W_statistic      p_value
## 1 Normal Group 1   0.9983762 4.764686e-01
## 2 Normal Group 2   0.9986157 6.316108e-01
## 3 Skewed Group 1   0.8733212 1.013865e-27
## 4 Skewed Group 2   0.9004864 5.870318e-25