library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Comparison table: one row per feature, one column per test.
# A "\n" inside a cell embeds a newline in that cell's text.
comparison_df <- tribble(
  ~Feature, ~Wilcoxon, ~TTest, ~FTest,

  "Test Type",
  "Non-parametric",
  "Parametric",
  "Parametric",

  "Purpose",
  "Compare distributions of two independent samples",
  "Compare means of two independent samples",
  "Compare variances of two samples",

  "Distribution Requirements",
  "No specific distribution required",
  "Requires normal distribution",
  "Requires normal distribution",

  "Sample Size",
  "Suitable for small samples",
  "Better for large samples\n(n > 30)",
  "No strict requirements",

  "Test Object",
  "Ranks of data",
  "Original data means",
  "Original data variances",

  "Outlier Sensitivity",
  "Low",
  "High",
  "High",

  "Test Statistic",
  "W statistic",
  "t statistic",
  "F statistic",

  "Advantages",
  "Wide applicability,\nrobust to outliers",
  "High statistical power,\neasy to interpret",
  "Essential for variance homogeneity testing",

  "Disadvantages",
  "Lower power than parametric tests",
  "Strict normality assumption",
  "Limited to variance comparison"
)
# Show the Feature column next to the Wilcoxon column only
print(comparison_df[, c("Feature", "Wilcoxon")])
## # A tibble: 9 × 2
## Feature Wilcoxon
## <chr> <chr>
## 1 Test Type "Non-parametric"
## 2 Purpose "Compare distributions of two independent samples"
## 3 Distribution Requirements "No specific distribution required"
## 4 Sample Size "Suitable for small samples"
## 5 Test Object "Ranks of data"
## 6 Outlier Sensitivity "Low"
## 7 Test Statistic "W statistic"
## 8 Advantages "Wide applicability,\nrobust to outliers"
## 9 Disadvantages "Lower power than parametric tests"
# Show the Feature column next to the TTest column only
print(comparison_df[, c("Feature", "TTest")])
## # A tibble: 9 × 2
## Feature TTest
## <chr> <chr>
## 1 Test Type "Parametric"
## 2 Purpose "Compare means of two independent samples"
## 3 Distribution Requirements "Requires normal distribution"
## 4 Sample Size "Better for large samples\n(n > 30)"
## 5 Test Object "Original data means"
## 6 Outlier Sensitivity "High"
## 7 Test Statistic "t statistic"
## 8 Advantages "High statistical power,\neasy to interpret"
## 9 Disadvantages "Strict normality assumption"
# Show the Feature column next to the FTest column only
print(comparison_df[, c("Feature", "FTest")])
## # A tibble: 9 × 2
## Feature FTest
## <chr> <chr>
## 1 Test Type Parametric
## 2 Purpose Compare variances of two samples
## 3 Distribution Requirements Requires normal distribution
## 4 Sample Size No strict requirements
## 5 Test Object Original data variances
## 6 Outlier Sensitivity High
## 7 Test Statistic F statistic
## 8 Advantages Essential for variance homogeneity testing
## 9 Disadvantages Limited to variance comparison
###########################################################
# Practical Examples
# Make theme_minimal() the default ggplot2 theme, so the plots built later in
# this script share one look without adding the theme to each plot.
theme_set(theme_minimal())
# Fix the RNG state so every run draws exactly the same samples
set.seed(123)

# Simulated example samples ----
# Observations per group (kept large so the density curves are smooth)
n_samples <- 1000

# Normal samples for the t-test and F-test: same SD, means 10 and 12.
# The draw order below determines the values obtained from the seed.
group1_normal <- rnorm(n = n_samples, mean = 10, sd = 2)
group2_normal <- rnorm(n = n_samples, mean = 12, sd = 2)

# Log-normal (right-skewed) samples for the Wilcoxon test
group1_skewed <- rlnorm(n = n_samples, meanlog = 2, sdlog = 0.5)
group2_skewed <- rlnorm(n = n_samples, meanlog = 2.3, sdlog = 0.5)

# Long-format data frames for plotting: all values stacked in one column,
# with a two-level factor marking which group each value came from.
df_normal <- data.frame(
  value = c(group1_normal, group2_normal),
  group = gl(2, n_samples, labels = c("Group 1", "Group 2"))
)
df_skewed <- data.frame(
  value = c(group1_skewed, group2_skewed),
  group = gl(2, n_samples, labels = c("Group 1", "Group 2"))
)
# Create Distribution Plots
# 1. Wilcoxon Test Data Distribution
# Overlaid density curves of the two skewed samples, with a dashed vertical
# line at each group's mean. Fill and line colour are both mapped to `group`
# but use separate manual palettes.
wilcox_dist <- ggplot(df_skewed, aes(x = value, fill = group)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("#FF9999", "#66B2FF")) +
  labs(
    title = "Skewed Data Distribution",
    subtitle = "Wilcoxon Rank-Sum Test Example",
    x = "Value",
    y = "Density"
  ) +
  theme(legend.position = "bottom") +
  # Add mean lines: the layer gets its own per-group summary as data
  geom_vline(
    data = df_skewed %>%
      group_by(group) %>%
      summarize(mean_val = mean(value)),
    aes(xintercept = mean_val, color = group),
    linetype = "dashed"
  ) +
  scale_color_manual(values = c("#FF0000", "#0066CC"))
# The default theme is already set once near the top of the examples, so it
# is not set again here.
wilcox_dist

# 2. T-Test Data Distribution
# Density curves of the two normal samples plus a dashed line at each
# group's mean.
t_test_dist <- ggplot(df_normal, aes(x = value, fill = group)) +
  # Layers (drawn in this order)
  geom_density(alpha = 0.5) +
  geom_vline(
    data = df_normal %>%
      group_by(group) %>%
      summarize(mean_val = mean(value)),
    aes(xintercept = mean_val, color = group),
    linetype = "dashed"
  ) +
  # Palettes: one for the density fill, one for the mean lines
  scale_fill_manual(values = c("#99FF99", "#FF99FF")) +
  scale_color_manual(values = c("#00CC00", "#CC00CC")) +
  # Labels and legend placement
  labs(
    title = "Normal Data Distribution",
    subtitle = "t-Test Example",
    x = "Value",
    y = "Density"
  ) +
  theme(legend.position = "bottom")
t_test_dist

# 3. F-Test Data Distribution with Variance Comparison
# Density curves of the two normal samples, with dotted vertical lines at
# mean - SD and mean + SD for each group.
f_test_dist <- ggplot(df_normal, aes(x = value, fill = group)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("#FFB366", "#66FFB3")) +
  labs(
    title = "Variance Comparison",
    subtitle = "F-Test Example",
    x = "Value",
    y = "Density"
  ) +
  theme(legend.position = "bottom") +
  # Add SD ranges: reframe() yields two rows per group (lower and upper
  # bound), so a single summary and a single layer draw all four lines.
  geom_vline(
    data = df_normal %>%
      group_by(group) %>%
      reframe(sd_bound = mean(value) + c(-1, 1) * sd(value)),
    aes(xintercept = sd_bound, color = group),
    linetype = "dotted"
  ) +
  scale_color_manual(values = c("#CC6600", "#00CC66"))
f_test_dist

library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Show the three distribution plots side by side in a single row
grid.arrange(
  grobs = list(wilcox_dist, t_test_dist, f_test_dist),
  ncol = 3
)

# Statistical Tests
# Each result is stored so it can be printed later. The two-vector (x, y)
# interface is used for all three tests, with every other argument left at
# its default.

# 1. Wilcoxon Rank-Sum Test on the skewed samples
wilcox_result <- wilcox.test(x = group1_skewed, y = group2_skewed)

# 2. Independent t-Test on the normal samples
t_result <- t.test(x = group1_normal, y = group2_normal)

# 3. F-Test comparing the variances of the normal samples
f_result <- var.test(x = group1_normal, y = group2_normal)
# Create summary statistics function
#
# Summarise a numeric sample as a named numeric vector.
#
# Arguments:
#   data   Numeric vector of observations.
#   na.rm  If TRUE, missing values are dropped before anything is computed,
#          and `n` counts only the remaining observations. Defaults to FALSE,
#          in which case `n` is length(data) and any NA propagates into the
#          other statistics.
#
# Returns a named vector with elements, in order:
#   n, mean, median, sd, var, skewness, kurtosis
# where
#   skewness = mean((x - mean(x))^3) / sd(x)^3
#   kurtosis = mean((x - mean(x))^4) / sd(x)^4 - 3   (excess kurtosis)
# Note that the central moments are averaged over n while sd() uses the
# n - 1 denominator.
get_distribution_stats <- function(data, na.rm = FALSE) {
  if (na.rm) {
    data <- data[!is.na(data)]
  }

  # Compute the centre and spread once; both are reused below
  centre <- mean(data)
  spread <- sd(data)
  deviations <- data - centre

  c(
    n = length(data),
    mean = centre,
    median = median(data),
    sd = spread,
    var = var(data),
    skewness = mean(deviations^3) / (spread^3),
    kurtosis = mean(deviations^4) / (spread^4) - 3
  )
}
# Print comprehensive results
# For each of the four samples: print a label, then the named vector returned
# by get_distribution_stats(), rounded to 4 decimal places.
# NOTE(review): the lines starting with `##` appear to be console output
# captured from an earlier run; they are comments and are not regenerated
# when the script runs.
cat("\n=== Distribution Statistics ===\n")
##
## === Distribution Statistics ===
cat("\nNormal Distribution - Group 1:\n")
##
## Normal Distribution - Group 1:
print(round(get_distribution_stats(group1_normal), 4))
## n mean median sd var skewness kurtosis
## 1000.0000 10.0323 10.0184 1.9834 3.9338 0.0652 -0.0801
cat("\nNormal Distribution - Group 2:\n")
##
## Normal Distribution - Group 2:
print(round(get_distribution_stats(group2_normal), 4))
## n mean median sd var skewness kurtosis
## 1000.0000 12.0849 12.1097 2.0193 4.0778 -0.0105 -0.0719
cat("\nSkewed Distribution - Group 1:\n")
##
## Skewed Distribution - Group 1:
print(round(get_distribution_stats(group1_skewed), 4))
## n mean median sd var skewness kurtosis
## 1000.0000 8.2518 7.2046 4.3540 18.9572 1.8115 6.0230
cat("\nSkewed Distribution - Group 2:\n")
##
## Skewed Distribution - Group 2:
print(round(get_distribution_stats(group2_skewed), 4))
## n mean median sd var skewness kurtosis
## 1000.0000 11.2104 9.9334 5.7786 33.3921 1.4369 2.9838
# Print test results
# Print the three stored test results (wilcox_result, t_result, f_result),
# each under its own numbered heading.
# NOTE(review): the lines starting with `##` appear to be console output
# captured from an earlier run; they are comments and are not regenerated
# when the script runs.
cat("\n=== Statistical Test Results ===\n")
##
## === Statistical Test Results ===
cat("\n1. Wilcoxon Rank-Sum Test:\n")
##
## 1. Wilcoxon Rank-Sum Test:
print(wilcox_result)
##
## Wilcoxon rank sum test with continuity correction
##
## data: group1_skewed and group2_skewed
## W = 327631, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
cat("\n2. Independent t-Test:\n")
##
## 2. Independent t-Test:
# t.test() was called without var.equal, so the result is the Welch
# (unequal-variance) test, as the output header below shows.
print(t_result)
##
## Welch Two Sample t-test
##
## data: group1_normal and group2_normal
## t = -22.933, df = 1997.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.228213 -1.877137
## sample estimates:
## mean of x mean of y
## 10.03226 12.08493
cat("\n3. F-Test:\n")
##
## 3. F-Test:
print(f_result)
##
## F test to compare two variances
##
## data: group1_normal and group2_normal
## F = 0.9647, num df = 999, denom df = 999, p-value = 0.5702
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.8521211 1.0921596
## sample estimates:
## ratio of variances
## 0.9647032
# Distribution Analysis
# Test for normality
cat("\n=== Normality Tests (Shapiro-Wilk) ===\n")
##
## === Normality Tests (Shapiro-Wilk) ===
# Run the Shapiro-Wilk test once per sample and read both the W statistic
# and the p-value from the same result object. The list order must match
# the order of the labels in `Group` below.
shapiro_results <- lapply(
  list(group1_normal, group2_normal, group1_skewed, group2_skewed),
  shapiro.test
)
# One row per sample. The statistic is unnamed so the data frame keeps its
# default 1..4 row names.
normality_tests <- data.frame(
  Group = c(
    "Normal Group 1", "Normal Group 2",
    "Skewed Group 1", "Skewed Group 2"
  ),
  W_statistic = vapply(
    shapiro_results,
    function(res) unname(res$statistic),
    numeric(1)
  ),
  p_value = vapply(
    shapiro_results,
    function(res) res$p.value,
    numeric(1)
  )
)
print(normality_tests)
## Group W_statistic p_value
## 1 Normal Group 1 0.9983762 4.764686e-01
## 2 Normal Group 2 0.9986157 6.316108e-01
## 3 Skewed Group 1 0.8733212 1.013865e-27
## 4 Skewed Group 2 0.9004864 5.870318e-25