# ====================================================================
# Delivery Statistics Differentiation Analysis
# R Script for comprehensive statistical analysis of delivery data
# ====================================================================
# Load required libraries
install.packages (c ("tidyverse" , "ggplot2" , "gridExtra" , "car" , "broom" , "knitr" ))
Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
(as 'lib' is unspecified)
also installing the dependency 'lme4'
Warning in install.packages(c("tidyverse", "ggplot2", "gridExtra", "car", :
installation of package 'lme4' had non-zero exit status
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (ggplot2)
library (gridExtra)
Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':
combine
Loading required package: carData
Attaching package: 'car'
The following object is masked from 'package:dplyr':
recode
The following object is masked from 'package:purrr':
some
library (broom)
library (knitr)
# Read and prepare data
cat ("Loading delivery data... \n " )
data <- read.csv ("taco_edit.csv" , stringsAsFactors = FALSE )
# Clean column names and data types
colnames (data) <- c ("Location" , "Delivery_Duration" , "Distance" )
data$ Delivery_Duration <- as.numeric (data$ Delivery_Duration)
data$ Distance <- as.numeric (data$ Distance)
data$ Location <- as.factor (trimws (data$ Location))
# Data overview
cat (" \n === DATA OVERVIEW === \n " )
cat ("Dataset dimensions:" , dim (data), " \n " )
Dataset dimensions: 1000 3
cat ("Locations:" , levels (data$ Location), " \n " )
Locations: East Coast Texas West Coast
cat ("Total observations:" , nrow (data), " \n " )
Location Delivery_Duration Distance
East Coast:212 Min. :10.00 Min. : 0.510
Texas :389 1st Qu.:30.00 1st Qu.: 6.973
West Coast:399 Median :53.00 Median :13.200
Mean :50.93 Mean :13.073
3rd Qu.:71.00 3rd Qu.:19.242
Max. :90.00 Max. :24.980
# ====================================================================
# DESCRIPTIVE STATISTICS
# ====================================================================
cat (" \n === DESCRIPTIVE STATISTICS === \n " )
=== DESCRIPTIVE STATISTICS ===
# Summary statistics by location
desc_stats <- data %>%
group_by (Location) %>%
summarise (
Count = n (),
Mean_Delivery = round (mean (Delivery_Duration, na.rm = TRUE ), 2 ),
Median_Delivery = round (median (Delivery_Duration, na.rm = TRUE ), 2 ),
SD_Delivery = round (sd (Delivery_Duration, na.rm = TRUE ), 2 ),
Min_Delivery = min (Delivery_Duration, na.rm = TRUE ),
Max_Delivery = max (Delivery_Duration, na.rm = TRUE ),
Q1_Delivery = round (quantile (Delivery_Duration, 0.25 , na.rm = TRUE ), 2 ),
Q3_Delivery = round (quantile (Delivery_Duration, 0.75 , na.rm = TRUE ), 2 ),
IQR_Delivery = round (IQR (Delivery_Duration, na.rm = TRUE ), 2 ),
CV_Delivery = round (sd (Delivery_Duration, na.rm = TRUE ) / mean (Delivery_Duration, na.rm = TRUE ) * 100 , 2 ),
.groups = 'drop'
)
print (kable (desc_stats, caption = "Delivery Duration Statistics by Location" ))
Table: Delivery Duration Statistics by Location
|Location | Count| Mean_Delivery| Median_Delivery| SD_Delivery| Min_Delivery| Max_Delivery| Q1_Delivery| Q3_Delivery| IQR_Delivery| CV_Delivery|
|:----------|-----:|-------------:|---------------:|-----------:|------------:|------------:|-----------:|-----------:|------------:|-----------:|
|East Coast | 212| 50.33| 52.5| 24.05| 10| 90| 27.75| 72.25| 44.5| 47.78|
|Texas | 389| 50.96| 51.0| 22.59| 10| 90| 32.00| 70.00| 38.0| 44.33|
|West Coast | 399| 51.22| 53.0| 23.45| 10| 90| 30.50| 71.00| 40.5| 45.78|
# Distance statistics
dist_stats <- data %>%
group_by (Location) %>%
summarise (
Mean_Distance = round (mean (Distance, na.rm = TRUE ), 2 ),
Median_Distance = round (median (Distance, na.rm = TRUE ), 2 ),
SD_Distance = round (sd (Distance, na.rm = TRUE ), 2 ),
Min_Distance = round (min (Distance, na.rm = TRUE ), 2 ),
Max_Distance = round (max (Distance, na.rm = TRUE ), 2 ),
.groups = 'drop'
)
print (kable (dist_stats, caption = "Distance Statistics by Location" ))
Table: Distance Statistics by Location
|Location | Mean_Distance| Median_Distance| SD_Distance| Min_Distance| Max_Distance|
|:----------|-------------:|---------------:|-----------:|------------:|------------:|
|East Coast | 12.82| 12.45| 7.38| 0.60| 24.80|
|Texas | 13.00| 12.81| 7.16| 0.51| 24.81|
|West Coast | 13.28| 13.76| 7.01| 0.55| 24.98|
# ====================================================================
# STATISTICAL TESTS FOR DIFFERENCES
# ====================================================================
cat (" \n === STATISTICAL TESTS === \n " )
=== STATISTICAL TESTS ===
# 1. Normality tests
cat (" \n 1. NORMALITY TESTS (Shapiro-Wilk) \n " )
1. NORMALITY TESTS (Shapiro-Wilk)
normality_results <- data %>%
group_by (Location) %>%
summarise (
Shapiro_W = round (shapiro.test (Delivery_Duration)$ statistic, 4 ),
Shapiro_p = round (shapiro.test (Delivery_Duration)$ p.value, 6 ),
Normal = ifelse (shapiro.test (Delivery_Duration)$ p.value > 0.05 , "Yes" , "No" ),
.groups = 'drop'
)
print (kable (normality_results))
|Location | Shapiro_W| Shapiro_p|Normal |
|:----------|---------:|---------:|:------|
|East Coast | 0.9392| 0|No |
|Texas | 0.9567| 0|No |
|West Coast | 0.9487| 0|No |
# 2. Homogeneity of variance test
cat (" \n 2. HOMOGENEITY OF VARIANCE (Levene's Test) \n " )
2. HOMOGENEITY OF VARIANCE (Levene's Test)
levene_test <- leveneTest (Delivery_Duration ~ Location, data = data)
print (levene_test)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 2 1.2395 0.29
997
# 3. ANOVA for delivery duration differences
cat (" \n 3. ONE-WAY ANOVA - Delivery Duration \n " )
3. ONE-WAY ANOVA - Delivery Duration
anova_model <- aov (Delivery_Duration ~ Location, data = data)
anova_summary <- summary (anova_model)
print (anova_summary)
Df Sum Sq Mean Sq F value Pr(>F)
Location 2 109 54.3 0.1 0.905
Residuals 997 538871 540.5
# Effect size (eta squared)
eta_squared <- anova_summary[[1 ]]["Location" , "Sum Sq" ] /
(anova_summary[[1 ]]["Location" , "Sum Sq" ] + anova_summary[[1 ]]["Residuals" , "Sum Sq" ])
cat ("Eta-squared (effect size):" , round (eta_squared, 4 ), " \n " )
Eta-squared (effect size): 2e-04
# 4. Post-hoc tests (Tukey HSD)
if (anova_summary[[1 ]]["Location" , "Pr(>F)" ] < 0.05 ) {
cat (" \n 4. POST-HOC ANALYSIS (Tukey HSD) \n " )
tukey_results <- TukeyHSD (anova_model)
print (tukey_results)
}
# 5. Kruskal-Wallis test (non-parametric alternative)
cat (" \n 5. KRUSKAL-WALLIS TEST (Non-parametric) \n " )
5. KRUSKAL-WALLIS TEST (Non-parametric)
kw_test <- kruskal.test (Delivery_Duration ~ Location, data = data)
print (kw_test)
Kruskal-Wallis rank sum test
data: Delivery_Duration by Location
Kruskal-Wallis chi-squared = 0.19613, df = 2, p-value = 0.9066
# 6. Pairwise Mann-Whitney U tests
cat (" \n 6. PAIRWISE MANN-WHITNEY U TESTS \n " )
6. PAIRWISE MANN-WHITNEY U TESTS
pairwise_results <- pairwise.wilcox.test (data$ Delivery_Duration, data$ Location,
p.adjust.method = "bonferroni" )
print (pairwise_results)
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: data$Delivery_Duration and data$Location
East Coast Texas
Texas 1 -
West Coast 1 1
P value adjustment method: bonferroni
# ====================================================================
# CORRELATION ANALYSIS
# ====================================================================
cat (" \n === CORRELATION ANALYSIS === \n " )
=== CORRELATION ANALYSIS ===
# Overall correlation
overall_cor <- cor.test (data$ Delivery_Duration, data$ Distance)
cat ("Overall correlation (Duration vs Distance): \n " )
Overall correlation (Duration vs Distance):
cat ("Pearson r =" , round (overall_cor$ estimate, 4 ), " \n " )
cat ("p-value =" , round (overall_cor$ p.value, 6 ), " \n " )
# Correlation by location
cor_by_location <- data %>%
group_by (Location) %>%
summarise (
Correlation = round (cor (Delivery_Duration, Distance, use = "complete.obs" ), 4 ),
Cor_p_value = round (cor.test (Delivery_Duration, Distance)$ p.value, 6 ),
.groups = 'drop'
)
print (kable (cor_by_location, caption = "Correlation by Location" ))
Table: Correlation by Location
|Location | Correlation| Cor_p_value|
|:----------|-----------:|-----------:|
|East Coast | -0.0699| 0.311189|
|Texas | -0.0929| 0.067053|
|West Coast | -0.0124| 0.804409|
# ====================================================================
# VISUALIZATIONS
# ====================================================================
cat (" \n === CREATING VISUALIZATIONS === \n " )
=== CREATING VISUALIZATIONS ===
# 1. Box plots for delivery duration
p1 <- ggplot (data, aes (x = Location, y = Delivery_Duration, fill = Location)) +
geom_boxplot (alpha = 0.7 ) +
geom_jitter (width = 0.2 , alpha = 0.3 , size = 0.8 ) +
labs (title = "Delivery Duration Distribution by Location" ,
x = "Location" , y = "Delivery Duration (minutes)" ) +
theme_minimal () +
theme (legend.position = "none" ) +
scale_fill_brewer (palette = "Set2" )
# 2. Histogram with density curves
p2 <- ggplot (data, aes (x = Delivery_Duration, fill = Location)) +
geom_histogram (aes (y = ..density..), alpha = 0.7 , bins = 30 , position = "identity" ) +
geom_density (alpha = 0.3 ) +
facet_wrap (~ Location, nrow = 3 ) +
labs (title = "Delivery Duration Distribution by Location" ,
x = "Delivery Duration (minutes)" , y = "Density" ) +
theme_minimal () +
scale_fill_brewer (palette = "Set2" )
# 3. Scatter plot: Duration vs Distance
p3 <- ggplot (data, aes (x = Distance, y = Delivery_Duration, color = Location)) +
geom_point (alpha = 0.6 ) +
geom_smooth (method = "lm" , se = TRUE ) +
labs (title = "Delivery Duration vs Distance by Location" ,
x = "Distance" , y = "Delivery Duration (minutes)" ) +
theme_minimal () +
scale_color_brewer (palette = "Set2" )
# 4. Mean comparison plot
mean_data <- desc_stats %>%
select (Location, Mean_Delivery, SD_Delivery)
p4 <- ggplot (mean_data, aes (x = reorder (Location, Mean_Delivery), y = Mean_Delivery, fill = Location)) +
geom_col (alpha = 0.8 ) +
geom_errorbar (aes (ymin = Mean_Delivery - SD_Delivery, ymax = Mean_Delivery + SD_Delivery),
width = 0.2 ) +
geom_text (aes (label = paste0 (Mean_Delivery, " min" )), vjust = - 0.5 ) +
labs (title = "Average Delivery Duration by Location (with SD)" ,
x = "Location" , y = "Average Delivery Duration (minutes)" ) +
theme_minimal () +
theme (legend.position = "none" ) +
scale_fill_brewer (palette = "Set2" )
# Save plots
ggsave ("delivery_boxplots.png" , p1, width = 10 , height = 6 , dpi = 300 )
ggsave ("delivery_histograms.png" , p2, width = 12 , height = 8 , dpi = 300 )
Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.
ggsave ("duration_vs_distance.png" , p3, width = 10 , height = 6 , dpi = 300 )
`geom_smooth()` using formula = 'y ~ x'
ggsave ("mean_comparison.png" , p4, width = 8 , height = 6 , dpi = 300 )
# Display plots
print (p1)
`geom_smooth()` using formula = 'y ~ x'
# ====================================================================
# ADDITIONAL ANALYSES
# ====================================================================
cat (" \n === ADDITIONAL ANALYSES === \n " )
=== ADDITIONAL ANALYSES ===
# 1. Outlier detection using IQR method
outliers_summary <- data %>%
group_by (Location) %>%
summarise (
Q1 = quantile (Delivery_Duration, 0.25 ),
Q3 = quantile (Delivery_Duration, 0.75 ),
IQR = Q3 - Q1,
Lower_Bound = Q1 - 1.5 * IQR,
Upper_Bound = Q3 + 1.5 * IQR,
Outliers = sum (Delivery_Duration < Lower_Bound | Delivery_Duration > Upper_Bound),
Outlier_Percentage = round (Outliers / n () * 100 , 2 ),
.groups = 'drop'
)
cat (" \n OUTLIER ANALYSIS: \n " )
print (kable (outliers_summary))
|Location | Q1| Q3| IQR| Lower_Bound| Upper_Bound| Outliers| Outlier_Percentage|
|:----------|-----:|-----:|----:|-----------:|-----------:|--------:|------------------:|
|East Coast | 27.75| 72.25| 44.5| -39.00| 139.00| 0| 0|
|Texas | 32.00| 70.00| 38.0| -25.00| 127.00| 0| 0|
|West Coast | 30.50| 71.00| 40.5| -30.25| 131.75| 0| 0|
# 2. Performance ranking
performance_ranking <- desc_stats %>%
arrange (Mean_Delivery) %>%
mutate (
Rank = row_number (),
Performance = case_when (
Rank == 1 ~ "Best" ,
Rank == 2 ~ "Average" ,
Rank == 3 ~ "Needs Improvement"
)
) %>%
select (Rank, Location, Mean_Delivery, Performance)
cat (" \n PERFORMANCE RANKING: \n " )
print (kable (performance_ranking))
| Rank|Location | Mean_Delivery|Performance |
|----:|:----------|-------------:|:-----------------|
| 1|East Coast | 50.33|Best |
| 2|Texas | 50.96|Average |
| 3|West Coast | 51.22|Needs Improvement |
# 3. Confidence intervals for means
ci_results <- data %>%
group_by (Location) %>%
summarise (
Mean = mean (Delivery_Duration),
SE = sd (Delivery_Duration) / sqrt (n ()),
CI_Lower = Mean - 1.96 * SE,
CI_Upper = Mean + 1.96 * SE,
.groups = 'drop'
) %>%
mutate (across (where (is.numeric), ~ round (.x, 2 )))
cat (" \n 95% CONFIDENCE INTERVALS FOR MEANS: \n " )
95% CONFIDENCE INTERVALS FOR MEANS:
|Location | Mean| SE| CI_Lower| CI_Upper|
|:----------|-----:|----:|--------:|--------:|
|East Coast | 50.33| 1.65| 47.10| 53.57|
|Texas | 50.96| 1.15| 48.71| 53.20|
|West Coast | 51.22| 1.17| 48.92| 53.52|
# ====================================================================
# SUMMARY AND RECOMMENDATIONS
# ====================================================================
cat (" \n === EXECUTIVE SUMMARY === \n " )
=== EXECUTIVE SUMMARY ===
cat ("1. FASTEST DELIVERY: " , performance_ranking$ Location[1 ],
" (" , performance_ranking$ Mean_Delivery[1 ], " minutes average) \n " , sep= "" )
1. FASTEST DELIVERY: 1 (50.33 minutes average)
cat ("2. STATISTICAL SIGNIFICANCE: " )
2. STATISTICAL SIGNIFICANCE:
if (anova_summary[[1 ]]["Location" , "Pr(>F)" ] < 0.05 ) {
cat ("Significant differences exist between locations (p < 0.05) \n " )
} else {
cat ("No significant differences between locations (p >= 0.05) \n " )
}
No significant differences between locations (p >= 0.05)
cat ("3. EFFECT SIZE: " , ifelse (eta_squared > 0.14 , "Large" ,
ifelse (eta_squared > 0.06 , "Medium" , "Small" )),
" effect (η² = " , round (eta_squared, 4 ), ") \n " , sep= "" )
3. EFFECT SIZE: Small effect (η² = 2e-04)
cat ("4. RECOMMENDATION: Focus on replicating " , performance_ranking$ Location[1 ],
"'s delivery processes \n " )
4. RECOMMENDATION: Focus on replicating 1 's delivery processes
cat (" \n Script completed successfully! \n " )
Script completed successfully!
cat ("Generated files: delivery_boxplots.png, delivery_histograms.png, duration_vs_distance.png, mean_comparison.png \n " )
Generated files: delivery_boxplots.png, delivery_histograms.png, duration_vs_distance.png, mean_comparison.png