tacotest

# ====================================================================
# Delivery Statistics Differentiation Analysis
# R Script for comprehensive statistical analysis of delivery data
# ====================================================================

# Load required libraries
install.packages(c("tidyverse", "ggplot2", "gridExtra", "car", "broom", "knitr"))

Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
(as 'lib' is unspecified)

also installing the dependency 'lme4'

Warning in install.packages(c("tidyverse", "ggplot2", "gridExtra", "car", :
installation of package 'lme4' had non-zero exit status

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(gridExtra)


Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine

library(car)

Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

library(broom)
library(knitr)

# Read and prepare data
cat("Loading delivery data...\n")

Loading delivery data...

data <- read.csv("taco_edit.csv", stringsAsFactors = FALSE)

# Clean column names and data types
colnames(data) <- c("Location", "Delivery_Duration", "Distance")
data$Delivery_Duration <- as.numeric(data$Delivery_Duration)
data$Distance <- as.numeric(data$Distance)
data$Location <- as.factor(trimws(data$Location))

# Data overview
cat("\n=== DATA OVERVIEW ===\n")


=== DATA OVERVIEW ===

cat("Dataset dimensions:", dim(data), "\n")

Dataset dimensions: 1000 3

cat("Locations:", levels(data$Location), "\n")

Locations: East Coast Texas West Coast

cat("Total observations:", nrow(data), "\n")

Total observations: 1000

print(summary(data))

       Location   Delivery_Duration    Distance     
 East Coast:212   Min.   :10.00     Min.   : 0.510  
 Texas     :389   1st Qu.:30.00     1st Qu.: 6.973  
 West Coast:399   Median :53.00     Median :13.200  
                  Mean   :50.93     Mean   :13.073  
                  3rd Qu.:71.00     3rd Qu.:19.242  
                  Max.   :90.00     Max.   :24.980

# ====================================================================
# DESCRIPTIVE STATISTICS
# ====================================================================

cat("\n=== DESCRIPTIVE STATISTICS ===\n")


=== DESCRIPTIVE STATISTICS ===

# Summary statistics by location
desc_stats <- data %>%
  group_by(Location) %>%
  summarise(
    Count = n(),
    Mean_Delivery = round(mean(Delivery_Duration, na.rm = TRUE), 2),
    Median_Delivery = round(median(Delivery_Duration, na.rm = TRUE), 2),
    SD_Delivery = round(sd(Delivery_Duration, na.rm = TRUE), 2),
    Min_Delivery = min(Delivery_Duration, na.rm = TRUE),
    Max_Delivery = max(Delivery_Duration, na.rm = TRUE),
    Q1_Delivery = round(quantile(Delivery_Duration, 0.25, na.rm = TRUE), 2),
    Q3_Delivery = round(quantile(Delivery_Duration, 0.75, na.rm = TRUE), 2),
    IQR_Delivery = round(IQR(Delivery_Duration, na.rm = TRUE), 2),
    CV_Delivery = round(sd(Delivery_Duration, na.rm = TRUE) / mean(Delivery_Duration, na.rm = TRUE) * 100, 2),
    .groups = 'drop'
  )

print(kable(desc_stats, caption = "Delivery Duration Statistics by Location"))



Table: Delivery Duration Statistics by Location

|Location   | Count| Mean_Delivery| Median_Delivery| SD_Delivery| Min_Delivery| Max_Delivery| Q1_Delivery| Q3_Delivery| IQR_Delivery| CV_Delivery|
|:----------|-----:|-------------:|---------------:|-----------:|------------:|------------:|-----------:|-----------:|------------:|-----------:|
|East Coast |   212|         50.33|            52.5|       24.05|           10|           90|       27.75|       72.25|         44.5|       47.78|
|Texas      |   389|         50.96|            51.0|       22.59|           10|           90|       32.00|       70.00|         38.0|       44.33|
|West Coast |   399|         51.22|            53.0|       23.45|           10|           90|       30.50|       71.00|         40.5|       45.78|

# Distance statistics
dist_stats <- data %>%
  group_by(Location) %>%
  summarise(
    Mean_Distance = round(mean(Distance, na.rm = TRUE), 2),
    Median_Distance = round(median(Distance, na.rm = TRUE), 2),
    SD_Distance = round(sd(Distance, na.rm = TRUE), 2),
    Min_Distance = round(min(Distance, na.rm = TRUE), 2),
    Max_Distance = round(max(Distance, na.rm = TRUE), 2),
    .groups = 'drop'
  )

print(kable(dist_stats, caption = "Distance Statistics by Location"))



Table: Distance Statistics by Location

|Location   | Mean_Distance| Median_Distance| SD_Distance| Min_Distance| Max_Distance|
|:----------|-------------:|---------------:|-----------:|------------:|------------:|
|East Coast |         12.82|           12.45|        7.38|         0.60|        24.80|
|Texas      |         13.00|           12.81|        7.16|         0.51|        24.81|
|West Coast |         13.28|           13.76|        7.01|         0.55|        24.98|

# ====================================================================
# STATISTICAL TESTS FOR DIFFERENCES
# ====================================================================

cat("\n=== STATISTICAL TESTS ===\n")


=== STATISTICAL TESTS ===

# 1. Normality tests
cat("\n1. NORMALITY TESTS (Shapiro-Wilk)\n")


1. NORMALITY TESTS (Shapiro-Wilk)

normality_results <- data %>%
  group_by(Location) %>%
  summarise(
    Shapiro_W = round(shapiro.test(Delivery_Duration)$statistic, 4),
    Shapiro_p = round(shapiro.test(Delivery_Duration)$p.value, 6),
    Normal = ifelse(shapiro.test(Delivery_Duration)$p.value > 0.05, "Yes", "No"),
    .groups = 'drop'
  )
print(kable(normality_results))



|Location   | Shapiro_W| Shapiro_p|Normal |
|:----------|---------:|---------:|:------|
|East Coast |    0.9392|         0|No     |
|Texas      |    0.9567|         0|No     |
|West Coast |    0.9487|         0|No     |

# 2. Homogeneity of variance test
cat("\n2. HOMOGENEITY OF VARIANCE (Levene's Test)\n")


2. HOMOGENEITY OF VARIANCE (Levene's Test)

levene_test <- leveneTest(Delivery_Duration ~ Location, data = data)
print(levene_test)

Levene's Test for Homogeneity of Variance (center = median)
       Df F value Pr(>F)
group   2  1.2395   0.29
      997

# 3. ANOVA for delivery duration differences
cat("\n3. ONE-WAY ANOVA - Delivery Duration\n")


3. ONE-WAY ANOVA - Delivery Duration

anova_model <- aov(Delivery_Duration ~ Location, data = data)
anova_summary <- summary(anova_model)
print(anova_summary)

             Df Sum Sq Mean Sq F value Pr(>F)
Location      2    109    54.3     0.1  0.905
Residuals   997 538871   540.5

# Effect size (eta squared)
eta_squared <- anova_summary[[1]]["Location", "Sum Sq"] / 
  (anova_summary[[1]]["Location", "Sum Sq"] + anova_summary[[1]]["Residuals", "Sum Sq"])
cat("Eta-squared (effect size):", round(eta_squared, 4), "\n")

Eta-squared (effect size): 2e-04

# 4. Post-hoc tests (Tukey HSD)
if (anova_summary[[1]]["Location", "Pr(>F)"] < 0.05) {
  cat("\n4. POST-HOC ANALYSIS (Tukey HSD)\n")
  tukey_results <- TukeyHSD(anova_model)
  print(tukey_results)
}

# 5. Kruskal-Wallis test (non-parametric alternative)
cat("\n5. KRUSKAL-WALLIS TEST (Non-parametric)\n")


5. KRUSKAL-WALLIS TEST (Non-parametric)

kw_test <- kruskal.test(Delivery_Duration ~ Location, data = data)
print(kw_test)


    Kruskal-Wallis rank sum test

data:  Delivery_Duration by Location
Kruskal-Wallis chi-squared = 0.19613, df = 2, p-value = 0.9066

# 6. Pairwise Mann-Whitney U tests
cat("\n6. PAIRWISE MANN-WHITNEY U TESTS\n")


6. PAIRWISE MANN-WHITNEY U TESTS

pairwise_results <- pairwise.wilcox.test(data$Delivery_Duration, data$Location, 
                                         p.adjust.method = "bonferroni")
print(pairwise_results)


    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  data$Delivery_Duration and data$Location 

           East Coast Texas
Texas      1          -    
West Coast 1          1    

P value adjustment method: bonferroni

# ====================================================================
# CORRELATION ANALYSIS
# ====================================================================

cat("\n=== CORRELATION ANALYSIS ===\n")


=== CORRELATION ANALYSIS ===

# Overall correlation
overall_cor <- cor.test(data$Delivery_Duration, data$Distance)
cat("Overall correlation (Duration vs Distance):\n")

Overall correlation (Duration vs Distance):

cat("Pearson r =", round(overall_cor$estimate, 4), "\n")

Pearson r = -0.0556

cat("p-value =", round(overall_cor$p.value, 6), "\n")

p-value = 0.079077

# Correlation by location
cor_by_location <- data %>%
  group_by(Location) %>%
  summarise(
    Correlation = round(cor(Delivery_Duration, Distance, use = "complete.obs"), 4),
    Cor_p_value = round(cor.test(Delivery_Duration, Distance)$p.value, 6),
    .groups = 'drop'
  )
print(kable(cor_by_location, caption = "Correlation by Location"))



Table: Correlation by Location

|Location   | Correlation| Cor_p_value|
|:----------|-----------:|-----------:|
|East Coast |     -0.0699|    0.311189|
|Texas      |     -0.0929|    0.067053|
|West Coast |     -0.0124|    0.804409|

# ====================================================================
# VISUALIZATIONS
# ====================================================================

cat("\n=== CREATING VISUALIZATIONS ===\n")


=== CREATING VISUALIZATIONS ===

# 1. Box plots for delivery duration
p1 <- ggplot(data, aes(x = Location, y = Delivery_Duration, fill = Location)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  labs(title = "Delivery Duration Distribution by Location",
       x = "Location", y = "Delivery Duration (minutes)") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set2")

# 2. Histogram with density curves
p2 <- ggplot(data, aes(x = Delivery_Duration, fill = Location)) +
  geom_histogram(aes(y = ..density..), alpha = 0.7, bins = 30, position = "identity") +
  geom_density(alpha = 0.3) +
  facet_wrap(~Location, nrow = 3) +
  labs(title = "Delivery Duration Distribution by Location",
       x = "Delivery Duration (minutes)", y = "Density") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2")

# 3. Scatter plot: Duration vs Distance
p3 <- ggplot(data, aes(x = Distance, y = Delivery_Duration, color = Location)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "Delivery Duration vs Distance by Location",
       x = "Distance", y = "Delivery Duration (minutes)") +
  theme_minimal() +
  scale_color_brewer(palette = "Set2")

# 4. Mean comparison plot
mean_data <- desc_stats %>%
  select(Location, Mean_Delivery, SD_Delivery)

p4 <- ggplot(mean_data, aes(x = reorder(Location, Mean_Delivery), y = Mean_Delivery, fill = Location)) +
  geom_col(alpha = 0.8) +
  geom_errorbar(aes(ymin = Mean_Delivery - SD_Delivery, ymax = Mean_Delivery + SD_Delivery),
                width = 0.2) +
  geom_text(aes(label = paste0(Mean_Delivery, " min")), vjust = -0.5) +
  labs(title = "Average Delivery Duration by Location (with SD)",
       x = "Location", y = "Average Delivery Duration (minutes)") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set2")

# Save plots
ggsave("delivery_boxplots.png", p1, width = 10, height = 6, dpi = 300)
ggsave("delivery_histograms.png", p2, width = 12, height = 8, dpi = 300)

Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.

ggsave("duration_vs_distance.png", p3, width = 10, height = 6, dpi = 300)

`geom_smooth()` using formula = 'y ~ x'

ggsave("mean_comparison.png", p4, width = 8, height = 6, dpi = 300)

# Display plots
print(p1)

print(p2)

print(p3)

`geom_smooth()` using formula = 'y ~ x'

print(p4)

# ====================================================================
# ADDITIONAL ANALYSES
# ====================================================================

cat("\n=== ADDITIONAL ANALYSES ===\n")


=== ADDITIONAL ANALYSES ===

# 1. Outlier detection using IQR method
outliers_summary <- data %>%
  group_by(Location) %>%
  summarise(
    Q1 = quantile(Delivery_Duration, 0.25),
    Q3 = quantile(Delivery_Duration, 0.75),
    IQR = Q3 - Q1,
    Lower_Bound = Q1 - 1.5 * IQR,
    Upper_Bound = Q3 + 1.5 * IQR,
    Outliers = sum(Delivery_Duration < Lower_Bound | Delivery_Duration > Upper_Bound),
    Outlier_Percentage = round(Outliers / n() * 100, 2),
    .groups = 'drop'
  )

cat("\nOUTLIER ANALYSIS:\n")


OUTLIER ANALYSIS:

print(kable(outliers_summary))



|Location   |    Q1|    Q3|  IQR| Lower_Bound| Upper_Bound| Outliers| Outlier_Percentage|
|:----------|-----:|-----:|----:|-----------:|-----------:|--------:|------------------:|
|East Coast | 27.75| 72.25| 44.5|      -39.00|      139.00|        0|                  0|
|Texas      | 32.00| 70.00| 38.0|      -25.00|      127.00|        0|                  0|
|West Coast | 30.50| 71.00| 40.5|      -30.25|      131.75|        0|                  0|

# 2. Performance ranking
performance_ranking <- desc_stats %>%
  arrange(Mean_Delivery) %>%
  mutate(
    Rank = row_number(),
    Performance = case_when(
      Rank == 1 ~ "Best",
      Rank == 2 ~ "Average", 
      Rank == 3 ~ "Needs Improvement"
    )
  ) %>%
  select(Rank, Location, Mean_Delivery, Performance)

cat("\nPERFORMANCE RANKING:\n")


PERFORMANCE RANKING:

print(kable(performance_ranking))



| Rank|Location   | Mean_Delivery|Performance       |
|----:|:----------|-------------:|:-----------------|
|    1|East Coast |         50.33|Best              |
|    2|Texas      |         50.96|Average           |
|    3|West Coast |         51.22|Needs Improvement |

# 3. Confidence intervals for means
ci_results <- data %>%
  group_by(Location) %>%
  summarise(
    Mean = mean(Delivery_Duration),
    SE = sd(Delivery_Duration) / sqrt(n()),
    CI_Lower = Mean - 1.96 * SE,
    CI_Upper = Mean + 1.96 * SE,
    .groups = 'drop'
  ) %>%
  mutate(across(where(is.numeric), ~round(.x, 2)))

cat("\n95% CONFIDENCE INTERVALS FOR MEANS:\n")


95% CONFIDENCE INTERVALS FOR MEANS:

print(kable(ci_results))



|Location   |  Mean|   SE| CI_Lower| CI_Upper|
|:----------|-----:|----:|--------:|--------:|
|East Coast | 50.33| 1.65|    47.10|    53.57|
|Texas      | 50.96| 1.15|    48.71|    53.20|
|West Coast | 51.22| 1.17|    48.92|    53.52|

# ====================================================================
# SUMMARY AND RECOMMENDATIONS
# ====================================================================

cat("\n=== EXECUTIVE SUMMARY ===\n")


=== EXECUTIVE SUMMARY ===

cat("1. FASTEST DELIVERY: ", performance_ranking$Location[1], 
    " (", performance_ranking$Mean_Delivery[1], " minutes average)\n", sep="")

1. FASTEST DELIVERY: 1 (50.33 minutes average)

cat("2. STATISTICAL SIGNIFICANCE: ")

2. STATISTICAL SIGNIFICANCE:

if (anova_summary[[1]]["Location", "Pr(>F)"] < 0.05) {
  cat("Significant differences exist between locations (p < 0.05)\n")
} else {
  cat("No significant differences between locations (p >= 0.05)\n")
}

No significant differences between locations (p >= 0.05)

cat("3. EFFECT SIZE: ", ifelse(eta_squared > 0.14, "Large", 
                               ifelse(eta_squared > 0.06, "Medium", "Small")), 
    " effect (η² = ", round(eta_squared, 4), ")\n", sep="")

3. EFFECT SIZE: Small effect (η² = 2e-04)

cat("4. RECOMMENDATION: Focus on replicating ", performance_ranking$Location[1], 
    "'s delivery processes\n")

4. RECOMMENDATION: Focus on replicating  1 's delivery processes

cat("\nScript completed successfully!\n")


Script completed successfully!

cat("Generated files: delivery_boxplots.png, delivery_histograms.png, duration_vs_distance.png, mean_comparison.png\n")

Generated files: delivery_boxplots.png, delivery_histograms.png, duration_vs_distance.png, mean_comparison.png