libraries and data upload

# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Carregando pacotes exigidos: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(multcomp)
## Warning: package 'multcomp' was built under R version 4.3.2
## Carregando pacotes exigidos: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.2
## Carregando pacotes exigidos: survival
## Carregando pacotes exigidos: TH.data
## Warning: package 'TH.data' was built under R version 4.3.2
## Carregando pacotes exigidos: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Attaching package: 'TH.data'
## 
## The following object is masked from 'package:MASS':
## 
##     geyser
# Load your data
library(readr)
Taxi <- read_csv("c:/users/dell/downloads/Taxi.csv")
## Rows: 348 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): origin, income, rushhour, dgender, overcharge
## dbl (3): triple, dage, overtreatment
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

robust permutation test based on medians

# Extract overtreatment indices for 'high' and 'low' income levels
overtreatment_high <- as.numeric(Taxi$overtreatment[Taxi$income == "high"])
overtreatment_low <- as.numeric(Taxi$overtreatment[Taxi$income == "low"])

# Function for robust permutation test based on medians
robust_permutation_test_median <- function(x, y, n_permutations = 1000) {
  observed_difference <- median(x) - median(y)
  combined_data <- c(x, y)
  
  # Initialize an empty vector to store permuted differences
  permuted_differences <- numeric(n_permutations)
  
  # Perform permutations
  set.seed(123)  # Set seed for reproducibility
  for (i in 1:n_permutations) {
    permuted_data <- sample(combined_data)
    permuted_x <- permuted_data[1:length(x)]
    permuted_y <- permuted_data[(length(x) + 1):length(combined_data)]
    permuted_differences[i] <- median(permuted_x) - median(permuted_y)
  }
  
  # Calculate p-value
  p_value <- mean(abs(permuted_differences) >= abs(observed_difference))
  
  # Return results
  result <- list(
    observed_difference = observed_difference,
    permuted_differences = permuted_differences,
    p_value = p_value
  )
  return(result)
}

robust permutation test based on the upper quartile

robust_permutation_test_upper_quartile <- function(x, y, n_permutations = 1000) {
  observed_difference <- quantile(x, 0.75, na.rm = TRUE) - quantile(y, 0.75, na.rm = TRUE)
  combined_data <- c(x, y)
  
  # Initialize an empty vector to store permuted differences
  permuted_differences <- numeric(n_permutations)
  
  # Perform permutations
  set.seed(123)  # Set seed for reproducibility
  for (i in 1:n_permutations) {
    permuted_data <- sample(combined_data)
    permuted_x <- permuted_data[1:length(x)]
    permuted_y <- permuted_data[(length(x) + 1):length(combined_data)]
    permuted_differences[i] <- quantile(permuted_x, 0.75, na.rm = TRUE) - quantile(permuted_y, 0.75, na.rm = TRUE)
  }
  
  # Calculate p-value
  p_value <- mean(abs(permuted_differences) >= abs(observed_difference))
  
  # Return results
  result <- list(
    observed_difference = observed_difference,
    permuted_differences = permuted_differences,
    p_value = p_value
  )
  return(result)
}

robust permutation tests

result_median <- robust_permutation_test_median(overtreatment_high, overtreatment_low)
result_upper_quartile <- robust_permutation_test_upper_quartile(overtreatment_high, overtreatment_low)

# Interpret the results
cat("Robust Permutation Test Based on Medians:\n")
## Robust Permutation Test Based on Medians:
cat("Observed Difference:", result_median$observed_difference, "\n")
## Observed Difference: 0.0099897
cat("P-value:", result_median$p_value, "\n\n")
## P-value: 0.081
cat("Robust Permutation Test Based on Upper Quartile:\n")
## Robust Permutation Test Based on Upper Quartile:
cat("Observed Difference:", result_upper_quartile$observed_difference, "\n")
## Observed Difference: 0.04290212
cat("P-value:", result_upper_quartile$p_value, "\n")
## P-value: 0.033

results of the robust permutation tests based on medians and upper quartiles:

Robust Permutation Test Based on Medians:

  • Observed Difference: 0.0099897
  • P-value: 0.081

Robust Permutation Test Based on Upper Quartile:

  • Observed Difference: 0.04290212
  • P-value: 0.033

Interpretation:

Robust Permutation Test Based on Medians:

  • The median disparity between the ‘high’ and ‘low’ income groups is observed to be 0.0099897.
  • The corresponding p-value for this observed difference is 0.081.
  • Having a p-value of 0.081 suggests that we lack substantial evidence to dismiss the null hypothesis, which posits no discernible difference in overtreatment medians between the ‘high’ and ‘low’ income groups. It is crucial to emphasize that the determination of whether to reject or retain the null hypothesis hinges on the selected significance level, often established at 0.05.

Robust Permutation Test Based on Upper Quartile:

  • There is a noticeable contrast of 0.04290212 in the upper quartiles between the ‘high’ and ‘low’ income groups.
  • The p-value linked to this contrast is 0.033. With a p-value of 0.033, there is an indication that a substantial disparity in the upper quartiles of overtreatment exists between the ‘high’ and ‘low’ income groups. Once again, the significance level holds a critical role in understanding these findings.

These findings offer insights into potential variations in the positioning of the overtreatment index across diverse income levels. The upper quartile test implies a more pronounced difference than the median test.