Matching methods

library(MatchIt)
## Warning: package 'MatchIt' was built under R version 4.4.3
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.4.3
data <- read.csv("D:/OneDrive - IIT Kanpur/PhD_new/Papers/gingerpaper/Nepal_2014.csv")


# --- NEAREST - GLM - Replace TRUE
nearestglmT <- matchit(CONTRACT_FARMING_YN ~ AGE    + EDU_INTERSCHOOL_YN    + MIGRA_EMPLOY_YN,
                  data = data, method = "nearest", distance = "glm", replace = TRUE)
nearestglmT
## A `matchit` object
##  - method: 1:1 nearest neighbor matching with replacement
##  - distance: Propensity score
##              - estimated with logistic regression
##  - number of obs.: 461 (original), 363 (matched)
##  - target estimand: ATT
##  - covariates: AGE, EDU_INTERSCHOOL_YN, MIGRA_EMPLOY_YN
summary(nearestglmT)
## 
## Call:
## matchit(formula = CONTRACT_FARMING_YN ~ AGE + EDU_INTERSCHOOL_YN + 
##     MIGRA_EMPLOY_YN, data = data, method = "nearest", distance = "glm", 
##     replace = TRUE)
## 
## Summary of Balance for All Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                  0.5333        0.5248          0.1889     0.9049
## AGE                      45.1803       44.7373          0.0333     0.9602
## EDU_INTERSCHOOL_YN        0.3525        0.4009         -0.1014          .
## MIGRA_EMPLOY_YN           0.6352        0.5576          0.1613          .
##                    eCDF Mean eCDF Max
## distance              0.0528   0.0976
## AGE                   0.0126   0.0583
## EDU_INTERSCHOOL_YN    0.0485   0.0485
## MIGRA_EMPLOY_YN       0.0776   0.0776
## 
## Summary of Balance for Matched Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                  0.5333        0.5333         -0.0007     0.9896
## AGE                      45.1803       45.0205          0.0120     0.9998
## EDU_INTERSCHOOL_YN        0.3525        0.3566         -0.0086          .
## MIGRA_EMPLOY_YN           0.6352        0.6393         -0.0085          .
##                    eCDF Mean eCDF Max Std. Pair Dist.
## distance              0.0021   0.0205          0.0049
## AGE                   0.0058   0.0205          0.0477
## EDU_INTERSCHOOL_YN    0.0041   0.0041          0.0086
## MIGRA_EMPLOY_YN       0.0041   0.0041          0.0085
## 
## Sample Sizes:
##               Control Treated
## All            217.       244
## Matched (ESS)   82.46     244
## Matched        119.       244
## Unmatched       98.         0
## Discarded        0.         0
plot(nearestglmT, type = "histogram")

nearestglmT_data <- match.data(nearestglmT)

# --- NEAREST - GLM - Replace FALSE
nearestglmF <- matchit(CONTRACT_FARMING_YN ~ AGE    + EDU_INTERSCHOOL_YN    + MIGRA_EMPLOY_YN,
                  data = data, method = "nearest", distance = "glm", replace = FALSE)
## Warning: Fewer control units than treated units; not all treated units will get
## a match.
nearestglmF
## A `matchit` object
##  - method: 1:1 nearest neighbor matching without replacement
##  - distance: Propensity score
##              - estimated with logistic regression
##  - number of obs.: 461 (original), 434 (matched)
##  - target estimand: ATT
##  - covariates: AGE, EDU_INTERSCHOOL_YN, MIGRA_EMPLOY_YN
summary(nearestglmF)
## 
## Call:
## matchit(formula = CONTRACT_FARMING_YN ~ AGE + EDU_INTERSCHOOL_YN + 
##     MIGRA_EMPLOY_YN, data = data, method = "nearest", distance = "glm", 
##     replace = FALSE)
## 
## Summary of Balance for All Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                  0.5333        0.5248          0.1889     0.9049
## AGE                      45.1803       44.7373          0.0333     0.9602
## EDU_INTERSCHOOL_YN        0.3525        0.4009         -0.1014          .
## MIGRA_EMPLOY_YN           0.6352        0.5576          0.1613          .
##                    eCDF Mean eCDF Max
## distance              0.0528   0.0976
## AGE                   0.0126   0.0583
## EDU_INTERSCHOOL_YN    0.0485   0.0485
## MIGRA_EMPLOY_YN       0.0776   0.0776
## 
## Summary of Balance for Matched Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                  0.5433        0.5248          0.4130     0.6034
## AGE                      45.4562       44.7373          0.0540     0.9959
## EDU_INTERSCHOOL_YN        0.2719        0.4009         -0.2701          .
## MIGRA_EMPLOY_YN           0.7143        0.5576          0.3255          .
##                    eCDF Mean eCDF Max Std. Pair Dist.
## distance              0.1108   0.1843          0.4179
## AGE                   0.0151   0.0599          1.1452
## EDU_INTERSCHOOL_YN    0.1290   0.1290          0.9260
## MIGRA_EMPLOY_YN       0.1567   0.1567          0.3255
## 
## Sample Sizes:
##           Control Treated
## All           217     244
## Matched       217     217
## Unmatched       0      27
## Discarded       0       0
plot(nearestglmF, type = "histogram")

nearestglmF_data <- match.data(nearestglmF)


# --- NEAREST - MAHALANOBIS - Replace TRUE
nearestmahalanobisT <- matchit(CONTRACT_FARMING_YN ~ AGE    + EDU_INTERSCHOOL_YN    + MIGRA_EMPLOY_YN,
                  data = data, method = "nearest", distance = "mahalanobis", replace = TRUE)
nearestmahalanobisT
## A `matchit` object
##  - method: 1:1 nearest neighbor matching with replacement
##  - distance: Mahalanobis - number of obs.: 461 (original), 359 (matched)
##  - target estimand: ATT
##  - covariates: AGE, EDU_INTERSCHOOL_YN, MIGRA_EMPLOY_YN
summary(nearestmahalanobisT)
## 
## Call:
## matchit(formula = CONTRACT_FARMING_YN ~ AGE + EDU_INTERSCHOOL_YN + 
##     MIGRA_EMPLOY_YN, data = data, method = "nearest", distance = "mahalanobis", 
##     replace = TRUE)
## 
## Summary of Balance for All Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## AGE                      45.1803       44.7373          0.0333     0.9602
## EDU_INTERSCHOOL_YN        0.3525        0.4009         -0.1014          .
## MIGRA_EMPLOY_YN           0.6352        0.5576          0.1613          .
##                    eCDF Mean eCDF Max
## AGE                   0.0126   0.0583
## EDU_INTERSCHOOL_YN    0.0485   0.0485
## MIGRA_EMPLOY_YN       0.0776   0.0776
## 
## Summary of Balance for Matched Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## AGE                      45.1803       45.1967         -0.0012     1.0004
## EDU_INTERSCHOOL_YN        0.3525        0.3525          0.0000          .
## MIGRA_EMPLOY_YN           0.6352        0.6352          0.0000          .
##                    eCDF Mean eCDF Max Std. Pair Dist.
## AGE                   0.0047   0.0246          0.0333
## EDU_INTERSCHOOL_YN    0.0000   0.0000          0.0000
## MIGRA_EMPLOY_YN       0.0000   0.0000          0.0000
## 
## Sample Sizes:
##               Control Treated
## All            217.       244
## Matched (ESS)   80.24     244
## Matched        115.       244
## Unmatched      102.         0
## Discarded        0.         0
nearestmahalanobisT_data <- match.data(nearestmahalanobisT)

# --- NEAREST - MAHALANOBIS - Replace FALSE
nearestmahalanobisF <- matchit(CONTRACT_FARMING_YN ~ AGE    + EDU_INTERSCHOOL_YN    + MIGRA_EMPLOY_YN,
                  data = data, method = "nearest", distance = "mahalanobis", replace = FALSE)
## Warning: Fewer control units than treated units; not all treated units will get
## a match.
nearestmahalanobisF
## A `matchit` object
##  - method: 1:1 nearest neighbor matching without replacement
##  - distance: Mahalanobis - number of obs.: 461 (original), 434 (matched)
##  - target estimand: ATT
##  - covariates: AGE, EDU_INTERSCHOOL_YN, MIGRA_EMPLOY_YN
summary(nearestmahalanobisF)
## 
## Call:
## matchit(formula = CONTRACT_FARMING_YN ~ AGE + EDU_INTERSCHOOL_YN + 
##     MIGRA_EMPLOY_YN, data = data, method = "nearest", distance = "mahalanobis", 
##     replace = FALSE)
## 
## Summary of Balance for All Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## AGE                      45.1803       44.7373          0.0333     0.9602
## EDU_INTERSCHOOL_YN        0.3525        0.4009         -0.1014          .
## MIGRA_EMPLOY_YN           0.6352        0.5576          0.1613          .
##                    eCDF Mean eCDF Max
## AGE                   0.0126   0.0583
## EDU_INTERSCHOOL_YN    0.0485   0.0485
## MIGRA_EMPLOY_YN       0.0776   0.0776
## 
## Summary of Balance for Matched Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## AGE                      44.9447       44.7373          0.0156     0.9561
## EDU_INTERSCHOOL_YN        0.3594        0.4009         -0.0868          .
## MIGRA_EMPLOY_YN           0.6221        0.5576          0.1340          .
##                    eCDF Mean eCDF Max Std. Pair Dist.
## AGE                   0.0116   0.0507          0.1957
## EDU_INTERSCHOOL_YN    0.0415   0.0415          0.1061
## MIGRA_EMPLOY_YN       0.0645   0.0645          0.1340
## 
## Sample Sizes:
##           Control Treated
## All           217     244
## Matched       217     217
## Unmatched       0      27
## Discarded       0       0
nearestmahalanobisF_data <- match.data(nearestmahalanobisF)


# --- OPTIMAL - GLM
optimalglm <- matchit(CONTRACT_FARMING_YN ~ AGE + EDU_INTERSCHOOL_YN    + MIGRA_EMPLOY_YN,
                  data = data, method = "optimal", distance = "glm")
## Warning: Fewer control units than treated units; not all treated units will get
## a match.
optimalglm
## A `matchit` object
##  - method: 1:1 optimal pair matching
##  - distance: Propensity score
##              - estimated with logistic regression
##  - number of obs.: 461 (original), 434 (matched)
##  - target estimand: ATT
##  - covariates: AGE, EDU_INTERSCHOOL_YN, MIGRA_EMPLOY_YN
summary(optimalglm)
## 
## Call:
## matchit(formula = CONTRACT_FARMING_YN ~ AGE + EDU_INTERSCHOOL_YN + 
##     MIGRA_EMPLOY_YN, data = data, method = "optimal", distance = "glm")
## 
## Summary of Balance for All Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                  0.5333        0.5248          0.1889     0.9049
## AGE                      45.1803       44.7373          0.0333     0.9602
## EDU_INTERSCHOOL_YN        0.3525        0.4009         -0.1014          .
## MIGRA_EMPLOY_YN           0.6352        0.5576          0.1613          .
##                    eCDF Mean eCDF Max
## distance              0.0528   0.0976
## AGE                   0.0126   0.0583
## EDU_INTERSCHOOL_YN    0.0485   0.0485
## MIGRA_EMPLOY_YN       0.0776   0.0776
## 
## Summary of Balance for Matched Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                  0.5278        0.5248          0.0676     0.8934
## AGE                      44.9770       44.7373          0.0180     0.9845
## EDU_INTERSCHOOL_YN        0.3917        0.4009         -0.0193          .
## MIGRA_EMPLOY_YN           0.5899        0.5576          0.0670          .
##                    eCDF Mean eCDF Max Std. Pair Dist.
## distance              0.0196   0.0599          0.0719
## AGE                   0.0125   0.0507          0.4344
## EDU_INTERSCHOOL_YN    0.0092   0.0092          0.1736
## MIGRA_EMPLOY_YN       0.0323   0.0323          0.0862
## 
## Sample Sizes:
##           Control Treated
## All           217     244
## Matched       217     217
## Unmatched       0      27
## Discarded       0       0
plot(optimalglm, type = "histogram")

optimalglm_data <- match.data(optimalglm)


# --- OPTIMAL - MAHALANOBIS
optimalmahalanobis <- matchit(CONTRACT_FARMING_YN ~ AGE + EDU_INTERSCHOOL_YN    + MIGRA_EMPLOY_YN,
                  data = data, method = "optimal", distance = "mahalanobis")
## Warning: Fewer control units than treated units; not all treated units will get
## a match.
optimalmahalanobis
## A `matchit` object
##  - method: 1:1 optimal pair matching
##  - distance: Mahalanobis - number of obs.: 461 (original), 434 (matched)
##  - target estimand: ATT
##  - covariates: AGE, EDU_INTERSCHOOL_YN, MIGRA_EMPLOY_YN
summary(optimalmahalanobis)
## 
## Call:
## matchit(formula = CONTRACT_FARMING_YN ~ AGE + EDU_INTERSCHOOL_YN + 
##     MIGRA_EMPLOY_YN, data = data, method = "optimal", distance = "mahalanobis")
## 
## Summary of Balance for All Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## AGE                      45.1803       44.7373          0.0333     0.9602
## EDU_INTERSCHOOL_YN        0.3525        0.4009         -0.1014          .
## MIGRA_EMPLOY_YN           0.6352        0.5576          0.1613          .
##                    eCDF Mean eCDF Max
## AGE                   0.0126   0.0583
## EDU_INTERSCHOOL_YN    0.0485   0.0485
## MIGRA_EMPLOY_YN       0.0776   0.0776
## 
## Summary of Balance for Matched Data:
##                    Means Treated Means Control Std. Mean Diff. Var. Ratio
## AGE                      44.9401       44.7373          0.0152     0.9895
## EDU_INTERSCHOOL_YN        0.3871        0.4009         -0.0289          .
## MIGRA_EMPLOY_YN           0.5899        0.5576          0.0670          .
##                    eCDF Mean eCDF Max Std. Pair Dist.
## AGE                   0.0127   0.0507          0.1268
## EDU_INTERSCHOOL_YN    0.0138   0.0138          0.0289
## MIGRA_EMPLOY_YN       0.0323   0.0323          0.0670
## 
## Sample Sizes:
##           Control Treated
## All           217     244
## Matched       217     217
## Unmatched       0      27
## Discarded       0       0
optimalmahalanobis_data <- match.data(optimalmahalanobis)



# --- Write both to Excel at your specified path ---
# output_file <- "D:/OneDrive - IIT Kanpur/PhD_new/Papers/gingerpaper/PSM_results - EDU_INTERSCHOOL.xlsx"
# 
# wb <- createWorkbook()
# 
# addWorksheet(wb, "nearestglmT_data")
# writeData(wb, "nearestglmT_data", nearestglmT_data)
# addWorksheet(wb, "nearestglmF_data")
# writeData(wb, "nearestglmF_data", nearestglmF_data)
# 
# addWorksheet(wb, "nearestmahalanobisT_data")
# writeData(wb, "nearestmahalanobisT_data", nearestmahalanobisT_data)
# addWorksheet(wb, "nearestmahalanobisF_data")
# writeData(wb, "nearestmahalanobisF_data", nearestmahalanobisF_data)
# 
# addWorksheet(wb, "optimalglm_data")
# writeData(wb, "optimalglm_data", optimalglm_data)
# 
# addWorksheet(wb, "optimalmahalanobis_data")
# writeData(wb, "optimalmahalanobis_data", optimalmahalanobis_data)
# 
# saveWorkbook(wb, output_file, overwrite = TRUE)

Conventional DEA

# Load package
library(Benchmarking)
## Warning: package 'Benchmarking' was built under R version 4.4.3
## Loading required package: lpSolveAPI
## Warning: package 'lpSolveAPI' was built under R version 4.4.3
## Loading required package: ucminf
## Loading required package: quadprog
library(openxlsx)

# File path and sheet name
file_path <- "D:/OneDrive - IIT Kanpur/PhD_new/Papers/gingerpaper/Nepal_2014_matcheddatasets.xlsx"
sheet_name <- "MatchedData_EDU-INTERSCHOOL"

# Read the data
data <- read.xlsx(file_path, sheet = sheet_name)

# --- Prepare input and output matrices ---
X <- as.matrix(data[, c("Q_SEED", "Q_MANURE", "M_DAYS_LABOUR", "PLOT_AREA")])
Y <- as.matrix(data[, "PLOT_PROD"])


# --- IO-BCC Model
deaio <- dea(X,Y, RTS = "vrs", ORIENTATION = "in", SLACK = TRUE) #Use RTS = "crs" for CCR model AND ORIENTATION = "out" for output orientation
summary(deaio)
## Summary of efficiencies
## VRS technology and input orientated efficiency
## Number of firms with efficiency==1 are 56 out of 359 
## Mean efficiency: 0.77 
## ---                
##   Eff range       #    %
##   0.3<= E <0.4    6  1.7
##   0.4<= E <0.5   12  3.3
##   0.5<= E <0.6   41 11.4
##   0.6<= E <0.7   86 24.0
##   0.7<= E <0.8   56 15.6
##   0.8<= E <0.9   46 12.8
##   0.9<= E <1     56 15.6
##         E ==1    56 15.6
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3183  0.6377  0.7481  0.7701  0.9378  1.0000
results_deaio <- data.frame(data$ID,data$CONTRACT_FARMING,deaio$eff,deaio$slack,deaio$sx,deaio$sy)

# --- OO-BCC Model
deaoo <- dea(X,Y, RTS = "vrs", ORIENTATION = "out", SLACK = TRUE) #Use RTS = "crs" for CCR model AND ORIENTATION = "in" for input orientation
summary(deaoo)
## Summary of efficiencies
## VRS technology and output orientated efficiency
## Number of firms with efficiency==1 are 24 out of 359 
## Mean efficiency: 1.67 
## ---                
##   Eff range        #    %
##        F ==1      24  6.7
##     1< F =<1.1    22  6.1
##   1.1< F =<1.2    27  7.5
##   1.2< F =<1.3    26  7.2
##   1.3< F =<1.5    44 12.3
##   1.5< F =<  2   141 39.3
##     2< F =<  5    75 20.9
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.261   1.611   1.671   1.985   3.347
results_deaoo <- data.frame(data$ID,data$CONTRACT_FARMING,deaoo$eff,deaoo$slack,deaoo$sx,deaoo$sy)


# # --- Write both to Excel at your specified path ---
# output_file <- "D:/OneDrive - IIT Kanpur/PhD_new/Papers/gingerpaper/VRS_DEA_Conv_MatchedData_EDU-INTERSCHOOL_Results.xlsx"
# 
# wb <- createWorkbook()
# addWorksheet(wb, "Input_Oriented")
# writeData(wb, "Input_Oriented", results_deaio)
# 
# addWorksheet(wb, "Output_Oriented")
# writeData(wb, "Output_Oriented", results_deaoo)
# 
# saveWorkbook(wb, output_file, overwrite = TRUE)

Robust DEA

# Load package
library(rDEA)
## Warning: package 'rDEA' was built under R version 4.4.3
## Using the GLPK callable library version 5.0
## 
## Attaching package: 'rDEA'
## The following object is masked from 'package:Benchmarking':
## 
##     dea
library(openxlsx)

# File path and sheet name
file_path <- "D:/OneDrive - IIT Kanpur/PhD_new/Papers/gingerpaper/Nepal_2014_matcheddatasets.xlsx"
sheet_name <- "MatchedData_EDU-HIGHSCHOOL"

# Read the data
data <- read.xlsx(file_path, sheet = sheet_name)

# --- Prepare input and output matrices ---
X <- as.matrix(data[, c("Q_SEED", "Q_MANURE", "M_DAYS_LABOUR", "PLOT_AREA")])
Y <- as.matrix(data[, "PLOT_PROD"])

if (any(X <= 0) | any(Y <= 0)) {
  stop("DEA requires strictly positive inputs and outputs.")
}

set.seed(123)

# --- Input-oriented Robust DEA ---
dea_robust_io <- dea.robust(
  X = X, Y = Y,
  model = "input",    # input-oriented,  Use model = "output" for output orientation
  RTS = "variable",   # variable returns to scale (BCC), Use RTS = "constant" for CCR model
  B = 500,            # bootstrap replications
  alpha = 0.05,       # 95% CI
  bw = "cv"           # cross-validation bandwidth
)

data_io <- data
data_io$DEA_BiasCorrected_IO <- dea_robust_io$theta_hat_hat  # bias-corrected efficiency scores
data_io$DEA_Bias_IO          <- dea_robust_io$bias           # bootstrap-estimated bias
data_io$DEA_CI_Low_IO        <- dea_robust_io$theta_ci_low   # lower CI bounds
data_io$DEA_CI_High_IO       <- dea_robust_io$theta_ci_high  # upper CI bounds


# --- Output-oriented Robust DEA ---
dea_robust_oo <- dea.robust(
  X = X, Y = Y,
  model = "output",   # output-oriented,  Use model = "input" for input orientation
  RTS = "variable",   # variable returns to scale (BCC), Use RTS = "constant" for CCR model
  B = 500,            # bootstrap replications
  alpha = 0.05,       # 95% CI
  bw = "cv"           # cross-validation bandwidth
)

data_oo <- data
data_oo$DEA_BiasCorrected_OO <- dea_robust_oo$theta_hat_hat # bias-corrected efficiency scores
data_oo$DEA_Bias_OO          <- dea_robust_oo$bias # bootstrap-estimated bias
data_oo$DEA_CI_Low_OO        <- dea_robust_oo$theta_ci_low  # lower CI bounds
data_oo$DEA_CI_High_OO       <- dea_robust_oo$theta_ci_high  # upper CI bounds


# # --- Write both to Excel at your specified path ---
# output_file <- "D:/OneDrive - IIT Kanpur/PhD_new/Papers/gingerpaper/VRS_DEA_Robust_MatchedData_EDU-HIGHSCHOOL_Results.xlsx"
# 
# wb <- createWorkbook()
# addWorksheet(wb, "Input_Oriented")
# writeData(wb, "Input_Oriented", data_io)
# 
# addWorksheet(wb, "Output_Oriented")
# writeData(wb, "Output_Oriented", data_oo)
# 
# saveWorkbook(wb, output_file, overwrite = TRUE)
# 
# cat("Results exported to:\n", output_file, "\nwith two sheets: Input_Oriented and Output_Oriented\n")

Kolmogorov-Smirnov (K-S) and Kruskal-Wallis (K-W) tests

# Load libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# File path and sheet name
file_path <- "D:/OneDrive - IIT Kanpur/PhD_new/Papers/gingerpaper/Nepal_2014_matcheddatasets.xlsx"
sheet_name <- "MatchedData_EDU-INTERSCHOOL"

# Read data
df <- read_excel(file_path, sheet = sheet_name)

# Efficiency variables
eff_vars <- c("CRS.DEA_Robust_OO.IS", 
              "VRS.DEA_Robust_OO.IS", 
              "Scale.Eff.Robust.OO.IS", 
              "CRS.DEA_Robust_IO.IS", 
              "VRS.DEA_Robust_IO.IS", 
              "CRS.DEA_Conv_OO.IS", 
              "VRS.DEA_Conv_OO.IS", 
              "Scale.Eff.Conv.OO.IS", 
              "CRS.DEA_Conv_IO.IS", 
              "VRS.DEA_Conv_IO.IS",
              "PLOT_YIELD")

group_var <- "CONTRACT_FARMING_YN"

# Function to add significance stars
stars <- function(p) {
  if (p <= 0.01) return("***")
  else if (p <= 0.05) return("**")
  else if (p <= 0.1) return("*")
  else return("")
}

# Create results dataframe
results_list <- list()

for (var in eff_vars) {
  ks_test <- ks.test(df[[var]][df[[group_var]] == 1],
                     df[[var]][df[[group_var]] == 0]) #Kolmogorov-Smirnov (K-S) test
  
  kw_test <- kruskal.test(df[[var]] ~ as.factor(df[[group_var]]), data = df) # Kruskal-Wallis (K-W) test
  
  results_list[[var]] <- data.frame(
    Variable = var,
    KS_Statistic = round(ks_test$statistic, 3),
    KS_p_value = round(ks_test$p.value, 4),
    KS_Sig = stars(ks_test$p.value),
    KW_Chi_sq = round(kw_test$statistic, 3),
    KW_p_value = round(kw_test$p.value, 4),
    KW_Sig = stars(kw_test$p.value),
    Summary = ifelse(kw_test$p.value <= 0.05, 
                     "Significant difference", 
                     "No significant difference")
  )
}
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
## Warning in ks.test.default(df[[var]][df[[group_var]] == 1],
## df[[var]][df[[group_var]] == : p-value will be approximate in the presence of
## ties
results_df <- bind_rows(results_list)

print(results_df)
##                      Variable KS_Statistic KS_p_value KS_Sig KW_Chi_sq
## D...1    CRS.DEA_Robust_OO.IS        0.256     0.0001    ***    11.821
## D...2    VRS.DEA_Robust_OO.IS        0.248     0.0001    ***    12.421
## D...3  Scale.Eff.Robust.OO.IS        0.090     0.5457            0.003
## D...4    CRS.DEA_Robust_IO.IS        0.256     0.0001    ***    11.634
## D...5    VRS.DEA_Robust_IO.IS        0.146     0.0730      *     2.387
## D...6      CRS.DEA_Conv_OO.IS        0.210     0.0020    ***    12.718
## D...7      VRS.DEA_Conv_OO.IS        0.230     0.0005    ***     7.797
## D...8    Scale.Eff.Conv.OO.IS        0.077     0.7423            0.000
## D...9      CRS.DEA_Conv_IO.IS        0.210     0.0020    ***    12.559
## D...10     VRS.DEA_Conv_IO.IS        0.108     0.3226            1.439
## D...11             PLOT_YIELD        0.251     0.0001    ***    10.963
##        KW_p_value KW_Sig                   Summary
## D...1      0.0006    ***    Significant difference
## D...2      0.0004    ***    Significant difference
## D...3      0.9561        No significant difference
## D...4      0.0006    ***    Significant difference
## D...5      0.1224        No significant difference
## D...6      0.0004    ***    Significant difference
## D...7      0.0052    ***    Significant difference
## D...8      0.9851        No significant difference
## D...9      0.0004    ***    Significant difference
## D...10     0.2303        No significant difference
## D...11     0.0009    ***    Significant difference