1. Load the data

library(readxl)
data <- read_excel("Final_reg_data_for_maize_DE.xlsx")
data

# Rename columns for convenience
colnames(data) <- c("Year","Population","Producer_Price_Index","Producer_Price_USD",
                    "Avg_Max_Temp","Annual_Precip","GNI_per_capita","Cropland_area",
                    "Value_added_deflator","N2O_indirect","N2O_direct","GDP_deflator","NVWI")
str(data)

## tibble [31 × 13] (S3: tbl_df/tbl/data.frame)
##  $ Year                : num [1:31] 1991 1992 1993 1994 1995 ...
##  $ Population          : num [1:31] 80.3 80.9 81.5 81.8 82 ...
##  $ Producer_Price_Index: num [1:31] 114.8 109.3 86.4 89 94.5 ...
##  $ Producer_Price_USD  : num [1:31] 214 217 162 170 204 ...
##  $ Avg_Max_Temp        : num [1:31] 12.8 13.4 12.6 13.7 12.9 ...
##  $ Annual_Precip       : num [1:31] 655 814 878 856 841 ...
##  $ GNI_per_capita      : num [1:31] 23506 26657 25751 27255 31829 ...
##  $ Cropland_area       : num [1:31] 11807 11708 11911 12037 12061 ...
##  $ Value_added_deflator: num [1:31] 58.7 62.7 64.5 94.3 114.8 ...
##  $ N2O_indirect        : num [1:31] 0.0552 0.0608 0.0749 0.0696 0.0679 0.0823 0.0895 0.0784 0.0913 0.0929 ...
##  $ N2O_direct          : num [1:31] 0.245 0.27 0.333 0.309 0.302 ...
##  $ GDP_deflator        : num [1:31] 75.5 84.5 82.9 86.1 99.2 ...
##  $ NVWI                : num [1:31] 1030 1095 1119 989 871 ...

sum(is.na(data))

## [1] 0

summary(data)

##       Year        Population    Producer_Price_Index Producer_Price_USD
##  Min.   :1991   Min.   :80.30   Min.   : 60.63       Min.   :101.6     
##  1st Qu.:1998   1st Qu.:81.20   1st Qu.: 78.00       1st Qu.:151.6     
##  Median :2006   Median :81.93   Median : 96.00       Median :177.4     
##  Mean   :2006   Mean   :81.91   Mean   : 96.22       Mean   :185.2     
##  3rd Qu.:2014   3rd Qu.:82.19   3rd Qu.:107.25       3rd Qu.:215.8     
##  Max.   :2021   Max.   :83.70   Max.   :140.61       Max.   :286.4     
##   Avg_Max_Temp   Annual_Precip   GNI_per_capita  Cropland_area  
##  Min.   :10.97   Min.   :564.2   Min.   :23506   Min.   :11708  
##  1st Qu.:12.79   1st Qu.:743.9   1st Qu.:27183   1st Qu.:11967  
##  Median :13.20   Median :796.5   Median :37481   Median :12038  
##  Mean   :13.26   Mean   :797.2   Mean   :37047   Mean   :12011  
##  3rd Qu.:13.71   3rd Qu.:866.8   3rd Qu.:46049   3rd Qu.:12074  
##  Max.   :15.12   Max.   :987.0   Max.   :53272   Max.   :12145  
##  Value_added_deflator  N2O_indirect       N2O_direct      GDP_deflator   
##  Min.   : 58.67       Min.   :0.05520   Min.   :0.2455   Min.   : 67.07  
##  1st Qu.: 85.71       1st Qu.:0.09015   1st Qu.:0.4007   1st Qu.: 83.69  
##  Median :100.00       Median :0.10440   Median :0.4641   Median : 99.23  
##  Mean   :102.05       Mean   :0.10396   Mean   :0.4621   Mean   : 97.53  
##  3rd Qu.:123.31       3rd Qu.:0.12045   3rd Qu.:0.5353   3rd Qu.:110.75  
##  Max.   :142.05       Max.   :0.15310   Max.   :0.6805   Max.   :119.06  
##       NVWI       
##  Min.   : 517.3  
##  1st Qu.: 874.7  
##  Median :1456.2  
##  Mean   :1705.9  
##  3rd Qu.:2251.3  
##  Max.   :4549.9

2. Check for stationarity

library(urca)
library(tseries)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(dplyr)

## 
## Dołączanie pakietu: 'dplyr'

## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag

## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union

# ---- Variables ----
vars <- c("Population","GNI_per_capita","Producer_Price_Index","Producer_Price_USD",
          "Avg_Max_Temp","Annual_Precip","Cropland_area","Value_added_deflator",
          "N2O_indirect","N2O_direct","GDP_deflator","NVWI")

# ---- Function to test stationarity ----
check_stationarity <- function(x){
  x <- na.omit(x)
  max_lag <- floor(4*(length(x)/100)^(1/4))
  
  adf <- ur.df(x, type="trend", lags=max_lag, selectlags="AIC")
  adf_diff <- ur.df(diff(x), type="drift", lags=max_lag, selectlags="AIC")
  pp  <- pp.test(x)
  kpss <- kpss.test(x)
  
  level_stat <- ifelse(adf@teststat[1] < adf@cval[2, "5pct"] & pp$p.value < 0.05 & kpss$p.value > 0.05,
                       "Stationary", "Non-stationary")
  diff_stat <- ifelse(adf_diff@teststat[1] < adf_diff@cval[2, "5pct"], "Stationary", "Non-stationary")
  
  integ <- ifelse(level_stat=="Stationary", "I(0)",
                  ifelse(diff_stat=="Stationary","I(1)","I(?)"))
  
  return(c(Level=level_stat, First_Diff=diff_stat, Integration=integ))
}

# ---- Build stationarity table with automatic log-transform ----
stationarity_table <- lapply(vars, function(v){
  x <- data[[v]]
  res <- check_stationarity(x)
  
  # If I(?), apply log transform and retest
  if(res["Integration"]=="I(?)" & all(x>0)){
    x <- log(x)
    res <- check_stationarity(x)
  }
  
  data.frame(Variable=v,
             Level_ADF_PP_KPSS=res["Level"],
             First_Difference_ADF=res["First_Diff"],
             Integration_Order=res["Integration"],
             stringsAsFactors = FALSE)
}) %>% bind_rows()

## Warning in kpss.test(x): p-value greater than printed p-value

## Warning in kpss.test(x): p-value smaller than printed p-value

## Warning in kpss.test(x): p-value greater than printed p-value

## Warning in pp.test(x): p-value smaller than printed p-value

## Warning in kpss.test(x): p-value greater than printed p-value
## Warning in kpss.test(x): p-value greater than printed p-value

## Warning in kpss.test(x): p-value smaller than printed p-value
## Warning in kpss.test(x): p-value smaller than printed p-value
## Warning in kpss.test(x): p-value smaller than printed p-value
## Warning in kpss.test(x): p-value smaller than printed p-value
## Warning in kpss.test(x): p-value smaller than printed p-value

# ---- Print final table ----
print(stationarity_table)

##                        Variable Level_ADF_PP_KPSS First_Difference_ADF
## Level...1            Population    Non-stationary           Stationary
## Level...2        GNI_per_capita    Non-stationary           Stationary
## Level...3  Producer_Price_Index    Non-stationary           Stationary
## Level...4    Producer_Price_USD    Non-stationary           Stationary
## Level...5          Avg_Max_Temp    Non-stationary           Stationary
## Level...6         Annual_Precip        Stationary           Stationary
## Level...7         Cropland_area    Non-stationary           Stationary
## Level...8  Value_added_deflator    Non-stationary           Stationary
## Level...9          N2O_indirect    Non-stationary           Stationary
## Level...10           N2O_direct    Non-stationary           Stationary
## Level...11         GDP_deflator    Non-stationary           Stationary
## Level...12                 NVWI    Non-stationary           Stationary
##            Integration_Order
## Level...1               I(1)
## Level...2               I(1)
## Level...3               I(1)
## Level...4               I(1)
## Level...5               I(1)
## Level...6               I(0)
## Level...7               I(1)
## Level...8               I(1)
## Level...9               I(1)
## Level...10              I(1)
## Level...11              I(1)
## Level...12              I(1)

# for ADF / PP Test: if p-value<0.05=stationary
# for KPSS Test: if p-value<0.05=non-stationary

3. Decide on the modeling approach:Check for cointegration (long-run relationship)

Use Johansen test if you have multiple I(1) variables.

Since you have ~30 observations and 11 I(1) variables, Johansen may fail due to rank deficiency.

Reduce variables (drop highly correlated ones like Producer_Price_Index vs Producer_Price_USD)

Or use Engle-Granger two-step (NVWI vs each regressor separately).

library(urca)
library(dplyr)
# Separate variables by integration order
# ---- Identify I(1) variables ----
I1_vars <- stationarity_table$Variable[stationarity_table$Integration_Order == "I(1)"]
I1_vars <- setdiff(I1_vars, "NVWI")  # Remove dependent variable
I1_vars

##  [1] "Population"           "GNI_per_capita"       "Producer_Price_Index"
##  [4] "Producer_Price_USD"   "Avg_Max_Temp"         "Cropland_area"       
##  [7] "Value_added_deflator" "N2O_indirect"         "N2O_direct"          
## [10] "GDP_deflator"

I0_vars <- stationarity_table$Variable[stationarity_table$Integration_Order=="I(0)"]
I0_vars

## [1] "Annual_Precip"

# ---- Function to run Engle-Granger test ----
eg_test <- function(dep, indep){
  df <- data[, c(dep, indep)]
  df <- na.omit(df)
  
  # Phillips-Ouliaris (Pz) test
  test <- ca.po(df, type="Pz", demean="trend")
  
  # Return as data.frame
  data.frame(
    Variable = indep,
    Test_statistic = round(test@teststat, 3),
    Critical_5pct = round(test@cval[2], 3),
    Cointegrated = ifelse(test@teststat > test@cval[2], "No", "Yes"),
    stringsAsFactors = FALSE
  )
}

# ---- Loop through I(1) variables ----
eg_results <- lapply(I1_vars, function(v){
  eg_test("NVWI", v)
}) %>% bind_rows()

# ---- View results ----
print(eg_results)

##                Variable Test_statistic Critical_5pct Cointegrated
## 1            Population          5.290        81.381          Yes
## 2        GNI_per_capita         12.146        81.381          Yes
## 3  Producer_Price_Index         11.421        81.381          Yes
## 4    Producer_Price_USD         12.620        81.381          Yes
## 5          Avg_Max_Temp          5.050        81.381          Yes
## 6         Cropland_area          4.725        81.381          Yes
## 7  Value_added_deflator          8.748        81.381          Yes
## 8          N2O_indirect          8.866        81.381          Yes
## 9            N2O_direct          8.866        81.381          Yes
## 10         GDP_deflator          7.970        81.381          Yes

#  since co-integration exists, use Engle-Granger Error Correction Model (ECM) to capture long run and short run dynamics

4. Engle-Granger Error Correction Model (ECM)

# a. Long-run regression: regress NVWI on all cointegrated I(1) variables.
long_run_formula <- as.formula(
  paste("NVWI ~", paste(eg_results$Variable[eg_results$Cointegrated=="Yes"], collapse=" + "))
)
long_run_model <- lm(long_run_formula, data=data)
summary(long_run_model)

## 
## Call:
## lm(formula = long_run_formula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -600.74 -233.00   45.42  210.52  762.48 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           4.329e+04  2.335e+04   1.854   0.0786 .
## Population           -1.154e+02  1.581e+02  -0.730   0.4741  
## GNI_per_capita        3.269e-02  7.548e-02   0.433   0.6696  
## Producer_Price_Index  2.891e+01  3.503e+01   0.825   0.4189  
## Producer_Price_USD   -2.053e+01  1.677e+01  -1.224   0.2352  
## Avg_Max_Temp          2.709e+02  1.146e+02   2.365   0.0283 *
## Cropland_area        -3.381e+00  1.428e+00  -2.368   0.0281 *
## Value_added_deflator  1.542e+01  7.844e+00   1.966   0.0633 .
## N2O_indirect          9.223e+05  2.539e+06   0.363   0.7203  
## N2O_direct           -2.085e+05  5.712e+05  -0.365   0.7189  
## GDP_deflator          3.682e+01  4.666e+01   0.789   0.4393  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 350.9 on 20 degrees of freedom
## Multiple R-squared:  0.9184, Adjusted R-squared:  0.8776 
## F-statistic: 22.51 on 10 and 20 DF,  p-value: 9.643e-09

# b. Compute the error correction term (residuals from long-run regression):
data$ECT <- residuals(long_run_model)

# c.Short-run model: first-difference all I(1) variables, include ECT and I(0) variables in levels:
# First differences
data_diff <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI)))

# Build short-run formula
short_run_vars <- paste0("d_", eg_results$Variable[eg_results$Cointegrated=="Yes"])
short_run_formula <- as.formula(
  paste("d_NVWI ~", paste(short_run_vars, collapse=" + "),
        "+ ECT +", paste(I0_vars, collapse=" + "))
)

# Fit ECM
ecm_model <- lm(short_run_formula, data=data_diff)
summary(ecm_model)

## 
## Call:
## lm(formula = short_run_formula, data = data_diff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -462.77  -99.91   -3.30  131.66  475.90 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -9.792e+01  5.044e+02  -0.194  0.84836    
## d_Population           -7.909e+01  2.108e+02  -0.375  0.71221    
## d_GNI_per_capita        1.875e-01  8.080e-02   2.320  0.03303 *  
## d_Producer_Price_Index  7.064e+01  3.527e+01   2.003  0.06140 .  
## d_Producer_Price_USD   -4.467e+01  1.782e+01  -2.507  0.02262 *  
## d_Avg_Max_Temp          2.179e+02  6.778e+01   3.215  0.00508 ** 
## d_Cropland_area        -3.785e+00  1.111e+00  -3.407  0.00336 ** 
## d_Value_added_deflator  2.267e+01  6.746e+00   3.360  0.00371 ** 
## d_N2O_indirect         -1.445e+06  1.443e+06  -1.002  0.33057    
## d_N2O_direct            3.239e+05  3.246e+05   0.998  0.33242    
## d_GDP_deflator          3.674e+01  3.753e+01   0.979  0.34136    
## ECT                     1.478e+00  2.789e-01   5.301 5.87e-05 ***
## Annual_Precip          -7.844e-02  6.117e-01  -0.128  0.89947    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 285.2 on 17 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.8021, Adjusted R-squared:  0.6625 
## F-statistic: 5.743 on 12 and 17 DF,  p-value: 0.0006396

5. check diagnostics

—- 5. Diagnostics for Short-Run (ECM) and Long-Run Models —-

library(lmtest)  # dwtest, bptest

## Ładowanie wymaganego pakietu: zoo

## 
## Dołączanie pakietu: 'zoo'

## Następujące obiekty zostały zakryte z 'package:base':
## 
##     as.Date, as.Date.numeric

library(car)     # vif

## Ładowanie wymaganego pakietu: carData

## 
## Dołączanie pakietu: 'car'

## Następujący obiekt został zakryty z 'package:dplyr':
## 
##     recode

library(urca)    # ur.df

# Function to run diagnostics
run_diagnostics <- function(model, model_name = "Model") {
  
  # Residuals
  res <- residuals(model)
  
  # Durbin-Watson Test (Autocorrelation)
  dw <- dwtest(model)
  dw_stat <- round(dw$statistic, 3)
  dw_p <- round(dw$p.value, 3)
  
  # Breusch-Pagan Test (Heteroskedasticity)
  bp <- bptest(model)
  bp_stat <- round(bp$statistic, 3)
  bp_p <- round(bp$p.value, 3)
  
  # Multicollinearity (VIF)
  vif_vals <- vif(model)
  high_vif <- names(vif_vals[vif_vals > 5])
  vif_flag <- if(length(high_vif) == 0) "No severe multicollinearity" else paste("High VIF for:", paste(high_vif, collapse=", "))
  
  # Residual stationarity (ADF)
  adf <- ur.df(res, type = "drift", selectlags = "AIC")
  resid_stationary <- ifelse(adf@teststat[1] < adf@cval[2, "5pct"], "Stationary", "Non-stationary")
  
  # Compile diagnostics table
  diagnostics <- data.frame(
    Test = c("Durbin-Watson (Autocorr)", 
             "Breusch-Pagan (Heteroskedasticity)", 
             "Residual Stationarity", 
             "Multicollinearity"),
    Result = c(paste0(dw_stat, ", p=", dw_p),
               paste0(bp_stat, ", p=", bp_p),
               resid_stationary,
               vif_flag),
    stringsAsFactors = FALSE
  )
  
  cat("\nDiagnostics for", model_name, ":\n")
  print(diagnostics)
  invisible(diagnostics)
}

# ---- Run diagnostics for Long-Run Model ----
diagnostics_long <- run_diagnostics(long_run_model, "Long-Run Model")

## 
## Diagnostics for Long-Run Model :
##                                 Test
## 1           Durbin-Watson (Autocorr)
## 2 Breusch-Pagan (Heteroskedasticity)
## 3              Residual Stationarity
## 4                  Multicollinearity
##                                                                                                                                 Result
## 1                                                                                                                        2.231, p=0.32
## 2                                                                                                                         9.8, p=0.458
## 3                                                                                                                           Stationary
## 4 High VIF for: GNI_per_capita, Producer_Price_Index, Producer_Price_USD, Value_added_deflator, N2O_indirect, N2O_direct, GDP_deflator

# ---- Run diagnostics for Short-Run ECM ----
diagnostics_short <- run_diagnostics(ecm_model, "Short-Run ECM")

## 
## Diagnostics for Short-Run ECM :
##                                 Test
## 1           Durbin-Watson (Autocorr)
## 2 Breusch-Pagan (Heteroskedasticity)
## 3              Residual Stationarity
## 4                  Multicollinearity
##                                                                                                                       Result
## 1                                                                                                             1.617, p=0.171
## 2                                                                                                             15.07, p=0.238
## 3                                                                                                                 Stationary
## 4 High VIF for: d_GNI_per_capita, d_Producer_Price_Index, d_Producer_Price_USD, d_N2O_indirect, d_N2O_direct, d_GDP_deflator

#####################################
#check for variables with high VIF
# VIF diagnostics for short- and long-run models 
library(car)

check_vif <- function(model, model_name="Model") {
  vif_vals <- vif(model)
  vif_table <- data.frame(
    Variable = names(vif_vals),
    VIF = round(vif_vals, 2),
    HighVIF = ifelse(vif_vals > 10, "Yes", "No"),
    stringsAsFactors = FALSE
  )
  
  cat("\nVIF Table for", model_name, ":\n")
  print(vif_table)
  invisible(vif_table)
}

# ---- Run for long-run model ----
vif_long <- check_vif(long_run_model, "Long-Run Model")

## 
## VIF Table for Long-Run Model :
##                                  Variable       VIF HighVIF
## Population                     Population      4.83      No
## GNI_per_capita             GNI_per_capita    129.64     Yes
## Producer_Price_Index Producer_Price_Index    118.32     Yes
## Producer_Price_USD     Producer_Price_USD    179.17     Yes
## Avg_Max_Temp                 Avg_Max_Temp      2.39      No
## Cropland_area               Cropland_area      4.95      No
## Value_added_deflator Value_added_deflator      8.37      No
## N2O_indirect                 N2O_indirect 991046.33     Yes
## N2O_direct                     N2O_direct 990610.04     Yes
## GDP_deflator                 GDP_deflator    137.01     Yes

# ---- Run for short-run ECM ----
vif_short <- check_vif(ecm_model, "Short-Run ECM")

## 
## VIF Table for Short-Run ECM :
##                                      Variable       VIF HighVIF
## d_Population                     d_Population      1.38      No
## d_GNI_per_capita             d_GNI_per_capita     21.62     Yes
## d_Producer_Price_Index d_Producer_Price_Index    116.25     Yes
## d_Producer_Price_USD     d_Producer_Price_USD    184.51     Yes
## d_Avg_Max_Temp                 d_Avg_Max_Temp      1.31      No
## d_Cropland_area               d_Cropland_area      1.72      No
## d_Value_added_deflator d_Value_added_deflator      3.96      No
## d_N2O_indirect                 d_N2O_indirect 209050.78     Yes
## d_N2O_direct                     d_N2O_direct 208929.35     Yes
## d_GDP_deflator                 d_GDP_deflator     29.50     Yes
## ECT                                       ECT      2.22      No
## Annual_Precip                   Annual_Precip      1.36      No

6. REFINE YOUR MODEL (by removing variables)

—- Refined Long- and Short-Run Models —-

library(dplyr)
library(lmtest)
library(car)
library(urca)

# ---- 1. Refine Long-Run Model ----
# Keep only significant and low-VIF variables
long_vars_refined <- c("Avg_Max_Temp", "Cropland_area", "Value_added_deflator", "Producer_Price_USD")
long_run_formula_refined <- as.formula(paste("NVWI ~", paste(long_vars_refined, collapse = " + ")))
long_run_model_refined <- lm(long_run_formula_refined, data = data)
summary(long_run_model_refined)

## 
## Call:
## lm(formula = long_run_formula_refined, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -810.94 -297.14   22.34  354.68  847.34 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          39015.868  13689.653   2.850 0.008443 ** 
## Avg_Max_Temp           464.324    116.417   3.988 0.000482 ***
## Cropland_area           -3.848      1.085  -3.546 0.001507 ** 
## Value_added_deflator    31.792      6.071   5.237  1.8e-05 ***
## Producer_Price_USD      -2.642      2.522  -1.048 0.304342    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 461.7 on 26 degrees of freedom
## Multiple R-squared:  0.8163, Adjusted R-squared:  0.788 
## F-statistic: 28.88 on 4 and 26 DF,  p-value: 3.151e-09

long_vars_refined2 <- c("Population", "Annual_Precip", "GNI_per_capita", "Producer_Price_Index")
long_run_formula_refined2 <- as.formula(paste("NVWI ~", paste(long_vars_refined2, collapse = " + ")))
long_run_model_refined2 <- lm(long_run_formula_refined2, data = data)
summary(long_run_model_refined2)

## 
## Call:
## lm(formula = long_run_formula_refined2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -825.85 -306.09  -55.82  267.85 1403.18 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.993e+04  9.634e+03  -3.107  0.00453 ** 
## Population            3.515e+02  1.167e+02   3.012  0.00572 ** 
## Annual_Precip        -2.139e-01  9.102e-01  -0.235  0.81607    
## GNI_per_capita        7.208e-02  1.404e-02   5.133 2.37e-05 ***
## Producer_Price_Index  3.595e+00  6.540e+00   0.550  0.58728    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 507 on 26 degrees of freedom
## Multiple R-squared:  0.7785, Adjusted R-squared:  0.7444 
## F-statistic: 22.84 on 4 and 26 DF,  p-value: 3.437e-08

long_vars_refined3 <- c("N2O_indirect", "GDP_deflator")
long_run_formula_refined3 <- as.formula(paste("NVWI ~", paste(long_vars_refined3, collapse = " + ")))
long_run_model_refined3 <- lm(long_run_formula_refined3, data = data)
summary(long_run_model_refined3)

## 
## Call:
## lm(formula = long_run_formula_refined3, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1054.5  -544.9  -128.7   309.6  2353.1 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2541.28     831.40  -3.057  0.00488 ** 
## N2O_indirect -3842.94    7434.33  -0.517  0.60927    
## GDP_deflator    47.65      11.62   4.101  0.00032 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 740.7 on 28 degrees of freedom
## Multiple R-squared:  0.4909, Adjusted R-squared:  0.4545 
## F-statistic:  13.5 on 2 and 28 DF,  p-value: 7.86e-05

# ---- 2. Compute ECT ----
data$ECT_refined <- residuals(long_run_model_refined)
data$ECT_refined2 <- residuals(long_run_model_refined2)
data$ECT_refined3 <- residuals(long_run_model_refined3)
# ---- 3. Prepare differenced data for Short-Run ECM ----
data_diff <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI))) %>%
  mutate(ECT_refined = data$ECT_refined)  # add ECT to differenced data

data_diff2 <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI))) %>%
  mutate(ECT_refined2 = data$ECT_refined2)  # add ECT to differenced data

data_diff3 <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI))) %>%
  mutate(ECT_refined3 = data$ECT_refined3)  # add ECT to differenced data

# Keep differenced variables with low VIF and significant
short_vars_refined <- c("d_Avg_Max_Temp", "d_Cropland_area", "d_Value_added_deflator", "d_Producer_Price_USD")
short_run_formula_refined <- as.formula(paste("d_NVWI ~", paste(short_vars_refined, collapse = " + "), "+ ECT_refined"))

short_vars_refined2 <- c("d_Population", "Annual_Precip", "d_GNI_per_capita", "d_Producer_Price_Index")
short_run_formula_refined2 <- as.formula(paste("d_NVWI ~", paste(short_vars_refined2, collapse = " + "), "+ ECT_refined2"))

short_vars_refined3 <- c("d_N2O_indirect", "d_GDP_deflator")
short_run_formula_refined3 <- as.formula(paste("d_NVWI ~", paste(short_vars_refined3, collapse = " + "), "+ ECT_refined3"))

# Fit Short-Run ECM
ecm_model_refined <- lm(short_run_formula_refined, data = data_diff)
summary(ecm_model_refined)

## 
## Call:
## lm(formula = short_run_formula_refined, data = data_diff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -685.53 -244.03  -82.11  297.80  646.10 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             28.1998    72.4233   0.389  0.70043   
## d_Avg_Max_Temp         323.3771    90.8614   3.559  0.00159 **
## d_Cropland_area         -1.2063     1.3024  -0.926  0.36357   
## d_Value_added_deflator  22.3862     7.5221   2.976  0.00657 **
## d_Producer_Price_USD    -3.9966     2.6359  -1.516  0.14253   
## ECT_refined              0.5157     0.2222   2.320  0.02914 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 392.5 on 24 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.4709, Adjusted R-squared:  0.3607 
## F-statistic: 4.273 on 5 and 24 DF,  p-value: 0.006395

ecm_model_refined2 <- lm(short_run_formula_refined2, data = data_diff2)
summary(ecm_model_refined2)

## 
## Call:
## lm(formula = short_run_formula_refined2, data = data_diff2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1217.57  -151.31   -18.87   257.24   469.91 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             684.66537  622.03913   1.101   0.2820   
## d_Population           -329.21915  272.42461  -1.208   0.2386   
## Annual_Precip            -0.79891    0.75947  -1.052   0.3033   
## d_GNI_per_capita          0.07783    0.03031   2.568   0.0169 * 
## d_Producer_Price_Index   -8.51679    5.74822  -1.482   0.1514   
## ECT_refined2              0.67075    0.18088   3.708   0.0011 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 403.5 on 24 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.4408, Adjusted R-squared:  0.3244 
## F-statistic: 3.784 on 5 and 24 DF,  p-value: 0.01143

ecm_model_refined3 <- lm(short_run_formula_refined3, data = data_diff3)
summary(ecm_model_refined3)

## 
## Call:
## lm(formula = short_run_formula_refined3, data = data_diff3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1188.75   -94.24    12.78   207.89   943.00 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      42.9468    91.1999   0.471   0.6416  
## d_N2O_indirect  323.8399  5537.1477   0.058   0.9538  
## d_GDP_deflator   11.8349    12.5106   0.946   0.3529  
## ECT_refined3      0.2246     0.1288   1.744   0.0929 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 487.8 on 26 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.1145, Adjusted R-squared:  0.01228 
## F-statistic:  1.12 on 3 and 26 DF,  p-value: 0.359

# ---- 4. Optional: Quick Diagnostics Function ----
run_diagnostics <- function(model) {
  res <- residuals(model)
  dw <- dwtest(model); bp <- bptest(model)
  vif_vals <- vif(model); high_vif <- names(vif_vals[vif_vals>5])
  adf <- ur.df(res, type="drift", selectlags="AIC")
  
  data.frame(
    Test = c("Durbin-Watson", "Breusch-Pagan", "Residual Stationarity", "Multicollinearity"),
    Result = c(
      paste0(round(dw$statistic,3), ", p=", round(dw$p.value,3)),
      paste0(round(bp$statistic,3), ", p=", round(bp$p.value,3)),
      ifelse(adf@teststat[1]<adf@cval[2,"5pct"], "Stationary", "Non-stationary"),
      if(length(high_vif)==0) "No severe multicollinearity" else paste("High VIF for:", paste(high_vif, collapse=", "))
    ),
    stringsAsFactors = FALSE
  )
}

# Run diagnostics
run_diagnostics(long_run_model_refined)

run_diagnostics(long_run_model_refined2)

run_diagnostics(long_run_model_refined3)

run_diagnostics(ecm_model_refined)

run_diagnostics(ecm_model_refined2)

run_diagnostics(ecm_model_refined3)

# Adjust for serial correlation of refine long run_diagnostics
library(dynlm)
library(sandwich)
library(lmtest)
nw_se2 <- NeweyWest(long_run_model_refined2, lag = 2, prewhite = FALSE)
coeftest(long_run_model_refined2, vcov = nw_se2)

## 
## t test of coefficients:
## 
##                         Estimate  Std. Error t value Pr(>|t|)    
## (Intercept)          -2.9932e+04  1.0757e+04 -2.7825 0.009911 ** 
## Population            3.5151e+02  1.2964e+02  2.7114 0.011714 *  
## Annual_Precip        -2.1388e-01  8.7763e-01 -0.2437 0.809380    
## GNI_per_capita        7.2085e-02  1.5141e-02  4.7607 6.32e-05 ***
## Producer_Price_Index  3.5945e+00  7.0459e+00  0.5102 0.614243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

nw_se3 <- NeweyWest(long_run_model_refined3, lag = 2, prewhite = FALSE)
coeftest(long_run_model_refined3, vcov = nw_se3)

## 
## t test of coefficients:
## 
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -2541.279    770.535 -3.2981 0.002654 **
## N2O_indirect -3842.937   8474.504 -0.4535 0.653705   
## GDP_deflator    47.646     15.023  3.1716 0.003659 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

nw_se4 <- NeweyWest(ecm_model_refined3, lag = 2, prewhite = FALSE)
coeftest(ecm_model_refined3, vcov = nw_se4)

## 
## t test of coefficients:
## 
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)      42.94676   71.12845  0.6038   0.5512
## d_N2O_indirect  323.83992 5447.57394  0.0594   0.9531
## d_GDP_deflator   11.83494   10.82283  1.0935   0.2842
## ECT_refined3      0.22462    0.13469  1.6677   0.1074

Maize DE

1. Load the data

2. Check for stationarity

3. Decide on the modeling approach:Check for cointegration (long-run relationship)

Use Johansen test if you have multiple I(1) variables.

Since you have ~30 observations and 11 I(1) variables, Johansen may fail due to rank deficiency.

Reduce variables (drop highly correlated ones like Producer_Price_Index vs Producer_Price_USD)

Or use Engle-Granger two-step (NVWI vs each regressor separately).

4. Engle-Granger Error Correction Model (ECM)

5. check diagnostics

—- 5. Diagnostics for Short-Run (ECM) and Long-Run Models —-

6. REFINE YOUR MODEL (by removing variables)

—- Refined Long- and Short-Run Models —-