1. Load the data

library(readxl)
data <- read_excel("Final_reg_data_for_wheat_DE.xlsx")
data

# Rename columns for convenience
colnames(data) <- c("Year","Population","Producer_Price_Index","Producer_Price_USD",
                    "Avg_Max_Temp","Annual_Precip","GNI_per_capita","Cropland_area",
                    "Value_added_deflator","N2O_indirect","N2O_direct","GDP_deflator","NVWI")
str(data)

## tibble [31 × 13] (S3: tbl_df/tbl/data.frame)
##  $ Year                : num [1:31] 1991 1992 1993 1994 1995 ...
##  $ Population          : num [1:31] 80.3 80.9 81.5 81.8 82 ...
##  $ Producer_Price_Index: num [1:31] 96.2 105 88.4 84.5 80.6 ...
##  $ Producer_Price_USD  : num [1:31] 178 207 164 160 173 ...
##  $ Avg_Max_Temp        : num [1:31] 12.8 13.4 12.6 13.7 12.9 ...
##  $ Annual_Precip       : num [1:31] 655 814 878 856 841 ...
##  $ GNI_per_capita      : num [1:31] 23506 26657 25751 27255 31829 ...
##  $ Cropland_area       : num [1:31] 11807 11708 11911 12037 12061 ...
##  $ Value_added_deflator: num [1:31] 58.7 62.7 64.5 94.3 114.8 ...
##  $ N2O_indirect        : num [1:31] 0.722 0.68 0.686 0.717 0.772 ...
##  $ N2O_direct          : num [1:31] 3.21 3.02 3.05 3.19 3.43 ...
##  $ GDP_deflator        : num [1:31] 75.5 84.5 82.9 86.1 99.2 ...
##  $ NVWI                : num [1:31] 1152 -208 991 404 617 ...

sum(is.na(data))

## [1] 0

summary(data)

##       Year        Population    Producer_Price_Index Producer_Price_USD
##  Min.   :1991   Min.   :80.30   Min.   : 63.06       Min.   : 95.1     
##  1st Qu.:1998   1st Qu.:81.20   1st Qu.: 73.16       1st Qu.:137.9     
##  Median :2006   Median :81.93   Median : 89.17       Median :173.0     
##  Mean   :2006   Mean   :81.91   Mean   : 93.87       Mean   :179.3     
##  3rd Qu.:2014   3rd Qu.:82.19   3rd Qu.:107.45       3rd Qu.:202.8     
##  Max.   :2021   Max.   :83.70   Max.   :139.85       Max.   :287.8     
##   Avg_Max_Temp   Annual_Precip   GNI_per_capita  Cropland_area  
##  Min.   :10.97   Min.   :564.2   Min.   :23506   Min.   :11708  
##  1st Qu.:12.79   1st Qu.:743.9   1st Qu.:27183   1st Qu.:11967  
##  Median :13.20   Median :796.5   Median :37481   Median :12038  
##  Mean   :13.26   Mean   :797.2   Mean   :37047   Mean   :12011  
##  3rd Qu.:13.71   3rd Qu.:866.8   3rd Qu.:46049   3rd Qu.:12074  
##  Max.   :15.12   Max.   :987.0   Max.   :53272   Max.   :12145  
##  Value_added_deflator  N2O_indirect      N2O_direct     GDP_deflator   
##  Min.   : 58.67       Min.   :0.6800   Min.   :3.022   Min.   : 67.07  
##  1st Qu.: 85.71       1st Qu.:0.8541   1st Qu.:3.796   1st Qu.: 83.69  
##  Median :100.00       Median :0.9582   Median :4.258   Median : 99.23  
##  Mean   :102.05       Mean   :0.9406   Mean   :4.180   Mean   : 97.53  
##  3rd Qu.:123.31       3rd Qu.:1.0448   3rd Qu.:4.643   3rd Qu.:110.75  
##  Max.   :142.05       Max.   :1.1962   Max.   :5.316   Max.   :119.06  
##       NVWI       
##  Min.   :-298.2  
##  1st Qu.: 618.0  
##  Median :1302.1  
##  Mean   :2205.9  
##  3rd Qu.:4143.0  
##  Max.   :5426.4

2. Check for stationarity

library(urca)
library(tseries)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(dplyr)

## 
## Dołączanie pakietu: 'dplyr'

## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag

## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union

# ---- Variables ----
vars <- c("Population","GNI_per_capita","Producer_Price_Index","Producer_Price_USD",
          "Avg_Max_Temp","Annual_Precip","Cropland_area","Value_added_deflator",
          "N2O_indirect","N2O_direct","GDP_deflator","NVWI")

# ---- Function to test stationarity ----
check_stationarity <- function(x){
  x <- na.omit(x)
  max_lag <- floor(4*(length(x)/100)^(1/4))
  
  adf <- ur.df(x, type="trend", lags=max_lag, selectlags="AIC")
  adf_diff <- ur.df(diff(x), type="drift", lags=max_lag, selectlags="AIC")
  pp  <- pp.test(x)
  kpss <- kpss.test(x)
  
  level_stat <- ifelse(adf@teststat[1] < adf@cval[2, "5pct"] & pp$p.value < 0.05 & kpss$p.value > 0.05,
                       "Stationary", "Non-stationary")
  diff_stat <- ifelse(adf_diff@teststat[1] < adf_diff@cval[2, "5pct"], "Stationary", "Non-stationary")
  
  integ <- ifelse(level_stat=="Stationary", "I(0)",
                  ifelse(diff_stat=="Stationary","I(1)","I(?)"))
  
  return(c(Level=level_stat, First_Diff=diff_stat, Integration=integ))
}

# ---- Build stationarity table with automatic log-transform ----
stationarity_table <- lapply(vars, function(v){
  x <- data[[v]]
  res <- check_stationarity(x)
  
  # If I(?), apply log transform and retest
  if(res["Integration"]=="I(?)" & all(x>0)){
    x <- log(x)
    res <- check_stationarity(x)
  }
  
  data.frame(Variable=v,
             Level_ADF_PP_KPSS=res["Level"],
             First_Difference_ADF=res["First_Diff"],
             Integration_Order=res["Integration"],
             stringsAsFactors = FALSE)
}) %>% bind_rows()

## Warning in kpss.test(x): p-value greater than printed p-value

## Warning in kpss.test(x): p-value smaller than printed p-value

## Warning in pp.test(x): p-value smaller than printed p-value

## Warning in kpss.test(x): p-value greater than printed p-value
## Warning in kpss.test(x): p-value greater than printed p-value

## Warning in kpss.test(x): p-value smaller than printed p-value
## Warning in kpss.test(x): p-value smaller than printed p-value
## Warning in kpss.test(x): p-value smaller than printed p-value
## Warning in kpss.test(x): p-value smaller than printed p-value
## Warning in kpss.test(x): p-value smaller than printed p-value

# ---- Print final table ----
print(stationarity_table)

##                        Variable Level_ADF_PP_KPSS First_Difference_ADF
## Level...1            Population    Non-stationary           Stationary
## Level...2        GNI_per_capita    Non-stationary           Stationary
## Level...3  Producer_Price_Index    Non-stationary           Stationary
## Level...4    Producer_Price_USD    Non-stationary           Stationary
## Level...5          Avg_Max_Temp    Non-stationary           Stationary
## Level...6         Annual_Precip        Stationary           Stationary
## Level...7         Cropland_area    Non-stationary           Stationary
## Level...8  Value_added_deflator    Non-stationary           Stationary
## Level...9          N2O_indirect    Non-stationary           Stationary
## Level...10           N2O_direct    Non-stationary           Stationary
## Level...11         GDP_deflator    Non-stationary           Stationary
## Level...12                 NVWI    Non-stationary           Stationary
##            Integration_Order
## Level...1               I(1)
## Level...2               I(1)
## Level...3               I(1)
## Level...4               I(1)
## Level...5               I(1)
## Level...6               I(0)
## Level...7               I(1)
## Level...8               I(1)
## Level...9               I(1)
## Level...10              I(1)
## Level...11              I(1)
## Level...12              I(1)

# Only annual prep is I(0)-stationary at basic level
# for ADF / PP Test: if p-value<0.05=stationary
# for KPSS Test: if p-value<0.05=non-stationary

3. Decide on the modeling approach:Check for cointegration (long-run relationship)

Use Johansen test if you have multiple I(1) variables.

Since you have ~30 observations and 11 I(1) variables, Johansen may fail due to rank deficiency.

Reduce variables (drop highly correlated ones like Producer_Price_Index vs Producer_Price_USD)

Or use Engle-Granger two-step (NVWI vs each regressor separately).

library(urca)
library(dplyr)
# Separate variables by integration order
# ---- Identify I(1) variables ----
I1_vars <- stationarity_table$Variable[stationarity_table$Integration_Order == "I(1)"]
I1_vars <- setdiff(I1_vars, "NVWI")  # Remove dependent variable
I1_vars

##  [1] "Population"           "GNI_per_capita"       "Producer_Price_Index"
##  [4] "Producer_Price_USD"   "Avg_Max_Temp"         "Cropland_area"       
##  [7] "Value_added_deflator" "N2O_indirect"         "N2O_direct"          
## [10] "GDP_deflator"

I0_vars <- stationarity_table$Variable[stationarity_table$Integration_Order=="I(0)"]
I0_vars

## [1] "Annual_Precip"

# ---- Function to run Engle-Granger test ----
eg_test <- function(dep, indep){
  df <- data[, c(dep, indep)]
  df <- na.omit(df)
  
  # Phillips-Ouliaris (Pz) test
  test <- ca.po(df, type="Pz", demean="trend")
  
  # Return as data.frame
  data.frame(
    Variable = indep,
    Test_statistic = round(test@teststat, 3),
    Critical_5pct = round(test@cval[2], 3),
    Cointegrated = ifelse(test@teststat > test@cval[2], "No", "Yes"),
    stringsAsFactors = FALSE
  )
}

# ---- Loop through I(1) variables ----
eg_results <- lapply(I1_vars, function(v){
  eg_test("NVWI", v)
}) %>% bind_rows()

# ---- View results ----
print(eg_results)

##                Variable Test_statistic Critical_5pct Cointegrated
## 1            Population          3.110        81.381          Yes
## 2        GNI_per_capita          6.290        81.381          Yes
## 3  Producer_Price_Index          7.404        81.381          Yes
## 4    Producer_Price_USD          8.533        81.381          Yes
## 5          Avg_Max_Temp          4.947        81.381          Yes
## 6         Cropland_area          4.026        81.381          Yes
## 7  Value_added_deflator          6.408        81.381          Yes
## 8          N2O_indirect          4.968        81.381          Yes
## 9            N2O_direct          4.968        81.381          Yes
## 10         GDP_deflator          4.415        81.381          Yes

#  since co-integration exists, use Engle-Granger Error Correction Model (ECM) to capture long run and short run dynamics

4. Engle-Granger Error Correction Model (ECM)

# a. Long-run regression: regress NVWI on all cointegrated I(1) variables.
long_run_formula <- as.formula(
  paste("NVWI ~", paste(eg_results$Variable[eg_results$Cointegrated=="Yes"], collapse=" + "))
)
long_run_model <- lm(long_run_formula, data=data)
summary(long_run_model)

## 
## Call:
## lm(formula = long_run_formula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -873.81 -553.78  -20.35  299.84 1665.70 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           1.107e+05  5.083e+04   2.178  0.04157 * 
## Population           -1.216e+03  3.657e+02  -3.326  0.00337 **
## GNI_per_capita        1.500e-01  1.627e-01   0.922  0.36768   
## Producer_Price_Index  1.618e+02  6.936e+01   2.333  0.03018 * 
## Producer_Price_USD   -8.836e+01  3.351e+01  -2.637  0.01581 * 
## Avg_Max_Temp         -2.252e+02  2.493e+02  -0.903  0.37705   
## Cropland_area        -1.617e+00  3.296e+00  -0.490  0.62916   
## Value_added_deflator  1.706e+01  1.766e+01   0.966  0.34555   
## N2O_indirect          7.607e+04  5.987e+06   0.013  0.98999   
## N2O_direct           -1.745e+04  1.347e+06  -0.013  0.98979   
## GDP_deflator          8.481e+01  9.424e+01   0.900  0.37888   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 791.7 on 20 degrees of freedom
## Multiple R-squared:  0.8836, Adjusted R-squared:  0.8254 
## F-statistic: 15.19 on 10 and 20 DF,  p-value: 2.917e-07

# b. Compute the error correction term (residuals from long-run regression):
data$ECT <- residuals(long_run_model)

# c.Short-run model: first-difference all I(1) variables, include ECT and I(0) variables in levels:
# First differences
data_diff <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI)))

# Build short-run formula
short_run_vars <- paste0("d_", eg_results$Variable[eg_results$Cointegrated=="Yes"])
short_run_formula <- as.formula(
  paste("d_NVWI ~", paste(short_run_vars, collapse=" + "),
        "+ ECT +", paste(I0_vars, collapse=" + "))
)

# Fit ECM
ecm_model <- lm(short_run_formula, data=data_diff)
summary(ecm_model)

## 
## Call:
## lm(formula = short_run_formula, data = data_diff)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -979.5 -292.4  111.8  302.9  775.3 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.452e+02  1.139e+03   0.215 0.832110    
## d_Population           -1.847e+03  4.745e+02  -3.892 0.001171 ** 
## d_GNI_per_capita        1.582e-01  1.535e-01   1.031 0.317117    
## d_Producer_Price_Index -5.349e+00  6.942e+01  -0.077 0.939481    
## d_Producer_Price_USD   -3.369e+00  3.389e+01  -0.099 0.921974    
## d_Avg_Max_Temp         -2.484e+02  1.484e+02  -1.673 0.112548    
## d_Cropland_area         2.653e+00  2.398e+00   1.106 0.283948    
## d_Value_added_deflator -1.570e+01  1.281e+01  -1.226 0.237022    
## d_N2O_indirect          9.551e+04  2.910e+06   0.033 0.974202    
## d_N2O_direct           -2.213e+04  6.548e+05  -0.034 0.973439    
## d_GDP_deflator         -1.327e+01  7.290e+01  -0.182 0.857739    
## ECT                     1.042e+00  2.532e-01   4.114 0.000725 ***
## Annual_Precip           1.113e-02  1.387e+00   0.008 0.993689    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 575.2 on 17 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.7304, Adjusted R-squared:  0.5402 
## F-statistic: 3.839 on 12 and 17 DF,  p-value: 0.005943

5. check diagnostics

—- 5. Diagnostics for Short-Run (ECM) and Long-Run Models —-

library(lmtest)  # dwtest, bptest

## Ładowanie wymaganego pakietu: zoo

## 
## Dołączanie pakietu: 'zoo'

## Następujące obiekty zostały zakryte z 'package:base':
## 
##     as.Date, as.Date.numeric

library(car)     # vif

## Ładowanie wymaganego pakietu: carData

## 
## Dołączanie pakietu: 'car'

## Następujący obiekt został zakryty z 'package:dplyr':
## 
##     recode

library(urca)    # ur.df

# Function to run diagnostics
run_diagnostics <- function(model, model_name = "Model") {
  
  # Residuals
  res <- residuals(model)
  
  # Durbin-Watson Test (Autocorrelation)
  dw <- dwtest(model)
  dw_stat <- round(dw$statistic, 3)
  dw_p <- round(dw$p.value, 3)
  
  # Breusch-Pagan Test (Heteroskedasticity)
  bp <- bptest(model)
  bp_stat <- round(bp$statistic, 3)
  bp_p <- round(bp$p.value, 3)
  
  # Multicollinearity (VIF)
  vif_vals <- vif(model)
  high_vif <- names(vif_vals[vif_vals > 5])
  vif_flag <- if(length(high_vif) == 0) "No severe multicollinearity" else paste("High VIF for:", paste(high_vif, collapse=", "))
  
  # Residual stationarity (ADF)
  adf <- ur.df(res, type = "drift", selectlags = "AIC")
  resid_stationary <- ifelse(adf@teststat[1] < adf@cval[2, "5pct"], "Stationary", "Non-stationary")
  
  # Compile diagnostics table
  diagnostics <- data.frame(
    Test = c("Durbin-Watson (Autocorr)", 
             "Breusch-Pagan (Heteroskedasticity)", 
             "Residual Stationarity", 
             "Multicollinearity"),
    Result = c(paste0(dw_stat, ", p=", dw_p),
               paste0(bp_stat, ", p=", bp_p),
               resid_stationary,
               vif_flag),
    stringsAsFactors = FALSE
  )
  
  cat("\nDiagnostics for", model_name, ":\n")
  print(diagnostics)
  invisible(diagnostics)
}

# ---- Run diagnostics for Long-Run Model ----
diagnostics_long <- run_diagnostics(long_run_model, "Long-Run Model")

## 
## Diagnostics for Long-Run Model :
##                                 Test
## 1           Durbin-Watson (Autocorr)
## 2 Breusch-Pagan (Heteroskedasticity)
## 3              Residual Stationarity
## 4                  Multicollinearity
##                                                                                                                                                            Result
## 1                                                                                                                                                  1.893, p=0.073
## 2                                                                                                                                                 11.003, p=0.357
## 3                                                                                                                                                      Stationary
## 4 High VIF for: Population, GNI_per_capita, Producer_Price_Index, Producer_Price_USD, Cropland_area, Value_added_deflator, N2O_indirect, N2O_direct, GDP_deflator

# ---- Run diagnostics for Short-Run ECM ----
diagnostics_short <- run_diagnostics(ecm_model, "Short-Run ECM")

## 
## Diagnostics for Short-Run ECM :
##                                 Test
## 1           Durbin-Watson (Autocorr)
## 2 Breusch-Pagan (Heteroskedasticity)
## 3              Residual Stationarity
## 4                  Multicollinearity
##                                                                                                                       Result
## 1                                                                                                             1.876, p=0.402
## 2                                                                                                             5.888, p=0.922
## 3                                                                                                                 Stationary
## 4 High VIF for: d_GNI_per_capita, d_Producer_Price_Index, d_Producer_Price_USD, d_N2O_indirect, d_N2O_direct, d_GDP_deflator

# check for variables with high VIF
# VIF diagnostics for short- and long-run models 
library(car)

check_vif <- function(model, model_name="Model") {
  vif_vals <- vif(model)
  vif_table <- data.frame(
    Variable = names(vif_vals),
    VIF = round(vif_vals, 2),
    HighVIF = ifelse(vif_vals > 10, "Yes", "No"),
    stringsAsFactors = FALSE
  )
  
  cat("\nVIF Table for", model_name, ":\n")
  print(vif_table)
  invisible(vif_table)
}

# ---- Run for long-run model ----
vif_long <- check_vif(long_run_model, "Long-Run Model")

## 
## VIF Table for Long-Run Model :
##                                  Variable         VIF HighVIF
## Population                     Population        5.07      No
## GNI_per_capita             GNI_per_capita      118.35     Yes
## Producer_Price_Index Producer_Price_Index      115.07     Yes
## Producer_Price_USD     Producer_Price_USD      161.39     Yes
## Avg_Max_Temp                 Avg_Max_Temp        2.22      No
## Cropland_area               Cropland_area        5.18      No
## Value_added_deflator Value_added_deflator        8.33      No
## N2O_indirect                 N2O_indirect 32508562.39     Yes
## N2O_direct                     N2O_direct 32507464.72     Yes
## GDP_deflator                 GDP_deflator      109.78     Yes

# ---- Run for short-run ECM ----
vif_short <- check_vif(ecm_model, "Short-Run ECM")

## 
## VIF Table for Short-Run ECM :
##                                      Variable        VIF HighVIF
## d_Population                     d_Population       1.72      No
## d_GNI_per_capita             d_GNI_per_capita      19.17     Yes
## d_Producer_Price_Index d_Producer_Price_Index     125.17     Yes
## d_Producer_Price_USD     d_Producer_Price_USD     178.81     Yes
## d_Avg_Max_Temp                 d_Avg_Max_Temp       1.55      No
## d_Cropland_area               d_Cropland_area       1.97      No
## d_Value_added_deflator d_Value_added_deflator       3.51      No
## d_N2O_indirect                 d_N2O_indirect 6300143.34     Yes
## d_N2O_direct                     d_N2O_direct 6300976.91     Yes
## d_GDP_deflator                 d_GDP_deflator      27.36     Yes
## ECT                                       ECT       2.40      No
## Annual_Precip                   Annual_Precip       1.72      No

6. REFINE YOUR MODEL (by removing variables)

—- Refined Long- and Short-Run Models —-

library(dplyr)
library(lmtest)
library(car)
library(urca)

# ---- 1. Refine Long-Run Model ----
# Keep only significant and low-VIF variables
long_vars_refined <- c("Avg_Max_Temp", "Cropland_area","Value_added_deflator","N2O_indirect")
long_run_formula_refined <- as.formula(paste("NVWI ~", paste(long_vars_refined, collapse = " + ")))
long_run_model_refined <- lm(long_run_formula_refined, data = data)
summary(long_run_model_refined)

## 
## Call:
## lm(formula = long_run_formula_refined, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1654.3  -557.0  -180.3   422.5  2092.6 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          91001.515  30281.460   3.005 0.005812 ** 
## Avg_Max_Temp            86.970    257.668   0.338 0.738428    
## Cropland_area           -8.424      2.467  -3.415 0.002102 ** 
## Value_added_deflator    44.410      9.739   4.560 0.000107 ***
## N2O_indirect          7126.639   1884.752   3.781 0.000825 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1050 on 26 degrees of freedom
## Multiple R-squared:  0.7337, Adjusted R-squared:  0.6927 
## F-statistic: 17.91 on 4 and 26 DF,  p-value: 3.567e-07

long_vars_refined2 <- c("Population", "Annual_Precip", "GNI_per_capita", "Producer_Price_Index")
long_run_formula_refined2 <- as.formula(paste("NVWI ~", paste(long_vars_refined2, collapse = " + ")))
long_run_model_refined2 <- lm(long_run_formula_refined2, data = data)
summary(long_run_model_refined2)

## 
## Call:
## lm(formula = long_run_formula_refined2, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1948.86  -587.64   -47.73   582.29  1975.04 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2525.8512 17506.3387   0.144    0.886    
## Population             -64.4408   213.4726  -0.302    0.765    
## Annual_Precip           -1.9587     1.6747  -1.170    0.253    
## GNI_per_capita           0.1636     0.0280   5.845 3.68e-06 ***
## Producer_Price_Index     4.8743    11.5192   0.423    0.676    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 933.5 on 26 degrees of freedom
## Multiple R-squared:  0.7897, Adjusted R-squared:  0.7573 
## F-statistic:  24.4 on 4 and 26 DF,  p-value: 1.776e-08

long_vars_refined3 <- c("N2O_direct", "GDP_deflator")
long_run_formula_refined3 <- as.formula(paste("NVWI ~", paste(long_vars_refined3, collapse = " + ")))
long_run_model_refined3 <- lm(long_run_formula_refined3, data = data)
summary(long_run_model_refined3)

## 
## Call:
## lm(formula = long_run_formula_refined3, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2427.25  -747.87   -17.64   745.76  2337.01 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8281.13    1587.95  -5.215 1.54e-05 ***
## N2O_direct     724.76     419.90   1.726   0.0954 .  
## GDP_deflator    76.46      15.99   4.783 5.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1171 on 28 degrees of freedom
## Multiple R-squared:  0.6436, Adjusted R-squared:  0.6181 
## F-statistic: 25.28 on 2 and 28 DF,  p-value: 5.344e-07

# ---- 2. Compute ECT ----
data$ECT_refined <- residuals(long_run_model_refined)
data$ECT_refined2 <- residuals(long_run_model_refined2)
data$ECT_refined3 <- residuals(long_run_model_refined3)
# ---- 3. Prepare differenced data for Short-Run ECM ----
data_diff <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI))) %>%
  mutate(ECT_refined = data$ECT_refined)  # add ECT to differenced data

data_diff2 <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI))) %>%
  mutate(ECT_refined2 = data$ECT_refined2)  # add ECT to differenced data

data_diff3 <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI))) %>%
  mutate(ECT_refined3 = data$ECT_refined3)  # add ECT to differenced data


# Keep differenced variables with low VIF and significant
short_vars_refined <- c("d_Avg_Max_Temp", "d_Cropland_area","d_Value_added_deflator","d_N2O_indirect")
short_run_formula_refined <- as.formula(paste("d_NVWI ~", paste(short_vars_refined, collapse = " + "), "+ ECT_refined"))

short_vars_refined2 <- c("d_Population", "Annual_Precip", "d_GNI_per_capita", "d_Producer_Price_Index")
short_run_formula_refined2 <- as.formula(paste("d_NVWI ~", paste(short_vars_refined2, collapse = " + "), "+ ECT_refined2"))

short_vars_refined3 <- c("d_N2O_indirect", "d_GDP_deflator")
short_run_formula_refined3 <- as.formula(paste("d_NVWI ~", paste(short_vars_refined3, collapse = " + "), "+ ECT_refined3"))

# Fit Short-Run ECM
ecm_model_refined <- lm(short_run_formula_refined, data = data_diff)
summary(ecm_model_refined)

## 
## Call:
## lm(formula = short_run_formula_refined, data = data_diff)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1919.73  -280.81   -41.76   510.54  1106.01 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             113.5179   133.7251   0.849  0.40433   
## d_Avg_Max_Temp          -57.4317   156.6600  -0.367  0.71713   
## d_Cropland_area           2.6280     2.6707   0.984  0.33493   
## d_Value_added_deflator   -0.4490     9.3545  -0.048  0.96212   
## d_N2O_indirect          307.2606  2093.4125   0.147  0.88454   
## ECT_refined               0.5103     0.1761   2.897  0.00791 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 722.9 on 24 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.3989, Adjusted R-squared:  0.2737 
## F-statistic: 3.186 on 5 and 24 DF,  p-value: 0.02404

ecm_model_refined2 <- lm(short_run_formula_refined2, data = data_diff2)
summary(ecm_model_refined2)

## 
## Call:
## lm(formula = short_run_formula_refined2, data = data_diff2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1390.46  -394.11    27.93   445.06  1435.79 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             2.477e+03  1.067e+03   2.321  0.02907 * 
## d_Population           -1.629e+03  5.256e+02  -3.098  0.00491 **
## Annual_Precip          -2.841e+00  1.300e+00  -2.185  0.03887 * 
## d_GNI_per_capita        1.335e-01  5.262e-02   2.537  0.01809 * 
## d_Producer_Price_Index -1.622e+01  8.767e+00  -1.850  0.07667 . 
## ECT_refined2            6.709e-01  1.902e-01   3.527  0.00173 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 687.4 on 24 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.4565, Adjusted R-squared:  0.3432 
## F-statistic: 4.031 on 5 and 24 DF,  p-value: 0.008498

ecm_model_refined3 <- lm(short_run_formula_refined3, data = data_diff3)
summary(ecm_model_refined3)

## 
## Call:
## lm(formula = short_run_formula_refined3, data = data_diff3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1806.72  -533.81    84.04   337.64  1859.91 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)
## (Intercept)      82.0986   160.5733   0.511    0.613
## d_N2O_indirect -746.5113  1988.9985  -0.375    0.710
## d_GDP_deflator   24.5903    22.1707   1.109    0.278
## ECT_refined3      0.1433     0.1719   0.834    0.412
## 
## Residual standard error: 859.9 on 26 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.07864,    Adjusted R-squared:  -0.02767 
## F-statistic: 0.7397 on 3 and 26 DF,  p-value: 0.538

# ---- 4. Optional: Quick Diagnostics Function ----
run_diagnostics <- function(model) {
  res <- residuals(model)
  dw <- dwtest(model); bp <- bptest(model)
  vif_vals <- vif(model); high_vif <- names(vif_vals[vif_vals>5])
  adf <- ur.df(res, type="drift", selectlags="AIC")
  
  data.frame(
    Test = c("Durbin-Watson", "Breusch-Pagan", "Residual Stationarity", "Multicollinearity"),
    Result = c(
      paste0(round(dw$statistic,3), ", p=", round(dw$p.value,3)),
      paste0(round(bp$statistic,3), ", p=", round(bp$p.value,3)),
      ifelse(adf@teststat[1]<adf@cval[2,"5pct"], "Stationary", "Non-stationary"),
      if(length(high_vif)==0) "No severe multicollinearity" else paste("High VIF for:", paste(high_vif, collapse=", "))
    ),
    stringsAsFactors = FALSE
  )
}

# Run diagnostics
run_diagnostics(long_run_model_refined)

run_diagnostics(long_run_model_refined2)

run_diagnostics(long_run_model_refined3)

run_diagnostics(ecm_model_refined)

run_diagnostics(ecm_model_refined2)

run_diagnostics(ecm_model_refined3)

7. Adjust for serial correlation of refine long run_diagnostics

library(dynlm)
library(sandwich)
library(lmtest)

nw_se2 <- NeweyWest(long_run_model_refined2, lag = 2, prewhite = FALSE)
coeftest(long_run_model_refined2, vcov = nw_se2)

## 
## t test of coefficients:
## 
##                         Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)           2.5259e+03  1.4498e+04  0.1742    0.8630    
## Population           -6.4441e+01  1.7762e+02 -0.3628    0.7197    
## Annual_Precip        -1.9587e+00  1.2380e+00 -1.5823    0.1257    
## GNI_per_capita        1.6363e-01  2.5518e-02  6.4125 8.569e-07 ***
## Producer_Price_Index  4.8743e+00  1.0677e+01  0.4565    0.6518    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

nw_se3 <- NeweyWest(long_run_model_refined3, lag = 2, prewhite = FALSE)
coeftest(long_run_model_refined3, vcov = nw_se3)

## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  -8281.132   1480.524 -5.5934 5.496e-06 ***
## N2O_direct     724.764    450.999  1.6070    0.1193    
## GDP_deflator    76.464     14.460  5.2878 1.263e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Wheat_DE

1. Load the data

2. Check for stationarity

3. Decide on the modeling approach:Check for cointegration (long-run relationship)

Use Johansen test if you have multiple I(1) variables.

Since you have ~30 observations and 11 I(1) variables, Johansen may fail due to rank deficiency.

Reduce variables (drop highly correlated ones like Producer_Price_Index vs Producer_Price_USD)

Or use Engle-Granger two-step (NVWI vs each regressor separately).

4. Engle-Granger Error Correction Model (ECM)

5. check diagnostics

—- 5. Diagnostics for Short-Run (ECM) and Long-Run Models —-

6. REFINE YOUR MODEL (by removing variables)

—- Refined Long- and Short-Run Models —-

7. Adjust for serial correlation of refine long run_diagnostics