1. Load the data

library(readxl)
data <- read_excel("Final_reg_data_for_potatoes_DE.xlsx")
data

# Rename columns for convenience
colnames(data) <- c("Year","Population","Producer_Price_Index","Producer_Price_USD",
                    "Avg_Max_Temp","Annual_Precip","GNI_per_capita","Cropland_area",
                    "Value_added_deflator","N2O_indirect","N2O_direct","GDP_deflator","NVWI")
str(data)

## tibble [31 × 13] (S3: tbl_df/tbl/data.frame)
##  $ Year                : num [1:31] 1991 1992 1993 1994 1995 ...
##  $ Population          : num [1:31] 80.3 80.9 81.5 81.8 82 ...
##  $ Producer_Price_Index: num [1:31] 87.1 75.9 29.2 69.3 123 ...
##  $ Producer_Price_USD  : num [1:31] 165.1 153 55.6 134.3 270 ...
##  $ Avg_Max_Temp        : num [1:31] 12.8 13.4 12.6 13.7 12.9 ...
##  $ Annual_Precip       : num [1:31] 655 814 878 856 841 ...
##  $ GNI_per_capita      : num [1:31] 23506 26657 25751 27255 31829 ...
##  $ Cropland_area       : num [1:31] 11807 11708 11911 12037 12061 ...
##  $ Value_added_deflator: num [1:31] 58.7 62.7 64.5 94.3 114.8 ...
##  $ N2O_indirect        : num [1:31] 0.062 0.066 0.0672 0.0565 0.059 0.0719 0.0644 0.0627 0.0644 0.0699 ...
##  $ N2O_direct          : num [1:31] 0.276 0.293 0.298 0.251 0.262 ...
##  $ GDP_deflator        : num [1:31] 75.5 84.5 82.9 86.1 99.2 ...
##  $ NVWI                : num [1:31] 139 159.6 120.7 88.1 143.1 ...

sum(is.na(data))

## [1] 0

summary(data)

##       Year        Population    Producer_Price_Index Producer_Price_USD
##  Min.   :1991   Min.   :80.30   Min.   : 29.23       Min.   : 55.6     
##  1st Qu.:1998   1st Qu.:81.20   1st Qu.: 60.56       1st Qu.:114.2     
##  Median :2006   Median :81.93   Median : 80.91       Median :165.1     
##  Mean   :2006   Mean   :81.91   Mean   : 86.94       Mean   :170.8     
##  3rd Qu.:2014   3rd Qu.:82.19   3rd Qu.:104.62       3rd Qu.:212.1     
##  Max.   :2021   Max.   :83.70   Max.   :176.25       Max.   :328.0     
##   Avg_Max_Temp   Annual_Precip   GNI_per_capita  Cropland_area  
##  Min.   :10.97   Min.   :564.2   Min.   :23506   Min.   :11708  
##  1st Qu.:12.79   1st Qu.:743.9   1st Qu.:27183   1st Qu.:11967  
##  Median :13.20   Median :796.5   Median :37481   Median :12038  
##  Mean   :13.26   Mean   :797.2   Mean   :37047   Mean   :12011  
##  3rd Qu.:13.71   3rd Qu.:866.8   3rd Qu.:46049   3rd Qu.:12074  
##  Max.   :15.12   Max.   :987.0   Max.   :53272   Max.   :12145  
##  Value_added_deflator  N2O_indirect       N2O_direct      GDP_deflator   
##  Min.   : 58.67       Min.   :0.05070   Min.   :0.2256   Min.   : 67.07  
##  1st Qu.: 85.71       1st Qu.:0.05675   1st Qu.:0.2524   1st Qu.: 83.69  
##  Median :100.00       Median :0.06100   Median :0.2709   Median : 99.23  
##  Mean   :102.05       Mean   :0.06075   Mean   :0.2700   Mean   : 97.53  
##  3rd Qu.:123.31       3rd Qu.:0.06250   3rd Qu.:0.2779   3rd Qu.:110.75  
##  Max.   :142.05       Max.   :0.07190   Max.   :0.3197   Max.   :119.06  
##       NVWI        
##  Min.   : -2.589  
##  1st Qu.: 20.372  
##  Median : 31.830  
##  Mean   : 52.180  
##  3rd Qu.: 82.458  
##  Max.   :159.640

2. Check for stationarity

library(urca)
library(tseries)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(dplyr)

## 
## Dołączanie pakietu: 'dplyr'

## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag

## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union

# ---- Variables ----
vars <- c("Population","GNI_per_capita","Producer_Price_Index","Producer_Price_USD",
          "Avg_Max_Temp","Annual_Precip","Cropland_area","Value_added_deflator",
          "N2O_indirect","N2O_direct","GDP_deflator","NVWI")

# ---- Function to test stationarity ----
check_stationarity <- function(x){
  x <- na.omit(x)
  max_lag <- floor(4*(length(x)/100)^(1/4))
  
  adf <- ur.df(x, type="trend", lags=max_lag, selectlags="AIC")
  adf_diff <- ur.df(diff(x), type="drift", lags=max_lag, selectlags="AIC")
  pp  <- pp.test(x)
  kpss <- kpss.test(x)
  
  level_stat <- ifelse(adf@teststat[1] < adf@cval[2, "5pct"] & pp$p.value < 0.05 & kpss$p.value > 0.05,
                       "Stationary", "Non-stationary")
  diff_stat <- ifelse(adf_diff@teststat[1] < adf_diff@cval[2, "5pct"], "Stationary", "Non-stationary")
  
  integ <- ifelse(level_stat=="Stationary", "I(0)",
                  ifelse(diff_stat=="Stationary","I(1)","I(?)"))
  
  return(c(Level=level_stat, First_Diff=diff_stat, Integration=integ))
}

# ---- Build stationarity table with automatic log-transform ----
stationarity_table <- lapply(vars, function(v){
  x <- data[[v]]
  res <- check_stationarity(x)
  
  # If I(?), apply log transform and retest
  if(res["Integration"]=="I(?)" & all(x>0)){
    x <- log(x)
    res <- check_stationarity(x)
  }
  
  data.frame(Variable=v,
             Level_ADF_PP_KPSS=res["Level"],
             First_Difference_ADF=res["First_Diff"],
             Integration_Order=res["Integration"],
             stringsAsFactors = FALSE)
}) %>% bind_rows()

## Warning in kpss.test(x): p-value greater than printed p-value

## Warning in kpss.test(x): p-value smaller than printed p-value

## Warning in pp.test(x): p-value smaller than printed p-value
## Warning in pp.test(x): p-value smaller than printed p-value

## Warning in kpss.test(x): p-value greater than printed p-value
## Warning in kpss.test(x): p-value greater than printed p-value

## Warning in kpss.test(x): p-value smaller than printed p-value

## Warning in pp.test(x): p-value smaller than printed p-value

## Warning in kpss.test(x): p-value smaller than printed p-value

## Warning in pp.test(x): p-value smaller than printed p-value

## Warning in kpss.test(x): p-value smaller than printed p-value
## Warning in kpss.test(x): p-value smaller than printed p-value

# ---- Print final table ----
print(stationarity_table)

##                        Variable Level_ADF_PP_KPSS First_Difference_ADF
## Level...1            Population    Non-stationary           Stationary
## Level...2        GNI_per_capita    Non-stationary           Stationary
## Level...3  Producer_Price_Index    Non-stationary           Stationary
## Level...4    Producer_Price_USD    Non-stationary           Stationary
## Level...5          Avg_Max_Temp    Non-stationary           Stationary
## Level...6         Annual_Precip        Stationary           Stationary
## Level...7         Cropland_area    Non-stationary           Stationary
## Level...8  Value_added_deflator    Non-stationary           Stationary
## Level...9          N2O_indirect    Non-stationary           Stationary
## Level...10           N2O_direct    Non-stationary           Stationary
## Level...11         GDP_deflator    Non-stationary           Stationary
## Level...12                 NVWI    Non-stationary           Stationary
##            Integration_Order
## Level...1               I(1)
## Level...2               I(1)
## Level...3               I(1)
## Level...4               I(1)
## Level...5               I(1)
## Level...6               I(0)
## Level...7               I(1)
## Level...8               I(1)
## Level...9               I(1)
## Level...10              I(1)
## Level...11              I(1)
## Level...12              I(1)

# Only annual prep is I(0)-stationary at basic level
# for ADF / PP Test: if p-value<0.05=stationary
# for KPSS Test: if p-value<0.05=non-stationary

3. Decide on the modeling approach:Check for cointegration (long-run relationship)

Use Johansen test if you have multiple I(1) variables.

Since you have ~30 observations and 11 I(1) variables, Johansen may fail due to rank deficiency.

Reduce variables (drop highly correlated ones like Producer_Price_Index vs Producer_Price_USD)

Or use Engle-Granger two-step (NVWI vs each regressor separately).

library(urca)
library(dplyr)
# Separate variables by integration order
# ---- Identify I(1) variables ----
I1_vars <- stationarity_table$Variable[stationarity_table$Integration_Order == "I(1)"]
I1_vars <- setdiff(I1_vars, "NVWI")  # Remove dependent variable
I1_vars

##  [1] "Population"           "GNI_per_capita"       "Producer_Price_Index"
##  [4] "Producer_Price_USD"   "Avg_Max_Temp"         "Cropland_area"       
##  [7] "Value_added_deflator" "N2O_indirect"         "N2O_direct"          
## [10] "GDP_deflator"

I0_vars <- stationarity_table$Variable[stationarity_table$Integration_Order=="I(0)"]
I0_vars

## [1] "Annual_Precip"

# ---- Function to run Engle-Granger test ----
eg_test <- function(dep, indep){
  df <- data[, c(dep, indep)]
  df <- na.omit(df)
  
  # Phillips-Ouliaris (Pz) test
  test <- ca.po(df, type="Pz", demean="trend")
  
  # Return as data.frame
  data.frame(
    Variable = indep,
    Test_statistic = round(test@teststat, 3),
    Critical_5pct = round(test@cval[2], 3),
    Cointegrated = ifelse(test@teststat > test@cval[2], "No", "Yes"),
    stringsAsFactors = FALSE
  )
}

# ---- Loop through I(1) variables ----
eg_results <- lapply(I1_vars, function(v){
  eg_test("NVWI", v)
}) %>% bind_rows()

# ---- View results ----
print(eg_results)

##                Variable Test_statistic Critical_5pct Cointegrated
## 1            Population          7.206        81.381          Yes
## 2        GNI_per_capita          4.695        81.381          Yes
## 3  Producer_Price_Index          6.336        81.381          Yes
## 4    Producer_Price_USD          7.480        81.381          Yes
## 5          Avg_Max_Temp          8.126        81.381          Yes
## 6         Cropland_area          7.734        81.381          Yes
## 7  Value_added_deflator          5.179        81.381          Yes
## 8          N2O_indirect          6.379        81.381          Yes
## 9            N2O_direct          6.367        81.381          Yes
## 10         GDP_deflator          5.601        81.381          Yes

#  since co-integration exists, use Engle-Granger Error Correction Model (ECM) to capture long run and short run dynamics

4. Engle-Granger Error Correction Model (ECM)

# a. Long-run regression: regress NVWI on all cointegrated I(1) variables.
long_run_formula <- as.formula(
  paste("NVWI ~", paste(eg_results$Variable[eg_results$Cointegrated=="Yes"], collapse=" + "))
)
long_run_model <- lm(long_run_formula, data=data)
summary(long_run_model)

## 
## Call:
## lm(formula = long_run_formula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.044  -8.352  -2.215  10.809  33.748 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           4.522e+03  7.986e+02   5.662 1.53e-05 ***
## Population           -1.285e+01  7.173e+00  -1.791 0.088370 .  
## GNI_per_capita       -1.528e-02  2.503e-03  -6.103 5.78e-06 ***
## Producer_Price_Index  1.989e+00  1.239e+00   1.605 0.124076    
## Producer_Price_USD   -9.351e-01  6.291e-01  -1.486 0.152783    
## Avg_Max_Temp         -7.236e+00  5.769e+00  -1.254 0.224209    
## Cropland_area        -3.048e-01  4.405e-02  -6.919 1.02e-06 ***
## Value_added_deflator  1.537e+00  3.826e-01   4.018 0.000674 ***
## N2O_indirect         -1.145e+05  1.110e+05  -1.032 0.314444    
## N2O_direct            2.595e+04  2.503e+04   1.037 0.312069    
## GDP_deflator          7.039e+00  1.567e+00   4.491 0.000223 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.27 on 20 degrees of freedom
## Multiple R-squared:  0.8799, Adjusted R-squared:  0.8198 
## F-statistic: 14.65 on 10 and 20 DF,  p-value: 3.953e-07

# b. Compute the error correction term (residuals from long-run regression):
data$ECT <- residuals(long_run_model)

# c.Short-run model: first-difference all I(1) variables, include ECT and I(0) variables in levels:
# First differences
data_diff <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI)))

# Build short-run formula
short_run_vars <- paste0("d_", eg_results$Variable[eg_results$Cointegrated=="Yes"])
short_run_formula <- as.formula(
  paste("d_NVWI ~", paste(short_run_vars, collapse=" + "),
        "+ ECT +", paste(I0_vars, collapse=" + "))
)

# Fit ECM
ecm_model <- lm(short_run_formula, data=data_diff)
summary(ecm_model)

## 
## Call:
## lm(formula = short_run_formula, data = data_diff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.639  -7.086   0.270   5.803  28.390 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             3.570e+01  3.627e+01   0.984  0.33884   
## d_Population           -2.452e+01  1.464e+01  -1.675  0.11225   
## d_GNI_per_capita       -1.213e-02  4.290e-03  -2.826  0.01164 * 
## d_Producer_Price_Index  7.941e-01  9.267e-01   0.857  0.40343   
## d_Producer_Price_USD   -3.334e-01  4.871e-01  -0.684  0.50300   
## d_Avg_Max_Temp         -9.030e+00  4.527e+00  -1.995  0.06238 . 
## d_Cropland_area        -2.449e-01  7.129e-02  -3.435  0.00316 **
## d_Value_added_deflator  1.282e+00  4.619e-01   2.774  0.01299 * 
## d_N2O_indirect         -5.258e+04  8.368e+04  -0.628  0.53809   
## d_N2O_direct            1.205e+04  1.891e+04   0.637  0.53255   
## d_GDP_deflator          5.419e+00  1.670e+00   3.245  0.00476 **
## ECT                     9.987e-01  3.070e-01   3.253  0.00469 **
## Annual_Precip          -4.350e-02  4.417e-02  -0.985  0.33845   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.87 on 17 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.7447, Adjusted R-squared:  0.5645 
## F-statistic: 4.132 on 12 and 17 DF,  p-value: 0.004064

5. check diagnostics

—- 5. Diagnostics for Short-Run (ECM) and Long-Run Models —-

library(lmtest)  # dwtest, bptest

## Ładowanie wymaganego pakietu: zoo

## 
## Dołączanie pakietu: 'zoo'

## Następujące obiekty zostały zakryte z 'package:base':
## 
##     as.Date, as.Date.numeric

library(car)     # vif

## Ładowanie wymaganego pakietu: carData

## 
## Dołączanie pakietu: 'car'

## Następujący obiekt został zakryty z 'package:dplyr':
## 
##     recode

library(urca)    # ur.df

# Function to run diagnostics
run_diagnostics <- function(model, model_name = "Model") {
  
  # Residuals
  res <- residuals(model)
  
  # Durbin-Watson Test (Autocorrelation)
  dw <- dwtest(model)
  dw_stat <- round(dw$statistic, 3)
  dw_p <- round(dw$p.value, 3)
  
  # Breusch-Pagan Test (Heteroskedasticity)
  bp <- bptest(model)
  bp_stat <- round(bp$statistic, 3)
  bp_p <- round(bp$p.value, 3)
  
  # Multicollinearity (VIF)
  vif_vals <- vif(model)
  high_vif <- names(vif_vals[vif_vals > 5])
  vif_flag <- if(length(high_vif) == 0) "No severe multicollinearity" else paste("High VIF for:", paste(high_vif, collapse=", "))
  
  # Residual stationarity (ADF)
  adf <- ur.df(res, type = "drift", selectlags = "AIC")
  resid_stationary <- ifelse(adf@teststat[1] < adf@cval[2, "5pct"], "Stationary", "Non-stationary")
  
  # Compile diagnostics table
  diagnostics <- data.frame(
    Test = c("Durbin-Watson (Autocorr)", 
             "Breusch-Pagan (Heteroskedasticity)", 
             "Residual Stationarity", 
             "Multicollinearity"),
    Result = c(paste0(dw_stat, ", p=", dw_p),
               paste0(bp_stat, ", p=", bp_p),
               resid_stationary,
               vif_flag),
    stringsAsFactors = FALSE
  )
  
  cat("\nDiagnostics for", model_name, ":\n")
  print(diagnostics)
  invisible(diagnostics)
}

# ---- Run diagnostics for Long-Run Model ----
diagnostics_long <- run_diagnostics(long_run_model, "Long-Run Model")

## 
## Diagnostics for Long-Run Model :
##                                 Test
## 1           Durbin-Watson (Autocorr)
## 2 Breusch-Pagan (Heteroskedasticity)
## 3              Residual Stationarity
## 4                  Multicollinearity
##                                                                                                                                 Result
## 1                                                                                                                       2.069, p=0.236
## 2                                                                                                                       8.846, p=0.547
## 3                                                                                                                           Stationary
## 4 High VIF for: GNI_per_capita, Producer_Price_Index, Producer_Price_USD, Value_added_deflator, N2O_indirect, N2O_direct, GDP_deflator

# ---- Run diagnostics for Short-Run ECM ----
diagnostics_short <- run_diagnostics(ecm_model, "Short-Run ECM")

## 
## Diagnostics for Short-Run ECM :
##                                 Test
## 1           Durbin-Watson (Autocorr)
## 2 Breusch-Pagan (Heteroskedasticity)
## 3              Residual Stationarity
## 4                  Multicollinearity
##                                                                                                                       Result
## 1                                                                                                              1.918, p=0.53
## 2                                                                                                             7.061, p=0.854
## 3                                                                                                                 Stationary
## 4 High VIF for: d_GNI_per_capita, d_Producer_Price_Index, d_Producer_Price_USD, d_N2O_indirect, d_N2O_direct, d_GDP_deflator

#####################################
# 5. check for variables with high VIF
# VIF diagnostics for short- and long-run models 
library(car)

check_vif <- function(model, model_name="Model") {
  vif_vals <- vif(model)
  vif_table <- data.frame(
    Variable = names(vif_vals),
    VIF = round(vif_vals, 2),
    HighVIF = ifelse(vif_vals > 10, "Yes", "No"),
    stringsAsFactors = FALSE
  )
  
  cat("\nVIF Table for", model_name, ":\n")
  print(vif_table)
  invisible(vif_table)
}

# ---- Run for long-run model ----
vif_long <- check_vif(long_run_model, "Long-Run Model")

## 
## VIF Table for Long-Run Model :
##                                  Variable      VIF HighVIF
## Population                     Population     3.29      No
## GNI_per_capita             GNI_per_capita    47.26     Yes
## Producer_Price_Index Producer_Price_Index   161.62     Yes
## Producer_Price_USD     Producer_Price_USD   191.30     Yes
## Avg_Max_Temp                 Avg_Max_Temp     2.01      No
## Cropland_area               Cropland_area     1.56      No
## Value_added_deflator Value_added_deflator     6.60      No
## N2O_indirect                 N2O_indirect 23859.83     Yes
## N2O_direct                     N2O_direct 23880.88     Yes
## GDP_deflator                 GDP_deflator    51.23     Yes

# ---- Run for short-run ECM ----
vif_short <- check_vif(ecm_model, "Short-Run ECM")

## 
## VIF Table for Short-Run ECM :
##                                      Variable      VIF HighVIF
## d_Population                     d_Population     1.70      No
## d_GNI_per_capita             d_GNI_per_capita    15.52     Yes
## d_Producer_Price_Index d_Producer_Price_Index   141.30     Yes
## d_Producer_Price_USD     d_Producer_Price_USD   163.90     Yes
## d_Avg_Max_Temp                 d_Avg_Max_Temp     1.49      No
## d_Cropland_area               d_Cropland_area     1.80      No
## d_Value_added_deflator d_Value_added_deflator     4.73      No
## d_N2O_indirect                 d_N2O_indirect 24662.46     Yes
## d_N2O_direct                     d_N2O_direct 24754.18     Yes
## d_GDP_deflator                 d_GDP_deflator    14.88     Yes
## ECT                                       ECT     2.19      No
## Annual_Precip                   Annual_Precip     1.81      No

6. REFINE YOUR MODEL (by removing variables)

—- Refined Long- and Short-Run Models —-

library(dplyr)
library(lmtest)
library(car)
library(urca)

# ---- 1. Refine Long-Run Model ----
# Keep only significant and low-VIF variables
long_vars_refined <- c("Population", "GNI_per_capita", "Avg_Max_Temp", "Producer_Price_Index","Cropland_area")
long_run_formula_refined <- as.formula(paste("NVWI ~", paste(long_vars_refined, collapse = " + ")))
long_run_model_refined <- lm(long_run_formula_refined, data = data)
summary(long_run_model_refined)

## 
## Call:
## lm(formula = long_run_formula_refined, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.765 -18.129  -2.294  20.518  71.117 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           3.908e+03  1.095e+03   3.568  0.00149 **
## Population           -1.462e+01  7.650e+00  -1.911  0.06756 . 
## GNI_per_capita       -1.624e-03  8.654e-04  -1.877  0.07227 . 
## Avg_Max_Temp         -2.358e+01  8.692e+00  -2.712  0.01191 * 
## Producer_Price_Index  3.686e-01  2.278e-01   1.618  0.11821   
## Cropland_area        -1.930e-01  6.468e-02  -2.984  0.00628 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.72 on 25 degrees of freedom
## Multiple R-squared:  0.5672, Adjusted R-squared:  0.4806 
## F-statistic: 6.553 on 5 and 25 DF,  p-value: 0.0005059

# ---- 2. Compute ECT ----
data$ECT_refined <- residuals(long_run_model_refined)

# ---- 3. Prepare differenced data for Short-Run ECM ----
data_diff <- data %>%
  mutate(across(all_of(I1_vars), ~ c(NA, diff(.)), .names="d_{.col}")) %>%
  mutate(d_NVWI = c(NA, diff(NVWI))) %>%
  mutate(ECT_refined = data$ECT_refined)  # add ECT to differenced data

# Keep differenced variables with low VIF and significant
short_vars_refined <- c("d_Population","d_GNI_per_capita", "d_Avg_Max_Temp","d_Producer_Price_Index", "d_Cropland_area", "d_Value_added_deflator")
short_run_formula_refined <- as.formula(paste("d_NVWI ~", paste(short_vars_refined, collapse = " + "), "+ ECT_refined"))
# Value_added_deflator was removed for long run because of high vif
# Fit Short-Run ECM
ecm_model_refined <- lm(short_run_formula_refined, data = data_diff)
summary(ecm_model_refined)

## 
## Call:
## lm(formula = short_run_formula_refined, data = data_diff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.911 -14.667   3.343  14.549  34.669 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)             -5.788592   4.746991  -1.219   0.2356  
## d_Population            -9.616154  14.950542  -0.643   0.5267  
## d_GNI_per_capita         0.002167   0.001923   1.127   0.2720  
## d_Avg_Max_Temp         -10.511194   4.785436  -2.196   0.0389 *
## d_Producer_Price_Index   0.221409   0.108949   2.032   0.0544 .
## d_Cropland_area         -0.157821   0.076935  -2.051   0.0523 .
## d_Value_added_deflator   0.275889   0.400433   0.689   0.4980  
## ECT_refined              0.382250   0.160455   2.382   0.0263 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.15 on 22 degrees of freedom
##   (1 obserwacja została skasowana z uwagi na braki w niej zawarte)
## Multiple R-squared:  0.4924, Adjusted R-squared:  0.3309 
## F-statistic: 3.049 on 7 and 22 DF,  p-value: 0.0212

# ---- 4. Optional: Quick Diagnostics Function ----
run_diagnostics <- function(model) {
  res <- residuals(model)
  dw <- dwtest(model); bp <- bptest(model)
  vif_vals <- vif(model); high_vif <- names(vif_vals[vif_vals>5])
  adf <- ur.df(res, type="drift", selectlags="AIC")
  
  data.frame(
    Test = c("Durbin-Watson", "Breusch-Pagan", "Residual Stationarity", "Multicollinearity"),
    Result = c(
      paste0(round(dw$statistic,3), ", p=", round(dw$p.value,3)),
      paste0(round(bp$statistic,3), ", p=", round(bp$p.value,3)),
      ifelse(adf@teststat[1]<adf@cval[2,"5pct"], "Stationary", "Non-stationary"),
      if(length(high_vif)==0) "No severe multicollinearity" else paste("High VIF for:", paste(high_vif, collapse=", "))
    ),
    stringsAsFactors = FALSE
  )
}


# Run diagnostics
run_diagnostics(long_run_model_refined)

run_diagnostics(ecm_model_refined)

################################################
# Adjust for serial correlation of refine long run_diagnostics
library(dynlm)
library(sandwich)
library(lmtest)

nw_se <- NeweyWest(long_run_model_refined, lag = 2, prewhite = FALSE)
coeftest(long_run_model_refined, vcov = nw_se)

## 
## t test of coefficients:
## 
##                         Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)           3.9081e+03  9.6018e+02  4.0702 0.0004136 ***
## Population           -1.4617e+01  6.9468e+00 -2.1042 0.0455879 *  
## GNI_per_capita       -1.6241e-03  1.0475e-03 -1.5504 0.1336022    
## Avg_Max_Temp         -2.3576e+01  8.1407e+00 -2.8961 0.0077390 ** 
## Producer_Price_Index  3.6862e-01  2.2424e-01  1.6439 0.1127175    
## Cropland_area        -1.9300e-01  4.7803e-02 -4.0373 0.0004502 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Potato_DE

1. Load the data

2. Check for stationarity

3. Decide on the modeling approach:Check for cointegration (long-run relationship)

Use Johansen test if you have multiple I(1) variables.

Since you have ~30 observations and 11 I(1) variables, Johansen may fail due to rank deficiency.

Reduce variables (drop highly correlated ones like Producer_Price_Index vs Producer_Price_USD)

Or use Engle-Granger two-step (NVWI vs each regressor separately).

4. Engle-Granger Error Correction Model (ECM)

5. check diagnostics

—- 5. Diagnostics for Short-Run (ECM) and Long-Run Models —-

6. REFINE YOUR MODEL (by removing variables)

—- Refined Long- and Short-Run Models —-