RealEstate_Quarto

Real Estate - House Price Prediction

Library

# library ----
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'tidyr' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
Warning: package 'purrr' was built under R version 4.5.2
Warning: package 'forcats' was built under R version 4.5.2
Warning: package 'lubridate' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(TSA)
Warning: package 'TSA' was built under R version 4.5.2

Attaching package: 'TSA'

The following object is masked from 'package:readr':

    spec

The following objects are masked from 'package:stats':

    acf, arima

The following object is masked from 'package:utils':

    tar
library(tseries)
Warning: package 'tseries' was built under R version 4.5.2
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
library(lubridate)
library(forecast)
Warning: package 'forecast' was built under R version 4.5.2
Registered S3 methods overwritten by 'forecast':
  method       from
  fitted.Arima TSA 
  plot.Arima   TSA 
library(car)
Warning: package 'car' was built under R version 4.5.2
Loading required package: carData
Warning: package 'carData' was built under R version 4.5.2

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some
library(nlme)

Attaching package: 'nlme'

The following object is masked from 'package:forecast':

    getResponse

The following object is masked from 'package:dplyr':

    collapse
library(vars)
Warning: package 'vars' was built under R version 4.5.2
Loading required package: MASS

Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select

Loading required package: strucchange
Warning: package 'strucchange' was built under R version 4.5.2
Loading required package: zoo

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

Loading required package: sandwich
Warning: package 'sandwich' was built under R version 4.5.2

Attaching package: 'strucchange'

The following object is masked from 'package:stringr':

    boundary

Loading required package: urca
Warning: package 'urca' was built under R version 4.5.2
Loading required package: lmtest
Warning: package 'lmtest' was built under R version 4.5.2
library(lmtest)
library(tswge)
Warning: package 'tswge' was built under R version 4.5.2

Attaching package: 'tswge'

The following object is masked from 'package:lmtest':

    wages

The following object is masked from 'package:MASS':

    cement

The following object is masked from 'package:datasets':

    uspop
library(zoo)
library(combinat)
Warning: package 'combinat' was built under R version 4.5.2

Attaching package: 'combinat'

The following object is masked from 'package:utils':

    combn

Help (explain libraries)

# help for library ----
# help(package = "lubridate")
# help(package = "TSA")
# help(package = "stats")
# help(package = "nlme")
# help(package = "tseries")
# help(package = "vars")
# help(package = "lmtest")
# help(package = "tswge")

Help for Functions

# help ----
# ?base::as.Date
# ?base::min
# ?base::max
# ?base::tapply
# ?base::expression
# ?base::diff  # differencing with previous X
# ?base::cbind
# 
# ?lubridate::year
# ?lubridate::month
# 
# ?stats::ARMAacf  # theoretical data for acf or pacf -? ARMAacf(ar,ma,lag.max, pacf)
# ?stats::arima.sim  # simulate data for p,q -? arima.sim(list(ar,ma),n)
# ?stats::acf # plot acf -?  acf(data)
# ?stats::pacf # plot pacf -? pacf(data)
# ?stats::ar # fit AR model
# ?stats::arima  # fit ARIMA model -? arima(data, order=c(p,d,q))
# 
# ?TSA::eacf # matrix for check p,q -? eacf(data)
# ?TSA::BoxCox.ar  # transform y -? BoxCox.ar(data)
# 
# 
# ?stats::ar.yw  # validate ar by yule-walker coefficient
# 
# ?stats::predict
# ?stats::frequency  # check no. of observed/unit time # annual: 1, quarterly: 4, monthly: 12
# ?stats::rstandard  # standardized residuals (for diagnosing the model)
# ?stats::qqnorm
# ?stats::qqline
# ?stats::tsdiag  # std. residual, acf residual, p-value Ljung-Box
# ?stats::AIC
# ?stats::shapiro.test  # p>0.05 ; normal
# ?stats::predict.Arima # predict ARIMA model - predict(arima fitted model)
# ?stats::diffinv # inverse differencing
# 
# ?tseries::adf.test # check stationary p < 0.05 stationary
# 
# ?forecast::ets # ets model on y
# ?forecast::auto.arima # auto fit arima-? auto.arima(data, stepwise, approximation, trace)
# ?forecast::checkresiduals # check overall p-value from checking white noise
# ?forecast::tsCV #cross validation fit
# ?forecast::forecast.ets # -? forecast(obj,h)
# 
# ?car::durbinWatsonTest # check residual autocorrelation at lag 1 only: p > 0.05 => no autocorrelation
# ?nlme::intervals # compute CI on model 
# 
# ?lmtest::grangertest # check causality x to y
# 
# ?vars::VARselect # eda for select lag for VAR fitting
# ?vars::serial.test  # Portmanteau test on VAR residuals (any remaining autocorrelation?): p > 0.05 => no autocorrelation
# ?vars::causality # check granger test from xi to several xj's in same ts data
# ?vars::VAR # fit VAR model
# 
# ?tswge::ljung.wge # Ljung-Box test: p < 0.05 => residuals are still autocorrelated

Import Data

data <- read.csv('weekly_consolidated_data.csv',header = TRUE)
data$REF_DATE <- as.Date(data$REF_DATE, format = "%Y-%m-%d")
head(data)
    REF_DATE   Wage Cement Energy Lumber Metal HOUSE_PRICE_IDX
1 1990-01-07 23.804  60.75  33.02  68.59 57.62           65.79
2 1990-01-14 23.804  54.70  30.82  64.25 50.81           59.68
3 1990-01-21 23.804  57.09  36.13  68.18 51.21           62.00
4 1990-01-28 23.804  65.76  31.80  65.81 52.78           70.61
5 1990-02-04 23.804  66.62  33.16  65.05 50.23           71.41
6 1990-02-11 23.804  60.96  31.66  64.77 50.98           65.70
tail(data)
       REF_DATE  Wage Cement Energy Lumber  Metal HOUSE_PRICE_IDX
1821 2024-11-24 59.34 142.48 122.29 136.90 140.38          118.43
1822 2024-12-01 59.34 140.24 122.40 135.07 136.91          116.04
1823 2024-12-08 59.34 137.74 127.87 136.31 141.43          113.28
1824 2024-12-15 59.34 135.24 125.84 136.97 142.76          110.51
1825 2024-12-22 59.34 135.36 119.91 133.25 141.89          110.37
1826 2024-12-29 59.34 139.61 122.91 134.38 135.04          114.34
# Recheck data
H <- 52 # frequency as weekly
par(mfrow=c(1,1))

# EDA
min(data$REF_DATE)
[1] "1990-01-07"
max(data$REF_DATE)
[1] "2024-12-29"
# Convert each column to zoo time series
data_ts_list <- list(
  Wage       = zoo(data$Wage, order.by = data$REF_DATE),
  Cement     = zoo(data$Cement, order.by = data$REF_DATE),
  Energy     = zoo(data$Energy, order.by = data$REF_DATE),
  Lumber     = zoo(data$Lumber, order.by = data$REF_DATE),
  Metal      = zoo(data$Metal, order.by = data$REF_DATE),
  HousePrice = zoo(data$HOUSE_PRICE_IDX, order.by = data$REF_DATE)
)

Data Variables

# Set plotting layout: 2 rows x 3 columns
par(mfrow = c(2, 3), mar = c(4, 4, 3, 1))  # adjust margins

# Loop to plot each variable
for(name in names(data_ts_list)) {
  plot(
    data_ts_list[[name]],
    ylab = name,
    xlab = "Year",
    main = paste(name, "time series"),
    type = "l",
    col = "blue"
  )
}

Univariate

Data Manipulation

### Handle with Lubridate 
### Convert to ts objects (weekly) ##

data_ts <- zoo(data$HOUSE_PRICE_IDX, order.by = data$REF_DATE)
start(data_ts)
[1] "1990-01-07"
end(data_ts)
[1] "2024-12-29"

Split Train and Test

nTotal <- length(data_ts)

nTest <- 104        # e.g., last 2 year for test
nTrain <- nTotal - nTest

time_points <- time(data_ts)
end_train  <- time_points[nTrain]           # last train point
start_test <- time_points[nTrain + 1]       # first test point

data_ts_train <- window(data_ts, end = end_train)
data_ts_test  <- window(data_ts, start = start_test)
# Using zoo for weekly data splitting
# convert zoo to ts
data_ts_train <- ts(coredata(data_ts_train),
                    start = c(year(index(data_ts_train)[1]), 
                              week(index(data_ts_train)[1])),
                    frequency = H)

data_ts_test <- ts(coredata(data_ts_test),
                   start = c(year(index(data_ts_test)[1]), 
                             week(index(data_ts_test)[1])),
                   frequency = H)


# Check
start(data_ts_train); end(data_ts_train)
[1] 1990    1
[1] 2023    6
start(data_ts_test); end(data_ts_test)
[1] 2023    2
[1] 2025    1

EDA: Train

# Time Series plot
plot(data_ts_train, ylab="House Price Index", xlab="year", main="House Price Index History", col="darkgreen")

BoxCox: Transformation checking

# See BoxCox
BoxCox.ar(data_ts)
Warning in arima0(x, order = c(i, 0L, 0L), include.mean = demean): possible
convergence problem: optim gave code = 1
(the same warning is repeated 18 times as BoxCox.ar fits candidate AR models)

 # lambda = 1 , No transformation needed
# Find proper lambda
x_train <- as.numeric(data_ts_train)
lambda <- BoxCox.lambda(x_train)
lambda
[1] 1.040802
  • Lambda is approximately 1 (1.04), so no transformation is needed.
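For reference, a minimal sketch of how the transform would be applied if lambda were far from 1 (hypothetical use of forecast::BoxCox / InvBoxCox; not needed for this series):

# y_bc    <- BoxCox(data_ts_train, lambda)        # transform the training series
# fit_bc  <- ets(y_bc)                             # model the transformed series
# fc_orig <- InvBoxCox(forecast(fit_bc, h = nTest)$mean, lambda)  # back-transform forecasts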

ADF: Stationarity Checking

# Stationary test
adf.test(data_ts_train) # p < 0.05 ==> stationary
Warning in adf.test(data_ts_train): p-value smaller than printed p-value

    Augmented Dickey-Fuller Test

data:  data_ts_train
Dickey-Fuller = -4.4795, Lag order = 11, p-value = 0.01
alternative hypothesis: stationary
  • H₀: series is non-stationary

  • H₁: series is stationary

    The ADF test strongly rejects the null hypothesis of a unit root (p < 0.01), indicating the series is stationary and suitable for ARIMA modeling.

1. Modeling: ETS -> ETS(A,A,N) (Test: MAPE 2.628%, RMSE 3.695)

plot(decompose(data_ts_train), col="darkgreen")

ets_model <- ets(data_ts_train)# lm object  , let R select model type
Warning in ets(data_ts_train): I can't handle data with frequency greater than
24. Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
summary(ets_model) # ETS(A,A,N)
ETS(A,A,N) 

Call:
ets(y = data_ts_train)

  Smoothing parameters:
    alpha = 0.0603 
    beta  = 0.0029 

  Initial states:
    l = 68.0645 
    b = -0.1806 

  sigma:  3.0902

     AIC     AICc      BIC 
16722.72 16722.76 16749.98 

Training set error measures:
                     ME    RMSE      MAE        MPE     MAPE      MASE
Training set 0.03787999 3.08665 2.458344 -0.1187727 3.588953 0.5822222
                    ACF1
Training set 0.005314623
ets_model
ETS(A,A,N) 

Call:
ets(y = data_ts_train)

  Smoothing parameters:
    alpha = 0.0603 
    beta  = 0.0029 

  Initial states:
    l = 68.0645 
    b = -0.1806 

  sigma:  3.0902

     AIC     AICc      BIC 
16722.72 16722.76 16749.98 
plot(ets(data_ts_train))  # ignore seasonal
Warning in ets(data_ts_train): I can't handle data with frequency greater than
24. Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
lines(data_ts_test, col = "red")

The ETS(A,A,N) model (Holt's linear trend) fits the house price index well. The smooth upward trend is updated slowly over time (small α and β), capturing the long-term growth of the housing market. Residuals show almost no autocorrelation, and training errors are small (MAPE ≈ 3.6%), indicating a well-behaved model. Because ets() cannot handle a seasonal period above 24, seasonality was ignored during fitting, which is acceptable since the data do not show a clear weekly seasonal pattern.
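As a cross-check, ETS(A,A,N) is Holt's linear trend method, so an equivalent fit could be obtained directly with forecast::holt() (a sketch under that assumption; results should closely match the ets() output above):

# holt_fit <- holt(data_ts_train, h = nTest)   # Holt's linear trend = ETS(A,A,N)
# accuracy(holt_fit, data_ts_test)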

Cross Validation

# cross-validation # : overlay on data_ts_test ----
ets_forecast_cross <- forecast(ets_model,h = nTest)
accuracy(ets_forecast_cross$mean,data_ts_test)
                ME   RMSE      MAE       MPE     MAPE        ACF1 Theil's U
Test set -2.122207 3.6946 2.967062 -1.919768 2.627778 -0.06351401 0.8345208
plot(ets_forecast_cross)
lines(data_ts_test, col = "orange", lwd = 1, lty = 2)
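The hold-out above is a single train/test split. A rolling-origin evaluation with forecast::tsCV would give a more robust error estimate; a sketch, assuming ets() is refit at every origin (slow on ~1,800 weekly points, so it is not run here):

# cv_err <- tsCV(data_ts_train, function(y, h) forecast(ets(y), h = h), h = 1)
# sqrt(mean(cv_err^2, na.rm = TRUE))   # rolling one-step-ahead RMSE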

2. Modeling: ARIMA(0,1,2) (Test: MAPE 2.131%, RMSE 2.986)

# Plot ACF, PACF from train

par(mfrow = c(2,1))

# --- ACF ---
acf_out <- acf(data_ts_train, plot = FALSE, lag.max = 260)
lags_acf <- acf_out$lag[,1,1] * frequency(data_ts_train)

plot(
  lags_acf,acf_out$acf,type = "h",
  xlab = "Lag",ylab = "ACF",main = "ACF for house price index")

abline(h = c(-1,1) * qnorm(0.975) / sqrt(length(data_ts_train)), 
       col = "blue", lty = 2)

# --- PACF ---
pacf_out <- pacf(data_ts_train, plot = FALSE, lag.max = 260)
lags_pacf <- pacf_out$lag * frequency(data_ts_train)

plot(
  lags_pacf,pacf_out$acf,type = "h",
  xlab = "Lag",ylab = "Partial ACF",
  main = "PACF for house price index")

abline(h = c(-1,1) * qnorm(0.975) / sqrt(length(data_ts_train)), 
       col = "blue", lty = 2)

  • Non-stationary: The original house price index is highly persistent and clearly non-stationary, with ACF near 1 across many lags and a strong PACF spike at lag 1; the series must therefore be differenced before fitting an ARIMA model.
# Plot ACF, PACF from train

par(mfrow = c(2,1))
dif <- diff(data_ts_train, 1)

# --- ACF ---
acf_out <- acf(dif, plot = FALSE, lag.max = 260)
lags_acf <- acf_out$lag[,1,1] * frequency(data_ts_train)

plot(
  lags_acf,acf_out$acf,type = "h",
  xlab = "Lag",ylab = "ACF",main = "ACF for house price index at diff=1")

abline(h = c(-1,1) * qnorm(0.975) / sqrt(length(data_ts_train)), 
       col = "blue", lty = 2)

# --- PACF ---
pacf_out <- pacf(dif, plot = FALSE, lag.max = 260)
lags_pacf <- pacf_out$lag * frequency(data_ts_train)

plot(
  lags_pacf,pacf_out$acf,type = "h",
  xlab = "Lag",ylab = "PACF",
  main = "PACF for house price index at diff=1")

abline(h = c(-1,1) * qnorm(0.975) / sqrt(length(data_ts_train)), 
       col = "blue", lty = 2)

  • Diff = 1: After first differencing, the series becomes stationary with a strong negative MA(1) signature (the ACF cuts off after lag 1 while the PACF decays), indicating that ARIMA(0,1,1) is an appropriate model structure.
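A quick programmatic confirmation of the differencing order, assuming forecast::ndiffs with its default unit-root test:

# ndiffs(data_ts_train)   # expected to return 1, agreeing with d = 1 used below
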
# Grid search over candidate ARIMA(p,d,q) orders

# ===== Parameter grid =====
p_values <- 0:3
q_values <- 0:3
d_value  <- 1              # <-- change if needed (0,1,2...)

# Create combinations
arima_grid <- expand.grid(p = p_values,
                          d = d_value,
                          q = q_values)

# Empty results table
df_results <- data.frame(
  p = integer(),
  d = integer(),
  q = integer(),
  RMSE = numeric(),
  MAE = numeric(),
  MAPE = numeric(),
  MASE = numeric(),
  AIC = numeric(),
  stringsAsFactors = FALSE
)

# ===== Loop through all combinations =====
for(i in 1:nrow(arima_grid)) {
  
  p <- arima_grid$p[i]
  d <- arima_grid$d[i]
  q <- arima_grid$q[i]
  
  cat("Fitting ARIMA(", p, ",", d, ",", q, ")\n")
  
  # Fit ARIMA
  fit <- tryCatch(
    Arima(data_ts_train, order = c(p, d, q)),
    error = function(e) NULL
  )
  
  if(is.null(fit)) {
    cat(" --> Model failed. Skipping.\n")
    next
  }
  
  # Forecast
  fc <- forecast(fit, h = nTest)
  
  # Compute accuracy
  acc <- accuracy(fc$mean, data_ts_test)
  row_index <- if(nrow(acc) >= 2) 2 else 1
  
  RMSE <- acc[row_index, "RMSE"]
  MAE  <- acc[row_index, "MAE"]
  MAPE <- acc[row_index, "MAPE"]
  AIC  <- fit$aic
  

  # Append to results
  df_results <- rbind(df_results, data.frame(
    p = p,
    d = d,
    q = q,
    RMSE = RMSE,
    MAE  = MAE,
    MAPE = MAPE,
    AIC  = AIC,
    stringsAsFactors = FALSE
  ))
}
Fitting ARIMA( 0 , 1 , 0 )
Fitting ARIMA( 1 , 1 , 0 )
Fitting ARIMA( 2 , 1 , 0 )
Fitting ARIMA( 3 , 1 , 0 )
Fitting ARIMA( 0 , 1 , 1 )
Fitting ARIMA( 1 , 1 , 1 )
Fitting ARIMA( 2 , 1 , 1 )
Fitting ARIMA( 3 , 1 , 1 )
Fitting ARIMA( 0 , 1 , 2 )
Fitting ARIMA( 1 , 1 , 2 )
Fitting ARIMA( 2 , 1 , 2 )
Fitting ARIMA( 3 , 1 , 2 )
Fitting ARIMA( 0 , 1 , 3 )
Fitting ARIMA( 1 , 1 , 3 )
Fitting ARIMA( 2 , 1 , 3 )
Fitting ARIMA( 3 , 1 , 3 )
# ===== Sort results by AIC =====
df_results_sorted_AIC <- df_results[order(df_results$AIC), ]
print(df_results_sorted_AIC)
   p d q     RMSE      MAE     MAPE      AIC
11 2 1 2 2.985815 2.450115 2.153494 8770.962
15 2 1 3 2.986858 2.424079 2.127356 8810.189
5  0 1 1 2.988690 2.429791 2.132252 8813.645
9  0 1 2 2.985515 2.428485 2.130746 8814.490
6  1 1 1 2.985779 2.428642 2.130913 8814.570
13 0 1 3 2.983283 2.425287 2.127769 8814.600
14 1 1 3 2.989139 2.423859 2.126852 8814.823
7  2 1 1 2.983799 2.425744 2.128230 8814.824
10 1 1 2 2.993114 2.428318 2.131206 8816.631
8  3 1 1 2.983790 2.425747 2.128231 8816.824
12 3 1 2 2.989020 2.423899 2.126886 8816.846
16 3 1 3 2.988854 2.432787 2.134331 8818.265
4  3 1 0 2.958462 2.414617 2.115373 9052.979
3  2 1 0 2.960950 2.401528 2.104849 9135.605
2  1 1 0 2.965941 2.429227 2.127767 9357.610
1  0 1 0 3.754966 3.015859 2.673331 9841.243
# Create empty AIC matrix
aic_matrix <- matrix(NA, 
                      nrow = length(p_values), 
                      ncol = length(q_values),
                      dimnames = list(
                        paste0("p=", p_values),
                        paste0("q=", q_values)
                      ))

# Fill matrix with AIC
for (i in 1:nrow(df_results)) {
  p <- df_results$p[i]
  q <- df_results$q[i]
  AIC <- df_results$AIC[i]
  
  aic_matrix[p + 1, q + 1] <- AIC   # +1 because index starts at 1
}

# Print AIC matrix
aic_matrix
         q=0      q=1      q=2      q=3
p=0 9841.243 8813.645 8814.490 8814.600
p=1 9357.610 8814.570 8816.631 8814.823
p=2 9135.605 8814.824 8770.962 8810.189
p=3 9052.979 8816.824 8816.846 8818.265
# ===== Sort results by RMSE =====
df_results_sorted_RMSE <- df_results[order(df_results$RMSE), ]
print(df_results_sorted_RMSE)
   p d q     RMSE      MAE     MAPE      AIC
4  3 1 0 2.958462 2.414617 2.115373 9052.979
3  2 1 0 2.960950 2.401528 2.104849 9135.605
2  1 1 0 2.965941 2.429227 2.127767 9357.610
13 0 1 3 2.983283 2.425287 2.127769 8814.600
8  3 1 1 2.983790 2.425747 2.128231 8816.824
7  2 1 1 2.983799 2.425744 2.128230 8814.824
9  0 1 2 2.985515 2.428485 2.130746 8814.490
6  1 1 1 2.985779 2.428642 2.130913 8814.570
11 2 1 2 2.985815 2.450115 2.153494 8770.962
15 2 1 3 2.986858 2.424079 2.127356 8810.189
5  0 1 1 2.988690 2.429791 2.132252 8813.645
16 3 1 3 2.988854 2.432787 2.134331 8818.265
12 3 1 2 2.989020 2.423899 2.126886 8816.846
14 1 1 3 2.989139 2.423859 2.126852 8814.823
10 1 1 2 2.993114 2.428318 2.131206 8816.631
1  0 1 0 3.754966 3.015859 2.673331 9841.243
# Create empty RMSE matrix
rmse_matrix <- matrix(NA, 
                      nrow = length(p_values), 
                      ncol = length(q_values),
                      dimnames = list(
                        paste0("p=", p_values),
                        paste0("q=", q_values)
                      ))

# Fill matrix with RMSE
for (i in 1:nrow(df_results)) {
  p <- df_results$p[i]
  q <- df_results$q[i]
  RMSE <- df_results$RMSE[i]
  
  rmse_matrix[p + 1, q + 1] <- RMSE   # +1 because index starts at 1
}

# Print RMSE matrix
rmse_matrix
         q=0      q=1      q=2      q=3
p=0 3.754966 2.988690 2.985515 2.983283
p=1 2.965941 2.985779 2.993114 2.989139
p=2 2.960950 2.983799 2.985815 2.986858
p=3 2.958462 2.983790 2.989020 2.988854

2.1 Auto ARIMA -> ARIMA(2,1,2) (Test: MAPE 2.154%, RMSE 2.986)

# ARIMA model ----
auto_arima <- auto.arima(data_ts_train,
                         stepwise = T,
                         seasonal = F,
                         approximation = F,
                         trace = T)

 ARIMA(2,1,2)            with drift         : 8772.131
 ARIMA(0,1,0)            with drift         : 9843.164
 ARIMA(1,1,0)            with drift         : 9359.37
 ARIMA(0,1,1)            with drift         : 8804.167
 ARIMA(0,1,0)                               : 9841.245
 ARIMA(1,1,2)            with drift         : 8807.119
 ARIMA(2,1,1)            with drift         : 8805.835
 ARIMA(3,1,2)            with drift         : 8807.786
 ARIMA(2,1,3)            with drift         : Inf
 ARIMA(1,1,1)            with drift         : 8805.312
 ARIMA(1,1,3)            with drift         : 8805.757
 ARIMA(3,1,1)            with drift         : 8807.842
 ARIMA(3,1,3)            with drift         : Inf
 ARIMA(2,1,2)                               : 8770.997
 ARIMA(1,1,2)                               : 8816.655
 ARIMA(2,1,1)                               : 8814.847
 ARIMA(3,1,2)                               : 8816.895
 ARIMA(2,1,3)                               : Inf
 ARIMA(1,1,1)                               : 8814.583
 ARIMA(1,1,3)                               : 8814.858
 ARIMA(3,1,1)                               : 8816.859
 ARIMA(3,1,3)                               : Inf

 Best model: ARIMA(2,1,2)                               
# Summary Best Auto ARIMA
summary(auto_arima)
Series: data_ts_train 
ARIMA(2,1,2) 

Coefficients:
         ar1      ar2      ma1     ma2
      0.9993  -0.0139  -1.9336  0.9379
s.e.  0.0259   0.0260   0.0094  0.0095

sigma^2 = 9.521:  log likelihood = -4380.48
AIC=8770.96   AICc=8771   BIC=8798.21

Training set error measures:
                    ME     RMSE      MAE         MPE     MAPE      MASE
Training set 0.1069866 3.081138 2.450959 -0.06255506 3.579664 0.5804732
                   ACF1
Training set -0.0009806

Cross Validation

## cross -validation # : overlay on data_ts_test ----
auto_arima_forecast_cross <- forecast(auto_arima,h = nTest)
accuracy(auto_arima_forecast_cross$mean,data_ts_test)
                 ME     RMSE      MAE        MPE     MAPE       ACF1 Theil's U
Test set -0.7097641 2.985815 2.450115 -0.6843765 2.153494 -0.1548798 0.6741929
plot(auto_arima_forecast_cross)
lines(data_ts_test, col = "orange", lwd = 1, lty = 2)

Normality of Standard Residuals

## normality of (standardized) residuals ####

par(mfrow=c(1,2))
hist(rstandard(auto_arima),xlab="Standardised residuals")
qqnorm(rstandard(auto_arima))
qqline(rstandard(auto_arima))

shapiro.test(rstandard(auto_arima))

    Shapiro-Wilk normality test

data:  rstandard(auto_arima)
W = 0.99942, p-value = 0.903
## Alternatively you can checkresiduals()
# It will give you Ljung-Box test for overall pattern (Portmanteau tests )
checkresiduals(auto_arima)


    Ljung-Box test

data:  Residuals from ARIMA(2,1,2)
Q* = 84.567, df = 100, p-value = 0.8655

Model df: 4.   Total lags used: 104
## ACF of residuals ####
tsdiag(auto_arima,gof.lag=20)

  • Normal: the Shapiro-Wilk test does not reject normality of the standardized residuals (p = 0.903), and the Ljung-Box test shows no remaining autocorrelation (Q* = 84.57, p = 0.866), so the ARIMA(2,1,2) residuals behave like white noise and the fit is adequate.
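The Ljung-Box statistic reported by checkresiduals() can be reproduced manually with stats::Box.test, setting fitdf to the number of estimated ARMA coefficients (a sketch assuming the same 104-lag window):

# Box.test(residuals(auto_arima), lag = 104, type = "Ljung-Box", fitdf = 4)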

2.2 Manually Choose Parameter

  • Try ARIMA(2,1,2), ARIMA(0,1,1), and ARIMA(0,1,2): fewer parameters with comparably low AIC.

2.2.1 Try ARIMA(0,1,1) -> ARIMA(0,1,1) (Test: MAPE 2.132%, RMSE 2.989)

Suggested by the ACF and PACF patterns above.

arima_011 <- Arima(data_ts_train, order = c(0,1,1))
summary(arima_011)
Series: data_ts_train 
ARIMA(0,1,1) 

Coefficients:
          ma1
      -0.8828
s.e.   0.0093

sigma^2 = 9.784:  log likelihood = -4404.82
AIC=8813.65   AICc=8813.65   BIC=8824.55

Training set error measures:
                   ME     RMSE      MAE        MPE     MAPE      MASE
Training set 0.246923 3.126104 2.483466 0.07375947 3.619464 0.5881718
                    ACF1
Training set -0.02925469

Cross Validation

## cross -validation # : overlay on data_ts_test ----
arima_011_forecast_cross <- forecast(arima_011,h = nTest)
accuracy(arima_011_forecast_cross$mean,data_ts_test)
                 ME    RMSE      MAE        MPE     MAPE       ACF1 Theil's U
Test set -0.4953798 2.98869 2.429791 -0.4986125 2.132252 -0.1187564  0.675264
plot(arima_011_forecast_cross)
lines(data_ts_test, col = "orange", lwd = 1, lty = 2)

Normality of Standard Residuals

par(mfrow=c(1,2))
hist(rstandard(arima_011),xlab="Standardised residuals")
qqnorm(rstandard(arima_011))
qqline(rstandard(arima_011))

shapiro.test(rstandard(arima_011))

    Shapiro-Wilk normality test

data:  rstandard(arima_011)
W = 0.99922, p-value = 0.7051
## Alternatively you can checkresiduals()
# It will give you Ljung-Box test for overall pattern (Portmanteau tests )
checkresiduals(arima_011)


    Ljung-Box test

data:  Residuals from ARIMA(0,1,1)
Q* = 109.62, df = 103, p-value = 0.3092

Model df: 1.   Total lags used: 104
## ACF of residuals ####
tsdiag(arima_011,gof.lag=20)

  • Normal: Shapiro-Wilk does not reject normality (p = 0.705) and the Ljung-Box test shows no remaining autocorrelation (p = 0.309), so the ARIMA(0,1,1) residuals are consistent with white noise.

2.2.2 Try ARIMA(0,1,2) -> ARIMA(0,1,2) (Test: MAPE 2.131%, RMSE 2.986)

Fewer parameters with nearly the lowest AIC.

arima_012 <- Arima(data_ts_train, order = c(0,1,2))
summary(arima_012)
Series: data_ts_train 
ARIMA(0,1,2) 

Coefficients:
          ma1     ma2
      -0.9074  0.0258
s.e.   0.0248  0.0240

sigma^2 = 9.783:  log likelihood = -4404.25
AIC=8814.49   AICc=8814.5   BIC=8830.84

Training set error measures:
                    ME     RMSE      MAE        MPE     MAPE      MASE
Training set 0.2446421 3.125041 2.481181 0.07155408 3.616221 0.5876308
                     ACF1
Training set -0.005206844

Cross Validation

## cross -validation # : overlay on data_ts_test ----
arima_012_forecast_cross <- forecast(arima_012,h = nTest)
accuracy(arima_012_forecast_cross$mean,data_ts_test)
                 ME     RMSE      MAE        MPE     MAPE       ACF1 Theil's U
Test set -0.4748687 2.985515 2.428485 -0.4807163 2.130746 -0.1188088 0.6745449
plot(arima_012_forecast_cross)
lines(data_ts_test, col = "orange", lwd = 1, lty = 2)

Normality of Standard Residuals

par(mfrow=c(1,2))
hist(rstandard(arima_012),xlab="Standardised residuals")
qqnorm(rstandard(arima_012))
qqline(rstandard(arima_012))

shapiro.test(rstandard(arima_012))

    Shapiro-Wilk normality test

data:  rstandard(arima_012)
W = 0.99919, p-value = 0.6713
## Alternatively you can checkresiduals()
# It will give you Ljung-Box test for overall pattern (Portmanteau tests )
checkresiduals(arima_012)


    Ljung-Box test

data:  Residuals from ARIMA(0,1,2)
Q* = 108.5, df = 102, p-value = 0.3112

Model df: 2.   Total lags used: 104
tsdiag(arima_012,gof.lag=20)

  • Normal: Shapiro-Wilk p = 0.671 and Ljung-Box p = 0.311, so the ARIMA(0,1,2) residuals show no evidence of non-normality or leftover autocorrelation.

3. Modeling: SARIMA(0,1,1)(0,0,1)[52] (Test: MAPE 2.801%, RMSE 3.937)

3.1 Auto SARIMA -> SARIMA(0,1,1)(0,0,1)[52] (Test: MAPE 2.801%, RMSE 3.937)

auto_sarima <- auto.arima(data_ts_train,
                         stepwise = T,
                         seasonal = T,
                         approximation = F,
                         trace = T)

 ARIMA(2,1,2)(1,0,1)[52] with drift         : Inf
 ARIMA(0,1,0)            with drift         : 9843.164
 ARIMA(1,1,0)(1,0,0)[52] with drift         : 9358.628
 ARIMA(0,1,1)(0,0,1)[52] with drift         : 8802.43
 ARIMA(0,1,0)                               : 9841.245
 ARIMA(0,1,1)            with drift         : 8804.167
 ARIMA(0,1,1)(1,0,1)[52] with drift         : 8804.385
 ARIMA(0,1,1)(0,0,2)[52] with drift         : 8804.336
 ARIMA(0,1,1)(1,0,0)[52] with drift         : 8802.502
 ARIMA(0,1,1)(1,0,2)[52] with drift         : Inf
 ARIMA(0,1,0)(0,0,1)[52] with drift         : 9843.064
 ARIMA(1,1,1)(0,0,1)[52] with drift         : 8803.603
 ARIMA(0,1,2)(0,0,1)[52] with drift         : 8803.55
 ARIMA(1,1,0)(0,0,1)[52] with drift         : 9358.453
 ARIMA(1,1,2)(0,0,1)[52] with drift         : 8805.388
 ARIMA(0,1,1)(0,0,1)[52]                    : 8811.022

 Best model: ARIMA(0,1,1)(0,0,1)[52] with drift         
summary(auto_sarima)
Series: data_ts_train 
ARIMA(0,1,1)(0,0,1)[52] with drift 

Coefficients:
          ma1    sma1   drift
      -0.8931  0.0484  0.0287
s.e.   0.0091  0.0249  0.0084

sigma^2 = 9.708:  log likelihood = -4397.2
AIC=8802.41   AICc=8802.43   BIC=8824.21

Training set error measures:
                      ME     RMSE      MAE        MPE     MAPE      MASE
Training set 0.003954669 3.112107 2.472576 -0.2841696 3.622012 0.5855928
                    ACF1
Training set -0.02028327

Cross Validation

# cross-validation # : overlay on data_ts_test
auto_sarima_forecast_cross <- forecast(auto_sarima,h = nTest)
accuracy(auto_sarima_forecast_cross$mean,data_ts_test)
                ME     RMSE      MAE       MPE     MAPE       ACF1 Theil's U
Test set -2.256842 3.936564 3.164668 -2.039848 2.801297 0.06205823 0.8905625
plot(auto_sarima_forecast_cross)
lines(data_ts_test, col = "orange", lwd = 1, lty = 2)

Normality of Standard Residuals

## normality of (standardized) residuals ####

par(mfrow=c(1,2))
hist(rstandard(auto_sarima),xlab="Standardised residuals")
qqnorm(rstandard(auto_sarima))
qqline(rstandard(auto_sarima))

shapiro.test(rstandard(auto_sarima))

    Shapiro-Wilk normality test

data:  rstandard(auto_sarima)
W = 0.99926, p-value = 0.7551
## Alternatively you can checkresiduals()
# It will give you Ljung-Box test for overall pattern (Portmanteau tests )
checkresiduals(auto_sarima)


    Ljung-Box test

data:  Residuals from ARIMA(0,1,1)(0,0,1)[52] with drift
Q* = 106.94, df = 102, p-value = 0.3494

Model df: 2.   Total lags used: 104
## ACF of residuals ####
tsdiag(auto_sarima,gof.lag=20)

  • Normal: Shapiro-Wilk p = 0.755 and Ljung-Box p = 0.349; the SARIMA residuals are approximately normal with no remaining autocorrelation.

Multivariate

Data Manipulation

# Check
str(data)
'data.frame':   1826 obs. of  7 variables:
 $ REF_DATE       : Date, format: "1990-01-07" "1990-01-14" ...
 $ Wage           : num  23.8 23.8 23.8 23.8 23.8 ...
 $ Cement         : num  60.8 54.7 57.1 65.8 66.6 ...
 $ Energy         : num  33 30.8 36.1 31.8 33.2 ...
 $ Lumber         : num  68.6 64.2 68.2 65.8 65 ...
 $ Metal          : num  57.6 50.8 51.2 52.8 50.2 ...
 $ HOUSE_PRICE_IDX: num  65.8 59.7 62 70.6 71.4 ...
head(data)
    REF_DATE   Wage Cement Energy Lumber Metal HOUSE_PRICE_IDX
1 1990-01-07 23.804  60.75  33.02  68.59 57.62           65.79
2 1990-01-14 23.804  54.70  30.82  64.25 50.81           59.68
3 1990-01-21 23.804  57.09  36.13  68.18 51.21           62.00
4 1990-01-28 23.804  65.76  31.80  65.81 52.78           70.61
5 1990-02-04 23.804  66.62  33.16  65.05 50.23           71.41
6 1990-02-11 23.804  60.96  31.66  64.77 50.98           65.70
tail(data)
       REF_DATE  Wage Cement Energy Lumber  Metal HOUSE_PRICE_IDX
1821 2024-11-24 59.34 142.48 122.29 136.90 140.38          118.43
1822 2024-12-01 59.34 140.24 122.40 135.07 136.91          116.04
1823 2024-12-08 59.34 137.74 127.87 136.31 141.43          113.28
1824 2024-12-15 59.34 135.24 125.84 136.97 142.76          110.51
1825 2024-12-22 59.34 135.36 119.91 133.25 141.89          110.37
1826 2024-12-29 59.34 139.61 122.91 134.38 135.04          114.34
names(data)
[1] "REF_DATE"        "Wage"            "Cement"          "Energy"         
[5] "Lumber"          "Metal"           "HOUSE_PRICE_IDX"
y_zoo  <- zoo(data$HOUSE_PRICE_IDX, order.by = data$REF_DATE)

X_zoo <- zoo(data[, c("Wage","Cement","Energy","Lumber","Metal")],
             order.by = data$REF_DATE)

Granger Causality Test

# 1. Extract y and X as numeric vectors

y  <- as.numeric(y_zoo)

X1 <- as.numeric(X_zoo[, "Wage"])
X2 <- as.numeric(X_zoo[, "Cement"])
X3 <- as.numeric(X_zoo[, "Energy"])
X4 <- as.numeric(X_zoo[, "Lumber"])
X5 <- as.numeric(X_zoo[, "Metal"])
# 2. Put into a simple data frame for grangertest

df_g <- data.frame(
  y     = y,
  Wage  = X1,
  Cement = X2,
  Energy = X3,
  Lumber = X4,
  Metal  = X5
)
# Function to run bidirectional Granger test

run_granger <- function(df, y_col, x_col, maxlag = 4) {
  
  cat("\n----------------------------------------------\n")
  cat("Testing:", x_col, "→", y_col, "(Does X cause Y?)\n")
  print(grangertest(as.formula(paste(y_col, "~", x_col)),
                    order = maxlag, data = df))
  
  cat("\nTesting:", y_col, "→", x_col, "(Does Y cause X?)\n")
  print(grangertest(as.formula(paste(x_col, "~", y_col)),
                    order = maxlag, data = df))
}


# 3. Run Granger test for all exogenous variables

vars <- c("Wage","Cement","Energy","Lumber","Metal")

for (v in vars) {
  run_granger(df_g, "y", v, maxlag = 4)
}

----------------------------------------------
Testing: Wage → y (Does X cause Y?)
Granger causality test

Model 1: y ~ Lags(y, 1:4) + Lags(Wage, 1:4)
Model 2: y ~ Lags(y, 1:4)
  Res.Df Df      F    Pr(>F)    
1   1813                        
2   1817 -4 12.312 7.018e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Testing: y → Wage (Does Y cause X?)
Granger causality test

Model 1: Wage ~ Lags(Wage, 1:4) + Lags(y, 1:4)
Model 2: Wage ~ Lags(Wage, 1:4)
  Res.Df Df      F   Pr(>F)   
1   1813                      
2   1817 -4 4.6198 0.001033 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

----------------------------------------------
Testing: Cement → y (Does X cause Y?)
Granger causality test

Model 1: y ~ Lags(y, 1:4) + Lags(Cement, 1:4)
Model 2: y ~ Lags(y, 1:4)
  Res.Df Df      F Pr(>F)
1   1813                 
2   1817 -4 0.8618 0.4862

Testing: y → Cement (Does Y cause X?)
Granger causality test

Model 1: Cement ~ Lags(Cement, 1:4) + Lags(y, 1:4)
Model 2: Cement ~ Lags(Cement, 1:4)
  Res.Df Df      F Pr(>F)
1   1813                 
2   1817 -4 0.9283 0.4465

----------------------------------------------
Testing: Energy → y (Does X cause Y?)
Granger causality test

Model 1: y ~ Lags(y, 1:4) + Lags(Energy, 1:4)
Model 2: y ~ Lags(y, 1:4)
  Res.Df Df      F  Pr(>F)  
1   1813                    
2   1817 -4 2.6859 0.02993 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Testing: y → Energy (Does Y cause X?)
Granger causality test

Model 1: Energy ~ Lags(Energy, 1:4) + Lags(y, 1:4)
Model 2: Energy ~ Lags(Energy, 1:4)
  Res.Df Df      F Pr(>F)
1   1813                 
2   1817 -4 1.6509 0.1589

----------------------------------------------
Testing: Lumber → y (Does X cause Y?)
Granger causality test

Model 1: y ~ Lags(y, 1:4) + Lags(Lumber, 1:4)
Model 2: y ~ Lags(y, 1:4)
  Res.Df Df     F Pr(>F)
1   1813                
2   1817 -4 1.028 0.3914

Testing: y → Lumber (Does Y cause X?)
Granger causality test

Model 1: Lumber ~ Lags(Lumber, 1:4) + Lags(y, 1:4)
Model 2: Lumber ~ Lags(Lumber, 1:4)
  Res.Df Df      F Pr(>F)
1   1813                 
2   1817 -4 1.6525 0.1585

----------------------------------------------
Testing: Metal → y (Does X cause Y?)
Granger causality test

Model 1: y ~ Lags(y, 1:4) + Lags(Metal, 1:4)
Model 2: y ~ Lags(y, 1:4)
  Res.Df Df      F Pr(>F)  
1   1813                   
2   1817 -4 3.0634 0.0158 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Testing: y → Metal (Does Y cause X?)
Granger causality test

Model 1: Metal ~ Lags(Metal, 1:4) + Lags(y, 1:4)
Model 2: Metal ~ Lags(Metal, 1:4)
  Res.Df Df      F   Pr(>F)   
1   1813                      
2   1817 -4 4.2992 0.001828 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • Wage and y: Bidirectional Granger causality (Wage ↔︎ y).

  • Cement and y: No Granger causality in either direction.

  • Energy and y: Unidirectional causality (Energy → y).

  • Lumber and y: No Granger causality in either direction.

  • Metal and y: Bidirectional Granger causality (Metal ↔︎ y).
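These pairwise tests can also be framed jointly in a VAR system, where vars::causality tests one variable against all the others at once (a sketch assuming lag order 4 as above; not run here):

# var_fit <- VAR(df_g, p = 4, type = "const")
# causality(var_fit, cause = "Energy")   # joint Granger test: Energy -> rest of the system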

# Candidate exogenous variables
selected_vars <- c("Wage", "Energy", "Metal")
exog_vars <- c("Wage", "Energy", "Metal")

Split Train and Test

## Train / Test Split
nTotal     <- length(y_zoo)
# nTest must already be defined above
nTrain     <- nTotal - nTest

time_points <- time(y_zoo)

end_train  <- time_points[nTrain]
start_test <- time_points[nTrain + 1]


## Y split

y_train <- window(y_zoo, end = end_train)
y_test  <- window(y_zoo, start = start_test)


## X split (same index)

X_train <- window(X_zoo, end = end_train)
X_test  <- window(X_zoo, start = start_test)
## Convert zoo → ts

to_ts <- function(z) {
  ts(coredata(z),
     start = c(year(index(z)[1]), week(index(z)[1])),
     frequency = H)
}

y_train_ts    <- to_ts(y_train)
y_test_ts     <- to_ts(y_test)

X_train_sel    <- X_train[, selected_vars]
X_test_sel     <- X_test[, selected_vars]

X_train_ts <- ts(coredata(X_train_sel),
                 start = c(year(index(X_train_sel)[1]),
                           week(index(X_train_sel)[1])),
                 frequency = H)

X_test_ts <- ts(coredata(X_test_sel),
                start = c(year(index(X_test_sel)[1]),
                          week(index(X_test_sel)[1])),
                frequency = H)
## Build List for use

data_tsx_house <- list(
  y_train = y_train_ts,
  y_test = y_test_ts,
  
  X_train = X_train_ts,
  X_test = X_test_ts
)
## Check

head(data_tsx_house$y_train)
Time Series:
Start = c(1990, 1) 
End = c(1990, 6) 
Frequency = 52 
[1] 65.79 59.68 62.00 70.61 71.41 65.70
tail(data_tsx_house$y_train)
Time Series:
Start = c(2023, 1) 
End = c(2023, 6) 
Frequency = 52 
[1] 111.33 114.80 114.31 117.37 110.81 116.99
head(data_tsx_house$y_test)
Time Series:
Start = c(2023, 2) 
End = c(2023, 7) 
Frequency = 52 
[1] 121.65 111.47 116.12 113.54 116.79 115.55
tail(data_tsx_house$y_test)
Time Series:
Start = c(2024, 48) 
End = c(2025, 1) 
Frequency = 52 
[1] 118.43 116.04 113.28 110.51 110.37 114.34
head(data_tsx_house$X_train)
Time Series:
Start = c(1990, 1) 
End = c(1990, 6) 
Frequency = 52 
           Wage Energy Metal
1990.000 23.804  33.02 57.62
1990.019 23.804  30.82 50.81
1990.038 23.804  36.13 51.21
1990.058 23.804  31.80 52.78
1990.077 23.804  33.16 50.23
1990.096 23.804  31.66 50.98
tail(data_tsx_house$X_train)
Time Series:
Start = c(2023, 1) 
End = c(2023, 6) 
Frequency = 52 
           Wage Energy  Metal
2023.000 55.236 158.30 140.17
2023.019 55.246 161.96 138.63
2023.038 55.255 158.19 147.85
2023.058 55.265 157.00 147.50
2023.077 55.274 148.32 135.82
2023.096 55.284 153.69 141.93
head(data_tsx_house$X_test)
Time Series:
Start = c(2023, 2) 
End = c(2023, 7) 
Frequency = 52 
           Wage Energy  Metal
2023.019 55.335 151.07 148.22
2023.038 55.386 149.40 141.72
2023.058 55.437 151.39 141.61
2023.077 55.488 153.55 142.15
2023.096 55.539 152.04 139.57
2023.115 55.590 147.73 137.74
tail(data_tsx_house$X_test)
Time Series:
Start = c(2024, 48) 
End = c(2025, 1) 
Frequency = 52 
          Wage Energy  Metal
2024.904 59.34 122.29 140.38
2024.923 59.34 122.40 136.91
2024.942 59.34 127.87 141.43
2024.962 59.34 125.84 142.76
2024.981 59.34 119.91 141.89
2025.000 59.34 122.91 135.04

4. Modeling: ARIMAX(2,1,2) + Energy (Test: MAPE 2.155%, RMSE 2.990)

# Generate all non-empty combinations ----
all_combos <- unlist(
  lapply(1:length(exog_vars), function(k) combn(exog_vars, k, simplify = FALSE)),
  recursive = FALSE
)
# Prepare results storage with AIC
df_results <- data.frame(
  exog = character(),
  RMSE = numeric(),
  MAE  = numeric(),
  MAPE = numeric(),
  MASE = numeric(),
  AIC  = numeric(),
  stringsAsFactors = FALSE
)
# Loop through each combination
for(vars in all_combos) {
  
  cat("Fitting ARIMAX with:", paste(vars, collapse = "+"), "\n")
  
  # Prepare xreg for train and test 
  X_train_combo <- data_tsx_house$X_train[, vars, drop = FALSE]
  X_test_combo  <- data_tsx_house$X_test[, vars, drop = FALSE]
  
  # Fit ARIMAX
  fit <- auto.arima(
    data_tsx_house$y_train,
    xreg = X_train_combo,
    stepwise = TRUE,
    seasonal = FALSE,
    approximation = FALSE,
    trace = FALSE
  )
  
  # Forecast on test set 
  n_test <- length(data_tsx_house$y_test)
  fc <- forecast(fit, h = n_test, xreg = X_test_combo)
  
  # Compute accuracy safely
  acc <- accuracy(fc, data_tsx_house$y_test)
  row_index <- if(nrow(acc) >= 2) 2 else 1
  
  RMSE <- if("RMSE" %in% colnames(acc)) acc[row_index, "RMSE"] else NA
  MAE  <- if("MAE"  %in% colnames(acc)) acc[row_index, "MAE"] else NA
  MAPE <- if("MAPE" %in% colnames(acc)) acc[row_index, "MAPE"] else NA
  MASE <- if("MASE" %in% colnames(acc)) acc[row_index, "MASE"] else NA
  AIC  <- fit$aic  # extract AIC from fitted model
  
  # Append results
  df_results <- rbind(df_results, data.frame(
    exog = paste(vars, collapse = "+"),
    RMSE = RMSE,
    MAE  = MAE,
    MAPE = MAPE,
    MASE = MASE,
    AIC  = AIC,
    stringsAsFactors = FALSE
  ))
}
Fitting ARIMAX with: Wage 
Fitting ARIMAX with: Energy 
Fitting ARIMAX with: Metal 
Fitting ARIMAX with: Wage+Energy 
Fitting ARIMAX with: Wage+Metal 
Fitting ARIMAX with: Energy+Metal 
Fitting ARIMAX with: Wage+Energy+Metal 
# Sort results by RMSE or AIC as needed
df_results_sorted <- df_results[order(df_results$RMSE), ]
print(df_results_sorted)
               exog     RMSE      MAE     MAPE      MASE      AIC
2            Energy 2.989981 2.452650 2.154538 0.5808737 8768.668
6      Energy+Metal 3.014858 2.468917 2.170321 0.5847262 8770.302
7 Wage+Energy+Metal 3.223832 2.589287 2.284200 0.6132340 8772.073
4       Wage+Energy 3.269406 2.624050 2.315441 0.6214670 8772.286
3             Metal 3.275600 2.629261 2.320892 0.6227011 8773.873
1              Wage 3.445523 2.766832 2.445199 0.6552829 8772.696
5        Wage+Metal 3.456720 2.771816 2.450503 0.6564632 8773.152

4.1 Try Min RMSE: ARIMAX(2,1,2) + Energy (Test: MAPE 2.155%, RMSE 2.990)

# Select best combination
best_row <- df_results_sorted[1, ]
cat("Best exogenous variables (by RMSE):", best_row$exog, "\n")
Best exogenous variables (by RMSE): Energy 
best_vars <- unlist(strsplit(as.character(best_row$exog), "\\+"))

X_train_best <- data_tsx_house$X_train[, best_vars, drop = FALSE]
X_test_best  <- data_tsx_house$X_test[, best_vars, drop = FALSE]
# Fit ARIMAX with best exogenous variables
auto_arimax_best <- auto.arima(
  data_tsx_house$y_train,
  xreg = X_train_best,
  stepwise = TRUE,
  seasonal = FALSE,
  approximation = FALSE
)
summary(auto_arimax_best)
Series: data_tsx_house$y_train 
Regression with ARIMA(2,1,2) errors 

Coefficients:
         ar1      ar2      ma1     ma2  Energy
      1.0009  -0.0143  -1.9391  0.9430  0.0227
s.e.  0.0256   0.0258   0.0085  0.0085  0.0109

sigma^2 = 9.502:  log likelihood = -4378.33
AIC=8768.67   AICc=8768.72   BIC=8801.37

Training set error measures:
                    ME     RMSE      MAE         MPE     MAPE      MASE
Training set 0.1063593 3.077187 2.446476 -0.06290311 3.575574 0.5794114
                     ACF1
Training set 4.436162e-05

Cross Validation

# Forecast on test
forecast_test  <- forecast(
  auto_arimax_best,
  h = length(data_tsx_house$y_test),
  xreg = X_test_best)

accuracy(forecast_test$mean, data_tsx_house$y_test)
                ME     RMSE     MAE        MPE     MAPE       ACF1 Theil's U
Test set -0.635219 2.989981 2.45265 -0.6198784 2.154538 -0.1516986 0.6752129
plot(forecast_test)
lines(data_ts_test, col = "red")

# Standardized residuals histogram and QQ plot
par(mfrow=c(1,2))
hist(rstandard(auto_arimax_best), xlab="Standardized residuals", main="Residuals Histogram")
qqnorm(rstandard(auto_arimax_best))
qqline(rstandard(auto_arimax_best))

# Shapiro-Wilk test for normality
shapiro.test(rstandard(auto_arimax_best))

    Shapiro-Wilk normality test

data:  rstandard(auto_arimax_best)
W = 0.99948, p-value = 0.9426
# Alternatively, use checkresiduals() for full diagnostic
checkresiduals(auto_arimax_best)


    Ljung-Box test

data:  Residuals from Regression with ARIMA(2,1,2) errors
Q* = 84.34, df = 100, p-value = 0.8693

Model df: 4.   Total lags used: 104
## ACF of residuals ####
tsdiag(auto_arimax_best,gof.lag=20)

  • Normal: Shapiro-Wilk p = 0.943 and Ljung-Box p = 0.869; the ARIMAX(2,1,2) + Energy residuals are approximately normal white noise.

4.2 Try 2nd Min RMSE: ARIMAX(2,1,2) + Energy + Metal (Test: MAPE 2.170%, RMSE 3.015)

# Select best combination
best_row_mape <- df_results_sorted[2, ]
cat("Best exogenous variables (by MAPE):", best_row_mape$exog, "\n")
Best exogenous variables (by MAPE): Energy+Metal 
best_vars_mape <- unlist(strsplit(as.character(best_row_mape$exog), "\\+"))

X_train_best_mape <- data_tsx_house$X_train[, best_vars_mape, drop = FALSE]
X_test_best_mape  <- data_tsx_house$X_test[, best_vars_mape, drop = FALSE]
# Fit ARIMAX with best exogenous variables
auto_arimax_best_mape <- auto.arima(
  data_tsx_house$y_train,
  xreg = X_train_best_mape,
  stepwise = TRUE,
  seasonal = FALSE,
  approximation = FALSE
)
summary(auto_arimax_best_mape)
Series: data_tsx_house$y_train 
Regression with ARIMA(2,1,2) errors 

Coefficients:
         ar1      ar2      ma1     ma2  Energy    Metal
      1.0006  -0.0135  -1.9380  0.9418  0.0233  -0.0125
s.e.  0.0256   0.0258   0.0087  0.0088  0.0111   0.0228

sigma^2 = 9.506:  log likelihood = -4378.15
AIC=8770.3   AICc=8770.37   BIC=8808.46

Training set error measures:
                    ME     RMSE      MAE         MPE     MAPE      MASE
Training set 0.1068916 3.076881 2.446152 -0.06178732 3.575047 0.5793347
                      ACF1
Training set -0.0003758016

Cross Validation

# Forecast on test
forecast_test  <- forecast(
  auto_arimax_best_mape,
  h = length(data_tsx_house$y_test),
  xreg = X_test_best_mape)

accuracy(forecast_test$mean, data_tsx_house$y_test)
                 ME     RMSE      MAE        MPE     MAPE       ACF1 Theil's U
Test set -0.7220578 3.014858 2.468917 -0.6958134 2.170321 -0.1487365 0.6807149
plot(forecast_test)

Normality of Standard Residuals

# Standardized residuals histogram and QQ plot
par(mfrow=c(1,2))
hist(rstandard(auto_arimax_best_mape), xlab="Standardized residuals", main="Residuals Histogram")
qqnorm(rstandard(auto_arimax_best_mape))
qqline(rstandard(auto_arimax_best_mape))

# Shapiro-Wilk test for normality
shapiro.test(rstandard(auto_arimax_best_mape))

    Shapiro-Wilk normality test

data:  rstandard(auto_arimax_best_mape)
W = 0.99947, p-value = 0.937
# Alternatively, use checkresiduals() for full diagnostic
checkresiduals(auto_arimax_best_mape)


    Ljung-Box test

data:  Residuals from Regression with ARIMA(2,1,2) errors
Q* = 84.515, df = 100, p-value = 0.8664

Model df: 4.   Total lags used: 104
tsdiag(auto_arimax_best_mape,gof.lag=20)

  • Normal: Shapiro-Wilk p = 0.937 and Ljung-Box p = 0.866; the ARIMAX(2,1,2) + Energy + Metal residuals are approximately normal white noise.

5. Modeling: SARIMAX (no need to present)

# Loop through each combination
for(vars in all_combos) {
  
  cat("Fitting SARIMAX with:", paste(vars, collapse = "+"), "\n")
  
  # Prepare xreg for train and test 
  X_train_combo <- data_tsx_house$X_train[, vars, drop = FALSE]
  X_test_combo  <- data_tsx_house$X_test[, vars, drop = FALSE]
  
  # Fit SARIMAX
  fit <- auto.arima(
    data_tsx_house$y_train,
    xreg = X_train_combo,
    stepwise = TRUE,
    seasonal = TRUE,
    approximation = FALSE,
    trace = FALSE
  )
  
  # Forecast on test set 
  n_test <- length(data_tsx_house$y_test)
  fc <- forecast(fit, h = n_test, xreg = X_test_combo)
  
  # Compute accuracy safely
  acc <- accuracy(fc, data_tsx_house$y_test)
  row_index <- if(nrow(acc) >= 2) 2 else 1
  
  RMSE <- if("RMSE" %in% colnames(acc)) acc[row_index, "RMSE"] else NA
  MAE  <- if("MAE"  %in% colnames(acc)) acc[row_index, "MAE"] else NA
  MAPE <- if("MAPE" %in% colnames(acc)) acc[row_index, "MAPE"] else NA
  MASE <- if("MASE" %in% colnames(acc)) acc[row_index, "MASE"] else NA
  AIC  <- fit$aic  # extract AIC from fitted model
  
  # Append results
  df_results <- rbind(df_results, data.frame(
    exog = paste(vars, collapse = "+"),
    RMSE = RMSE,
    MAE  = MAE,
    MAPE = MAPE,
    MASE = MASE,
    AIC  = AIC,
    stringsAsFactors = FALSE
  ))
}
Fitting SARIMAX with: Wage 
Fitting SARIMAX with: Energy 
Fitting SARIMAX with: Metal 
Fitting SARIMAX with: Wage+Energy 
Fitting SARIMAX with: Wage+Metal 
Fitting SARIMAX with: Energy+Metal 
Fitting SARIMAX with: Wage+Energy+Metal 
# Sort results by RMSE or AIC as needed
df_results_sorted <- df_results[order(df_results$RMSE), ]
print(df_results_sorted)
                exog     RMSE      MAE     MAPE      MASE      AIC
2             Energy 2.989981 2.452650 2.154538 0.5808737 8768.668
6       Energy+Metal 3.014858 2.468917 2.170321 0.5847262 8770.302
7  Wage+Energy+Metal 3.223832 2.589287 2.284200 0.6132340 8772.073
4        Wage+Energy 3.269406 2.624050 2.315441 0.6214670 8772.286
3              Metal 3.275600 2.629261 2.320892 0.6227011 8773.873
11       Wage+Energy 3.397245 2.728657 2.408763 0.6462418 8802.970
14 Wage+Energy+Metal 3.416927 2.742429 2.421335 0.6495035 8804.861
1               Wage 3.445523 2.766832 2.445199 0.6552829 8772.696
5         Wage+Metal 3.456720 2.771816 2.450503 0.6564632 8773.152
9             Energy 3.464203 2.779498 2.454913 0.6582827 8800.988
13      Energy+Metal 3.479259 2.790425 2.464835 0.6608705 8802.873
10             Metal 3.948836 3.175172 2.810661 0.7519922 8804.388
8               Wage 4.207054 3.395346 3.006497 0.8041371 8804.226
12        Wage+Metal 4.236201 3.422214 3.030304 0.8105003 8806.193

5.1 Try Min RMSE: SARIMAX(0,1,1)(0,0,1)[52] + Energy (Test: MAPE 2.455%, RMSE 3.464)

# Select best combination
best_row <- df_results_sorted[1, ]
cat("Best exogenous variables (by RMSE):", best_row$exog, "\n")
Best exogenous variables (by RMSE): Energy 
best_vars <- unlist(strsplit(as.character(best_row$exog), "\\+"))
X_train_best <- data_tsx_house$X_train[, best_vars, drop = FALSE]
X_test_best  <- data_tsx_house$X_test[, best_vars, drop = FALSE]
# Fit SARIMAX with best exogenous variables
auto_sarimax_best <- auto.arima(
  data_tsx_house$y_train,
  xreg = X_train_best,
  stepwise = TRUE,
  seasonal = TRUE,
  approximation = FALSE
)
summary(auto_sarimax_best)
Series: data_tsx_house$y_train 
Regression with ARIMA(0,1,1)(0,0,1)[52] errors 

Coefficients:
          ma1    sma1   drift  Energy
      -0.8975  0.0492  0.0268  0.0264
s.e.   0.0091  0.0249  0.0082  0.0140

sigma^2 = 9.694:  log likelihood = -4395.49
AIC=8800.99   AICc=8801.02   BIC=8828.24

Training set error measures:
                     ME     RMSE      MAE        MPE     MAPE      MASE
Training set 0.00379997 3.108977 2.467509 -0.2841712 3.618373 0.5843927
                    ACF1
Training set -0.01851526

Cross Validation

# Forecast on test
forecast_test  <- forecast(
  auto_sarimax_best,
  h = length(data_tsx_house$y_test),
  xreg = X_test_best)

accuracy(forecast_test$mean, data_tsx_house$y_test)
                ME     RMSE      MAE       MPE     MAPE        ACF1 Theil's U
Test set -1.520984 3.464203 2.779498 -1.396449 2.454913 -0.02002627 0.7830069
plot(forecast_test)

Normality of Standard Residuals

# Standardized residuals histogram and QQ plot
par(mfrow=c(1,2))
hist(rstandard(auto_sarimax_best), xlab="Standardized residuals", main="Residuals Histogram")
qqnorm(rstandard(auto_sarimax_best))
qqline(rstandard(auto_sarimax_best))

# Shapiro-Wilk test for normality
shapiro.test(rstandard(auto_sarimax_best))

    Shapiro-Wilk normality test

data:  rstandard(auto_sarimax_best)
W = 0.99926, p-value = 0.7491
# Alternatively, use checkresiduals() for full diagnostic
checkresiduals(auto_sarimax_best)


    Ljung-Box test

data:  Residuals from Regression with ARIMA(0,1,1)(0,0,1)[52] errors
Q* = 106.77, df = 102, p-value = 0.3538

Model df: 2.   Total lags used: 104
tsdiag(auto_sarimax_best,gof.lag=20)

  • Normal: Shapiro-Wilk p = 0.749 and Ljung-Box p = 0.354; the SARIMAX residuals are approximately normal with no remaining autocorrelation.

Select Best Model (Min MAPE): ARIMA(0,1,2) (Test: MAPE 2.131%, RMSE 2.986)

# Best model on Train
summary(arima_012)
Series: data_ts_train 
ARIMA(0,1,2) 

Coefficients:
          ma1     ma2
      -0.9074  0.0258
s.e.   0.0248  0.0240

sigma^2 = 9.783:  log likelihood = -4404.25
AIC=8814.49   AICc=8814.5   BIC=8830.84

Training set error measures:
                    ME     RMSE      MAE        MPE     MAPE      MASE
Training set 0.2446421 3.125041 2.481181 0.07155408 3.616221 0.5876308
                     ACF1
Training set -0.005206844
# forecast through test (nTest) + forecast 2025 (52 weeks)
h <- nTest + 52

arima_012_forecast_full <- forecast(arima_012, h = h)
ts_time <- time(arima_012_forecast_full$mean)

# find which indices are still in 2025
idx_2025 <- which(ts_time < 2026 & ts_time >= 2025)

# final h for 2025
h_2025 <- max(idx_2025)

# final forecast that ends exactly in 2025
fc_2025 <- forecast(arima_012, h = h_2025)
start(data_ts_train)
[1] 1990    1
end(data_ts_train)
[1] 2023    6
start(data_ts_test)
[1] 2023    2
end(data_ts_test)
[1] 2025    1
# Check
tail(fc_2025$mean)
Time Series:
Start = c(2025, 47) 
End = c(2025, 52) 
Frequency = 52 
[1] 115.1388 115.1388 115.1388 115.1388 115.1388 115.1388
# ---- 1. Plot baseline forecast ----
plot(fc_2025,
     main = "Forecast for 2025",
     ylab = "House Price Index",
     xlab = "Year")

# add test data
lines(data_ts_test, col = "orange", lty = 2)

# add legend
legend(
  "topleft",
  legend = c("Train Data", "Test Data"),
  col    = c("black", "orange"),
  lty    = c(1, 2),
  lwd    = c(2, 2),
  bty    = "n"
)

# ---- 2. Helper positions ----
usr <- par("usr")     # plot boundaries: (x1, x2, y1, y2)

# positions for annotation (5% below top)
y_top <- usr[4] - 0.05 * (usr[4] - usr[3])

# ---- VERTICAL LINES ----

# (1) end of training set
vline_train <- time(data_ts_train)[length(data_ts_train)]
abline(v = vline_train, col = "orange", lwd = 2, lty = 2)

# (2) end of actual test data
vline_test <- time(data_ts_test)[length(data_ts_test)]
abline(v = vline_test, col = "darkgreen", lwd = 2, lty = 2)

# ---- X positions for zone labels ----
x_center_train   <- mean(time(data_ts_train))
x_center_test    <- mean(time(data_ts_test))
x_center_forecast <- mean(c(vline_test, max(time(fc_2025$mean))))

# ---- 3. Add annotation labels ----

text(x = x_center_train, y = y_top,
     labels = "Train",
     col = "black", cex = 0.8, font = 2)

text(x = x_center_test, y = y_top,
     labels = "Test",
     col = "orange", cex = 0.8, font = 2)

# allow drawing outside plot boundary
par(xpd = NA)


# small shift to the right of vline_test (2% of x-range)
x_shift <- 0.02 * (usr[2] - usr[1])
x_forecast_label <- vline_test + x_shift
# 
# # y position OUTSIDE the plot (slightly above top border)
# y_outside <- usr[4] + 0.05 * (usr[4] - usr[3])
# 
# annotation outside plot area, left-justified
text(x = x_forecast_label,
     y = y_top,
     labels = "Forecast",
     col = "darkgreen",
     cex = 0.8,
     font = 2,
     adj = 0)

# Check
tail(data_ts_test, 1)
Time Series:
Start = c(2025, 1) 
End = c(2025, 1) 
Frequency = 52 
[1] 114.34
# House Price Prediction
tail(fc_2025$mean, 1)
Time Series:
Start = c(2025, 52) 
End = c(2025, 52) 
Frequency = 52 
[1] 115.1388
  • Since the model is ARIMA(0,1,2), long-range forecasts are stable and flat, so the predicted value stays close to the most recent 2024 levels. This forecast reflects the baseline expected movement without assuming any sudden shocks or unusual market changes. The value 115.14 should be interpreted relative to the index base (e.g., if the index is normalized to 100 in a base year, then 115.14 means roughly 15% above that base level).
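This flattening can be checked directly: for ARIMA(0,1,2) without drift, the MA terms drop out of the forecast after two steps, so the point forecast is constant from the third step onward:

# head(fc_2025$mean, 5)         # first two steps adjust, then the forecast stays flat
# diff(head(fc_2025$mean, 5))   # differences fall to zero after the second step
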
# Compare to 2024
last_actual_2024  <- as.numeric(tail(data_ts_test, 1))
last_forecast_2025 <- as.numeric(tail(fc_2025$mean, 1))

# --- Calculate differences ---
abs_change <- last_forecast_2025 - last_actual_2024
pct_change <- (abs_change / last_actual_2024) * 100

# --- Print summary ---
cat("Latest Actual (2024):   ", round(last_actual_2024, 4), "\n")
Latest Actual (2024):    114.34 
cat("Latest Forecast (2025): ", round(last_forecast_2025, 4), "\n\n")
Latest Forecast (2025):  115.1388 
cat("Absolute Change:        ", round(abs_change, 4), "\n")
Absolute Change:         0.7988 
cat("Percentage Change:      ", round(pct_change, 4), "%\n")
Percentage Change:       0.6986 %
  • The House Price Index is expected to increase slightly, from 114.34 at the end of 2024 to 115.14 at the end of 2025: a gain of about 0.8 index points, or roughly +0.7% over the year.
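The point forecast is only part of the story; the same forecast object also carries prediction intervals that quantify the uncertainty around this +0.7% figure (80% and 95% bounds produced by forecast()):

# tail(fc_2025$lower, 1)   # lower 80% / 95% bounds for the last week of 2025
# tail(fc_2025$upper, 1)   # upper 80% / 95% bounds for the last week of 2025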