Time Series Forecasting and Analysis (Units Completed)

Author

Jimmy

Data Source

Has this data set been seasonally adjusted?

I have no clue

Let’s get some packages loaded first!

library(forecast)
library(bsts)
library(ggplot2)
library(dplyr)
library(ggthemes)
library(tseries)
library(pracma)
library(seasonal)
library(plotly)

Data overview:

df0%>%
  head()

  REF_DATE    GEO          DGUID Housing.estimates Type.of.unit   UOM UOM_ID
1  2014-09 Canada 2016A000011124    Housing starts  Total units Units    300
2  2014-10 Canada 2016A000011124    Housing starts  Total units Units    300
3  2014-11 Canada 2016A000011124    Housing starts  Total units Units    300
4  2014-12 Canada 2016A000011124    Housing starts  Total units Units    300
5  2015-01 Canada 2016A000011124    Housing starts  Total units Units    300
6  2015-02 Canada 2016A000011124    Housing starts  Total units Units    300
  SCALAR_FACTOR SCALAR_ID  VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED
1         units         0 v729949      1.1.1 15394     NA     NA         NA
2         units         0 v729949      1.1.1 14294     NA     NA         NA
3         units         0 v729949      1.1.1 15812     NA     NA         NA
4         units         0 v729949      1.1.1 13264     NA     NA         NA
5         units         0 v729949      1.1.1 11529     NA     NA         NA
6         units         0 v729949      1.1.1  9026     NA     NA         NA
  DECIMALS
1        0
2        0
3        0
4        0
5        0
6        0

df0$Housing.estimates<-factor(df0$Housing.estimates)
levels(df0$Housing.estimates)

[1] "Housing completions" "Housing starts"

df0$Type.of.unit<-factor(df0$Type.of.unit)
levels(df0$Type.of.unit)

[1] "Apartment and other unit types" "Multiples"                     
[3] "Row units"                      "Semi-detached units"           
[5] "Single-detached units"          "Total units"

df1<-df0%>%
  select(REF_DATE,Housing.estimates,VALUE,Type.of.unit)%>%
  filter(Housing.estimates == "Housing completions",
         Type.of.unit == "Total units")

df1$REF_DATE<-as.Date(paste0(as.character(df1$REF_DATE), "-01"), format="%Y-%m-%d")
df1%>%
  str()

'data.frame':   100 obs. of  4 variables:
 $ REF_DATE         : Date, format: "2014-09-01" "2014-10-01" ...
 $ Housing.estimates: Factor w/ 2 levels "Housing completions",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ VALUE            : int  12714 15260 13754 16189 20828 15017 14262 11668 15103 15442 ...
 $ Type.of.unit     : Factor w/ 6 levels "Apartment and other unit types",..: 6 6 6 6 6 6 6 6 6 6 ...

Visualize the data:

df1_ts<-ts(df1$VALUE,start=c(2014,9),frequency=12)
autoplot(df1_ts,col="blue")+geom_point(col="black")+
  labs(x="Year",y="Units Completed")+ggtitle("Units Completed in Canada from 2014-09 to 2022-12")+
  theme_stata(scheme = "s1color")+scale_color_stata()

Let’s check check if we could build an ARIMA model

ts_train0<-window(df1_ts,start=c(2014,09),end=c(2021,11))
ts_test0<-window(df1_ts,start=c(2021,12),end=c(2022,12))

adf.test(df1_ts)


    Augmented Dickey-Fuller Test

data:  df1_ts
Dickey-Fuller = -5.1761, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary

ggAcf(df1_ts)

ggPacf(df1_ts)

ggAcf(diff(df1_ts,1))

ggPacf(diff(df1_ts,1))

Is there a long memory problem?

hurstexp(df1_ts)

Simple R/S Hurst estimation:         0.6618731 
Corrected R over S Hurst exponent:   0.8365118 
Empirical Hurst exponent:            0.6037168 
Corrected empirical Hurst exponent:  0.629525 
Theoretical Hurst exponent:          0.5229461

This series appears to have a weak-moderate long memory problem, while it was shown that the series was stationary.

#auto.arima(df1_ts,stepwise = FALSE, trace=TRUE)
#### Auto Arima suggested ARIMA(1,1,1)(1,0,1)[12] with AIC= 1769
### Let's see if we could find a better combination

First model:

mod_arima0<-arima(ts_train0,order=c(1,1,1),
                  seasonal=c(1,0,1))
mod_arima1<-forecast(mod_arima0,h=length(ts_test0))
accuracy(mod_arima1,ts_test0)

                   ME     RMSE      MAE       MPE     MAPE      MASE
Training set 192.1581 1663.682 1341.038 0.1417257 8.874782 0.8768854
Test set     382.5349 1739.293 1470.146 1.3058491 8.465155 0.9613070
                    ACF1 Theil's U
Training set -0.02203416        NA
Test set     -0.42906312 0.5238728

checkresiduals(mod_arima0)#### no auto-correlation in the residuals


    Ljung-Box test

data:  Residuals from ARIMA(1,1,1)(1,0,1)[12]
Q* = 8.5733, df = 13, p-value = 0.8044

Model df: 4.   Total lags used: 17

summary(mod_arima0)


Call:
arima(x = ts_train0, order = c(1, 1, 1), seasonal = c(1, 0, 1))

Coefficients:
         ar1      ma1    sar1     sma1
      0.3250  -0.9675  0.8183  -0.4517
s.e.  0.1124   0.0351  0.1300   0.2229

sigma^2 estimated as 2800019:  log likelihood = -763.46,  aic = 1536.92

Training set error measures:
                   ME     RMSE      MAE       MPE     MAPE      MASE
Training set 192.1581 1663.682 1341.038 0.1417257 8.874782 0.7264609
                    ACF1
Training set -0.02203416

#AIC = 1536.92

Based on the AICs and accuracy metrics, we would conclude that the first and third models perform marginally better than the second model, particularly when using the observations from the last 12 months as the validation set.

autoplot(df1_ts,series="Historical Data")+
  autolayer(mod_arima1,PI=FALSE,series = "SARIMA (1,1,1)(1,0,1)[12]")+
  labs(x="Year",y="Units Completed")+
  ggtitle("Housing Units Completed in Canada from 2014-09 to 2022-12")+
  theme_stata(scheme = "s1color")+scale_color_stata()

Final Model: We pick \(SARIMA[1,1,1][1,0,1][12]\) as our final model

mod_arima6<-arima(df1_ts,order=c(1,1,1),
                  seasonal=c(1,0,1))
mod_arima7<-forecast(mod_arima6,h=24)
checkresiduals(mod_arima6)####okay


    Ljung-Box test

data:  Residuals from ARIMA(1,1,1)(1,0,1)[12]
Q* = 22.257, df = 16, p-value = 0.1351

Model df: 4.   Total lags used: 20

mod_arima7

         Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
Jan 2023       15240.13 13078.82 17401.44 11934.69 18545.57
Feb 2023       14802.61 12568.70 17036.53 11386.13 18219.10
Mar 2023       16215.27 13970.91 18459.63 12782.81 19647.73
Apr 2023       15738.31 13489.58 17987.04 12299.17 19177.44
May 2023       17226.76 14974.64 19478.87 13782.44 20671.07
Jun 2023       17666.36 15411.06 19921.66 14217.18 21115.54
Jul 2023       19192.82 16934.39 21451.26 15738.85 22646.80
Aug 2023       17151.74 14890.18 19413.30 13692.98 20610.50
Sep 2023       17074.99 14810.29 19339.69 13611.43 20538.54
Oct 2023       16391.84 14124.03 18659.65 12923.53 19860.16
Nov 2023       15807.42 13536.51 18078.34 12334.36 19280.49
Dec 2023       18694.00 16419.98 20968.02 15216.19 22171.81
Jan 2024       15454.23 13026.79 17881.67 11741.78 19166.68
Feb 2024       15205.66 12758.53 17652.79 11463.10 18948.22
Mar 2024       16469.74 14015.10 18924.38 12715.69 20223.78
Apr 2024       16059.19 13598.94 18519.44 12296.56 19821.82
May 2024       17361.68 14896.21 19827.16 13591.06 21132.30
Jun 2024       17746.26 15275.65 20216.87 13967.78 21524.73
Jul 2024       19080.65 16604.94 21556.37 15294.37 22866.94
Aug 2024       17296.49 14815.67 19777.31 13502.41 21090.58
Sep 2024       17229.40 14743.48 19715.33 13427.51 21031.30
Oct 2024       16632.25 14141.25 19123.24 12822.59 20441.90
Nov 2024       16121.39 13625.33 18617.45 12304.00 19938.78
Dec 2024       18644.64 16143.53 21145.75 14819.52 22469.75

Visualize the points forecasted:

autoplot(df1_ts,series="Historical Data")+
  autolayer(mod_arima7,series = "Forecasting Model Picked")+
  labs(x="Year",y="Units Completed")+
  ggtitle("Forecasted Units Completed in Canada")+
  theme_stata(scheme = "s1color")+scale_color_stata()

Notes:

It would be better to compare the data published by Stats Can or CMHC, if you were to compare housing data from another source with my estimates, it might not be the best way to do so.

Seasonal Adjusted and Trend

x11_mod<-seas(df1_ts,x11="")
autoplot(df1_ts, series="Historical Data")+
  autolayer(trendcycle(x11_mod), series="Trend")+
  autolayer(seasadj(x11_mod), series="Seasonally Adjusted Units Completed")+
  xlab("Year")+ylab("Units Completed in Canada")+
  ggtitle("Seasonal Adjusted and Trend of Housing Units Completed in Canada From 2014-09 to 2022-12")+
  theme_stata(scheme = "s1color")

summary(x11_mod)


Call:
seas(x = df1_ts, x11 = "")

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
Constant          0.021288   0.005246   4.058 4.95e-05 ***
AO2015.Jan        0.444326   0.096963   4.582 4.60e-06 ***
AR-Nonseasonal-01 0.398737   0.280901   1.419    0.156    
MA-Nonseasonal-01 0.267173   0.286664   0.932    0.351    
MA-Seasonal-12    0.772220   0.085159   9.068  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

X11 adj.  ARIMA: (1 0 1)(0 1 1)  Obs.: 100  Transform: log
AICc:  1557, BIC:  1571  QS (no seasonality in final):    0  
Box-Ljung (no autocorr.): 32.51   Shapiro (normality): 0.9873

autoplot(x11_mod)

seasonal_plot1<-ggseasonplot(df1_ts)+
  ylab("Volume")+ggtitle("Seasonal Plot for Units Completed")+
  theme_stata(scheme = "s1color")
ggplotly(seasonal_plot1)

Units Started

df10<-df0%>%
  select(REF_DATE,Housing.estimates,VALUE,Type.of.unit)%>%
  filter(Housing.estimates == "Housing starts",
         Type.of.unit == "Total units")

df10$REF_DATE<-as.Date(paste0(as.character(df10$REF_DATE), "-01"), format="%Y-%m-%d")
df10%>%
  str()

'data.frame':   113 obs. of  4 variables:
 $ REF_DATE         : Date, format: "2014-09-01" "2014-10-01" ...
 $ Housing.estimates: Factor w/ 2 levels "Housing completions",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ VALUE            : int  15394 14294 15812 13264 11529 9026 13043 14181 17129 17084 ...
 $ Type.of.unit     : Factor w/ 6 levels "Apartment and other unit types",..: 6 6 6 6 6 6 6 6 6 6 ...

df2_ts<-ts(df10$VALUE,start=c(2014,9),frequency=12)
autoplot(df2_ts,col="navy")+geom_point(col="red")+
  labs(x="Year",y="Units Starts")+ggtitle("Housing Units Starts in Canada from 2014-09 to 2024-01")+
  theme_stata(scheme = "s1color")+scale_color_stata()

ts_train1<-window(df2_ts,start=c(2014,09),end=c(2021,12))
ts_test1<-window(df2_ts,start=c(2022,01),end=c(2024,01))
adf.test(df1_ts)


    Augmented Dickey-Fuller Test

data:  df1_ts
Dickey-Fuller = -5.1761, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary

ggAcf(df2_ts)

ggPacf(df2_ts)

hurstexp(df2_ts)

Simple R/S Hurst estimation:         0.7278853 
Corrected R over S Hurst exponent:   0.8569162 
Empirical Hurst exponent:            0.6168564 
Corrected empirical Hurst exponent:  0.6392321 
Theoretical Hurst exponent:          0.5242311

#auto.arima(df2_ts,stepwise = FALSE, trace=TRUE)
### [1,0,1][2,1,0]
#auto.arima(df2_ts,stepwise = FALSE, trace=TRUE,D=1)

Let check if we were to use [1,0,1][2,1,0]

ggAcf(diff(df2_ts,1,lag=12))

ggPacf(diff(df2_ts,1,lag=12))

By auto.arima

mod_arima14<-arima(ts_train1,order=c(1,0,1),
                  seasonal=c(2,1,0))
mod_arima15<-forecast(mod_arima14,h=length(ts_test1))
accuracy(mod_arima15,ts_test1)

                   ME     RMSE      MAE        MPE     MAPE      MASE
Training set  220.460 1710.787 1232.159  0.5135649  7.39561 0.6056950
Test set     -377.662 2551.365 1960.280 -3.8409436 10.96934 0.9636189
                    ACF1 Theil's U
Training set -0.04276959        NA
Test set     -0.12771400 0.6726785

checkresiduals(mod_arima14)


    Ljung-Box test

data:  Residuals from ARIMA(1,0,1)(2,1,0)[12]
Q* = 13.334, df = 14, p-value = 0.5004

Model df: 4.   Total lags used: 18

plot(mod_arima15)
lines(ts_test1)

#aic = 1359.13

Try another one

mod_arima16<-arima(df2_ts,order=c(1,0,1),
                  seasonal=c(2,1,0))
mod_arima17<-forecast(mod_arima16,h=24)
checkresiduals(mod_arima16)####okay


    Ljung-Box test

data:  Residuals from ARIMA(1,0,1)(2,1,0)[12]
Q* = 20.481, df = 19, p-value = 0.3662

Model df: 4.   Total lags used: 23

summary(mod_arima16)


Call:
arima(x = df2_ts, order = c(1, 0, 1), seasonal = c(2, 1, 0))

Coefficients:
         ar1      ma1     sar1     sar2
      0.9390  -0.6841  -0.6484  -0.4111
s.e.  0.0487   0.0940   0.1059   0.1035

sigma^2 estimated as 4023906:  log likelihood = -914.98,  aic = 1839.96

Training set error measures:
                   ME     RMSE      MAE       MPE     MAPE      MASE
Training set 193.3758 1896.474 1405.044 0.1897545 8.287955 0.6290169
                    ACF1
Training set -0.08667191

mod_arima17

         Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
Feb 2024       15443.89 12873.14 18014.64 11512.268 19375.52
Mar 2024       17711.17 15058.23 20364.11 13653.851 21768.49
Apr 2024       20404.04 17680.70 23127.39 16239.044 24569.04
May 2024       19855.92 17071.97 22639.87 15598.233 24113.60
Jun 2024       23385.95 20549.64 26222.26 19048.185 27723.72
Jul 2024       21218.69 18337.00 24100.38 16811.527 25625.86
Aug 2024       19620.25 16699.14 22541.37 15152.791 24087.72
Sep 2024       21676.67 18721.23 24632.12 17156.713 26196.63
Oct 2024       20450.23 17464.85 23435.61 15884.482 25015.98
Nov 2024       21240.73 18229.19 24252.26 16634.984 25846.47
Dec 2024       18085.84 15051.43 21120.24 13445.117 22726.55
Jan 2025       13758.06 10703.64 16812.49  9086.723 18429.41
Feb 2025       15430.41 12137.01 18723.82 10393.585 20467.24
Mar 2025       16246.63 12909.54 19583.73 11142.987 21350.28
Apr 2025       20448.31 17073.16 23823.45 15286.463 25610.15
May 2025       20076.79 16668.45 23485.14 14864.178 25289.41
Jun 2025       23427.07 19989.72 26864.42 18170.091 28684.05
Jul 2025       21396.86 17934.14 24859.59 16101.079 26692.65
Aug 2025       19968.39 16483.44 23453.33 14638.626 25298.15
Sep 2025       22723.24 19218.82 26227.66 17363.694 28082.79
Oct 2025       21072.76 17551.26 24594.26 15687.088 26458.43
Nov 2025       20226.46 16689.96 23762.95 14817.859 25635.06
Dec 2025       18256.16 14706.50 21805.82 12827.421 23684.89
Jan 2026       13749.13 10187.90 17310.36  8302.703 19195.56

autoplot(df2_ts,series="Historical Data")+
  autolayer(mod_arima17,series = "Forecasting Model Picked")+
  labs(x="Year",y="Units Started")+
  ggtitle("Forecasted Housing Units Started in Canada")+
  theme_stata(scheme = "s1color")+scale_color_stata()

x11_mod1<-seas(df2_ts,x11="")
autoplot(df2_ts, series="Historical Data") +
  autolayer(trendcycle(x11_mod1), series="Trend") +
  autolayer(seasadj(x11_mod1), series="Seasonally Adjusted Units Started") +
  xlab("Year") + ylab("Units Started in Canada") +
  ggtitle("Seasonal Adjusted and Trend of Housing Units Started in Canada From 2014-09 to 2022-12")+
  scale_colour_manual(values=c("gray","blue","red"),
             breaks=c("Historical Data","Seasonally Adjusted Units Started","Trend"))+
  theme_stata(scheme = "s1color")

summary(x11_mod1)


Call:
seas(x = df2_ts, x11 = "")

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
MA-Nonseasonal-01  0.71465    0.06137   11.65   <2e-16 ***
MA-Seasonal-12     0.99996    0.09583   10.44   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

X11 adj.  ARIMA: (0 1 1)(0 1 1)  Obs.: 113  Transform: none
AICc:  1808, BIC:  1816  QS (no seasonality in final):    0  
Box-Ljung (no autocorr.): 31.82   Shapiro (normality): 0.991

autoplot(x11_mod1)

More Seasonal Analysis

seasonal_plot2<-ggseasonplot(df2_ts)+
  ylab("Volume")+ggtitle("Seasonal Plot for Units Started")+
  theme_stata(scheme = "s1color")
ggplotly(seasonal_plot2)