Time Series Forecasting and Analysis (Units Completed)
Data Source
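The import of df0 is not shown in this post; presumably it comes from the downloaded Statistics Canada CSV. A minimal sketch, assuming a local file (the file name below is a placeholder):
# Hypothetical import of the raw StatsCan extract; read.csv's default
# check.names = TRUE is what produces the dotted column names
# (Housing.estimates, Type.of.unit) seen in the head() output below.
df0 <- read.csv("statcan_housing_estimates.csv")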
Has this data set been seasonally adjusted?
I have no clue
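One quick way to probe that question is a sketch like the following, using the monthly series df1_ts constructed further down; nsdiffs() and ggsubseriesplot() come from the forecast package loaded just below.
# If the series had already been seasonally adjusted, the monthly subseries
# plot should look roughly flat and nsdiffs() should suggest 0 seasonal differences.
nsdiffs(df1_ts)
ggsubseriesplot(df1_ts) + ylab("Units Completed")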
Let’s get some packages loaded first!
library(forecast)
library(bsts)
library(ggplot2)
library(dplyr)
library(ggthemes)
library(tseries)
library(pracma)
library(seasonal)
library(plotly)
Data overview:
df0 %>%
  head()
  REF_DATE GEO DGUID Housing.estimates Type.of.unit UOM UOM_ID
1 2014-09 Canada 2016A000011124 Housing starts Total units Units 300
2 2014-10 Canada 2016A000011124 Housing starts Total units Units 300
3 2014-11 Canada 2016A000011124 Housing starts Total units Units 300
4 2014-12 Canada 2016A000011124 Housing starts Total units Units 300
5 2015-01 Canada 2016A000011124 Housing starts Total units Units 300
6 2015-02 Canada 2016A000011124 Housing starts Total units Units 300
SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED
1 units 0 v729949 1.1.1 15394 NA NA NA
2 units 0 v729949 1.1.1 14294 NA NA NA
3 units 0 v729949 1.1.1 15812 NA NA NA
4 units 0 v729949 1.1.1 13264 NA NA NA
5 units 0 v729949 1.1.1 11529 NA NA NA
6 units 0 v729949 1.1.1 9026 NA NA NA
DECIMALS
1 0
2 0
3 0
4 0
5 0
6 0
df0$Housing.estimates <- factor(df0$Housing.estimates)
levels(df0$Housing.estimates)
[1] "Housing completions" "Housing starts"
df0$Type.of.unit <- factor(df0$Type.of.unit)
levels(df0$Type.of.unit)
[1] "Apartment and other unit types" "Multiples"
[3] "Row units"                      "Semi-detached units"
[5] "Single-detached units"          "Total units"
df1<-df0%>%
select(REF_DATE,Housing.estimates,VALUE,Type.of.unit)%>%
filter(Housing.estimates == "Housing completions",
Type.of.unit == "Total units")
df1$REF_DATE<-as.Date(paste0(as.character(df1$REF_DATE), "-01"), format="%Y-%m-%d")
df1 %>%
  str()
'data.frame': 100 obs. of 4 variables:
$ REF_DATE : Date, format: "2014-09-01" "2014-10-01" ...
$ Housing.estimates: Factor w/ 2 levels "Housing completions",..: 1 1 1 1 1 1 1 1 1 1 ...
$ VALUE : int 12714 15260 13754 16189 20828 15017 14262 11668 15103 15442 ...
$ Type.of.unit : Factor w/ 6 levels "Apartment and other unit types",..: 6 6 6 6 6 6 6 6 6 6 ...
Visualize the data:
df1_ts<-ts(df1$VALUE,start=c(2014,9),frequency=12)
autoplot(df1_ts,col="blue")+geom_point(col="black")+
labs(x="Year",y="Units Completed")+ggtitle("Units Completed in Canada from 2014-09 to 2022-12")+
theme_stata(scheme = "s1color")+scale_color_stata()
Let’s check if we could build an ARIMA model.
ts_train0<-window(df1_ts,start=c(2014,09),end=c(2021,11))
ts_test0<-window(df1_ts,start=c(2021,12),end=c(2022,12))
adf.test(df1_ts)
Augmented Dickey-Fuller Test
data: df1_ts
Dickey-Fuller = -5.1761, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
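The ADF test rejects a unit root. As a cross-check (a sketch, not part of the original analysis), the KPSS test from the already-loaded tseries package takes stationarity as its null, and forecast::ndiffs() summarizes the implied order of differencing:
# KPSS: the null is level stationarity, so a large p-value here would agree
# with the ADF result above.
kpss.test(df1_ts)
# Number of first differences suggested by the default unit-root test:
ndiffs(df1_ts)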
ggAcf(df1_ts)
ggPacf(df1_ts)
ggAcf(diff(df1_ts,1))
ggPacf(diff(df1_ts,1))
Is there a long memory problem?
hurstexp(df1_ts)
Simple R/S Hurst estimation: 0.6618731
Corrected R over S Hurst exponent: 0.8365118
Empirical Hurst exponent: 0.6037168
Corrected empirical Hurst exponent: 0.629525
Theoretical Hurst exponent: 0.5229461
The Hurst estimates suggest weak-to-moderate long memory, even though the ADF test indicated the series is stationary.
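If the long-memory signal were taken seriously, one option (a sketch, not pursued in this post) is a fractionally differenced ARFIMA model via forecast::arfima(), which estimates the fractional differencing parameter d directly; note that it ignores the seasonal structure.
# ARFIMA fit on the training window; an estimated d between 0 and 0.5
# corresponds to stationary long memory.
mod_arfima <- arfima(ts_train0)
summary(mod_arfima)
autoplot(forecast(mod_arfima, h = length(ts_test0)))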
#auto.arima(df1_ts,stepwise = FALSE, trace=TRUE)
#### Auto Arima suggested ARIMA(1,1,1)(1,0,1)[12] with AIC= 1769
### Let's see if we could find a better combination
First model:
mod_arima0<-arima(ts_train0,order=c(1,1,1),
seasonal=c(1,0,1))
mod_arima1<-forecast(mod_arima0,h=length(ts_test0))
accuracy(mod_arima1,ts_test0)
                     ME     RMSE      MAE       MPE     MAPE      MASE
Training set 192.1581 1663.682 1341.038 0.1417257 8.874782 0.8768854
Test set 382.5349 1739.293 1470.146 1.3058491 8.465155 0.9613070
ACF1 Theil's U
Training set -0.02203416 NA
Test set -0.42906312 0.5238728
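As a sanity check on these numbers (a sketch added here, not in the original analysis), a seasonal naive benchmark on the same train/test split gives a floor the SARIMA model should beat:
# Seasonal naive: each forecast is the value from the same month one year
# earlier; compare its test-set accuracy with the SARIMA model above.
bench <- snaive(ts_train0, h = length(ts_test0))
accuracy(bench, ts_test0)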
checkresiduals(mod_arima0) #### no autocorrelation in the residuals
Ljung-Box test
data: Residuals from ARIMA(1,1,1)(1,0,1)[12]
Q* = 8.5733, df = 13, p-value = 0.8044
Model df: 4. Total lags used: 17
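For reference, the degrees of freedom above are the total lags (17) minus the 4 estimated ARMA coefficients; the same test can be reproduced directly with stats::Box.test(), a sketch:
# Ljung-Box on the residuals, matching checkresiduals(): lag = 17 total lags,
# fitdf = 4 for the estimated ar1, ma1, sar1 and sma1 coefficients.
Box.test(residuals(mod_arima0), lag = 17, type = "Ljung-Box", fitdf = 4)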
summary(mod_arima0)
Call:
arima(x = ts_train0, order = c(1, 1, 1), seasonal = c(1, 0, 1))
Coefficients:
ar1 ma1 sar1 sma1
0.3250 -0.9675 0.8183 -0.4517
s.e. 0.1124 0.0351 0.1300 0.2229
sigma^2 estimated as 2800019: log likelihood = -763.46, aic = 1536.92
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 192.1581 1663.682 1341.038 0.1417257 8.874782 0.7264609
ACF1
Training set -0.02203416
# AIC = 1536.92
Based on the AICs and accuracy metrics, we would conclude that the first and third models perform marginally better than the second model, particularly when using the observations from the last 12 months as the validation set.
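To make that kind of comparison systematic, a small search over candidate orders can be run on the training set. A sketch (the candidate specifications below are illustrative; forecast::Arima() is used because it reports AICc, and each fit is wrapped in try() so a non-converging candidate just yields NA):
# Fit a handful of candidate SARIMA specifications on the training window and
# compare them by AICc (lower is better).
candidates <- list(
  c(1, 1, 1, 1, 0, 1),
  c(0, 1, 1, 1, 0, 1),
  c(2, 1, 1, 1, 0, 1),
  c(1, 1, 2, 1, 0, 0)
)
aicc <- sapply(candidates, function(spec) {
  fit <- try(Arima(ts_train0, order = spec[1:3], seasonal = spec[4:6]), silent = TRUE)
  if (inherits(fit, "try-error")) NA else fit$aicc
})
data.frame(spec = sapply(candidates, paste, collapse = ","), AICc = aicc)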
autoplot(df1_ts,series="Historical Data")+
autolayer(mod_arima1,PI=FALSE,series = "SARIMA (1,1,1)(1,0,1)[12]")+
labs(x="Year",y="Units Completed")+
ggtitle("Housing Units Completed in Canada from 2014-09 to 2022-12")+
theme_stata(scheme = "s1color")+scale_color_stata()
Final Model: We pick \(\mathrm{SARIMA}(1,1,1)(1,0,1)_{12}\) as our final model.
mod_arima6<-arima(df1_ts,order=c(1,1,1),
seasonal=c(1,0,1))
mod_arima7<-forecast(mod_arima6,h=24)
checkresiduals(mod_arima6) #### okay
Ljung-Box test
data: Residuals from ARIMA(1,1,1)(1,0,1)[12]
Q* = 22.257, df = 16, p-value = 0.1351
Model df: 4. Total lags used: 20
mod_arima7
         Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
Jan 2023 15240.13 13078.82 17401.44 11934.69 18545.57
Feb 2023 14802.61 12568.70 17036.53 11386.13 18219.10
Mar 2023 16215.27 13970.91 18459.63 12782.81 19647.73
Apr 2023 15738.31 13489.58 17987.04 12299.17 19177.44
May 2023 17226.76 14974.64 19478.87 13782.44 20671.07
Jun 2023 17666.36 15411.06 19921.66 14217.18 21115.54
Jul 2023 19192.82 16934.39 21451.26 15738.85 22646.80
Aug 2023 17151.74 14890.18 19413.30 13692.98 20610.50
Sep 2023 17074.99 14810.29 19339.69 13611.43 20538.54
Oct 2023 16391.84 14124.03 18659.65 12923.53 19860.16
Nov 2023 15807.42 13536.51 18078.34 12334.36 19280.49
Dec 2023 18694.00 16419.98 20968.02 15216.19 22171.81
Jan 2024 15454.23 13026.79 17881.67 11741.78 19166.68
Feb 2024 15205.66 12758.53 17652.79 11463.10 18948.22
Mar 2024 16469.74 14015.10 18924.38 12715.69 20223.78
Apr 2024 16059.19 13598.94 18519.44 12296.56 19821.82
May 2024 17361.68 14896.21 19827.16 13591.06 21132.30
Jun 2024 17746.26 15275.65 20216.87 13967.78 21524.73
Jul 2024 19080.65 16604.94 21556.37 15294.37 22866.94
Aug 2024 17296.49 14815.67 19777.31 13502.41 21090.58
Sep 2024 17229.40 14743.48 19715.33 13427.51 21031.30
Oct 2024 16632.25 14141.25 19123.24 12822.59 20441.90
Nov 2024 16121.39 13625.33 18617.45 12304.00 19938.78
Dec 2024 18644.64 16143.53 21145.75 14819.52 22469.75
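If the point forecasts and intervals are needed outside R, the forecast object converts cleanly to a data frame; a sketch (the output file name is a placeholder):
# Columns are Point Forecast, Lo 80, Hi 80, Lo 95, Hi 95; the month labels
# become the row names.
fc_df <- as.data.frame(mod_arima7)
head(fc_df)
write.csv(fc_df, "units_completed_forecast.csv")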
Visualize the points forecasted:
autoplot(df1_ts,series="Historical Data")+
autolayer(mod_arima7,series = "Forecasting Model Picked")+
labs(x="Year",y="Units Completed")+
ggtitle("Forecasted Units Completed in Canada")+
theme_stata(scheme = "s1color")+scale_color_stata()
Notes:
- If you want to evaluate these estimates, it would be better to compare them against the data published by Statistics Canada or CMHC; comparing them with housing data from another source might not be an apples-to-apples comparison.
Seasonally Adjusted Series and Trend
x11_mod<-seas(df1_ts,x11="")
autoplot(df1_ts, series="Historical Data")+
autolayer(trendcycle(x11_mod), series="Trend")+
autolayer(seasadj(x11_mod), series="Seasonally Adjusted Units Completed")+
xlab("Year")+ylab("Units Completed in Canada")+
ggtitle("Seasonal Adjusted and Trend of Housing Units Completed in Canada From 2014-09 to 2022-12")+
theme_stata(scheme = "s1color")summary(x11_mod)
Call:
seas(x = df1_ts, x11 = "")
Coefficients:
Estimate Std. Error z value Pr(>|z|)
Constant 0.021288 0.005246 4.058 4.95e-05 ***
AO2015.Jan 0.444326 0.096963 4.582 4.60e-06 ***
AR-Nonseasonal-01 0.398737 0.280901 1.419 0.156
MA-Nonseasonal-01 0.267173 0.286664 0.932 0.351
MA-Seasonal-12 0.772220 0.085159 9.068 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
X11 adj. ARIMA: (1 0 1)(0 1 1) Obs.: 100 Transform: log
AICc: 1557, BIC: 1571 QS (no seasonality in final): 0
Box-Ljung (no autocorr.): 32.51 Shapiro (normality): 0.9873
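The individual X-13 output tables can also be pulled out of the seas object; a sketch using seasonal::series() (table d10 holds the X-11 seasonal factors) together with a subseries plot:
# Extract the X-11 seasonal factors and look at the month-by-month pattern;
# with the log transform the factors are multiplicative, centred around 1.
sf <- series(x11_mod, "d10")
ggsubseriesplot(sf) + ylab("Seasonal factor") +
  ggtitle("X-11 Seasonal Factors, Units Completed")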
autoplot(x11_mod)
seasonal_plot1<-ggseasonplot(df1_ts)+
ylab("Volume")+ggtitle("Seasonal Plot for Units Completed")+
theme_stata(scheme = "s1color")
ggplotly(seasonal_plot1)
Units Started
df10<-df0%>%
select(REF_DATE,Housing.estimates,VALUE,Type.of.unit)%>%
filter(Housing.estimates == "Housing starts",
Type.of.unit == "Total units")
df10$REF_DATE<-as.Date(paste0(as.character(df10$REF_DATE), "-01"), format="%Y-%m-%d")
df10 %>%
  str()
'data.frame': 113 obs. of 4 variables:
$ REF_DATE : Date, format: "2014-09-01" "2014-10-01" ...
$ Housing.estimates: Factor w/ 2 levels "Housing completions",..: 2 2 2 2 2 2 2 2 2 2 ...
$ VALUE : int 15394 14294 15812 13264 11529 9026 13043 14181 17129 17084 ...
$ Type.of.unit : Factor w/ 6 levels "Apartment and other unit types",..: 6 6 6 6 6 6 6 6 6 6 ...
df2_ts<-ts(df10$VALUE,start=c(2014,9),frequency=12)
autoplot(df2_ts,col="navy")+geom_point(col="red")+
labs(x="Year",y="Units Started")+ggtitle("Housing Starts in Canada from 2014-09 to 2024-01")+
theme_stata(scheme = "s1color")+scale_color_stata()
ts_train1<-window(df2_ts,start=c(2014,09),end=c(2021,12))
ts_test1<-window(df2_ts,start=c(2022,01),end=c(2024,01))
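A single holdout can be sensitive to where the cut is made; a rolling-origin evaluation with forecast::tsCV() is one alternative. A sketch (the order matches the model fitted below, initial = 60 skips the first five years so every fit has enough data, and refitting at every origin takes a little while):
# One-step-ahead rolling-origin errors for the SARIMA specification,
# summarized as an RMSE.
fc_fun <- function(x, h) forecast(Arima(x, order = c(1, 0, 1), seasonal = c(2, 1, 0)), h = h)
cv_err <- tsCV(df2_ts, fc_fun, h = 1, initial = 60)
sqrt(mean(cv_err^2, na.rm = TRUE))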
adf.test(df1_ts)
Augmented Dickey-Fuller Test
data: df1_ts
Dickey-Fuller = -5.1761, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
ggAcf(df2_ts)
ggPacf(df2_ts)
hurstexp(df2_ts)
Simple R/S Hurst estimation: 0.7278853
Corrected R over S Hurst exponent: 0.8569162
Empirical Hurst exponent: 0.6168564
Corrected empirical Hurst exponent: 0.6392321
Theoretical Hurst exponent: 0.5242311
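Before committing to one seasonal difference, the unit-root heuristics in the forecast package are worth a quick look; a sketch:
# Suggested numbers of first and seasonal differences for the starts series;
# nsdiffs() returning 1 would support the D = 1 used in the models below.
ndiffs(df2_ts)
nsdiffs(df2_ts)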
#auto.arima(df2_ts,stepwise = FALSE, trace=TRUE)
### Auto Arima suggested ARIMA(1,0,1)(2,1,0)[12]
#auto.arima(df2_ts,stepwise = FALSE, trace=TRUE,D=1)
Let's check whether ARIMA(1,0,1)(2,1,0)[12] is a reasonable choice.
ggAcf(diff(df2_ts, lag = 12))
ggPacf(diff(df2_ts, lag = 12))
Model suggested by auto.arima:
mod_arima14<-arima(ts_train1,order=c(1,0,1),
seasonal=c(2,1,0))
mod_arima15<-forecast(mod_arima14,h=length(ts_test1))
accuracy(mod_arima15,ts_test1)
                     ME     RMSE      MAE        MPE     MAPE      MASE
Training set 220.460 1710.787 1232.159 0.5135649 7.39561 0.6056950
Test set -377.662 2551.365 1960.280 -3.8409436 10.96934 0.9636189
ACF1 Theil's U
Training set -0.04276959 NA
Test set -0.12771400 0.6726785
checkresiduals(mod_arima14)
Ljung-Box test
data: Residuals from ARIMA(1,0,1)(2,1,0)[12]
Q* = 13.334, df = 14, p-value = 0.5004
Model df: 4. Total lags used: 18
plot(mod_arima15)
lines(ts_test1)
# AIC = 1359.13
Now refit the same specification on the full series:
mod_arima16<-arima(df2_ts,order=c(1,0,1),
seasonal=c(2,1,0))
mod_arima17<-forecast(mod_arima16,h=24)
checkresiduals(mod_arima16) #### okay
Ljung-Box test
data: Residuals from ARIMA(1,0,1)(2,1,0)[12]
Q* = 20.481, df = 19, p-value = 0.3662
Model df: 4. Total lags used: 23
summary(mod_arima16)
Call:
arima(x = df2_ts, order = c(1, 0, 1), seasonal = c(2, 1, 0))
Coefficients:
ar1 ma1 sar1 sar2
0.9390 -0.6841 -0.6484 -0.4111
s.e. 0.0487 0.0940 0.1059 0.1035
sigma^2 estimated as 4023906: log likelihood = -914.98, aic = 1839.96
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 193.3758 1896.474 1405.044 0.1897545 8.287955 0.6290169
ACF1
Training set -0.08667191
mod_arima17
         Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
Feb 2024 15443.89 12873.14 18014.64 11512.268 19375.52
Mar 2024 17711.17 15058.23 20364.11 13653.851 21768.49
Apr 2024 20404.04 17680.70 23127.39 16239.044 24569.04
May 2024 19855.92 17071.97 22639.87 15598.233 24113.60
Jun 2024 23385.95 20549.64 26222.26 19048.185 27723.72
Jul 2024 21218.69 18337.00 24100.38 16811.527 25625.86
Aug 2024 19620.25 16699.14 22541.37 15152.791 24087.72
Sep 2024 21676.67 18721.23 24632.12 17156.713 26196.63
Oct 2024 20450.23 17464.85 23435.61 15884.482 25015.98
Nov 2024 21240.73 18229.19 24252.26 16634.984 25846.47
Dec 2024 18085.84 15051.43 21120.24 13445.117 22726.55
Jan 2025 13758.06 10703.64 16812.49 9086.723 18429.41
Feb 2025 15430.41 12137.01 18723.82 10393.585 20467.24
Mar 2025 16246.63 12909.54 19583.73 11142.987 21350.28
Apr 2025 20448.31 17073.16 23823.45 15286.463 25610.15
May 2025 20076.79 16668.45 23485.14 14864.178 25289.41
Jun 2025 23427.07 19989.72 26864.42 18170.091 28684.05
Jul 2025 21396.86 17934.14 24859.59 16101.079 26692.65
Aug 2025 19968.39 16483.44 23453.33 14638.626 25298.15
Sep 2025 22723.24 19218.82 26227.66 17363.694 28082.79
Oct 2025 21072.76 17551.26 24594.26 15687.088 26458.43
Nov 2025 20226.46 16689.96 23762.95 14817.859 25635.06
Dec 2025 18256.16 14706.50 21805.82 12827.421 23684.89
Jan 2026 13749.13 10187.90 17310.36 8302.703 19195.56
autoplot(df2_ts,series="Historical Data")+
autolayer(mod_arima17,series = "Forecasting Model Picked")+
labs(x="Year",y="Units Started")+
ggtitle("Forecasted Housing Units Started in Canada")+
theme_stata(scheme = "s1color")+scale_color_stata()
x11_mod1<-seas(df2_ts,x11="")
autoplot(df2_ts, series="Historical Data") +
autolayer(trendcycle(x11_mod1), series="Trend") +
autolayer(seasadj(x11_mod1), series="Seasonally Adjusted Units Started") +
xlab("Year") + ylab("Units Started in Canada") +
ggtitle("Seasonal Adjusted and Trend of Housing Units Started in Canada From 2014-09 to 2022-12")+
scale_colour_manual(values=c("gray","blue","red"),
breaks=c("Historical Data","Seasonally Adjusted Units Started","Trend"))+
theme_stata(scheme = "s1color")
summary(x11_mod1)
Call:
seas(x = df2_ts, x11 = "")
Coefficients:
Estimate Std. Error z value Pr(>|z|)
MA-Nonseasonal-01 0.71465 0.06137 11.65 <2e-16 ***
MA-Seasonal-12 0.99996 0.09583 10.44 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
X11 adj. ARIMA: (0 1 1)(0 1 1) Obs.: 113 Transform: none
AICc: 1808, BIC: 1816 QS (no seasonality in final): 0
Box-Ljung (no autocorr.): 31.82 Shapiro (normality): 0.991
autoplot(x11_mod1)
More Seasonal Analysis
seasonal_plot2<-ggseasonplot(df2_ts)+
ylab("Volume")+ggtitle("Seasonal Plot for Units Started")+
theme_stata(scheme = "s1color")
ggplotly(seasonal_plot2)