#install.packages(c("kableExtra","tidyverse","patchwork","bestNormalize","caret","forecast","tseries"))
library(kableExtra)
library(tidyverse)
library(patchwork)
library(bestNormalize)
library(caret)
library(knitr)
library(ggpubr)
library(grid)
library(gridExtra)
require(forecast)
require(tseries)
We used open source data on the UK Housing Price Index (HPI); 1970 to August 2020. (Note: the House type cannot be differentiated until 2005)
The UK House Price Index (HPI) uses house sales data from HM Land Registry, Registers of Scotland, and Land and Property Services Northern Ireland and is calculated by the Office for National Statistics. The index applies a statistical method, called a hedonic regression model, to the various sources of data on property price and attributes to produce estimates of the change in house prices each period.
As of July 2020 the average house price in the UK is £237,963, and the index stands at 124.81. Property prices have risen by 0.5% compared to the previous month, and risen by 2.3% compared to the previous year.
You can verify the data here https://landregistry.data.gov.uk/app/ukhpi
head(df[,1:2]) %>%
kbl() %>%
kable_styling()
| Period | Average.price.All.property.types | |
|---|---|---|
| 22 | 1970-01 | 3920 |
| 23 | 1970-02 | 3920 |
| 24 | 1970-03 | 3920 |
| 25 | 1970-04 | 3980 |
| 26 | 1970-05 | 3980 |
| 27 | 1970-06 | 3980 |
tail(df[,1:2]) %>%
kbl() %>%
kable_styling()
| Period | Average.price.All.property.types | |
|---|---|---|
| 623 | 2020-02 | 231162 |
| 624 | 2020-03 | 234464 |
| 625 | 2020-04 | 230533 |
| 626 | 2020-05 | 231778 |
| 627 | 2020-06 | 236798 |
| 628 | 2020-07 | 237963 |
summary(df[,2:5]) %>%
kbl() %>%
kable_styling()
| Average.price.All.property.types | Average.price.Detached.houses | Average.price.Semi.detached.houses | Average.price.Terraced.houses | |
|---|---|---|---|---|
| Min. : 3920 | Min. :234509 | Min. :145913 | Min. :119434 | |
| 1st Qu.: 22360 | 1st Qu.:258520 | 1st Qu.:159793 | 1st Qu.:136399 | |
| Median : 58250 | Median :270720 | Median :169788 | Median :145778 | |
| Mean : 89710 | Mean :286678 | Mean :178871 | Mean :153030 | |
| 3rd Qu.:167017 | 3rd Qu.:322865 | 3rd Qu.:201184 | 3rd Qu.:173246 | |
| Max. :237963 | Max. :361986 | Max. :227832 | Max. :193619 | |
| NA | NA’s :420 | NA’s :420 | NA’s :420 |
names(df)[2] <- "Units"
dfUnits <- df$Units
tdf <- ts(dfUnits, start = c(1970, 1), frequency = 12)
head(tdf,12)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1970 3920 3920 3920 3980 3980 3980 4163 4163 4163 4163 4163 4163
fit <- auto.arima(tdf)
fit
## Series: tdf
## ARIMA(3,2,2)(0,0,2)[12]
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 sma1 sma2
## -1.4200 -1.1652 -0.4699 0.7083 0.1996 0.3932 0.2108
## s.e. 0.1238 0.1091 0.0651 0.1287 0.0932 0.0431 0.0360
##
## sigma^2 estimated as 992508: log likelihood=-5033.59
## AIC=10083.18 AICc=10083.42 BIC=10118.42
accfit <-accuracy(fit)
accfit
## ME RMSE MAE MPE MAPE MASE
## Training set 5.960677 988.8338 611.1948 0.0156202 0.8465972 0.1014046
## ACF1
## Training set -0.005331425
pred_values <- forecast(fit, 12)
plot(pred_values, xlab = "Date", ylab = "Price/£",
main = "ARIMA 12m Forecast for House price Index"
)
plot(pred_values$residuals, xlab = "Date", ylab = "",
main = "ARIMA Forecast Residuals"
)
pred_values %>%
kbl() %>%
kable_styling()
| Point Forecast | Lo 80 | Hi 80 | Lo 95 | Hi 95 | |
|---|---|---|---|---|---|
| Aug 2020 | 238800.4 | 237523.6 | 240077.1 | 236847.7 | 240753.0 |
| Sep 2020 | 242541.7 | 240459.5 | 244623.9 | 239357.3 | 245726.2 |
| Oct 2020 | 243261.9 | 240325.6 | 246198.2 | 238771.2 | 247752.6 |
| Nov 2020 | 243965.6 | 239857.4 | 248073.7 | 237682.7 | 250248.4 |
| Dec 2020 | 246361.5 | 240964.4 | 251758.6 | 238107.3 | 254615.7 |
| Jan 2021 | 248284.1 | 241619.3 | 254948.8 | 238091.2 | 258476.9 |
| Feb 2021 | 248775.2 | 240645.2 | 256905.2 | 236341.4 | 261209.0 |
| Mar 2021 | 252361.2 | 242677.7 | 262044.7 | 237551.6 | 267170.9 |
| Apr 2021 | 252469.5 | 241193.9 | 263745.0 | 235225.0 | 269713.9 |
| May 2021 | 254329.9 | 241352.3 | 267307.4 | 234482.4 | 274177.3 |
| Jun 2021 | 258179.8 | 243408.7 | 272950.9 | 235589.3 | 280770.3 |
| Jul 2021 | 260315.1 | 243704.3 | 276925.9 | 234911.0 | 285719.1 |
UK_Housing_Index <- window(tdf, start = tail(time(tdf), 67)[1])
ggseasonplot(UK_Housing_Index, year.labels = TRUE, col = rainbow(20))
qqnorm(fit$residuals)
Box.test(fit$residuals, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: fit$residuals
## X-squared = 0.017339, df = 1, p-value = 0.8952
fit$loglik
## [1] -5033.588
fit$sigma2
## [1] 992508.2
The data has a high p-value, so the autocorrelations not significantly different than 0. We apply a log transformation to force normality.
ltdf <- log(tdf)
head(ltdf,12)
## Jan Feb Mar Apr May Jun Jul Aug
## 1970 8.273847 8.273847 8.273847 8.289037 8.289037 8.289037 8.333991 8.333991
## Sep Oct Nov Dec
## 1970 8.333991 8.333991 8.333991 8.333991
Refitting log transformation with seasonal decomposition
fit2 <- stl(ltdf, s.window = "period")
plot(fit2, main = "Seasonal Decomposition of log(Units)")
fit3 <- auto.arima(ltdf)
fitAccuracy <- data.frame(accuracy(fit))
fitAccuracy2 <- data.frame(accuracy(fit3))
fitAccuracyFinal <- rbind(fitAccuracy, fitAccuracy2)
fitAccuracyFinal %>%
kbl() %>%
kable_styling()
| ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 | |
|---|---|---|---|---|---|---|---|
| Training set | 5.9606767 | 988.8338010 | 611.194813 | 0.0156202 | 0.8465972 | 0.1014046 | -0.0053314 |
| Training set1 | 0.0000203 | 0.0118251 | 0.007403 | 0.0002833 | 0.0693741 | 0.0800607 | -0.0197401 |
qqnorm(fit3$residuals)
Box.test(fit3$residuals, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: fit3$residuals
## X-squared = 0.2377, df = 1, p-value = 0.6259
fit3$loglik
## [1] 1823.513
fit3$sigma2
## [1] 0.0001412298
plot(forecast(fit3, 12), xlab = "Date", ylab = "Units", main = "ARIMA Forecast for House price Index")
pred_values <- data.frame(forecast(fit, 12))
pred_values2 <- data.frame(forecast(fit3, 12))
pred_values2[,1:5] <- exp(pred_values2[,1:5])
mergedDF <- data.frame(Date = rownames(pred_values), Original_Data_Forecast = pred_values$Point.Forecast, Log_Transformed_Data_Forecast = pred_values2$Point.Forecast, Difference = round(pred_values$Point.Forecast - pred_values2$Point.Forecast, 2))
mergedDF
## Date Original_Data_Forecast Log_Transformed_Data_Forecast Difference
## 1 Aug 2020 238800.4 239599.2 -798.86
## 2 Sep 2020 242541.7 243659.1 -1117.33
## 3 Oct 2020 243261.9 244240.2 -978.31
## 4 Nov 2020 243965.6 245553.1 -1587.55
## 5 Dec 2020 246361.5 248680.4 -2318.91
## 6 Jan 2021 248284.1 250335.2 -2051.12
## 7 Feb 2021 248775.2 251413.8 -2638.55
## 8 Mar 2021 252361.2 255828.3 -3467.11
## 9 Apr 2021 252469.5 255850.1 -3380.66
## 10 May 2021 254329.9 258257.9 -3927.98
## 11 Jun 2021 258179.8 263032.1 -4852.29
## 12 Jul 2021 260315.1 265320.2 -5005.15
write.csv(mergedDF,"Out.csv")
p1<-autoplot(ltdf)
p2<-autoplot(stl(ltdf, s.window="periodic", robust=TRUE))
p3<-autoplot(fit3)
p4<-ggtsdisplay(ltdf)
ggarrange(p1,p2,p3,p4)
UK_Housing_Index <- window(ltdf, start = tail(time(ltdf), 67)[1])
tfitAccuracy2 <- t(fitAccuracy2)
a<-autoplot(UK_Housing_Index) + geom_forecast(h=36)+
ggtitle("ARIMA Average UK House Price 36 month Forecast")+xlab("Year") +ylab("log(Price/£)")+
annotation_custom(tableGrob(tfitAccuracy2), xmin=2020, xmax=2024, ymin=12.2, ymax=12.3)
a
Bringing in the property type split.
# Detached
dfd<-df
names(dfd)[3] <- "Units2"
dfdUnits2 <- dfd$Units2
tdfd <- ts(dfdUnits2, start = c(1970, 1), frequency = 12)
ltdfd <- log(tdfd)
fit3d <- auto.arima(ltdfd)
subtsd<-(window(ltdfd, start = tail(time(tdfd), 67)[1]))
# Semi-
dfsd<-df
names(dfsd)[4] <- "Units3"
dfsdUnits3 <- dfsd$Units3
tdfsd <- ts(dfsdUnits3, start = c(1970, 1), frequency = 12)
ltdfsd <- log(tdfsd)
fit3sd <- auto.arima(ltdfsd)
subtsd<-(window(ltdfsd, start = tail(time(tdfsd), 67)[1]))
# Terraced
dft<-df
names(dft)[5] <- "Units4"
dftUnits4 <- dft$Units4
tdft <- ts(dftUnits4, start = c(1970, 1), frequency = 12)
ltdft <- log(tdft)
fit3t <- auto.arima(ltdft)
subtsd<-(window(ltdft, start = tail(time(tdft), 67)[1]))
# Charts
a2<-autoplot(UK_Housing_Index) + geom_forecast(h=36)+
ggtitle("Average UK House Price ")+xlab("Year") +ylab("log(Price/£)")
b<-autoplot(subtsd) + geom_forecast(h=36)+
ggtitle("Average UK Detached House Price")+xlab("Year") +ylab("log(Price/£)")
c<-autoplot(subtsd) + geom_forecast(h=36)+
ggtitle("Average UK Semi-Detached House Price")+xlab("Year") +ylab("log(Price/£)")
d<-autoplot(subtsd) + geom_forecast(h=36)+
ggtitle("Average UK Terraced House Price")+xlab("Year") +ylab("log(Price/£)")
figure<-ggarrange(a2, b, c,d, ncol = 2, nrow = 2)
annotate_figure(figure,
top = text_grob("Average UK House Price ARIMA 36-month Forecast\n ", face = "bold", size = 15))