Libraries

#install.packages(c("kableExtra","tidyverse","patchwork","bestNormalize","caret","forecast","tseries"))
library(kableExtra)
library(tidyverse)
library(patchwork)
library(bestNormalize)
library(caret)
library(knitr)
library(ggpubr)
library(grid)
library(gridExtra)
require(forecast)
require(tseries)

Data

We used open source data on the UK Housing Price Index (HPI); 1970 to August 2020. (Note: the House type cannot be differentiated until 2005)

The UK House Price Index (HPI) uses house sales data from HM Land Registry, Registers of Scotland, and Land and Property Services Northern Ireland and is calculated by the Office for National Statistics. The index applies a statistical method, called a hedonic regression model, to the various sources of data on property price and attributes to produce estimates of the change in house prices each period.

As of July 2020 the average house price in the UK is £237,963, and the index stands at 124.81. Property prices have risen by 0.5% compared to the previous month, and risen by 2.3% compared to the previous year.

You can verify the data here https://landregistry.data.gov.uk/app/ukhpi

head(df[,1:2]) %>%
  kbl() %>%
  kable_styling()
Period Average.price.All.property.types
22 1970-01 3920
23 1970-02 3920
24 1970-03 3920
25 1970-04 3980
26 1970-05 3980
27 1970-06 3980
tail(df[,1:2]) %>%
  kbl() %>%
  kable_styling()
Period Average.price.All.property.types
623 2020-02 231162
624 2020-03 234464
625 2020-04 230533
626 2020-05 231778
627 2020-06 236798
628 2020-07 237963
summary(df[,2:5]) %>% 
  kbl() %>% 
  kable_styling()
Average.price.All.property.types Average.price.Detached.houses Average.price.Semi.detached.houses Average.price.Terraced.houses
Min. : 3920 Min. :234509 Min. :145913 Min. :119434
1st Qu.: 22360 1st Qu.:258520 1st Qu.:159793 1st Qu.:136399
Median : 58250 Median :270720 Median :169788 Median :145778
Mean : 89710 Mean :286678 Mean :178871 Mean :153030
3rd Qu.:167017 3rd Qu.:322865 3rd Qu.:201184 3rd Qu.:173246
Max. :237963 Max. :361986 Max. :227832 Max. :193619
NA NA’s :420 NA’s :420 NA’s :420

Building a time series for all House types

names(df)[2] <- "Units"
dfUnits <- df$Units
tdf <- ts(dfUnits, start = c(1970, 1), frequency = 12)
head(tdf,12)
##       Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec
## 1970 3920 3920 3920 3980 3980 3980 4163 4163 4163 4163 4163 4163

Exploratory ARIMA

fit <- auto.arima(tdf)
fit
## Series: tdf 
## ARIMA(3,2,2)(0,0,2)[12] 
## 
## Coefficients:
##           ar1      ar2      ar3     ma1     ma2    sma1    sma2
##       -1.4200  -1.1652  -0.4699  0.7083  0.1996  0.3932  0.2108
## s.e.   0.1238   0.1091   0.0651  0.1287  0.0932  0.0431  0.0360
## 
## sigma^2 estimated as 992508:  log likelihood=-5033.59
## AIC=10083.18   AICc=10083.42   BIC=10118.42
accfit <-accuracy(fit) 
accfit
##                    ME     RMSE      MAE       MPE      MAPE      MASE
## Training set 5.960677 988.8338 611.1948 0.0156202 0.8465972 0.1014046
##                      ACF1
## Training set -0.005331425
pred_values <- forecast(fit, 12)
plot(pred_values, xlab = "Date", ylab = "Price/£",
     main = "ARIMA 12m Forecast for House price Index"
     )

plot(pred_values$residuals, xlab = "Date", ylab = "",
     main = "ARIMA Forecast Residuals"
     )

pred_values  %>% 
  
  kbl() %>% 
  kable_styling()
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
Aug 2020 238800.4 237523.6 240077.1 236847.7 240753.0
Sep 2020 242541.7 240459.5 244623.9 239357.3 245726.2
Oct 2020 243261.9 240325.6 246198.2 238771.2 247752.6
Nov 2020 243965.6 239857.4 248073.7 237682.7 250248.4
Dec 2020 246361.5 240964.4 251758.6 238107.3 254615.7
Jan 2021 248284.1 241619.3 254948.8 238091.2 258476.9
Feb 2021 248775.2 240645.2 256905.2 236341.4 261209.0
Mar 2021 252361.2 242677.7 262044.7 237551.6 267170.9
Apr 2021 252469.5 241193.9 263745.0 235225.0 269713.9
May 2021 254329.9 241352.3 267307.4 234482.4 274177.3
Jun 2021 258179.8 243408.7 272950.9 235589.3 280770.3
Jul 2021 260315.1 243704.3 276925.9 234911.0 285719.1
UK_Housing_Index <- window(tdf, start = tail(time(tdf), 67)[1])
ggseasonplot(UK_Housing_Index, year.labels = TRUE, col = rainbow(20))

qqnorm(fit$residuals)

Box.test(fit$residuals, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  fit$residuals
## X-squared = 0.017339, df = 1, p-value = 0.8952
fit$loglik
## [1] -5033.588
fit$sigma2
## [1] 992508.2

The data has a high p-value, so the autocorrelations not significantly different than 0. We apply a log transformation to force normality.

ltdf <- log(tdf)
head(ltdf,12)
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 1970 8.273847 8.273847 8.273847 8.289037 8.289037 8.289037 8.333991 8.333991
##           Sep      Oct      Nov      Dec
## 1970 8.333991 8.333991 8.333991 8.333991

Refitting log transformation with seasonal decomposition

fit2 <- stl(ltdf, s.window = "period")
plot(fit2, main = "Seasonal Decomposition of log(Units)")

ARIMA model

fit3 <- auto.arima(ltdf)
fitAccuracy <- data.frame(accuracy(fit))
fitAccuracy2 <- data.frame(accuracy(fit3))
fitAccuracyFinal <- rbind(fitAccuracy, fitAccuracy2)
fitAccuracyFinal %>% 
  kbl() %>% 
  kable_styling()
ME RMSE MAE MPE MAPE MASE ACF1
Training set 5.9606767 988.8338010 611.194813 0.0156202 0.8465972 0.1014046 -0.0053314
Training set1 0.0000203 0.0118251 0.007403 0.0002833 0.0693741 0.0800607 -0.0197401
qqnorm(fit3$residuals)

Box.test(fit3$residuals, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  fit3$residuals
## X-squared = 0.2377, df = 1, p-value = 0.6259
fit3$loglik
## [1] 1823.513
fit3$sigma2
## [1] 0.0001412298
plot(forecast(fit3, 12), xlab = "Date", ylab = "Units", main = "ARIMA Forecast for House price Index")

pred_values <- data.frame(forecast(fit, 12))
pred_values2 <- data.frame(forecast(fit3, 12))
pred_values2[,1:5] <- exp(pred_values2[,1:5])
mergedDF <- data.frame(Date = rownames(pred_values), Original_Data_Forecast = pred_values$Point.Forecast, Log_Transformed_Data_Forecast = pred_values2$Point.Forecast, Difference = round(pred_values$Point.Forecast - pred_values2$Point.Forecast, 2))
mergedDF
##        Date Original_Data_Forecast Log_Transformed_Data_Forecast Difference
## 1  Aug 2020               238800.4                      239599.2    -798.86
## 2  Sep 2020               242541.7                      243659.1   -1117.33
## 3  Oct 2020               243261.9                      244240.2    -978.31
## 4  Nov 2020               243965.6                      245553.1   -1587.55
## 5  Dec 2020               246361.5                      248680.4   -2318.91
## 6  Jan 2021               248284.1                      250335.2   -2051.12
## 7  Feb 2021               248775.2                      251413.8   -2638.55
## 8  Mar 2021               252361.2                      255828.3   -3467.11
## 9  Apr 2021               252469.5                      255850.1   -3380.66
## 10 May 2021               254329.9                      258257.9   -3927.98
## 11 Jun 2021               258179.8                      263032.1   -4852.29
## 12 Jul 2021               260315.1                      265320.2   -5005.15
write.csv(mergedDF,"Out.csv")
p1<-autoplot(ltdf)
p2<-autoplot(stl(ltdf, s.window="periodic", robust=TRUE))
p3<-autoplot(fit3)
p4<-ggtsdisplay(ltdf)
ggarrange(p1,p2,p3,p4)

UK_Housing_Index <- window(ltdf, start = tail(time(ltdf), 67)[1])
tfitAccuracy2 <- t(fitAccuracy2)
a<-autoplot(UK_Housing_Index) + geom_forecast(h=36)+
  
  ggtitle("ARIMA Average UK House Price 36 month Forecast")+xlab("Year") +ylab("log(Price/£)")+
  
  annotation_custom(tableGrob(tfitAccuracy2), xmin=2020, xmax=2024, ymin=12.2, ymax=12.3)
a

Bringing in the property type split.

# Detached
dfd<-df
names(dfd)[3] <- "Units2"
dfdUnits2 <- dfd$Units2
tdfd <- ts(dfdUnits2, start = c(1970, 1), frequency = 12)
ltdfd <- log(tdfd)
fit3d <- auto.arima(ltdfd)
subtsd<-(window(ltdfd, start = tail(time(tdfd), 67)[1]))
# Semi-
dfsd<-df
names(dfsd)[4] <- "Units3"
dfsdUnits3 <- dfsd$Units3
tdfsd <- ts(dfsdUnits3, start = c(1970, 1), frequency = 12)
ltdfsd <- log(tdfsd)
fit3sd <- auto.arima(ltdfsd)
subtsd<-(window(ltdfsd, start = tail(time(tdfsd), 67)[1]))
# Terraced
dft<-df
names(dft)[5] <- "Units4"
dftUnits4 <- dft$Units4
tdft <- ts(dftUnits4, start = c(1970, 1), frequency = 12)
ltdft <- log(tdft)
fit3t <- auto.arima(ltdft)
subtsd<-(window(ltdft, start = tail(time(tdft), 67)[1]))

# Charts

a2<-autoplot(UK_Housing_Index) + geom_forecast(h=36)+
  
  ggtitle("Average UK House Price ")+xlab("Year") +ylab("log(Price/£)")

b<-autoplot(subtsd) + geom_forecast(h=36)+
  
  ggtitle("Average UK Detached House Price")+xlab("Year") +ylab("log(Price/£)")

c<-autoplot(subtsd) + geom_forecast(h=36)+
  
  ggtitle("Average UK Semi-Detached House Price")+xlab("Year") +ylab("log(Price/£)")

d<-autoplot(subtsd) + geom_forecast(h=36)+
  
  ggtitle("Average UK Terraced House Price")+xlab("Year") +ylab("log(Price/£)")
figure<-ggarrange(a2, b, c,d, ncol = 2, nrow = 2)
annotate_figure(figure,
                top = text_grob("Average UK House Price ARIMA 36-month Forecast\n    ", face = "bold", size = 15))