# Load the packages used throughout the analysis
library(tidyverse)    # dplyr, tidyr, ggplot2
library(lubridate)    # ymd()
library(tseries)      # adf.test(), kpss.test()
library(forecast)     # auto.arima(), forecast(), checkresiduals()
library(modelsummary) # modelsummary()

# Pivot the sales data from wide to long format
test <- sales %>%
  pivot_longer(cols = 6:ncol(sales),
               names_to = "Time",
               values_to = "sales",
               values_drop_na = TRUE) %>%
  mutate(Month = ymd(Time)) %>%
  select(RegionName, Month, sales)
# Extract sales for Seattle
seattle_sales <- test %>%
  filter(RegionName == "Seattle, WA") %>%
  select(Month, sales)
seattle_sales %>%
  ggplot(aes(x = sales)) +
  geom_histogram(bins = 20, aes(y = after_stat(density))) +
  geom_density(col = "blue", lwd = 1) +
  ylab("Density") + xlab("Sales (count)") +
  ggtitle("Distribution of sales")
ggplot(seattle_sales, aes(x = Month, y = sales)) +
  geom_line(color = "blue") +
  xlab("Year") + ylab("Sales (count)") +
  ggtitle("Home sales through the years")
Upon visual inspection the series does not appear mean-stationary, although its variance looks roughly constant over time. The ADF and KPSS tests let us check this formally.
adf.test(seattle_sales$sales)
##
## Augmented Dickey-Fuller Test
##
## data: seattle_sales$sales
## Dickey-Fuller = -4.8812, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
The p-value is below 0.05, so we reject the ADF null hypothesis of a unit root, which points to stationarity.
kpss.test(seattle_sales$sales)
##
## KPSS Test for Level Stationarity
##
## data: seattle_sales$sales
## KPSS Level = 2.4902, Truncation lag parameter = 4, p-value = 0.01
Here too the p-value is below 0.05, but the KPSS null hypothesis is the reverse (level stationarity), so rejecting it indicates non-stationarity. The two tests therefore disagree.
We take the first-order difference of the series and rerun both tests.
seattle_diff <- seattle_sales %>%
  mutate(sales_diff = sales - lag(sales)) %>%
  filter(!is.na(sales_diff))

seattle_diff %>%
  ggplot() +
  geom_line(aes(Month, sales_diff), col = "blue") +
  ggtitle("Sales First Difference") +
  ylab("Sales (difference)")
adf.test(seattle_diff$sales_diff)
##
## Augmented Dickey-Fuller Test
##
## data: seattle_diff$sales_diff
## Dickey-Fuller = -8.8536, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(seattle_diff$sales_diff)
##
## KPSS Test for Level Stationarity
##
## data: seattle_diff$sales_diff
## KPSS Level = 0.013639, Truncation lag parameter = 4, p-value = 0.1
After taking the first-order difference the two tests agree: the ADF test rejects a unit root (p = 0.01) and the KPSS test does not reject level stationarity (p = 0.1), so we can treat the differenced series as stationary.
acf(seattle_diff$sales_diff, lag.max = 20, main = "Sales (first difference) ACF")
pacf(seattle_diff$sales_diff, lag.max = 20, main = "Sales (first difference) PACF")
The ACF and PACF of the differenced series suggest an autoregressive component of order at least 4 (p >= 4) together with a non-zero moving-average component (q > 0).
We now proceed with modelling the sales data; a quick manual fit of one such candidate order is sketched below, before the automated search.
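As a sanity check, not part of the automated search that follows, a candidate of the order read off the ACF/PACF can be fitted directly with forecast::Arima(); `manual_fit` is a hypothetical name and c(4, 1, 1) is only one plausible reading of the plots.
# Fit the order suggested by the ACF/PACF directly; the exhaustive
# auto.arima search below explores the order space more systematically
manual_fit <- Arima(seattle_sales$sales, order = c(4, 1, 1))
summary(manual_fit)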
# Exhaustive ARIMA order search (no stepwise shortcut), selected by AIC
ar_mod_aic <- auto.arima(seattle_sales$sales, stepwise = F, approximation = F,
                         max.p = 20, max.d = 20, max.q = 20, trace = T, ic = "aic")
##
## ARIMA(0,1,0) : 2677.872
## ARIMA(0,1,0) with drift : 2679.77
## ARIMA(0,1,1) : 2679.694
## ARIMA(0,1,1) with drift : 2681.595
## ARIMA(0,1,2) : 2680.58
## ARIMA(0,1,2) with drift : 2682.491
## ARIMA(0,1,3) : 2678.176
## ARIMA(0,1,3) with drift : 2680.106
## ARIMA(0,1,4) : 2663.666
## ARIMA(0,1,4) with drift : 2662.412
## ARIMA(0,1,5) : 2652.04
## ARIMA(0,1,5) with drift : 2649.99
## ARIMA(1,1,0) : 2679.672
## ARIMA(1,1,0) with drift : 2681.573
## ARIMA(1,1,1) : 2681.508
## ARIMA(1,1,1) with drift : 2683.412
## ARIMA(1,1,2) : 2681.974
## ARIMA(1,1,2) with drift : 2683.893
## ARIMA(1,1,3) : 2679.672
## ARIMA(1,1,3) with drift : 2681.597
## ARIMA(1,1,4) : 2655.839
## ARIMA(1,1,4) with drift : Inf
## ARIMA(2,1,0) : 2681.012
## ARIMA(2,1,0) with drift : 2682.921
## ARIMA(2,1,1) : 2682.966
## ARIMA(2,1,1) with drift : 2684.876
## ARIMA(2,1,2) : Inf
## ARIMA(2,1,2) with drift : Inf
## ARIMA(2,1,3) : Inf
## ARIMA(2,1,3) with drift : Inf
## ARIMA(3,1,0) : 2682.553
## ARIMA(3,1,0) with drift : 2684.47
## ARIMA(3,1,1) : 2683.218
## ARIMA(3,1,1) with drift : 2685.136
## ARIMA(3,1,2) : Inf
## ARIMA(3,1,2) with drift : Inf
## ARIMA(4,1,0) : 2673.199
## ARIMA(4,1,0) with drift : 2675.061
## ARIMA(4,1,1) : 2633.279
## ARIMA(4,1,1) with drift : 2631.546
## ARIMA(5,1,0) : 2657.442
## ARIMA(5,1,0) with drift : 2659.164
##
##
##
## Best model: ARIMA(4,1,1) with drift
With AIC as the selection criterion, the best model is an ARIMA(4,1,1) with drift.
We repeat the search with BIC as the selection criterion.
# Repeat the exhaustive search, this time selected by BIC
ar_mod_bic <- auto.arima(seattle_sales$sales, stepwise = F, approximation = F,
                         max.p = 20, max.d = 20, max.q = 20, trace = T, ic = "bic")
##
## ARIMA(0,1,0) : 2680.984
## ARIMA(0,1,0) with drift : 2685.994
## ARIMA(0,1,1) : 2685.918
## ARIMA(0,1,1) with drift : 2690.931
## ARIMA(0,1,2) : 2689.916
## ARIMA(0,1,2) with drift : 2694.939
## ARIMA(0,1,3) : 2690.624
## ARIMA(0,1,3) with drift : 2695.666
## ARIMA(0,1,4) : 2679.226
## ARIMA(0,1,4) with drift : 2681.084
## ARIMA(0,1,5) : 2670.712
## ARIMA(0,1,5) with drift : 2671.774
## ARIMA(1,1,0) : 2685.896
## ARIMA(1,1,0) with drift : 2690.909
## ARIMA(1,1,1) : 2690.844
## ARIMA(1,1,1) with drift : 2695.86
## ARIMA(1,1,2) : 2694.422
## ARIMA(1,1,2) with drift : 2699.453
## ARIMA(1,1,3) : 2695.232
## ARIMA(1,1,3) with drift : 2700.269
## ARIMA(1,1,4) : 2674.511
## ARIMA(1,1,4) with drift : Inf
## ARIMA(2,1,0) : 2690.348
## ARIMA(2,1,0) with drift : 2695.369
## ARIMA(2,1,1) : 2695.414
## ARIMA(2,1,1) with drift : 2700.436
## ARIMA(2,1,2) : Inf
## ARIMA(2,1,2) with drift : Inf
## ARIMA(2,1,3) : Inf
## ARIMA(2,1,3) with drift : Inf
## ARIMA(3,1,0) : 2695.001
## ARIMA(3,1,0) with drift : 2700.03
## ARIMA(3,1,1) : 2698.778
## ARIMA(3,1,1) with drift : 2703.808
## ARIMA(3,1,2) : Inf
## ARIMA(3,1,2) with drift : Inf
## ARIMA(4,1,0) : 2688.759
## ARIMA(4,1,0) with drift : 2693.732
## ARIMA(4,1,1) : 2651.951
## ARIMA(4,1,1) with drift : 2653.33
## ARIMA(5,1,0) : 2676.114
## ARIMA(5,1,0) with drift : 2680.948
##
##
##
## Best model: ARIMA(4,1,1)
With BIC as the selection criterion the best model is an ARIMA(4,1,1) without drift; BIC penalises the extra drift parameter more heavily than AIC does. A direct side-by-side comparison of the two fits is sketched below.
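To compare the two candidates numerically rather than by scanning the traces, the information criteria of both fitted objects can be tabulated; this is a small sketch using the generic AIC() and BIC() functions, which work on the Arima objects returned above.
# Side-by-side information criteria for the two selected models
data.frame(
  model = c("ARIMA(4,1,1) with drift", "ARIMA(4,1,1)"),
  AIC   = c(AIC(ar_mod_aic), AIC(ar_mod_bic)),
  BIC   = c(BIC(ar_mod_aic), BIC(ar_mod_bic))
)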
modelsummary(ar_mod_aic)
|          | Model 1  |
|----------|----------|
| ar1      | 0.665    |
|          | (0.075)  |
| ar2      | 0.068    |
|          | (0.088)  |
| ar3      | 0.025    |
|          | (0.088)  |
| ar4      | -0.397   |
|          | (0.073)  |
| ma1      | -0.877   |
|          | (0.038)  |
| drift    | 22.171   |
|          | (10.079) |
| Num.Obs. | 166      |
| AIC      | 2631.5   |
| BIC      | 2653.3   |
| RMSE     | 636.52   |
| x        | 0.861    |
modelsummary(ar_mod_bic)
|          | Model 1 |
|----------|---------|
| ar1      | 0.656   |
|          | (0.075) |
| ar2      | 0.069   |
|          | (0.088) |
| ar3      | 0.027   |
|          | (0.087) |
| ar4      | -0.396  |
|          | (0.072) |
| ma1      | -0.846  |
|          | (0.038) |
| Num.Obs. | 166     |
| AIC      | 2633.3  |
| BIC      | 2652.0  |
| RMSE     | 644.21  |
| x        | 0.860   |
We proceed with the AIC-selected model, ARIMA(4,1,1) with drift, as our preferred model; a rough check of the coefficient estimates is sketched below.
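As a quick, informal check that the drift term earns its place, approximate z-statistics can be computed from the fitted model's coefficient vector and covariance matrix. This is a sketch: coef() and the $var.coef component are standard parts of the Arima fit used above.
# Approximate z-statistics: estimate / standard error for each coefficient
round(coef(ar_mod_aic) / sqrt(diag(ar_mod_aic$var.coef)), 2)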
checkresiduals(ar_mod_aic)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,1,1) with drift
## Q* = 6.9708, df = 4, p-value = 0.1374
##
## Model df: 6. Total lags used: 10
From the residual diagnostics there is no obvious unmodelled structure, and the residuals appear approximately normally distributed. However, at lag 12 the residual ACF suggests some remaining autocorrelation, which we examine with Ljung-Box tests at each lag.
resid <- ar_mod_aic$residuals
# Ljung-Box test of residual autocorrelation at each lag from 1 to 12
for (i in 1:12) {
  print(Box.test(resid, type = "Ljung-Box", lag = i))
}
##
## Box-Ljung test
##
## data: resid
## X-squared = 0.89524, df = 1, p-value = 0.3441
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 1.3282, df = 2, p-value = 0.5147
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 1.8049, df = 3, p-value = 0.6139
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 3.8881, df = 4, p-value = 0.4214
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 3.914, df = 5, p-value = 0.5619
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 4.1442, df = 6, p-value = 0.6572
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 4.2928, df = 7, p-value = 0.7455
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 5.3994, df = 8, p-value = 0.7142
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 5.4962, df = 9, p-value = 0.7891
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 6.9708, df = 10, p-value = 0.7282
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 8.433, df = 11, p-value = 0.6741
##
##
## Box-Ljung test
##
## data: resid
## X-squared = 70.694, df = 12, p-value = 2.375e-10
Only at lag 12 does the Ljung-Box test reject the null of no autocorrelation (p = 2.4e-10); at every shorter lag the residuals show no significant autocorrelation. A more compact way to collect these p-values is sketched below.
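The loop above prints twelve full test summaries; the same information can be gathered into a single vector of p-values (a sketch reusing the resid object defined above).
# Ljung-Box p-values for lags 1 through 12 in one call
sapply(1:12, function(k) Box.test(resid, type = "Ljung-Box", lag = k)$p.value)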
# Fitted values are the observed values minus the model residuals
pred <- seattle_sales$sales - resid
plot1 <- ggplot() +
  geom_line(aes(seattle_sales$Month, seattle_sales$sales), color = "blue") +
  geom_line(aes(seattle_sales$Month, pred), color = "red", alpha = 0.6) +
  xlab("Year") + ylab("Sales (count)") +
  ggtitle("Observed vs fitted values")
plot1
The fitted values track the observed series closely, so the model captures most of the variation in the data.
RMSE <- sqrt(mean((pred - seattle_sales$sales)^2, na.rm = TRUE))
RMSE
## [1] 636.5157
The in-sample RMSE of roughly 636 indicates the typical size of the model's fitting error, in homes sold per month. The same measure can also be obtained directly from the forecast package, as sketched below.
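As a small sketch, accuracy() from the forecast package reports the training-set error measures (RMSE, MAE, and related statistics) for the fitted model in one call.
# In-sample accuracy measures for the fitted ARIMA model
accuracy(ar_mod_aic)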
ar_mod_aic %>%
  forecast(h = 5) %>%
  autoplot() +
  ylab("Sales (count)")
Upon visual inspection the forecast values fall within a reasonable range, and the 95% prediction intervals still point to a rise in Seattle home sales. The interval bounds can be read off numerically, as sketched below.
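To read the forecasts and interval bounds as numbers rather than off the plot, the components of the forecast object can be assembled into a data frame. This is a sketch: `fc` is a hypothetical name, and with the default levels the second columns of $lower and $upper hold the 95% bounds.
# 5-step-ahead point forecasts with 95% prediction intervals
fc <- forecast(ar_mod_aic, h = 5)
data.frame(mean    = as.numeric(fc$mean),
           lower95 = fc$lower[, 2],
           upper95 = fc$upper[, 2])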
As a baseline, we fit a simple linear trend model to compare with the ARIMA model.
lin_mod <- lm(sales ~ Month, data = seattle_sales)
modelsummary(lin_mod)
|             | Model 1   |
|-------------|-----------|
| (Intercept) | -9706.469 |
|             | (989.598) |
| Month       | 0.875     |
|             | (0.060)   |
| Num.Obs.    | 167       |
| R2          | 0.564     |
| R2 Adj.     | 0.561     |
| AIC         | 2827.5    |
| BIC         | 2836.8    |
| Log.Lik.    | -1410.728 |
| F           | 213.520   |
The linear trend captures the long-run upward drift but little of the month-to-month variation (R2 of about 0.56), and its AIC and BIC are substantially worse than the ARIMA model's.
seattle_sales %>%
  ggplot() +
  geom_line(aes(Month, sales), col = "blue") +
  geom_line(aes(Month, predict(lin_mod)), col = "green") +
  geom_line(aes(Month, pred), color = "red", alpha = 0.6) +
  ggtitle("Comparison with a Linear model") + xlab("Year") + ylab("Sales (count)")
The plot shows that the ARIMA model clearly outperforms the simple linear trend: the red fitted line follows the observed series closely, while the green linear fit tracks only the long-run trend. A quick numeric comparison of in-sample errors is sketched below.
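To put the visual comparison in numbers, the in-sample RMSE of the linear trend can be computed in the same way as for the ARIMA model (a sketch reusing the RMSE value computed above; `rmse_lin` is a hypothetical name).
# In-sample RMSE: linear trend vs ARIMA(4,1,1) with drift
rmse_lin <- sqrt(mean(residuals(lin_mod)^2))
c(linear = rmse_lin, arima = RMSE)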