Assignment02_14291371_WDI data ARIMA models

library(WDI)
library(ggplot2)
wdi_data = WDI(indicator = c('inf_mort' = "SP.DYN.IMRT.IN",
                             'gdpPercap'="NY.GDP.PCAP.KD",
                             'yrs_schooling'='BAR.SCHL.15UP'), # interest rate spread
               start = 1960, end = 2020,
               extra=TRUE) %>% 
  as_tibble()
united_states_data = wdi_data %>% 
  filter(country == 'United States')
united_states_data

Section-1

Verifying the stationarity

From the plot of infant mortality rate of United States, we can observe that the mortality rate timeseries is neither mean stationary nor variance stationary as the mortality rate is constantly decreasing over time. This is evident from the mortality rate plot.

ggplot(united_states_data, aes(x = year, y = inf_mort)) +geom_line() + labs(y = 'Infant Mortality Rate',x = 'Year') + ggtitle("Infant mortality rate across Years")

Since the mortality rate is continuously reducing overtime, we can be sure that variance is non-stationary.

Seasonality

From the mortality rate plots above, we can safely say that there is no seasonlaity in the mortality rate as it is pretty much a decreasing trend overtime.

Log Transformation

We are performing log transformation to try and remove the non-stationarity

US_log_diff <- united_states_data %>%
  mutate(
    infMort_log = log1p(inf_mort), # Log for Variance Stationarity
    infMort_diff = inf_mort - lag(inf_mort),
    infMort_log_diff = infMort_log - lag(infMort_log) # Difference for Mean Stationarity
  )%>%
  drop_na()
US_log_diff %>%
  ggplot() +
  geom_line(aes(year, infMort_log_diff)) +
  theme_bw() +
  ggtitle("Difference in Log Values, US Infant Mortality Rate over Time") +
  ylab("Mortality Rate (Difference))") +
  xlab("Year")

Augmented Dickey Fuller(ADF) and KPSS tests

We conduct ADF and KPSS tests provide mathematical proof of the presence of stationarity using the p-value.

adf.test(US_log_diff$inf_mort) #raw close value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  US_log_diff$inf_mort
## Dickey-Fuller = -2.962, Lag order = 2, p-value = 0.2059
## alternative hypothesis: stationary

adf.test(US_log_diff$infMort_diff) #first difference

## 
##  Augmented Dickey-Fuller Test
## 
## data:  US_log_diff$infMort_diff
## Dickey-Fuller = -31.873, Lag order = 2, p-value = 0.01
## alternative hypothesis: stationary

adf.test(US_log_diff$infMort_log) #log close value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  US_log_diff$infMort_log
## Dickey-Fuller = -0.43272, Lag order = 2, p-value = 0.9779
## alternative hypothesis: stationary

adf.test(US_log_diff$infMort_log_diff) #differenced log close value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  US_log_diff$infMort_log_diff
## Dickey-Fuller = -10.258, Lag order = 2, p-value = 0.01
## alternative hypothesis: stationary

kpss.test(US_log_diff$inf_mort) #raw close value

## 
##  KPSS Test for Level Stationarity
## 
## data:  US_log_diff$inf_mort
## KPSS Level = 0.4142, Truncation lag parameter = 2, p-value = 0.07103

kpss.test(US_log_diff$infMort_diff) #first difference

## 
##  KPSS Test for Level Stationarity
## 
## data:  US_log_diff$infMort_diff
## KPSS Level = 0.22978, Truncation lag parameter = 2, p-value = 0.1

kpss.test(US_log_diff$infMort_log) #log close value

## 
##  KPSS Test for Level Stationarity
## 
## data:  US_log_diff$infMort_log
## KPSS Level = 0.42179, Truncation lag parameter = 2, p-value = 0.06776

kpss.test(US_log_diff$infMort_log_diff) #differenced log close value

## 
##  KPSS Test for Level Stationarity
## 
## data:  US_log_diff$infMort_log_diff
## KPSS Level = 0.22044, Truncation lag parameter = 2, p-value = 0.1

From the KPSS tests we can see that the p-value in case of First difference and Difference Close Log value are greater than 0.05. We know that p>0.05 from KPSS test indicates stationarity.

ACF and PACF Plots

acf(US_log_diff$infMort_log_diff,lag.max=20)

pacf(US_log_diff$infMort_log_diff,lag.max=20)

From the ACF and PACF plots, we can say that the time-series appears to be neither autoregressive process nor a moving average process. We can run the Auto-Arima function to further conform this.

Section-2

ARIMA Models

BIC(
  arima(US_log_diff$infMort_log,order=c(0,0,0)),
  arima(US_log_diff$infMort_log,order=c(0,1,0)),
  arima(US_log_diff$infMort_log,order=c(1,0,0)),
  arima(US_log_diff$infMort_log,order=c(0,1,1)),
  arima(US_log_diff$infMort_log,order=c(0,1,2)),
  arima(US_log_diff$infMort_log,order=c(0,1,3)),
  arima(US_log_diff$infMort_log,order=c(1,1,0)),
  arima(US_log_diff$infMort_log,order=c(1,1,1)),
  arima(US_log_diff$infMort_log,order=c(2,1,0)),
  arima(US_log_diff$infMort_log,order=c(3,1,0))
)

AIC(
  arima(US_log_diff$infMort_log,order=c(0,0,0)),
  arima(US_log_diff$infMort_log,order=c(0,1,0)),
  arima(US_log_diff$infMort_log,order=c(1,0,0)),
  arima(US_log_diff$infMort_log,order=c(0,1,1)),
  arima(US_log_diff$infMort_log,order=c(0,1,2)),
  arima(US_log_diff$infMort_log,order=c(0,1,3)),
  arima(US_log_diff$infMort_log,order=c(1,1,0)),
  arima(US_log_diff$infMort_log,order=c(1,1,1)),
  arima(US_log_diff$infMort_log,order=c(2,1,0)),
  arima(US_log_diff$infMort_log,order=c(3,1,0))
)

auto.arima(US_log_diff$infMort_log,stationary=FALSE,allowdrift=FALSE,
seasonal=FALSE,stepwise=FALSE,approximation=FALSE)

## Series: US_log_diff$infMort_log 
## ARIMA(0,2,0) 
## 
## sigma^2 = 0.002006:  log likelihood = 11.81
## AIC=-21.63   AICc=-20.83   BIC=-21.68

Based on the AIC and BIC values, we can say that (0,1,0) of the non-differenced data gives the best model because of the lowest AIC and BIC values. This is further substantiated by using auto.arima on the data frame to obtain the suitable model.

Checking for Residuals

best_mod = arima(US_log_diff$infMort_log,order=c(0,1,0))
forecast::checkresiduals(best_mod)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 0.87878, df = 3, p-value = 0.8305
## 
## Model df: 0.   Total lags used: 3

We conduct Ljung-Box test to verify residual auto-correlation. Presence of residual auto-correlation indicates that we don’t have the best model. So to make sure that we have the best model. We plot the ACF and PACF for the residuals. We also make sure that p-value in Ljung-Box test is not less than 0.05 at the significant lags from ACF and PACF plots

Ljung-Box test

par(mfrow=c(1,2))
resid = best_mod$resid
acf(resid,lag.max=20)
pacf(resid,lag.max=20)

There are no significant lags from the ACF and PACF plots. Just to make sure, we still do the Ljung-Box test for the first five lags

Box.test(resid,type='Ljung-Box',lag=1)

## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 0.18677, df = 1, p-value = 0.6656

Box.test(resid,type='Ljung-Box',lag=2)

## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 0.71858, df = 2, p-value = 0.6982

Box.test(resid,type='Ljung-Box',lag=3)

## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 0.87878, df = 3, p-value = 0.8305

Box.test(resid,type='Ljung-Box',lag=4)

## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 0.8788, df = 4, p-value = 0.9276

Box.test(resid,type='Ljung-Box',lag=5)

## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 3.9453, df = 5, p-value = 0.5573

Box.test(resid,type='Ljung-Box',lag=10)

## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = NA, df = 10, p-value = NA

Box.test(resid,type='Ljung-Box',lag=20)

## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = NA, df = 20, p-value = NA

The p-values for all the lags are greater than 0.05 indicating that there is no residual auto-correlation.

Predicted vs Actual Model

best_mod = arima(US_log_diff$infMort_log,order=c(0,1,0))
resid = best_mod$residuals
pred = resid+US_log_diff$infMort_log
ggplot()+
  geom_line(aes(US_log_diff$year,US_log_diff$infMort_log))+
  geom_line(aes(US_log_diff$year,pred),color='blue',alpha=0.4)+
  theme_bw()+
  xlab("Year")+
  ylab("Log Infant Mortality Rate")

RMSE

RMSE = sqrt(mean((expm1(pred) - expm1(US_log_diff$infMort_log))^2,na.rm=T))
RMSE

## [1] 1.637295

We compute the Root Mean Squared Error and find it to be 1.785. The results suggest that our model predicts the infant mortality rate within ~1.785 points on average in-sample.

Forecast

best_mod %>%
  forecast(h=5) %>% 
  autoplot()

Assignment02_14291371_WDI data ARIMA models

Guna Shanmukha

Feb 05, 2022