1 Introduction

In this report, to generate and select a best model, several methods were utilised to first analyse a chosen dataset to get a better understanding of its trends and patterns before suitable models can be applied. These methods involve plotting the data against its time of observation, then carrrying out autocorrelation functions (ACF), partial autocorrelation functios (PACF), extended autocorrelation functions (EACF), differencing, normality and stationarity tests to produce a set of possible models. Akaike information criterion (AIC), bayesian information criterion (BIC) scores, other accuracy metrics as well as overfitting tests were then employed to evaluate and ensure that the chosen model was best suited and provided the most accurate performance with the chosen dataset.

2 Objectives

The aim of this report was to generate then select the best model using time series analysis and forecasting methods to accurately predict the next 10 years of the series.

3 Methodology

3.1 The Data

The dataset analysed in this report is publicly available through the website USA Today’s Tornado Archive and can be accessed through the link: https://data.usatoday.com/tornado-archive/united-states/ [1]. It is updated on a daily basis and was manually extracted then placed in a csv file, “US_tornado_data.csv”. The dataset represents the yearly number of tornadoes in the US from 1950 to present date. However, 2024’s data will be excluded and the analysis will cover only the years from 1850 to 2023, as its incompleteness may affect the model’s accuracies and inclusion of frequently updated data can create data inconsistencies and reduce reliability of model performance.

3.2 Data Cleaning

The data was checked for any missing values using the sum() and is.na() functions, from which there was none to ensure data integrity. Additionally, the ‘,’ in the column, ‘Number_of_Tornadoes’ was removed using gsub() function before it was converted to a numeric data type using as.numeric() function so that that data is in an appropriate format to work with.

3.3 R Packages and User Defined Functions

The R packages and user-defined functions used in this project are loaded below:

# Load the required packages
library(TSA)

## 
## Attaching package: 'TSA'

## The following objects are masked from 'package:stats':
## 
##     acf, arima

## The following object is masked from 'package:utils':
## 
##     tar

library(tseries)

## Warning: package 'tseries' was built under R version 4.2.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(forecast)

## Warning: package 'forecast' was built under R version 4.2.3

## Registered S3 methods overwritten by 'forecast':
##   method       from
##   fitted.Arima TSA 
##   plot.Arima   TSA

# Function to generate autocorrelation function (acf) and partial autocorrelation function (pacf) plots
acf_pacf_plot <- function(data, acf_title, pacf_title,title_size) {
  par(mfrow=c(1,2))
  acf(data, main = acf_title, cex.main = title_size) # Adjust title size to fit
  pacf(data, main = "", cex.main = title_size)
  title(main = pacf_title, cex.main = title_size) 
  par(mfrow=c(1,1))
}

# Function to sort the models from best to worse/ in ascending order
sort.score <- function(x, score = c("bic", "aic")){
  if (score == "aic"){
    x[with(x, order(AIC)),]
  } else if (score == "bic") {
    x[with(x, order(BIC)),]
  } else {
    warning('score = "x" only accepts valid arguments ("aic","bic")')
  }
}

# LBQPlot function from FitAR
LBQPlot <-
function(res, lag.max=30, StartLag=k+1, k=0, SquaredQ=FALSE){
stopifnot(k>=0, lag.max-StartLag>0, length(res)>lag.max)
ans<-LjungBoxTest(res, k=k, lag.max=lag.max, StartLag=StartLag, SquaredQ=FALSE)
m <- ans[,"m"]
pv <- ans[,"pvalue"]
plot(m, pv, xlab="lag", ylab="p-value", ylim=c(0,1), main="Ljung-Box Test")
abline(h=0.05, col="red", lty=2)
}

# LjungBoxTest function from FitAR
LjungBoxTest <-
function(res, k=0, lag.max=30, StartLag=1, SquaredQ=FALSE){
stopifnot(k>=0, StartLag>=1, lag.max>=StartLag)
n<-length(res)
L0<-StartLag
if (SquaredQ) {
    z<-(res-mean(res))^2
    kpar<-0
    }
else {
    z<-res
    kpar<-k
}
ra<-(acf(z, lag.max=lag.max, plot=FALSE)$acf)[-1]
lags<-L0:lag.max
QQ<-n*(n+2)*cumsum((ra^2)/(n-(1:lag.max)))[lags]
df <- ifelse(lags-kpar > 0, lags-kpar, 1)
pv<-1-pchisq(QQ,df)
QQ<-round(QQ,2)
a<-matrix(c(lags,QQ,pv),ncol=3)
dimnames(a)<-list(rep("",length(QQ)),c("m","Qm", "pvalue"))
a
}

# Function to perform residual analysis for trend models
trend.residual.analysis <- function(model, std = TRUE, start = 2){
  res.tModel <- rstudent(model)
  par(mfrow=c(3,2))
  plot(res.tModel,type='o',ylab='Standardised residuals', main="Time series plot of standardised residuals")
  abline(h=0)
  hist(res.tModel,main="Histogram of standardised residuals")
  qqnorm(res.tModel,main="QQ plot of standardised residuals")
  qqline(res.tModel, col = 2)
  acf(res.tModel,main="ACF of standardised residuals")
  pacf(res.tModel, main = "PACF of standardised residuals")
  print(shapiro.test(res.tModel))
  par(mfrow=c(1,1))
}

# Function to perform residual analysis
residual.analysis <- function(model, std = TRUE, start = 2, class = c("ARIMA","GARCH","ARMA-GARCH", "fGARCH")[1]){
  library(TSA)
  #library(FitAR)
  if (class == "ARIMA"){
    if (std == TRUE){
      res.model = rstandard(model)
    }else{
      res.model = residuals(model)
    }
  }else if (class == "GARCH"){
    res.model = model$residuals[start:model$n.used]
  }else if (class == "ARMA-GARCH"){
    res.model = model@fit$residuals
  }else if (class == "fGARCH"){
    res.model = model@residuals
  }else {
    stop("The argument 'class' must be either 'ARIMA' or 'GARCH' ")
  }
  par(mfrow=c(3,2))
  plot(res.model,type='o',ylab='Standardised residuals', main="Time series plot of standardised residuals")
  abline(h=0)
  hist(res.model,main="Histogram of standardised residuals")
  qqnorm(res.model,main="QQ plot of standardised residuals")
  qqline(res.model, col = 2)
  acf(res.model,main="ACF of standardised residuals")
  print(shapiro.test(res.model))
  k=0
  LBQPlot(res.model, lag.max = 30, StartLag = k + 1, k = 0, SquaredQ = FALSE)
  par(mfrow=c(1,1))
}

4 Results

4.1 Data Analysis

The first variable, ‘Year’ covered the years from 1850 to 2023 and acted as the unique index, with the second column, ‘Number_of_Tornadoes’ containing all of the actual observations. Investigating the data further showed that it contained no missing values and the ‘Number_of_Tornadoes’ variable having a median of 990 indicated that there were no negative values within the data. Moreover, the maximum value being 2074 and the minimum value being 49, suggests the data does not contain any zero values.

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    49.0   737.5   990.0  1021.6  1263.5  2074.0

The dataset was then converted into a time series object before the data was plotted against its time of observation to help gain a better understanding of the data’s characteristics. Analysing Figure 1 there appears to be no seasonality since there are no repeated patterns. The data points first appears to sharply move upwards and steadily downwards, before changing direction to an increasing trend during the 1960s, then consistently wander in a decreasing trend over time. In terms of changing variances, there are several fluctuations that get considerably wider and smaller over time. In addition, both moving average (MA) and auto regressive (AR) behaviour are present as suggested by the occasional up, down movements of the data points and gradual shift in the trend with successive points.

Figure 1

The scatter plot in Figure 2 shows the relationship between the current and previous yearly number of tornadoes in the US, whereby the Pearson’s correlation coefficient being 0.56, indicates that the current yearly number of tornadoes is moderately, positively correlated to the previous yearly number of tornadoes.

## Pearson's Correlation Coefficient:  0.5605586

Figure 2

Looking at the acf plot in Figure 3, the slow decaying pattern may indicate the presence of a trend in the data and the lack of a wave pattern, suggests there is no seasonality in the series. For the PACF plot, there are significant correlations only at the first and second lag, which supports the implications of a trend and suggests an autoregressive term of order 1 or 2. Since the dataset does not demonstrate seasonality from descriptive analysis, the most relevant models to apply were the basic regression, linear and quadratic as well as ARIMA models. Thus, cubic regression and SARIMA models will not be attempted [2].

Figure 3

ADF Test Assumptions:
H0 : The process is difference non-stationary
HA : The process is stationary

Shapiro-Wilk Test Assumptions:
H0 : The given series is normally distributed
HA : The given series is not normally distributed

The p-value obtained from the ADF unit root test being 0.01668 is less than the chosen significant value, 0.05, hence the null hypothesis is rejected and data is considered stationary. Whilst the p-value from the Shapiro-Wilk test being 0.7313, is greater than 0.05, indicating failure to reject the null hypothesis. Hence, both tests suggests that the data exhibits normality and stationarity. Furthermore, the data points in the QQ plot (Figure 4) is positively correlated and support the assumption of a normally distributed data, as it closely resembles a straight line with some departures in the top right corner. As a result, transformation will not be necessary to stabilise the variance across time as the series is already stationary, however, differencing will need to be applied to remove trends.

## 
##  Augmented Dickey-Fuller Test
## 
## data:  US_tornado_data_ts
## Dickey-Fuller = -4.062, Lag order = 4, p-value = 0.01169
## alternative hypothesis: stationary

## 
##  Shapiro-Wilk normality test
## 
## data:  US_tornado_data_ts
## W = 0.98677, p-value = 0.6366

Figure 4

4.2 Trend Models

4.2.1 Linear Regression

The R-squared value of the linear model is 0.4534, meaning that approximately 45.34% of the variability in the yearly number of tornadoes can be explained by the linear model. When observing the p-value = 2.969e-11, it is shown to be less than the significance level 0.05, thus we reject the the null hypothesis and it can be said that the model is statistically significant and fits the data. In which this fit was visualized by plotting the linear trend line (dotted red line) over the yearly number of tornadoes in the US time series and can be seen by referring to figure 5.

## 
## Call:
## lm(formula = US_tornado_data_ts ~ t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1437.55  -146.63    -7.55   134.98   749.22 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25751.132   3150.597   8.173 7.28e-12 ***
## t             -12.443      1.586  -7.846 2.97e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 291.4 on 72 degrees of freedom
## Multiple R-squared:  0.4609, Adjusted R-squared:  0.4534 
## F-statistic: 61.56 on 1 and 72 DF,  p-value: 2.969e-11

Figure 5

Inspecting the Shapiro-Wilk test on the residuals of the linear trend model, the obtained p-value = 1.874e-08 is less than the significance level, which implies that the residuals are not normally distributed. This assumption is supported by the time series plot of standardised residuals as it depicts a sharp rise before the data points fluctuate up and down throughout the series then slowly move downwards towards the end. The histogram of standardised residuals is positively skewed and there are no significant lags in both the ACF and PACF plot of standardised residuals. In addition, the data points in the QQ plot appear to demonstrate a reasonably straight line, however, it does contain straying data points at the tails of the reference line. Therefore, it can be seen that the residuals are not normally distributed and a variance of 45.34% suggests that this model is not a good enough fit as it does not effectively capture the trend in the series.

## 
##  Shapiro-Wilk normality test
## 
## data:  res.tModel
## W = 0.80656, p-value = 1.874e-08

4.2.2 Quadratic Regression

When testing the quadratic models, the R-squared value achieved here is 0.4835 which is slightly better than the linear model’s. Albeit, it is still very low and that only 48.35% of the variation in the the yearly number of tornadoes can be explained by the quadratic model. The p-value here being 2.437e-11 is once again less than the significance level 0.05, hence the null hypothesis is rejected and we conclude that the model is statistically significant. This model’s fit was then visualized in figure 6 and is indicated as the dotted red line over the original yearly number of tornadoes in the US time series.

## 
## Call:
## lm(formula = US_tornado_data_ts ~ t + t2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1276.54  -146.67    -2.91   117.34   766.86 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -6.995e+05  3.185e+05  -2.196   0.0314 *
## t            7.178e+02  3.207e+02   2.238   0.0283 *
## t2          -1.838e-01  8.072e-02  -2.277   0.0258 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 283.3 on 71 degrees of freedom
## Multiple R-squared:  0.4976, Adjusted R-squared:  0.4835 
## F-statistic: 35.16 on 2 and 71 DF,  p-value: 2.437e-11

Figure 6

Analysing the quadratic model’s Shapiro-Wilk test on the residuals yields a p-value of 3.058e-07, indicating that the residuals are not normally distributed here as well. Scrutinising the overall plots of the standardised residuals demonstrates very similar results to the linear model’s, however, the quadratic plot does have a slightly better fit. In which, the residuals of the time series plot is closer to the reference line, with the ACF and PACF still displaying no significant lags. Whilst the histogram is skewed in the right direction, it does appear to be a bit more balanced compared to the linear model’s histogram of the standardised residuals. Despite this, the R-squared value produced here was 0.4835, which is not sufficient enough, thus the quadratic model is also not the best fit for this chosen dataset and will not be considered for forecasting.

## 
##  Shapiro-Wilk normality test
## 
## data:  res.tModel
## W = 0.84723, p-value = 3.058e-07

4.3 ARIMA Models

4.3.1 Differencing

Before attempting ARIMA models, as mentioned before, differencing is first required to help stabilise the mean by removing changes in the level of a time series. After first differencing was implemented, Figure 7 clearly shows that a flatter mean level has been attain compared to the data distribution in Figure 1.

Figure 7

Now looking at Figure 8, the QQ plot showed improvement as the tail at the bottom left depicts the data points being more aligned with the straight line compared to Figure 4, despite the data points deviating at the tail in the top right corner. Thus, the data is also more normally distributed than before.

Figure 8

ADF Test Assumptions:
H0 : The process is difference non-stationary
HA : The process is stationary

The p-value from the ADF unit root test is 0.01, therefore less than the significant value, 0.05 and so the null hypothesis is rejected. This means that the trend in the data has been removed and the data has been altered to a stationary series through first differencing. Looking at the Phillips Perron (PP) test, the p-value of 0.01 is less than 0.05, hence providing strong evidence to reject the null hypothesis in support of the assumption that the data has reached stationarity.

## Warning in adf.test(diff_data): p-value smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff_data
## Dickey-Fuller = -6.4332, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

## Warning in pp.test(diff_data): p-value smaller than printed p-value

## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff_data
## Dickey-Fuller Z(alpha) = -89.051, Truncation lag parameter = 3, p-value
## = 0.01
## alternative hypothesis: stationary

4.4 ARIMA Model Specification

The raw data initially contained trends and was non-stationary, therefore first differencing was applied to remove the trends and convert the data into a stationary series. Now, appropriate model specification tools can be implemented to produce a set of possible ARIMA models by first finding the values of ‘p’ and ‘q’ which represent the order of an AR and MA model, respectively.

4.4.1 ACF and PACF Plots

Referring to the ACF plot in Figure 9, there is a clear significant lag and a second, less distinct lag that may also be considered, hence, MA(1) and MA(2) models were taken for potential ‘q’ values. Looking at the PACF plot, there is only one obvious significant lag, thus AR(1) model was considered for a possible ‘p’ value. Since first difference was applied, ‘d’ will be 1. As a result, two sets of ARIMA models were derived from analyzing the ACF and PACF plots, that is; {ARIMA(1,1,1) and ARIMA(1,1,2)}.

Figure 9

4.4.2 EACF (Extended Autocorrelation Function) Test

Now using another approach to search for likely values for ‘p’ and ‘q’, the set of potential models obtained from EACF are; {ARIMA(0,1,7), ARIMA(0,1,8), ARIMA(1,1,6), ARIMA(1,1,7) and ARIMA(2,1,6)}. These models were derived from selecting the smallest pairs of consistently significant AR and MA orders in a region that avoided all the crosses, in which the rows correspond to different AR orders and the columns indicate different MA orders.

## AR/MA
##   0 1 2 3 4 5 6 7 8
## 0 x o o o x o o o o
## 1 x o o o o o o o o
## 2 x x o o o o o o o
## 3 o x o o o o o o o
## 4 o x x x x o o o o
## 5 x o o o o o o o o
## 6 x x o o o o o o o
## 7 o o o o o o o o o
## 8 x x o o o o o o o

4.4.3 BIC Table

Probable models were also extracted using a BIC table, whereby the models were selected based on the lowest BIC value observed, that is AR(1), AR(2) for p values and MA(1), MA(3) and MA(5) for q values. Thus, the set of possible models created based on those orders were; {ARIMA(1,1,1), ARIMA(1,1,3), ARIMA(1,1,5), ARIMA(2,1,1), ARIMA(2,1,3) and ARIMA(2,1,5)}.

From this, the final set of possible candidate models chosen for parameter estimation were:

ARIMA(0,1,7)
ARIMA(0,1,8)
ARIMA(1,1,1)
ARIMA(1,1,2)
ARIMA(1,1,3)
ARIMA(1,1,5)
ARIMA(1,1,6)
ARIMA(1,1,7)
ARIMA(2,1,1)
ARIMA(2,1,3)
ARIMA(2,1,5)
ARIMA(2,1,6)

4.5 ARIMA Model Parameter Estimation

To estimate the parameters and test the significance of the models at a significance level of 5%, maximum likelihood (ML) and conditional sum of squares (CSS) methods will be used to fit each of the models from the candidate set.

4.5.1 ARIMA(0,1,7)

The coefficients, ma1 and ma5, were found to be statistically significant at a level of 5% according to both CSS and ML techniques. Additionally, ML also indicates ma6 to be significant, whereas CSS does not. Both methods agree that ma2, ma3, ma4 and ma7 were not statistically significant.

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ma1 -0.441597   0.115831 -3.8124 0.0001376 ***
## ma2  0.031563   0.125392  0.2517 0.8012632    
## ma3 -0.025897   0.123954 -0.2089 0.8345047    
## ma4  0.068909   0.125082  0.5509 0.5816931    
## ma5 -0.334791   0.146675 -2.2825 0.0224574 *  
## ma6  0.147218   0.115372  1.2760 0.2019448    
## ma7  0.123488   0.130899  0.9434 0.3454840    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ma1 -0.792021   0.134221 -5.9009 3.616e-09 ***
## ma2  0.188077   0.257848  0.7294   0.46575    
## ma3 -0.024633   0.136063 -0.1810   0.85634    
## ma4 -0.049814   0.256890 -0.1939   0.84625    
## ma5 -0.560209   0.220421 -2.5415   0.01104 *  
## ma6  0.445245   0.200207  2.2239   0.02615 *  
## ma7  0.211479   0.169182  1.2500   0.21130    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.2 ARIMA(0,1,8)

Much like ARIMA(0,1,7), CSS and ML yielded the same results and regarded ma1 and ma5 to be statistically significant, with only ML once again including ma6 as significant as well.

## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ma1 -0.4413670  0.1186180 -3.7209 0.0001985 ***
## ma2  0.0318956  0.1286967  0.2478 0.8042619    
## ma3 -0.0264110  0.1324684 -0.1994 0.8419685    
## ma4  0.0685266  0.1305876  0.5248 0.5997530    
## ma5 -0.3348142  0.1467056 -2.2822 0.0224764 *  
## ma6  0.1470303  0.1167422  1.2594 0.2078697    
## ma7  0.1232869  0.1316787  0.9363 0.3491337    
## ma8  0.0014973  0.1365195  0.0110 0.9912495    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ma1 -0.835208   0.155782 -5.3614 8.258e-08 ***
## ma2  0.152112   0.210316  0.7233  0.469523    
## ma3  0.028420   0.162906  0.1745  0.861506    
## ma4 -0.032226   0.213632 -0.1508  0.880096    
## ma5 -0.569052   0.185139 -3.0736  0.002115 ** 
## ma6  0.502502   0.227883  2.2051  0.027448 *  
## ma7  0.240875   0.168636  1.4284  0.153185    
## ma8 -0.106214   0.207756 -0.5112  0.609180    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.3 ARIMA(1,1,1)

Here, both CSS and ML suggest only the coefficient ma1 to be statistically significant at a 5% level and not ar1.

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)    
## ar1 -0.036472   0.089750 -0.4064   0.6845    
## ma1 -0.746289   0.079380 -9.4015   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1 -0.010610   0.169356 -0.0626      0.95    
## ma1 -0.763030   0.099238 -7.6889 1.484e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.4 ARIMA(1,1,2)

According to the p-values, CSS indicates only ma1 to be statistically significant, whilst ML considers ar1 and ma2 but not ma1.

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1 -0.059994   0.126173 -0.4755    0.6344    
## ma1 -0.702698   0.175298 -4.0086 6.108e-05 ***
## ma2 -0.042926   0.152530 -0.2814    0.7784    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error  z value  Pr(>|z|)    
## ar1 -0.939155   0.088772 -10.5794 < 2.2e-16 ***
## ma1  0.113064   0.131636   0.8589    0.3904    
## ma2 -0.637898   0.115627  -5.5168 3.452e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.5 ARIMA(1,1,3)

CSS found only the coefficient ma1 to be statistically significant and not ar1, ma2 and ma3. Whereas, ML regarded ar1 and ma2 to be statistically significant, but not ma1 and ma3.

## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ar1 -0.0595386  0.1262054 -0.4718    0.6371    
## ma1 -0.7047019  0.1763349 -3.9964 6.432e-05 ***
## ma2 -0.0490782  0.1674366 -0.2931    0.7694    
## ma3  0.0089451  0.1119564  0.0799    0.9363    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error  z value  Pr(>|z|)    
## ar1 -0.939470   0.083495 -11.2518 < 2.2e-16 ***
## ma1  0.227415   0.173216   1.3129    0.1892    
## ma2 -0.638290   0.105789  -6.0336 1.603e-09 ***
## ma3 -0.140473   0.148623  -0.9452    0.3446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.6 ARIMA(1,1,5)

Looking at CSS, it shows only the coefficient ma1 to be statistically significant and the remaining other coefficients to be insignificant. On the other hand, ML suggests ar1 and ma2. Thus, this model does not appear to fit suitably well with the data.

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1 -0.043777   0.117620 -0.3722    0.7098    
## ma1 -0.738288   0.176957 -4.1721 3.018e-05 ***
## ma2 -0.093584   0.195956 -0.4776    0.6330    
## ma3 -0.023100   0.125117 -0.1846    0.8535    
## ma4  0.112062   0.191176  0.5862    0.5578    
## ma5  0.023758   0.122686  0.1936    0.8465    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1 -0.930031   0.099505 -9.3466 < 2.2e-16 ***
## ma1  0.201036   0.168602  1.1924  0.233115    
## ma2 -0.716467   0.224321 -3.1939  0.001403 ** 
## ma3 -0.095782   0.207029 -0.4627  0.643615    
## ma4  0.116291   0.194426  0.5981  0.549755    
## ma5 -0.041325   0.230407 -0.1794  0.857657    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.7 ARIMA(1,1,6)

Both approaches, CSS and ML show ma1, ma5 and ma6 along with ML also including ma2 to be statistically significant at a 5% level. Hence, this model exudes similar results to previous models that had regarded ma1, ma5 and ma6 as statistically significant as well.

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1 -0.021926   0.095489 -0.2296  0.818386    
## ma1 -0.860963   0.120643 -7.1365 9.576e-13 ***
## ma2  0.195985   0.165937  1.1811  0.237570    
## ma3  0.056103   0.170189  0.3297  0.741663    
## ma4 -0.079407   0.171240 -0.4637  0.642853    
## ma5 -0.385431   0.154078 -2.5015  0.012366 *  
## ma6  0.441097   0.115335  3.8245  0.000131 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1  0.226772   0.231242  0.9807  0.326757    
## ma1 -1.052672   0.170635 -6.1691 6.866e-10 ***
## ma2  0.423002   0.200393  2.1109  0.034785 *  
## ma3 -0.125110   0.140490 -0.8905  0.373184    
## ma4  0.021672   0.198155  0.1094  0.912911    
## ma5 -0.600888   0.207892 -2.8904  0.003848 ** 
## ma6  0.650808   0.134559  4.8366 1.321e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.8 ARIMA(1,1,7)

In CSS, the p-values indicate ma1, ma5 and ma7 to be significant, in which ma1 is the more statistically significant than the other two coefficients. Whereas, ML only found ma5 to be significant.

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1 -0.094943   0.118452 -0.8015   0.42282    
## ma1 -0.692851   0.153001 -4.5284 5.943e-06 ***
## ma2  0.083311   0.172056  0.4842   0.62824    
## ma3  0.047939   0.142232  0.3370   0.73608    
## ma4 -0.086143   0.132819 -0.6486   0.51661    
## ma5 -0.326832   0.137351 -2.3795   0.01733 *  
## ma6  0.198911   0.166242  1.1965   0.23149    
## ma7  0.261016   0.135083  1.9323   0.05333 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)   
## ar1 -0.169857   0.691770 -0.2455 0.806038   
## ma1 -0.632842   0.655621 -0.9653 0.334416   
## ma2  0.035927   0.642519  0.0559 0.955408   
## ma3  0.023237   0.217205  0.1070 0.914803   
## ma4 -0.044195   0.197955 -0.2233 0.823337   
## ma5 -0.569965   0.190947 -2.9849 0.002836 **
## ma6  0.365185   0.350091  1.0431 0.296896   
## ma7  0.299467   0.376407  0.7956 0.426270   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.9 ARIMA(2,1,1)

Here, CSS and ML regarded only ma1 as statistically significant and ar1 and ar2 to be insignificant when looking at their p-values.

## 
## z test of coefficients:
## 
##        Estimate  Std. Error z value Pr(>|z|)    
## ar1 -0.00040153  0.15014731 -0.0027   0.9979    
## ar2  0.01304271  0.08961243  0.1455   0.8843    
## ma1 -0.76359002  0.09202893 -8.2973   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##        Estimate  Std. Error z value  Pr(>|z|)    
## ar1 -0.00070653  0.17876809 -0.0040    0.9968    
## ar2  0.02666577  0.16317169  0.1634    0.8702    
## ma1 -0.77253385  0.11132445 -6.9395 3.935e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.10 ARIMA(2,1,3)

According to CSS method, only ar1 and ma2 are found to be statistically significant. Whereas in ML, all coefficients were regarded as statistically significant, indicating this model has potential to fit better to the data than the other models.

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1 -0.751020   0.189622 -3.9606 7.476e-05 ***
## ar2  0.132493   0.125073  1.0593    0.2895    
## ma1  0.045507   0.206981  0.2199    0.8260    
## ma2 -0.720451   0.144642 -4.9809 6.328e-07 ***
## ma3  0.089143   0.168731  0.5283    0.5973    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##     Estimate Std. Error z value  Pr(>|z|)    
## ar1 -1.70912    0.23122 -7.3918 1.449e-13 ***
## ar2 -0.73612    0.22884 -3.2167 0.0012967 ** 
## ma1  0.97628    0.20238  4.8240 1.407e-06 ***
## ma2 -0.52832    0.14888 -3.5486 0.0003872 ***
## ma3 -0.62846    0.14843 -4.2340 2.295e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.11 ARIMA(2,1,5)

Both CSS and ML agree that ar1, ar2 are statistically significant, in which CSS also indicated ma2 but not ma1 and ma3, whilst ML suggested ma1 and ma3 but not ma2 to be statistically significant. Ma4 and ma5 were considered by both methods to be insignificant.

## 
## z test of coefficients:
## 
##      Estimate Std. Error  z value  Pr(>|z|)    
## ar1 -0.763830   0.015947 -47.8973 < 2.2e-16 ***
## ar2  0.161606   0.005034  32.1032 < 2.2e-16 ***
## ma1  0.116053   0.109715   1.0578    0.2902    
## ma2 -0.884209   0.117271  -7.5399 4.704e-14 ***
## ma3  0.078133   0.154756   0.5049    0.6136    
## ma4  0.044971   0.145529   0.3090    0.7573    
## ma5  0.155744   0.130795   1.1908    0.2338    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1 -1.217470   0.127694 -9.5343 < 2.2e-16 ***
## ar2 -0.785388   0.108448 -7.2421 4.419e-13 ***
## ma1  0.522626   0.194013  2.6938  0.007065 ** 
## ma2 -0.094259   0.195772 -0.4815  0.630179    
## ma3 -0.652244   0.163578 -3.9873 6.682e-05 ***
## ma4  0.092219   0.151933  0.6070  0.543868    
## ma5 -0.136337   0.185964 -0.7331  0.463475    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5.12 ARIMA(2,1,6)

According to the p-values, both models are unanimous that ma1, ma2, ma5 and ma6 are significant, thus being consistent with the pattern observed from previous models.

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1  0.279897   0.217305  1.2880   0.19773    
## ar2 -0.042714   0.074911 -0.5702   0.56854    
## ma1 -1.103573   0.190605 -5.7898 7.046e-09 ***
## ma2  0.444160   0.228446  1.9443   0.05186 .  
## ma3 -0.062602   0.205614 -0.3045   0.76077    
## ma4 -0.069096   0.190384 -0.3629   0.71666    
## ma5 -0.332389   0.168960 -1.9673   0.04915 *  
## ma6  0.444634   0.106932  4.1581 3.209e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1  0.334259   0.256522  1.3030  0.192560    
## ar2 -0.285946   0.218898 -1.3063  0.191451    
## ma1 -1.151404   0.227252 -5.0666 4.049e-07 ***
## ma2  0.694944   0.346198  2.0074  0.044711 *  
## ma3 -0.287298   0.242081 -1.1868  0.235313    
## ma4  0.010612   0.186384  0.0569  0.954595    
## ma5 -0.474141   0.249511 -1.9003  0.057397 .  
## ma6  0.567449   0.174238  3.2567  0.001127 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overall, CSS and ML strongly suggests the coefficients ma1 and ma5 fit better to the data, however, ML also considered ar1 and ma2. From which, all of these coefficients were often regarded statistically significant more so than the other coefficients.

4.5.13 AIC Scores

The models are then ranked in ascending order from best to worse by their AIC and BIC scores to help determine which ARIMA model from the set of twelve performs best with the chosen dataset. For the models fitted using ML technique this method of evaluation was utilised, in which AIC suggests model_017_ml then model_116_ml to be the best fitted to the dataset, whereas, BIC indicates an entirely different model, that is, model_111_ml then model_112_ml. Thus, an in depth exploration into each model’s measures of accuracy may help to decide which ARIMA model is really most suitable.

##              df      AIC
## model_017_ml  8 1041.649
## model_116_ml  8 1042.213
## model_216_ml  9 1042.650
## model_018_ml  9 1043.403
## model_117_ml  9 1043.576
## model_111_ml  3 1045.795
## model_112_ml  4 1046.098
## model_113_ml  5 1047.214
## model_213_ml  6 1047.597
## model_211_ml  4 1047.768
## model_215_ml  8 1049.759
## model_115_ml  7 1049.978

4.5.14 BIC Scores

##              df      BIC
## model_111_ml  3 1052.666
## model_112_ml  4 1055.260
## model_211_ml  4 1056.930
## model_113_ml  5 1058.666
## model_017_ml  8 1059.973
## model_116_ml  8 1060.537
## model_213_ml  6 1061.340
## model_216_ml  9 1063.264
## model_018_ml  9 1064.017
## model_117_ml  9 1064.190
## model_115_ml  7 1066.011
## model_215_ml  8 1068.083

4.6 ARIMA Model Evaluation

Measures of accuracy were also used to evaluate models fitted using CSS in terms of mean error (ME), root mean squared error (RMSE), mean absolute error (MAE), mean percentage error (MPE), mean absolute percentage error (MAPE), mean absolute scaled error (MASE) and the first-order autocorrelation coefficient (ACF1). Whereby, according to the outputs, ARIMA(1,1,7) produced the best results in ‘RMSE’ (230.712), ‘MAE’ (174.919), ‘MAPE’ (19.754) and ‘MASE’ (0.700) as lowest values and those closer to zero would indicate a better overall fit and therefore, a good model. Hence, residual analysis was carried out on both ARIMA(0,1,7) fitted using ML, and the ARIMA(1,1,7) model fitted using the CSS method.

##                   ME    RMSE     MAE     MPE   MAPE  MASE   ACF1
## ARIMA(0,1,7)   6.839 309.646 214.379  -5.069 21.107 0.848 -0.001
## ARIMA(0,1,8)   6.822 309.645 214.399  -5.068 21.108 0.848 -0.001
## ARIMA(1,1,1) -53.790 248.614 192.440 -12.193 21.779 0.761 -0.026
## ARIMA(1,1,2) -52.733 248.477 190.860 -12.123 21.632 0.755 -0.045
## ARIMA(1,1,3) -52.672 248.466 191.069 -12.120 21.654 0.756 -0.044
## ARIMA(1,1,5) -50.088 247.310 186.061 -11.770 21.185 0.736 -0.035
## ARIMA(1,1,6) -40.651 235.851 182.797  -9.416 19.818 0.723  0.044
## ARIMA(1,1,7) -30.449 230.602 174.475  -8.017 18.667 0.690 -0.025
## ARIMA(2,1,1) -52.129 248.512 190.478 -12.080 21.599 0.753 -0.041
## ARIMA(2,1,3) -56.612 242.786 186.043 -12.050 21.154 0.736 -0.050
## ARIMA(2,1,5) -49.027 227.349 166.229 -10.633 18.925 0.657  0.024
## ARIMA(2,1,6) -28.697 232.204 176.413  -8.089 19.032 0.698  0.005

4.7 Diagnostic Checking

From scrutinising both ARIMA models’ time series plot of standardised residuals, no obvious trend is depicted, although ARIMA(0,1,7) does portray a more normal distribution than ARIMA(1,1,7). The models’ histogram and QQ plot show that the data is normal, albeit from the ARIMA(1,1,7) model, the histogram is slightly more balanced and the points in the QQ plot deviate less from the straight line. ACF plots show no significant lags in either model and the Ljung-Box Test observes all points are above the alpha level, suggesting failure to reject the null hypothesis and that both models can effectively capture autocorrelation in the series. Hence, both ARIMA(1,1,7) and ARIMA(0,1,7) are supported for forecasting. However, ARIMA(1,1,7) is a larger model, and the components were found to be not significant at a significance level of 5%, hence ARIMA(0,1,7) will be chosen as the best model, in which this decision also abides by the principle of parsimony. As ARIMA(0,1,7) is a less complex and smaller model than ARIMA(1,1,7) which has 1 AR term and so is higher than ARIMA(0,1,7).

4.7.1 Residual Analysis for ARIMA(1,1,7)

## 
##  Shapiro-Wilk normality test
## 
## data:  res.model
## W = 0.97117, p-value = 0.08859

## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple of
## shorter object length

4.7.2 Residual Analysis for ARIMA(0,1,7)

## 
##  Shapiro-Wilk normality test
## 
## data:  res.model
## W = 0.91005, p-value = 6.355e-05

## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple of
## shorter object length

4.8 Overfitting Test

It is in good practice to test for over-fitting of the chosen model, ARIMA(0,1,7). This can be accomplished by increasing only the model’s AR order or ‘p’ by 1 and keeping ‘q’ the same, then checking if the parameters are significant or not. The same process is then repeated for the model’s MA order or ‘q’ whilst ‘p’ is maintained now.

4.8.1 Increasing p by 1, ARIMA(1,1,7)

Assessing the p-values when AR order is increased by 1 from the chosen model, CSS exhibits coefficients ma1 and ma5 to be statisatically significant, but ma7 to be only a bit significant. Whereas, ML found ma5 to be statistically significant at a significance level of 5%, as it disregards all other coefficients to be insignificant.

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1 -0.094943   0.118452 -0.8015   0.42282    
## ma1 -0.692851   0.153001 -4.5284 5.943e-06 ***
## ma2  0.083311   0.172056  0.4842   0.62824    
## ma3  0.047939   0.142232  0.3370   0.73608    
## ma4 -0.086143   0.132819 -0.6486   0.51661    
## ma5 -0.326832   0.137351 -2.3795   0.01733 *  
## ma6  0.198911   0.166242  1.1965   0.23149    
## ma7  0.261016   0.135083  1.9323   0.05333 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)   
## ar1 -0.169857   0.691770 -0.2455 0.806038   
## ma1 -0.632842   0.655621 -0.9653 0.334416   
## ma2  0.035927   0.642519  0.0559 0.955408   
## ma3  0.023237   0.217205  0.1070 0.914803   
## ma4 -0.044195   0.197955 -0.2233 0.823337   
## ma5 -0.569965   0.190947 -2.9849 0.002836 **
## ma6  0.365185   0.350091  1.0431 0.296896   
## ma7  0.299467   0.376407  0.7956 0.426270   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.8.2 Increasing q by 1, ARIMA(0,1,8)

When MA order is increased by 1 from the chosen model, the p-values from CSS and ML both suggest that ma1 and ma5 are statiscally significant, however, ML also indicates ma6 to have some significance at a 5% significance level.

Thus, the best model ARIMA(0,1,7) is suitable for the chosen dataset as there are no new or major differences found when either ‘p’ or ‘q’ is taken one step higher than the original model, and it can be said that ARIMA(1,1,7) and ARIMA(0,1,8) are over-parametrised models for ARIMA(0,1,7).

## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ma1 -0.4413670  0.1186180 -3.7209 0.0001985 ***
## ma2  0.0318956  0.1286967  0.2478 0.8042619    
## ma3 -0.0264110  0.1324684 -0.1994 0.8419685    
## ma4  0.0685266  0.1305876  0.5248 0.5997530    
## ma5 -0.3348142  0.1467056 -2.2822 0.0224764 *  
## ma6  0.1470303  0.1167422  1.2594 0.2078697    
## ma7  0.1232869  0.1316787  0.9363 0.3491337    
## ma8  0.0014973  0.1365195  0.0110 0.9912495    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ma1 -0.835208   0.155782 -5.3614 8.258e-08 ***
## ma2  0.152112   0.210316  0.7233  0.469523    
## ma3  0.028420   0.162906  0.1745  0.861506    
## ma4 -0.032226   0.213632 -0.1508  0.880096    
## ma5 -0.569052   0.185139 -3.0736  0.002115 ** 
## ma6  0.502502   0.227883  2.2051  0.027448 *  
## ma7  0.240875   0.168636  1.4284  0.153185    
## ma8 -0.106214   0.207756 -0.5112  0.609180    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.9 Forcasting

Below in figure 10 is the plotted forecast of the following next 10 years for the dataset representing the yearly number of tornadoes occurring in the US using the best chosen model, ARIMA(0,1,7):

## $pred
## Time Series:
## Start = 2024 
## End = 2033 
## Frequency = 1 
##  [1] 430.1792 455.8695 615.0297 592.6101 564.4461 452.1300 414.9315 414.9315
##  [9] 414.9315 414.9315
## 
## $se
## Time Series:
## Start = 2024 
## End = 2033 
## Frequency = 1 
##  [1] 275.7799 281.8657 301.2420 317.0989 329.1718 335.2834 340.0899 358.8399
##  [9] 376.6578 393.6700

Figure 10

5 Discussion

From the two linear and quadratic trend models tried, both were found to be statistically significant as the corresponding p-value of 2.969e-11 and 2.437e-11 obtained was smaller than the significance level at 5%. However, the R-squared being 0.4534 for the linear regression model and 0.4835 for the quadratic model implied that, respectively only 45.34% and 48.35% of the variance can be accounted for by the models. Which was far too low to be a good fit with the chosen data.

On the other hand, ARIMA models were able to effectively work with the data as it is capable of removing trends and seasonality through transformation and differencing if required. In which ARIMA modelling integrates three parameters, that is the autoregressive terms, the number of times the data is differenced to achieve stationary and the number of lagged errors to produce a model that can capture the relationship between an observation and a number of lagged observations (autoregressive), as well as the errors (moving average) that occurred in the past. From which, the best ARIMA model elected was ARIMA(0,1,7) as it achieved the lowest AIC scores and the fifth lowest BIC scores out of all twelve potential candidate models, as well as satisfied the diagnostic checks and tests as mentioned in the results. Therefore, this model was selected and used to forecast the next 10 years of the yearly number of tornadoes occurring in the US, following along with the time frame of when the chosen dataset had left off.

6 Conclusion

Overall, the best model determined for yearly tornadoes in the US dataset was Arima(0,1,7). This was accomplished through incorporating various modelling and specification tools to search for possible combinations of ‘p’ and ‘q’ parameters after trying other types of models. These parameters were passed through for estimation for significant coefficients before being evaluated based on AIC and BIC scores, accuracy metrics and diagnostic checking. The chosen model was then used to generate the next 10 years of the series, from which this report can be extended with further exploration into analysing monthly data, which can provide a greater quantity of data observations as well as potential seasonal patterns to indicate which months tornadoes tend to occur more in. This can in turn increase the accuracy and effectiveness of the forecasting models.

7 Appendices

Below is all the code used to compute and generate the models and computations in this report:

####################################################################
#
#    Title: solutionModule9Tasks_2024.R source code
#    Author: Demirhan, H
#    Date: May, 2024
#    Availability: https://rmit.instructure.com/courses/124176/files/38240987?module_item_id=6235012
#
####################################################################


# Load the required packages
library(TSA)
library(tseries)
library(lmtest)
library(forecast)

# Function to generate autocorrelation function (acf) and partial autocorrelation function (pacf) plots
acf_pacf_plot <- function(data, acf_title, pacf_title,title_size) {
  par(mfrow=c(1,2))
  acf(data, main = acf_title, cex.main = title_size) # Adjust title size to fit
  pacf(data, main = "", cex.main = title_size)
  title(main = pacf_title, cex.main = title_size) 
  par(mfrow=c(1,1))
}

# Function to sort the models from best to worse/ in ascending order
sort.score <- function(x, score = c("bic", "aic")){
  if (score == "aic"){
    x[with(x, order(AIC)),]
  } else if (score == "bic") {
    x[with(x, order(BIC)),]
  } else {
    warning('score = "x" only accepts valid arguments ("aic","bic")')
  }
}

# LBQPlot function from FitAR
LBQPlot <-
function(res, lag.max=30, StartLag=k+1, k=0, SquaredQ=FALSE){
stopifnot(k>=0, lag.max-StartLag>0, length(res)>lag.max)
ans<-LjungBoxTest(res, k=k, lag.max=lag.max, StartLag=StartLag, SquaredQ=FALSE)
m <- ans[,"m"]
pv <- ans[,"pvalue"]
plot(m, pv, xlab="lag", ylab="p-value", ylim=c(0,1), main="Ljung-Box Test")
abline(h=0.05, col="red", lty=2)
}

# LjungBoxTest function from FitAR
LjungBoxTest <-
function(res, k=0, lag.max=30, StartLag=1, SquaredQ=FALSE){
stopifnot(k>=0, StartLag>=1, lag.max>=StartLag)
n<-length(res)
L0<-StartLag
if (SquaredQ) {
    z<-(res-mean(res))^2
    kpar<-0
    }
else {
    z<-res
    kpar<-k
}
ra<-(acf(z, lag.max=lag.max, plot=FALSE)$acf)[-1]
lags<-L0:lag.max
QQ<-n*(n+2)*cumsum((ra^2)/(n-(1:lag.max)))[lags]
df <- ifelse(lags-kpar > 0, lags-kpar, 1)
pv<-1-pchisq(QQ,df)
QQ<-round(QQ,2)
a<-matrix(c(lags,QQ,pv),ncol=3)
dimnames(a)<-list(rep("",length(QQ)),c("m","Qm", "pvalue"))
a
}

# Function to perform residual analysis for trend models
trend.residual.analysis <- function(model, std = TRUE, start = 2){
  res.tModel <- rstudent(model)
  par(mfrow=c(3,2))
  plot(res.tModel,type='o',ylab='Standardised residuals', main="Time series plot of standardised residuals")
  abline(h=0)
  hist(res.tModel,main="Histogram of standardised residuals")
  qqnorm(res.tModel,main="QQ plot of standardised residuals")
  qqline(res.tModel, col = 2)
  acf(res.tModel,main="ACF of standardised residuals")
  pacf(res.tModel, main = "PACF of standardised residuals")
  print(shapiro.test(res.tModel))
  par(mfrow=c(1,1))
}

# Function to perform residual analysis
residual.analysis <- function(model, std = TRUE,start = 2, class = c("ARIMA","GARCH","ARMA-GARCH", "fGARCH")[1]){
  library(TSA)
  #library(FitAR)
  if (class == "ARIMA"){
    if (std == TRUE){
      res.model = rstandard(model)
    }else{
      res.model = residuals(model)
    }
  }else if (class == "GARCH"){
    res.model = model$residuals[start:model$n.used]
  }else if (class == "ARMA-GARCH"){
    res.model = model@fit$residuals
  }else if (class == "fGARCH"){
    res.model = model@residuals
  }else {
    stop("The argument 'class' must be either 'ARIMA' or 'GARCH' ")
  }
  par(mfrow=c(3,2))
  plot(res.model,type='o',ylab='Standardised residuals', main="Time series plot of standardised residuals")
  abline(h=0)
  hist(res.model,main="Histogram of standardised residuals")
  qqnorm(res.model,main="QQ plot of standardised residuals")
  qqline(res.model, col = 2)
  acf(res.model,main="ACF of standardised residuals")
  print(shapiro.test(res.model))
  k=0
  LBQPlot(res.model, lag.max = 30, StartLag = k + 1, k = 0, SquaredQ = FALSE)
  par(mfrow=c(1,1))
}

# The dataset represents the yearly number of tornadoes in US since 1950 to 2024.
# Read in the data csv files
US_tornado_data <- read.csv("US_tornado_data.csv", header=TRUE)

# View the first five rows of data to check its accuracy to the original data
head(US_tornado_data)

# Check the date type of the columns
class(US_tornado_data$Number_of_Tornadoes) # character
class(US_tornado_data$Year)

# Remove the ',' before converting the column to numeric
US_tornado_data$Number_of_Tornadoes <-gsub(",", "", as.character(US_tornado_data$Number_of_Tornadoes))

# Convert Number_of_Tornadoes to numeric
US_tornado_data$Number_of_Tornadoes <- as.numeric(US_tornado_data$Number_of_Tornadoes)

# Check it has been converted
class(US_tornado_data$Number_of_Tornadoes)

# Check the data type
class(US_tornado_data)

# Checking for any missing values/NAs in the column, Number_of_Tornadoes
sum(is.na(US_tornado_data$Number_of_Tornadoes))

# Drop all columns except for Year and Number_of_Tornadoes
US_tornado_data <- US_tornado_data[, c("Year", "Number_of_Tornadoes")]

# Get a summary of the column, Number_of_Tornadoes
summary(US_tornado_data$Number_of_Tornadoes) # Minimum value: 49.0, Maximum value: 2074.0

# Convert to a time series data object, excluding 2024's data
US_tornado_data_ts = ts(US_tornado_data$Number_of_Tornadoes, start = 1950, end = 2023, frequency = 1) 
class(US_tornado_data_ts) 

# Plot the data against the time of observation (time series plot)
plot(US_tornado_data_ts, type='o', main="Time series plot of yearly tornadoes in US series.", ylab="Number of tornadoes")

# Create function to generate autocorrelation function (acf) and partial autocorrelation function (pacf) plots
acf_pacf_plot <- function(data, acf_title, pacf_title,title_size) {
  par(mfrow=c(1,2))
  acf(data, main = acf_title, cex.main = title_size) # Adjust title size to fit
  pacf(data, main = "", cex.main = title_size)
  title(main = pacf_title, cex.main = title_size) 
  par(mfrow=c(1,1))
}
# Call the function
acf_pacf_plot(US_tornado_data_ts, acf_title = "ACF plot of yearly tornadoes in US series", pacf_title = "PACF plot of yearly tornadoes in US series", title_size=0.9)

# Perform adf (Augmented Dickey-Fuller) test
adf.test(US_tornado_data_ts) # Dickey-Fuller = -3.9499, Lag order = 4, p-value = 0.01668

# Perform Shapiro-Wilk test to examine normality
shapiro.test(US_tornado_data_ts)

# Create function to generate QQ plot
qqPlot <- function(data, title) {
  qqnorm(data, main = title)
  qqline(data, col = 2)
}
# Call the function
qqPlot(US_tornado_data_ts, title = "QQ plot of yearly tornadoes in US series")

# Linear trend model
t <- time(US_tornado_data_ts)
LinearModel = lm(US_tornado_data_ts~t)
summary(LinearModel)

# Plot the original model
plot(US_tornado_data_ts,type='o',xlab = "Time (Years)", ylab='Number of tornadoes in US', main = "Fitted linear model to number of tornadoes in US")
abline(LinearModel, col = "red",lty=2)

# Perform residual analysis for linear model
trend.residual.analysis(LinearModel)

# Quadratic Model
t = time(US_tornado_data_ts)
t2 <- t^2
QuadraticModel <- lm(US_tornado_data_ts ~ t + t2)
summary(QuadraticModel)
plot(ts(fitted(QuadraticModel)), ylim = c(min(c(fitted(QuadraticModel), as.vector(US_tornado_data_ts))), max(c(fitted(QuadraticModel),as.vector(US_tornado_data_ts)))), xlab = "Time (Years)", ylab='Number of yearly tornadoes in US', main = "Fitted quadratic model curve to number of yearly tornadoes in US", type="l",lty=2,col="red")

lines(as.vector(US_tornado_data_ts),type="o")

# Residual Analysis for quadratic model
trend.residual.analysis(QuadraticModel)

# Apply first differencing to raw data
diff_data = diff(US_tornado_data_ts, differences=1)

# Plot time series of first difference raw data
plot(diff_data, type='o', main="Time series plot of first difference yearly tornadoes in US", ylab="First difference yearly tornadoes in US", xlab="Time (in Years)")

# Generate QQ plot to check normality on first difference data using the function
qqPlot(diff_data, title = "QQ plot of first difference yearly tornadoes in US")

# Perform adf test on first difference raw data
adf.test(diff_data) 

# Perform Phillips Perron (pp) test on first difference raw data
pp.test(diff_data)

# Produce ACF and PACF plots for first difference raw data
acf_pacf_plot(diff_data, acf_title = "ACF of first difference yearly tornadoes in US data", pacf_title = "PACF of first difference yearly tornadoes in US data", title_size=0.7)

# Create a function to perform eacf to search for potential models with max limit of 8
eacf(diff_data, ar.max = 8, ma.max = 8)

# Create a BIC table to help specify the final set of possible models.
res = armasubsets(y=diff_data,nar=5,nma=5,y.name='p',ar.method='ols')
plot(res)

# ARIMA(0,1,7)
model_017_css <- Arima(US_tornado_data_ts, order=c(0,1,7), method='CSS')
lmtest::coeftest(model_017_css)

model_017_ml <- Arima(US_tornado_data_ts, order=c(0,1,7), method='ML')
coeftest(model_017_ml)

# ARIMA(0,1,8)
model_018_css <- Arima(US_tornado_data_ts, order=c(0,1,8), method='CSS')
lmtest::coeftest(model_018_css)

model_018_ml <- Arima(US_tornado_data_ts, order=c(0,1,8), method='ML')
coeftest(model_018_ml)

# ARIMA(1,1,1)
model_111_css <- Arima(US_tornado_data_ts, order=c(1,1,1), method='CSS')
lmtest::coeftest(model_111_css)

model_111_ml <- Arima(US_tornado_data_ts, order=c(1,1,1), method='ML')
coeftest(model_111_ml)

# ARIMA(1,1,2)
model_112_css <- Arima(US_tornado_data_ts, order=c(1,1,2), method='CSS')
lmtest::coeftest(model_112_css)

model_112_ml <- Arima(US_tornado_data_ts, order=c(1,1,2), method='ML')
coeftest(model_112_ml)

# ARIMA(1,1,3)
model_113_css <- Arima(US_tornado_data_ts, order=c(1,1,3), method='CSS')
lmtest::coeftest(model_113_css)

model_113_ml <- Arima(US_tornado_data_ts, order=c(1,1,3), method='ML')
coeftest(model_113_ml)

# ARIMA(1,1,5)
model_115_css <- Arima(US_tornado_data_ts, order=c(1,1,5), method='CSS')
lmtest::coeftest(model_115_css)

model_115_ml <- Arima(US_tornado_data_ts, order=c(1,1,5), method='ML')
coeftest(model_115_ml)

# ARIMA(1,1,6)
model_116_css <- Arima(US_tornado_data_ts, order=c(1,1,6), method='CSS')
lmtest::coeftest(model_116_css)

model_116_ml <- Arima(US_tornado_data_ts, order=c(1,1,6), method='ML')
coeftest(model_116_ml)

# ARIMA(1,1,7)
model_117_css <- Arima(US_tornado_data_ts, order=c(1,1,7), method='CSS')
lmtest::coeftest(model_117_css)

model_117_ml <- Arima(US_tornado_data_ts, order=c(1,1,7), method='ML')
coeftest(model_117_ml)

# ARIMA(2,1,1)
model_211_css <- Arima(US_tornado_data_ts, order=c(2,1,1), method='CSS')
lmtest::coeftest(model_211_css)

model_211_ml <- Arima(US_tornado_data_ts, order=c(2,1,1), method='ML')
coeftest(model_211_ml)

# ARIMA(2,1,3)
model_213_css <- Arima(US_tornado_data_ts, order=c(2,1,3), method='CSS')
lmtest::coeftest(model_213_css)

model_213_ml <- Arima(US_tornado_data_ts, order=c(2,1,3), method='ML')
coeftest(model_213_ml)

# ARIMA(2,1,5)
model_215_css <- Arima(US_tornado_data_ts, order=c(2,1,5), method='CSS')
lmtest::coeftest(model_215_css)

model_215_ml <- Arima(US_tornado_data_ts, order=c(2,1,5), method='ML')
coeftest(model_215_ml)

# ARIMA(2,1,6)
model_216_css <- Arima(US_tornado_data_ts, order=c(2,1,6), method='CSS')
lmtest::coeftest(model_216_css)

model_216_ml <- Arima(US_tornado_data_ts, order=c(2,1,6), method='ML')
coeftest(model_216_ml)

# Function to sort the models from best to worse/ in ascending order
sort.score <- function(x, score = c("bic", "aic")){
  if (score == "aic"){
    x[with(x, order(AIC)),]
  } else if (score == "bic") {
    x[with(x, order(BIC)),]
  } else {
    warning('score = "x" only accepts valid arguments ("aic","bic")')
  }
}

# Producing sorted AIC scores using sort.score function to sort the models from best to worse
sort.score(AIC(model_017_ml, model_018_ml, model_111_ml, model_112_ml, model_113_ml, model_115_ml, model_116_ml, model_117_ml, model_211_ml, model_213_ml, model_215_ml, model_216_ml), score = "aic")

# Producing BIC scores and sorting the models in ascending order
sort.score(BIC(model_017_ml, model_018_ml, model_111_ml, model_112_ml, model_113_ml, model_115_ml, model_116_ml, model_117_ml, model_211_ml, model_213_ml, model_215_ml, model_216_ml), score = "bic")

# Getting the calculated measures from each model to put into a table
Smodel_017_css <- accuracy(model_017_css)[1:7] #[c(1:3, 6:7)] to exclude MPE and MAPE
Smodel_018_css <- accuracy(model_018_css)[1:7]
Smodel_111_css <- accuracy(model_111_css)[1:7]
Smodel_112_css <- accuracy(model_112_css)[1:7]
Smodel_113_css <- accuracy(model_113_css)[1:7]
Smodel_115_css <- accuracy(model_115_css)[1:7]
Smodel_116_css <- accuracy(model_116_css)[1:7]
Smodel_117_css <- accuracy(model_117_css)[1:7]
Smodel_211_css <- accuracy(model_211_css)[1:7]
Smodel_213_css <- accuracy(model_213_css)[1:7]
Smodel_215_css <- accuracy(model_215_css)[1:7]
Smodel_216_css <- accuracy(model_216_css)[1:7]

# Joining multiple rows (model accuracy results) into one singular data frame
df.Smodels <- data.frame(rbind(Smodel_017_css, Smodel_018_css, Smodel_111_css, Smodel_112_css, Smodel_113_css, Smodel_115_css, Smodel_116_css, Smodel_117_css, Smodel_211_css, Smodel_213_css, Smodel_215_css, Smodel_216_css))

# Creating the data frame's column and row names
colnames(df.Smodels) <- c("ME", "RMSE", "MAE", "MPE", "MAPE", "MASE", "ACF1")
rownames(df.Smodels) <- c("ARIMA(0,1,7)","ARIMA(0,1,8)", "ARIMA(1,1,1)", "ARIMA(1,1,2)", "ARIMA(1,1,3)", "ARIMA(1,1,5)", "ARIMA(1,1,6)", "ARIMA(1,1,7)", "ARIMA(2,1,1)", "ARIMA(2,1,3)", "ARIMA(2,1,5)", "ARIMA(2,1,6)")

# Round up the results to the nearest 3 decimal place
round(df.Smodels,  digits = 3)

# LBQPlot function from FitAR
LBQPlot <-
function(res, lag.max=30, StartLag=k+1, k=0, SquaredQ=FALSE){
stopifnot(k>=0, lag.max-StartLag>0, length(res)>lag.max)
ans<-LjungBoxTest(res, k=k, lag.max=lag.max, StartLag=StartLag, SquaredQ=FALSE)
m <- ans[,"m"]
pv <- ans[,"pvalue"]
plot(m, pv, xlab="lag", ylab="p-value", ylim=c(0,1), main="Ljung-Box Test")
abline(h=0.05, col="red", lty=2)
}

# LjungBoxTest function from FitAR
LjungBoxTest <-
function(res, k=0, lag.max=30, StartLag=1, SquaredQ=FALSE){
stopifnot(k>=0, StartLag>=1, lag.max>=StartLag)
n<-length(res)
L0<-StartLag
if (SquaredQ) {
    z<-(res-mean(res))^2
    kpar<-0
    }
else {
    z<-res
    kpar<-k
}
ra<-(acf(z, lag.max=lag.max, plot=FALSE)$acf)[-1]
lags<-L0:lag.max
QQ<-n*(n+2)*cumsum((ra^2)/(n-(1:lag.max)))[lags]
df <- ifelse(lags-kpar > 0, lags-kpar, 1)
pv<-1-pchisq(QQ,df)
QQ<-round(QQ,2)
a<-matrix(c(lags,QQ,pv),ncol=3)
dimnames(a)<-list(rep("",length(QQ)),c("m","Qm", "pvalue"))
a
}

# Residual Analysis
residual.analysis <- function(model, std = TRUE,start = 2, class = c("ARIMA","GARCH","ARMA-GARCH", "fGARCH")[1]){
  library(TSA)
  #library(FitAR)
  if (class == "ARIMA"){
    if (std == TRUE){
      res.model = rstandard(model)
    }else{
      res.model = residuals(model)
    }
  }else if (class == "GARCH"){
    res.model = model$residuals[start:model$n.used]
  }else if (class == "ARMA-GARCH"){
    res.model = model@fit$residuals
  }else if (class == "fGARCH"){
    res.model = model@residuals
  }else {
    stop("The argument 'class' must be either 'ARIMA' or 'GARCH' ")
  }
  par(mfrow=c(3,2))
  plot(res.model,type='o',ylab='Standardised residuals', main="Time series plot of standardised residuals")
  abline(h=0)
  hist(res.model,main="Histogram of standardised residuals")
  qqnorm(res.model,main="QQ plot of standardised residuals")
  qqline(res.model, col = 2)
  acf(res.model,main="ACF of standardised residuals")
  print(shapiro.test(res.model))
  k=0
  LBQPlot(res.model, lag.max = 30, StartLag = k + 1, k = 0, SquaredQ = FALSE)
  par(mfrow=c(1,1))
}

residual.analysis(model = model_117_css)

residual.analysis(model = model_017_css)

# Increase p by 1, ARIMA(1,1,7)
model_117_css <- Arima(US_tornado_data_ts, order=c(1,1,7), method='CSS')
lmtest::coeftest(model_117_css)

model_117_ml <- Arima(US_tornado_data_ts, order=c(1,1,7), method='ML')
coeftest(model_117_ml)

# Increase q by 1, ARIMA(0,1,8)
model_018_css <- Arima(US_tornado_data_ts, order=c(0,1,8), method='CSS')
lmtest::coeftest(model_018_css)

model_018_ml <- Arima(US_tornado_data_ts, order=c(0,1,8), method='ML')
coeftest(model_018_ml)

predict(model_017_ml, n.ahead = 10, newxreg = NULL, se.fit = TRUE)
best <- Arima(US_tornado_data_ts, c(0,1,7))
plot(forecast(best, h=10))

8 References

[1] USA Today, “A history of twisters: Tornadoes in the United States since 1950”, USA Today. [Online]. Available: https://data.usatoday.com/tornado-archive/united-states/

[2] H. Demirhan, “Module 8 - Seasonal Models”, RMIT. [Online]. Available: https://rmit.instructure.com/courses/124176/files/38018442?module_item_id=6201891

[3] H. Demirhan, “solutionModule9Tasks_2024.R”, RMIT, (May, 2024). [Online]. Available: https://rmit.instructure.com/courses/124176/files/38240987?module_item_id=6235012

MATH1318/2204 Time Series Analysis

Assignment 3: Final Project Report

Elissa Van, s3935201

2024-05-29