The time series selected for this project is the USAccDeaths dataset, which records the monthly totals of accidental deaths in the USA from 1973 to 1978.
library(forecast)
library(ggplot2)
library(tseries)
data <- USAccDeaths
data
      Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec
1973 9007 8106 8928 9137 10017 10826 11317 10744 9713 9938 9161 8927
1974 7750 6981 8038 8422 8714 9512 10120 9823 8743 9129 8710 8680
1975 8162 7306 8124 7870 9387 9556 10093 9620 8285 8466 8160 8034
1976 7717 7461 7767 7925 8623 8945 10078 9179 8037 8488 7874 8647
1977 7792 6957 7726 8106 8890 9299 10625 9302 8314 8850 8265 8796
1978 7836 6892 7791 8192 9115 9434 10484 9827 9110 9070 8633 9240
start(data)
[1] 1973    1
end(data)
[1] 1978   12
frequency(data)
[1] 12
plot(data)
The time plot suggests that this series can probably be described with an additive model, since the random fluctuations around the trend are roughly constant in size over time.
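One informal way to check this (a sketch, not part of the original analysis): if the seasonal swings grew with the level of the series, a log transform would visibly shrink them, so plotting the raw and log-transformed series side by side makes the comparison easy.
par(mfrow = c(2, 1))
plot(data, main = "Raw series")        # spread of the seasonal swings on the original scale
plot(log(data), main = "Log scale")    # a multiplicative pattern would look noticeably more uniform here
par(mfrow = c(1, 1))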
summary(data)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
   6892    8089    8728    8789    9323   11317
The monthly totals of accidental deaths in the USA range from 6,892 to 11,317, with a median of 8,728 and a mean of 8,789.
plot(data)
abline(reg=lm(data~time(data)))
plot(aggregate(data, FUN=mean))
boxplot(data~cycle(data))
The year-on-year trend clearly shows that total deaths decreased until 1976, when they started to rise again. In addition, the seasonal fluctuations are clearly highest in July and lowest in February. Hence, there is a strong seasonal effect with a cycle of 12 months.
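To look at the monthly pattern more directly, a seasonal plot can be drawn; a minimal sketch using base R's monthplot and the forecast package's ggseasonplot, both of which accept a monthly ts object:
monthplot(data)      # one subseries per month; July sits highest, February lowest
ggseasonplot(data)   # one line per year, with the months on the x axis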
#Decomposition of the time series
decomposed_ts <- decompose(data, type = "additive")
plot(decomposed_ts)
The plot above gives clear evidence of the trend and the seasonal behaviour of the time series. In the next step, linear, polynomial and exponential models are compared in order to select the model that best fits the trend component.
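As a quick sanity check (a sketch), the additive decomposition satisfies data = trend + seasonal + random exactly, except for the first and last six months, where the centred moving-average trend, and hence the random part, is NA:
check <- decomposed_ts$trend + decomposed_ts$seasonal + decomposed_ts$random
max(abs(check - data), na.rm = TRUE)   # effectively zero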
Trend
trend <- decomposed_ts$trend
trend <- as.numeric(trend)   # drop the ts attributes
trend <- trend[7:66]         # drop the first and last six values, which are NA in the centred moving average
time_step <- 1:length(trend)
trend_df <- data.frame(time_step, trend)
plot(trend_df$trend ~ trend_df$time_step)
#models
#degree 1
lineal_model <- lm(trend ~ time_step, data=trend_df)
linealcoeff <- lineal_model$coefficients
linealfit <- linealcoeff[1] + linealcoeff[2]*time_step
#degree 2
cuadratic_model <- lm(trend ~ poly(time_step,2, raw = TRUE), data = trend_df)
cuadraticcoeff <- cuadratic_model$coefficients
cuadraticfit <- cuadraticcoeff[1] + cuadraticcoeff[2]*time_step + cuadraticcoeff[3]*(time_step^2)
#degree 3
cubic_model <- lm(trend ~ poly(time_step,3, raw = TRUE), data = trend_df)
cubiccoeff <- cubic_model$coefficients
cubicfit <- cubiccoeff[1] + cubiccoeff[2]*time_step + cubiccoeff[3]*(time_step^2) + cubiccoeff[4]*(time_step^3)
#degree 4
fourth_model <- lm(trend ~ poly(time_step,4, raw = TRUE), data = trend_df)
fourthcoeff <- fourth_model$coefficients
fourthfit <- fourthcoeff[1] + fourthcoeff[2]*time_step + fourthcoeff[3]*(time_step^2) + fourthcoeff[4]*(time_step^3) + fourthcoeff[5]*(time_step^4)
#degree 5
fifth_model <- lm(trend ~ poly(time_step,5, raw = TRUE), data = trend_df)
fifthcoeff <- fifth_model$coefficients
fifthfit <- fifthcoeff[1] + fifthcoeff[2]*time_step + fifthcoeff[3]*(time_step^2) + fifthcoeff[4]*(time_step^3) + fifthcoeff[5]*(time_step^4) + fifthcoeff[6]*(time_step^5)
#exponential
exp_model <- lm(log(trend) ~ time_step, data = trend_df)
expcoeff <- exp_model$coefficients
expfit <- exp(expcoeff[1]) * exp(expcoeff[2]*time_step)   # back-transform from the log scale: trend = exp(a)*exp(b*t)
models_df <- data.frame(degree_1 = linealfit,
degree_2 = cuadraticfit,
degree_3 = cubicfit,
degree_4 = fourthfit,
degree_5 = fifthfit,
exponential_fit = expfit,
time_step = time_step)
ggplot(data = trend_df, aes(x = time_step, y = trend)) +
geom_point(aes(color = "Trend"), shape = 1) +
geom_smooth(method = "lm", formula = y ~ x, aes(color = "lineal fit"),se=FALSE, ) +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), aes(color = "cuadratic fit"), se=FALSE) +
geom_smooth(method = "lm", formula = y ~ poly(x, 3), aes(color = "cubic fit"), se=FALSE) +
geom_smooth(method = "lm", formula = y ~ poly(x, 4), aes(color = "fourth fit"), se=FALSE) +
geom_smooth(method = "lm", formula = y ~ poly(x, 5), aes(color = "fifth fit"), se=FALSE) +
geom_line(data = models_df, aes(x=time_step,y=exponential_fit, color="exponential fit")) +
theme_bw() +
labs(title = "Fit models for the trend component",
y = "Totals of accidental death in The USA",
x = "Time step") +
scale_color_manual(name = "Fit models", values = c("red","blue","yellow","green","gray","violet","black"))
Trend model error
AIC(lineal_model, cuadratic_model, cubic_model, fourth_model, fifth_model, exp_model)
                df      AIC
lineal_model 3 822.5859
cuadratic_model 4 703.2768
cubic_model 5 703.2467
fourth_model 6 698.4942
fifth_model 7 662.4895
exp_model 3 -268.2096
Note that exp_model is fitted to log(trend), so its AIC is computed on the log scale and is not directly comparable with the other models. Among the comparable models, the Akaike Information Criterion selects the polynomial of degree 5, which has the lowest AIC. Thus, the selected model for the trend component is the polynomial of degree five.
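To put all six fits on a comparable footing, they can also be evaluated on the original scale; a minimal sketch using the root mean squared error of each fitted curve against the observed trend (rmse below is a hypothetical helper, not part of the original analysis):
rmse <- function(fit) sqrt(mean((trend_df$trend - fit)^2))
sapply(models_df[ , 1:6], rmse)   # RMSE of each candidate model on the original scale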
Season
TTT <- length(data)
TIME <- 1:TTT
nYEARS <- TTT/12
Dummyw <- matrix(data = 0, nrow = TTT, ncol = 12)
# ---- measurement started at January 1973 ----
for (rok in 1:nYEARS){
  for (mnth in 1:12) {
    Dummyw[(rok - 1)*12 + mnth, mnth] = 1   # 1 for the month the observation falls in, 0 otherwise
  }
}
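As a side note, the same month-dummy matrix can be built without the nested loop; a minimal sketch (Dummyw2 is just an illustrative name):
Dummyw2 <- diag(12)[rep(1:12, times = nYEARS), ]   # each row has a 1 in the column of its month
all(Dummyw2 == Dummyw)                             # should be TRUE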
fifth_season <- lm(data ~ TIME + I(TIME^2) + I(TIME^3) + I(TIME^4) + I(TIME^5) + Dummyw)
coeff_trend <- fifth_season$coefficients[1:6]
coeff_season <- fifth_season$coefficients[7:18]
# the 12th monthly dummy is not estimated (it is collinear with the intercept),
# so it is set to 0 and the seasonal coefficients are centred to sum to zero
Difference = sum(coeff_season, na.rm = TRUE)
DELTA = Difference/12
coeff_season[12] = 0
coeff_season = coeff_season - DELTA
#trend
plot(fifth_model$residuals, type="l")
#season
plot(fifth_season$fitted.values, type="l")
plot(fifth_season$coefficients[5:12])
plot(coeff_season)
plot(c(coeff_season,coeff_season), type="l")
plot(fifth_season$residuals, type="l")
random <- decomposed_ts$random
time_random <- length(random)
random <- random[1:time_random]   # drop the ts attributes
random <- random[7:66]            # remove the NA values at both ends, as for the trend
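Before comparing candidate ARMA models by AIC, the autocorrelation structure of the random component can be inspected; a minimal sketch:
acf(random, main = "ACF of the random component")
pacf(random, main = "PACF of the random component")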
WN <- arima(random, order = c(0,0,0))
MA1 <- arima(random, order = c(0,0,1))
MA2 <- arima(random, order = c(0,0,2))
AR1 <- arima(random, order = c(1,0,0))
AR2 <- arima(random, order = c(2,0,0))
ARMA11 <- arima(random, order = c(1,0,1))
ARMA22 <- arima(random, order = c(2,0,2))
ARMA21 <- arima(random, order = c(2,0,1))
ARMA12 <- arima(random, order = c(1,0,2))
AIC(WN, MA1, MA2, AR1, AR2, ARMA11, ARMA22, ARMA12, ARMA21)
       df      AIC
WN 2 820.6246
MA1 3 819.5439
MA2 4 821.4388
AR1 3 819.4660
AR2 4 821.4457
ARMA11 4 821.4536
ARMA22 6 808.4461
ARMA12 5 816.6456
ARMA21 5 814.2025
According to the Akaike Information Criterion, the model that best fits the random component is the ARMA(2,2), that is, p = 2 and q = 2.
plot(ARMA22$residuals)
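As a final diagnostic (a sketch, not in the original code), a Ljung-Box test can check whether the ARMA(2,2) residuals behave like white noise; a large p-value would support the fit:
Box.test(ARMA22$residuals, lag = 12, type = "Ljung-Box", fitdf = 4)   # fitdf = p + q = 4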