The time series selected for this project is the USAccDeaths dataset, which records the monthly totals of accidental deaths in the USA from 1973 to 1978.
library(forecast)
library(ggplot2)
library(tseries)
data <- USAccDeaths
data
      Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec
1973 9007 8106 8928 9137 10017 10826 11317 10744 9713 9938 9161 8927
1974 7750 6981 8038 8422 8714 9512 10120 9823 8743 9129 8710 8680
1975 8162 7306 8124 7870 9387 9556 10093 9620 8285 8466 8160 8034
1976 7717 7461 7767 7925 8623 8945 10078 9179 8037 8488 7874 8647
1977 7792 6957 7726 8106 8890 9299 10625 9302 8314 8850 8265 8796
1978 7836 6892 7791 8192 9115 9434 10484 9827 9110 9070 8633 9240
start(data)
[1] 1973    1
end(data)
[1] 1978   12
frequency(data)
[1] 12
plot(data)
The time plot suggests that this series can probably be described with an additive model, since the random fluctuations around the trend are roughly constant in size over time.
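One informal way to check this (a sketch, not part of the original analysis): if the seasonal swings grew with the level of the series, a log transform would visibly shrink them, so plotting the raw and log-transformed series side by side makes the comparison easy.
par(mfrow = c(2, 1))
plot(data, main = "Raw series")        # spread of the seasonal swings on the original scale
plot(log(data), main = "Log scale")    # a multiplicative pattern would look noticeably more uniform here
par(mfrow = c(1, 1))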
summary(data)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
   6892    8089    8728    8789    9323   11317
The monthly totals of accidental deaths in the USA range from 6,892 to 11,317, with a median of 8,728 and a mean of 8,789.
plot(data)
abline(reg=lm(data~time(data)))
plot(aggregate(data, FUN=mean))
boxplot(data~cycle(data))
The year-on-year trend clearly shows that total deaths decreased until 1976, when they started to rise again. In addition, the seasonal fluctuations are clearly highest in July and lowest in February. Hence, there is a strong seasonal effect with a cycle of 12 months.
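To look at the monthly pattern more directly, a seasonal plot can be drawn; a minimal sketch using base R's monthplot and the forecast package's ggseasonplot, both of which accept a monthly ts object:
monthplot(data)      # one subseries per month; July sits highest, February lowest
ggseasonplot(data)   # one line per year, with the months on the x axis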
#Decomposition of the time series
decomposed_ts <- decompose(data, type = "additive")
plot(decomposed_ts)
The plot above gives clear evidence of the trend and the seasonal behaviour of the time series. In the next step, linear, polynomial and exponential models are compared in order to select the model that best fits the trend component.
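As a quick sanity check (a sketch), the additive decomposition satisfies data = trend + seasonal + random exactly, except for the first and last six months, where the centred moving-average trend, and hence the random part, is NA:
check <- decomposed_ts$trend + decomposed_ts$seasonal + decomposed_ts$random
max(abs(check - data), na.rm = TRUE)   # effectively zero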
Trend
trend <- decomposed_ts$trend
trend <- as.numeric(trend)   # drop the ts attributes
trend <- trend[7:66]         # drop the first and last six values, which are NA in the centred moving average
time_step <- 1:length(trend)
trend_df <- data.frame(time_step, trend)
plot(trend_df$trend ~ trend_df$time_step)
#models
#degree 1
lineal_model <- lm(trend ~ time_step, data=trend_df)
linealcoeff <- lineal_model$coefficients
linealfit <- linealcoeff[1] + linealcoeff[2]*time_step
#degree 2
cuadratic_model <- lm(trend ~ poly(time_step,2, raw = TRUE), data = trend_df)
cuadraticcoeff <- cuadratic_model$coefficients
cuadraticfit <- cuadraticcoeff[1] + cuadraticcoeff[2]*time_step + cuadraticcoeff[3]*(time_step^2)
#degree 3
cubic_model <- lm(trend ~ poly(time_step,3, raw = TRUE), data = trend_df)
cubiccoeff <- cubic_model$coefficients
cubicfit <- cubiccoeff[1] + cubiccoeff[2]*time_step + cubiccoeff[3]*(time_step^2) + cubiccoeff[4]*(time_step^3)
#degree 4
fourth_model <- lm(trend ~ poly(time_step,4, raw = TRUE), data = trend_df)
fourthcoeff <- fourth_model$coefficients
fourthfit <- fourthcoeff[1] + fourthcoeff[2]*time_step + fourthcoeff[3]*(time_step^2) + fourthcoeff[4]*(time_step^3) + fourthcoeff[5]*(time_step^4)
#degree 5
fifth_model <- lm(trend ~ poly(time_step,5, raw = TRUE), data = trend_df)
fifthcoeff <- fifth_model$coefficients
fifthfit <- fifthcoeff[1] + fifthcoeff[2]*time_step + fifthcoeff[3]*(time_step^2) + fifthcoeff[4]*(time_step^3) + fifthcoeff[5]*(time_step^4) + fifthcoeff[6]*(time_step^5)
#exponential
exp_model <- lm(log(trend) ~ time_step, data = trend_df)
expcoeff <- exp_model$coefficients
expfit <- exp(expcoeff[1]) * exp(expcoeff[2]*time_step)   # back-transform from the log scale: trend = exp(a)*exp(b*t)
models_df <- data.frame(degree_1 = linealfit,
degree_2 = cuadraticfit,
degree_3 = cubicfit,
degree_4 = fourthfit,
degree_5 = fifthfit,
exponential_fit = expfit,
time_step = time_step)
ggplot(data = trend_df, aes(x = time_step, y = trend)) +
geom_point(aes(color = "Trend"), shape = 1) +
geom_smooth(method = "lm", formula = y ~ x, aes(color = "lineal fit"),se=FALSE, ) +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), aes(color = "cuadratic fit"), se=FALSE) +
geom_smooth(method = "lm", formula = y ~ poly(x, 3), aes(color = "cubic fit"), se=FALSE) +
geom_smooth(method = "lm", formula = y ~ poly(x, 4), aes(color = "fourth fit"), se=FALSE) +
geom_smooth(method = "lm", formula = y ~ poly(x, 5), aes(color = "fifth fit"), se=FALSE) +
geom_line(data = models_df, aes(x=time_step,y=exponential_fit, color="exponential fit")) +
theme_bw() +
labs(title = "Fit models for the trend component",
y = "Totals of accidental death in The USA",
x = "Time step") +
scale_color_manual(name = "Fit models", values = c("red","blue","yellow","green","gray","violet","black"))
Trend model error
AIC(lineal_model, cuadratic_model, cubic_model, fourth_model, fifth_model, exp_model)
                df      AIC
lineal_model 3 822.5859
cuadratic_model 4 703.2768
cubic_model 5 703.2467
fourth_model 6 698.4942
fifth_model 7 662.4895
exp_model 3 -268.2096
Note that exp_model is fitted to log(trend), so its AIC is computed on the log scale and is not directly comparable with the other models. Among the comparable models, the Akaike Information Criterion selects the polynomial of degree 5, which has the lowest AIC. Thus, the selected model for the trend component is the polynomial of degree five.
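To put all six fits on a comparable footing, they can also be evaluated on the original scale; a minimal sketch using the root mean squared error of each fitted curve against the observed trend (rmse below is a hypothetical helper, not part of the original analysis):
rmse <- function(fit) sqrt(mean((trend_df$trend - fit)^2))
sapply(models_df[ , 1:6], rmse)   # RMSE of each candidate model on the original scale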
Season
TTT <- length(data)
TIME <- 1:TTT
nYEARS <- TTT/12
Dummyw <- matrix(data = 0, nrow = TTT, ncol = 12)
# ---- measurement started at January 1973 ----
for (rok in 1:nYEARS){
  for (mnth in 1:12) {
    Dummyw[(rok - 1)*12 + mnth, mnth] = 1   # 1 for the month the observation falls in, 0 otherwise
  }
}
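As a side note, the same month-dummy matrix can be built without the nested loop; a minimal sketch (Dummyw2 is just an illustrative name):
Dummyw2 <- diag(12)[rep(1:12, times = nYEARS), ]   # each row has a 1 in the column of its month
all(Dummyw2 == Dummyw)                             # should be TRUE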
fifth_season <- lm(data ~ TIME + I(TIME^2) + I(TIME^3) + I(TIME^4) + I(TIME^5) + Dummyw)
coeff_trend <- fifth_season$coefficients[1:6]
coeff_season <- fifth_season$coefficients[7:18]
# the 12th monthly dummy is not estimated (it is collinear with the intercept),
# so it is set to 0 and the seasonal coefficients are centred to sum to zero
Difference = sum(coeff_season, na.rm = TRUE)
DELTA = Difference/12
coeff_season[12] = 0
coeff_season = coeff_season - DELTA
#trend
plot(fifth_model$residuals, type="l")
#season
plot(fifth_season$fitted.values, type="l")
plot(fifth_season$coefficients[5:12])
plot(coeff_season)
plot(c(coeff_season,coeff_season), type="l")
plot(fifth_season$residuals, type="l")
random <- decomposed_ts$random
time_random <- length(random)
random <- random[1:time_random]   # drop the ts attributes
random <- random[7:66]            # remove the NA values at both ends, as for the trend
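Before comparing candidate ARMA models by AIC, the autocorrelation structure of the random component can be inspected; a minimal sketch:
acf(random, main = "ACF of the random component")
pacf(random, main = "PACF of the random component")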
WN <- arima(random, order = c(0,0,0))
MA1 <- arima(random, order = c(0,0,1))
MA2 <- arima(random, order = c(0,0,2))
AR1 <- arima(random, order = c(1,0,0))
AR2 <- arima(random, order = c(2,0,0))
ARMA11 <- arima(random, order = c(1,0,1))
ARMA22 <- arima(random, order = c(2,0,2))
ARMA21 <- arima(random, order = c(2,0,1))
ARMA12 <- arima(random, order = c(1,0,2))
AIC(WN, MA1, MA2, AR1, AR2, ARMA11, ARMA22, ARMA12, ARMA21)
       df      AIC
WN 2 820.6246
MA1 3 819.5439
MA2 4 821.4388
AR1 3 819.4660
AR2 4 821.4457
ARMA11 4 821.4536
ARMA22 6 808.4461
ARMA12 5 816.6456
ARMA21 5 814.2025
According to the Akaike Information Criterion, the model that best fits the random component is the ARMA(2,2), that is, p = 2 and q = 2.
plot(ARMA22$residuals)
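As a final diagnostic (a sketch, not in the original code), a Ljung-Box test can check whether the ARMA(2,2) residuals behave like white noise; a large p-value would support the fit:
Box.test(ARMA22$residuals, lag = 12, type = "Ljung-Box", fitdf = 4)   # fitdf = p + q = 4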