Analysis objectives: Plot, examine, and prepare series for modeling.

Extract the seasonality component from the time series.

Test for stationarity and apply appropriate transformations

Choose the order of an ARIMA model.

Forecast the series.

library('ggplot2')
library('forecast')
library('tseries')
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# read the data

Data=read.csv("accidents_database_analysis.csv", header = TRUE)

# convert to TS, set the frequancy to monthly timeseries 
tsdata<-ts(Data[,11], frequency=12,start=c(2011,1), end =c(2014,12) )
# yearly and monthly aggregation 
Data$Date<-as.Date(Data$dtime)
Acce_count1<-aggregate(Data$month, by=list(Data$Date), sum)

# plot monthly count 
ggplot() +
geom_line(data = Acce_count1, aes(x =Acce_count1$Group.1, y = Acce_count1$x, colour = "Monthly Accidents"))  +
  ylab('Accidents count')

# convert to TS, set the frequancy to monthly timeseries 
tsdata<-ts(Data[,11], frequency=12,start=c(2011,1), end =c(2014,12) )

plot(tsdata)

There’s sudden drop between 2012 and 2013 which may indicate a seasinality decomposing the series and removing the seasonality

stlDemand <- stl(tsdata, s.window = "periodic") # decomposing the ts 
TsDemand<-seasadj(stlDemand)

plot(stlDemand)

We now have a de-seasonalized series and can proceed to the next step. To be ble to fit an ARIMA model series needs to be stationary(mean, variance, and autocovariance are not dependent on time) we preform and ADF test for stationarity

# P-value should be smallar than 0.05 
adf.test(TsDemand)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  TsDemand
## Dickey-Fuller = -2.6114, Lag order = 3, p-value = 0.3299
## alternative hypothesis: stationary

To fix this we preform differencing

TsDemand1 = diff(TsDemand, differences = 1)

adf.test(TsDemand1, alternative = "stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  TsDemand1
## Dickey-Fuller = -3.7011, Lag order = 3, p-value = 0.03497
## alternative hypothesis: stationary
# A seasonal plot allows pattern to be seen more clearly, and can be useful in identifying years in which the pattern changes
ggseasonplot(TsDemand1, year.labels=FALSE, continuous=TRUE)

# both plots show a tren in the months(Sept-Nov)
ggseasonplot(TsDemand1, year.labels=FALSE, continuous=TRUE, polar = TRUE)

# Autocorrelations and Choosing Model Order
Acf(TsDemand, main='ACF for Differenced Series')

#There are significant auto correlations at lag 1 and 2 and beyond.

##Partial correlation plots show a significant spike at lag 1. This suggests that we might want to test models with AR or MA components of order 1 or 2. 


Pacf(TsDemand1, main='PACF for Differenced Series')

We specify non-seasonal ARIMA structure and fit the model to de-seasonalize data. Parameters (1,1,0) suggested by the automated procedure

our ideal model is one with minimum AIC and BIC values

# Fitting an ARIMA model
auto.arima(TsDemand, seasonal=FALSE)
## Series: TsDemand 
## ARIMA(1,1,0) 
## 
## Coefficients:
##           ar1
##       -0.3604
## s.e.   0.1339
## 
## sigma^2 estimated as 3777:  log likelihood=-259.82
## AIC=523.63   AICc=523.9   BIC=527.33

Examining ACF and PACF plots for model residuals.If model structure is correct, There shouldn’t be any significant autocorrelations present.

fit<-auto.arima(TsDemand, seasonal=FALSE)
tsdisplay(residuals(fit), lag.max=45, main='(1,1,0) Model Residuals')

The Blue line representing the forcasted results goes to a straight line fairly soon, which seems unlikeky given the past bahavior of the series. The model is assuming no seasonality.the plotted predictions are based on the assumption that there will be no other seasonal fluctuations in the data. Model can be improved by adding back the seasonal trend that we extracted. Another approcah is to include the components that are default in auto.Arima.

#Forecasting using the fitted model

fcast <- forecast(fit, h=12)
plot(fcast)

#parameters are changed after we included a seasonal component

fit_w_seasonality = auto.arima(TsDemand, seasonal=TRUE)
fit_w_seasonality
## Series: TsDemand 
## ARIMA(1,1,0)(1,0,0)[12] 
## 
## Coefficients:
##           ar1     sar1
##       -0.3671  -0.3721
## s.e.   0.1335   0.1272
## 
## sigma^2 estimated as 3182:  log likelihood=-256.17
## AIC=518.34   AICc=518.89   BIC=523.89
seas_fcast <- forecast(fit_w_seasonality, h=12)
plot(seas_fcast)

#ETS Model 
fit3=ets(TsDemand)
pred = forecast(fit3, h=12) 
plot(pred)

 fit6 <- ets(TsDemand,model="MMM",damped=TRUE)
  fcast6 <- forecast(fit6, h=12)
  plot(fcast6)

#use AIC to determine the best model 
barplot(c(ETS=fit6$aic, ARIMA=fit_w_seasonality$aic),
        col="light blue",
        ylab="AIC")

#Accuracy level 
 autoplot(fcast6)

autoplot(seas_fcast)

Normlizing by region

Acc_reg<-aggregate(Data$year, by=list(Data$region),sum)

Date1<-Acc_reg$Group.1
count1<-Acc_reg$x
df <- data.frame(Date1,count1)

ggplot(df, aes(count1)) + geom_bar(aes(fill = Date1), position = "dodge")

Acc_reg[,2]<-as.numeric(Acc_reg[,2])
scale(Acc_reg[,2], center = TRUE, scale = TRUE)
##              [,1]
##  [1,] -0.62915316
##  [2,] -0.10357917
##  [3,]  0.62676444
##  [4,]  0.01223487
##  [5,]  3.77612668
##  [6,] -0.69649983
##  [7,]  0.23977726
##  [8,] -0.69649967
##  [9,] -0.17726195
## [10,]  2.06429241
## [11,] -0.69548975
## [12,]  1.18092794
## [13,] -0.20322962
## [14,] -0.56884799
## [15,] -0.69616286
## [16,] -0.39275639
## [17,] -0.69649950
## [18,] -0.69548975
## [19,]  0.65472669
## [20,] -0.69616319
## [21,]  1.55911378
## [22,] -0.34701929
## [23,] -0.69649950
## [24,]  0.31798778
## [25,]  0.50804679
## [26,] -0.69649967
## [27,] -0.07147819
## [28,] -0.65948795
## [29,]  0.86950618
## [30,] -0.26508229
## [31,] -0.69616319
## [32,]  0.07982800
## [33,]  1.97126819
## [34,]  1.39274075
## [35,] -0.69649983
## [36,] -0.69616286
## [37,] -0.69650000
## [38,] -0.69582706
## [39,] -0.69616286
## [40,] -0.69649950
## [41,] -0.69582672
## attr(,"scaled:center")
## [1] 4164784
## attr(,"scaled:scale")
## [1] 5976702
hist(Acc_reg[,2],border="blue", col="purple", main="hist for regions")

plot(density(Acc_reg[,2], plot = F))
## Warning: In density.default(Acc_reg[, 2], plot = F) :
##  extra argument 'plot' will be disregarded