We apply several forecasting algorithms to artificially generated data and determine which model best fits the resulting time series. The data are loaded below:
newData <-read.csv("~/Desktop/ALL_HW/Project3_practise/OKOILU RUTH.csv", header=TRUE)
head(newData)
## X
## 1 84.46624
## 2 107.03100
## 3 104.85777
## 4 89.38064
## 5 93.43544
## 6 94.95788
summary(newData)
## X
## Min. : 67.80
## 1st Qu.: 86.87
## Median : 90.91
## Mean : 90.99
## 3rd Qu.: 95.15
## Max. :109.24
nrow(newData)
## [1] 2000
plot(newData[,1])
From the plot above, the data look random: no visible correlation and roughly constant variance.
library(forecast)
## Warning: package 'forecast' was built under R version 3.4.2
library(ggplot2)
library(tseries)
newData_ts <- ts(newData)
plot(newData_ts)
### How do we test whether a time series is stationary? The data look stationary at first glance. To confirm this, we’ll use the Augmented Dickey-Fuller (ADF) test. A p-value below 0.05 in adf.test() indicates that the series is stationary.
adf.test(newData_ts) # p-value < 0.05 indicates the TS is stationary
## Warning in adf.test(newData_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: newData_ts
## Dickey-Fuller = -12.045, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
training <- newData[1:1500,]
testing <- newData[1501:2000,]
training <- as.data.frame(training)
testing <-as.data.frame(testing)
nrow(training)
## [1] 1500
nrow(testing)
## [1] 500
### Simple Moving Average (SMA)
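The SMA forecast for a period is the mean of the previous m observations. The formula below is a reconstruction from the D_sma() code that follows (with the function's x argument set to 1):

$$\hat{x}_t = \frac{1}{m}\sum_{i=1}^{m} x_{t-i}$$

where $\hat{x}_t$ is the forecast for period $t$ and $m$ is the window size.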
I developed an R function called D_sma() that implements the formula above.
library(Metrics)
D_sma <- function(x, m, df){
  # Simple moving average: with x = 1, the forecast for period t is the
  # mean of the previous m observations df[t-1, ], ..., df[t-m, ]
  i <- x:m
  result <- data.frame()
  for(t in 1:nrow(df)){
    lags <- t - i
    if(any(lags < 1)){
      # Not enough history yet: no forecast for this period
      result[t,1] <- NA
    }else{
      result[t,1] <- sum(df[lags,])/m
    }
  }
  return(result)
}
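As a quick, illustrative sanity check (output not shown), the first two forecasts for m = 2 should be NA and the third should equal the mean of the first two training observations:

# For m = 2, rows 1-2 have insufficient history (NA); row 3 should equal
# mean(training[1:2, 1])
head(D_sma(1, 2, training), 3)
mean(training[1:2, 1])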
# Compute SMA forecasts for window sizes m = 1 through 30
sma_forecasts <- lapply(1:30, function(k) D_sma(1, k, training))
m_eight <- sma_forecasts[[8]]  # kept separately; used for the plots below
calculate_error <- function(observed, predicted){
return(observed - predicted)
}
# One-step-ahead errors for each window size
sma_errors <- lapply(sma_forecasts, function(f) calculate_error(training, f))
calculate_rmse <- function(err_m){
err_m <- err_m[!is.na(err_m),]
sqrt(mean(err_m^2))
}
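The Metrics package loaded earlier also provides rmse(); as an illustrative cross-check of calculate_rmse() on the m = 8 forecasts (output not shown):

# Cross-check with Metrics::rmse(actual, predicted), dropping the leading
# NA forecasts first
ok <- !is.na(m_eight[,1])
rmse(training[ok, 1], m_eight[ok, 1])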
# RMSE for each window size (NA rows are dropped inside calculate_rmse)
sma_rmse <- sapply(sma_errors, calculate_rmse)
m <- 1:30
metrics_m_df <- data.frame(m, sma_rmse)
metrics_m_df
## m sma_rmse
## 1 1 8.394683
## 2 2 7.035626
## 3 3 7.140791
## 4 4 6.795451
## 5 5 6.658700
## 6 6 6.487049
## 7 7 6.413648
## 8 8 6.339398
## 9 9 6.298913
## 10 10 6.267145
## 11 11 6.248079
## 12 12 6.217875
## 13 13 6.200579
## 14 14 6.174304
## 15 15 6.162268
## 16 16 6.156685
## 17 17 6.149031
## 18 18 6.146965
## 19 19 6.139815
## 20 20 6.137478
## 21 21 6.126631
## 22 22 6.120475
## 23 23 6.111956
## 24 24 6.105540
## 25 25 6.102102
## 26 26 6.099789
## 27 27 6.090181
## 28 28 6.089180
## 29 29 6.076412
## 30 30 6.082634
plot(metrics_m_df, type="o")
Window sizes m = 1 through 7 have high RMSE, and beyond m = 10 the RMSE decreases only very slowly. That leaves a choice among m = 8, 9, and 10, with RMSE 6.339398, 6.298913, and 6.267145 respectively. Because a larger m requires more computation and more history before the first forecast, for little gain in RMSE, we choose m = 8.
# Selected m
choice <- subset(metrics_m_df, metrics_m_df[,1] == 8)
choice
## m sma_rmse
## 8 8 6.339398
# choice is m = 8
sma_rmse[8]
## [1] 6.339398
# Convert the m = 8 forecasts to a time series
m_eight_ts <- ts(m_eight[,1], start = 1, frequency = 1)
# Plot the m = 8 forecasts for each period: 1500 periods in all
plot(m_eight_ts, main = "Predicted values for simple moving average model with m = 8")
The plot above shows the predicted values from the SMA model.
data_to_plot <- data.frame(training, m_eight)
colnames(data_to_plot) <- c("training", "m = 8")
# Plot training values vs predicted values
plot(data_to_plot)
The training values plotted against the predicted values show no correlation, positive or negative; the points look purely random. This is not an impressive model. Its RMSE is 6.339398.
### Exponential Smoothing
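The formula below is reconstructed from the exponential_smoothing() code that follows; the first smoothed value is initialized to the mean of the series:

$$S_1 = \bar{x}, \qquad S_t = \alpha\, x_{t-1} + (1 - \alpha)\, S_{t-1} \quad (t \ge 2)$$

where $S_t$ is the forecast for period $t$ and $0 < \alpha < 1$ is the smoothing constant.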
I developed an R function called exponential_smoothing() to implement the exponential smoothing formula above.
exponential_smoothing <- function(series, alpha){
  result <- data.frame()
  # Initialize the first smoothed value with the mean of all observations
  result[1,1] <- mean(series[,1])
  # Recursion: S_t = alpha * x_{t-1} + (1 - alpha) * S_{t-1}
  for(n in 2:nrow(series)){
    result[n,1] <- alpha * series[n-1,1] + (1 - alpha) * result[n-1,1]
  }
  return(result)
}
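For reference, the forecast package (already loaded) provides ses() for simple exponential smoothing. A minimal sketch of an equivalent fit with a fixed alpha, not the implementation used in this report:

# Sketch: simple exponential smoothing via forecast::ses() with a fixed alpha.
# Note ses() initializes differently from our mean-based initialization,
# so its fitted values will not match exponential_smoothing() exactly.
ses_fit <- ses(ts(training[,1]), alpha = 0.9, initial = "simple")
head(fitted(ses_fit))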
# Fit exponential smoothing for alpha = 0.1, 0.2, ..., 0.9, then compute
# one-step-ahead errors and RMSE for each alpha
alpha <- seq(0.1, 0.9, by = 0.1)
es_forecasts <- lapply(alpha, function(a) exponential_smoothing(training, a))
es_errors <- lapply(es_forecasts, function(f) calculate_error(training, f))
es_rmse <- sapply(es_errors, calculate_rmse)
alpha_nine <- es_forecasts[[9]]  # alpha = 0.9, used in the plots below
The best value of alpha is the one with the lowest RMSE:
metric_alpha_df <- data.frame(alpha, es_rmse)
metric_alpha_df
## alpha es_rmse
## 1 0.1 8.394683
## 2 0.2 7.035626
## 3 0.3 7.140791
## 4 0.4 6.795451
## 5 0.5 6.658700
## 6 0.6 6.487049
## 7 0.7 6.413648
## 8 0.8 6.339398
## 9 0.9 6.298913
plot(metric_alpha_df, type="o")
data_alpha_plot <- data.frame(training, alpha_nine)
colnames(data_alpha_plot) <- c("training", "alpha = 0.9")
# Plot training values vs predicted values
plot(data_alpha_plot)
Again, the training values plotted against the predicted values show no correlation, positive or negative; the points look purely random. This is not an impressive model either. Its RMSE of 6.298913 is slightly lower than that of the SMA.
We’ll fit AR(p) models using Arima() from the forecast package with order (p,0,0). We’ll run the ACF and PACF on the training set to determine p.
par(mfrow = c(1,1))
acf.training = acf(training, main='ACF Plot', lag.max=100)
pacf.training = pacf(training, main='PACF Plot', lag.max=100)
The ACF decays very quickly, which indicates that the series is stationary.
When choosing p, we consider the PACF and then compare information criteria.
From the PACF, three spikes extend beyond the blue significance bounds, so the best candidate is p = 3.
We also check the corrected Akaike information criterion (AICc) for p = 1, 2, 3, and 4.
# AR(1)
fit <- Arima(training, order = c(1,0,0))
summary(fit)
## Series: training
## ARIMA(1,0,0) with non-zero mean
##
## Coefficients:
## ar1 mean
## 0.0153 91.0180
## s.e. 0.0258 0.1568
##
## sigma^2 estimated as 35.82: log likelihood=-4811.18
## AIC=9628.37 AICc=9628.38 BIC=9644.31
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 6.754022e-05 5.980577 4.817361 -0.4369323 5.335187 0.7132365
## ACF1
## Training set -0.001575673
# AR(2)
fit <- Arima(training, order = c(2,0,0))
summary(fit)
## Series: training
## ARIMA(2,0,0) with non-zero mean
##
## Coefficients:
## ar1 ar2 mean
## 0.0136 0.1030 91.0186
## s.e. 0.0257 0.0257 0.1738
##
## sigma^2 estimated as 35.46: log likelihood=-4803.22
## AIC=9614.44 AICc=9614.47 BIC=9635.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0006807599 5.948878 4.789425 -0.4331384 5.303664 0.7091004
## ACF1
## Training set 0.02351946
# AR(3)
fit <- Arima(training, order = c(3,0,0))
summary(fit)
## Series: training
## ARIMA(3,0,0) with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 mean
## 0.0374 0.1056 -0.2280 91.0150
## s.e. 0.0251 0.0251 0.0252 0.1379
##
## sigma^2 estimated as 33.65: log likelihood=-4763.51
## AIC=9537.01 AICc=9537.05 BIC=9563.58
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002615907 5.793125 4.631301 -0.407282 5.128272 0.6856893
## ACF1
## Training set 0.003546986
# AR(4)
fit <- Arima(training, order = c(4,0,0))
summary(fit)
## Series: training
## ARIMA(4,0,0) with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 mean
## 0.0413 0.1037 -0.2286 0.0170 91.0152
## s.e. 0.0258 0.0252 0.0253 0.0259 0.1403
##
## sigma^2 estimated as 33.66: log likelihood=-4763.29
## AIC=9538.58 AICc=9538.64 BIC=9570.46
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002536353 5.79229 4.629502 -0.4072681 5.12633 0.6854229
## ACF1
## Training set -0.0004810112
We fit an ARIMA(3,0,0) model along with the variations ARIMA(1,0,0), ARIMA(2,0,0), and ARIMA(4,0,0). Of these, ARIMA(3,0,0) has the smallest AICc.
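For easier comparison, the AICc values can be collected in one frame (values transcribed from the four summaries above):

# AICc by AR order p (values copied from the summaries above)
aicc_by_p <- data.frame(p = 1:4, AICc = c(9628.38, 9614.47, 9537.05, 9538.64))
aicc_by_p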
auto.arima(training)
## Series: training
## ARIMA(3,0,0) with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 mean
## 0.0374 0.1056 -0.2280 91.0150
## s.e. 0.0251 0.0251 0.0252 0.1379
##
## sigma^2 estimated as 33.65: log likelihood=-4763.51
## AIC=9537.01 AICc=9537.05 BIC=9563.58
auto.arima() searches over candidate orders and returns the combination with the lowest AICc. The result above confirms p = 3.
fit1 <- Arima(training, c(3,0,0))
# Plot the forecast series in red and the original training series in yellow
plot(forecast(fit1), col="red")
lines(training, col = "yellow")
data_p_to_plot <- data.frame(training, fit1$fitted)
colnames(data_p_to_plot) <- c("training", "predicted values for p = 3")
plot(data_p_to_plot)
accuracy(fit1)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002615907 5.793125 4.631301 -0.407282 5.128272 0.6856893
## ACF1
## Training set 0.003546986
Here the training values plotted against the predicted values show a slight positive correlation. This is a better model than the SMA and exponential smoothing models above. Its RMSE of 5.793125 is the lowest so far.
We now run these models on the test data to see how they perform.
For the simple moving average model, our D_sma() experiments selected m = 8 as the best window size.
m_testing <- D_sma(1, 8, testing)
err_testing <- calculate_error(testing, m_testing)
rmse_testing <- calculate_rmse(err_testing)
rmse_testing
## [1] 6.619054
testing_df <- data.frame(testing, m_testing)
plot(testing_df)
As in the training set, the testing values plotted against the predicted values show no correlation; the points look purely random. This is not an impressive model. Its RMSE of 6.619054 is worse than on the training set.
alpha_testing <- exponential_smoothing(testing, 0.9)
err_testing <- calculate_error(testing, alpha_testing)
rmse_testing <- calculate_rmse(err_testing)
rmse_testing
## [1] 7.87134
testing_df <- data.frame(testing, alpha_testing)
plot(testing_df)
Here the testing values plotted against the predicted values show a slight positive correlation. However, the RMSE is 7.87134, worse than on the training set and worse than the SMA model on the testing set.
fit2 <- Arima(testing, c(3,0,0))
# Plot the forecast series in red and the original testing series in blue
plot(forecast(fit2), col="red")
lines(testing, col = "blue")
data_arima_plot <- data.frame(testing, fit2$fitted)
colnames(data_arima_plot) <- c("testing", "predicted values for p = 3")
plot(data_arima_plot)
accuracy(fit2)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.004016223 5.917087 4.738452 -0.4288647 5.272369 0.7116276
## ACF1
## Training set -0.001330702
Here the testing values plotted against the predicted values show a positive correlation. This is the best of the three models on the testing set, with an RMSE of 5.917087, the lowest of all.
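To summarize, the test-set RMSEs reported above can be collected in one frame (values transcribed from the runs above):

# Test-set RMSE by model (values from the runs above)
model_comparison <- data.frame(
  model = c("SMA (m = 8)", "Exponential smoothing (alpha = 0.9)", "ARIMA(3,0,0)"),
  test_rmse = c(6.619054, 7.87134, 5.917087)
)
model_comparison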
Therefore, based on the RMSE and the correlation between predicted and observed values, we conclude that ARIMA(3,0,0) is the best model.