Load all necessary libraries.
library(readxl) # read excel
library(dplyr) # mutate
library(PerformanceAnalytics) # correlation and histogram
library(ggplot2) # ggplot
library(forecast) # autoplot
library(imputeTS) # impute NAs
library(tseries) # adf.test, kpss.test
library(writexl) # excel
We have the dataset in an Excel file. First read the data using read_excel(), then have a look at the structure of the data with the glimpse() function.
data <- readxl::read_excel('./Data Set for Class.xls')
data <- data %>% arrange(SeriesInd)
glimpse(data)
## Rows: 10,572
## Columns: 7
## $ SeriesInd <dbl> 40669, 40669, 40669, 40669, 40669, 40669, 40670, 40670, 4067…
## $ group <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "S01…
## $ Var01 <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.20000, …
## $ Var02 <dbl> 123432400, 60855800, 10369300, 39335700, 27809100, 16587400,…
## $ Var03 <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.88000, …
## $ Var05 <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.94000, …
## $ Var07 <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.10000, …
The dataset has 10,572 rows and 7 columns. For more clarity, a date column will be added from the SeriesInd column after the data are split by group.
head(data)
There are six groups in total, so split the data into six datasets based on the group column. Then add a date column by converting SeriesInd to a date; with origin '1898-08-30', SeriesInd 40669 maps to 2010-01-04.
data_01 <- data %>% filter(group == 'S01')
data_02 <- data %>% filter(group == 'S02')
data_03 <- data %>% filter(group == 'S03')
data_04 <- data %>% filter(group == 'S04')
data_05 <- data %>% filter(group == 'S05')
data_06 <- data %>% filter(group == 'S06')
data_02 <- data_02 %>% select(c(SeriesInd, Var02, Var03)) %>% mutate(date = as.Date(SeriesInd, origin = '1898-08-30'))
data_06 <- data_06 %>% select(c(SeriesInd, Var05, Var07)) %>% mutate(date = as.Date(SeriesInd, origin = '1898-08-30'))
print(head(data_02))
## # A tibble: 6 x 4
## SeriesInd Var02 Var03 date
## <dbl> <dbl> <dbl> <date>
## 1 40669 60855800 10.0 2010-01-04
## 2 40670 215620200 10.4 2010-01-05
## 3 40671 200070600 11.1 2010-01-06
## 4 40672 130201700 11.3 2010-01-07
## 5 40673 130463000 11.5 2010-01-08
## 6 40676 170626200 11.8 2010-01-11
print(head(data_06))
## # A tibble: 6 x 4
## SeriesInd Var05 Var07 date
## <dbl> <dbl> <dbl> <date>
## 1 40669 27.0 27.3 2010-01-04
## 2 40670 27.3 28.1 2010-01-05
## 3 40671 28.0 28.1 2010-01-06
## 4 40672 28.1 29.1 2010-01-07
## 5 40673 28.9 28.9 2010-01-08
## 6 40676 29.1 28.8 2010-01-11
The S02 dataset has 1,762 records and 3 variables. 1,622 of those rows have values for the variables and are used to train the models; the remaining 140 records are the period whose values will be predicted by the models.
In the next set of plots we will see the data distribution for S02 along with Var02 and Var03. This dataset looks like Ford stock data. There are missing values in S02, so let's clean the data first.
A total of 144 rows have NAs: 140 belong to the period we will forecast, and the remaining 4 will have their values imputed.
data_02[!complete.cases(data_02),]
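To see this breakdown directly, a quick tally of missing values (our addition) can be run per variable and per row:
colSums(is.na(data_02[, c("Var02", "Var03")])) # NAs per variable
sum(!complete.cases(data_02)) # rows with at least one NA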
Get subsets for Var02 and Var03. In data_02 every value is NA from SeriesInd 43022 onward; these last 140 rows are the forecast period, so we drop them before modeling.
data_s02_v2 <- data_02 %>% filter(SeriesInd <= 43021) %>% select(date, Var02)
data_s02_v3 <- data_02 %>% filter(SeriesInd <= 43021) %>% select(date, Var03)
summary(data_s02_v2)
## date Var02
## Min. :2010-01-04 Min. : 7128800
## 1st Qu.:2011-08-11 1st Qu.: 27880300
## Median :2013-03-25 Median : 39767500
## Mean :2013-03-23 Mean : 50633098
## 3rd Qu.:2014-10-30 3rd Qu.: 59050900
## Max. :2016-06-13 Max. :480879500
summary(data_s02_v3)
## date Var03
## Min. :2010-01-04 Min. : 8.82
## 1st Qu.:2011-08-11 1st Qu.:11.82
## Median :2013-03-25 Median :13.76
## Mean :2013-03-23 Mean :13.68
## 3rd Qu.:2014-10-30 3rd Qu.:15.52
## Max. :2016-06-13 Max. :38.28
## NA's :4
The summaries show that the mean of Var02 is much higher than its median, likely due to the presence of outliers, while the mean and median of Var03 are almost the same. Var02 has no missing data, but Var03 has 4 missing data points.
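To put a rough number on those outliers, a quick sketch (our addition) counts Var02 points outside the usual 1.5 * IQR boxplot fences:
v <- data_s02_v2$Var02
q <- quantile(v, c(0.25, 0.75))
fence <- 1.5 * IQR(v)
sum(v < q[1] - fence | v > q[2] + fence) # points flagged as outliers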
Impute the NAs using na_interpolation() from the imputeTS package.
data_s02_v3 <- imputeTS::na_interpolation(data_s02_v3)
summary(data_s02_v3)
## date Var03
## Min. :2010-01-04 Min. : 8.82
## 1st Qu.:2011-08-11 1st Qu.:11.81
## Median :2013-03-25 Median :13.76
## Mean :2013-03-23 Mean :13.68
## 3rd Qu.:2014-10-30 3rd Qu.:15.52
## Max. :2016-06-13 Max. :38.28
Convert data_s02_v2 and data_s02_v3 to time series objects. The data look like daily Ford stock prices, so we keep frequency = 365.
data_s02_v2 <- ts(data_s02_v2$Var02, start = 2010, frequency = 365)
str(data_s02_v2)
## Time-Series [1:1622] from 2010 to 2014: 6.09e+07 2.16e+08 2.00e+08 1.30e+08 1.30e+08 ...
data_s02_v3 <- ts(data_s02_v3$Var03, start = 2010, frequency = 365)
str(data_s02_v3)
## Time-Series [1:1622] from 2010 to 2014: 10.1 10.4 11.1 11.3 11.5 ...
The plots below show the distribution, outliers, and trend present in each time series.
autoplot(data_s02_v2) +
geom_line( color="#69b3a2", show.legend = FALSE) + ylab("") +
ggtitle("Var 02")
autoplot(data_s02_v3) +
geom_line(color="#E69F00", show.legend = FALSE) + ylab("") +
ggtitle("Var 03")
Var02 shows a downward trend and multiple outliers. Var03 may be cyclical and has one outlier.
ggplot(data_s02_v2, aes(data_s02_v2)) +
geom_histogram(bins=30, fill = "#E69F00") + xlab("") +
labs(title = "Histogram of S02 by Var02")
ggplot(data_s02_v3, aes(data_s02_v3)) +
geom_histogram(bins=15, fill = "#69b3a2") + xlab("") +
labs(title = "Histogram of S02 by Var03")
Var02 is unimodal but right-skewed. Var03 looks nearly normal. Both variables have outliers.
ggplot(data_s02_v2, aes(data_s02_v2)) + geom_boxplot(col = "#E69F00") + coord_flip() +
labs(title = "Boxplot of S02 by Var02") + xlab("Var 02")
ggplot(data_s02_v3, aes(data_s02_v3)) + geom_boxplot(col = "#69b3a2") + coord_flip() +
labs(title = "Boxplot of S02 by Var03") + xlab("Var 03")
Both variables have some outliers, which could strongly impact the model. We use forecast::tsclean() to obtain a cleaned version of each series, with outliers and missing values replaced by estimated values.
data_s02_v2 <- tsclean(data_s02_v2)
autoplot(data_s02_v2) +
geom_line( color="#E69F00", show.legend = FALSE) +
ylab("Var02")
data_s02_v3 <- tsclean(data_s02_v3)
autoplot(data_s02_v3) +
geom_line( color="#69b3a2", show.legend = FALSE) +
ylab("Var03")
The series data_s02_v2 has a trend: its ACF decays slowly and its PACF shows a significant spike at lag 1, so the data look non-stationary. For data_s02_v3 the ACF also decays slowly due to trend and the PACF spikes at lag 1; because of the trend, this series is not stationary either.
ggtsdisplay(data_s02_v2, main="Group S02 - Var 02", ylab="Var02")
ggtsdisplay(data_s02_v3, main="Group S02 - Var 03", ylab="Var03")
We need to apply differencing to make these series stationary.
print(ndiffs(data_s02_v2))
## [1] 1
print(ndiffs(data_s02_v3))
## [1] 1
It shows that the number of differences required for a stationary series is 1.
data_s02_v2 %>% diff() %>% ggtsdisplay(main="Group S02 - Var 02", ylab="Var02", lag.max = 30)
data_s02_v2_diff <- data_s02_v2 %>% diff()
Differencing has now made the series stationary. In the ACF plot there is a significant spike at lag 1.
data_s02_v3 %>% diff() %>% ggtsdisplay(main="Group S02 - Var 03", ylab="Var03", lag.max = 30)
data_s02_v3_diff <- data_s02_v3 %>% diff()
Differencing has likewise made this series stationary, with a significant spike at lag 1 in the ACF plot.
Perform the Augmented Dickey-Fuller test to check the stationarity of the data; if the p-value is less than 0.05, the data are stationary.
tseries::adf.test(data_s02_v2_diff)
##
## Augmented Dickey-Fuller Test
##
## data: data_s02_v2_diff
## Dickey-Fuller = -16.604, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
tseries::adf.test(data_s02_v3_diff)
##
## Augmented Dickey-Fuller Test
##
## data: data_s02_v3_diff
## Dickey-Fuller = -11.373, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
The p-value is < 0.05, hence after differencing both series are stationary.
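As a cross-check (our addition, using the tseries package already loaded above), the KPSS test reverses the hypotheses: its null is stationarity, so a p-value above 0.05 agrees with the ADF result:
tseries::kpss.test(data_s02_v2_diff)
tseries::kpss.test(data_s02_v3_diff)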
Split the data into train and test sets: train contains 80% of the data, and test contains the remaining 20%.
# First 80% of observations form the training set
data_s02_v2_train <- ts(data_s02_v2[1:floor(length(data_s02_v2) * 0.8)], frequency = 365)
# Number of held-out observations (used later as the forecast horizon h)
data_s02_v2_test <- length(data_s02_v2) - floor(length(data_s02_v2) * 0.8)
data_s02_v2_test_ts <- ts(data_s02_v2[(floor(length(data_s02_v2) * 0.8) + 1):length(data_s02_v2)], frequency = 365)
data_s02_v3_train <- ts(data_s02_v3[1:floor(length(data_s02_v3) * 0.8)], frequency = 365)
data_s02_v3_test <- length(data_s02_v3) - floor(length(data_s02_v3) * 0.8)
data_s02_v3_test_ts <- ts(data_s02_v3[(floor(length(data_s02_v3) * 0.8) + 1):length(data_s02_v3)], frequency = 365)
We create two models: ARIMA and ETS.
Apply an ARIMA model to Var02 and Var03. From the ACF and PACF, the Var02 model looks like ARIMA(1,1,1), ARIMA(0,1,0), or ARIMA(0,1,1). Similarly, Var03 looks like ARIMA(1,1,1) or ARIMA(7,1,7). AICc can arbitrate among such candidates, as sketched below.
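A minimal sketch of that comparison, fitting only the non-seasonal part of each Var02 candidate on the training set (this loop is our addition, not part of the original run):
candidates <- list(c(1,1,1), c(0,1,0), c(0,1,1))
aicc <- sapply(candidates, function(ord) Arima(data_s02_v2_train, order = ord)$aicc)
names(aicc) <- sapply(candidates, paste, collapse = ",")
sort(aicc) # lower AICc indicates the preferred candidate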
ARIMA for Var02
#fit_arima_s02_v2 <- data_s02_v2_train %>% auto.arima(stepwise = FALSE)
fit_arima_s02_v2 <- data_s02_v2_train %>% Arima(order = c(0,1,1), seasonal = c(0,1,0))
summary(fit_arima_s02_v2)
## Series: .
## ARIMA(0,1,1)(0,1,0)[365]
##
## Coefficients:
## ma1
## -0.5797
## s.e. 0.0435
##
## sigma^2 estimated as 5.054e+14: log likelihood=-17084.32
## AIC=34172.65 AICc=34172.66 BIC=34182.32
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -51375.18 19036485 12078959 -5.348957 32.00494 0.4937086 0.1495702
ARIMA for Var03
#fit_arima_s02_v3 <- data_s02_v3_train %>% auto.arima(stepwise = FALSE)
fit_arima_s02_v3 <- data_s02_v3_train %>% Arima(order = c(1,1,1), seasonal = c(0,1,0))
summary(fit_arima_s02_v3)
## Series: .
## ARIMA(1,1,1)(0,1,0)[365]
##
## Coefficients:
## ar1 ma1
## 0.0559 0.0505
## s.e. 0.2573 0.2566
##
## sigma^2 estimated as 0.1314: log likelihood=-378.93
## AIC=763.86 AICc=763.88 BIC=778.36
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.002619765 0.3068386 0.1917915 -0.02888633 1.507823 0.05498502
## ACF1
## Training set -3.88827e-05
Evaluate the ARIMA model for Var02 on the test set.
farima_s02_v02 <- forecast(fit_arima_s02_v2, h = data_s02_v2_test)
predictions_test <- ts(farima_s02_v02$mean, frequency = 365)
data_s02_v2_test_ts %>% autoplot(series = "Actuals") +
autolayer(predictions_test)
forecast::accuracy(as.numeric(farima_s02_v02$mean), as.numeric(data_s02_v2_test_ts))
## ME RMSE MAE MPE MAPE
## Test set 5859185 19912723 15249574 5.141829 49.29075
Evaluate the ARIMA model for Var03 on the test set.
farima_s02_v03 <- forecast(fit_arima_s02_v3, h = data_s02_v3_test)
predictions_test_v03 <- ts(farima_s02_v03$mean, frequency = 365)
data_s02_v3_test_ts %>% autoplot(series = "Actuals") +
autolayer(predictions_test_v03)
forecast::accuracy(as.numeric(farima_s02_v03$mean), as.numeric(data_s02_v3_test_ts))
## ME RMSE MAE MPE MAPE
## Test set -0.8084843 1.705694 1.16842 -6.545275 8.996228
Fit ETS models on the training data and forecast over the test horizon for Var02 and Var03.
fets_s02_v02 <- data_s02_v2_train %>% ets() %>%
forecast(h = data_s02_v2_test)
fets_s02_v03 <- data_s02_v3_train %>% ets() %>%
forecast(h = data_s02_v3_test)
# Compare the ETS forecasts with the held-out series; note that
# data_s02_v2_test holds only the count of test observations, so accuracy
# must be measured against data_s02_v2_test_ts.
forecast::accuracy(as.numeric(fets_s02_v02$mean), as.numeric(data_s02_v2_test_ts))
forecast::accuracy(as.numeric(fets_s02_v03$mean), as.numeric(data_s02_v3_test_ts))
Comparing ARIMA and ETS on the held-out data, ARIMA performed better than ETS for both Var02 and Var03.
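For a compact side-by-side view of Var02, a small sketch (our addition) stacks the test-set error measures of both models:
rbind(
  ARIMA = forecast::accuracy(as.numeric(farima_s02_v02$mean), as.numeric(data_s02_v2_test_ts))["Test set", ],
  ETS = forecast::accuracy(as.numeric(fets_s02_v02$mean), as.numeric(data_s02_v2_test_ts))["Test set", ]
)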
Refit the ARIMA models on the full series and predict the next 140 observations (h = 140).
final_forecast_v02 <- data_s02_v2 %>% Arima(order = c(0,1,1), seasonal = c(0,1,0)) %>%
forecast(h = 140)
final_forecast_v02 %>% autoplot()
final_forecast_v03 <- data_s02_v3 %>% Arima(order = c(1,1,1), seasonal = c(0,1,0)) %>%
forecast(h = 140)
final_forecast_v03 %>% autoplot()
checkresiduals(farima_s02_v02, test = F)
Residual diagnostics show whether the model is a good fit: if the residuals are white noise, we can consider the model a good fit. We use forecast::checkresiduals() and look mainly at the points below (see the Ljung-Box sketch after this list).
Residual plot: the residuals should look like white noise, with no visible pattern.
Residual histogram: the residuals should roughly follow a normal distribution.
ACF plot: the residuals should show little autocorrelation; the more lags that stay inside the 95% confidence bounds, the better the model.
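As a numeric complement (our addition; checkresiduals() above was called with test = F), the Ljung-Box test checks for remaining autocorrelation in the residuals, where a large p-value suggests white noise:
# Ljung-Box test on the Var02 ARIMA residuals; fitdf = 1 for the single MA term
Box.test(residuals(fit_arima_s02_v2), lag = 10, type = "Ljung-Box", fitdf = 1)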
checkresiduals(farima_s02_v03, test = F)
The residual plots above look broadly like white noise, but the ACF shows larger autocorrelation at some lags, which means some relationship between lags remains. The histogram shows that the residuals are approximately normally distributed. This may not be the best possible fit, and with ARIMA it is hard to find the single best model, but we use this model for our forecast.
Export the forecasts for Var02 and Var03 to Excel using write_xlsx() from the writexl package.
writexl::write_xlsx(data.frame(final_forecast_v02$mean), "S02_v2.xlsx")
writexl::write_xlsx(data.frame(final_forecast_v03$mean), "S02_v3.xlsx")
Now use dataset data_06, which belongs to group S06.
As with S02, this dataset has 1,762 records and 3 variables: the first 1,622 rows have observed values and are used to train the models, while the remaining 140 records are the period to be predicted.
In the next set of plots we will see the data distribution for S06 along with Var05 and Var07.
A total of 145 rows have NAs: 140 belong to the forecast period and must be dropped before modeling, and the remaining 5 will have their values imputed.
data_06[!complete.cases(data_06),]
Get subsets for Var05 and Var07. In data_06 every value is NA from SeriesInd 43022 onward.
data_s06_v5 <- data_06 %>% filter(SeriesInd <= 43021) %>% select(date, Var05)
data_s06_v7 <- data_06 %>% filter(SeriesInd <= 43021) %>% select(date, Var07)
summary(data_s06_v5)
## date Var05
## Min. :2010-01-04 Min. : 22.91
## 1st Qu.:2011-08-11 1st Qu.: 30.32
## Median :2013-03-25 Median : 36.87
## Mean :2013-03-23 Mean : 39.85
## 3rd Qu.:2014-10-30 3rd Qu.: 50.47
## Max. :2016-06-13 Max. :195.00
## NA's :5
summary(data_s06_v7)
## date Var07
## Min. :2010-01-04 Min. : 22.88
## 1st Qu.:2011-08-11 1st Qu.: 30.26
## Median :2013-03-25 Median : 36.97
## Mean :2013-03-23 Mean : 39.85
## 3rd Qu.:2014-10-30 3rd Qu.: 50.45
## Max. :2016-06-13 Max. :189.72
## NA's :5
The summaries for both variables show 5 NAs, and the mean and median do not differ much.
Impute NAs using na_interpolation() from package imputeTS.
data_s06_v5 <- imputeTS::na_interpolation(data_s06_v5)
summary(data_s06_v5)
## date Var05
## Min. :2010-01-04 Min. : 22.91
## 1st Qu.:2011-08-11 1st Qu.: 30.32
## Median :2013-03-25 Median : 36.94
## Mean :2013-03-23 Mean : 39.86
## 3rd Qu.:2014-10-30 3rd Qu.: 50.45
## Max. :2016-06-13 Max. :195.00
data_s06_v7 <- imputeTS::na_interpolation(data_s06_v7)
summary(data_s06_v7)
## date Var07
## Min. :2010-01-04 Min. : 22.88
## 1st Qu.:2011-08-11 1st Qu.: 30.26
## Median :2013-03-25 Median : 36.98
## Mean :2013-03-23 Mean : 39.87
## 3rd Qu.:2014-10-30 3rd Qu.: 50.43
## Max. :2016-06-13 Max. :189.72
Convert data_s06_v5 and data_s06_v7 to time series objects. The data again look like daily stock prices, so frequency = 365.
data_s06_v5 <- ts(data_s06_v5$Var05, start = 2010, frequency = 365)
str(data_s06_v5)
## Time-Series [1:1622] from 2010 to 2014: 27 27.3 28 28.1 28.9 ...
data_s06_v7 <- ts(data_s06_v7$Var07, start = 2010, frequency = 365)
str(data_s06_v7)
## Time-Series [1:1622] from 2010 to 2014: 27.3 28.1 28.1 29.1 28.9 ...
The plots below show the distribution, outliers, and trend present in each time series.
autoplot(data_s06_v5) +
geom_line( color="#69b3a2", show.legend = FALSE) + ylab("") +
ggtitle("Var 05")
autoplot(data_s06_v7) +
geom_line(color="#E69F00", show.legend = FALSE) + ylab("") +
ggtitle("Var 07")
Var05 and Var07 both show an upward trend and a single outlier.
ggplot(data_s06_v5, aes(data_s06_v5)) +
geom_histogram(bins=20, fill = "#E69F00") + xlab("") +
labs(title = "Histogram of S06 by Var05")
ggplot(data_s06_v7, aes(data_s06_v7)) +
geom_histogram(bins=20, fill = "#69b3a2") + xlab("") +
labs(title = "Histogram of S06 by Var07")
Neither distribution is normal, and both show an outlier to the right.
ggplot(data_s06_v5, aes(data_s06_v5)) + geom_boxplot(col = "#E69F00") + coord_flip() +
labs(title = "Boxplot of S06 by Var05") + xlab("Var 05")
ggplot(data_s06_v7, aes(data_s06_v7)) + geom_boxplot(col = "#69b3a2") + coord_flip() +
labs(title = "Boxplot of S06 by Var07") + xlab("Var 07")
Fix the outliers using forecast::tsclean().
data_s06_v5 <- forecast::tsclean(ts(data_s06_v5))
autoplot(data_s06_v5) +
geom_line( color="#E69F00", show.legend = FALSE) +
ylab("") + ggtitle("Var 05")
data_s06_v7 <- tsclean(ts(data_s06_v7))
autoplot(data_s06_v7) +
geom_line( color="#69b3a2", show.legend = FALSE) +
ylab("") + ggtitle("Var 07")
The series data_s06_v5 has a trend: its ACF decays slowly and its PACF shows a significant spike at lag 1, so it looks non-stationary. For data_s06_v7 the ACF also decays slowly due to trend and the PACF spikes at lag 1; because of the trend, this series is not stationary either.
ggtsdisplay(data_s06_v5, main="Group S06 - Var 05", ylab="Var05", lag.max = 80)
ggtsdisplay(data_s06_v7, main="Group S06 - Var 07", ylab="Var07", lag.max = 80)
We again apply differencing to make these series stationary.
print(ndiffs(data_s06_v5))
## [1] 1
print(ndiffs(data_s06_v7))
## [1] 1
It shows that the number of differences required for a stationary series is 1.
data_s06_v5 %>% diff() %>% ggtsdisplay(main="Group S06 - Var 05", ylab="Var05", lag.max = 30)
data_s06_v5_diff <- data_s06_v5 %>% diff()
The differenced data look stationary; to confirm, we will apply the Augmented Dickey-Fuller test. The ACF and PACF each have a significant spike at lag 1.
data_s06_v7 %>% diff() %>% ggtsdisplay(main="Group S06 - Var 07", ylab="Var07", lag.max = 30)
data_s06_v7_diff <- data_s06_v7 %>% diff()
ACF and PACF have significant spikes at lag 1.
tseries::adf.test(data_s06_v5_diff)
##
## Augmented Dickey-Fuller Test
##
## data: data_s06_v5_diff
## Dickey-Fuller = -11.661, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
tseries::adf.test(data_s06_v7_diff)
##
## Augmented Dickey-Fuller Test
##
## data: data_s06_v7_diff
## Dickey-Fuller = -11.926, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
The p-value is < 0.05, hence after differencing both series are stationary.
Split the series into train and test sets: train contains 80% of the data, and test contains the remaining 20%.
data_s06_v5_train <- ts(data_s06_v5[1:floor(length(data_s06_v5) * 0.8)], frequency = 365)
data_s06_v5_test <- length(data_s06_v5) - floor(length(data_s06_v5) * 0.8) # held-out count
data_s06_v5_test_ts <- ts(data_s06_v5[(floor(length(data_s06_v5) * 0.8) + 1):length(data_s06_v5)], frequency = 365)
data_s06_v7_train <- ts(data_s06_v7[1:floor(length(data_s06_v7) * 0.8)], frequency = 365)
data_s06_v7_test <- length(data_s06_v7) - floor(length(data_s06_v7) * 0.8) # held-out count
data_s06_v7_test_ts <- ts(data_s06_v7[(floor(length(data_s06_v7) * 0.8) + 1):length(data_s06_v7)], frequency = 365)
We again create two models, ARIMA and ETS.
Apply ARIMA models to both time series. From the ACF and PACF, Var05 suggests ARIMA(1,1,1) or ARIMA(1,1,5). For Var07, p could be 1, 5, or 15, with d = 1 and q of 1, 5, or 15. AICc can again arbitrate among candidates, as sketched below.
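A minimal sketch (our addition) comparing the two Var05 candidates by AICc on the training set, again ignoring the seasonal part:
aicc_111 <- Arima(data_s06_v5_train, order = c(1,1,1))$aicc
aicc_115 <- Arima(data_s06_v5_train, order = c(1,1,5))$aicc
c("ARIMA(1,1,1)" = aicc_111, "ARIMA(1,1,5)" = aicc_115) # lower is better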
#fit_arima_s06_v5 <- data_s06_v5_train %>% auto.arima(stepwise = FALSE)
fit_arima_s06_v5 <- data_s06_v5_train %>% Arima(order = c(1,1,1), seasonal = c(0,1,0))
summary(fit_arima_s06_v5)
## Series: .
## ARIMA(1,1,1)(0,1,0)[365]
##
## Coefficients:
## ar1 ma1
## 0.5110 -0.6220
## s.e. 0.2229 0.2041
##
## sigma^2 estimated as 0.5298: log likelihood=-1027.75
## AIC=2061.5 AICc=2061.53 BIC=2076.01
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01227903 0.6159914 0.4063471 0.02605054 1.132408 0.04527921
## ACF1
## Training set -0.01848887
farima_s06_v05 <- forecast(fit_arima_s06_v5, h = data_s06_v5_test)
predictions_test_v05 <- ts(farima_s06_v05$mean, frequency = 365)
data_s06_v5_test_ts %>% autoplot(series = "Actuals") +
autolayer(predictions_test_v05)
accuracy(as.numeric(farima_s06_v05$mean), as.numeric(data_s06_v5_test_ts))
## ME RMSE MAE MPE MAPE
## Test set -7.919303 10.2856 8.15498 -15.70893 16.13025
#fit_arima_s06_v7 <- data_s06_v7_train %>% auto.arima(stepwise = FALSE)
fit_arima_s06_v7 <- data_s06_v7_train %>% Arima(order = c(1,1,1), seasonal = c(0,1,0))
summary(fit_arima_s06_v7)
## Series: .
## ARIMA(1,1,1)(0,1,0)[365]
##
## Coefficients:
## ar1 ma1
## -0.8302 0.7490
## s.e. 0.0903 0.1075
##
## sigma^2 estimated as 0.5376: log likelihood=-1034.65
## AIC=2075.3 AICc=2075.32 BIC=2089.81
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.009945213 0.6205616 0.4062232 0.02062351 1.132192 0.045178
## ACF1
## Training set -0.04199531
farima_s06_v07 <- forecast(fit_arima_s06_v7, h = data_s06_v7_test)
predictions_test_v07 <- ts(farima_s06_v07$mean, frequency = 365)
data_s06_v7_test_ts %>% autoplot(series = "Actuals") +
autolayer(predictions_test_v07)
accuracy(as.numeric(farima_s06_v07$mean), as.numeric(data_s06_v7_test_ts))
## ME RMSE MAE MPE MAPE
## Test set -7.628729 10.04393 7.942482 -15.14514 15.70505
Fit ETS models on the training data and forecast over the test horizon for Var05 and Var07.
fets_s06_v05 <- data_s06_v5_train %>% ets() %>%
forecast(h = data_s06_v5_test)
fets_s06_v07 <- data_s06_v7_train %>% ets() %>%
forecast(h = data_s06_v7_test)
# As with S02, measure the ETS forecasts against the held-out series
# (data_s06_v5_test is only a count, not the test data).
accuracy(as.numeric(fets_s06_v05$mean), as.numeric(data_s06_v5_test_ts))
accuracy(as.numeric(fets_s06_v07$mean), as.numeric(data_s06_v7_test_ts))
Comparing ARIMA and ETS on the held-out data, ARIMA again performed better for both Var05 and Var07.
Refit on the full series and predict the next 140 observations using the ARIMA models.
final_forecast_v05 <- ts(data_s06_v5, start = 2010, frequency = 365) %>% Arima(order = c(1,1,1), seasonal = c(0,1,0)) %>%
forecast(h = 140)
final_forecast_v05 %>% autoplot()
final_forecast_v07 <- ts(data_s06_v7, start = 2010, frequency = 365) %>% Arima(order = c(1,1,1), seasonal = c(0,1,0)) %>%
forecast(h = 140)
final_forecast_v07 %>% autoplot()
checkresiduals(farima_s06_v05, test = F)
checkresiduals(farima_s06_v07, test = F)
The residual plots above look broadly like white noise, though the ACF shows larger autocorrelation at some lags, meaning some relationship between lags remains. The histograms show approximately normally distributed residuals. As with S02, a Ljung-Box test on the residuals could quantify this.
Export the forecasts for Var05 and Var07 to Excel using write_xlsx() from the writexl package.
writexl::write_xlsx(data.frame(final_forecast_v05$mean), "S06_v5.xlsx")
writexl::write_xlsx(data.frame(final_forecast_v07$mean), "S06_v7.xlsx")