Problem Definition
In the xtsdata (csv file) there are 5 columns.
The aim is to use the columns (RUB_sol and MFA_sol) and predict 30 data points each for the two columns. Auto Regressive Integrated Moving Average
- Visualize Time Series
- Check Stationarity
- Plot ACF / PACF Charts
- Build ARIMA Model
- Make Forecast
Data Location
The data is available as a csv file named xtsdata.csv
Data Description
The data consists of 5 columns.
time15
RUB_sol
MFA_sol
NFA_sol
NFY_sol
SFY_baro_air
NFA_baro_air
Setup
Load Libs
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(tseries)
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(forecast)
library(quantmod)
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(devtools)
library(ggfortify)
## Loading required package: ggplot2
##
## Attaching package: 'ggfortify'
## The following object is masked from 'package:forecast':
##
## gglagplot
Read Data
setwd("D:/Welingkar/Trim 4/Machine Learning/Datasets/Lecture 6-7")
dfrModel <- read.csv("./xtsdata.csv", header=T, stringsAsFactors=F)
head(dfrModel)
## time15 RUB_sol MFA_sol NFA_sol NFY_sol SFY_baro_air NFA_baro_air
## 1 4/30/2012 0:15 11.929 12.689 11.26 8.19 9.43 13.134
## 2 4/30/2012 0:30 11.879 12.627 11.28 8.18 9.25 12.925
## 3 4/30/2012 0:45 11.828 12.570 11.26 8.21 9.03 12.736
## 4 4/30/2012 1:00 11.779 12.511 11.23 8.22 8.82 12.605
## 5 4/30/2012 1:15 11.730 12.459 11.17 8.23 8.60 12.455
## 6 4/30/2012 1:30 11.682 12.397 11.15 8.24 8.42 12.335
Create Extensible Time Series
names(dfrModel)
## [1] "time15" "RUB_sol" "MFA_sol" "NFA_sol"
## [5] "NFY_sol" "SFY_baro_air" "NFA_baro_air"
## This is done to prevent date being read as a string. xts needs it to be a date object.
dfrModel$time15 <- as.POSIXlt(dfrModel$time15,format="%m/%d/%Y %H:%M")
xtsData <- xts(dfrModel, order.by=dfrModel$time15)
head(xtsData)
## time15 RUB_sol MFA_sol NFA_sol
## 2012-04-30 00:15:00 "2012-04-30 00:15:00" "11.929" "12.689" "11.26"
## 2012-04-30 00:30:00 "2012-04-30 00:30:00" "11.879" "12.627" "11.28"
## 2012-04-30 00:45:00 "2012-04-30 00:45:00" "11.828" "12.570" "11.26"
## 2012-04-30 01:00:00 "2012-04-30 01:00:00" "11.779" "12.511" "11.23"
## 2012-04-30 01:15:00 "2012-04-30 01:15:00" "11.730" "12.459" "11.17"
## 2012-04-30 01:30:00 "2012-04-30 01:30:00" "11.682" "12.397" "11.15"
## NFY_sol SFY_baro_air NFA_baro_air
## 2012-04-30 00:15:00 " 8.19" " 9.43" "13.134"
## 2012-04-30 00:30:00 " 8.18" " 9.25" "12.925"
## 2012-04-30 00:45:00 " 8.21" " 9.03" "12.736"
## 2012-04-30 01:00:00 " 8.22" " 8.82" "12.605"
## 2012-04-30 01:15:00 " 8.23" " 8.60" "12.455"
## 2012-04-30 01:30:00 " 8.24" " 8.42" "12.335"
Creating two separate Extensible Time Series for the two columns “RUB_sol”, “MFA_sol”
Creating Extensible Time Series for RUB_sol
xtsRUB_sol <-xts(dfrModel$RUB_sol,order.by =dfrModel$time15)
names(xtsRUB_sol)[1] <-paste("RUB_sol")
class(xtsRUB_sol)
## [1] "xts" "zoo"
head(xtsRUB_sol)
## RUB_sol
## 2012-04-30 00:15:00 11.929
## 2012-04-30 00:30:00 11.879
## 2012-04-30 00:45:00 11.828
## 2012-04-30 01:00:00 11.779
## 2012-04-30 01:15:00 11.730
## 2012-04-30 01:30:00 11.682
Creating Extensible Time Series for MFA_sol
xtsMFA_sol <-xts(dfrModel$MFA_sol,order.by =dfrModel$time15)
names(xtsMFA_sol)[1] <-paste("MFA_sol")
class(xtsMFA_sol)
## [1] "xts" "zoo"
head(xtsMFA_sol)
## MFA_sol
## 2012-04-30 00:15:00 12.689
## 2012-04-30 00:30:00 12.627
## 2012-04-30 00:45:00 12.570
## 2012-04-30 01:00:00 12.511
## 2012-04-30 01:15:00 12.459
## 2012-04-30 01:30:00 12.397
Dealing first with xtsRUB_sol
Xts Info
cat("\nSummary:\n")
##
## Summary:
summary(xtsRUB_sol)
## Index RUB_sol
## Min. :2012-04-30 00:15:00 Min. : 9.489
## 1st Qu.:2012-05-15 21:11:15 1st Qu.:13.906
## Median :2012-05-31 18:07:30 Median :16.043
## Mean :2012-05-31 18:07:30 Mean :15.956
## 3rd Qu.:2012-06-16 15:03:45 3rd Qu.:17.971
## Max. :2012-07-02 12:00:00 Max. :22.439
cat("\nStart:\n")
##
## Start:
start(xtsRUB_sol)
## [1] "2012-04-30 00:15:00 IST"
cat("\nEnds:\n")
##
## Ends:
end(xtsRUB_sol)
## [1] "2012-07-02 12:00:00 IST"
cat("\nFreq:\n")
##
## Freq:
frequency(xtsRUB_sol)
## [1] 0.001111111
cat("\nIndex:\n")
##
## Index:
head(index(xtsRUB_sol))
## [1] "2012-04-30 00:15:00 IST" "2012-04-30 00:30:00 IST"
## [3] "2012-04-30 00:45:00 IST" "2012-04-30 01:00:00 IST"
## [5] "2012-04-30 01:15:00 IST" "2012-04-30 01:30:00 IST"
cat("\nPeriodicity:\n")
##
## Periodicity:
periodicity(xtsRUB_sol)
## 15 minute periodicity from 2012-04-30 00:15:00 to 2012-07-02 12:00:00
cat("\nYearly OHLC:\n")
##
## Yearly OHLC:
to.yearly(xtsRUB_sol)
## xtsRUB_sol.Open xtsRUB_sol.High xtsRUB_sol.Low xtsRUB_sol.Close
## 2012-07-02 11.929 22.439 9.489 20.55
cat("\nYearly Mean:\n")
##
## Yearly Mean:
lapply(split(xtsRUB_sol,f="years"),FUN=mean)
## [[1]]
## [1] 15.95573
cat("\nQuarterly OHLC:\n")
##
## Quarterly OHLC:
head(to.quarterly(xtsRUB_sol))
## xtsRUB_sol.Open xtsRUB_sol.High xtsRUB_sol.Low xtsRUB_sol.Close
## 2012 Q2 11.929 22.439 9.489 18.997
## 2012 Q3 18.960 21.896 17.883 20.550
cat("\nQuarterly Mean:\n")
##
## Quarterly Mean:
head(lapply(split(xtsRUB_sol,f="quarters"),FUN=mean))
## [[1]]
## [1] 15.87123
##
## [[2]]
## [1] 19.42347
cat("\nMonthly OHLC:\n")
##
## Monthly OHLC:
head(to.monthly(xtsRUB_sol))
## xtsRUB_sol.Open xtsRUB_sol.High xtsRUB_sol.Low xtsRUB_sol.Close
## Apr 2012 11.929 13.665 10.914 12.589
## May 2012 12.546 18.891 9.489 17.254
## Jun 2012 17.207 22.439 12.995 18.997
## Jul 2012 18.960 21.896 17.883 20.550
cat("\nMonthly Mean:\n")
##
## Monthly Mean:
head(lapply(split(xtsRUB_sol,f="months"),FUN=mean))
## [[1]]
## [1] 12.27673
##
## [[2]]
## [1] 14.09251
##
## [[3]]
## [1] 17.82781
##
## [[4]]
## [1] 19.42347
Plot xts
autoplot(xtsRUB_sol, ts.colour='blue') +
labs(title="Times Series") +
labs(x="Date and Month") +
labs(y="RUB_sol")
Observation
1. Graph is shown as having frequent fluctuations in data.
2. This is not like a stationary data.
3. As there are no seasonality so we will not decompose the data.
ADF Test
# Augmented Dickey-Fuller Test
adf.test(xtsRUB_sol, alternative="stationary", k=0)
##
## Augmented Dickey-Fuller Test
##
## data: xtsRUB_sol
## Dickey-Fuller = -1.9731, Lag order = 0, p-value = 0.5898
## alternative hypothesis: stationary
Observation
Null Hypothesis: Data is not stationary Alternative Hypothesis: Data is stationary
As we can see that P value is more than 0.05 so at 5% Level of Significance it is not able to reject Null Hypothesis.
This implies the data is not stationary.
Plot ACF- Auto Correlation Function
# Auto Correlation Function
autoplot(acf(xtsRUB_sol, plot = FALSE))
Observation
This plot should be greater than zero.
This displays the ACF graph as mentioned by ARIMA with values above the zero line.
Plot PACF- Partial Auto Correlation Function
#acf(diff(log(xtsD1)))
autoplot(pacf(xtsRUB_sol, plot = FALSE))
Observation
This plot should be below zero as well as above zero which is fulfilling ARIMA model properties.
This displays the ACF graph as mentioned by ARIMA with values below the zero line.
ARIMA
ARIMA (Auto Regressive Integrated Moving Averages)
- ARIMA Models Provide Another Approach To Time Series Forecasting
- ARIMA Models Are, In Theory, The Most General Class Of Models For Forecasting A Time Series.
- Arima Models Aim To Describe The Autocorrelations In The Data
- Pre Requisite Of ARIMA Is That The Time Series Needs To Be Stationary.
- A Stationary Series Has No Trend, Its Variations Around Its Mean Have A Constant Amplitude, And It Wiggles In A Consistent Fashion,
I.E., Its Short-term Random Time Patterns Always Look The Same In A Statistical Sense
- This is a modeling approach that can be used to calculate the probability of future value lying between two specified limits
THE MAIN ASSUMPTION IN ARIMA TECHNIQUES IS THAT THE DATA MUST BE STATIONARY
Unless Time Series Is Stationary, An ARIMA Model Can Not Be Built
Conclusion from the tests:
ADF, ACF and PACF were used to check if the data was stationery.
Observations were:
- ADF indicated a p value greater than 0.05 (ideally should be less than 0.05)
- ACF and PACF behaved like the expected graphs for ARIMA.
Ideally, ADF is the main criteria to decide whether the data is stationary or not. ACF and PACF act as a support.
In reality if ADF is greater than 0.05 then ARIMA should be avoided.
For Academic purpose we are going with ARIMA model for practive even data is not stationary.
Make ARIMA Model
# get arima model (find best model)
armModel <- auto.arima(xtsRUB_sol)
armModel
## Series: xtsRUB_sol
## ARIMA(5,1,1)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1
## 1.9571 -1.0185 0.1645 -0.1499 0.0409 -0.9766
## s.e. 0.0131 0.0282 0.0309 0.0282 0.0130 0.0029
##
## sigma^2 estimated as 0.0003703: log likelihood=15431.69
## AIC=-30849.37 AICc=-30849.36 BIC=-30802.37
Forecast Using ARIMA Model
# forecast using
fcData <- forecast(armModel,h=30)
fcData
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 5486401 20.75225 20.72759 20.77691 20.71453 20.78996
## 5487301 20.94708 20.89236 21.00179 20.86340 21.03075
## 5488201 21.13238 21.04270 21.22205 20.99523 21.26953
## 5489101 21.30777 21.17789 21.43764 21.10914 21.50640
## 5490001 21.47257 21.29815 21.64698 21.20582 21.73931
## 5490901 21.62601 21.40354 21.84849 21.28577 21.96626
## 5491801 21.76750 21.49399 22.04101 21.34920 22.18580
## 5492701 21.89651 21.56949 22.22354 21.39637 22.39666
## 5493601 22.01261 21.63006 22.39516 21.42755 22.59767
## 5494501 22.11543 21.67577 22.55508 21.44303 22.78782
## 5495401 22.20469 21.70674 22.70263 21.44315 22.96623
## 5496301 22.28020 21.72316 22.83724 21.42828 23.13212
## 5497201 22.34186 21.72526 22.95846 21.39886 23.28486
## 5498101 22.38964 21.71337 23.06591 21.35537 23.42390
## 5499001 22.42358 21.68783 23.15933 21.29835 23.54882
## 5499901 22.44382 21.64908 23.23857 21.22837 23.65928
## 5500801 22.45057 21.59760 23.30354 21.14607 23.75508
## 5501701 22.44410 21.53393 23.35427 21.05211 23.83609
## 5502601 22.42475 21.45864 23.39087 20.94721 23.90230
## 5503501 22.39294 21.37236 23.41352 20.83210 23.95378
## 5504401 22.34914 21.27577 23.42251 20.70756 23.99072
## 5505301 22.29388 21.16957 23.41818 20.57440 24.01336
## 5506201 22.22773 21.05449 23.40097 20.43342 24.02205
## 5507101 22.15134 20.93131 23.37137 20.28547 24.01721
## 5508001 22.06537 20.80082 23.32993 20.13140 23.99935
## 5508901 21.97055 20.66381 23.27729 19.97206 23.96903
## 5509801 21.86761 20.52110 23.21411 19.80831 23.92690
## 5510701 21.75732 20.37353 23.14112 19.64100 23.87365
## 5511601 21.64051 20.22193 23.05908 19.47098 23.81003
## 5512501 21.51796 20.06711 22.96882 19.29908 23.73685
Plot Forecast Using ARIMA Model
autoplot(fcData)
Observation
Blue colour data lines are showing the forecasted values.
Dealing with xtsMFA_sol
Xts Info
cat("\nSummary:\n")
##
## Summary:
summary(xtsMFA_sol)
## Index MFA_sol
## Min. :2012-04-30 00:15:00 Min. :10.54
## 1st Qu.:2012-05-15 21:11:15 1st Qu.:13.30
## Median :2012-05-31 18:07:30 Median :15.09
## Mean :2012-05-31 18:07:30 Mean :14.92
## 3rd Qu.:2012-06-16 15:03:45 3rd Qu.:16.45
## Max. :2012-07-02 12:00:00 Max. :19.71
cat("\nStart:\n")
##
## Start:
start(xtsMFA_sol)
## [1] "2012-04-30 00:15:00 IST"
cat("\nEnds:\n")
##
## Ends:
end(xtsMFA_sol)
## [1] "2012-07-02 12:00:00 IST"
cat("\nFreq:\n")
##
## Freq:
frequency(xtsMFA_sol)
## [1] 0.001111111
cat("\nIndex:\n")
##
## Index:
head(index(xtsMFA_sol))
## [1] "2012-04-30 00:15:00 IST" "2012-04-30 00:30:00 IST"
## [3] "2012-04-30 00:45:00 IST" "2012-04-30 01:00:00 IST"
## [5] "2012-04-30 01:15:00 IST" "2012-04-30 01:30:00 IST"
cat("\nPeriodicity:\n")
##
## Periodicity:
periodicity(xtsMFA_sol)
## 15 minute periodicity from 2012-04-30 00:15:00 to 2012-07-02 12:00:00
cat("\nYearly OHLC:\n")
##
## Yearly OHLC:
to.yearly(xtsMFA_sol)
## xtsMFA_sol.Open xtsMFA_sol.High xtsMFA_sol.Low xtsMFA_sol.Close
## 2012-07-02 12.689 19.709 10.541 17.819
cat("\nYearly Mean:\n")
##
## Yearly Mean:
lapply(split(xtsMFA_sol,f="years"),FUN=mean)
## [[1]]
## [1] 14.92437
cat("\nQuarterly OHLC:\n")
##
## Quarterly OHLC:
head(to.quarterly(xtsMFA_sol))
## xtsMFA_sol.Open xtsMFA_sol.High xtsMFA_sol.Low xtsMFA_sol.Close
## 2012 Q2 12.689 19.709 10.541 16.295
## 2012 Q3 16.252 18.691 15.603 17.819
cat("\nQuarterly Mean:\n")
##
## Quarterly Mean:
head(lapply(split(xtsMFA_sol,f="quarters"),FUN=mean))
## [[1]]
## [1] 14.87504
##
## [[2]]
## [1] 16.94893
cat("\nMonthly OHLC:\n")
##
## Monthly OHLC:
head(to.monthly(xtsMFA_sol))
## xtsMFA_sol.Open xtsMFA_sol.High xtsMFA_sol.Low xtsMFA_sol.Close
## Apr 2012 12.689 13.855 11.579 12.769
## May 2012 12.701 17.203 10.541 15.834
## Jun 2012 15.796 19.709 13.883 16.295
## Jul 2012 16.252 18.691 15.603 17.819
cat("\nMonthly Mean:\n")
##
## Monthly Mean:
head(lapply(split(xtsMFA_sol,f="months"),FUN=mean))
## [[1]]
## [1] 12.69096
##
## [[2]]
## [1] 13.41241
##
## [[3]]
## [1] 16.45846
##
## [[4]]
## [1] 16.94893
Plot xts
autoplot(xtsMFA_sol, ts.colour='blue') +
labs(title="Times Series Plot") +
labs(x="Date and Month") +
labs(y="MFA_sol")
Observation
1. Graph is shown as having frequent fluctuations in data.
2. This is not like a stationary data.
3. As there are no seasonality so we will not decompose the data.
ADF Test
# Augmented Dickey-Fuller Test
adf.test(xtsMFA_sol, alternative="stationary", k=0)
##
## Augmented Dickey-Fuller Test
##
## data: xtsMFA_sol
## Dickey-Fuller = -2.0628, Lag order = 0, p-value = 0.5517
## alternative hypothesis: stationary
Observation
Null Hypothesis: Data is not stationary Alternative Hypothesis: Data is stationary
As we can see that P value is more than 0.05 so at 5% Level of Significance it is not able to reject Null Hypothesis.
This implies the data is not stationary.
Plot ACF
# Auto Correlation Function
autoplot(acf(xtsMFA_sol, plot = FALSE))
Observation
This plot should be greater than zero.
This displays the ACF graph as mentioned by ARIMA with values above the zero line.
Plot PACF
#acf(diff(log(xtsD1)))
autoplot(pacf(xtsMFA_sol, plot = FALSE))
Observation
This plot should be below zero as well as above zero which is fulfilling ARIMA model properties.
This displays the ACF graph as mentioned by ARIMA with values below the zero line.
ARIMA
ARIMA (Auto Regressive Integrated Moving Averages)
- ARIMA Models Provide Another Approach To Time Series Forecasting
- ARIMA Models Are, In Theory, The Most General Class Of Models For Forecasting A Time Series.
- Arima Models Aim To Describe The Autocorrelations In The Data
- Pre Requisite Of ARIMA Is That The Time Series Needs To Be Stationary.
- A Stationary Series Has No Trend, Its Variations Around Its Mean Have A Constant Amplitude, And It Wiggles In A Consistent Fashion,
I.E., Its Short-term Random Time Patterns Always Look The Same In A Statistical Sense
- This is a modeling approach that can be used to calculate the probability of future value lying between two specified limits
THE MAIN ASSUMPTION IN ARIMA TECHNIQUES IS THAT THE DATA MUST BE STATIONARY
Unless Time Series Is Stationary, An ARIMA Model Can Not Be Built
Conclusion from the tests:
ADF, ACF and PACF were used to check if the data was stationery.
Observations were:
- ADF indicated a p value greater than 0.05 (ideally should be less than 0.05)
- ACF and PACF behaved like the expected graphs for ARIMA.
Ideally, ADF is the main criteria to decide whether the data is stationary or not. ACF and PACF act as a support. In reality if ADF is greater than 0.05 then ARIMA should be avoided.
For Academic purpose we are going with ARIMA model for practive even data is not stationary.
Make ARIMA Model
# get arima model (find best model)
armModel1 <- auto.arima(xtsMFA_sol)
armModel1
## Series: xtsMFA_sol
## ARIMA(4,1,4)
##
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs produced
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 ma4
## 0.4867 0.7978 -0.3897 -0.0122 0.1243 -0.4434 0.2734 0.1057
## s.e. NaN NaN NaN NaN NaN 0.0442 NaN NaN
##
## sigma^2 estimated as 0.0009886: log likelihood=12441.15
## AIC=-24864.31 AICc=-24864.28 BIC=-24803.87
Forecast Using ARIMA Model
# forecast using
fcData1 <- forecast(armModel1,h=30)
fcData1
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 5486401 17.89277 17.85248 17.93307 17.83115 17.95440
## 5487301 17.95961 17.88321 18.03601 17.84276 18.07645
## 5488201 18.02245 17.90350 18.14141 17.84053 18.20438
## 5489101 18.08158 17.91339 18.24977 17.82435 18.33881
## 5490001 18.13355 17.90982 18.35729 17.79138 18.47572
## 5490901 18.18072 17.89786 18.46358 17.74812 18.61331
## 5491801 18.22133 17.87652 18.56615 17.69398 18.74868
## 5492701 18.25775 17.84990 18.66561 17.63399 18.88151
## 5493601 18.28887 17.81725 18.76050 17.56758 19.01016
## 5494501 18.31667 17.78157 18.85178 17.49830 19.13505
## 5495401 18.34034 17.74214 18.93855 17.42547 19.25522
## 5496301 18.36147 17.70115 19.02180 17.35160 19.37135
## 5497201 18.37943 17.65793 19.10093 17.27600 19.48287
## 5498101 18.39547 17.61408 19.17686 17.20044 19.59050
## 5499001 18.40908 17.56900 19.24915 17.12430 19.69386
## 5499901 18.42124 17.52388 19.31860 17.04884 19.79364
## 5500801 18.43155 17.47819 19.38490 16.97351 19.88958
## 5501701 18.44077 17.43281 19.44873 16.89923 19.98231
## 5502601 18.44858 17.38732 19.50984 16.82552 20.07163
## 5503501 18.45557 17.34235 19.56879 16.75305 20.15809
## 5504401 18.46148 17.29755 19.62542 16.68140 20.24157
## 5505301 18.46679 17.25339 19.68018 16.61106 20.32251
## 5506201 18.47126 17.20959 19.73294 16.54170 20.40082
## 5507101 18.47528 17.16650 19.78407 16.47367 20.47690
## 5508001 18.47868 17.12387 19.83348 16.40668 20.55067
## 5508901 18.48172 17.08197 19.88148 16.34098 20.62246
## 5509801 18.48429 17.04059 19.92799 16.27635 20.69224
## 5510701 18.48660 16.99995 19.97326 16.21296 20.76025
## 5511601 18.48855 16.95985 20.01724 16.15061 20.82648
## 5512501 18.49030 16.92047 20.06013 16.08945 20.89115
Plot Forecast Using ARIMA Model
autoplot(fcData1)
Observation
Blue colour data lines are showing the forecasted values.