P1
Requirement #1: You will turn in a written report. Write this report as if it will be routed around an office to personnel of very different backgrounds. You need to reach readers ranging from those with no data science background to full-fledged data scientists. So, explain what you have done, why, and how you did it, using both layman and technical terminology. Do not write this simply with me in mind. Visuals and output are expected, but it is not necessary to include every bit of analysis. In fact, a terse report with simple terminology is preferred to throwing everything into a long, ad nauseam report. Storytelling is really taking off in data science, so please flex your muscles here. The report is part 1 of the 2 requirements.
NOTE: We have covered a lot of material. I don’t want you to try every method to create your forecast. But you should perform fundamentals, visualize your data, and perform exploratory data analysis as appropriate.
Requirement #2: Your second requirement is to produce forecasts. This workbook will contain at least 6 sheets on which I will calculate your error rates. There will be one sheet (tab) for each group - S01, S02, S03, S04, S05, S06. You should order each sheet by the variable SeriesInd (low to high). Your source data is sorted this way, except that all 6 groups are present in one sheet, which you must break out into 6 tabs (a small sketch of this split-and-export step follows the variable list below). You will submit the data I sent AND the forward forecast for 140 periods. I want you to forecast the following:
- S01 - Forecast Var01, Var02
- S02 - Forecast Var02, Var03
- S03 - Forecast Var05, Var07
- S04 - Forecast Var01, Var02
- S05 - Forecast Var02, Var03
- S06 - Forecast Var05, Var07
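The following is only a minimal sketch of how the split-and-export step could be done with the xlsx package already used later in this report; "forecast_workbook.xlsx" is a placeholder file name, and the assumption is that each sheet would also carry the 140-period forward forecast appended to the raw data.
library(dplyr)
library(xlsx)
# Sketch only: one tab per group, each ordered by SeriesInd (low to high)
for (g in c("S01", "S02", "S03", "S04", "S05", "S06")) {
  sheet_dat <- full_data %>%
    filter(group == g) %>%
    arrange(SeriesInd)
  xlsx::write.xlsx(sheet_dat,
                   file = "forecast_workbook.xlsx",
                   sheetName = g,
                   row.names = FALSE,
                   append = (g != "S01"))  # first write creates the file, later writes add tabs
}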
# For simplicity, let's convert SeriesInd to a date by setting the date origin to 1900-01-01.
print(paste("Start Of Data:",as.Date(min(full_data$SeriesInd) , origin = "1900-01-01")))## [1] "Start Of Data: 2011-05-08"
## [1] "End Of Data: 2018-05-03"
print(paste("Forecast from :",as.Date(max(full_data$SeriesInd)+1 , origin = "1900-01-01")," TO ",as.Date(max(full_data$SeriesInd)+140 , origin = "1900-01-01")))## [1] "Forecast from : 2018-05-04 TO 2018-09-20"
# Get the full date range for the data
allDates <- seq.Date(
min(full_data$SeriesInd),
max(full_data$SeriesInd),
"day")
# Create NA rows for the dates that are missing from the full data.
full_data_na <- left_join(as.data.frame(allDates),full_data,by=c("allDates"= "SeriesInd"))
Here we break the data out by each group and variable:
- Keep only the date and the variable's value
- Merge with the full range of dates to see if there are any NAs in the data (a compact helper-function alternative is sketched below; the explicit per-series assignments follow for clarity)
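As a sketch only (not used by the chunks that follow), the same extraction could be wrapped in a small helper; make_series() is a hypothetical name and assumes SeriesInd in full_data has already been converted to Date.
library(dplyr)
# Hypothetical helper (sketch): pull one group/variable pair and left-join it onto
# the full daily date range so that missing days appear as NA.
make_series <- function(dat, grp, var, all_dates) {
  dat %>%
    filter(group == grp) %>%
    select(SeriesInd, all_of(var)) %>%
    left_join(data.frame(allDates = all_dates), ., by = c("allDates" = "SeriesInd"))
}
# Example: equivalent to dt_s01_v1 created below
# dt_s01_v1 <- make_series(full_data, "S01", "Var01", allDates)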
# Create a subset of dates for each group and variable.
dt_s01_v1 = subset(full_data[,c(1,2,3)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s01_v2 = subset(full_data[,c(1,2,4)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s01_v3 = subset(full_data[,c(1,2,5)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s01_v5 = subset(full_data[,c(1,2,6)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s01_v7 = subset(full_data[,c(1,2,7)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s02_v1 = subset(full_data[,c(1,2,3)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s02_v2= subset(full_data[,c(1,2,4)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s02_v3= subset(full_data[,c(1,2,5)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s02_v5= subset(full_data[,c(1,2,6)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s02_v7= subset(full_data[,c(1,2,7)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s03_v1 = subset(full_data[,c(1,2,3)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s03_v2 = subset(full_data[,c(1,2,4)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s03_v3 = subset(full_data[,c(1,2,5)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s03_v5 = subset(full_data[,c(1,2,6)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s03_v7 = subset(full_data[,c(1,2,7)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s04_v1 = subset(full_data[,c(1,2,3)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s04_v2= subset(full_data[,c(1,2,4)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s04_v3= subset(full_data[,c(1,2,5)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s04_v5= subset(full_data[,c(1,2,6)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s04_v7= subset(full_data[,c(1,2,7)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s05_v1 = subset(full_data[,c(1,2,3)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s05_v2 = subset(full_data[,c(1,2,4)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s05_v3 = subset(full_data[,c(1,2,5)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s05_v5 = subset(full_data[,c(1,2,6)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s05_v7 = subset(full_data[,c(1,2,7)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s06_v1 = subset(full_data[,c(1,2,3)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s06_v2 = subset(full_data[,c(1,2,4)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s06_v3 = subset(full_data[,c(1,2,5)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s06_v5 = subset(full_data[,c(1,2,6)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s06_v7 = subset(full_data[,c(1,2,7)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
# Plot Var01 for all the groups except S06
subset(full_data_na, group != "S06") %>%
ggplot(aes(x=allDates, y= Var01,fill=group)) +
geom_line(aes(colour=group)) +
ggtitle("Var01 : For all the Groups in series") +
xlab("Year") +
ylab("Value")
# Plot the full data with all groups and see how the data looks.
full_data_na %>%
ggplot(aes(x=allDates, y= Var01,fill=group)) +
geom_line(aes(colour=group)) +
ggtitle("Var01 : For all the Groups in series") +
xlab("Year") +
ylab("Value")full_data_na %>% ggplot(aes(x=allDates, fill=group,colour=group)) +
geom_line(aes(y= Var02)) +
ggtitle("Var02 : For all the Groups in series") +
xlab("Year") +
ylab("Value")full_data_na %>% ggplot(aes(x=allDates, fill=group,colour=group)) +
geom_line(aes(y= Var03)) +
ggtitle("Var03 : For all the Groups in series") +
xlab("Year") +
ylab("Value")full_data_na %>% ggplot(aes(x=allDates, fill=group,colour=group)) +
geom_line(aes(y= Var05)) +
ggtitle("Var5 : For all the Groups in series") +
xlab("Year") +
ylab("Value")full_data_na %>% ggplot(aes(x=allDates, fill=group,colour=group)) +
geom_line(aes(y= Var07)) +
ggtitle("Var07 : For all the Groups in series") +
xlab("Year") +
ylab("Value")subset(full_data_na[,], group == "S01") %>% ggplot(aes(x=allDates)) +
geom_line(aes(y= Var02,color="2")) +
geom_line(aes(y= Var01,color="1")) +
geom_line(aes(y= Var03,color="3")) +
geom_line(aes(y= Var05,color="4")) +
geom_line(aes(y= Var07,color="5")) +
ggtitle("Group S01: All Variable") +
xlab("Year") +
ylab("Value")subset(full_data_na[,], group == "S01") %>% ggplot(aes(x=allDates)) +
geom_line(aes(y= Var01,color="1")) +
geom_line(aes(y= Var03,color="3")) +
geom_line(aes(y= Var05,color="4")) +
geom_line(aes(y= Var07,color="5")) +
ggtitle("Group S01: All Variable -V02") +
xlab("Year") +
ylab("Value")full_data_na %>% ggplot(aes(x=group, y= Var01,fill=group)) + geom_boxplot(position = "dodge") + theme(axis.text.x = element_text(angle = 30, hjust = 1)) + labs(title = "Boxplot of Series by Group ")+ ylim(0,200) + xlab( "Group") We can very clearly see that we have some outlier in Series S06 and Series S02.We will validate this with latter using which.max(
## allDates group Var01 Var02
## Min. :2011-05-08 S01 :1762 Min. : 9.03 Min. : 1339900
## 1st Qu.:2013-02-01 S02 :1762 1st Qu.: 23.10 1st Qu.: 12520675
## Median :2014-11-04 S03 :1762 Median : 38.44 Median : 21086550
## Mean :2014-11-03 S04 :1762 Mean : 46.98 Mean : 37035741
## 3rd Qu.:2016-08-05 S05 :1762 3rd Qu.: 66.78 3rd Qu.: 42486700
## Max. :2018-05-03 S06 :1762 Max. :195.18 Max. :480879500
## NA's: 791 NA's :1645 NA's :1633
## Var03 Var05 Var07
## Min. : 8.82 Min. : 8.99 Min. : 8.92
## 1st Qu.: 22.59 1st Qu.: 22.91 1st Qu.: 22.88
## Median : 37.66 Median : 38.05 Median : 38.05
## Mean : 46.12 Mean : 46.55 Mean : 46.56
## 3rd Qu.: 65.88 3rd Qu.: 66.38 3rd Qu.: 66.31
## Max. :189.36 Max. :195.00 Max. :189.72
## NA's :1657 NA's :1657 NA's :1657
## 'data.frame': 11363 obs. of 7 variables:
## $ allDates: Date, format: "2011-05-08" "2011-05-08" ...
## $ group : Factor w/ 6 levels "S01","S02","S03",..: 3 2 1 6 5 4 3 2 1 6 ...
## $ Var01 : num 30.6 10.3 26.6 27.5 69.3 ...
## $ Var02 : num 1.23e+08 6.09e+07 1.04e+07 3.93e+07 2.78e+07 ...
## $ Var03 : num 30.3 10.1 25.9 26.8 68.2 ...
## $ Var05 : num 30.5 10.2 26.2 27 68.7 ...
## $ Var07 : num 30.6 10.3 26 27.3 69.2 ...
## allDates Var01
## Min. :2011-05-08 Min. :23.01
## 1st Qu.:2013-02-04 1st Qu.:29.85
## Median :2014-11-04 Median :35.66
## Mean :2014-11-04 Mean :39.41
## 3rd Qu.:2016-08-03 3rd Qu.:48.70
## Max. :2018-05-03 Max. :62.31
## NA's :933
## allDates Var02
## Min. :2011-05-08 Min. : 1339900
## 1st Qu.:2013-02-04 1st Qu.: 5347550
## Median :2014-11-04 Median : 7895050
## Mean :2014-11-04 Mean : 8907092
## 3rd Qu.:2016-08-03 3rd Qu.:11321675
## Max. :2018-05-03 Max. :48477500
## NA's :931
## allDates Var03
## Min. :2011-05-08 Min. :22.28
## 1st Qu.:2013-02-04 1st Qu.:29.33
## Median :2014-11-04 Median :35.07
## Mean :2014-11-04 Mean :38.65
## 3rd Qu.:2016-08-03 3rd Qu.:47.89
## Max. :2018-05-03 Max. :61.59
## NA's :935
## allDates Var05
## Min. :2011-05-08 Min. :22.55
## 1st Qu.:2013-02-04 1st Qu.:29.59
## Median :2014-11-04 Median :35.34
## Mean :2014-11-04 Mean :39.01
## 3rd Qu.:2016-08-03 3rd Qu.:48.28
## Max. :2018-05-03 Max. :62.29
## NA's :935
## allDates Var07
## Min. :2011-05-08 Min. :22.50
## 1st Qu.:2013-02-04 1st Qu.:29.60
## Median :2014-11-04 Median :35.36
## Mean :2014-11-04 Mean :39.05
## 3rd Qu.:2016-08-03 3rd Qu.:48.26
## Max. :2018-05-03 Max. :62.14
## NA's :935
1 Group S01
The plots below suggest an increasing trend in all the variables of Group S01 except Var02, which stays roughly level with no trend over the majority of the data points.
2 Working with NAs in the data and creating xts objects
dt_s01_v1_xts <- xts(c(dt_s01_v1$Var01), order.by=as.Date(dt_s01_v1$allDates))%>% na.approx()
dt_s01_v2_xts <- xts(c(dt_s01_v2$Var02), order.by=as.Date(dt_s01_v2$allDates))%>% na.approx()
dt_s01_v3_xts <- xts(c(dt_s01_v3$Var03), order.by=as.Date(dt_s01_v3$allDates))%>% na.approx()
dt_s01_v5_xts <- xts(c(dt_s01_v5$Var05), order.by=as.Date(dt_s01_v5$allDates))%>% na.approx()
dt_s01_v7_xts <- xts(c(dt_s01_v7$Var07), order.by=as.Date(dt_s01_v7$allDates))%>% na.approx()
# View of some different imputation methods and their values.
cbind("Value"=dt_s01_v1$Var01,
"na.interp"=na.interp(dt_s01_v1$Var01),
"na.approx"=na_seadec(dt_s01_v1$Var01),
"na.kalman"=na.kalman(dt_s01_v1$Var01),
"na.interpolation"=na.interpolation(dt_s01_v1$Var01)
) %>%
head(.,20)
## Time Series:
## Start = 1
## End = 20
## Frequency = 1
## Value na.interp na.approx na.kalman na.interpolation
## 1 26.61 26.61000 26.61000 26.61000 26.61000
## 2 26.30 26.30000 26.30000 26.30000 26.30000
## 3 26.03 26.03000 26.03000 26.03000 26.03000
## 4 25.84 25.84000 25.84000 25.84000 25.84000
## 5 26.34 26.34000 26.34000 26.34000 26.34000
## 6 NA 26.39000 26.39000 26.37014 26.39000
## 7 NA 26.44000 26.44000 26.41866 26.44000
## 8 26.49 26.49000 26.49000 26.49000 26.49000
## 9 26.03 26.03000 26.03000 26.03000 26.03000
## 10 25.16 25.16000 25.16000 25.16000 25.16000
## 11 25.00 25.00000 25.00000 25.00000 25.00000
## 12 24.77 24.77000 24.77000 24.77000 24.77000
## 13 NA 24.77250 24.77250 24.77825 24.77250
## 14 NA 24.77500 24.77500 24.77663 24.77500
## 15 NA 24.77750 24.77750 24.77502 24.77750
## 16 24.78 24.78000 24.78000 24.78000 24.78000
## 17 24.61 24.61000 24.61000 24.61000 24.61000
## 18 24.88 24.88000 24.88000 24.88000 24.88000
## 19 24.17 24.17000 24.17000 24.17000 24.17000
## 20 NA 24.05333 24.05333 24.07196 24.05333
We will use na.interp() to fill all the missing values in our dataset. The graph below shows how the imputed values flow with the rest of the data.
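For reference, a one-line sketch of that imputation (the later chunks use zoo::na.approx(); for a non-seasonal daily series na.interp() also reduces to linear interpolation, so the two agree closely here):
# Sketch: fill the gaps in the Group S01 / Var01 series with forecast::na.interp()
dt_s01_v1_xts_interp <- xts(as.numeric(na.interp(dt_s01_v1$Var01)),
                            order.by = as.Date(dt_s01_v1$allDates))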
dt_s01_v1_xts <- xts(c(dt_s01_v1$Var01), order.by=as.Date(dt_s01_v1$allDates))
dt_s01_v2_xts <- xts(c(dt_s01_v2$Var02), order.by=as.Date(dt_s01_v2$allDates))
dt_s01_v3_xts <- xts(c(dt_s01_v3$Var03), order.by=as.Date(dt_s01_v3$allDates))
dt_s01_v5_xts <- xts(c(dt_s01_v5$Var05), order.by=as.Date(dt_s01_v5$allDates))
dt_s01_v7_xts <- xts(c(dt_s01_v7$Var07), order.by=as.Date(dt_s01_v7$allDates))
#
# na.approx(dt_s01_v1_xts) %>% autoplot()
# Plot NA and actual data for Var01 of Group S01
autoplot(na.interp(as.ts(dt_s01_v1_xts)) , series="na.approx") +
autolayer(as.ts(dt_s01_v1_xts),series="Data")+
scale_colour_manual(values=c("blue","red"),
breaks=c("na.approx","Data"))dt_s01_v1_xts <- xts(c(dt_s01_v1$Var01), order.by=as.Date(dt_s01_v1$allDates))%>% na.approx()
dt_s01_v2_xts <- xts(c(dt_s01_v2$Var02), order.by=as.Date(dt_s01_v2$allDates))%>% na.approx()
dt_s01_v3_xts <- xts(c(dt_s01_v3$Var03), order.by=as.Date(dt_s01_v3$allDates))%>% na.approx()
dt_s01_v5_xts <- xts(c(dt_s01_v5$Var05), order.by=as.Date(dt_s01_v5$allDates))%>% na.approx()
dt_s01_v7_xts <- xts(c(dt_s01_v7$Var07), order.by=as.Date(dt_s01_v7$allDates))%>% na.approx()Above plots shows how imputed values and other data points are flowing over time. Lets impute rest of the data.
dt_s01_v1_xts <- xts(c(dt_s01_v1$Var01), order.by=as.Date(dt_s01_v1$allDates)) %>% na.approx()
dt_s01_v2_xts <- xts(c(dt_s01_v2$Var02), order.by=as.Date(dt_s01_v2$allDates))%>% na.approx()
dt_s01_v3_xts <- xts(c(dt_s01_v3$Var03), order.by=as.Date(dt_s01_v3$allDates))%>% na.approx()
dt_s01_v5_xts <- xts(c(dt_s01_v5$Var05), order.by=as.Date(dt_s01_v5$allDates))%>% na.approx()
dt_s01_v7_xts <- xts(c(dt_s01_v7$Var07), order.by=as.Date(dt_s01_v7$allDates))%>% na.approx()
cbind(dt_s01_v1[1:10,], as.data.frame(dt_s01_v1_xts[][1:10]))
# Plot ACF and PACF
par(mfrow=c(1,2)) # set the plotting area into a 1*2 array
acf(dt_s01_v1_xts)
pacf(dt_s01_v1_xts)
Since the ACF decays very slowly towards zero for all the variables, we can say that our data series are non-stationary.
Let's use differencing to make the data stationary. We will use the KPSS test (whose null hypothesis is that the series is stationary) to check this.
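The KPSS and ADF output below was presumably produced by calls along these lines (a sketch; the chunk itself is not echoed), using the urca package that later chunks call explicitly:
library(urca)
# KPSS: null hypothesis is stationarity; ADF: null hypothesis is a unit root (non-stationarity)
dt_s01_v1_xts %>% ur.kpss() %>% summary()
dt_s01_v1_xts %>% ur.df(type = "none") %>% summary()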
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 23.217
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9468 -0.1307 -0.0100 0.1275 4.8576
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 0.0003055 0.0002005 1.523 0.128
## z.diff.lag 0.1140899 0.0205036 5.564 2.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3978 on 2349 degrees of freedom
## Multiple R-squared: 0.01432, Adjusted R-squared: 0.01348
## F-statistic: 17.07 on 2 and 2349 DF, p-value: 4.376e-08
##
##
## Value of test-statistic is: 1.5234
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.58 -1.95 -1.62
So, very clearly, our data is not stationary. Let's apply differencing and see whether the test statistic drops below the critical values.
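Again as a sketch of the not-echoed chunk, the differenced series can be re-tested in the same way (this mirrors the pattern used explicitly for Var02 later in the report):
# Re-run the KPSS test on the first-differenced series
dt_s01_v1_xts %>% diff() %>% ur.kpss() %>% summary()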
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.1004
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9236 -0.1182 0.0013 0.1399 4.8670
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -0.89825 0.02744 -32.734 <2e-16 ***
## z.diff.lag 0.01536 0.02063 0.744 0.457
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.398 on 2348 degrees of freedom
## Multiple R-squared: 0.4425, Adjusted R-squared: 0.442
## F-statistic: 931.9 on 2 and 2348 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -32.7341
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.58 -1.95 -1.62
##
## Augmented Dickey-Fuller Test
##
## data: .
## Dickey-Fuller = -12.741, Lag order = 13, p-value = 0.01
## alternative hypothesis: stationary
cbind("Main Data S01_V1"= dt_s01_v1_xts,
"Diff Var01"= (diff(dt_s01_v1_xts,na.action = na.pass ))) %>% autoplot() Value of test-statistic is: 0.1004, which can be accepted to have stationary data for our series. We will use ndiffs() to find rest of the number of differenes.
par(mfrow=c(1,2))
acf(diff(dt_s01_v1_xts ),na.action = na.pass)
pacf(diff(dt_s01_v1_xts ),na.action = na.pass)
The ACF plot above suggests that, after differencing, Var01 is mainly correlated with the first lag, i.e. a first-order MA model can describe the differenced data. The PACF plot shows many significant values, so an AR model would need too many terms and would become overly complicated. Hence we will use a first-order MA model.
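As a sketch, that reading can be checked by fitting the implied ARIMA(0,1,1) directly and comparing it with the auto.arima() selection further below (which does indeed choose ARIMA(0,1,1) with drift):
library(forecast)
# Fit the ARIMA(0,1,1) suggested by the ACF/PACF reading and inspect its fit
manual_ma1 <- Arima(dt_s01_v1_xts, order = c(0, 1, 1), include.drift = TRUE)
summary(manual_ma1)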
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
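The five "[1] 1" values above are presumably the output of ndiffs() applied to each Group S01 series (a sketch of the not-echoed chunk); one difference is suggested in every case.
ndiffs(dt_s01_v1_xts)
ndiffs(dt_s01_v2_xts)
ndiffs(dt_s01_v3_xts)
ndiffs(dt_s01_v5_xts)
ndiffs(dt_s01_v7_xts)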
# Let's use auto.arima to validate our model and fit it.
aa_fit_dt_s01_v1 <- auto.arima(dt_s01_v1_xts)
aa_fit_dt_s01_v1_log <- auto.arima(log(dt_s01_v1_xts))
summary(aa_fit_dt_s01_v1)
## Series: dt_s01_v1_xts
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.1163 0.0152
## s.e. 0.0205 0.0091
##
## sigma^2 estimated as 0.1581: log likelihood=-1167.44
## AIC=2340.89 AICc=2340.9 BIC=2358.18
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1.447248e-05 0.3974086 0.2373141 -0.009213095 0.6215781 0.9837201
## ACF1
## Training set -0.0002184172
## Series: log(dt_s01_v1_xts)
## ARIMA(1,1,0) with drift
##
## Coefficients:
## ar1 drift
## 0.1117 4e-04
## s.e. 0.0205 2e-04
##
## sigma^2 estimated as 0.0001003: log likelihood=7491.16
## AIC=-14976.31 AICc=-14976.3 BIC=-14959.02
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set 9.481692e-07 0.01001024 0.006208169 -0.0004976877 0.1734165
## MASE ACF1
## Training set 0.9833871 -8.713263e-06
## ma1 drift
## 0.11626944 0.01517711
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1) with drift
## Q* = 10.332, df = 8, p-value = 0.2425
##
## Model df: 2. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0) with drift
## Q* = 8.7143, df = 8, p-value = 0.367
##
## Model df: 2. Total lags used: 10
##
## Box-Ljung test
##
## data: aa_fit_dt_s01_v1$residuals
## X-squared = 49.988, df = 30, p-value = 0.01244
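For reference, the residual diagnostics above were presumably produced with calls like the following (a sketch; lag = 30 matches the degrees of freedom shown in the Box-Ljung output):
checkresiduals(aa_fit_dt_s01_v1)       # Ljung-Box for the ARIMA(0,1,1) with drift model
checkresiduals(aa_fit_dt_s01_v1_log)   # Ljung-Box for the ARIMA(1,1,0) model on the log scale
Box.test(aa_fit_dt_s01_v1$residuals, lag = 30, type = "Ljung-Box")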
# The ACF plot of the residuals looks like white noise, as no prominent patterns can be seen.
# The Ljung-Box test also returned a high p-value, indicating that we cannot reject the null hypothesis that the residuals are white noise.
## Series: log(dt_s01_v1_xts)
## ARIMA(1,1,0) with drift
##
## Coefficients:
## ar1 drift
## 0.1117 4e-04
## s.e. 0.0205 2e-04
##
## sigma^2 estimated as 0.0001003: log likelihood=7491.16
## AIC=-14976.31 AICc=-14976.3 BIC=-14959.02
Fore_dt_s01_v1<- forecast(aa_fit_dt_s01_v1,h=140)
Fore_dt_s01_v1_log<- forecast(aa_fit_dt_s01_v1_log,h=140)
# xlsx::write.xlsx(exp(Fore_dt_s01_v1_log$mean), file = "Predictive Analytics Project 1log.xlsx", col.name = T, row.names = T, append = T)
autoplot(aa_fit_dt_s01_v1)
# The inverse characteristic root of the ARIMA(0,1,1) model fitted to the adjusted data (red dot) lies well within the unit circle, so the model is invertible; note that being inside the circle does not by itself guarantee a good fit.
##
## Forecast method: ARIMA(0,1,1) with drift
##
## Model Information:
## Series: dt_s01_v1_xts
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.1163 0.0152
## s.e. 0.0205 0.0091
##
## sigma^2 estimated as 0.1581: log likelihood=-1167.44
## AIC=2340.89 AICc=2340.9 BIC=2358.18
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1.447248e-05 0.3974086 0.2373141 -0.009213095 0.6215781 0.9837201
## ACF1
## Training set -0.0002184172
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2354 62.32814 61.81851 62.83776 61.54873 63.10754
## 2355 62.34331 61.57955 63.10708 61.17523 63.51139
## 2356 62.35849 61.40615 63.31084 60.90200 63.81498
## 2357 62.37367 61.26435 63.48299 60.67711 64.07022
## 2358 62.38885 61.14217 63.63552 60.48222 64.29548
## 2359 62.40402 61.03368 63.77436 60.30827 64.49978
## 2360 62.41920 60.93547 63.90293 60.15003 64.68837
## 2361 62.43438 60.84533 64.02343 60.00414 64.86462
## 2362 62.44955 60.76175 64.13736 59.86827 65.03083
## 2363 62.46473 60.68363 64.24583 59.74077 65.18869
## 2364 62.47991 60.61016 64.34965 59.62038 65.33943
## 2365 62.49509 60.54071 64.44946 59.50613 65.48404
## 2366 62.51026 60.47478 64.54574 59.39726 65.62326
## 2367 62.52544 60.41196 64.63892 59.29315 65.75773
## 2368 62.54062 60.35191 64.72932 59.19328 65.88795
## 2369 62.55579 60.29437 64.81722 59.09724 66.01435
## 2370 62.57097 60.23909 64.90285 59.00466 66.13728
## 2371 62.58615 60.18588 64.98642 58.91525 66.25705
## 2372 62.60132 60.13456 65.06809 58.82874 66.37391
## 2373 62.61650 60.08499 65.14801 58.74489 66.48811
## 2374 62.63168 60.03704 65.22632 58.66352 66.59984
## 2375 62.64686 59.99058 65.30313 58.58444 66.70928
## 2376 62.66203 59.94553 65.37854 58.50749 66.81657
## 2377 62.67721 59.90178 65.45264 58.43255 66.92187
## 2378 62.69239 59.85925 65.52552 58.35948 67.02529
## 2379 62.70756 59.81788 65.59725 58.28817 67.12696
## 2380 62.72274 59.77759 65.66789 58.21853 67.22696
## 2381 62.73792 59.73833 65.73751 58.15045 67.32539
## 2382 62.75310 59.70004 65.80615 58.08385 67.42234
## 2383 62.76827 59.66267 65.87388 58.01866 67.51788
## 2384 62.78345 59.62617 65.94073 57.95481 67.61209
## 2385 62.79863 59.59051 66.00674 57.89224 67.70502
## 2386 62.81380 59.55564 66.07197 57.83087 67.79674
## 2387 62.82898 59.52153 66.13644 57.77067 67.88730
## 2388 62.84416 59.48814 66.20018 57.71157 67.97675
## 2389 62.85934 59.45544 66.26323 57.65353 68.06515
## 2390 62.87451 59.42341 66.32562 57.59650 68.15252
## 2391 62.88969 59.39201 66.38737 57.54045 68.23893
## 2392 62.90487 59.36123 66.44851 57.48534 68.32439
## 2393 62.92004 59.33103 66.50905 57.43113 68.40896
## 2394 62.93522 59.30141 66.56904 57.37778 68.49266
## 2395 62.95040 59.27232 66.62847 57.32527 68.57553
## 2396 62.96558 59.24377 66.68738 57.27356 68.65759
## 2397 62.98075 59.21572 66.74579 57.22263 68.73888
## 2398 62.99593 59.18816 66.80370 57.17245 68.81941
## 2399 63.01111 59.16108 66.86114 57.12299 68.89922
## 2400 63.02628 59.13445 66.91812 57.07424 68.97833
## 2401 63.04146 59.10827 66.97465 57.02617 69.05675
## 2402 63.05664 59.08252 67.03075 56.97875 69.13452
## 2403 63.07182 59.05719 67.08644 56.93197 69.21166
## 2404 63.08699 59.03226 67.14172 56.88582 69.28817
## 2405 63.10217 59.00773 67.19661 56.84026 69.36408
## 2406 63.11735 58.98357 67.25112 56.79528 69.43941
## 2407 63.13252 58.95979 67.30526 56.75088 69.51417
## 2408 63.14770 58.93637 67.35904 56.70702 69.58838
## 2409 63.16288 58.91330 67.41246 56.66370 69.66205
## 2410 63.17806 58.89056 67.46555 56.62090 69.73521
## 2411 63.19323 58.86817 67.51830 56.57861 69.80785
## 2412 63.20841 58.84609 67.57073 56.53682 69.88000
## 2413 63.22359 58.82433 67.62284 56.49551 69.95167
## 2414 63.23876 58.80288 67.67465 56.45466 70.02286
## 2415 63.25394 58.78173 67.72615 56.41428 70.09360
## 2416 63.26912 58.76087 67.77737 56.37435 70.16389
## 2417 63.28430 58.74030 67.82830 56.33485 70.23374
## 2418 63.29947 58.72000 67.87894 56.29577 70.30317
## 2419 63.31465 58.69998 67.92932 56.25712 70.37218
## 2420 63.32983 58.68022 67.97943 56.21887 70.44078
## 2421 63.34500 58.66073 68.02928 56.18102 70.50898
## 2422 63.36018 58.64149 68.07887 56.14356 70.57680
## 2423 63.37536 58.62250 68.12822 56.10649 70.64423
## 2424 63.39053 58.60375 68.17732 56.06978 70.71129
## 2425 63.40571 58.58524 68.22618 56.03344 70.77798
## 2426 63.42089 58.56697 68.27481 55.99746 70.84432
## 2427 63.43607 58.54892 68.32321 55.96183 70.91031
## 2428 63.45124 58.53110 68.37138 55.92654 70.97595
## 2429 63.46642 58.51350 68.41934 55.89158 71.04126
## 2430 63.48160 58.49612 68.46708 55.85696 71.10623
## 2431 63.49677 58.47894 68.51461 55.82266 71.17089
## 2432 63.51195 58.46197 68.56193 55.78868 71.23523
## 2433 63.52713 58.44521 68.60905 55.75501 71.29925
## 2434 63.54231 58.42865 68.65597 55.72164 71.36297
## 2435 63.55748 58.41228 68.70269 55.68857 71.42640
## 2436 63.57266 58.39610 68.74922 55.65580 71.48952
## 2437 63.58784 58.38011 68.79556 55.62331 71.55236
## 2438 63.60301 58.36431 68.84172 55.59111 71.61492
## 2439 63.61819 58.34869 68.88769 55.55919 71.67720
## 2440 63.63337 58.33325 68.93349 55.52754 71.73920
## 2441 63.64855 58.31799 68.97911 55.49616 71.80093
## 2442 63.66372 58.30289 69.02455 55.46504 71.86241
## 2443 63.67890 58.28797 69.06983 55.43418 71.92362
## 2444 63.69408 58.27322 69.11494 55.40358 71.98457
## 2445 63.70925 58.25862 69.15988 55.37324 72.04527
## 2446 63.72443 58.24420 69.20467 55.34313 72.10573
## 2447 63.73961 58.22993 69.24929 55.31327 72.16594
## 2448 63.75479 58.21581 69.29376 55.28366 72.22592
## 2449 63.76996 58.20185 69.33807 55.25427 72.28565
## 2450 63.78514 58.18804 69.38224 55.22512 72.34516
## 2451 63.80032 58.17439 69.42625 55.19620 72.40444
## 2452 63.81549 58.16087 69.47011 55.16750 72.46349
## 2453 63.83067 58.14751 69.51383 55.13902 72.52232
## 2454 63.84585 58.13428 69.55741 55.11076 72.58093
## 2455 63.86103 58.12120 69.60085 55.08272 72.63933
## 2456 63.87620 58.10826 69.64415 55.05489 72.69752
## 2457 63.89138 58.09545 69.68731 55.02727 72.75549
## 2458 63.90656 58.08277 69.73034 54.99985 72.81327
## 2459 63.92173 58.07023 69.77324 54.97263 72.87083
## 2460 63.93691 58.05782 69.81600 54.94562 72.92820
## 2461 63.95209 58.04554 69.85864 54.91880 72.98538
## 2462 63.96727 58.03338 69.90115 54.89218 73.04235
## 2463 63.98244 58.02135 69.94353 54.86574 73.09914
## 2464 63.99762 58.00945 69.98579 54.83950 73.15574
## 2465 64.01280 57.99766 70.02793 54.81345 73.21215
## 2466 64.02797 57.98600 70.06995 54.78757 73.26837
## 2467 64.04315 57.97446 70.11185 54.76188 73.32442
## 2468 64.05833 57.96303 70.15363 54.73637 73.38029
## 2469 64.07351 57.95172 70.19529 54.71104 73.43597
## 2470 64.08868 57.94052 70.23685 54.68588 73.49149
## 2471 64.10386 57.92943 70.27829 54.66089 73.54683
## 2472 64.11904 57.91846 70.31962 54.63607 73.60200
## 2473 64.13421 57.90759 70.36083 54.61142 73.65701
## 2474 64.14939 57.89684 70.40194 54.58694 73.71185
## 2475 64.16457 57.88619 70.44295 54.56262 73.76652
## 2476 64.17974 57.87565 70.48384 54.53846 73.82103
## 2477 64.19492 57.86521 70.52464 54.51446 73.87539
## 2478 64.21010 57.85487 70.56533 54.49062 73.92958
## 2479 64.22528 57.84464 70.60591 54.46693 73.98362
## 2480 64.24045 57.83451 70.64640 54.44340 74.03750
## 2481 64.25563 57.82447 70.68679 54.42002 74.09124
## 2482 64.27081 57.81454 70.72708 54.39680 74.14482
## 2483 64.28598 57.80470 70.76727 54.37372 74.19825
## 2484 64.30116 57.79496 70.80736 54.35079 74.25154
## 2485 64.31634 57.78532 70.84736 54.32800 74.30468
## 2486 64.33152 57.77576 70.88727 54.30536 74.35767
## 2487 64.34669 57.76630 70.92708 54.28286 74.41053
## 2488 64.36187 57.75694 70.96680 54.26050 74.46324
## 2489 64.37705 57.74766 71.00643 54.23828 74.51582
## 2490 64.39222 57.73848 71.04597 54.21619 74.56826
## 2491 64.40740 57.72938 71.08543 54.19425 74.62056
## 2492 64.42258 57.72037 71.12479 54.17243 74.67273
## 2493 64.43776 57.71145 71.16407 54.15075 74.72476
## [1] -14976.31
## [1] 0.006215781
## [1] 0.006205727
2.1 Extra: Var01
lambda1 <- BoxCox.lambda(diff(dt_s01_v1_xts))
dt_s01_v1_xts_t <- BoxCox((dt_s01_v1_xts),lambda1)
auto.arima(dt_s01_v1_xts_t,stepwise = F, approximation = F, d = 0)
## Series: dt_s01_v1_xts_t
## ARIMA(0,0,5) with non-zero mean
##
## Coefficients:
## ma1 ma2 ma3 ma4 ma5 mean
## 2.3479 3.21 2.9510 1.7925 0.5865 38.4549
## s.e. 0.0204 0.04 0.0401 0.0269 0.0135 0.2961
##
## sigma^2 estimated as 1.466: log likelihood=-3789.92
## AIC=7593.84 AICc=7593.89 BIC=7634.18
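As an aside (an assumption, not a step actually taken in this report), the Box-Cox transform can also be passed straight to auto.arima(), which then back-transforms the forecasts onto the original scale automatically:
# Sketch: let auto.arima() apply the Box-Cox transform internally
fit_bc <- auto.arima(dt_s01_v1_xts, lambda = lambda1)
forecast(fit_bc, h = 140)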
## Series: ts(dt_s01_v1_xts, frequency = 30)
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.1163 0.0152
## s.e. 0.0205 0.0091
##
## sigma^2 estimated as 0.1581: log likelihood=-1167.44
## AIC=2340.89 AICc=2340.9 BIC=2358.18
## Series: ts(dt_s01_v1_xts)
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.1163 0.0152
## s.e. 0.0205 0.0091
##
## sigma^2 estimated as 0.1581: log likelihood=-1167.44
## AIC=2340.89 AICc=2340.9 BIC=2358.18
## Series: log(dt_s01_v1_xts)
## ARIMA(1,1,0) with drift
##
## Coefficients:
## ar1 drift
## 0.1117 4e-04
## s.e. 0.0205 2e-04
##
## sigma^2 estimated as 0.0001003: log likelihood=7491.16
## AIC=-14976.31 AICc=-14976.3 BIC=-14959.02
autoplot(ts(dt_s01_v1_xts,frequency=365), series="Data") +
autolayer(trendcycle(decompose(ts(dt_s01_v1_xts,frequency=365))), series="Trend") +
autolayer(seasadj(decompose(ts(dt_s01_v1_xts,frequency=365))), series="Seasonally Adjusted") +
xlab("Year") + ylab("Var01") +
ggtitle("Group S01 Var01") +
scale_colour_manual(values=c("orange","blue","red"),
breaks=c("Data","Seasonally Adjusted","Trend"))3 Varaible 02
# Plot NA and actual data for Var02 of Group S01
autoplot(as.ts(dt_s01_v2_xts) , series="V02") +
autolayer(log(as.ts(dt_s01_v2_xts)),series="Data")+
scale_colour_manual(values=c("blue","red"),
breaks=c("V02","Data"))# PACF plot suggest too much variations which are not close to zero, Let try doing some log tranfartion and check .Non zero autocorreations in the data from lag 1 to 35.
# The ACF plot shows strong correlation in the first few lags, as expected for a random-walk-like series, and then slowly decays to moderate autocorrelation.
#https://www.statisticshowto.com/kpss-test/
# The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test figures out if a time series is stationary around a mean or linear trend, or is non-stationary due to a unit root. A stationary time series is one where statistical properties - like the mean and variance - are constant over time.
# The null hypothesis for the test is that the data is stationary.
# The alternate hypothesis for the test is that the data is not stationary.
dt_s01_v2_xts %>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 13.4497
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# The null hypothesis for the test is that the data is stationary.
# Here you should reject the null hypothesis of stationarity as the value of the test statistic is more extreme than the 10%, 5% and 1% critical values (13.4497 > 0.347 , 13.4497 > 0.463, 13.4497 > 0.739).
# We reject the null and accept the alternative: the series is not stationary.
dt_s01_v2_xts %>% diff() %>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.003
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# Since the value of the test statistic (0.003) is less than the 10%, 5% and 1% critical values (0.003 < 0.347, 0.003 < 0.463, 0.003 < 0.739), we can say that the differenced data is now stationary.
dt_s01_v2_xts %>% log() %>%diff() %>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.0033
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15966892 -864079 107490 1125529 40490634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -0.035888 0.006013 -5.969 2.75e-09 ***
## z.diff.lag -0.189572 0.020255 -9.359 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2879000 on 2349 degrees of freedom
## Multiple R-squared: 0.05734, Adjusted R-squared: 0.05654
## F-statistic: 71.44 on 2 and 2349 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -5.9688
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.58 -1.95 -1.62
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.19728 -0.13975 -0.01605 0.12066 1.75762
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -1.28918 0.03112 -41.423 < 2e-16 ***
## z.diff.lag 0.11780 0.02049 5.749 1.02e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2656 on 2348 degrees of freedom
## Multiple R-squared: 0.5826, Adjusted R-squared: 0.5822
## F-statistic: 1639 on 2 and 2348 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -41.4231
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.58 -1.95 -1.62
# Plot
cbind("Main Data S01_V1"= log(dt_s01_v2_xts),
"Seasonlly Diff"= (diff(log(dt_s01_v2_xts),na.action = na.pass ))) %>% autoplot()autoplot((as.ts(log(dt_s01_v2_xts))) , series="Log Data") +
autolayer(diff(as.ts(log(dt_s01_v2_xts))),series="Diff Log")+
scale_colour_manual(values=c("blue","red"),
breaks=c("Log Data","Diff Log"))# ACF plot suggest that after 3 lag data is moslty neagativily auto correalted to zero mean. This suggest we can apply
# Negative ACF means that a positive value return for one observation increases the probability of having a negative value return for another observation (depending on the lag) and vice-versa. Or you can say (for a stationary time series) if one observation is above the average the other one (depending on the lag) is below average and vice-versa. Have a look at "Interpreting a negative autocorrelation".
# A negative autocorrelation changes the direction of the influence. A negative autocorrelation implies that if a particular value is above average the next value (or for that matter the previous value) is more likely to be below average. If a particular value is below average, the next value is likely to be above average.
#http://www.pmean.com/09/NegativeAutocorrelation.html
# We will use ndiffs() to find the number of differences needed.
ndiffs(dt_s01_v2_xts)
## [1] 1
## [1] 1
# Let's use auto.arima to validate our models and fit them.
aa_fit_dt_s01_v2 <- auto.arima(dt_s01_v2_xts,seasonal=TRUE)
# The log-scale model used below; defining it here is an assumption, since the original chunk that fit it is not echoed.
aa_fit_dt_s01_v2_log <- auto.arima(log(dt_s01_v2_xts))
summary(aa_fit_dt_s01_v2)
## Series: dt_s01_v2_xts
## ARIMA(1,1,2)
##
## Coefficients:
## ar1 ma1 ma2
## 0.6594 -1.0199 0.0448
## s.e. 0.0300 0.0366 0.0329
##
## sigma^2 estimated as 7.352e+12: log likelihood=-38176.85
## AIC=76361.69 AICc=76361.71 BIC=76384.75
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -50897.15 2709242 1658282 -7.900959 20.20669 0.9527611
## ACF1
## Training set -0.0002251844
## Series: log(dt_s01_v2_xts)
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.7196 -0.9763
## s.e. 0.0188 0.0076
##
## sigma^2 estimated as 0.0644: log likelihood=-111.65
## AIC=229.29 AICc=229.3 BIC=246.58
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.004840398 0.2536064 0.1794515 -0.0548662 1.130844 0.9432066
## ACF1
## Training set -0.01213411
# The AIC score of the log-transformed model is better, so we will use it.
# ACF plot of residuals
checkresiduals(aa_fit_dt_s01_v2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)
## Q* = 12.324, df = 7, p-value = 0.0904
##
## Model df: 3. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)
## Q* = 41.588, df = 32, p-value = 0.1195
##
## Model df: 3. Total lags used: 35
# The residuals are well distributed and mostly close to zero.
# The p-value is greater than 5%, so we cannot reject the null hypothesis that the residual autocorrelations up to the tested lags are zero (white noise).
# logged data
checkresiduals(aa_fit_dt_s01_v2_log)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)
## Q* = 10.953, df = 8, p-value = 0.2044
##
## Model df: 2. Total lags used: 10
fc1<-forecast(aa_fit_dt_s01_v2,h=140)
fc<-forecast(aa_fit_dt_s01_v2_log,h=140)
summary(aa_fit_dt_s01_v2)
## Series: dt_s01_v2_xts
## ARIMA(1,1,2)
##
## Coefficients:
## ar1 ma1 ma2
## 0.6594 -1.0199 0.0448
## s.e. 0.0300 0.0366 0.0329
##
## sigma^2 estimated as 7.352e+12: log likelihood=-38176.85
## AIC=76361.69 AICc=76361.71 BIC=76384.75
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -50897.15 2709242 1658282 -7.900959 20.20669 0.9527611
## ACF1
## Training set -0.0002251844
## Series: log(dt_s01_v2_xts)
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.7196 -0.9763
## s.e. 0.0188 0.0076
##
## sigma^2 estimated as 0.0644: log likelihood=-111.65
## AIC=229.29 AICc=229.3 BIC=246.58
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.004840398 0.2536064 0.1794515 -0.0548662 1.130844 0.9432066
## ACF1
## Training set -0.01213411
fc$mean<-exp(fc$mean)
fc$upper<-exp(fc$upper)
fc$lower<-exp(fc$lower)
fc$x<-exp(fc$x)
fc1 %>% autoplot()
## [1] 0.2020669
## [1] 0.1812568
forVa02 <- fc$mean   # fc$mean was already back-transformed with exp() above, so no second exp() is applied
# xlsx::write.xlsx(forVa02, file = "Predictive Analytics Project 1V2log.xlsx", col.name = T, row.names = T, append = T)
# https://nwfsc-timeseries.github.io/atsa-labs/sec-boxjenkins-kpss.html
# https://www.machinelearningplus.com/time-series/kpss-test-for-stationarity/
# The test below suggests that the p-value is less than 0.05, hence we reject the null and say that the data is not stationary around the trend line.
# But we thought the KPSS should not have rejected the null, because of the 'stationarity around a deterministic trend'. Yes?
#
# Well, that's probably because we used the default option of the KPSS test.
#
# By default, it tests for stationarity around a 'mean' only.
# tseries::kpss.test(diff(dt_s01_v1_xts), null = "Trend")
tseries::kpss.test(dt_s01_v1_xts)
##
## KPSS Test for Level Stationarity
##
## data: dt_s01_v1_xts
## KPSS Level = 23.217, Truncation lag parameter = 8, p-value = 0.01
# KPSS test on data with a trend. The default is a null hypothesis with no trend. We will change this to null="Trend".
tseries::kpss.test(dt_s01_v1_xts, null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: dt_s01_v1_xts
## KPSS Trend = 2.8939, Truncation lag parameter = 8, p-value = 0.01
##
## KPSS Test for Level Stationarity
##
## data: dt_s01_v1_xts
## KPSS Level = 23.217, Truncation lag parameter = 8, p-value = 0.01
# The p-value is less than 0.05. The null hypothesis of stationarity around a level is rejected. This is white noise around a trend so it is definitely a stationary process but has a trend.
tseries::kpss.test(diff(dt_s01_v1_xts), null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: diff(dt_s01_v1_xts)
## KPSS Trend = 0.033528, Truncation lag parameter = 8, p-value = 0.1
4 For Var01 (extra)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9468 -0.1307 -0.0100 0.1275 4.8576
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 0.0003055 0.0002005 1.523 0.128
## z.diff.lag 0.1140899 0.0205036 5.564 2.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3978 on 2349 degrees of freedom
## Multiple R-squared: 0.01432, Adjusted R-squared: 0.01348
## F-statistic: 17.07 on 2 and 2349 DF, p-value: 4.376e-08
##
##
## Value of test-statistic is: 1.5234
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.58 -1.95 -1.62
#https://stats.stackexchange.com/questions/24072/interpreting-rs-ur-df-dickey-fuller-unit-root-test-results
# Given that the test statistic 1.5234 falls within all 3 regions (1%, 5%, 10%) where we fail to reject the null, we should presume the data is a random walk, i.e. that a unit root is present. Here tau1 refers to the gamma = 0 hypothesis; "z.lag.1" is the gamma term, the coefficient on the lag term y(t-1), with p = 0.128, so gamma is not statistically significant in this model.
dt_s01_v1_xts %>% ur.df(type = "drift") %>% summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9387 -0.1323 -0.0127 0.1245 4.8627
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0211874 0.0304403 0.696 0.486
## z.lag.1 -0.0001929 0.0007435 -0.259 0.795
## z.diff.lag 0.1144296 0.0205117 5.579 2.7e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3978 on 2348 degrees of freedom
## Multiple R-squared: 0.01308, Adjusted R-squared: 0.01224
## F-statistic: 15.56 on 2 and 2348 DF, p-value: 1.929e-07
##
##
## Value of test-statistic is: -0.2594 1.4023
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.43 -2.86 -2.57
## phi1 6.43 4.59 3.78
# (where a0 is "a sub-zero" and refers to the constant, or drift term) Here is where the output interpretation gets trickier. "tau2" is still the gamma = 0 null hypothesis. In this case, where the first test statistic (-0.2594) is within the region of failing to reject the null, we should again presume a unit root, i.e. gamma = 0 for z.lag.1.
# The phi1 term refers to the second hypothesis, which is a combined null hypothesis of a0 = gamma = 0. This means that BOTH values are tested to be 0 at the same time. If p < 0.05, we reject the null and presume that AT LEAST one of these is false, i.e. one or both of a0 or gamma are not 0. Failing to reject this null implies that BOTH a0 AND gamma equal 0, implying 1) that gamma = 0, so a unit root is present, AND 2) that a0 = 0, so there is no drift term. Here we can say the statistic is significant, hence drift is present and a0 is not zero.
dt_s01_v1_xts %>% diff() %>% .[!is.na(.)] %>% ur.df(type = "drift") %>% summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9371 -0.1321 -0.0127 0.1260 4.8534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.013915 0.008217 1.693 0.0905 .
## z.lag.1 -0.900617 0.027465 -32.791 <2e-16 ***
## z.diff.lag 0.016533 0.020634 0.801 0.4231
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3978 on 2347 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4427
## F-statistic: 934 on 2 and 2347 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -32.7909 537.6205
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.43 -2.86 -2.57
## phi1 6.43 4.59 3.78
# Here we can say that no unit root is present, i.e. gamma and the constant are not zero after differencing.
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15966892 -864079 107490 1125529 40490634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -0.035888 0.006013 -5.969 2.75e-09 ***
## z.diff.lag -0.189572 0.020255 -9.359 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2879000 on 2349 degrees of freedom
## Multiple R-squared: 0.05734, Adjusted R-squared: 0.05654
## F-statistic: 71.44 on 2 and 2349 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -5.9688
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.58 -1.95 -1.62
## Series: dt_s01_v2_xts
## ARIMA(1,1,2)
##
## Coefficients:
## ar1 ma1 ma2
## 0.6594 -1.0199 0.0448
## s.e. 0.0300 0.0366 0.0329
##
## sigma^2 estimated as 7.352e+12: log likelihood=-38176.85
## AIC=76361.69 AICc=76361.71 BIC=76384.75
## Series: log(dt_s01_v2_xts)
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.7196 -0.9763
## s.e. 0.0188 0.0076
##
## sigma^2 estimated as 0.0644: log likelihood=-111.65
## AIC=229.29 AICc=229.3 BIC=246.58
autoplot(ts(dt_s01_v2_xts,frequency=365), series="Data") +
autolayer(trendcycle(decompose(ts(dt_s01_v2_xts,frequency=365))), series="Trend") +
autolayer(seasadj(decompose(ts(dt_s01_v2_xts,frequency=365))), series="Seasonally Adjusted") +
xlab("Year") + ylab("Var02") +
ggtitle("Group S01 Var02") +
scale_colour_manual(values=c("orange","blue","red"),
breaks=c("Data","Seasonally Adjusted","Trend"))