P1

Requirement #1 You will turn in a written report. Write this report as if it will be routed around an office to personnel of very different backgrounds. You need to reach readers ranging from people with no data science background to full-fledged data scientists, so explain what you did, why, and how, in both layman's and technical terms. Do not write this with only me in mind. Visuals and output are expected, but it is not necessary to include every bit of analysis. In fact, a terse report with simple terminology is preferred to throwing everything into a long, ad nauseam report. Storytelling is really taking off in data science, so please flex your muscles here. The report is part 1 of the 2 requirements.

NOTE: We have covered a lot of material. I don’t want you to try every method to create your forecast. But you should perform fundamentals, visualize your data, and perform exploratory data analysis as appropriate.

Requirement #2 Your second requirement is to produce forecasts. This workbook will contain at least 6 sheets where I will calculate your error rates. There will be one sheet (tab) for each group: S01, S02, S03, S04, S05, S06. You should order each sheet by the variable SeriesInd (low to high). Your source data is sorted this way, except that all 6 groups are present in one sheet, which you must break out into 6 tabs. You will submit the data I sent AND the forward forecast for 140 periods. I want you to forecast the following

# Packages used in this analysis
library(readxl); library(dplyr); library(ggplot2)
library(xts); library(forecast); library(imputeTS)

full_data <- readxl::read_excel(path='Data Set for Class.xls')

head(full_data)
# For simplicity, let's convert SeriesInd to dates by setting the date origin to 1900-01-01.


print(paste("Start Of Data:",as.Date(min(full_data$SeriesInd) , origin = "1900-01-01")))
## [1] "Start Of Data: 2011-05-08"
print(paste("End Of Data:",as.Date(max(full_data$SeriesInd) , origin = "1900-01-01")))
## [1] "End Of Data: 2018-05-03"
print(paste("Forecast from :",as.Date(max(full_data$SeriesInd)+1 , origin = "1900-01-01")," TO ",as.Date(max(full_data$SeriesInd)+140 , origin = "1900-01-01")))
## [1] "Forecast from : 2018-05-04  TO  2018-09-20"
full_data$SeriesInd <- as.Date(full_data$SeriesInd, origin = "1900-01-01")

head(full_data)
# Get the full date range for the data 

allDates <- seq.Date(
  min(full_data$SeriesInd),
  max(full_data$SeriesInd),
  "day")

# Create NA rows for dates that are missing from the data (left join onto the full date range).

full_data_na <- left_join(as.data.frame(allDates),full_data,by=c("allDates"= "SeriesInd"))
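A related one-step alternative worth noting (a sketch only, assuming the tidyr package is available; full_data_complete is just an illustrative name): tidyr::complete() expands every group to the full daily range, producing NA rows much like the per-group joins used further below.

# Sketch: expand each group to every date in the range; absent dates become NA rows.
full_data_complete <- tidyr::complete(full_data, group, SeriesInd = allDates)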

# Create a subset for each group and variable, padded to the full date range.
dt_s01_v1 = subset(full_data[,c(1,2,3)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s01_v2 = subset(full_data[,c(1,2,4)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s01_v3 = subset(full_data[,c(1,2,5)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s01_v5 = subset(full_data[,c(1,2,6)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s01_v7 = subset(full_data[,c(1,2,7)], group == "S01")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))

dt_s02_v1 = subset(full_data[,c(1,2,3)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s02_v2= subset(full_data[,c(1,2,4)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s02_v3= subset(full_data[,c(1,2,5)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s02_v5= subset(full_data[,c(1,2,6)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s02_v7= subset(full_data[,c(1,2,7)], group == "S02")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))

dt_s03_v1 = subset(full_data[,c(1,2,3)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s03_v2 = subset(full_data[,c(1,2,4)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s03_v3 = subset(full_data[,c(1,2,5)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s03_v5 = subset(full_data[,c(1,2,6)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s03_v7 = subset(full_data[,c(1,2,7)], group == "S03")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))

dt_s04_v1 = subset(full_data[,c(1,2,3)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s04_v2= subset(full_data[,c(1,2,4)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s04_v3= subset(full_data[,c(1,2,5)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s04_v5= subset(full_data[,c(1,2,6)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s04_v7= subset(full_data[,c(1,2,7)], group == "S04")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))

dt_s05_v1 = subset(full_data[,c(1,2,3)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s05_v2 = subset(full_data[,c(1,2,4)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s05_v3 = subset(full_data[,c(1,2,5)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s05_v5 = subset(full_data[,c(1,2,6)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s05_v7 = subset(full_data[,c(1,2,7)], group == "S05")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))

dt_s06_v1 = subset(full_data[,c(1,2,3)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s06_v2 = subset(full_data[,c(1,2,4)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s06_v3 = subset(full_data[,c(1,2,5)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s06_v5 = subset(full_data[,c(1,2,6)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
dt_s06_v7 = subset(full_data[,c(1,2,7)], group == "S06")%>% .[,c(1,3)]%>% left_join(as.data.frame(allDates),.,by=c("allDates"= "SeriesInd"))
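The thirty nearly identical statements above could be collapsed into a small helper, sketched below (make_series is a hypothetical name, not part of the original code); it selects one group and one variable and left-joins onto the full date range exactly as the explicit calls do.

# Helper sketch: one group, one variable, padded to the full daily date range.
make_series <- function(grp, var) {
  full_data %>%
    dplyr::filter(group == grp) %>%
    dplyr::select(SeriesInd, dplyr::all_of(var)) %>%
    dplyr::left_join(as.data.frame(allDates), ., by = c("allDates" = "SeriesInd"))
}
# Example: make_series("S01", "Var01") is equivalent to dt_s01_v1 above.
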
# Plot Var01 for all groups except S06.
full_data_na %>% dplyr::filter(group != "S06") %>%
ggplot(aes(x=allDates, y= Var01,fill=group)) +  
  geom_line(aes(colour=group)) +
  ggtitle("Var01 : All groups except S06") +  
  xlab("Year") + 
  ylab("Value")

# Plot the full data by group to see how it looks.
full_data_na %>%
ggplot(aes(x=allDates, y= Var01,fill=group)) +  
  geom_line(aes(colour=group)) +
  ggtitle("Var01 : For all the Groups in series") +  
  xlab("Year") + 
  ylab("Value")

full_data_na %>% ggplot(aes(x=allDates, fill=group,colour=group)) + 
  geom_line(aes(y= Var02)) +
  ggtitle("Var02 : For all the Groups in series") +  
  xlab("Year") + 
  ylab("Value")

full_data_na %>% ggplot(aes(x=allDates, fill=group,colour=group)) + 
  geom_line(aes(y= Var03)) +
  ggtitle("Var03 : For all the Groups in series") +  
  xlab("Year") + 
  ylab("Value")

full_data_na %>% ggplot(aes(x=allDates, fill=group,colour=group)) + 
  geom_line(aes(y= Var05)) +
  ggtitle("Var5 : For all the Groups in series") +  
  xlab("Year") + 
  ylab("Value")

full_data_na %>% ggplot(aes(x=allDates, fill=group,colour=group)) + 
  geom_line(aes(y= Var07)) +
  ggtitle("Var07 : For all the Groups in series") +  
  xlab("Year") + 
  ylab("Value")

subset(full_data_na[,], group == "S01") %>% ggplot(aes(x=allDates)) + 
  geom_line(aes(y= Var02,color="2")) +
  geom_line(aes(y= Var01,color="1")) +
  geom_line(aes(y= Var03,color="3")) +
  geom_line(aes(y= Var05,color="4")) +
  geom_line(aes(y= Var07,color="5")) +
  ggtitle("Group S01: All Variable") +  
  xlab("Year") + 
  ylab("Value")

subset(full_data_na[,], group == "S01") %>% ggplot(aes(x=allDates)) + 
  geom_line(aes(y= Var01,color="1")) +
  geom_line(aes(y= Var03,color="3")) +
  geom_line(aes(y= Var05,color="4")) +
  geom_line(aes(y= Var07,color="5")) +
  ggtitle("Group S01: All Variable -V02") +  
  xlab("Year") + 
  ylab("Value")

full_data_na %>% ggplot(aes(x=group, y= Var01,fill=group)) + geom_boxplot(position = "dodge") + theme(axis.text.x = element_text(angle = 30, hjust = 1)) + labs(title = "Boxplot of Series by Group ")+ ylim(0,200) + xlab( "Group")

# Quick check: line plot of Var01 for a single group (S06).
dt_s06_v1 %>% ggplot(aes(x=allDates, y= Var01))+  geom_line()

We can clearly see that there are outliers in series S06 and S02; we will validate this later using which.max(). We also noted roughly 1,631 missing values, which were introduced as NA in the data preparation above.
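As a minimal sketch of that which.max() check (illustrative only, using dplyr), the row holding the largest Var01 value within a group can be pulled out directly; the same idea applies to S02.

# Illustrative check with which.max(): row with the largest Var01 value in group S06.
full_data %>%
  dplyr::filter(group == "S06") %>%
  dplyr::slice(which.max(Var01))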

full_data_na$group <-  as.factor(full_data_na$group) 
summary(full_data_na )
##     allDates           group          Var01            Var02          
##  Min.   :2011-05-08   S01 :1762   Min.   :  9.03   Min.   :  1339900  
##  1st Qu.:2013-02-01   S02 :1762   1st Qu.: 23.10   1st Qu.: 12520675  
##  Median :2014-11-04   S03 :1762   Median : 38.44   Median : 21086550  
##  Mean   :2014-11-03   S04 :1762   Mean   : 46.98   Mean   : 37035741  
##  3rd Qu.:2016-08-05   S05 :1762   3rd Qu.: 66.78   3rd Qu.: 42486700  
##  Max.   :2018-05-03   S06 :1762   Max.   :195.18   Max.   :480879500  
##                       NA's: 791   NA's   :1645     NA's   :1633       
##      Var03            Var05            Var07       
##  Min.   :  8.82   Min.   :  8.99   Min.   :  8.92  
##  1st Qu.: 22.59   1st Qu.: 22.91   1st Qu.: 22.88  
##  Median : 37.66   Median : 38.05   Median : 38.05  
##  Mean   : 46.12   Mean   : 46.55   Mean   : 46.56  
##  3rd Qu.: 65.88   3rd Qu.: 66.38   3rd Qu.: 66.31  
##  Max.   :189.36   Max.   :195.00   Max.   :189.72  
##  NA's   :1657     NA's   :1657     NA's   :1657
str(full_data_na)
## 'data.frame':    11363 obs. of  7 variables:
##  $ allDates: Date, format: "2011-05-08" "2011-05-08" ...
##  $ group   : Factor w/ 6 levels "S01","S02","S03",..: 3 2 1 6 5 4 3 2 1 6 ...
##  $ Var01   : num  30.6 10.3 26.6 27.5 69.3 ...
##  $ Var02   : num  1.23e+08 6.09e+07 1.04e+07 3.93e+07 2.78e+07 ...
##  $ Var03   : num  30.3 10.1 25.9 26.8 68.2 ...
##  $ Var05   : num  30.5 10.2 26.2 27 68.7 ...
##  $ Var07   : num  30.6 10.3 26 27.3 69.2 ...
summary(dt_s01_v1)
##     allDates              Var01      
##  Min.   :2011-05-08   Min.   :23.01  
##  1st Qu.:2013-02-04   1st Qu.:29.85  
##  Median :2014-11-04   Median :35.66  
##  Mean   :2014-11-04   Mean   :39.41  
##  3rd Qu.:2016-08-03   3rd Qu.:48.70  
##  Max.   :2018-05-03   Max.   :62.31  
##                       NA's   :933
summary(dt_s01_v2)
##     allDates              Var02         
##  Min.   :2011-05-08   Min.   : 1339900  
##  1st Qu.:2013-02-04   1st Qu.: 5347550  
##  Median :2014-11-04   Median : 7895050  
##  Mean   :2014-11-04   Mean   : 8907092  
##  3rd Qu.:2016-08-03   3rd Qu.:11321675  
##  Max.   :2018-05-03   Max.   :48477500  
##                       NA's   :931
summary(dt_s01_v3)
##     allDates              Var03      
##  Min.   :2011-05-08   Min.   :22.28  
##  1st Qu.:2013-02-04   1st Qu.:29.33  
##  Median :2014-11-04   Median :35.07  
##  Mean   :2014-11-04   Mean   :38.65  
##  3rd Qu.:2016-08-03   3rd Qu.:47.89  
##  Max.   :2018-05-03   Max.   :61.59  
##                       NA's   :935
summary(dt_s01_v5)
##     allDates              Var05      
##  Min.   :2011-05-08   Min.   :22.55  
##  1st Qu.:2013-02-04   1st Qu.:29.59  
##  Median :2014-11-04   Median :35.34  
##  Mean   :2014-11-04   Mean   :39.01  
##  3rd Qu.:2016-08-03   3rd Qu.:48.28  
##  Max.   :2018-05-03   Max.   :62.29  
##                       NA's   :935
summary(dt_s01_v7)
##     allDates              Var07      
##  Min.   :2011-05-08   Min.   :22.50  
##  1st Qu.:2013-02-04   1st Qu.:29.60  
##  Median :2014-11-04   Median :35.36  
##  Mean   :2014-11-04   Mean   :39.05  
##  3rd Qu.:2016-08-03   3rd Qu.:48.26  
##  Max.   :2018-05-03   Max.   :62.14  
##                       NA's   :935

1 Group S01

The plots below suggest an increasing trend for all the variables in Group S01 except Var02, which appears to stay roughly level, with no trend over the majority of the data points.

plot(dt_s01_v1)

plot(dt_s01_v2)

plot(dt_s01_v3)

plot(dt_s01_v5)

plot(dt_s01_v7)

2 Working with NAs in the data and creating xts objects

dt_s01_v1_xts <- xts(c(dt_s01_v1$Var01),  order.by=as.Date(dt_s01_v1$allDates))%>% na.approx()
dt_s01_v2_xts <- xts(c(dt_s01_v2$Var02),  order.by=as.Date(dt_s01_v2$allDates))%>% na.approx()
dt_s01_v3_xts <- xts(c(dt_s01_v3$Var03),  order.by=as.Date(dt_s01_v3$allDates))%>% na.approx()
dt_s01_v5_xts <- xts(c(dt_s01_v5$Var05),  order.by=as.Date(dt_s01_v5$allDates))%>% na.approx()
dt_s01_v7_xts <- xts(c(dt_s01_v7$Var07),  order.by=as.Date(dt_s01_v7$allDates))%>% na.approx()



# Compare a few different imputation methods and the values they produce.
cbind("Value"=dt_s01_v1$Var01,
      "na.interp"=na.interp(dt_s01_v1$Var01),
      "na.approx"=na_seadec(dt_s01_v1$Var01),
      "na.kalman"=na.kalman(dt_s01_v1$Var01),
      "na.interpolation"=na.interpolation(dt_s01_v1$Var01)
      
      ) %>% 
  head(.,20)
## Time Series:
## Start = 1 
## End = 20 
## Frequency = 1 
##    Value na.interp na.approx na.kalman na.interpolation
##  1 26.61  26.61000  26.61000  26.61000         26.61000
##  2 26.30  26.30000  26.30000  26.30000         26.30000
##  3 26.03  26.03000  26.03000  26.03000         26.03000
##  4 25.84  25.84000  25.84000  25.84000         25.84000
##  5 26.34  26.34000  26.34000  26.34000         26.34000
##  6    NA  26.39000  26.39000  26.37014         26.39000
##  7    NA  26.44000  26.44000  26.41866         26.44000
##  8 26.49  26.49000  26.49000  26.49000         26.49000
##  9 26.03  26.03000  26.03000  26.03000         26.03000
## 10 25.16  25.16000  25.16000  25.16000         25.16000
## 11 25.00  25.00000  25.00000  25.00000         25.00000
## 12 24.77  24.77000  24.77000  24.77000         24.77000
## 13    NA  24.77250  24.77250  24.77825         24.77250
## 14    NA  24.77500  24.77500  24.77663         24.77500
## 15    NA  24.77750  24.77750  24.77502         24.77750
## 16 24.78  24.78000  24.78000  24.78000         24.78000
## 17 24.61  24.61000  24.61000  24.61000         24.61000
## 18 24.88  24.88000  24.88000  24.88000         24.88000
## 19 24.17  24.17000  24.17000  24.17000         24.17000
## 20    NA  24.05333  24.05333  24.07196         24.05333

We will fill the missing values by interpolation: na.interp is used for the comparison plot below, and the very similar linear na.approx is used when building the xts objects. The graph below shows how the imputed values blend in with the rest of the data.

# Re-create the xts series without imputation so the plot below can show where the NAs sit.
dt_s01_v1_xts <- xts(c(dt_s01_v1$Var01),  order.by=as.Date(dt_s01_v1$allDates))
dt_s01_v2_xts <- xts(c(dt_s01_v2$Var02),  order.by=as.Date(dt_s01_v2$allDates))
dt_s01_v3_xts <- xts(c(dt_s01_v3$Var03),  order.by=as.Date(dt_s01_v3$allDates))
dt_s01_v5_xts <- xts(c(dt_s01_v5$Var05),  order.by=as.Date(dt_s01_v5$allDates))
dt_s01_v7_xts <- xts(c(dt_s01_v7$Var07),  order.by=as.Date(dt_s01_v7$allDates))



# Plot imputed (na.interp) vs actual data for Var01 of Group S01
  autoplot(na.interp(as.ts(dt_s01_v1_xts)) , series="na.interp") + 
  autolayer(as.ts(dt_s01_v1_xts),series="Data")+
  scale_colour_manual(values=c("blue","red"),
             breaks=c("na.interp","Data"))


The plot above shows how the imputed values track the rest of the data over time. Let's now impute the remaining series with na.approx.

dt_s01_v1_xts <- xts(c(dt_s01_v1$Var01),  order.by=as.Date(dt_s01_v1$allDates)) %>% na.approx()
dt_s01_v2_xts <- xts(c(dt_s01_v2$Var02),  order.by=as.Date(dt_s01_v2$allDates))%>% na.approx()
dt_s01_v3_xts <- xts(c(dt_s01_v3$Var03),  order.by=as.Date(dt_s01_v3$allDates))%>% na.approx()
dt_s01_v5_xts <- xts(c(dt_s01_v5$Var05),  order.by=as.Date(dt_s01_v5$allDates))%>% na.approx()
dt_s01_v7_xts <- xts(c(dt_s01_v7$Var07),  order.by=as.Date(dt_s01_v7$allDates))%>% na.approx()


# Compare the first 10 raw values with the imputed xts series.
cbind(dt_s01_v1[1:10,], as.data.frame(dt_s01_v1_xts[1:10]))
# Plot ACF and PACF
par(mfrow=c(1,2))    # set the plotting area into a 1*2 array
acf(dt_s01_v1_xts)
pacf(dt_s01_v1_xts)

par(mfrow=c(1,2)) 
acf(dt_s01_v2_xts)
pacf(dt_s01_v2_xts)

par(mfrow=c(1,2)) 
acf(dt_s01_v3_xts)
pacf(dt_s01_v3_xts)

par(mfrow=c(1,2)) 
acf(dt_s01_v5_xts)
pacf(dt_s01_v5_xts)

par(mfrow=c(1,2)) 
acf(dt_s01_v7_xts)
pacf(dt_s01_v7_xts)

dt_s01_v1_xts %>% ggtsdisplay(main="")

dt_s01_v1_xts %>% diff()  %>% ggtsdisplay(main="")

Since the ACF decays towards zero very slowly for all the variables, we can say that our series are non-stationary.

Let's use differencing to make the data stationary. We will use the KPSS test (null hypothesis: the series is stationary) to check whether the null can be rejected.

library(urca)
dt_s01_v1_xts %>% ur.kpss() %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 23.217 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
# Dickey-Fuller test before and after differencing
dt_s01_v1_xts %>% ur.df() %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9468 -0.1307 -0.0100  0.1275  4.8576 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## z.lag.1    0.0003055  0.0002005   1.523    0.128    
## z.diff.lag 0.1140899  0.0205036   5.564 2.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3978 on 2349 degrees of freedom
## Multiple R-squared:  0.01432,    Adjusted R-squared:  0.01348 
## F-statistic: 17.07 on 2 and 2349 DF,  p-value: 4.376e-08
## 
## 
## Value of test-statistic is: 1.5234 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.58 -1.95 -1.62

So our data is clearly not stationary. Let's apply differencing and see whether the test statistic drops below the critical values, even at the 1% level.

dt_s01_v1_xts %>% diff() %>% ur.kpss() %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 0.1004 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
dt_s01_v1_xts %>% diff() %>% .[!is.na(.)] %>% ur.df() %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9236 -0.1182  0.0013  0.1399  4.8670 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## z.lag.1    -0.89825    0.02744 -32.734   <2e-16 ***
## z.diff.lag  0.01536    0.02063   0.744    0.457    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.398 on 2348 degrees of freedom
## Multiple R-squared:  0.4425, Adjusted R-squared:  0.442 
## F-statistic: 931.9 on 2 and 2348 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -32.7341 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.58 -1.95 -1.62
dt_s01_v1_xts %>% diff() %>% .[!is.na(.)] %>% tseries::adf.test()
## 
##  Augmented Dickey-Fuller Test
## 
## data:  .
## Dickey-Fuller = -12.741, Lag order = 13, p-value = 0.01
## alternative hypothesis: stationary
cbind("Main Data S01_V1"= dt_s01_v1_xts,
      "Diff Var01"= (diff(dt_s01_v1_xts,na.action = na.pass ))) %>% autoplot()

The KPSS test statistic is now 0.1004, well below the critical values, so the differenced series can be treated as stationary. We will use ndiffs() to find the number of differences needed for the remaining series.

par(mfrow=c(1,2)) 
acf(diff(dt_s01_v1_xts ),na.action = na.pass)
pacf(diff(dt_s01_v1_xts ),na.action = na.pass)

The ACF plot above suggests that, after differencing, Var01 is correlated mainly with the first lag, i.e. a first-order MA model could describe the differenced data. The PACF shows many significant spikes, so an AR model would need too many terms and would be overly complicated. Hence we lean towards a first-order MA model.
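As a quick cross-check of that reasoning (a sketch added here, not part of the original workflow; fit_manual_s01_v1 is an illustrative name), an ARIMA(0,1,1) with drift can be specified by hand and compared against the auto.arima() fit reported below.

# Hand-specified ARIMA(0,1,1) with drift on S01 Var01, for comparison with auto.arima().
fit_manual_s01_v1 <- forecast::Arima(dt_s01_v1_xts, order = c(0, 1, 1), include.drift = TRUE)
summary(fit_manual_s01_v1)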

ndiffs(dt_s01_v1_xts)
## [1] 1
ndiffs(dt_s01_v2_xts)
## [1] 1
ndiffs(dt_s01_v3_xts)
## [1] 1
ndiffs(dt_s01_v5_xts)
## [1] 1
ndiffs(dt_s01_v7_xts)
## [1] 1

# Let's use auto.arima() to validate our model choice and fit it.

aa_fit_dt_s01_v1 <- auto.arima(dt_s01_v1_xts)
aa_fit_dt_s01_v1_log <- auto.arima(log(dt_s01_v1_xts))
summary(aa_fit_dt_s01_v1)
## Series: dt_s01_v1_xts 
## ARIMA(0,1,1) with drift 
## 
## Coefficients:
##          ma1   drift
##       0.1163  0.0152
## s.e.  0.0205  0.0091
## 
## sigma^2 estimated as 0.1581:  log likelihood=-1167.44
## AIC=2340.89   AICc=2340.9   BIC=2358.18
## 
## Training set error measures:
##                        ME      RMSE       MAE          MPE      MAPE      MASE
## Training set 1.447248e-05 0.3974086 0.2373141 -0.009213095 0.6215781 0.9837201
##                       ACF1
## Training set -0.0002184172
summary(aa_fit_dt_s01_v1_log)
## Series: log(dt_s01_v1_xts) 
## ARIMA(1,1,0) with drift 
## 
## Coefficients:
##          ar1  drift
##       0.1117  4e-04
## s.e.  0.0205  2e-04
## 
## sigma^2 estimated as 0.0001003:  log likelihood=7491.16
## AIC=-14976.31   AICc=-14976.3   BIC=-14959.02
## 
## Training set error measures:
##                        ME       RMSE         MAE           MPE      MAPE
## Training set 9.481692e-07 0.01001024 0.006208169 -0.0004976877 0.1734165
##                   MASE          ACF1
## Training set 0.9833871 -8.713263e-06
# aa_fit_dt_s01_v1 %>% forecast(h=140)

coef(aa_fit_dt_s01_v1)
##        ma1      drift 
## 0.11626944 0.01517711
# ACF plot of residuals 
checkresiduals(aa_fit_dt_s01_v1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1) with drift
## Q* = 10.332, df = 8, p-value = 0.2425
## 
## Model df: 2.   Total lags used: 10
checkresiduals(aa_fit_dt_s01_v1_log)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0) with drift
## Q* = 8.7143, df = 8, p-value = 0.367
## 
## Model df: 2.   Total lags used: 10
acf(resid(aa_fit_dt_s01_v1))

pacf(aa_fit_dt_s01_v1$residuals)

Box.test(aa_fit_dt_s01_v1$residuals, lag=30,type = 'Ljung-Box')
## 
##  Box-Ljung test
## 
## data:  aa_fit_dt_s01_v1$residuals
## X-squared = 49.988, df = 30, p-value = 0.01244
# The ACF plot of the residuals looks like white noise; no prominent patterns are visible.
# The Ljung-Box test from checkresiduals() also returned a high p-value (0.24), so we cannot reject the null hypothesis that the residuals are white noise (note, however, that the lag-30 Box-Ljung test above gives p = 0.012, which would reject at the 5% level).
forecast(aa_fit_dt_s01_v1,h=140) %>% autoplot()

forecast(aa_fit_dt_s01_v1_log,h=140) %>% autoplot()

aa_fit_dt_s01_v1_log
## Series: log(dt_s01_v1_xts) 
## ARIMA(1,1,0) with drift 
## 
## Coefficients:
##          ar1  drift
##       0.1117  4e-04
## s.e.  0.0205  2e-04
## 
## sigma^2 estimated as 0.0001003:  log likelihood=7491.16
## AIC=-14976.31   AICc=-14976.3   BIC=-14959.02
Fore_dt_s01_v1<- forecast(aa_fit_dt_s01_v1,h=140)
Fore_dt_s01_v1_log<- forecast(aa_fit_dt_s01_v1_log,h=140)

# xlsx::write.xlsx(exp(Fore_dt_s01_v1_log$mean), file = "Predictive Analytics Project 1log.xlsx",  col.name = T, row.names = T, append = T)
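The commented-out export above hints at the deliverable. For the Requirement #2 workbook (one tab per group, ordered by SeriesInd), something along the following lines could assemble and export a sheet. This is only a sketch, assuming the openxlsx package is available; s01_sheet is an illustrative name, and the actual column layout should mirror the source file.

# Sketch: build a forecast sheet for S01 / Var01 and write it as one tab of a workbook.
s01_sheet <- data.frame(
  SeriesInd = seq(from = max(full_data$SeriesInd) + 1, by = 1, length.out = 140),
  Var01     = as.numeric(Fore_dt_s01_v1$mean)
)
# openxlsx::write.xlsx(list(S01 = s01_sheet), file = "Forecast Workbook.xlsx")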


autoplot(aa_fit_dt_s01_v1)

autoplot(aa_fit_dt_s01_v1_log)

# Inverse characteristic roots of the ARIMA(0,1,1) model: the red dots are well within the unit circle, so the model is invertible and stationary, although roots away from the unit circle do not by themselves guarantee a good fit.
Fore_dt_s01_v1
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2354       62.32814 61.81851 62.83776 61.54873 63.10754
## 2355       62.34331 61.57955 63.10708 61.17523 63.51139
## 2356       62.35849 61.40615 63.31084 60.90200 63.81498
## 2357       62.37367 61.26435 63.48299 60.67711 64.07022
## 2358       62.38885 61.14217 63.63552 60.48222 64.29548
## 2359       62.40402 61.03368 63.77436 60.30827 64.49978
## 2360       62.41920 60.93547 63.90293 60.15003 64.68837
## 2361       62.43438 60.84533 64.02343 60.00414 64.86462
## 2362       62.44955 60.76175 64.13736 59.86827 65.03083
## 2363       62.46473 60.68363 64.24583 59.74077 65.18869
## 2364       62.47991 60.61016 64.34965 59.62038 65.33943
## 2365       62.49509 60.54071 64.44946 59.50613 65.48404
## 2366       62.51026 60.47478 64.54574 59.39726 65.62326
## 2367       62.52544 60.41196 64.63892 59.29315 65.75773
## 2368       62.54062 60.35191 64.72932 59.19328 65.88795
## 2369       62.55579 60.29437 64.81722 59.09724 66.01435
## 2370       62.57097 60.23909 64.90285 59.00466 66.13728
## 2371       62.58615 60.18588 64.98642 58.91525 66.25705
## 2372       62.60132 60.13456 65.06809 58.82874 66.37391
## 2373       62.61650 60.08499 65.14801 58.74489 66.48811
## 2374       62.63168 60.03704 65.22632 58.66352 66.59984
## 2375       62.64686 59.99058 65.30313 58.58444 66.70928
## 2376       62.66203 59.94553 65.37854 58.50749 66.81657
## 2377       62.67721 59.90178 65.45264 58.43255 66.92187
## 2378       62.69239 59.85925 65.52552 58.35948 67.02529
## 2379       62.70756 59.81788 65.59725 58.28817 67.12696
## 2380       62.72274 59.77759 65.66789 58.21853 67.22696
## 2381       62.73792 59.73833 65.73751 58.15045 67.32539
## 2382       62.75310 59.70004 65.80615 58.08385 67.42234
## 2383       62.76827 59.66267 65.87388 58.01866 67.51788
## 2384       62.78345 59.62617 65.94073 57.95481 67.61209
## 2385       62.79863 59.59051 66.00674 57.89224 67.70502
## 2386       62.81380 59.55564 66.07197 57.83087 67.79674
## 2387       62.82898 59.52153 66.13644 57.77067 67.88730
## 2388       62.84416 59.48814 66.20018 57.71157 67.97675
## 2389       62.85934 59.45544 66.26323 57.65353 68.06515
## 2390       62.87451 59.42341 66.32562 57.59650 68.15252
## 2391       62.88969 59.39201 66.38737 57.54045 68.23893
## 2392       62.90487 59.36123 66.44851 57.48534 68.32439
## 2393       62.92004 59.33103 66.50905 57.43113 68.40896
## 2394       62.93522 59.30141 66.56904 57.37778 68.49266
## 2395       62.95040 59.27232 66.62847 57.32527 68.57553
## 2396       62.96558 59.24377 66.68738 57.27356 68.65759
## 2397       62.98075 59.21572 66.74579 57.22263 68.73888
## 2398       62.99593 59.18816 66.80370 57.17245 68.81941
## 2399       63.01111 59.16108 66.86114 57.12299 68.89922
## 2400       63.02628 59.13445 66.91812 57.07424 68.97833
## 2401       63.04146 59.10827 66.97465 57.02617 69.05675
## 2402       63.05664 59.08252 67.03075 56.97875 69.13452
## 2403       63.07182 59.05719 67.08644 56.93197 69.21166
## 2404       63.08699 59.03226 67.14172 56.88582 69.28817
## 2405       63.10217 59.00773 67.19661 56.84026 69.36408
## 2406       63.11735 58.98357 67.25112 56.79528 69.43941
## 2407       63.13252 58.95979 67.30526 56.75088 69.51417
## 2408       63.14770 58.93637 67.35904 56.70702 69.58838
## 2409       63.16288 58.91330 67.41246 56.66370 69.66205
## 2410       63.17806 58.89056 67.46555 56.62090 69.73521
## 2411       63.19323 58.86817 67.51830 56.57861 69.80785
## 2412       63.20841 58.84609 67.57073 56.53682 69.88000
## 2413       63.22359 58.82433 67.62284 56.49551 69.95167
## 2414       63.23876 58.80288 67.67465 56.45466 70.02286
## 2415       63.25394 58.78173 67.72615 56.41428 70.09360
## 2416       63.26912 58.76087 67.77737 56.37435 70.16389
## 2417       63.28430 58.74030 67.82830 56.33485 70.23374
## 2418       63.29947 58.72000 67.87894 56.29577 70.30317
## 2419       63.31465 58.69998 67.92932 56.25712 70.37218
## 2420       63.32983 58.68022 67.97943 56.21887 70.44078
## 2421       63.34500 58.66073 68.02928 56.18102 70.50898
## 2422       63.36018 58.64149 68.07887 56.14356 70.57680
## 2423       63.37536 58.62250 68.12822 56.10649 70.64423
## 2424       63.39053 58.60375 68.17732 56.06978 70.71129
## 2425       63.40571 58.58524 68.22618 56.03344 70.77798
## 2426       63.42089 58.56697 68.27481 55.99746 70.84432
## 2427       63.43607 58.54892 68.32321 55.96183 70.91031
## 2428       63.45124 58.53110 68.37138 55.92654 70.97595
## 2429       63.46642 58.51350 68.41934 55.89158 71.04126
## 2430       63.48160 58.49612 68.46708 55.85696 71.10623
## 2431       63.49677 58.47894 68.51461 55.82266 71.17089
## 2432       63.51195 58.46197 68.56193 55.78868 71.23523
## 2433       63.52713 58.44521 68.60905 55.75501 71.29925
## 2434       63.54231 58.42865 68.65597 55.72164 71.36297
## 2435       63.55748 58.41228 68.70269 55.68857 71.42640
## 2436       63.57266 58.39610 68.74922 55.65580 71.48952
## 2437       63.58784 58.38011 68.79556 55.62331 71.55236
## 2438       63.60301 58.36431 68.84172 55.59111 71.61492
## 2439       63.61819 58.34869 68.88769 55.55919 71.67720
## 2440       63.63337 58.33325 68.93349 55.52754 71.73920
## 2441       63.64855 58.31799 68.97911 55.49616 71.80093
## 2442       63.66372 58.30289 69.02455 55.46504 71.86241
## 2443       63.67890 58.28797 69.06983 55.43418 71.92362
## 2444       63.69408 58.27322 69.11494 55.40358 71.98457
## 2445       63.70925 58.25862 69.15988 55.37324 72.04527
## 2446       63.72443 58.24420 69.20467 55.34313 72.10573
## 2447       63.73961 58.22993 69.24929 55.31327 72.16594
## 2448       63.75479 58.21581 69.29376 55.28366 72.22592
## 2449       63.76996 58.20185 69.33807 55.25427 72.28565
## 2450       63.78514 58.18804 69.38224 55.22512 72.34516
## 2451       63.80032 58.17439 69.42625 55.19620 72.40444
## 2452       63.81549 58.16087 69.47011 55.16750 72.46349
## 2453       63.83067 58.14751 69.51383 55.13902 72.52232
## 2454       63.84585 58.13428 69.55741 55.11076 72.58093
## 2455       63.86103 58.12120 69.60085 55.08272 72.63933
## 2456       63.87620 58.10826 69.64415 55.05489 72.69752
## 2457       63.89138 58.09545 69.68731 55.02727 72.75549
## 2458       63.90656 58.08277 69.73034 54.99985 72.81327
## 2459       63.92173 58.07023 69.77324 54.97263 72.87083
## 2460       63.93691 58.05782 69.81600 54.94562 72.92820
## 2461       63.95209 58.04554 69.85864 54.91880 72.98538
## 2462       63.96727 58.03338 69.90115 54.89218 73.04235
## 2463       63.98244 58.02135 69.94353 54.86574 73.09914
## 2464       63.99762 58.00945 69.98579 54.83950 73.15574
## 2465       64.01280 57.99766 70.02793 54.81345 73.21215
## 2466       64.02797 57.98600 70.06995 54.78757 73.26837
## 2467       64.04315 57.97446 70.11185 54.76188 73.32442
## 2468       64.05833 57.96303 70.15363 54.73637 73.38029
## 2469       64.07351 57.95172 70.19529 54.71104 73.43597
## 2470       64.08868 57.94052 70.23685 54.68588 73.49149
## 2471       64.10386 57.92943 70.27829 54.66089 73.54683
## 2472       64.11904 57.91846 70.31962 54.63607 73.60200
## 2473       64.13421 57.90759 70.36083 54.61142 73.65701
## 2474       64.14939 57.89684 70.40194 54.58694 73.71185
## 2475       64.16457 57.88619 70.44295 54.56262 73.76652
## 2476       64.17974 57.87565 70.48384 54.53846 73.82103
## 2477       64.19492 57.86521 70.52464 54.51446 73.87539
## 2478       64.21010 57.85487 70.56533 54.49062 73.92958
## 2479       64.22528 57.84464 70.60591 54.46693 73.98362
## 2480       64.24045 57.83451 70.64640 54.44340 74.03750
## 2481       64.25563 57.82447 70.68679 54.42002 74.09124
## 2482       64.27081 57.81454 70.72708 54.39680 74.14482
## 2483       64.28598 57.80470 70.76727 54.37372 74.19825
## 2484       64.30116 57.79496 70.80736 54.35079 74.25154
## 2485       64.31634 57.78532 70.84736 54.32800 74.30468
## 2486       64.33152 57.77576 70.88727 54.30536 74.35767
## 2487       64.34669 57.76630 70.92708 54.28286 74.41053
## 2488       64.36187 57.75694 70.96680 54.26050 74.46324
## 2489       64.37705 57.74766 71.00643 54.23828 74.51582
## 2490       64.39222 57.73848 71.04597 54.21619 74.56826
## 2491       64.40740 57.72938 71.08543 54.19425 74.62056
## 2492       64.42258 57.72037 71.12479 54.17243 74.67273
## 2493       64.43776 57.71145 71.16407 54.15075 74.72476
summary(Fore_dt_s01_v1)
## 
## Forecast method: ARIMA(0,1,1) with drift
## 
## Model Information:
## Series: dt_s01_v1_xts 
## ARIMA(0,1,1) with drift 
## 
## Coefficients:
##          ma1   drift
##       0.1163  0.0152
## s.e.  0.0205  0.0091
## 
## sigma^2 estimated as 0.1581:  log likelihood=-1167.44
## AIC=2340.89   AICc=2340.9   BIC=2358.18
## 
## Error measures:
##                        ME      RMSE       MAE          MPE      MAPE      MASE
## Training set 1.447248e-05 0.3974086 0.2373141 -0.009213095 0.6215781 0.9837201
##                       ACF1
## Training set -0.0002184172
## 
## Forecasts:
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2354       62.32814 61.81851 62.83776 61.54873 63.10754
## 2355       62.34331 61.57955 63.10708 61.17523 63.51139
## 2356       62.35849 61.40615 63.31084 60.90200 63.81498
## 2357       62.37367 61.26435 63.48299 60.67711 64.07022
## 2358       62.38885 61.14217 63.63552 60.48222 64.29548
## 2359       62.40402 61.03368 63.77436 60.30827 64.49978
## 2360       62.41920 60.93547 63.90293 60.15003 64.68837
## 2361       62.43438 60.84533 64.02343 60.00414 64.86462
## 2362       62.44955 60.76175 64.13736 59.86827 65.03083
## 2363       62.46473 60.68363 64.24583 59.74077 65.18869
## 2364       62.47991 60.61016 64.34965 59.62038 65.33943
## 2365       62.49509 60.54071 64.44946 59.50613 65.48404
## 2366       62.51026 60.47478 64.54574 59.39726 65.62326
## 2367       62.52544 60.41196 64.63892 59.29315 65.75773
## 2368       62.54062 60.35191 64.72932 59.19328 65.88795
## 2369       62.55579 60.29437 64.81722 59.09724 66.01435
## 2370       62.57097 60.23909 64.90285 59.00466 66.13728
## 2371       62.58615 60.18588 64.98642 58.91525 66.25705
## 2372       62.60132 60.13456 65.06809 58.82874 66.37391
## 2373       62.61650 60.08499 65.14801 58.74489 66.48811
## 2374       62.63168 60.03704 65.22632 58.66352 66.59984
## 2375       62.64686 59.99058 65.30313 58.58444 66.70928
## 2376       62.66203 59.94553 65.37854 58.50749 66.81657
## 2377       62.67721 59.90178 65.45264 58.43255 66.92187
## 2378       62.69239 59.85925 65.52552 58.35948 67.02529
## 2379       62.70756 59.81788 65.59725 58.28817 67.12696
## 2380       62.72274 59.77759 65.66789 58.21853 67.22696
## 2381       62.73792 59.73833 65.73751 58.15045 67.32539
## 2382       62.75310 59.70004 65.80615 58.08385 67.42234
## 2383       62.76827 59.66267 65.87388 58.01866 67.51788
## 2384       62.78345 59.62617 65.94073 57.95481 67.61209
## 2385       62.79863 59.59051 66.00674 57.89224 67.70502
## 2386       62.81380 59.55564 66.07197 57.83087 67.79674
## 2387       62.82898 59.52153 66.13644 57.77067 67.88730
## 2388       62.84416 59.48814 66.20018 57.71157 67.97675
## 2389       62.85934 59.45544 66.26323 57.65353 68.06515
## 2390       62.87451 59.42341 66.32562 57.59650 68.15252
## 2391       62.88969 59.39201 66.38737 57.54045 68.23893
## 2392       62.90487 59.36123 66.44851 57.48534 68.32439
## 2393       62.92004 59.33103 66.50905 57.43113 68.40896
## 2394       62.93522 59.30141 66.56904 57.37778 68.49266
## 2395       62.95040 59.27232 66.62847 57.32527 68.57553
## 2396       62.96558 59.24377 66.68738 57.27356 68.65759
## 2397       62.98075 59.21572 66.74579 57.22263 68.73888
## 2398       62.99593 59.18816 66.80370 57.17245 68.81941
## 2399       63.01111 59.16108 66.86114 57.12299 68.89922
## 2400       63.02628 59.13445 66.91812 57.07424 68.97833
## 2401       63.04146 59.10827 66.97465 57.02617 69.05675
## 2402       63.05664 59.08252 67.03075 56.97875 69.13452
## 2403       63.07182 59.05719 67.08644 56.93197 69.21166
## 2404       63.08699 59.03226 67.14172 56.88582 69.28817
## 2405       63.10217 59.00773 67.19661 56.84026 69.36408
## 2406       63.11735 58.98357 67.25112 56.79528 69.43941
## 2407       63.13252 58.95979 67.30526 56.75088 69.51417
## 2408       63.14770 58.93637 67.35904 56.70702 69.58838
## 2409       63.16288 58.91330 67.41246 56.66370 69.66205
## 2410       63.17806 58.89056 67.46555 56.62090 69.73521
## 2411       63.19323 58.86817 67.51830 56.57861 69.80785
## 2412       63.20841 58.84609 67.57073 56.53682 69.88000
## 2413       63.22359 58.82433 67.62284 56.49551 69.95167
## 2414       63.23876 58.80288 67.67465 56.45466 70.02286
## 2415       63.25394 58.78173 67.72615 56.41428 70.09360
## 2416       63.26912 58.76087 67.77737 56.37435 70.16389
## 2417       63.28430 58.74030 67.82830 56.33485 70.23374
## 2418       63.29947 58.72000 67.87894 56.29577 70.30317
## 2419       63.31465 58.69998 67.92932 56.25712 70.37218
## 2420       63.32983 58.68022 67.97943 56.21887 70.44078
## 2421       63.34500 58.66073 68.02928 56.18102 70.50898
## 2422       63.36018 58.64149 68.07887 56.14356 70.57680
## 2423       63.37536 58.62250 68.12822 56.10649 70.64423
## 2424       63.39053 58.60375 68.17732 56.06978 70.71129
## 2425       63.40571 58.58524 68.22618 56.03344 70.77798
## 2426       63.42089 58.56697 68.27481 55.99746 70.84432
## 2427       63.43607 58.54892 68.32321 55.96183 70.91031
## 2428       63.45124 58.53110 68.37138 55.92654 70.97595
## 2429       63.46642 58.51350 68.41934 55.89158 71.04126
## 2430       63.48160 58.49612 68.46708 55.85696 71.10623
## 2431       63.49677 58.47894 68.51461 55.82266 71.17089
## 2432       63.51195 58.46197 68.56193 55.78868 71.23523
## 2433       63.52713 58.44521 68.60905 55.75501 71.29925
## 2434       63.54231 58.42865 68.65597 55.72164 71.36297
## 2435       63.55748 58.41228 68.70269 55.68857 71.42640
## 2436       63.57266 58.39610 68.74922 55.65580 71.48952
## 2437       63.58784 58.38011 68.79556 55.62331 71.55236
## 2438       63.60301 58.36431 68.84172 55.59111 71.61492
## 2439       63.61819 58.34869 68.88769 55.55919 71.67720
## 2440       63.63337 58.33325 68.93349 55.52754 71.73920
## 2441       63.64855 58.31799 68.97911 55.49616 71.80093
## 2442       63.66372 58.30289 69.02455 55.46504 71.86241
## 2443       63.67890 58.28797 69.06983 55.43418 71.92362
## 2444       63.69408 58.27322 69.11494 55.40358 71.98457
## 2445       63.70925 58.25862 69.15988 55.37324 72.04527
## 2446       63.72443 58.24420 69.20467 55.34313 72.10573
## 2447       63.73961 58.22993 69.24929 55.31327 72.16594
## 2448       63.75479 58.21581 69.29376 55.28366 72.22592
## 2449       63.76996 58.20185 69.33807 55.25427 72.28565
## 2450       63.78514 58.18804 69.38224 55.22512 72.34516
## 2451       63.80032 58.17439 69.42625 55.19620 72.40444
## 2452       63.81549 58.16087 69.47011 55.16750 72.46349
## 2453       63.83067 58.14751 69.51383 55.13902 72.52232
## 2454       63.84585 58.13428 69.55741 55.11076 72.58093
## 2455       63.86103 58.12120 69.60085 55.08272 72.63933
## 2456       63.87620 58.10826 69.64415 55.05489 72.69752
## 2457       63.89138 58.09545 69.68731 55.02727 72.75549
## 2458       63.90656 58.08277 69.73034 54.99985 72.81327
## 2459       63.92173 58.07023 69.77324 54.97263 72.87083
## 2460       63.93691 58.05782 69.81600 54.94562 72.92820
## 2461       63.95209 58.04554 69.85864 54.91880 72.98538
## 2462       63.96727 58.03338 69.90115 54.89218 73.04235
## 2463       63.98244 58.02135 69.94353 54.86574 73.09914
## 2464       63.99762 58.00945 69.98579 54.83950 73.15574
## 2465       64.01280 57.99766 70.02793 54.81345 73.21215
## 2466       64.02797 57.98600 70.06995 54.78757 73.26837
## 2467       64.04315 57.97446 70.11185 54.76188 73.32442
## 2468       64.05833 57.96303 70.15363 54.73637 73.38029
## 2469       64.07351 57.95172 70.19529 54.71104 73.43597
## 2470       64.08868 57.94052 70.23685 54.68588 73.49149
## 2471       64.10386 57.92943 70.27829 54.66089 73.54683
## 2472       64.11904 57.91846 70.31962 54.63607 73.60200
## 2473       64.13421 57.90759 70.36083 54.61142 73.65701
## 2474       64.14939 57.89684 70.40194 54.58694 73.71185
## 2475       64.16457 57.88619 70.44295 54.56262 73.76652
## 2476       64.17974 57.87565 70.48384 54.53846 73.82103
## 2477       64.19492 57.86521 70.52464 54.51446 73.87539
## 2478       64.21010 57.85487 70.56533 54.49062 73.92958
## 2479       64.22528 57.84464 70.60591 54.46693 73.98362
## 2480       64.24045 57.83451 70.64640 54.44340 74.03750
## 2481       64.25563 57.82447 70.68679 54.42002 74.09124
## 2482       64.27081 57.81454 70.72708 54.39680 74.14482
## 2483       64.28598 57.80470 70.76727 54.37372 74.19825
## 2484       64.30116 57.79496 70.80736 54.35079 74.25154
## 2485       64.31634 57.78532 70.84736 54.32800 74.30468
## 2486       64.33152 57.77576 70.88727 54.30536 74.35767
## 2487       64.34669 57.76630 70.92708 54.28286 74.41053
## 2488       64.36187 57.75694 70.96680 54.26050 74.46324
## 2489       64.37705 57.74766 71.00643 54.23828 74.51582
## 2490       64.39222 57.73848 71.04597 54.21619 74.56826
## 2491       64.40740 57.72938 71.08543 54.19425 74.62056
## 2492       64.42258 57.72037 71.12479 54.17243 74.67273
## 2493       64.43776 57.71145 71.16407 54.15075 74.72476
summary(aa_fit_dt_s01_v1_log)
## Series: log(dt_s01_v1_xts) 
## ARIMA(1,1,0) with drift 
## 
## Coefficients:
##          ar1  drift
##       0.1117  4e-04
## s.e.  0.0205  2e-04
## 
## sigma^2 estimated as 0.0001003:  log likelihood=7491.16
## AIC=-14976.31   AICc=-14976.3   BIC=-14959.02
## 
## Training set error measures:
##                        ME       RMSE         MAE           MPE      MAPE
## Training set 9.481692e-07 0.01001024 0.006208169 -0.0004976877 0.1734165
##                   MASE          ACF1
## Training set 0.9833871 -8.713263e-06
summary(aa_fit_dt_s01_v1)
## Series: dt_s01_v1_xts 
## ARIMA(0,1,1) with drift 
## 
## Coefficients:
##          ma1   drift
##       0.1163  0.0152
## s.e.  0.0205  0.0091
## 
## sigma^2 estimated as 0.1581:  log likelihood=-1167.44
## AIC=2340.89   AICc=2340.9   BIC=2358.18
## 
## Training set error measures:
##                        ME      RMSE       MAE          MPE      MAPE      MASE
## Training set 1.447248e-05 0.3974086 0.2373141 -0.009213095 0.6215781 0.9837201
##                       ACF1
## Training set -0.0002184172
AIC(aa_fit_dt_s01_v1_log)
## [1] -14976.31
MLmetrics::MAPE(aa_fit_dt_s01_v1$fitted,dt_s01_v1_xts)
## [1] 0.006215781
MLmetrics::MAPE(exp(aa_fit_dt_s01_v1_log$fitted),dt_s01_v1_xts)
## [1] 0.006205727

2.1 Extra: Var01

ggseasonplot(ts(dt_s01_v1_xts,frequency=30))

ggseasonplot(ts(dt_s01_v1_xts,frequency=30),polar = TRUE)

ggseasonplot(ts(dt_s01_v1_xts,frequency=365))

lambda1 <- BoxCox.lambda(diff(dt_s01_v1_xts))
dt_s01_v1_xts_t <- BoxCox((dt_s01_v1_xts),lambda1)
auto.arima(dt_s01_v1_xts_t,stepwise = F, approximation = F, d = 0)
## Series: dt_s01_v1_xts_t 
## ARIMA(0,0,5) with non-zero mean 
## 
## Coefficients:
##          ma1   ma2     ma3     ma4     ma5     mean
##       2.3479  3.21  2.9510  1.7925  0.5865  38.4549
## s.e.  0.0204  0.04  0.0401  0.0269  0.0135   0.2961
## 
## sigma^2 estimated as 1.466:  log likelihood=-3789.92
## AIC=7593.84   AICc=7593.89   BIC=7634.18
decompose(ts(dt_s01_v1_xts,frequency=31),type = "multiplicative") %>% autoplot()

decompose(ts(dt_s01_v1_xts,frequency=31)) %>% autoplot()

decompose(ts(dt_s01_v1_xts,frequency=365)) %>% autoplot()

auto.arima(ts(dt_s01_v1_xts,frequency=30))
## Series: ts(dt_s01_v1_xts, frequency = 30) 
## ARIMA(0,1,1) with drift 
## 
## Coefficients:
##          ma1   drift
##       0.1163  0.0152
## s.e.  0.0205  0.0091
## 
## sigma^2 estimated as 0.1581:  log likelihood=-1167.44
## AIC=2340.89   AICc=2340.9   BIC=2358.18
auto.arima(ts(dt_s01_v1_xts))
## Series: ts(dt_s01_v1_xts) 
## ARIMA(0,1,1) with drift 
## 
## Coefficients:
##          ma1   drift
##       0.1163  0.0152
## s.e.  0.0205  0.0091
## 
## sigma^2 estimated as 0.1581:  log likelihood=-1167.44
## AIC=2340.89   AICc=2340.9   BIC=2358.18
auto.arima(log(dt_s01_v1_xts))
## Series: log(dt_s01_v1_xts) 
## ARIMA(1,1,0) with drift 
## 
## Coefficients:
##          ar1  drift
##       0.1117  4e-04
## s.e.  0.0205  2e-04
## 
## sigma^2 estimated as 0.0001003:  log likelihood=7491.16
## AIC=-14976.31   AICc=-14976.3   BIC=-14959.02
autoplot(ts(dt_s01_v1_xts,frequency=365), series="Data") +
  autolayer(trendcycle(decompose(ts(dt_s01_v1_xts,frequency=365))), series="Trend") +
  autolayer(seasadj(decompose(ts(dt_s01_v1_xts,frequency=365))), series="Seasonally Adjusted") +
  xlab("Year") + ylab("Var01") +
  ggtitle("Group S01 Var01") +
  scale_colour_manual(values=c("orange","blue","red"),
             breaks=c("Data","Seasonally Adjusted","Trend"))

3 Variable 02

 # Plot Var02 of Group S01 together with its log transform
  autoplot(as.ts(dt_s01_v2_xts) , series="V02") + 
  autolayer(log(as.ts(dt_s01_v2_xts)),series="log(V02)")+
  scale_colour_manual(values=c("blue","red"),
             breaks=c("V02","log(V02)"))

# ACF and PACF plot 
dt_s01_v2_xts %>% ggtsdisplay(main="",lag.max = 365)

dt_s01_v2_xts %>% ggtsdisplay(main="")

# The PACF plot shows many spikes that are not close to zero; let's try a log transformation and check again. There are non-zero autocorrelations in the data from lag 1 to about 35.
# The ACF plot shows strong correlation in the first few lags, much like a random walk, and then decays slowly, exhibiting moderate autocorrelation.

#https://www.statisticshowto.com/kpss-test/
# The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test figures out if a time series is stationary around a mean or linear trend, or is non-stationary due to a unit root. A stationary time series is one where statistical properties - like the mean and variance - are constant over time.

# The null hypothesis for the test is that the data is stationary.
# The alternate hypothesis for the test is that the data is not stationary.


dt_s01_v2_xts %>% ur.kpss() %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 13.4497 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
# The null hypothesis for the test is that the data is stationary.
# Here you should reject the null hypothesis of stationarity as the value of the test statistic is more extreme than the 10%, 5% and 1% critical values (13.4497 >  0.347 , 13.4497 > 0.463, 13.4497 > 0.739).
# We reject the null and accept the alternative: the series is not stationary.

dt_s01_v2_xts %>% diff() %>% ur.kpss() %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 0.003 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
# Since the test statistic (0.003) is less than the 10%, 5% and 1% critical values (0.003 < 0.347, 0.003 < 0.463, 0.003 < 0.739), we can say the differenced data is now stationary.

dt_s01_v2_xts %>% log() %>%diff() %>% ur.kpss() %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 0.0033 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
dt_s01_v2_xts %>%  .[!is.na(.)] %>% ur.df() %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -15966892   -864079    107490   1125529  40490634 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## z.lag.1    -0.035888   0.006013  -5.969 2.75e-09 ***
## z.diff.lag -0.189572   0.020255  -9.359  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2879000 on 2349 degrees of freedom
## Multiple R-squared:  0.05734,    Adjusted R-squared:  0.05654 
## F-statistic: 71.44 on 2 and 2349 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -5.9688 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.58 -1.95 -1.62
dt_s01_v2_xts %>% log() %>%diff() %>%  .[!is.na(.)] %>% ur.df() %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19728 -0.13975 -0.01605  0.12066  1.75762 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## z.lag.1    -1.28918    0.03112 -41.423  < 2e-16 ***
## z.diff.lag  0.11780    0.02049   5.749 1.02e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2656 on 2348 degrees of freedom
## Multiple R-squared:  0.5826, Adjusted R-squared:  0.5822 
## F-statistic:  1639 on 2 and 2348 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -41.4231 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.58 -1.95 -1.62
# Plot 
cbind("Main Data S01_V1"= log(dt_s01_v2_xts),
      "Seasonlly Diff"= (diff(log(dt_s01_v2_xts),na.action = na.pass ))) %>% autoplot()

cbind("Main Data S01_V2"= log(dt_s01_v2_xts))  %>% autoplot()

cbind("Main Data S01_V2"= diff(log(dt_s01_v2_xts)))  %>% autoplot(series="ASA")

autoplot((as.ts(log(dt_s01_v2_xts))) , series="Log Data") + 
  autolayer(diff(as.ts(log(dt_s01_v2_xts))),series="Diff Log")+
  scale_colour_manual(values=c("blue","red"),
             breaks=c("Log Data","Diff Log"))

#ACF Model 
dt_s01_v2_xts %>% ggtsdisplay(main="")

dt_s01_v2_xts %>% ggtsdisplay(main="",lag.max = 365)

diff(log(dt_s01_v2_xts)) %>% ggtsdisplay(main="")

diff(log(dt_s01_v2_xts),lag = 35) %>% ggtsdisplay(main="")

# The ACF plot suggests that after about 3 lags the data is mostly negatively autocorrelated around the zero mean.
# A negative ACF means that an above-average value makes a below-average value at the corresponding lag more likely, and vice versa (see "Interpreting a negative autocorrelation").
# In other words, a negative autocorrelation reverses the direction of the influence: if a particular value is above average, the next (or previous) value is more likely to be below average, and vice versa.
# http://www.pmean.com/09/NegativeAutocorrelation.html
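# Added illustration (a sketch, not from the original analysis): a strictly
# alternating series has a lag-1 autocorrelation close to -1.
acf(rep(c(1, -1), 50), plot = FALSE)$acf[2]  # roughly -0.99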

 # We will use ndiffs() to confirm the number of differences needed.

ndiffs(dt_s01_v2_xts)
## [1] 1
ndiffs(log(dt_s01_v2_xts))
## [1] 1
# Let's use auto.arima() to validate our model choice and fit it.
aa_fit_dt_s01_v2 <- auto.arima(dt_s01_v2_xts,seasonal=TRUE)
summary(aa_fit_dt_s01_v2)
## Series: dt_s01_v2_xts 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1     ma2
##       0.6594  -1.0199  0.0448
## s.e.  0.0300   0.0366  0.0329
## 
## sigma^2 estimated as 7.352e+12:  log likelihood=-38176.85
## AIC=76361.69   AICc=76361.71   BIC=76384.75
## 
## Training set error measures:
##                     ME    RMSE     MAE       MPE     MAPE      MASE
## Training set -50897.15 2709242 1658282 -7.900959 20.20669 0.9527611
##                       ACF1
## Training set -0.0002251844
aa_fit_dt_s01_v2_log <- auto.arima(log(dt_s01_v2_xts))
summary(aa_fit_dt_s01_v2_log)
## Series: log(dt_s01_v2_xts) 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.7196  -0.9763
## s.e.  0.0188   0.0076
## 
## sigma^2 estimated as 0.0644:  log likelihood=-111.65
## AIC=229.29   AICc=229.3   BIC=246.58
## 
## Training set error measures:
##                        ME      RMSE       MAE        MPE     MAPE      MASE
## Training set -0.004840398 0.2536064 0.1794515 -0.0548662 1.130844 0.9432066
##                     ACF1
## Training set -0.01213411
# The AIC of the log-transformed model is much lower, so we will use it (noting that AIC values are not strictly comparable between transformed and untransformed series; MAPE is also compared below).

# ACF plot of residuals 
checkresiduals(aa_fit_dt_s01_v2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,2)
## Q* = 12.324, df = 7, p-value = 0.0904
## 
## Model df: 3.   Total lags used: 10
checkresiduals(aa_fit_dt_s01_v2,lag = 35)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,2)
## Q* = 41.588, df = 32, p-value = 0.1195
## 
## Model df: 3.   Total lags used: 35
# The residuals look well distributed and are mostly close to zero.
# The p-value is greater than 5%, so we cannot reject the null hypothesis that the residual autocorrelations up to the tested lag are jointly zero, i.e. the residuals behave like white noise.

# loged data
checkresiduals(aa_fit_dt_s01_v2_log)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)
## Q* = 10.953, df = 8, p-value = 0.2044
## 
## Model df: 2.   Total lags used: 10
fc1<-forecast(aa_fit_dt_s01_v2,h=140) 
fc<-forecast(aa_fit_dt_s01_v2_log,h=140) 
summary(aa_fit_dt_s01_v2)
## Series: dt_s01_v2_xts 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1     ma2
##       0.6594  -1.0199  0.0448
## s.e.  0.0300   0.0366  0.0329
## 
## sigma^2 estimated as 7.352e+12:  log likelihood=-38176.85
## AIC=76361.69   AICc=76361.71   BIC=76384.75
## 
## Training set error measures:
##                     ME    RMSE     MAE       MPE     MAPE      MASE
## Training set -50897.15 2709242 1658282 -7.900959 20.20669 0.9527611
##                       ACF1
## Training set -0.0002251844
summary(aa_fit_dt_s01_v2_log)
## Series: log(dt_s01_v2_xts) 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.7196  -0.9763
## s.e.  0.0188   0.0076
## 
## sigma^2 estimated as 0.0644:  log likelihood=-111.65
## AIC=229.29   AICc=229.3   BIC=246.58
## 
## Training set error measures:
##                        ME      RMSE       MAE        MPE     MAPE      MASE
## Training set -0.004840398 0.2536064 0.1794515 -0.0548662 1.130844 0.9432066
##                     ACF1
## Training set -0.01213411
fc$mean<-exp(fc$mean)
fc$upper<-exp(fc$upper)
fc$lower<-exp(fc$lower)
fc$x<-exp(fc$x)


fc1 %>% autoplot()

fc%>% autoplot()

MLmetrics::MAPE(fc1$fitted,dt_s01_v2_xts)
## [1] 0.2020669
MLmetrics::MAPE(exp(fc$fitted),dt_s01_v2_xts)
## [1] 0.1812568
forVa02 <- exp(fc$mean)

# xlsx::write.xlsx(forVa02, file = "Predictive Analytics Project 1V2log.xlsx",  col.name = T, row.names = T, append = T)
# https://nwfsc-timeseries.github.io/atsa-labs/sec-boxjenkins-kpss.html
# https://www.machinelearningplus.com/time-series/kpss-test-for-stationarity/

# The test below gives a p-value less than 0.05, so we reject the null and conclude the data is not stationary around the trend line.
# But shouldn't the KPSS have failed to reject the null if the series were 'stationary around a deterministic trend'?
# 
# That is probably because we used the default option of the KPSS test.
# 
# By default, it tests for stationarity around a 'mean' only.
# tseries::kpss.test(diff(dt_s01_v1_xts), null = "Trend")

tseries::kpss.test(dt_s01_v1_xts)
## 
##  KPSS Test for Level Stationarity
## 
## data:  dt_s01_v1_xts
## KPSS Level = 23.217, Truncation lag parameter = 8, p-value = 0.01
# KPSS test allowing for a trend: the default null hypothesis assumes no trend, so we rerun the test with null = "Trend".

tseries::kpss.test(dt_s01_v1_xts, null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  dt_s01_v1_xts
## KPSS Trend = 2.8939, Truncation lag parameter = 8, p-value = 0.01
tseries::kpss.test(dt_s01_v1_xts, null = "Level")
## 
##  KPSS Test for Level Stationarity
## 
## data:  dt_s01_v1_xts
## KPSS Level = 23.217, Truncation lag parameter = 8, p-value = 0.01
# The p-value is below 0.05 for both the level and the trend version of the test, so stationarity is rejected in either form and the series needs differencing; after a first difference (below) the KPSS test no longer rejects, so the differenced series can be treated as stationary.
tseries::kpss.test(diff(dt_s01_v1_xts), null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  diff(dt_s01_v1_xts)
## KPSS Trend = 0.033528, Truncation lag parameter = 8, p-value = 0.1
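# Quick cross-check (sketch, not run here): forecast::ndiffs() wraps these unit-root
# tests and returns the number of differences needed for stationarity.
# ndiffs(dt_s01_v1_xts, test = "kpss")
# ndiffs(dt_s01_v1_xts, test = "adf")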

4 For Var01 (extra)

# For Var01
# Dickey-Fuller test before and after differencing
dt_s01_v1_xts %>% ur.df() %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9468 -0.1307 -0.0100  0.1275  4.8576 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## z.lag.1    0.0003055  0.0002005   1.523    0.128    
## z.diff.lag 0.1140899  0.0205036   5.564 2.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3978 on 2349 degrees of freedom
## Multiple R-squared:  0.01432,    Adjusted R-squared:  0.01348 
## F-statistic: 17.07 on 2 and 2349 DF,  p-value: 4.376e-08
## 
## 
## Value of test-statistic is: 1.5234 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.58 -1.95 -1.62
#https://stats.stackexchange.com/questions/24072/interpreting-rs-ur-df-dickey-fuller-unit-root-test-results
# The test statistic (1.5234) lies to the right of all three critical values (1%, 5%, 10%), so we fail to reject the null and should presume the series is a random walk, i.e. that a unit root is present. Here tau1 refers to the gamma = 0 hypothesis: z.lag.1 is the gamma term, the coefficient on the lagged level y(t-1), and its p-value of 0.128 means gamma is not statistically significant in this regression.
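# The tau statistic and critical values can also be pulled out programmatically:
# ur.df() returns an S4 object with @teststat and @cval slots (sketch, object name
# illustrative):
# adf_none <- ur.df(dt_s01_v1_xts)
# adf_none@teststat   # tau1 statistic reported above
# adf_none@cval       # 1% / 5% / 10% critical values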

dt_s01_v1_xts %>% ur.df(type = "drift") %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9387 -0.1323 -0.0127  0.1245  4.8627 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0211874  0.0304403   0.696    0.486    
## z.lag.1     -0.0001929  0.0007435  -0.259    0.795    
## z.diff.lag   0.1144296  0.0205117   5.579  2.7e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3978 on 2348 degrees of freedom
## Multiple R-squared:  0.01308,    Adjusted R-squared:  0.01224 
## F-statistic: 15.56 on 2 and 2348 DF,  p-value: 1.929e-07
## 
## 
## Value of test-statistic is: -0.2594 1.4023 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.43 -2.86 -2.57
## phi1  6.43  4.59  3.78
# (a0 is the constant, or drift, term.) The interpretation gets trickier here. tau2 is still the gamma = 0 null hypothesis; the first test statistic (-0.2594) is above all of the critical values, so we again fail to reject and presume a unit root (gamma = 0 for z.lag.1).
# The phi1 statistic tests the joint null hypothesis a0 = gamma = 0, i.e. that both terms are zero at the same time. Rejecting it would mean at least one of a0 or gamma is non-zero; failing to reject it means both gamma = 0 (a unit root is present) and a0 = 0 (no drift). Here phi1 = 1.4023 is below every critical value (3.78 at the 10% level), so we fail to reject the joint null: the series has a unit root and no significant drift term.

dt_s01_v1_xts %>% diff() %>% .[!is.na(.)] %>% ur.df(type = "drift") %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9371 -0.1321 -0.0127  0.1260  4.8534 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.013915   0.008217   1.693   0.0905 .  
## z.lag.1     -0.900617   0.027465 -32.791   <2e-16 ***
## z.diff.lag   0.016533   0.020634   0.801   0.4231    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3978 on 2347 degrees of freedom
## Multiple R-squared:  0.4432, Adjusted R-squared:  0.4427 
## F-statistic:   934 on 2 and 2347 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -32.7909 537.6205 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.43 -2.86 -2.57
## phi1  6.43  4.59  3.78
# After differencing, the tau2 statistic (-32.79) is far below all critical values and phi1 (537.6) is far above its critical values, so the unit-root null is rejected: the differenced series is stationary, and a single difference is sufficient.
log(dt_s01_v2_xts) %>% ggtsdisplay(main="")

dt_s01_v2_xts %>% ur.df() %>% summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -15966892   -864079    107490   1125529  40490634 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## z.lag.1    -0.035888   0.006013  -5.969 2.75e-09 ***
## z.diff.lag -0.189572   0.020255  -9.359  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2879000 on 2349 degrees of freedom
## Multiple R-squared:  0.05734,    Adjusted R-squared:  0.05654 
## F-statistic: 71.44 on 2 and 2349 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -5.9688 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.58 -1.95 -1.62
dt_s01_v2_xts%>% diff() %>% ggtsdisplay(main="")

log(dt_s01_v2_xts)%>% diff() %>% ggtsdisplay(main="")

auto.arima(dt_s01_v2_xts)
## Series: dt_s01_v2_xts 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1     ma2
##       0.6594  -1.0199  0.0448
## s.e.  0.0300   0.0366  0.0329
## 
## sigma^2 estimated as 7.352e+12:  log likelihood=-38176.85
## AIC=76361.69   AICc=76361.71   BIC=76384.75
auto.arima(log(dt_s01_v2_xts))
## Series: log(dt_s01_v2_xts) 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.7196  -0.9763
## s.e.  0.0188   0.0076
## 
## sigma^2 estimated as 0.0644:  log likelihood=-111.65
## AIC=229.29   AICc=229.3   BIC=246.58
autoplot(ts(dt_s01_v2_xts,frequency=365), series="Data") +
  autolayer(trendcycle(decompose(ts(dt_s01_v2_xts,frequency=365))), series="Trend") +
  autolayer(seasadj(decompose(ts(dt_s01_v2_xts,frequency=365))), series="Seasonally Adjusted") +
  xlab("Year") + ylab("Var02") +
  ggtitle("Group S01 Var02") +
  scale_colour_manual(values=c("orange","blue","red"),
             breaks=c("Data","Seasonally Adjusted","Trend"))
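# Alternative decomposition sketch (assuming the series has no missing values):
# mstl() from the forecast package gives a robust trend/seasonal split and plots
# directly with autoplot().
# mstl(ts(as.numeric(dt_s01_v2_xts), frequency = 365)) %>% autoplot()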

Rajwant Mishra

June 20, 2020