## Warning: package 'dplyr' was built under R version 4.1.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v stringr 1.4.0
## v tidyr 1.1.4 v forcats 0.5.1
## v readr 2.0.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Warning: package 'forecast' was built under R version 4.1.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Warning: package 'tseries' was built under R version 4.1.2
df_train <- readr::read_csv('D:\\VAIBHAV\\HOMEWORK\\Time Series\\ASSIGNMENT\\sales_train.csv')
## Rows: 2935849 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): date
## dbl (5): date_block_num, shop_id, item_id, item_price, item_cnt_day
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(df_train)
## spec_tbl_df [2,935,849 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : chr [1:2935849] "02.01.2013" "03.01.2013" "05.01.2013" "06.01.2013" ...
## $ date_block_num: num [1:2935849] 0 0 0 0 0 0 0 0 0 0 ...
## $ shop_id : num [1:2935849] 59 25 25 25 25 25 25 25 25 25 ...
## $ item_id : num [1:2935849] 22154 2552 2552 2554 2555 ...
## $ item_price : num [1:2935849] 999 899 899 1709 1099 ...
## $ item_cnt_day : num [1:2935849] 1 1 -1 1 1 1 1 1 1 3 ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_character(),
## .. date_block_num = col_double(),
## .. shop_id = col_double(),
## .. item_id = col_double(),
## .. item_price = col_double(),
## .. item_cnt_day = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
df_train$date <- dmy(df_train$date)
str(df_train)
## spec_tbl_df [2,935,849 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : Date[1:2935849], format: "2013-01-02" "2013-01-03" ...
## $ date_block_num: num [1:2935849] 0 0 0 0 0 0 0 0 0 0 ...
## $ shop_id : num [1:2935849] 59 25 25 25 25 25 25 25 25 25 ...
## $ item_id : num [1:2935849] 22154 2552 2552 2554 2555 ...
## $ item_price : num [1:2935849] 999 899 899 1709 1099 ...
## $ item_cnt_day : num [1:2935849] 1 1 -1 1 1 1 1 1 1 3 ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_character(),
## .. date_block_num = col_double(),
## .. shop_id = col_double(),
## .. item_id = col_double(),
## .. item_price = col_double(),
## .. item_cnt_day = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
attach(df_train)
unique(date_block_num)
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [26] 25 26 27 28 29 30 31 32 33
lapply(df_train,function(x) { length(which(is.na(df_train)))})
## $date
## [1] 0
##
## $date_block_num
## [1] 0
##
## $shop_id
## [1] 0
##
## $item_id
## [1] 0
##
## $item_price
## [1] 0
##
## $item_cnt_day
## [1] 0
df_train <-df_train %>%
group_by(date) %>%
mutate(total_sale = sum(item_price*item_cnt_day)/1000000) %>%
ungroup()
par(mfrow=c(2,1))
df_train %>% ggplot()+
geom_line(aes(date,total_sale))+
theme_bw()+
geom_smooth(aes(date,total_sale),method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggtitle("TOATAL DAILY SALES")
## $title
## [1] "TOATAL DAILY SALES"
##
## attr(,"class")
## [1] "labels"
df_train %>% ggplot(aes(colour="Blue"))+
geom_line(aes(date,cumsum(total_sale)/1000000),show.legend = FALSE)+
theme_bw()+
ggtitle("TOTAL CUMMULATIVE SALES BY DATE")+
ylab("Cummulative Sales")+
xlab("DATE")
df_train <- subset(df_train,select = c(date,total_sale))
df_train <- df_train %>% group_by(date) %>% filter(!duplicated(date))
dim(df_train)
## [1] 1034 2
df_train %>% ggplot()+
geom_line(aes(date,total_sale))
df_train %>% ggplot()+
geom_line(aes(date,total_sale))+
theme_bw()+
geom_smooth(aes(date,total_sale),method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggtitle("TOATAL DAILY SALES")
## $title
## [1] "TOATAL DAILY SALES"
##
## attr(,"class")
## [1] "labels"
par(mfrow=c(1,2))
acf(df_train$total_sale,lag.max = 40)
Pacf(df_train$total_sale, lag.max = 40)
### From above plots we could see that there is strong autocorrelation in the data ### Now we will move ahead make the series Variable stationary by log transformation and box cox transformation.
train_box_cox <- df_train %>%
mutate(sale_box_cox = forecast::BoxCox(total_sale,lambda="auto"))
train_box_cox %>% ggplot()+
geom_line(aes(date,sale_box_cox))+
theme_bw()+
ggtitle("SALES PER DAY-BOX-COX TRANSFORMED")
#adf.test(train_box_cox$sale_box_cox)
lambda <- BoxCox.lambda(df_train$total_sale)
lambda
## [1] -0.8635756
## [1] 16256
## [1] 20000
### Lets check the residuals of this by manually putting values.
model_auto = arima(train_box_cox$sale_box_cox,order=c(5,1,0))
forecast::checkresiduals(model_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,1,0)
## Q* = 29.472, df = 5, p-value = 1.873e-05
##
## Model df: 5. Total lags used: 10
### Lets Compare this with original ACF and PACF
par(mfrow=c(1,2))
acf(train_box_cox$sale_box_cox,lag.max = 20)
pacf(train_box_cox$sale_box_cox,lag.max = 20)
df_train$Week <- as.Date(cut(df_train$date, "week"))
aggregate(total_sale ~ Week, df_train, sum)
## Week total_sale
## 1 2012-12-31 26.43543
## 2 2013-01-07 20.80383
## 3 2013-01-14 19.40747
## 4 2013-01-21 17.73457
## 5 2013-01-28 17.14022
## 6 2013-02-04 20.76675
## 7 2013-02-11 22.41082
## 8 2013-02-18 29.40029
## 9 2013-02-25 18.63159
## 10 2013-03-04 26.49597
## 11 2013-03-11 25.65420
## 12 2013-03-18 21.02726
## 13 2013-03-25 21.63710
## 14 2013-04-01 16.97071
## 15 2013-04-08 16.84502
## 16 2013-04-15 16.01968
## 17 2013-04-22 15.16685
## 18 2013-04-29 15.56394
## 19 2013-05-06 13.02825
## 20 2013-05-13 14.83875
## 21 2013-05-20 15.78017
## 22 2013-05-27 15.66490
## 23 2013-06-03 16.86827
## 24 2013-06-10 21.99448
## 25 2013-06-17 17.95413
## 26 2013-06-24 17.07484
## 27 2013-07-01 16.81119
## 28 2013-07-08 15.73135
## 29 2013-07-15 15.54233
## 30 2013-07-22 15.33993
## 31 2013-07-29 15.41860
## 32 2013-08-05 14.57381
## 33 2013-08-12 15.47869
## 34 2013-08-19 17.24375
## 35 2013-08-26 20.70857
## 36 2013-09-02 22.79903
## 37 2013-09-09 18.19900
## 38 2013-09-16 42.97683
## 39 2013-09-23 29.31174
## 40 2013-09-30 29.01582
## 41 2013-10-07 25.07020
## 42 2013-10-14 20.17567
## 43 2013-10-21 19.22806
## 44 2013-10-28 30.53934
## 45 2013-11-04 25.75330
## 46 2013-11-11 19.34698
## 47 2013-11-18 25.88803
## 48 2013-11-25 58.94111
## 49 2013-12-02 30.35415
## 50 2013-12-09 30.47249
## 51 2013-12-16 48.95757
## 52 2013-12-23 65.54560
## 53 2013-12-30 53.07747
## 54 2014-01-06 24.78459
## 55 2014-01-13 18.57356
## 56 2014-01-20 17.90493
## 57 2014-01-27 18.26185
## 58 2014-02-03 29.89935
## 59 2014-02-10 23.81434
## 60 2014-02-17 31.84550
## 61 2014-02-24 22.87792
## 62 2014-03-03 22.40873
## 63 2014-03-10 18.87321
## 64 2014-03-17 33.17772
## 65 2014-03-24 23.76429
## 66 2014-03-31 18.45659
## 67 2014-04-07 17.88451
## 68 2014-04-14 21.42710
## 69 2014-04-21 18.60999
## 70 2014-04-28 16.76306
## 71 2014-05-05 14.55092
## 72 2014-05-12 14.43177
## 73 2014-05-19 16.19569
## 74 2014-05-26 39.71713
## 75 2014-06-02 18.63094
## 76 2014-06-09 20.27065
## 77 2014-06-16 18.53425
## 78 2014-06-23 19.03261
## 79 2014-06-30 17.20969
## 80 2014-07-07 17.34606
## 81 2014-07-14 15.89059
## 82 2014-07-21 16.03213
## 83 2014-07-28 21.08208
## 84 2014-08-04 18.34613
## 85 2014-08-11 19.33446
## 86 2014-08-18 22.21304
## 87 2014-08-25 21.02987
## 88 2014-09-01 19.41437
## 89 2014-09-08 26.43745
## 90 2014-09-15 20.82242
## 91 2014-09-22 40.43093
## 92 2014-09-29 40.73875
## 93 2014-10-06 26.24087
## 94 2014-10-13 23.00585
## 95 2014-10-20 18.73650
## 96 2014-10-27 20.07889
## 97 2014-11-03 26.29730
## 98 2014-11-10 34.13653
## 99 2014-11-17 52.47483
## 100 2014-11-24 30.47203
## 101 2014-12-01 32.07337
## 102 2014-12-08 33.15333
## 103 2014-12-15 58.01242
## 104 2014-12-22 65.66092
## 105 2014-12-29 70.27005
## 106 2015-01-05 33.80135
## 107 2015-01-12 21.62727
## 108 2015-01-19 19.88812
## 109 2015-01-26 19.96514
## 110 2015-02-02 16.49110
## 111 2015-02-09 19.34865
## 112 2015-02-16 28.82654
## 113 2015-02-23 21.60438
## 114 2015-03-02 18.76137
## 115 2015-03-09 17.49452
## 116 2015-03-16 17.06582
## 117 2015-03-23 25.72285
## 118 2015-03-30 16.54408
## 119 2015-04-06 14.09163
## 120 2015-04-13 39.18419
## 121 2015-04-20 17.70593
## 122 2015-04-27 14.12287
## 123 2015-05-04 11.72068
## 124 2015-05-11 12.12435
## 125 2015-05-18 41.85571
## 126 2015-05-25 15.73664
## 127 2015-06-01 14.95426
## 128 2015-06-08 15.25172
## 129 2015-06-15 14.51645
## 130 2015-06-22 16.58789
## 131 2015-06-29 13.93846
## 132 2015-07-06 15.59413
## 133 2015-07-13 13.98616
## 134 2015-07-20 12.17754
## 135 2015-07-27 11.23117
## 136 2015-08-03 11.42219
## 137 2015-08-10 11.70116
## 138 2015-08-17 14.09194
## 139 2015-08-24 18.84793
## 140 2015-08-31 20.08190
## 141 2015-09-07 16.93140
## 142 2015-09-14 17.12227
## 143 2015-09-21 25.34495
## 144 2015-09-28 32.81623
## 145 2015-10-05 15.79077
## 146 2015-10-12 15.26764
## 147 2015-10-19 22.56859
## 148 2015-10-26 13.57028
df_train_1 <- df_train %>% group_by(Week) %>% summarise(sale_week=sum(total_sale))
autoplot(ts(df_train_1$sale_week))+ylab("TOTAL SALE (week)")+theme_bw()
head(df_train_1)
## # A tibble: 6 x 2
## Week sale_week
## <date> <dbl>
## 1 2012-12-31 26.4
## 2 2013-01-07 20.8
## 3 2013-01-14 19.4
## 4 2013-01-21 17.7
## 5 2013-01-28 17.1
## 6 2013-02-04 20.8
## Part B-
#rm(train_diff)
train_box_cox$Week <- as.Date(cut(train_box_cox$date, "week"))
aggregate(sale_box_cox ~ Week, train_box_cox, sum)
## Week sale_box_cox
## 1 2012-12-31 20.435426
## 2 2013-01-07 13.803835
## 3 2013-01-14 12.407470
## 4 2013-01-21 10.734573
## 5 2013-01-28 10.140218
## 6 2013-02-04 13.766748
## 7 2013-02-11 15.410815
## 8 2013-02-18 22.400294
## 9 2013-02-25 11.631587
## 10 2013-03-04 19.495966
## 11 2013-03-11 18.654195
## 12 2013-03-18 14.027257
## 13 2013-03-25 14.637100
## 14 2013-04-01 9.970707
## 15 2013-04-08 9.845017
## 16 2013-04-15 9.019684
## 17 2013-04-22 8.166846
## 18 2013-04-29 8.563944
## 19 2013-05-06 6.028252
## 20 2013-05-13 7.838752
## 21 2013-05-20 8.780174
## 22 2013-05-27 8.664896
## 23 2013-06-03 9.868274
## 24 2013-06-10 14.994482
## 25 2013-06-17 10.954131
## 26 2013-06-24 10.074844
## 27 2013-07-01 9.811191
## 28 2013-07-08 8.731352
## 29 2013-07-15 8.542329
## 30 2013-07-22 8.339934
## 31 2013-07-29 8.418597
## 32 2013-08-05 7.573813
## 33 2013-08-12 8.478688
## 34 2013-08-19 10.243753
## 35 2013-08-26 13.708573
## 36 2013-09-02 15.799025
## 37 2013-09-09 11.199003
## 38 2013-09-16 35.976832
## 39 2013-09-23 22.311738
## 40 2013-09-30 22.015823
## 41 2013-10-07 18.070195
## 42 2013-10-14 13.175669
## 43 2013-10-21 12.228058
## 44 2013-10-28 23.539339
## 45 2013-11-04 18.753302
## 46 2013-11-11 12.346978
## 47 2013-11-18 18.888026
## 48 2013-11-25 51.941111
## 49 2013-12-02 23.354148
## 50 2013-12-09 23.472486
## 51 2013-12-16 41.957566
## 52 2013-12-23 58.545603
## 53 2013-12-30 46.077474
## 54 2014-01-06 17.784588
## 55 2014-01-13 11.573564
## 56 2014-01-20 10.904925
## 57 2014-01-27 11.261852
## 58 2014-02-03 22.899346
## 59 2014-02-10 16.814337
## 60 2014-02-17 24.845498
## 61 2014-02-24 15.877917
## 62 2014-03-03 15.408726
## 63 2014-03-10 11.873211
## 64 2014-03-17 26.177724
## 65 2014-03-24 16.764294
## 66 2014-03-31 11.456592
## 67 2014-04-07 10.884507
## 68 2014-04-14 14.427103
## 69 2014-04-21 11.609987
## 70 2014-04-28 9.763055
## 71 2014-05-05 7.550921
## 72 2014-05-12 7.431767
## 73 2014-05-19 9.195685
## 74 2014-05-26 32.717125
## 75 2014-06-02 11.630937
## 76 2014-06-09 13.270650
## 77 2014-06-16 11.534253
## 78 2014-06-23 12.032608
## 79 2014-06-30 10.209688
## 80 2014-07-07 10.346058
## 81 2014-07-14 8.890585
## 82 2014-07-21 9.032135
## 83 2014-07-28 14.082085
## 84 2014-08-04 11.346134
## 85 2014-08-11 12.334458
## 86 2014-08-18 15.213042
## 87 2014-08-25 14.029874
## 88 2014-09-01 12.414367
## 89 2014-09-08 19.437454
## 90 2014-09-15 13.822417
## 91 2014-09-22 33.430928
## 92 2014-09-29 33.738751
## 93 2014-10-06 19.240872
## 94 2014-10-13 16.005848
## 95 2014-10-20 11.736497
## 96 2014-10-27 13.078887
## 97 2014-11-03 19.297304
## 98 2014-11-10 27.136530
## 99 2014-11-17 45.474830
## 100 2014-11-24 23.472030
## 101 2014-12-01 25.073367
## 102 2014-12-08 26.153334
## 103 2014-12-15 51.012423
## 104 2014-12-22 58.660918
## 105 2014-12-29 63.270050
## 106 2015-01-05 26.801351
## 107 2015-01-12 14.627269
## 108 2015-01-19 12.888115
## 109 2015-01-26 12.965139
## 110 2015-02-02 9.491104
## 111 2015-02-09 12.348652
## 112 2015-02-16 21.826537
## 113 2015-02-23 14.604377
## 114 2015-03-02 11.761367
## 115 2015-03-09 10.494525
## 116 2015-03-16 10.065824
## 117 2015-03-23 18.722854
## 118 2015-03-30 9.544076
## 119 2015-04-06 7.091630
## 120 2015-04-13 32.184190
## 121 2015-04-20 10.705932
## 122 2015-04-27 7.122867
## 123 2015-05-04 4.720676
## 124 2015-05-11 5.124349
## 125 2015-05-18 34.855711
## 126 2015-05-25 8.736635
## 127 2015-06-01 7.954260
## 128 2015-06-08 8.251721
## 129 2015-06-15 7.516448
## 130 2015-06-22 9.587887
## 131 2015-06-29 6.938458
## 132 2015-07-06 8.594129
## 133 2015-07-13 6.986156
## 134 2015-07-20 5.177542
## 135 2015-07-27 4.231170
## 136 2015-08-03 4.422195
## 137 2015-08-10 4.701165
## 138 2015-08-17 7.091940
## 139 2015-08-24 11.847935
## 140 2015-08-31 13.081896
## 141 2015-09-07 9.931395
## 142 2015-09-14 10.122267
## 143 2015-09-21 18.344954
## 144 2015-09-28 25.816232
## 145 2015-10-05 8.790767
## 146 2015-10-12 8.267644
## 147 2015-10-19 15.568590
## 148 2015-10-26 7.570279
train_main <- train_box_cox %>% group_by(Week) %>% summarise(week_bx=sum(sale_box_cox))
autoplot(ts(train_main$week_bx))+ylab("TOTAL SALE (BOX-COX)")+theme_bw()
par(mfrow=c(1,2))
acf(train_main$week_bx,lag.max = 60)
pacf(train_main$week_bx,lag.max = 60)
### The ACF plot shows the seasonality involved in the series and PACF pattern looks like MA process. ### To clear this, we will take first difference of the data
train_main <- train_main %>%
mutate(week_bx_diff = week_bx - lag(week_bx))
head(train_main)
## # A tibble: 6 x 3
## Week week_bx week_bx_diff
## <date> <dbl> <dbl>
## 1 2012-12-31 20.4 NA
## 2 2013-01-07 13.8 -6.63
## 3 2013-01-14 12.4 -1.40
## 4 2013-01-21 10.7 -1.67
## 5 2013-01-28 10.1 -0.594
## 6 2013-02-04 13.8 3.63
autoplot(ts(train_main$week_bx_diff))
train_main_1 <- train_main[complete.cases(train_main),]
adf.test(train_main_1$week_bx_diff)
## Warning in adf.test(train_main_1$week_bx_diff): p-value smaller than printed p-
## value
##
## Augmented Dickey-Fuller Test
##
## data: train_main_1$week_bx_diff
## Dickey-Fuller = -6.9122, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(train_main_1$week_bx_diff)
## Warning in kpss.test(train_main_1$week_bx_diff): p-value greater than printed p-
## value
##
## KPSS Test for Level Stationarity
##
## data: train_main_1$week_bx_diff
## KPSS Level = 0.025498, Truncation lag parameter = 4, p-value = 0.1
### Both test indicates that the series is stationary ### Checking again the ACF/PACF plots
par(mfrow=c(1,2))
acf(train_main_1$week_bx_diff)
pacf(train_main_1$week_bx_diff)
### PACF AND ACF is much better now. ### The series looks an AR series.The plot indicates that it could be an 3 AR series. i.e ARIMA(3,0,0)
model_1 = arima(train_main_1$week_bx,order = c(3,1,0))
summary(model_1)
##
## Call:
## arima(x = train_main_1$week_bx, order = c(3, 1, 0))
##
## Coefficients:
## ar1 ar2 ar3
## -0.2930 -0.2688 -0.2357
## s.e. 0.0803 0.0805 0.0798
##
## sigma^2 estimated as 83.63: log likelihood = -530.44, aic = 1068.88
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.05339097 9.113623 6.101429 -15.02176 37.68191 1.012667
## ACF1
## Training set -0.03359444
forecast::checkresiduals(model_1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,0)
## Q* = 9.7582, df = 7, p-value = 0.2027
##
## Model df: 3. Total lags used: 10
AIC(
arima(train_main_1$week_bx,order=c(3,1,0)),
arima(train_main_1$week_bx,order=c(2,1,0)),
arima(train_main_1$week_bx,order=c(2,1,1)),
arima(train_main_1$week_bx,order=c(2,2,1)),
arima(train_main_1$week_bx,order=c(2,1,2)),
arima(train_main_1$week_bx,order=c(3,1,2)),
arima(train_main_1$week_bx,order=c(3,2,1)),
arima(train_main_1$week_bx,order=c(3,2,2)),
arima(train_main_1$week_bx,order=c(3,2,3)),
arima(train_main_1$week_bx,order=c(4,2,2)),
arima(train_main_1$week_bx,order=c(4,2,3))
)
## Warning in AIC.default(arima(train_main_1$week_bx, order = c(3, 1, 0)), : models
## are not all fitted to the same number of observations
## df AIC
## arima(train_main_1$week_bx, order = c(3, 1, 0)) 4 1068.884
## arima(train_main_1$week_bx, order = c(2, 1, 0)) 3 1075.343
## arima(train_main_1$week_bx, order = c(2, 1, 1)) 4 1061.705
## arima(train_main_1$week_bx, order = c(2, 2, 1)) 4 1076.732
## arima(train_main_1$week_bx, order = c(2, 1, 2)) 5 1063.722
## arima(train_main_1$week_bx, order = c(3, 1, 2)) 6 1064.910
## arima(train_main_1$week_bx, order = c(3, 2, 1)) 5 1070.748
## arima(train_main_1$week_bx, order = c(3, 2, 2)) 6 1067.581
## arima(train_main_1$week_bx, order = c(3, 2, 3)) 7 1068.289
## arima(train_main_1$week_bx, order = c(4, 2, 2)) 7 1068.113
## arima(train_main_1$week_bx, order = c(4, 2, 3)) 8 1070.873
BIC(
arima(train_main_1$week_bx,order=c(3,1,0)),
arima(train_main_1$week_bx,order=c(2,1,0)),
arima(train_main_1$week_bx,order=c(2,1,1)),
arima(train_main_1$week_bx,order=c(2,2,1)),
arima(train_main_1$week_bx,order=c(2,1,2)),
arima(train_main_1$week_bx,order=c(3,1,2)),
arima(train_main_1$week_bx,order=c(3,2,1)),
arima(train_main_1$week_bx,order=c(3,2,2)),
arima(train_main_1$week_bx,order=c(3,2,3)),
arima(train_main_1$week_bx,order=c(4,2,2)),
arima(train_main_1$week_bx,order=c(4,2,3))
)
## Warning in BIC.default(arima(train_main_1$week_bx, order = c(3, 1, 0)), : models
## are not all fitted to the same number of observations
## df BIC
## arima(train_main_1$week_bx, order = c(3, 1, 0)) 4 1080.818
## arima(train_main_1$week_bx, order = c(2, 1, 0)) 3 1084.294
## arima(train_main_1$week_bx, order = c(2, 1, 1)) 4 1073.639
## arima(train_main_1$week_bx, order = c(2, 2, 1)) 4 1088.639
## arima(train_main_1$week_bx, order = c(2, 1, 2)) 5 1078.640
## arima(train_main_1$week_bx, order = c(3, 1, 2)) 6 1082.812
## arima(train_main_1$week_bx, order = c(3, 2, 1)) 5 1085.632
## arima(train_main_1$week_bx, order = c(3, 2, 2)) 6 1085.441
## arima(train_main_1$week_bx, order = c(3, 2, 3)) 7 1089.126
## arima(train_main_1$week_bx, order = c(4, 2, 2)) 7 1088.950
## arima(train_main_1$week_bx, order = c(4, 2, 3)) 8 1094.687
best_fit <- arima(train_main_1$week_bx,order=c(2,1,1))
forecast::checkresiduals(best_fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,1)
## Q* = 6.4025, df = 7, p-value = 0.4936
##
## Model df: 3. Total lags used: 10
### Lets see what auto.arima gives us.
auto.arima(train_main$week_bx,stationary=FALSE,
seasonal= TRUE,stepwise=FALSE,approximation=FALSE,max.D = 55,max.order = 60)
## Series: train_main$week_bx
## ARIMA(1,0,0) with non-zero mean
##
## Coefficients:
## ar1 mean
## 0.6003 15.9406
## s.e. 0.0652 1.7867
##
## sigma^2 = 78.08: log likelihood = -531.69
## AIC=1069.39 AICc=1069.55 BIC=1078.38
auto_fit <- arima(train_main_1$week_bx,order=c(1,0,0))
forecast::checkresiduals(auto_fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 6.4001, df = 8, p-value = 0.6025
##
## Model df: 2. Total lags used: 10
Box.test(best_fit$residuals,type = "Ljung-Box",lag=1)
##
## Box-Ljung test
##
## data: best_fit$residuals
## X-squared = 0.0028112, df = 1, p-value = 0.9577
Box.test(auto_fit$residuals,type = "Ljung-Box",lag=1)
##
## Box-Ljung test
##
## data: auto_fit$residuals
## X-squared = 0.00024118, df = 1, p-value = 0.9876
Box.test(best_fit$residuals,type = "Ljung-Box",lag=3)
##
## Box-Ljung test
##
## data: best_fit$residuals
## X-squared = 0.65804, df = 3, p-value = 0.883
Box.test(auto_fit$residuals,type = "Ljung-Box",lag=3)
##
## Box-Ljung test
##
## data: auto_fit$residuals
## X-squared = 0.33179, df = 3, p-value = 0.9539
Box.test(best_fit$residuals,type = "Ljung-Box",lag=10)
##
## Box-Ljung test
##
## data: best_fit$residuals
## X-squared = 6.4025, df = 10, p-value = 0.7804
Box.test(auto_fit$residuals,type = "Ljung-Box",lag=10)
##
## Box-Ljung test
##
## data: auto_fit$residuals
## X-squared = 6.4001, df = 10, p-value = 0.7806
### IN SAMPLE RMSE
resid_1 = best_fit$residuals
resid_2 = auto_fit$residuals
pred_1 = train_main_1$week_bx-resid_1
pred_2 = train_main_1$week_bx-resid_2
ggplot()+
geom_line(aes(train_main_1$Week,pred_1),color='red',alpha=0.7)+
geom_line(aes(train_main_1$Week,pred_2),color='blue',alpha=0.7)+
geom_line(aes(train_main_1$Week,train_main_1$week_bx))
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
### dbt- after putting exmp1 -> going to infinite ### We now will calculate MAE for fitted values
RMSE_best = mean(abs((pred_1) - (train_main_1$week_bx)))
RMSE_best
## [1] 5.852256
RMSE_auto = mean(abs((pred_2) - (train_main_1$week_bx)))
RMSE_auto
## [1] 5.994088
best_fit %>%
forecast(h=5) %>%
autoplot()
auto_fit %>%
forecast(h=5) %>%
autoplot()
{#r, include=FALSE} head(df_train_1)