## Warning: package 'dplyr' was built under R version 4.1.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v stringr 1.4.0
## v tidyr   1.1.4     v forcats 0.5.1
## v readr   2.0.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Warning: package 'forecast' was built under R version 4.1.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Warning: package 'tseries' was built under R version 4.1.2
  df_train <- readr::read_csv('D:\\VAIBHAV\\HOMEWORK\\Time Series\\ASSIGNMENT\\sales_train.csv')
## Rows: 2935849 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): date
## dbl (5): date_block_num, shop_id, item_id, item_price, item_cnt_day
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.


str(df_train)
## spec_tbl_df [2,935,849 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date          : chr [1:2935849] "02.01.2013" "03.01.2013" "05.01.2013" "06.01.2013" ...
##  $ date_block_num: num [1:2935849] 0 0 0 0 0 0 0 0 0 0 ...
##  $ shop_id       : num [1:2935849] 59 25 25 25 25 25 25 25 25 25 ...
##  $ item_id       : num [1:2935849] 22154 2552 2552 2554 2555 ...
##  $ item_price    : num [1:2935849] 999 899 899 1709 1099 ...
##  $ item_cnt_day  : num [1:2935849] 1 1 -1 1 1 1 1 1 1 3 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date = col_character(),
##   ..   date_block_num = col_double(),
##   ..   shop_id = col_double(),
##   ..   item_id = col_double(),
##   ..   item_price = col_double(),
##   ..   item_cnt_day = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
df_train$date <- dmy(df_train$date)

str(df_train)
## spec_tbl_df [2,935,849 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date          : Date[1:2935849], format: "2013-01-02" "2013-01-03" ...
##  $ date_block_num: num [1:2935849] 0 0 0 0 0 0 0 0 0 0 ...
##  $ shop_id       : num [1:2935849] 59 25 25 25 25 25 25 25 25 25 ...
##  $ item_id       : num [1:2935849] 22154 2552 2552 2554 2555 ...
##  $ item_price    : num [1:2935849] 999 899 899 1709 1099 ...
##  $ item_cnt_day  : num [1:2935849] 1 1 -1 1 1 1 1 1 1 3 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date = col_character(),
##   ..   date_block_num = col_double(),
##   ..   shop_id = col_double(),
##   ..   item_id = col_double(),
##   ..   item_price = col_double(),
##   ..   item_cnt_day = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>


attach(df_train)

unique(date_block_num)
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [26] 25 26 27 28 29 30 31 32 33
lapply(df_train,function(x) { length(which(is.na(df_train)))})
## $date
## [1] 0
## 
## $date_block_num
## [1] 0
## 
## $shop_id
## [1] 0
## 
## $item_id
## [1] 0
## 
## $item_price
## [1] 0
## 
## $item_cnt_day
## [1] 0


df_train <-df_train %>% 
           group_by(date) %>% 
               mutate(total_sale = sum(item_price*item_cnt_day)/1000000) %>% 
                 ungroup()
par(mfrow=c(2,1))

df_train %>% ggplot()+
             geom_line(aes(date,total_sale))+
              theme_bw()+
              geom_smooth(aes(date,total_sale),method = "lm")
## `geom_smooth()` using formula 'y ~ x'

              ggtitle("TOATAL DAILY SALES")
## $title
## [1] "TOATAL DAILY SALES"
## 
## attr(,"class")
## [1] "labels"
df_train %>% ggplot(aes(colour="Blue"))+
             geom_line(aes(date,cumsum(total_sale)/1000000),show.legend = FALSE)+
             theme_bw()+
             ggtitle("TOTAL CUMMULATIVE SALES BY DATE")+
             ylab("Cummulative Sales")+
             xlab("DATE")


df_train <- subset(df_train,select = c(date,total_sale))

df_train <- df_train %>% group_by(date) %>% filter(!duplicated(date))

dim(df_train)
## [1] 1034    2
df_train %>% ggplot()+
             geom_line(aes(date,total_sale))



Part 2

The series appears to be non stationary in terms of both variance and mean.

df_train %>% ggplot()+
             geom_line(aes(date,total_sale))+
              theme_bw()+
              geom_smooth(aes(date,total_sale),method = "lm")
## `geom_smooth()` using formula 'y ~ x'

              ggtitle("TOATAL DAILY SALES")
## $title
## [1] "TOATAL DAILY SALES"
## 
## attr(,"class")
## [1] "labels"


Lets see if we have any auto-correlation in the series to move ahead

par(mfrow=c(1,2))
acf(df_train$total_sale,lag.max = 40)
Pacf(df_train$total_sale, lag.max = 40)

### From above plots we could see that there is strong autocorrelation in the data
### Now we will move ahead make the series Variable stationary by log transformation and box cox transformation.


train_box_cox <- df_train %>% 
                 mutate(sale_box_cox = forecast::BoxCox(total_sale,lambda="auto"))

train_box_cox %>% ggplot()+
                  geom_line(aes(date,sale_box_cox))+
                  theme_bw()+
                  ggtitle("SALES PER DAY-BOX-COX TRANSFORMED")

The box-cox transformed series looks better, the variance is now uniform.


Lets find the value of lambda at which series was transformed.-0.6801812

#adf.test(train_box_cox$sale_box_cox)
lambda <- BoxCox.lambda(df_train$total_sale)

lambda
## [1] -0.8635756


## [1] 16256
## [1] 20000

Due to heavy data, the system crashed while performing adf test for stationary.

Lets figure out if we can predict from auto.arima, as it looks like there is small trend form line drawn, and a strong seasonality is involved.

The system could not perform auto.arima,thus we decide to convert this into weekly data and perform further steps.

Just to check on daily data, we run auto.arima with appoximations=True.

We got the output as 5,1,0.


### Lets check the residuals of this by manually putting values.

model_auto = arima(train_box_cox$sale_box_cox,order=c(5,1,0))

forecast::checkresiduals(model_auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,1,0)
## Q* = 29.472, df = 5, p-value = 1.873e-05
## 
## Model df: 5.   Total lags used: 10


### Lets Compare this with original ACF and PACF

par(mfrow=c(1,2))
acf(train_box_cox$sale_box_cox,lag.max = 20)
pacf(train_box_cox$sale_box_cox,lag.max = 20)

df_train$Week <- as.Date(cut(df_train$date, "week"))
aggregate(total_sale ~ Week, df_train, sum)
##           Week total_sale
## 1   2012-12-31   26.43543
## 2   2013-01-07   20.80383
## 3   2013-01-14   19.40747
## 4   2013-01-21   17.73457
## 5   2013-01-28   17.14022
## 6   2013-02-04   20.76675
## 7   2013-02-11   22.41082
## 8   2013-02-18   29.40029
## 9   2013-02-25   18.63159
## 10  2013-03-04   26.49597
## 11  2013-03-11   25.65420
## 12  2013-03-18   21.02726
## 13  2013-03-25   21.63710
## 14  2013-04-01   16.97071
## 15  2013-04-08   16.84502
## 16  2013-04-15   16.01968
## 17  2013-04-22   15.16685
## 18  2013-04-29   15.56394
## 19  2013-05-06   13.02825
## 20  2013-05-13   14.83875
## 21  2013-05-20   15.78017
## 22  2013-05-27   15.66490
## 23  2013-06-03   16.86827
## 24  2013-06-10   21.99448
## 25  2013-06-17   17.95413
## 26  2013-06-24   17.07484
## 27  2013-07-01   16.81119
## 28  2013-07-08   15.73135
## 29  2013-07-15   15.54233
## 30  2013-07-22   15.33993
## 31  2013-07-29   15.41860
## 32  2013-08-05   14.57381
## 33  2013-08-12   15.47869
## 34  2013-08-19   17.24375
## 35  2013-08-26   20.70857
## 36  2013-09-02   22.79903
## 37  2013-09-09   18.19900
## 38  2013-09-16   42.97683
## 39  2013-09-23   29.31174
## 40  2013-09-30   29.01582
## 41  2013-10-07   25.07020
## 42  2013-10-14   20.17567
## 43  2013-10-21   19.22806
## 44  2013-10-28   30.53934
## 45  2013-11-04   25.75330
## 46  2013-11-11   19.34698
## 47  2013-11-18   25.88803
## 48  2013-11-25   58.94111
## 49  2013-12-02   30.35415
## 50  2013-12-09   30.47249
## 51  2013-12-16   48.95757
## 52  2013-12-23   65.54560
## 53  2013-12-30   53.07747
## 54  2014-01-06   24.78459
## 55  2014-01-13   18.57356
## 56  2014-01-20   17.90493
## 57  2014-01-27   18.26185
## 58  2014-02-03   29.89935
## 59  2014-02-10   23.81434
## 60  2014-02-17   31.84550
## 61  2014-02-24   22.87792
## 62  2014-03-03   22.40873
## 63  2014-03-10   18.87321
## 64  2014-03-17   33.17772
## 65  2014-03-24   23.76429
## 66  2014-03-31   18.45659
## 67  2014-04-07   17.88451
## 68  2014-04-14   21.42710
## 69  2014-04-21   18.60999
## 70  2014-04-28   16.76306
## 71  2014-05-05   14.55092
## 72  2014-05-12   14.43177
## 73  2014-05-19   16.19569
## 74  2014-05-26   39.71713
## 75  2014-06-02   18.63094
## 76  2014-06-09   20.27065
## 77  2014-06-16   18.53425
## 78  2014-06-23   19.03261
## 79  2014-06-30   17.20969
## 80  2014-07-07   17.34606
## 81  2014-07-14   15.89059
## 82  2014-07-21   16.03213
## 83  2014-07-28   21.08208
## 84  2014-08-04   18.34613
## 85  2014-08-11   19.33446
## 86  2014-08-18   22.21304
## 87  2014-08-25   21.02987
## 88  2014-09-01   19.41437
## 89  2014-09-08   26.43745
## 90  2014-09-15   20.82242
## 91  2014-09-22   40.43093
## 92  2014-09-29   40.73875
## 93  2014-10-06   26.24087
## 94  2014-10-13   23.00585
## 95  2014-10-20   18.73650
## 96  2014-10-27   20.07889
## 97  2014-11-03   26.29730
## 98  2014-11-10   34.13653
## 99  2014-11-17   52.47483
## 100 2014-11-24   30.47203
## 101 2014-12-01   32.07337
## 102 2014-12-08   33.15333
## 103 2014-12-15   58.01242
## 104 2014-12-22   65.66092
## 105 2014-12-29   70.27005
## 106 2015-01-05   33.80135
## 107 2015-01-12   21.62727
## 108 2015-01-19   19.88812
## 109 2015-01-26   19.96514
## 110 2015-02-02   16.49110
## 111 2015-02-09   19.34865
## 112 2015-02-16   28.82654
## 113 2015-02-23   21.60438
## 114 2015-03-02   18.76137
## 115 2015-03-09   17.49452
## 116 2015-03-16   17.06582
## 117 2015-03-23   25.72285
## 118 2015-03-30   16.54408
## 119 2015-04-06   14.09163
## 120 2015-04-13   39.18419
## 121 2015-04-20   17.70593
## 122 2015-04-27   14.12287
## 123 2015-05-04   11.72068
## 124 2015-05-11   12.12435
## 125 2015-05-18   41.85571
## 126 2015-05-25   15.73664
## 127 2015-06-01   14.95426
## 128 2015-06-08   15.25172
## 129 2015-06-15   14.51645
## 130 2015-06-22   16.58789
## 131 2015-06-29   13.93846
## 132 2015-07-06   15.59413
## 133 2015-07-13   13.98616
## 134 2015-07-20   12.17754
## 135 2015-07-27   11.23117
## 136 2015-08-03   11.42219
## 137 2015-08-10   11.70116
## 138 2015-08-17   14.09194
## 139 2015-08-24   18.84793
## 140 2015-08-31   20.08190
## 141 2015-09-07   16.93140
## 142 2015-09-14   17.12227
## 143 2015-09-21   25.34495
## 144 2015-09-28   32.81623
## 145 2015-10-05   15.79077
## 146 2015-10-12   15.26764
## 147 2015-10-19   22.56859
## 148 2015-10-26   13.57028
df_train_1 <- df_train %>% group_by(Week) %>% summarise(sale_week=sum(total_sale))

autoplot(ts(df_train_1$sale_week))+ylab("TOTAL SALE (week)")+theme_bw()


head(df_train_1)
## # A tibble: 6 x 2
##   Week       sale_week
##   <date>         <dbl>
## 1 2012-12-31      26.4
## 2 2013-01-07      20.8
## 3 2013-01-14      19.4
## 4 2013-01-21      17.7
## 5 2013-01-28      17.1
## 6 2013-02-04      20.8


## Part B-

Here we will change our series from daily to weekly and proceed

#rm(train_diff)

train_box_cox$Week <- as.Date(cut(train_box_cox$date, "week"))
aggregate(sale_box_cox ~ Week, train_box_cox, sum)
##           Week sale_box_cox
## 1   2012-12-31    20.435426
## 2   2013-01-07    13.803835
## 3   2013-01-14    12.407470
## 4   2013-01-21    10.734573
## 5   2013-01-28    10.140218
## 6   2013-02-04    13.766748
## 7   2013-02-11    15.410815
## 8   2013-02-18    22.400294
## 9   2013-02-25    11.631587
## 10  2013-03-04    19.495966
## 11  2013-03-11    18.654195
## 12  2013-03-18    14.027257
## 13  2013-03-25    14.637100
## 14  2013-04-01     9.970707
## 15  2013-04-08     9.845017
## 16  2013-04-15     9.019684
## 17  2013-04-22     8.166846
## 18  2013-04-29     8.563944
## 19  2013-05-06     6.028252
## 20  2013-05-13     7.838752
## 21  2013-05-20     8.780174
## 22  2013-05-27     8.664896
## 23  2013-06-03     9.868274
## 24  2013-06-10    14.994482
## 25  2013-06-17    10.954131
## 26  2013-06-24    10.074844
## 27  2013-07-01     9.811191
## 28  2013-07-08     8.731352
## 29  2013-07-15     8.542329
## 30  2013-07-22     8.339934
## 31  2013-07-29     8.418597
## 32  2013-08-05     7.573813
## 33  2013-08-12     8.478688
## 34  2013-08-19    10.243753
## 35  2013-08-26    13.708573
## 36  2013-09-02    15.799025
## 37  2013-09-09    11.199003
## 38  2013-09-16    35.976832
## 39  2013-09-23    22.311738
## 40  2013-09-30    22.015823
## 41  2013-10-07    18.070195
## 42  2013-10-14    13.175669
## 43  2013-10-21    12.228058
## 44  2013-10-28    23.539339
## 45  2013-11-04    18.753302
## 46  2013-11-11    12.346978
## 47  2013-11-18    18.888026
## 48  2013-11-25    51.941111
## 49  2013-12-02    23.354148
## 50  2013-12-09    23.472486
## 51  2013-12-16    41.957566
## 52  2013-12-23    58.545603
## 53  2013-12-30    46.077474
## 54  2014-01-06    17.784588
## 55  2014-01-13    11.573564
## 56  2014-01-20    10.904925
## 57  2014-01-27    11.261852
## 58  2014-02-03    22.899346
## 59  2014-02-10    16.814337
## 60  2014-02-17    24.845498
## 61  2014-02-24    15.877917
## 62  2014-03-03    15.408726
## 63  2014-03-10    11.873211
## 64  2014-03-17    26.177724
## 65  2014-03-24    16.764294
## 66  2014-03-31    11.456592
## 67  2014-04-07    10.884507
## 68  2014-04-14    14.427103
## 69  2014-04-21    11.609987
## 70  2014-04-28     9.763055
## 71  2014-05-05     7.550921
## 72  2014-05-12     7.431767
## 73  2014-05-19     9.195685
## 74  2014-05-26    32.717125
## 75  2014-06-02    11.630937
## 76  2014-06-09    13.270650
## 77  2014-06-16    11.534253
## 78  2014-06-23    12.032608
## 79  2014-06-30    10.209688
## 80  2014-07-07    10.346058
## 81  2014-07-14     8.890585
## 82  2014-07-21     9.032135
## 83  2014-07-28    14.082085
## 84  2014-08-04    11.346134
## 85  2014-08-11    12.334458
## 86  2014-08-18    15.213042
## 87  2014-08-25    14.029874
## 88  2014-09-01    12.414367
## 89  2014-09-08    19.437454
## 90  2014-09-15    13.822417
## 91  2014-09-22    33.430928
## 92  2014-09-29    33.738751
## 93  2014-10-06    19.240872
## 94  2014-10-13    16.005848
## 95  2014-10-20    11.736497
## 96  2014-10-27    13.078887
## 97  2014-11-03    19.297304
## 98  2014-11-10    27.136530
## 99  2014-11-17    45.474830
## 100 2014-11-24    23.472030
## 101 2014-12-01    25.073367
## 102 2014-12-08    26.153334
## 103 2014-12-15    51.012423
## 104 2014-12-22    58.660918
## 105 2014-12-29    63.270050
## 106 2015-01-05    26.801351
## 107 2015-01-12    14.627269
## 108 2015-01-19    12.888115
## 109 2015-01-26    12.965139
## 110 2015-02-02     9.491104
## 111 2015-02-09    12.348652
## 112 2015-02-16    21.826537
## 113 2015-02-23    14.604377
## 114 2015-03-02    11.761367
## 115 2015-03-09    10.494525
## 116 2015-03-16    10.065824
## 117 2015-03-23    18.722854
## 118 2015-03-30     9.544076
## 119 2015-04-06     7.091630
## 120 2015-04-13    32.184190
## 121 2015-04-20    10.705932
## 122 2015-04-27     7.122867
## 123 2015-05-04     4.720676
## 124 2015-05-11     5.124349
## 125 2015-05-18    34.855711
## 126 2015-05-25     8.736635
## 127 2015-06-01     7.954260
## 128 2015-06-08     8.251721
## 129 2015-06-15     7.516448
## 130 2015-06-22     9.587887
## 131 2015-06-29     6.938458
## 132 2015-07-06     8.594129
## 133 2015-07-13     6.986156
## 134 2015-07-20     5.177542
## 135 2015-07-27     4.231170
## 136 2015-08-03     4.422195
## 137 2015-08-10     4.701165
## 138 2015-08-17     7.091940
## 139 2015-08-24    11.847935
## 140 2015-08-31    13.081896
## 141 2015-09-07     9.931395
## 142 2015-09-14    10.122267
## 143 2015-09-21    18.344954
## 144 2015-09-28    25.816232
## 145 2015-10-05     8.790767
## 146 2015-10-12     8.267644
## 147 2015-10-19    15.568590
## 148 2015-10-26     7.570279
train_main <- train_box_cox %>% group_by(Week) %>% summarise(week_bx=sum(sale_box_cox))

autoplot(ts(train_main$week_bx))+ylab("TOTAL SALE (BOX-COX)")+theme_bw()


The ACF/PACF for weekly data

par(mfrow=c(1,2))
acf(train_main$week_bx,lag.max = 60)
pacf(train_main$week_bx,lag.max = 60)


### The ACF plot shows the seasonality involved in the series and PACF pattern looks like MA process. ### To clear this, we will take first difference of the data

train_main <- train_main %>% 
               mutate(week_bx_diff = week_bx - lag(week_bx))


head(train_main)
## # A tibble: 6 x 3
##   Week       week_bx week_bx_diff
##   <date>       <dbl>        <dbl>
## 1 2012-12-31    20.4       NA    
## 2 2013-01-07    13.8       -6.63 
## 3 2013-01-14    12.4       -1.40 
## 4 2013-01-21    10.7       -1.67 
## 5 2013-01-28    10.1       -0.594
## 6 2013-02-04    13.8        3.63
autoplot(ts(train_main$week_bx_diff))


Perform adf and kpss test to check stationarity.

train_main_1 <- train_main[complete.cases(train_main),]

adf.test(train_main_1$week_bx_diff)
## Warning in adf.test(train_main_1$week_bx_diff): p-value smaller than printed p-
## value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_main_1$week_bx_diff
## Dickey-Fuller = -6.9122, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(train_main_1$week_bx_diff)
## Warning in kpss.test(train_main_1$week_bx_diff): p-value greater than printed p-
## value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train_main_1$week_bx_diff
## KPSS Level = 0.025498, Truncation lag parameter = 4, p-value = 0.1


### Both test indicates that the series is stationary ### Checking again the ACF/PACF plots

par(mfrow=c(1,2))
acf(train_main_1$week_bx_diff)
pacf(train_main_1$week_bx_diff)


### PACF AND ACF is much better now. ### The series looks an AR series.The plot indicates that it could be an 3 AR series. i.e ARIMA(3,0,0)

model_1 = arima(train_main_1$week_bx,order = c(3,1,0))


summary(model_1)
## 
## Call:
## arima(x = train_main_1$week_bx, order = c(3, 1, 0))
## 
## Coefficients:
##           ar1      ar2      ar3
##       -0.2930  -0.2688  -0.2357
## s.e.   0.0803   0.0805   0.0798
## 
## sigma^2 estimated as 83.63:  log likelihood = -530.44,  aic = 1068.88
## 
## Training set error measures:
##                       ME     RMSE      MAE       MPE     MAPE     MASE
## Training set -0.05339097 9.113623 6.101429 -15.02176 37.68191 1.012667
##                     ACF1
## Training set -0.03359444


forecast::checkresiduals(model_1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,0)
## Q* = 9.7582, df = 7, p-value = 0.2027
## 
## Model df: 3.   Total lags used: 10

The residual are looking Good. But there seems to be litle seaonal factor left according to residual graphs. But we seems to be close to the best model.

Here we will try different models near to (3,0,0) and comapre there AIC’s.


AIC(
  arima(train_main_1$week_bx,order=c(3,1,0)),
  
  arima(train_main_1$week_bx,order=c(2,1,0)),
  
  arima(train_main_1$week_bx,order=c(2,1,1)),
  
  arima(train_main_1$week_bx,order=c(2,2,1)),
  
  arima(train_main_1$week_bx,order=c(2,1,2)),
  
  arima(train_main_1$week_bx,order=c(3,1,2)),
  
  arima(train_main_1$week_bx,order=c(3,2,1)),
  
  arima(train_main_1$week_bx,order=c(3,2,2)),
  
  arima(train_main_1$week_bx,order=c(3,2,3)),
  
  arima(train_main_1$week_bx,order=c(4,2,2)),
  
  arima(train_main_1$week_bx,order=c(4,2,3))
  
  
)
## Warning in AIC.default(arima(train_main_1$week_bx, order = c(3, 1, 0)), : models
## are not all fitted to the same number of observations
##                                                 df      AIC
## arima(train_main_1$week_bx, order = c(3, 1, 0))  4 1068.884
## arima(train_main_1$week_bx, order = c(2, 1, 0))  3 1075.343
## arima(train_main_1$week_bx, order = c(2, 1, 1))  4 1061.705
## arima(train_main_1$week_bx, order = c(2, 2, 1))  4 1076.732
## arima(train_main_1$week_bx, order = c(2, 1, 2))  5 1063.722
## arima(train_main_1$week_bx, order = c(3, 1, 2))  6 1064.910
## arima(train_main_1$week_bx, order = c(3, 2, 1))  5 1070.748
## arima(train_main_1$week_bx, order = c(3, 2, 2))  6 1067.581
## arima(train_main_1$week_bx, order = c(3, 2, 3))  7 1068.289
## arima(train_main_1$week_bx, order = c(4, 2, 2))  7 1068.113
## arima(train_main_1$week_bx, order = c(4, 2, 3))  8 1070.873


BIC(
  arima(train_main_1$week_bx,order=c(3,1,0)),
  
  arima(train_main_1$week_bx,order=c(2,1,0)),
  
  arima(train_main_1$week_bx,order=c(2,1,1)),
  
  arima(train_main_1$week_bx,order=c(2,2,1)),
  
  arima(train_main_1$week_bx,order=c(2,1,2)),
  
  arima(train_main_1$week_bx,order=c(3,1,2)),
  
  arima(train_main_1$week_bx,order=c(3,2,1)),
  
  arima(train_main_1$week_bx,order=c(3,2,2)),
  
  arima(train_main_1$week_bx,order=c(3,2,3)),
  
  arima(train_main_1$week_bx,order=c(4,2,2)),
  
  arima(train_main_1$week_bx,order=c(4,2,3))
  
  
)
## Warning in BIC.default(arima(train_main_1$week_bx, order = c(3, 1, 0)), : models
## are not all fitted to the same number of observations
##                                                 df      BIC
## arima(train_main_1$week_bx, order = c(3, 1, 0))  4 1080.818
## arima(train_main_1$week_bx, order = c(2, 1, 0))  3 1084.294
## arima(train_main_1$week_bx, order = c(2, 1, 1))  4 1073.639
## arima(train_main_1$week_bx, order = c(2, 2, 1))  4 1088.639
## arima(train_main_1$week_bx, order = c(2, 1, 2))  5 1078.640
## arima(train_main_1$week_bx, order = c(3, 1, 2))  6 1082.812
## arima(train_main_1$week_bx, order = c(3, 2, 1))  5 1085.632
## arima(train_main_1$week_bx, order = c(3, 2, 2))  6 1085.441
## arima(train_main_1$week_bx, order = c(3, 2, 3))  7 1089.126
## arima(train_main_1$week_bx, order = c(4, 2, 2))  7 1088.950
## arima(train_main_1$week_bx, order = c(4, 2, 3))  8 1094.687


From both AIC and BIC, the best model is ARIMA(2,1,1)

Lets find details about it

best_fit <- arima(train_main_1$week_bx,order=c(2,1,1))

forecast::checkresiduals(best_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,1)
## Q* = 6.4025, df = 7, p-value = 0.4936
## 
## Model df: 3.   Total lags used: 10


### Lets see what auto.arima gives us.

auto.arima(train_main$week_bx,stationary=FALSE,
seasonal= TRUE,stepwise=FALSE,approximation=FALSE,max.D = 55,max.order = 60)
## Series: train_main$week_bx 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1     mean
##       0.6003  15.9406
## s.e.  0.0652   1.7867
## 
## sigma^2 = 78.08:  log likelihood = -531.69
## AIC=1069.39   AICc=1069.55   BIC=1078.38


auto_fit <- arima(train_main_1$week_bx,order=c(1,0,0))

forecast::checkresiduals(auto_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 6.4001, df = 8, p-value = 0.6025
## 
## Model df: 2.   Total lags used: 10


WE are getting differt models from our best estimation and auto.arima.

We will calculate the in in sample fit for both models

Box.test(best_fit$residuals,type = "Ljung-Box",lag=1)
## 
##  Box-Ljung test
## 
## data:  best_fit$residuals
## X-squared = 0.0028112, df = 1, p-value = 0.9577
Box.test(auto_fit$residuals,type = "Ljung-Box",lag=1)
## 
##  Box-Ljung test
## 
## data:  auto_fit$residuals
## X-squared = 0.00024118, df = 1, p-value = 0.9876


Box.test(best_fit$residuals,type = "Ljung-Box",lag=3)
## 
##  Box-Ljung test
## 
## data:  best_fit$residuals
## X-squared = 0.65804, df = 3, p-value = 0.883
Box.test(auto_fit$residuals,type = "Ljung-Box",lag=3)
## 
##  Box-Ljung test
## 
## data:  auto_fit$residuals
## X-squared = 0.33179, df = 3, p-value = 0.9539


Box.test(best_fit$residuals,type = "Ljung-Box",lag=10)
## 
##  Box-Ljung test
## 
## data:  best_fit$residuals
## X-squared = 6.4025, df = 10, p-value = 0.7804
Box.test(auto_fit$residuals,type = "Ljung-Box",lag=10)
## 
##  Box-Ljung test
## 
## data:  auto_fit$residuals
## X-squared = 6.4001, df = 10, p-value = 0.7806


### IN SAMPLE RMSE

resid_1 = best_fit$residuals

resid_2 = auto_fit$residuals

pred_1 = train_main_1$week_bx-resid_1


pred_2 = train_main_1$week_bx-resid_2


ggplot()+
  geom_line(aes(train_main_1$Week,pred_1),color='red',alpha=0.7)+
  
  geom_line(aes(train_main_1$Week,pred_2),color='blue',alpha=0.7)+
  
  geom_line(aes(train_main_1$Week,train_main_1$week_bx))
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.



### dbt- after putting exmp1 -> going to infinite ### We now will calculate MAE for fitted values

RMSE_best = mean(abs((pred_1) - (train_main_1$week_bx)))

RMSE_best
## [1] 5.852256
RMSE_auto = mean(abs((pred_2) - (train_main_1$week_bx)))
RMSE_auto
## [1] 5.994088

The MAE is less for the model we choose


best_fit %>% 
  forecast(h=5) %>% 
  autoplot()


auto_fit %>% 
  forecast(h=5) %>% 
  autoplot()


{#r, include=FALSE} head(df_train_1)