library(readr)
Month_Value_1 <- read_csv("~/Downloads/Month_Value_1.csv")
## Rows: 96 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Period
## dbl (4): Revenue, Sales_quantity, Average_cost, The_average_annual_payroll_o...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Month_Value_1)
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ purrr 0.3.5 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(timeSeries)
## Loading required package: timeDate
library(fpp2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.4 ──
## ✔ forecast 8.19 ✔ expsmooth 2.3
## ✔ fma 2.4
library(forecast)
head(Month_Value_1)
## # A tibble: 6 × 5
## Period Revenue Sales_quantity Average_cost The_average_annual_payroll_…¹
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 01.01.2015 16010072. 12729 1258. 30024676
## 2 01.02.2015 15807587. 11636 1359. 30024676
## 3 01.03.2015 22047146. 15922 1385. 30024676
## 4 01.04.2015 18814583. 15227 1236. 30024676
## 5 01.05.2015 14021480. 8620 1627. 30024676
## 6 01.06.2015 16783929. 13160 1275. 30024676
## # … with abbreviated variable name ¹​The_average_annual_payroll_of_the_region
str(Month_Value_1)
## spc_tbl_ [96 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Period : chr [1:96] "01.01.2015" "01.02.2015" "01.03.2015" "01.04.2015" ...
## $ Revenue : num [1:96] 16010072 15807587 22047146 18814583 14021480 ...
## $ Sales_quantity : num [1:96] 12729 11636 15922 15227 8620 ...
## $ Average_cost : num [1:96] 1258 1359 1385 1236 1627 ...
## $ The_average_annual_payroll_of_the_region: num [1:96] 3e+07 3e+07 3e+07 3e+07 3e+07 ...
## - attr(*, "spec")=
## .. cols(
## .. Period = col_character(),
## .. Revenue = col_double(),
## .. Sales_quantity = col_double(),
## .. Average_cost = col_double(),
## .. The_average_annual_payroll_of_the_region = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Month_Value_1%>%
is.na()%>%
sum()
## [1] 128
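-As a quick aside (not part of the original script), the per-column NA counts show where those 128 missing values sit; since na.omit() below drops 32 of the 96 rows, we expect roughly 32 NAs in each of the four numeric columns (32 x 4 = 128).
#Where are the 128 missing values? (sketch; expect ~32 NAs per numeric column)
colSums(is.na(Month_Value_1))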
mydata<-na.omit(Month_Value_1)
head(mydata)
## # A tibble: 6 × 5
## Period Revenue Sales_quantity Average_cost The_average_annual_payroll_…¹
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 01.01.2015 16010072. 12729 1258. 30024676
## 2 01.02.2015 15807587. 11636 1359. 30024676
## 3 01.03.2015 22047146. 15922 1385. 30024676
## 4 01.04.2015 18814583. 15227 1236. 30024676
## 5 01.05.2015 14021480. 8620 1627. 30024676
## 6 01.06.2015 16783929. 13160 1275. 30024676
## # … with abbreviated variable name ¹​The_average_annual_payroll_of_the_region
nrow(mydata)
## [1] 64
mydata%>%
is.na()%>%
sum()
## [1] 0
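-One extra check before declaring the time series (a sketch, not part of the original script): na.omit() could in principle drop rows from the middle of the series, so it is worth confirming that the retained months are consecutive. This assumes the dd.mm.yyyy format seen in the Period column.
#Sanity check (sketch): are the retained months consecutive?
dates <- as.Date(mydata$Period, format = "%d.%m.%Y")
gaps <- as.numeric(diff(dates)) # day gaps between consecutive rows
all(gaps >= 28 & gaps <= 31) # TRUE if no months are skipped
range(dates) # expect Jan 2015 through Apr 2020 (see the ts print below)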
#Declare revenue column as time series data
revenue_tseries<-ts(mydata[,2], start = c(2015,1), frequency = 12)
revenue_tseries
## Jan Feb Mar Apr May Jun Jul Aug
## 2015 16010072 15807587 22047146 18814583 14021480 16783929 19161892 15204984
## 2016 28601586 22367074 29738609 28351008 15264604 24385658 29486517 15270117
## 2017 36007381 30396775 47678131 27013965 24948845 31101346 33848822 16454667
## 2018 44067521 36020287 46995990 35536488 29699599 33261065 35826535 23268655
## 2019 36459960 36546499 54198707 32743990 32531658 47709702 45992142 36933665
## 2020 56288301 40225243 50022165 52320693
## Sep Oct Nov Dec
## 2015 20603940 20992875 14993370 27791808
## 2016 36141028 27915144 21272049 42014160
## 2017 31650093 31572206 22446371 44966126
## 2018 35423490 39831566 32999145 47221828
## 2019 48526260 44160416 36374956 58756474
## 2020
#Time plot for checking trends
autoplot(revenue_tseries) + ggtitle("Time plot: Revenue by month") + ylab("Revenue (USD)")
-As shown in the graph above, there is a positive trend over time, and there appear to be patterns that recur each year. We need to investigate further.
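-As one supporting check (a sketch, not in the original analysis), the autocorrelation function of the raw series should decay slowly when a trend is present; ggAcf() comes from the forecast package loaded above.
#Supporting check (sketch): a slowly decaying ACF is consistent with a trend
ggAcf(revenue_tseries) + ggtitle("ACF: monthly revenue")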
#first difference of the data to remove the trend
DY<-diff(revenue_tseries)
autoplot(DY) + ggtitle("Time plot: Change in revenue by month") + ylab("Change in revenue (USD)")
-We have now removed the trend, but we still need to check for seasonality.
# checking for seasonality
ggseasonplot(DY) + ggtitle("Seasonal plot: Change in monthly revenue") + ylab("Change in revenue (USD)")
-Seasonality is irregular; no monthly pattern repeats from year to year.
-The revenue_tseries series has a trend, which I removed by taking the first difference. Using the differenced data (DY), no seasonal pattern was detected.
-Next, I test three different models and choose the best one for forecasting revenue for the next three years (a hold-out comparison sketch follows below).
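-Before fitting the candidates on the full series, one way to compare them is a simple hold-out evaluation. This is only a sketch, under the assumption that the last 12 observed months (May 2019 to April 2020) are held back; it is not part of the model fits reported below.
#Hold-out comparison sketch (assumption: last 12 observed months as test set)
train <- window(revenue_tseries, end = c(2019, 4))
test <- window(revenue_tseries, start = c(2019, 5))
fc_snaive <- snaive(train, h = 12) # seasonal naive benchmark
fc_ets <- forecast(ets(train), h = 12) # exponential smoothing
fc_arima <- forecast(auto.arima(train), h = 12) # ARIMA
accuracy(fc_snaive, test)
accuracy(fc_ets, test)
accuracy(fc_arima, test)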
#benchmark method to forecast
model1<-snaive(DY)
summary(model1)
##
## Forecast method: Seasonal naive method
##
## Model Information:
## Call: snaive(y = DY)
##
## Residual sd: 8068682.9697
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 136964.5 8068683 6476361 -58.12047 544.2507 1 -0.3608288
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## May 2020 -212332.1 -10552765.4 10128101 -16026660.1 15601996.0
## Jun 2020 15178044.1 4837610.8 25518477 -636283.9 30992372.1
## Jul 2020 -1717560.1 -12057993.4 8622873 -17531888.1 14096768.0
## Aug 2020 -9058476.6 -19398909.8 1281957 -24872804.6 6755851.5
## Sep 2020 11592595.1 1252161.8 21933028 -4221732.9 27406923.1
## Oct 2020 -4365844.0 -14706277.2 5974589 -20180172.0 11448484.1
## Nov 2020 -7785459.7 -18125893.0 2554974 -23599787.7 8028868.3
## Dec 2020 22381517.2 12041083.9 32721950 6567189.1 38195845.2
## Jan 2021 -2468172.8 -12808606.1 7872261 -18282500.8 13346155.2
## Feb 2021 -16063057.6 -26403490.9 -5722624 -31877385.6 -248729.6
## Mar 2021 9796922.0 -543511.3 20137355 -6017406.1 25611250.0
## Apr 2021 2298527.7 -8041905.6 12638961 -13515800.3 18112855.7
## May 2021 -212332.1 -14835913.1 14411249 -22577169.2 22152505.1
## Jun 2021 15178044.1 554463.1 29801625 -7186793.1 37542881.3
## Jul 2021 -1717560.1 -16341141.1 12906021 -24082397.2 20647277.1
## Aug 2021 -9058476.6 -23682057.6 5565104 -31423313.7 13306360.6
## Sep 2021 11592595.1 -3030985.9 26216176 -10772242.1 33957432.3
## Oct 2021 -4365844.0 -18989425.0 10257737 -26730681.1 17998993.2
## Nov 2021 -7785459.7 -22409040.7 6838121 -30150296.9 14579377.5
## Dec 2021 22381517.2 7757936.2 37005098 16680.0 44746354.3
## Jan 2022 -2468172.8 -17091753.8 12155408 -24833010.0 19896664.4
## Feb 2022 -16063057.6 -30686638.6 -1439477 -38427894.8 6301779.6
## Mar 2022 9796922.0 -4826659.0 24420503 -12567915.2 32161759.1
## Apr 2022 2298527.7 -12325053.3 16922109 -20066309.5 24663364.9
checkresiduals(model1)
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 42.356, df = 13, p-value = 5.726e-05
##
## Model df: 0. Total lags used: 13
-The residual standard deviation of 8,068,682.97 indicates how well the model fits. Because this model is fit to the differenced series, it is roughly the typical gap between a month's change in revenue and the change in the same month one year earlier, about $8.07 million.
-The residual ACF has a few lags outside the 95% significance bounds.
-We will try the next model and see if it does a better job.
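-For reference, the Ljung-Box statistic reported by checkresiduals() can also be computed directly (a sketch; the first 12 residuals of a seasonal naive fit are NA and are dropped first). It should agree with the Q* value above.
#Ljung-Box test run by hand (sketch)
Box.test(na.omit(residuals(model1)), lag = 13, type = "Ljung-Box", fitdf = 0)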
#Fit ETS method
model2<-ets(revenue_tseries)
summary(model2)
## ETS(M,A,M)
##
## Call:
## ets(y = revenue_tseries)
##
## Smoothing parameters:
## alpha = 0.0969
## beta = 0.0059
## gamma = 1e-04
##
## Initial states:
## l = 14900629.1102
## b = 653923.3572
## s = 1.2886 0.7598 0.994 1.0785 0.6882 1.0373
## 0.98 0.7657 1.0034 1.3125 0.947 1.145
##
## sigma: 0.1298
##
## AIC AICc BIC
## 2227.768 2241.072 2264.469
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -453610.2 3825696 2888750 -2.836156 9.383142 0.4062265 0.05576733
checkresiduals(model2)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,M)
## Q* = 19.384, df = 3, p-value = 0.0002277
##
## Model df: 16. Total lags used: 19
-We still see some issues in the residual ACF, but the residual standard deviation (sigma) of 0.1298 is encouraging: ETS(M,A,M) uses multiplicative errors, so this is a relative error of roughly 13%. There might be a better model out there, though. Below is the final model.
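#Fit ARIMA model with auto.arima, forcing one ordinary and one seasonal difference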
model3<-auto.arima(revenue_tseries, d=1, D=1, stepwise = FALSE, approximation = FALSE, trace = TRUE)
##
## ARIMA(0,1,0)(0,1,0)[12] : 1768.97
## ARIMA(0,1,0)(0,1,1)[12] : Inf
## ARIMA(0,1,0)(1,1,0)[12] : 1753.67
## ARIMA(0,1,0)(1,1,1)[12] : Inf
## ARIMA(0,1,1)(0,1,0)[12] : 1744.455
## ARIMA(0,1,1)(0,1,1)[12] : Inf
## ARIMA(0,1,1)(1,1,0)[12] : 1734.879
## ARIMA(0,1,1)(1,1,1)[12] : Inf
## ARIMA(0,1,2)(0,1,0)[12] : 1746.705
## ARIMA(0,1,2)(0,1,1)[12] : Inf
## ARIMA(0,1,2)(1,1,0)[12] : 1737.082
## ARIMA(0,1,2)(1,1,1)[12] : Inf
## ARIMA(0,1,3)(0,1,0)[12] : 1746.241
## ARIMA(0,1,3)(0,1,1)[12] : Inf
## ARIMA(0,1,3)(1,1,0)[12] : 1739.091
## ARIMA(0,1,3)(1,1,1)[12] : Inf
## ARIMA(0,1,4)(0,1,0)[12] : Inf
## ARIMA(0,1,4)(0,1,1)[12] : Inf
## ARIMA(0,1,4)(1,1,0)[12] : 1737.389
## ARIMA(0,1,5)(0,1,0)[12] : Inf
## ARIMA(1,1,0)(0,1,0)[12] : 1762.503
## ARIMA(1,1,0)(0,1,1)[12] : Inf
## ARIMA(1,1,0)(1,1,0)[12] : 1745.729
## ARIMA(1,1,0)(1,1,1)[12] : Inf
## ARIMA(1,1,1)(0,1,0)[12] : 1746.709
## ARIMA(1,1,1)(0,1,1)[12] : Inf
## ARIMA(1,1,1)(1,1,0)[12] : 1737.11
## ARIMA(1,1,1)(1,1,1)[12] : Inf
## ARIMA(1,1,2)(0,1,0)[12] : 1746.763
## ARIMA(1,1,2)(0,1,1)[12] : Inf
## ARIMA(1,1,2)(1,1,0)[12] : Inf
## ARIMA(1,1,2)(1,1,1)[12] : Inf
## ARIMA(1,1,3)(0,1,0)[12] : 1747.537
## ARIMA(1,1,3)(0,1,1)[12] : Inf
## ARIMA(1,1,3)(1,1,0)[12] : 1740.82
## ARIMA(1,1,4)(0,1,0)[12] : Inf
## ARIMA(2,1,0)(0,1,0)[12] : 1744.139
## ARIMA(2,1,0)(0,1,1)[12] : Inf
## ARIMA(2,1,0)(1,1,0)[12] : 1735.026
## ARIMA(2,1,0)(1,1,1)[12] : Inf
## ARIMA(2,1,1)(0,1,0)[12] : 1745.629
## ARIMA(2,1,1)(0,1,1)[12] : Inf
## ARIMA(2,1,1)(1,1,0)[12] : 1737.377
## ARIMA(2,1,1)(1,1,1)[12] : Inf
## ARIMA(2,1,2)(0,1,0)[12] : 1745.916
## ARIMA(2,1,2)(0,1,1)[12] : Inf
## ARIMA(2,1,2)(1,1,0)[12] : 1739.764
## ARIMA(2,1,3)(0,1,0)[12] : Inf
## ARIMA(3,1,0)(0,1,0)[12] : 1746.112
## ARIMA(3,1,0)(0,1,1)[12] : Inf
## ARIMA(3,1,0)(1,1,0)[12] : 1737.401
## ARIMA(3,1,0)(1,1,1)[12] : Inf
## ARIMA(3,1,1)(0,1,0)[12] : Inf
## ARIMA(3,1,1)(0,1,1)[12] : Inf
## ARIMA(3,1,1)(1,1,0)[12] : 1739.644
## ARIMA(3,1,2)(0,1,0)[12] : Inf
## ARIMA(4,1,0)(0,1,0)[12] : 1747.157
## ARIMA(4,1,0)(0,1,1)[12] : Inf
## ARIMA(4,1,0)(1,1,0)[12] : 1739.827
## ARIMA(4,1,1)(0,1,0)[12] : Inf
## ARIMA(5,1,0)(0,1,0)[12] : 1749.724
##
##
##
## Best model: ARIMA(0,1,1)(1,1,0)[12]
summary(model3)
## Series: revenue_tseries
## ARIMA(0,1,1)(1,1,0)[12]
##
## Coefficients:
## ma1 sar1
## -0.8051 -0.5628
## s.e. 0.1239 0.1349
##
## sigma^2 = 2.847e+13: log likelihood = -864.18
## AIC=1734.37 AICc=1734.88 BIC=1740.16
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -327961.6 4669060 3275548 -3.196471 10.09469 0.4606193 0.03397385
checkresiduals(model3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)(1,1,0)[12]
## Q* = 13.303, df = 11, p-value = 0.274
##
## Model df: 2. Total lags used: 13
-Looking at the residuals, there appears to be a flat stretch between 2015 and 2016, which breaks the randomness we saw in the previous model's residuals. The ACF plot, however, looks clean, with no significant lags.
-After comparing all three models, model 2 (ETS) looks like the best fit for forecasting. A quick side-by-side of the training error measures is sketched below.
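-Training-set accuracy side by side (a sketch, not part of the original output). Note that model1 was fit to the differenced series DY, so its errors are on a different scale; the scale-free MASE column is the fairest comparison.
#Compare in-sample accuracy of the three fitted models (sketch)
rbind(
  snaive_DY = accuracy(model1)[1, ],
  ETS = accuracy(model2)[1, ],
  ARIMA = accuracy(model3)[1, ]
)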
#Forecast with model 2, for the next 3 years.
fcst<-forecast(model2, h=36)
autoplot(fcst)
summary(fcst)
##
## Forecast method: ETS(M,A,M)
##
## Model Information:
## ETS(M,A,M)
##
## Call:
## ets(y = revenue_tseries)
##
## Smoothing parameters:
## alpha = 0.0969
## beta = 0.0059
## gamma = 1e-04
##
## Initial states:
## l = 14900629.1102
## b = 653923.3572
## s = 1.2886 0.7598 0.994 1.0785 0.6882 1.0373
## 0.98 0.7657 1.0034 1.3125 0.947 1.145
##
## sigma: 0.1298
##
## AIC AICc BIC
## 2227.768 2241.072 2264.469
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -453610.2 3825696 2888750 -2.836156 9.383142 0.4062265 0.05576733
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## May 2020 36542882 30462065 42623699 27243075 45842689
## Jun 2020 47241020 39338687 55143353 35155445 59326595
## Jul 2020 50502969 42006816 58999122 37509225 63496713
## Aug 2020 33837053 28109635 39564470 25077724 42596381
## Sep 2020 53540410 44418231 62662589 39589241 67491579
## Oct 2020 49826347 41277381 58375312 36751832 62900861
## Nov 2020 38450530 31804262 45096799 28285940 48615121
## Dec 2020 65828874 54360756 77296992 48289901 83367847
## Jan 2021 59040587 48670062 69411111 43180238 74900936
## Feb 2021 49285281 40553351 58017210 35930947 62639614
## Mar 2021 68936162 56612337 81259987 50088498 87783826
## Apr 2021 53180353 43583892 62776814 38503832 67856874
## May 2021 40949710 33488195 48411224 29538308 52361112
## Jun 2021 52881291 43148649 62613933 37996499 67766082
## Jul 2021 56473294 45971608 66974980 40412352 72534237
## Aug 2021 37798155 30694190 44902121 26933577 48662734
## Sep 2021 59747516 48395211 71099821 42385663 77109368
## Oct 2021 55547596 44874870 66220323 39225069 71870124
## Nov 2021 42823720 34501392 51146049 30095818 55551623
## Dec 2021 73245658 58844762 87646554 51221388 95269928
## Jan 2022 65630677 52573443 78687911 45661360 85599993
## Feb 2022 54735788 43714444 65757133 37880096 71591481
## Mar 2022 76490262 60899641 92080884 52646465 100334060
## Apr 2022 58955181 46789356 71121006 40349157 77561204
## May 2022 45356543 35879147 54833939 30862117 59850970
## Jun 2022 58521569 46138050 70905088 39582611 77460527
## Jul 2022 62443627 49060909 75826345 41976526 82910728
## Aug 2022 41759264 32694003 50824525 27895144 55623384
## Sep 2022 65954630 51450838 80458422 43772994 88136266
## Oct 2022 61268854 47619345 74918363 40393731 82143977
## Nov 2022 47196917 36544233 57849600 30905043 63488790
## Dec 2022 80662452 62216239 99108665 52451404 108873501
## Jan 2023 72220776 55486530 88955021 46627956 97813596
## Feb 2023 60186304 46055615 74316992 38575280 81797327
## Mar 2023 84044373 64050108 104038639 53465783 114622964
## Apr 2023 64730016 49125892 80334141 40865567 88594465
-The summary above is the company's forecasted revenue for the next three years. The output gives a point forecast for every month, along with 80% and 95% prediction intervals.
-Please note that the original data set contains rows from May 2020 through December 2022, but their numeric values are all missing, so those rows were dropped earlier. The forecast therefore covers that period. A sketch for exporting the forecast table follows below.
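-As an optional last step (a sketch, not part of the original analysis), the point forecasts and prediction intervals can be pulled out of the forecast object for reporting; the output file name is only illustrative.
#Export the forecast table (sketch; file name is illustrative)
fcst_df <- data.frame(
  time = as.numeric(time(fcst$mean)), # decimal year, e.g. 2020.333 = May 2020
  forecast = as.numeric(fcst$mean),
  lo80 = as.numeric(fcst$lower[, 1]), hi80 = as.numeric(fcst$upper[, 1]),
  lo95 = as.numeric(fcst$lower[, 2]), hi95 = as.numeric(fcst$upper[, 2])
)
write_csv(fcst_df, "revenue_forecast.csv")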
Thanks for taking the time to read!