#Packages

library(dplyr)      #data wrangling
library(stringr)    #string manipulation
library(tidyr)      #data tidying
library(forcats)    #working with factors
library(ggplot2)    #basic visualization
library(modeltime)  #modelling time series
library(tidymodels) #modelling framework (data splits, model engines)
library(timetk)     #data wrangling and visualization of time series
library(lubridate)  #working with dates and datetimes
library(tidyverse)  #loads the core tidyverse (includes several of the packages above)
library(anomalize)  #anomaly detection in time series
library(readr)      #reading csv files

#Open the files

sales <- read_csv("sales.csv")
products <- read_csv("products.csv")
stores <- read_csv("stores.csv")

#Join all of the files and drop the unnecessary columns

Sales<-sales %>%
  left_join(products, by = "Product_ID")%>%
  left_join(stores, by = "Store_ID")
Sales<-Sales[-c(1,3,4,13)] #Drop Sale_ID, Store ID, Product ID, Store open date
Sales
## # A tibble: 829,262 x 9
##    Date       Units Product_Name     Product_Category Product_Cost Product_Price
##    <date>     <dbl> <chr>            <chr>            <chr>        <chr>        
##  1 2017-01-01     1 Chutes & Ladders Games            $9.99        $12.99       
##  2 2017-01-01     1 Action Figure    Toys             $9.99        $15.99       
##  3 2017-01-01     1 Deck Of Cards    Games            $3.99        $6.99        
##  4 2017-01-01     1 Dart Gun         Sports & Outdoo~ $11.99       $15.99       
##  5 2017-01-01     1 Lego Bricks      Toys             $34.99       $39.99       
##  6 2017-01-01     1 Splash Balls     Sports & Outdoo~ $7.99        $8.99        
##  7 2017-01-01     1 Dart Gun         Sports & Outdoo~ $11.99       $15.99       
##  8 2017-01-01     1 Animal Figures   Toys             $9.99        $12.99       
##  9 2017-01-01     1 Mini Ping Pong ~ Sports & Outdoo~ $6.99        $9.99        
## 10 2017-01-01     1 Deck Of Cards    Games            $3.99        $6.99        
## # ... with 829,252 more rows, and 3 more variables: Store_Name <chr>,
## #   Store_City <chr>, Store_Location <chr>
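
Dropping columns by position breaks silently if the join order ever changes; a safer sketch drops them by name instead (assuming the ID and open-date columns carry the names the comment above suggests):

#Sales<-Sales%>%
#  select(-Sale_ID, -Store_ID, -Product_ID, -Store_Open_Date)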

#Remove the "$" symbol from the Product_Cost and Product_Price columns, convert them to numeric, and add new columns for the total cost and total price

Sales<-Sales%>%
  mutate(Product_Cost=str_replace(Product_Cost,"\\$",""))%>%
  mutate(Product_Price=str_replace(Product_Price,"\\$",""))%>%
  mutate(Product_Cost=as.numeric(Product_Cost))%>%
  mutate(Product_Price=as.numeric(Product_Price))%>%
  mutate(Total_Cost=Units*Product_Cost)%>%
  mutate(Total_Price=Units*Product_Price)
Sales
## # A tibble: 829,262 x 11
##    Date       Units Product_Name     Product_Category Product_Cost Product_Price
##    <date>     <dbl> <chr>            <chr>                   <dbl>         <dbl>
##  1 2017-01-01     1 Chutes & Ladders Games                    9.99         13.0 
##  2 2017-01-01     1 Action Figure    Toys                     9.99         16.0 
##  3 2017-01-01     1 Deck Of Cards    Games                    3.99          6.99
##  4 2017-01-01     1 Dart Gun         Sports & Outdoo~        12.0          16.0 
##  5 2017-01-01     1 Lego Bricks      Toys                    35.0          40.0 
##  6 2017-01-01     1 Splash Balls     Sports & Outdoo~         7.99          8.99
##  7 2017-01-01     1 Dart Gun         Sports & Outdoo~        12.0          16.0 
##  8 2017-01-01     1 Animal Figures   Toys                     9.99         13.0 
##  9 2017-01-01     1 Mini Ping Pong ~ Sports & Outdoo~         6.99          9.99
## 10 2017-01-01     1 Deck Of Cards    Games                    3.99          6.99
## # ... with 829,252 more rows, and 5 more variables: Store_Name <chr>,
## #   Store_City <chr>, Store_Location <chr>, Total_Cost <dbl>, Total_Price <dbl>
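
The same cleanup can be done in one step with readr::parse_number(), which strips the "$" and converts to numeric at once:

#Sales<-Sales%>%
#  mutate(across(c(Product_Cost, Product_Price), readr::parse_number),
#         Total_Cost = Units*Product_Cost,
#         Total_Price = Units*Product_Price)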

#Extract the day and month from the Date column into new columns, and reorder the columns for readability

Sales<-Sales %>%
  mutate(Day = lubridate::day(Date),
         Month = month.name[lubridate::month(Date)]) #index month.name by the month number

Sales<-Sales[, c(1,12,13,2,3,4,5,6,10,11,7,8,9)]
Sales
## # A tibble: 829,262 x 13
##    Date         Day Month   Units Product_Name     Product_Category Product_Cost
##    <date>     <int> <chr>   <dbl> <chr>            <chr>                   <dbl>
##  1 2017-01-01     1 January     1 Chutes & Ladders Games                    9.99
##  2 2017-01-01     1 January     1 Action Figure    Toys                     9.99
##  3 2017-01-01     1 January     1 Deck Of Cards    Games                    3.99
##  4 2017-01-01     1 January     1 Dart Gun         Sports & Outdoo~        12.0 
##  5 2017-01-01     1 January     1 Lego Bricks      Toys                    35.0 
##  6 2017-01-01     1 January     1 Splash Balls     Sports & Outdoo~         7.99
##  7 2017-01-01     1 January     1 Dart Gun         Sports & Outdoo~        12.0 
##  8 2017-01-01     1 January     1 Animal Figures   Toys                     9.99
##  9 2017-01-01     1 January     1 Mini Ping Pong ~ Sports & Outdoo~         6.99
## 10 2017-01-01     1 January     1 Deck Of Cards    Games                    3.99
## # ... with 829,252 more rows, and 6 more variables: Product_Price <dbl>,
## #   Total_Cost <dbl>, Total_Price <dbl>, Store_Name <chr>, Store_City <chr>,
## #   Store_Location <chr>
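
An alternative sketch avoids positional reordering: lubridate can return the full month name directly, and dplyr::relocate() moves the new columns by name. Note that label = TRUE yields an ordered factor rather than a character vector, which would also simplify the fct_rev(factor(...)) calls further below to fct_rev(Month):

#Sales<-Sales%>%
#  mutate(Day = lubridate::day(Date),
#         Month = lubridate::month(Date, label = TRUE, abbr = FALSE))%>%
#  relocate(Day, Month, .after = Date)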

#Check for NA values

colSums(is.na(Sales))
##             Date              Day            Month            Units 
##                0                0                0                0 
##     Product_Name Product_Category     Product_Cost    Product_Price 
##                0                0                0                0 
##       Total_Cost      Total_Price       Store_Name       Store_City 
##                0                0                0                0 
##   Store_Location 
##                0
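
No missing values anywhere. If there were, a quick sketch to inspect the offending rows uses dplyr's if_any():

#Sales%>%filter(if_any(everything(), is.na))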

#Sales Composition Each Day Based on Product Category

Sales%>%
  group_by(Day, Product_Category)%>%
  summarise(Total.Price = sum(Total_Price))%>%
  
  ggplot(aes(
    x=Day,
    y=Total.Price,
    fill=Product_Category))+
  geom_col(position = "stack")+
  theme_bw()+
  theme(legend.position = "bottom",
        legend.title = element_blank())+
  scale_x_continuous(breaks = 1:31)+
  labs(title = "Sales Composition Each Day Based on Product Category",
       subtitle = "January 2017 - September 2018",
       x = "Day",
       y = "Total Sales")
## `summarise()` has grouped output by 'Day'. You can override using the `.groups` argument.
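
This message is harmless: summarise() drops the last grouping variable and keeps the rest. To silence it here and in the chunks below, pass .groups explicitly, e.g.:

#summarise(Total.Price = sum(Total_Price), .groups = "drop")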

#Total Units Sold Composition Each Day Based on Product Category

Sales%>%
  group_by(Day, Product_Category)%>%
  summarise(count = sum(Units))%>% #sum the units sold, rather than counting rows
  
  ggplot(aes(
    x=Day,
    y=count,
    fill=Product_Category))+
  geom_col(position = "stack")+
  theme_bw()+
  theme(legend.position = "bottom",
        legend.title = element_blank())+
  scale_x_continuous(breaks = 1:31)+
  labs(title = "Total Unit Sold Composition Each Day Based on Product Category",
       subtitle = "January 2017 - September 2018",
       x = "Day",
       y = "Total Unit")
## `summarise()` has grouped output by 'Day'. You can override using the `.groups` argument.

#Sales Composition Each Month Based on Product Category

Sales%>%
  group_by(Month, Product_Category)%>%
  summarise(Total.Price = sum(Total_Price))%>%
  
  ggplot(aes(
    x=fct_rev(factor(Month,levels=month.name)),
    y=Total.Price,
    fill=Product_Category))+
  geom_col(position = "stack")+
  theme_bw()+
  coord_flip()+
  theme(legend.position = "bottom",
        legend.title = element_blank())+
  labs(title = "Sales Composition Each Month Based on Product Category",
       subtitle = "Sales From January 2017 - September 2018",
       x = "Month",
       y = "Total Sales")
## `summarise()` has grouped output by 'Month'. You can override using the `.groups` argument.

#Total Units Sold Composition Each Month Based on Product Category

Sales%>%
  group_by(Month, Product_Category)%>%
  summarise(count = sum(Units))%>%
  
  ggplot(aes(
    x=fct_rev(factor(Month,levels=month.name)),
    y=count,
    fill=Product_Category))+
  geom_col(position = "stack")+
  theme_bw()+
  coord_flip()+
  theme(legend.position = "bottom",
        legend.title = element_blank())+
  labs(title = "Total Unit Sold Composition Each Month Based on Product Category",
       subtitle = "January 2017 - September 2018",
       x = "Month",
       y = "Total Unit")
## `summarise()` has grouped output by 'Month'. You can override using the `.groups` argument.

#Let's check the top 10 products

Sales%>%
  group_by(Product_Name)%>%
  summarise(Total.Price=sum(Total_Price))%>%
  arrange(desc(Total.Price))%>%
  head(10)%>%

ggplot(aes(
  x=reorder(Product_Name,Total.Price), 
  y = Total.Price, 
  label = Total.Price))+
geom_col(aes(fill = Total.Price), show.legend = FALSE)+
coord_flip()+
theme_bw()+
theme(axis.text = element_text(size = 10), 
      axis.title = element_text(size = 12, colour = "black"))+
geom_label(aes(fill = Total.Price),
           colour = "white",
           size = 4,
           show.legend = FALSE,
           position = position_stack(0.9))+
labs(title = "Total Sales Based on Product Names",
     subtitle = "Top 10 Product From January 2017 - September 2018",
     x = "Product Names",
     y = "Total Sales")

#Now let's see which stores made the highest sales

Sales%>%
  group_by(Store_Name)%>%
  summarise(Total.Price=sum(Total_Price))%>%
  arrange(desc(Total.Price)) %>%
  head(10)%>%


ggplot(aes(
  x=reorder(Store_Name,Total.Price), 
  y = Total.Price, 
  label = Total.Price))+
geom_col(aes(fill = Total.Price), show.legend = FALSE)+
coord_flip()+
theme_bw()+
theme(axis.text = element_text(size = 8), 
      axis.title = element_text(size = 10, colour = "black"))+
geom_label(aes(fill = Total.Price),
           colour = "white",
           size = 4,
           show.legend = FALSE,
           position = position_stack(0.9))+
labs(title = "Total Sales Based on Stores Name",
     subtitle = "Top 10 Stores From January 2017 - September 2018",
     x = "Store Names",
     y = "Total Sales")

Maven Toys Ciudad de Mexico 1 has the highest sales.

#Now let's look at sales by store location

Sales%>%
  group_by(Store_Location)%>%
  summarise(Total.Price=sum(Total_Price))%>%
  arrange(desc(Total.Price))%>%
  
ggplot(aes(
  x=reorder(Store_Location, Total.Price), 
  y =Total.Price, 
  label =round(Total.Price, digits = 2)))+
geom_col(aes(fill = Total.Price), show.legend = FALSE)+
coord_flip()+
theme_bw()+
theme(axis.text = element_text(size = 10), 
      axis.title = element_text(size = 12, colour = "black"))+
geom_label(aes(fill = Total.Price),
           colour = "white",
           size = 5,
           show.legend = FALSE,
           position = position_stack(.9))+
labs(title = "Store Location Total Sales",
     subtitle = "January 2017 - September 2018",
     x = "Location",
     y = "Total Sales")

#Percentage of Sales by Store Location

Sales%>%
  select(Store_Location, Product_Category, Total_Price)%>%
  group_by(Store_Location, Product_Category)%>%
  summarise(Total.Price = sum(Total_Price))%>%
  mutate(Percent = round(Total.Price / sum(Total.Price)*100 , digits = 2))%>%
  
ggplot(aes(
  x=Store_Location,
  y=Percent,
  fill=Product_Category))+
geom_col()+
geom_text(aes(label = paste(Percent,"%"), y = Percent), 
            position = position_stack(vjust = 0.5))+
coord_flip()+
theme_bw()+
theme(legend.position = "bottom",
      legend.title = element_blank())+

labs(title = "Sales Composition in Store Location Based on Product Category",
     subtitle = "January 2017 - September 2018",
     x = "Location",
     y = "Percentage")
## `summarise()` has grouped output by 'Store_Location'. You can override using the `.groups` argument.

#Total Units Sold Composition in Store Location

Sales%>%
  select(Store_Location, Product_Category, Units)%>%
  group_by(Store_Location, Product_Category)%>%
  summarise(count = sum(Units))%>%

ggplot(aes(
  x=reorder(Store_Location,count),
  y=count,
  fill=Product_Category))+
geom_col(position = "stack")+
coord_flip()+
theme_bw()+
theme(legend.position = "bottom",
      legend.title = element_blank())+
labs(title = "Total Unit Sold Composition in Store Location Based on Product Category",
     subtitle = "January 2017 - September 2018",
     x = "Location",
     y = "Total Unit")
## `summarise()` has grouped output by 'Store_Location'. You can override using the `.groups` argument.

#Units Sold by Day of the Week

Sales %>%
  mutate(DayName=lubridate::wday(Date, label = TRUE))%>%
  group_by(DayName,Product_Category)%>%
  summarise(count = sum(Units))%>%

ggplot(aes(
  x=reorder(DayName,count),
  y=count,
  fill=Product_Category))+
geom_col(position = "stack")+
coord_flip()+
theme_bw()+
theme(legend.position = "bottom",
      legend.title = element_blank())+
labs(title = "Total Unit Sold Composition Based on Product Category",
     subtitle = "January 2017 - September 2018",
     x = "Product Category",
     y = "Total Unit")
## `summarise()` has grouped output by 'DayName'. You can override using the `.groups` argument.

#Units Sold by Day of the Week per Store Location

Sales %>%
  mutate(DayName=lubridate::wday(Date, label = TRUE))%>%
  group_by(DayName,Product_Category, Store_Location)%>%
  summarise(count = sum(Units))%>%
  
ggplot(aes(
  x=DayName,
  y=count,
  fill=Product_Category))+
geom_col()+
facet_wrap(~Store_Location)+
theme_bw()+
theme(legend.position = "bottom",
      legend.title=element_blank())+
labs(title = "",
     x = "Day Name",
     y = "Total Unit")
## `summarise()` has grouped output by 'DayName', 'Product_Category'. You can override using the `.groups` argument.

#Daily sales time series

sales_daily<-Sales%>%
  mutate(Date=floor_date(Date, unit="day"))%>%
  group_by(Date)%>%
  summarise(Total.Price=sum(Total_Price))
sales_daily%>%plot_time_series(Date,Total.Price,
                               .title = "Daily Sales January 2017 - September 2018")

#Detect anomalies and clean them

sales_daily%>%
  plot_anomaly_diagnostics(Date, Total.Price)
sales_daily2<-sales_daily%>%
  time_decompose(Total.Price)%>%
  anomalize(remainder)%>%
  clean_anomalies()%>%
  select(Date, observed_cleaned)%>%
  rename(Total.Price=observed_cleaned)
sales_daily2%>%plot_time_series(Date, Total.Price,
                               .title = "Daily Sales January 2017 - September 2018")

#Check for stationarity

library(tseries) #for the ADF and PP stationarity tests
sales_daily_test<-ts(sales_daily2$Total.Price, start = c(2017, 1), frequency = 365.25)

#Test for stationarity with the ADF and PP tests

adf.test(sales_daily_test)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sales_daily_test
## Dickey-Fuller = -4.1858, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
pp.test(sales_daily_test)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  sales_daily_test
## Dickey-Fuller Z(alpha) = -214.28, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary

Since the p-value is below 0.05 in both the ADF and PP tests, we reject the null hypothesis of a unit root: the series is stationary.
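
Had either test failed to reject, the usual remedy before fitting ARIMA-family models is differencing, e.g.:

#adf.test(diff(sales_daily_test)) #re-test the first-differenced series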

sales_daily2%>%
  plot_seasonal_diagnostics(Date, Total.Price,
                            .feature_set = c("wday.lbl","month.lbl"))

#Now let's split the data into training and testing sets (80% train; 20% test)

splits<-initial_time_split(sales_daily2, prop = 0.8)

#You can also use this:
#splits<-time_series_split(sales_daily2,
#                          assess = "4 months",
#                          cumulative = TRUE) #use all of the prior data as the training set
  
splits%>%
  tk_time_series_cv_plan()%>%
  plot_time_series_cv_plan(Date,Total.Price,
                           .title = "Data Split into Training and Testing")

#SNAIVE

model_snaive<-naive_reg()%>%
  set_engine("snaive")%>%
  fit(Total.Price~Date, training(splits))
model_snaive
## parsnip model object
## 
## Fit time:  101ms 
## SNAIVE [7]
## --------
## Model: 
## # A tibble: 7 x 2
##   Date        value
##   <date>      <dbl>
## 1 2018-05-19 36488.
## 2 2018-05-20 41634.
## 3 2018-05-21 20881.
## 4 2018-05-22 19508.
## 5 2018-05-23 20939.
## 6 2018-05-24 19333.
## 7 2018-05-25 30387.

#ETS

model_ets<-exp_smoothing()%>%
  set_engine("ets")%>%
  fit(Total.Price~Date, training(splits))
model_ets
## parsnip model object
## 
## Fit time:  1.7s 
## ETS(M,N,M) 
## 
## Call:
##  forecast::ets(y = outcome, model = model_ets, damped = damping_ets,  
## 
##  Call:
##      alpha = alpha, beta = beta, gamma = gamma) 
## 
##   Smoothing parameters:
##     alpha = 0.1706 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 18177.9105 
##     s = 1.3066 1.0586 0.8522 0.8276 0.7561 0.8397
##            1.3593
## 
##   sigma:  0.1629
## 
##      AIC     AICc      BIC 
## 11486.94 11487.38 11529.28

#ARIMA

model_arima<-arima_reg()%>%
  set_engine("auto_arima")%>%
  fit(Total.Price~Date, training(splits))
model_arima
## parsnip model object
## 
## Fit time:  1.4s 
## Series: outcome 
## ARIMA(1,0,1)(1,1,1)[7] 
## 
## Coefficients:
##          ar1      ma1     sar1     sma1
##       0.9492  -0.7739  -0.1426  -0.8474
## s.e.  0.0294   0.0600   0.0513   0.0344
## 
## sigma^2 estimated as 12476397:  log likelihood=-4825.85
## AIC=9661.69   AICc=9661.82   BIC=9682.8

#Prophet

model_prophet<-prophet_reg(
  seasonality_yearly  = TRUE)%>%
  set_engine("prophet")%>%
  fit(Total.Price~Date, training(splits))
model_prophet
## parsnip model object
## 
## Fit time:  1.9s 
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 25
## - changepoint.range: 0.8
## - yearly.seasonality: 'TRUE'
## - weekly.seasonality: 'auto'
## - daily.seasonality: 'auto'
## - seasonality.mode: 'additive'
## - changepoint.prior.scale: 0.05
## - seasonality.prior.scale: 10
## - holidays.prior.scale: 10
## - logistic_cap: NULL
## - logistic_floor: NULL
## - extra_regressors: 0

#Modeltime table

model_tbl<-modeltime_table(
  model_snaive,
  model_ets,
  model_arima,
  model_prophet)
model_tbl
## # Modeltime Table
## # A tibble: 4 x 3
##   .model_id .model   .model_desc           
##       <int> <list>   <chr>                 
## 1         1 <fit[+]> SNAIVE [7]            
## 2         2 <fit[+]> ETS(M,N,M)            
## 3         3 <fit[+]> ARIMA(1,0,1)(1,1,1)[7]
## 4         4 <fit[+]> PROPHET

#Calibrate

calibrate_tbl<-model_tbl%>%
  modeltime_calibrate(testing(splits))
calibrate_tbl
## # Modeltime Table
## # A tibble: 4 x 5
##   .model_id .model   .model_desc            .type .calibration_data 
##       <int> <list>   <chr>                  <chr> <list>            
## 1         1 <fit[+]> SNAIVE [7]             Test  <tibble [128 x 4]>
## 2         2 <fit[+]> ETS(M,N,M)             Test  <tibble [128 x 4]>
## 3         3 <fit[+]> ARIMA(1,0,1)(1,1,1)[7] Test  <tibble [128 x 4]>
## 4         4 <fit[+]> PROPHET                Test  <tibble [128 x 4]>

~Calibration is how confidence intervals and accuracy metrics are determined.
~Calibration Data is simply forecasting predictions and residuals that are calculated from out-of-sample data.
~After calibrating, the calibration data follows the data through the forecasting workflow.
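
To inspect the calibration data itself — the test-set forecasts and residuals per model — pull one of the list-column entries, e.g.:

#calibrate_tbl$.calibration_data[[1]] #test-set predictions and residuals for the SNAIVE model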

#Accuracy

calibrate_tbl%>%modeltime_accuracy()
## # A tibble: 4 x 9
##   .model_id .model_desc            .type   mae  mape  mase smape  rmse   rsq
##       <int> <chr>                  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1         1 SNAIVE [7]             Test  4348.  19.8 0.778  17.6 5522. 0.700
## 2         2 ETS(M,N,M)             Test  3870.  19.3 0.692  16.8 4689. 0.721
## 3         3 ARIMA(1,0,1)(1,1,1)[7] Test  3965.  19.2 0.709  16.8 4917. 0.712
## 4         4 PROPHET                Test  3172.  15.1 0.567  14.0 3953. 0.733
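
Prophet wins on every metric here (lowest MAE, MAPE, MASE, SMAPE, and RMSE; highest RSQ). modeltime can also render this as an interactive table:

#calibrate_tbl%>%
#  modeltime_accuracy()%>%
#  table_modeltime_accuracy()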

#Test Set Visualization

calibrate_tbl%>%
  modeltime_forecast(
    new_data = testing(splits),
    actual_data = sales_daily2)%>%
  plot_modeltime_forecast()%>%
  plotly::layout(
    legend=list(
      orientation="h",
      xanchor="center",
      x=0.5,
      y=-0.2))

#Forecast the future

future_forecast_tbl<-calibrate_tbl%>%
  modeltime_refit(sales_daily2)%>% #retrain on full data
  modeltime_forecast(
    h = "3 months",
    actual_data = sales_daily2)

future_forecast_tbl%>%
  plot_modeltime_forecast()%>%
  plotly::layout(
    legend=list(
      orientation="h",
      xanchor="center",
      x=0.5,
      y=-0.2))
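
When no single model dominates, the companion modeltime.ensemble package (assuming it is installed) can combine them — a minimal sketch:

#library(modeltime.ensemble)
#model_ensemble<-model_tbl%>%
#  ensemble_average(type = "mean")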

#Forecast with a tuned Prophet model

model_prophet2<-prophet_reg(
  growth = "linear", #see the trend
  prior_scale_changepoints = 0.4, #Parameter modulating the flexibility of the automatic changepoint selection
  prior_scale_seasonality = 5, #modulating the strength of the seasonality model
  seasonality_yearly  = TRUE,
  seasonality_weekly = TRUE,
  seasonality_daily = FALSE)%>%
  set_engine("prophet")%>%
  fit(Total.Price~Date, training(splits))
model_prophet2
## parsnip model object
## 
## Fit time:  150ms 
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 25
## - changepoint.range: 0.8
## - yearly.seasonality: 'TRUE'
## - weekly.seasonality: 'TRUE'
## - daily.seasonality: 'FALSE'
## - seasonality.mode: 'additive'
## - changepoint.prior.scale: 0.4
## - seasonality.prior.scale: 5
## - holidays.prior.scale: 10
## - logistic_cap: NULL
## - logistic_floor: NULL
## - extra_regressors: 0
model_tbl2<-modeltime_table(
  model_prophet,
  model_prophet2)

calibrate_tbl2<-model_tbl2%>%
  modeltime_calibrate(testing(splits))

calibrate_tbl2%>%modeltime_accuracy()
## # A tibble: 2 x 9
##   .model_id .model_desc .type   mae  mape  mase smape  rmse   rsq
##       <int> <chr>       <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1         1 PROPHET     Test  3172.  15.1 0.567  14.0 3953. 0.733
## 2         2 PROPHET     Test  3185.  14.0 0.570  13.9 3957. 0.737
calibrate_tbl2%>%
  modeltime_forecast(
    new_data = testing(splits),
    actual_data = sales_daily2)%>%
  plot_modeltime_forecast(.title = "Test Plot")%>%
  plotly::layout(
    legend=list(
      orientation="h",
      xanchor="center",
      x=0.5,
      y=-0.2))
future_forecast_tbl2<-calibrate_tbl2%>%
  modeltime_refit(sales_daily2)%>% #retrain on full data
  modeltime_forecast(
    h = "3 months",
    actual_data = sales_daily2)

future_forecast_tbl2%>%
  plot_modeltime_forecast(.title = "Forecast 3 Months")%>%
  plotly::layout(
    legend=list(
      orientation="h",
      xanchor="center",
      x=0.5,
      y=-0.2))
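
To extract the forecast values behind the plot, filter the forecast tibble on its .key column:

#future_forecast_tbl2%>%
#  filter(.key == "prediction")%>%
#  select(.model_desc, .index, .value, .conf_lo, .conf_hi)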