#Packages
library(dplyr)
library(stringr)
library(tidyr)
library(forcats)
library(ggplot2) #basic visualization
library(modeltime) #time series modelling and forecasting
library(tidymodels) #tidy modelling framework (rsample, parsnip, ...)
library(timetk) #data wrangling and visualization of time series
library(lubridate) #working with dates and datetimes
library(tidyverse) #data wrangling (loads several of the packages above)
library(anomalize) #anomaly detection in time series
library(readr)
#Read the data files
sales <- read_csv("sales.csv")
products <- read_csv("products.csv")
stores <- read_csv("stores.csv")
#Join all of the files and drop the unnecessary columns
Sales<-sales %>%
left_join(products, by="Product_ID")%>%
left_join(stores, by="Store_ID")
Sales<-Sales[-c(1,3,4,13)] #Drop Sale_ID, Store ID, Product ID, Store open date
Sales
## # A tibble: 829,262 x 9
## Date Units Product_Name Product_Category Product_Cost Product_Price
## <date> <dbl> <chr> <chr> <chr> <chr>
## 1 2017-01-01 1 Chutes & Ladders Games $9.99 $12.99
## 2 2017-01-01 1 Action Figure Toys $9.99 $15.99
## 3 2017-01-01 1 Deck Of Cards Games $3.99 $6.99
## 4 2017-01-01 1 Dart Gun Sports & Outdoo~ $11.99 $15.99
## 5 2017-01-01 1 Lego Bricks Toys $34.99 $39.99
## 6 2017-01-01 1 Splash Balls Sports & Outdoo~ $7.99 $8.99
## 7 2017-01-01 1 Dart Gun Sports & Outdoo~ $11.99 $15.99
## 8 2017-01-01 1 Animal Figures Toys $9.99 $12.99
## 9 2017-01-01 1 Mini Ping Pong ~ Sports & Outdoo~ $6.99 $9.99
## 10 2017-01-01 1 Deck Of Cards Games $3.99 $6.99
## # ... with 829,252 more rows, and 3 more variables: Store_Name <chr>,
## # Store_City <chr>, Store_Location <chr>
#Remove the "$" symbol from Product_Cost and Product_Price, convert them to numeric, and add new columns for total cost and total price
Sales<-Sales%>%
mutate(Product_Cost=str_replace(Product_Cost,"\\$",""))%>%
mutate(Product_Price=str_replace(Product_Price,"\\$",""))%>%
mutate(Product_Cost=as.numeric(Product_Cost))%>%
mutate(Product_Price=as.numeric(Product_Price))%>%
mutate(Total_Cost=Units*Product_Cost)%>%
mutate(Total_Price=Units*Product_Price)
Sales
## # A tibble: 829,262 x 11
## Date Units Product_Name Product_Category Product_Cost Product_Price
## <date> <dbl> <chr> <chr> <dbl> <dbl>
## 1 2017-01-01 1 Chutes & Ladders Games 9.99 13.0
## 2 2017-01-01 1 Action Figure Toys 9.99 16.0
## 3 2017-01-01 1 Deck Of Cards Games 3.99 6.99
## 4 2017-01-01 1 Dart Gun Sports & Outdoo~ 12.0 16.0
## 5 2017-01-01 1 Lego Bricks Toys 35.0 40.0
## 6 2017-01-01 1 Splash Balls Sports & Outdoo~ 7.99 8.99
## 7 2017-01-01 1 Dart Gun Sports & Outdoo~ 12.0 16.0
## 8 2017-01-01 1 Animal Figures Toys 9.99 13.0
## 9 2017-01-01 1 Mini Ping Pong ~ Sports & Outdoo~ 6.99 9.99
## 10 2017-01-01 1 Deck Of Cards Games 3.99 6.99
## # ... with 829,252 more rows, and 5 more variables: Store_Name <chr>,
## # Store_City <chr>, Store_Location <chr>, Total_Cost <dbl>, Total_Price <dbl>
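#Aside (not run): readr::parse_number() strips the "$" and converts to numeric in
#one step; a sketch equivalent to the str_replace()/as.numeric() pairs above:
#Sales<-Sales%>%
#  mutate(across(c(Product_Cost, Product_Price), readr::parse_number),
#         Total_Cost=Units*Product_Cost,
#         Total_Price=Units*Product_Price)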
#Extract day and month from the Date column into new columns, then reorder the columns for readability
Sales<-Sales %>%
mutate(Day = lubridate::day(Date),
Month = month.name[lubridate::month(Date)])
Sales<-Sales[, c(1,12,13,2,3,4,5,6,10,11,7,8,9)]
Sales
## # A tibble: 829,262 x 13
## Date Day Month Units Product_Name Product_Category Product_Cost
## <date> <int> <chr> <dbl> <chr> <chr> <dbl>
## 1 2017-01-01 1 January 1 Chutes & Ladders Games 9.99
## 2 2017-01-01 1 January 1 Action Figure Toys 9.99
## 3 2017-01-01 1 January 1 Deck Of Cards Games 3.99
## 4 2017-01-01 1 January 1 Dart Gun Sports & Outdoo~ 12.0
## 5 2017-01-01 1 January 1 Lego Bricks Toys 35.0
## 6 2017-01-01 1 January 1 Splash Balls Sports & Outdoo~ 7.99
## 7 2017-01-01 1 January 1 Dart Gun Sports & Outdoo~ 12.0
## 8 2017-01-01 1 January 1 Animal Figures Toys 9.99
## 9 2017-01-01 1 January 1 Mini Ping Pong ~ Sports & Outdoo~ 6.99
## 10 2017-01-01 1 January 1 Deck Of Cards Games 3.99
## # ... with 829,252 more rows, and 6 more variables: Product_Price <dbl>,
## # Total_Cost <dbl>, Total_Price <dbl>, Store_Name <chr>, Store_City <chr>,
## # Store_Location <chr>
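#Note (not run): the positional reorder above breaks silently if upstream columns
#change; a name-based equivalent using dplyr::relocate() is more robust:
#Sales<-Sales%>%
#  relocate(Day, Month, .after = Date)%>%
#  relocate(Total_Cost, Total_Price, .after = Product_Price)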
#Check for missing values in each column
colSums(is.na(Sales))
## Date Day Month Units
## 0 0 0 0
## Product_Name Product_Category Product_Cost Product_Price
## 0 0 0 0
## Total_Cost Total_Price Store_Name Store_City
## 0 0 0 0
## Store_Location
## 0
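#No missing values anywhere. Had any appeared, a minimal cleanup sketch (not run):
#Sales<-Sales%>%tidyr::drop_na() #or impute with tidyr::replace_na(), column by column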
#Sales Composition Each Day by Product Category
Sales%>%
group_by(Day, Product_Category)%>%
summarise(Total.Price = sum(Total_Price))%>%
ggplot(aes(
x=Day,
y=Total.Price,
fill=Product_Category))+
geom_col(position = "stack")+
theme_bw()+
theme(legend.position = "bottom",
legend.title = element_blank())+
scale_x_continuous(breaks = 1:31)+
labs(title = "Sales Composition Each Day Based on Product Category",
subtitle = "January 2017 - September 2018",
x = "Day",
y = "Total Sales")
## `summarise()` has grouped output by 'Day'. You can override using the `.groups` argument.
#Total Units Sold Each Day by Product Category
Sales%>%
group_by(Day, Product_Category)%>%
summarise(count=sum(Units))%>% #sum Units rather than counting rows, since the title reports units sold
ggplot(aes(
x=Day,
y=count,
fill=Product_Category))+
geom_col(position = "stack")+
theme_bw()+
theme(legend.position = "bottom",
legend.title = element_blank())+
scale_x_continuous(breaks = 1:31)+
labs(title = "Total Unit Sold Composition Each Day Based on Product Category",
subtitle = "January 2017 - September 2018",
x = "Day",
y = "Total Unit")
## `summarise()` has grouped output by 'Day'. You can override using the `.groups` argument.
#Sales Composition Each Month by Product Category
Sales%>%
group_by(Month, Product_Category)%>%
summarise(Total.Price = sum(Total_Price))%>%
ggplot(aes(
x=fct_rev(factor(Month,levels=month.name)),
y=Total.Price,
fill=Product_Category))+
geom_col(position = "stack")+
theme_bw()+
coord_flip()+
theme(legend.position = "bottom",
legend.title = element_blank())+
labs(title = "Sales Composition Each Month Based on Product Category",
subtitle = "Sales From January 2017 - September 2018",
x = "Month",
y = "Total Sales")
## `summarise()` has grouped output by 'Month'. You can override using the `.groups` argument.
#Total Units Sold Each Month by Product Category
Sales%>%
group_by(Month, Product_Category)%>%
summarise(count=sum(Units))%>% #sum Units rather than counting rows
ggplot(aes(
x=fct_rev(factor(Month,levels=month.name)),
y=count,
fill=Product_Category))+
geom_col(position = "stack")+
theme_bw()+
coord_flip()+
theme(legend.position = "bottom",
legend.title = element_blank())+
labs(title = "Total Unit Sold Composition Each Month Based on Product Category",
subtitle = "January 2017 - September 2018",
x = "Month",
y = "Total Unit")
## `summarise()` has grouped output by 'Month'. You can override using the `.groups` argument.
#Let's check the top 10 products
Sales%>%
group_by(Product_Name)%>%
summarise(Total.Price=sum(Total_Price))%>%
arrange(desc(Total.Price))%>%
head(10)%>%
ggplot(aes(
x=reorder(Product_Name,Total.Price),
y = Total.Price,
label = Total.Price))+
geom_col(aes(fill = Total.Price), show.legend = FALSE)+
coord_flip()+
theme_bw()+
theme(axis.text = element_text(size = 10),
axis.title = element_text(size = 12, colour = "black"))+
geom_label(aes(fill = Total.Price),
colour = "white",
size = 4,
show.legend = FALSE,
position = position_stack(0.9))+
labs(title = "Total Sales Based on Product Names",
subtitle = "Top 10 Product From January 2017 - September 2018",
x = "Product Names",
y = "Total Sales")
#Now let's see which store made the highest sales
Sales%>%
group_by(Store_Name)%>%
summarise(Total.Price=sum(Total_Price))%>%
arrange(desc(Total.Price)) %>%
head(10)%>%
ggplot(aes(
x=reorder(Store_Name,Total.Price),
y = Total.Price,
label = Total.Price))+
geom_col(aes(fill = Total.Price), show.legend = FALSE)+
coord_flip()+
theme_bw()+
theme(axis.text = element_text(size = 8),
axis.title = element_text(size = 10, colour = "black"))+
geom_label(aes(fill = Total.Price),
colour = "white",
size = 4,
show.legend = FALSE,
position = position_stack(0.9))+
labs(title = "Total Sales Based on Stores Name",
subtitle = "Top 10 Stores From January 2017 - September 2018",
x = "Store Names",
y = "Total Sales")
Maven Toys Ciudad de Mexico 1 has the highest sales.
#Now let's look at sales by store location
Sales%>%
group_by(Store_Location)%>%
summarise(Total.Price=sum(Total_Price))%>%
arrange(desc(Total.Price))%>%
ggplot(aes(
x=Store_Location,
y =Total.Price,
label =round(Total.Price, digits = 2)))+
geom_col(aes(fill = Total.Price), show.legend = FALSE)+
coord_flip()+
theme_bw()+
theme(axis.text = element_text(size = 10),
axis.title = element_text(size = 12, colour = "black"))+
geom_label(aes(fill = Total.Price),
colour = "white",
size = 5,
show.legend = FALSE,
position = position_stack(.9))+
labs(title = "Store Location Total Sales",
subtitle = "January 2017 - September 2018",
x = "Location",
y = "Total Sales")
#Percentage of sales in each store location by product category
Sales%>%
select(Store_Location, Product_Category, Total_Price)%>%
group_by(Store_Location, Product_Category)%>%
summarise(Total.Price = sum(Total_Price))%>%
mutate(Percent = round(Total.Price / sum(Total.Price) * 100, digits = 2))%>% #percent within each Store_Location (summarise leaves the data grouped by it)
ggplot(aes(
x=Store_Location,
y=Percent,
fill=Product_Category))+
geom_bar(stat = "identity")+
geom_text(aes(label = paste(Percent,"%"), y = Percent),
position = position_stack(vjust = 0.5))+
coord_flip()+
theme_bw()+
theme(legend.position = "bottom",
legend.title = element_blank())+
labs(title = "Sales Composition in Store Location Based on Product Category",
subtitle = "January 2017 - September 2018",
x = "Location",
y = "Percentage")
## `summarise()` has grouped output by 'Store_Location'. You can override using the `.groups` argument.
Sales%>%
select(Store_Location, Product_Category, Units)%>%
group_by(Store_Location, Product_Category)%>%
summarise(count=sum(Units))%>% #sum Units rather than counting rows
ggplot(aes(
x=reorder(Store_Location,count),
y=count,
fill=Product_Category))+
geom_col(position = "stack")+
coord_flip()+
theme_bw()+
theme(legend.position = "bottom",
legend.title = element_blank())+
labs(title = "Total Unit Sold Composition in Store Location Based on Product Category",
subtitle = "January 2017 - September 2018",
x = "Location",
y = "Total Unit")
## `summarise()` has grouped output by 'Store_Location'. You can override using the `.groups` argument.
#Total units sold by day of the week
Sales %>%
mutate(DayName=lubridate::wday(Date, label = TRUE))%>%
group_by(DayName,Product_Category)%>%
summarise(count=sum(Units)) %>% #sum Units rather than counting rows
ggplot(aes(
x=reorder(DayName,count),
y=count,
fill=Product_Category))+
geom_col(position = "stack")+
coord_flip()+
theme_bw()+
theme(legend.position = "bottom",
legend.title = element_blank())+
labs(title = "Total Unit Sold Composition Based on Product Category",
subtitle = "January 2017 - September 2018",
x = "Product Category",
y = "Total Unit")
## `summarise()` has grouped output by 'DayName'. You can override using the `.groups` argument.
#Units sold per day of week, faceted by store location
Sales %>%
mutate(DayName=lubridate::wday(Date, label = TRUE))%>%
group_by(DayName,Product_Category, Store_Location)%>%
summarise(count=sum(Units))%>% #sum Units rather than counting rows
ggplot(aes(
x=DayName,
y=count,
fill=Product_Category))+
geom_col()+
facet_wrap(~Store_Location)+
theme_bw()+
theme(legend.position = "bottom",
legend.title=element_blank())+
labs(title = "",
x = "Day Name",
y = "Total Unit")
## `summarise()` has grouped output by 'DayName', 'Product_Category'. You can override using the `.groups` argument.
#Aggregate into a daily sales time series
sales_daily<-Sales%>%
group_by(Date)%>%
summarise(Total.Price=sum(Total_Price))
sales_daily%>%plot_time_series(Date,Total.Price,
.title = "Daily Sales January 2017 - September 2018")
#Detect anomalies and clean them from the series
sales_daily%>%
plot_anomaly_diagnostics(Date, Total.Price)
sales_daily2<-sales_daily%>%
time_decompose(Total.Price)%>%
anomalize(remainder)%>%
clean_anomalies()%>%
select(Date, observed_cleaned)%>%
rename(Total.Price=observed_cleaned)
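#A quick sketch (not run) to count how many days the default IQR method flags as anomalies:
#sales_daily%>%
#  time_decompose(Total.Price)%>%
#  anomalize(remainder)%>%
#  filter(anomaly=="Yes")%>%
#  tally()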
sales_daily2%>%plot_time_series(Date, Total.Price,
.title = "Daily Sales January 2017 - September 2018")
#Check stationarity
library(tseries)
sales_daily_test<-ts(sales_daily2$Total.Price, start = c(2017,1), frequency = 365.25)
#Test for stationarity with the ADF and PP tests
adf.test(sales_daily_test)
##
## Augmented Dickey-Fuller Test
##
## data: sales_daily_test
## Dickey-Fuller = -4.1858, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
pp.test(sales_daily_test)
##
## Phillips-Perron Unit Root Test
##
## data: sales_daily_test
## Dickey-Fuller Z(alpha) = -214.28, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
Since the p-value is below 0.05 in both the ADF and PP tests, we reject the unit-root null hypothesis: the data is stationary.
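#As a complementary check (an addition, not part of the original tests), the KPSS test
#reverses the hypotheses: its null is stationarity, so a large p-value would agree
#with the ADF/PP conclusion above.
#kpss.test(sales_daily_test)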
sales_daily2%>%
plot_seasonal_diagnostics(Date, Total.Price,
.feature_set = c("wday.lbl","month.lbl"))
#Now let's split the data into training and test sets (80% train, 20% test)
splits<-initial_time_split(sales_daily2, prop = 0.8)
#Alternatively:
#splits<-time_series_split(sales_daily2,
#                          assess = "4 months",
#                          cumulative = TRUE) #use all of the prior data as the training set
splits%>%
tk_time_series_cv_plan()%>%
plot_time_series_cv_plan(Date,Total.Price,
.title = "Data Split into Training and Testing")
#Seasonal naive (SNAIVE)
model_snaive<-naive_reg()%>%
set_engine("snaive")%>%
fit(Total.Price~Date, training(splits))
model_snaive
## parsnip model object
##
## Fit time: 101ms
## SNAIVE [7]
## --------
## Model:
## # A tibble: 7 x 2
## Date value
## <date> <dbl>
## 1 2018-05-19 36488.
## 2 2018-05-20 41634.
## 3 2018-05-21 20881.
## 4 2018-05-22 19508.
## 5 2018-05-23 20939.
## 6 2018-05-24 19333.
## 7 2018-05-25 30387.
#ETS
model_ets<-exp_smoothing()%>%
set_engine("ets")%>%
fit(Total.Price~Date, training(splits))
model_ets
## parsnip model object
##
## Fit time: 1.7s
## ETS(M,N,M)
##
## Call:
## forecast::ets(y = outcome, model = model_ets, damped = damping_ets,
##     alpha = alpha, beta = beta, gamma = gamma)
##
## Smoothing parameters:
## alpha = 0.1706
## gamma = 1e-04
##
## Initial states:
## l = 18177.9105
## s = 1.3066 1.0586 0.8522 0.8276 0.7561 0.8397
## 1.3593
##
## sigma: 0.1629
##
## AIC AICc BIC
## 11486.94 11487.38 11529.28
#ARIMA
model_arima<-arima_reg()%>%
set_engine("auto_arima")%>%
fit(Total.Price~Date, training(splits))
model_arima
## parsnip model object
##
## Fit time: 1.4s
## Series: outcome
## ARIMA(1,0,1)(1,1,1)[7]
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.9492 -0.7739 -0.1426 -0.8474
## s.e. 0.0294 0.0600 0.0513 0.0344
##
## sigma^2 estimated as 12476397: log likelihood=-4825.85
## AIC=9661.69 AICc=9661.82 BIC=9682.8
#Prophet
model_prophet<-prophet_reg(
seasonality_yearly = TRUE)%>%
set_engine("prophet")%>%
fit(Total.Price~Date, training(splits))
model_prophet
## parsnip model object
##
## Fit time: 1.9s
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 25
## - changepoint.range: 0.8
## - yearly.seasonality: 'TRUE'
## - weekly.seasonality: 'auto'
## - daily.seasonality: 'auto'
## - seasonality.mode: 'additive'
## - changepoint.prior.scale: 0.05
## - seasonality.prior.scale: 10
## - holidays.prior.scale: 10
## - logistic_cap: NULL
## - logistic_floor: NULL
## - extra_regressors: 0
#Modeltime table
model_tbl<-modeltime_table(
model_snaive,
model_ets,
model_arima,
model_prophet)
model_tbl
## # Modeltime Table
## # A tibble: 4 x 3
## .model_id .model .model_desc
## <int> <list> <chr>
## 1 1 <fit[+]> SNAIVE [7]
## 2 2 <fit[+]> ETS(M,N,M)
## 3 3 <fit[+]> ARIMA(1,0,1)(1,1,1)[7]
## 4 4 <fit[+]> PROPHET
#Calibrate on the test set
calibrate_tbl<-model_tbl%>%
modeltime_calibrate(testing(splits))
calibrate_tbl
## # Modeltime Table
## # A tibble: 4 x 5
## .model_id .model .model_desc .type .calibration_data
## <int> <list> <chr> <chr> <list>
## 1 1 <fit[+]> SNAIVE [7] Test <tibble [128 x 4]>
## 2 2 <fit[+]> ETS(M,N,M) Test <tibble [128 x 4]>
## 3 3 <fit[+]> ARIMA(1,0,1)(1,1,1)[7] Test <tibble [128 x 4]>
## 4 4 <fit[+]> PROPHET Test <tibble [128 x 4]>
~Calibration is how confidence intervals and accuracy metrics are determined.
~Calibration data is simply the forecast predictions and residuals calculated from out-of-sample (test) data.
~After calibrating, the calibration data follows the data through the forecasting workflow.
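#To peek at the calibration data for one model (a sketch, not run; each element of
#.calibration_data is a tibble of test-set predictions and residuals):
#calibrate_tbl%>%filter(.model_id==1)%>%pull(.calibration_data)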
#Accuracy on the test set
calibrate_tbl%>%modeltime_accuracy()
## # A tibble: 4 x 9
## .model_id .model_desc .type mae mape mase smape rmse rsq
## <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 SNAIVE [7] Test 4348. 19.8 0.778 17.6 5522. 0.700
## 2 2 ETS(M,N,M) Test 3870. 19.3 0.692 16.8 4689. 0.721
## 3 3 ARIMA(1,0,1)(1,1,1)[7] Test 3965. 19.2 0.709 16.8 4917. 0.712
## 4 4 PROPHET Test 3172. 15.1 0.567 14.0 3953. 0.733
#Test Set Visualization
calibrate_tbl%>%
modeltime_forecast(
new_data = testing(splits),
actual_data = sales_daily2)%>%
plot_modeltime_forecast()%>%
plotly::layout(
legend=list(
orientation="h",
xanchor="center",
x=0.5,
y=-0.2))
#Forecast the future (3 months ahead)
future_forecast_tbl<-calibrate_tbl%>%
modeltime_refit(sales_daily2)%>% #retrain on full data
modeltime_forecast(
h = "3 months",
actual_data = sales_daily2)
future_forecast_tbl%>%
plot_modeltime_forecast()%>%
plotly::layout(
legend=list(
orientation="h",
xanchor="center",
x=0.5,
y=-0.2))
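#To pull the forecast values out for reporting (a sketch, not run; modeltime_forecast()
#returns .index, .value, .conf_lo and .conf_hi columns):
#future_forecast_tbl%>%
#  filter(.key=="prediction")%>%
#  select(.model_desc, .index, .value, .conf_lo, .conf_hi)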
#Forecast with a tuned Prophet model
model_prophet2<-prophet_reg(
growth = "linear", #trend specification
prior_scale_changepoints = 0.4, #flexibility of the automatic changepoint selection
prior_scale_seasonality = 5, #strength of the seasonality model
seasonality_yearly = TRUE,
seasonality_weekly = TRUE,
seasonality_daily = FALSE)%>%
set_engine("prophet")%>%
fit(Total.Price~Date, training(splits))
model_prophet2
## parsnip model object
##
## Fit time: 150ms
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 25
## - changepoint.range: 0.8
## - yearly.seasonality: 'TRUE'
## - weekly.seasonality: 'TRUE'
## - daily.seasonality: 'FALSE'
## - seasonality.mode: 'additive'
## - changepoint.prior.scale: 0.4
## - seasonality.prior.scale: 5
## - holidays.prior.scale: 10
## - logistic_cap: NULL
## - logistic_floor: NULL
## - extra_regressors: 0
model_tbl2<-modeltime_table(
model_prophet,
model_prophet2)
calibrate_tbl2<-model_tbl2%>%
modeltime_calibrate(testing(splits))
calibrate_tbl2%>%modeltime_accuracy()
## # A tibble: 2 x 9
## .model_id .model_desc .type mae mape mase smape rmse rsq
## <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 PROPHET Test 3172. 15.1 0.567 14.0 3953. 0.733
## 2 2 PROPHET Test 3185. 14.0 0.570 13.9 3957. 0.737
calibrate_tbl2%>%
modeltime_forecast(
new_data = testing(splits),
actual_data = sales_daily2)%>%
plot_modeltime_forecast(.title = "Test Plot")%>%
plotly::layout(
legend=list(
orientation="h",
xanchor="center",
x=0.5,
y=-0.2))
future_forecast_tbl2<-calibrate_tbl2%>%
modeltime_refit(sales_daily2)%>% #retrain on full data
modeltime_forecast(
h = "3 months",
actual_data = sales_daily2)
future_forecast_tbl2%>%
plot_modeltime_forecast(.title = "Forecast 3 Months")%>%
plotly::layout(
legend=list(
orientation="h",
xanchor="center",
x=0.5,
y=-0.2))