options(scipen = 999)
library(tidyverse)
library(DataExplorer)
library(forecast)
library(tseries)
My focus here is to perform a time series analysis of Walmart's weekly sales and then forecast the top 250 and bottom 250 weekly sales figures across its stores.
walmart2 = readr::read_csv("C:/Users/thepy/Downloads/Walmart.csv")
Rows: 6435 Columns: 8
── Column specification ────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Date
dbl (7): Store, Weekly_Sales, Holiday_Flag, Temperature, Fuel_Price, CPI, Unemployment
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1] 0
[1] 0
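The two [1] 0 values above are printed without their originating code chunk; a minimal sketch of the data-quality checks that would produce this output (an assumption on my part, since the chunk is not shown) is:
# Assumed checks: count missing values and fully duplicated rows;
# both returning 0 is consistent with the output shown above.
sum(is.na(walmart2))
sum(duplicated(walmart2))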
summary(walmart2)
Store Date Weekly_Sales Holiday_Flag Temperature
Min. : 1 Length:6435 Min. : 209986 Min. :0.00000 Min. : -2.06
1st Qu.:12 Class :character 1st Qu.: 553350 1st Qu.:0.00000 1st Qu.: 47.46
Median :23 Mode :character Median : 960746 Median :0.00000 Median : 62.67
Mean :23 Mean :1046965 Mean :0.06993 Mean : 60.66
3rd Qu.:34 3rd Qu.:1420159 3rd Qu.:0.00000 3rd Qu.: 74.94
Max. :45 Max. :3818686 Max. :1.00000 Max. :100.14
Fuel_Price CPI Unemployment
Min. :2.472 Min. :126.1 Min. : 3.879
1st Qu.:2.933 1st Qu.:131.7 1st Qu.: 6.891
Median :3.445 Median :182.6 Median : 7.874
Mean :3.359 Mean :171.6 Mean : 7.999
3rd Qu.:3.735 3rd Qu.:212.7 3rd Qu.: 8.622
Max. :4.468 Max. :227.2 Max. :14.313
walmart_sales <- walmart2
mean(walmart_sales$Weekly_Sales)
[1] 1046965
median(walmart_sales$Weekly_Sales)
[1] 960746
walmart_sales <- walmart_sales %>%
mutate(Date=dmy(Date)) %>%
select(Store,Date,Weekly_Sales)
hist(walmart_sales$Weekly_Sales)
Identifying Outliers
walmart_sales$zscore <- abs((walmart_sales$Weekly_Sales - mean(walmart_sales$Weekly_Sales)) / sd(walmart_sales$Weekly_Sales))
walmart_sales$type <- ifelse(walmart_sales$zscore > 3, '1', '0')
walmart_sales %>%
dplyr::filter(type==1)
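Before dropping the flagged rows, a quick count shows how many observations exceed the z-score threshold (a small sketch added here; the counts are not shown in the original output):
# Tally flagged ('1') versus non-flagged ('0') observations
walmart_sales %>%
  count(type)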
Removing outliers
walmart_sales_clean <- walmart_sales %>%
dplyr::filter(type!=1) %>%
select(Date,Weekly_Sales,Store)
time_walmart_sales <-ts(walmart_sales_clean$Weekly_Sales,frequency = 365.25/7)
#decompose(time_walmart_sales)
adf.test(time_walmart_sales)
Warning: p-value smaller than printed p-value
Augmented Dickey-Fuller Test
data: time_walmart_sales
Dickey-Fuller = -5.3215, Lag order = 18, p-value = 0.01
alternative hypothesis: stationary
The time series has a p-value less than 0.05, so we reject the null hypothesis of a unit root and treat the series as stationary.
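As a complementary check (added here, not part of the original output), forecast::ndiffs() estimates how many differences are needed and tseries::kpss.test() tests the opposite null hypothesis of stationarity:
# Complementary stationarity checks (sketch)
forecast::ndiffs(time_walmart_sales)   # 0 means no differencing is required
tseries::kpss.test(time_walmart_sales) # null hypothesis: the series is level-stationary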
plot(decompose(time_walmart_sales))
The breakdown of the time series components indicates that Walmart's weekly sales show only weak signs of trend and seasonality.
The mean of the series is roughly constant over time, which implies the trend component is negligible.
The variance does not increase over time, and the seasonal effect is minimal.
With so little trend or seasonal structure, the series looks much like random white noise regardless of the observed time interval.
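The white-noise impression can be checked formally with a Ljung-Box test (a sketch added here, not in the original analysis); a large p-value would be consistent with the series behaving like white noise:
# Ljung-Box test for autocorrelation: the null hypothesis is that the series
# is independently distributed (i.e., consistent with white noise)
Box.test(time_walmart_sales, lag = 20, type = "Ljung-Box")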
Next I will go ahead and forecast the top 250 weekly sales totals across Walmart's stores.
walmart_sales_clean1b <- walmart_sales_clean %>%
  group_by(Date, Store) %>%
  summarise(sale = sum(Weekly_Sales)) %>%
  arrange(desc(sale)) %>%
  head(250) %>%
  select(-Store)
`summarise()` has grouped output by 'Date'. You can override using the `.groups` argument.
ts_walmart <- ts(walmart_sales_clean1b$sale, start = decimal_date(ymd("2010-02-05")),
                 frequency = 365.25/7) # weekly
autoplot(ts_walmart,series ='Weekly_Sales' )+
ggtitle('Walmart top 250 weekly sales') +
xlab('Time') +
ylab('Sales') +
#xlim(2010,2018)+
scale_color_manual(values='blue')+
theme_bw()
#check for stationarity
adf.test(ts_walmart)
Warning: p-value smaller than printed p-value
Augmented Dickey-Fuller Test
data: ts_walmart
Dickey-Fuller = -6.3348, Lag order = 6, p-value = 0.01
alternative hypothesis: stationary
The p-value is below 0.05, so the series is stationary: its mean and variance do not change over time, and therefore there is no trend.
auto_model <- auto.arima(ts_walmart)
summary(auto_model)
Series: ts_walmart
ARIMA(4,2,1)(1,0,1)[52]
Coefficients:
ar1 ar2 ar3 ar4 ma1 sar1 sma1
-0.2208 -0.1513 -0.1299 0.1741 -0.8672 -0.0894 0.0182
s.e. 0.0764 0.0799 0.0794 0.0803 0.0379 0.9558 0.9503
sigma^2 = 16606649: log likelihood = -2410.65
AIC=4837.31 AICc=4837.91 BIC=4865.41
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 405.5517 4001.101 2019.187 0.01791827 0.08517036 0.01911669 0.04826562
# Fit the SARIMA model
sarima_model <- Arima(ts_walmart, order=c(1,1,1), seasonal=c(1,1,1))
summary(sarima_model)
Series: ts_walmart
ARIMA(1,1,1)(1,1,1)[52]
Coefficients:
ar1 ma1 sar1 sma1
0.9962 -0.9232 0.0390 -0.3003
s.e. 0.0046 0.0228 0.3907 0.3653
sigma^2 = 26501871: log likelihood = -1954.63
AIC=3919.25 AICc=3919.57 BIC=3935.67
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set -169.7445 4523.212 2050.338 -0.008180731 0.09269737 0.01941161 -0.07274209
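Before forecasting, the automatic and manual fits can be compared on their in-sample accuracy, and the residuals of the chosen model inspected (a sketch added here; note that AIC values are not directly comparable because the two models use different orders of differencing):
# Compare in-sample error measures of the two fits (sketch)
accuracy(auto_model)
accuracy(sarima_model)
# Residual diagnostics: time plot, ACF, and Ljung-Box test
checkresiduals(sarima_model)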
# Forecast the next 60 weeks
forecasted_values <- forecast(sarima_model, h=60)
autoplot(forecasted_values) +
ggtitle("Sales Forecast for top 250 stores within the next 60 weeks") +
xlab("Time") +
ylab("Sales") +
# xlim(2010,2019)+
theme_minimal()
From the forecast we can see that, moving forward from the observed period that began in 2010, Walmart's top 250 weekly sales show signs of possibly bouncing back upward.
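To inspect the forecast numerically rather than only in the plot (a small sketch, not in the original), the point forecasts and prediction intervals can be pulled out of the forecast object:
# Point forecasts and the first few rows of the interval bounds
head(forecasted_values$mean)
head(forecasted_values$lower)   # 80% and 95% lower bounds
head(forecasted_values$upper)   # 80% and 95% upper bounds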
Next I will go ahead and forecast the bottom 250 weekly sales totals across Walmart's stores.
walmart_sales_clean2b <- walmart_sales_clean %>%
  group_by(Date, Store) %>%
  summarise(sale = sum(Weekly_Sales)) %>%
  arrange(sale) %>%
  head(250) %>%
  select(-Store)
`summarise()` has grouped output by 'Date'. You can override using the `.groups` argument.
walmart_sales_clean2b %>%
arrange(Date)
ts_walmart_bottom <- ts(walmart_sales_clean2b$sale,start=decimal_date(ymd("2010-02-05")),
frequency = 365.25 / 7) #Weekly
autoplot(ts_walmart_bottom,series ='Weekly_Sales' )+
ggtitle('Walmart Total bottom 250 Weekly sales') +
xlab('Time') +
ylab('Sales') +
# xlim(2012,2021)+
scale_color_manual(values='red')+
theme_bw()
#check for stationarity
adf.test(ts_walmart_bottom)
Augmented Dickey-Fuller Test
data: ts_walmart_bottom
Dickey-Fuller = -2.8112, Lag order = 6, p-value = 0.2343
alternative hypothesis: stationary
The p-value is above 0.05, so the series is not stationary: its mean and variance change over time, and it can therefore be expected to exhibit a trend.
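Since the ADF test fails to reject non-stationarity, differencing should be needed before an ARIMA fit; a quick sketch (not in the original) to confirm this and re-test the differenced series:
# Estimate the number of differences required, then re-run the ADF test
# on the first-differenced series
forecast::ndiffs(ts_walmart_bottom)
adf.test(diff(ts_walmart_bottom))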
auto_model2 <- auto.arima(ts_walmart_bottom)
summary(auto_model2)
Series: ts_walmart_bottom
ARIMA(1,2,2)(1,0,0)[52]
Coefficients:
ar1 ma1 ma2 sar1
-0.7974 -0.1632 -0.4524 0.0436
s.e. 0.1053 0.1184 0.1050 0.0944
sigma^2 = 203234: log likelihood = -1865.87
AIC=3741.74 AICc=3741.99 BIC=3759.31
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set -49.59141 445.372 265.8309 -0.02160973 0.1054949 0.01808943 0.05645411
# Fit the SARIMA model
sarimamodel2 <- Arima(ts_walmart_bottom, order=c(1,1,0), seasonal=list(order=c(1,0,0),period=NA),
method="ML")
summary(sarimamodel2)
Series: ts_walmart_bottom
ARIMA(1,1,0)(1,0,0)[52]
Coefficients:
ar1 sar1
0.450 0.2575
s.e. 0.068 0.1028
sigma^2 = 307758: log likelihood = -1927.45
AIC=3860.9 AICc=3861 BIC=3871.46
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 140.6053 551.4208 297.2886 0.05699207 0.1183901 0.02023008 -0.3512537
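As with the top-sales models, the automatic and manual fits for the bottom series can be compared and the residuals checked before forecasting (a sketch added here):
# In-sample error measures for both fits, plus residual diagnostics (sketch)
accuracy(auto_model2)
accuracy(sarimamodel2)
checkresiduals(sarimamodel2)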
# Forecast the next 60 weeks
forecasted_values_bottomsales <- forecast(sarimamodel2, h=60)
autoplot(forecasted_values_bottomsales) +
ggtitle("Sales Forecast for worst 250 stores for next 60 weeks") +
xlab("Time") +
ylab("Sales") +
#xlim(2012,2018)+
theme_minimal()
From the forecast we can see that sales for Walmart's bottom 250 weekly observations continue to rise from 2010 through early 2016 on the plotted horizon.