In this project, I use various R packages to explore the sales data and build models that can predict future sales.

The analysis presents the ideas, the path that led to them, the results, and the accuracy of the model.

Load the required packages:

library(ggplot2)
library(forecast)
library(tseries)
library(tidyverse)
library("rio")
library(readxl)
library(lubridate)
library(dplyr)
library(forcats)
library(kableExtra)
sales.dat <- read.csv("training.csv")

Look at the first 10 rows of the sales data frame:

df <- head(sales.dat, 10)
kable(df) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
order_id  contact_id  purchase_date  product_id  quantity  sales_amount
 4852169    17846335     2012-01-01     3308032         1         18.71
 3570597    18759585     2012-01-01    37220513         2         83.50
32096674     5025194     2012-01-01    10259603         1         19.95
 2904402    29624779     2012-01-01     3308032         1         18.71
38369470    42287117     2012-01-01     3308032         1         18.71
10514626    34476502     2012-01-01    21913642         2         39.90
36513528     5025194     2012-01-01    30518406         1         45.56
 6595294    14603912     2012-01-01    16308207         4        159.80
18187529    10412066     2012-01-01    11543327         1        239.95
37889477    35117040     2012-01-01    21103908         1         37.95

Convert the purchase dates to Date format and add columns for year, month, day, and weekday.

# Convert the purchase dates to Date format
sales.dat$purchase_date <- ymd(sales.dat$purchase_date)
sales.dat$Date <- sales.dat$purchase_date
sales.dat$year <- as.factor(year(sales.dat$purchase_date))
sales.dat$month <- as.factor(month(sales.dat$purchase_date))
sales.dat$day <- as.factor(day(sales.dat$purchase_date))
sales.dat$weekday <- as.factor(weekdays(sales.dat$purchase_date))

Remove zero-sales rows from the data

# Remove zero sales in the data 
sales.dat <- sales.dat[sales.dat$sales_amount > 0, ]

HIGHS, LOWS, BEST AND WORST SALES

Overall highest sales

top.sales <- sales.dat %>% 
  slice_max(sales_amount, n=5)

kable(top.sales) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
order_id  contact_id  purchase_date  product_id  quantity  sales_amount        Date  year  month  day    weekday
10199301     8166406     2012-02-17    23237796        50      10997.50  2012-02-17  2012      2   17     Friday
16120223    36576499     2012-12-05    18800419        50       4997.50  2012-12-05  2012     12    5  Wednesday
28260999    36576499     2013-03-12    18800419        50       4997.50  2013-03-12  2013      3   12    Tuesday
25015419    10558171     2012-02-27    32836058         1       4482.23  2012-02-27  2012      2   27     Monday
31348848    27348661     2013-01-18     7796190         2       4223.92  2013-01-18  2013      1   18     Friday

Highest Sales for 2012

top.2012 <- sales.dat %>% filter(year == 2012) %>%
  slice_max(sales_amount, n=5)

kable(top.2012) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
order_id  contact_id  purchase_date  product_id  quantity  sales_amount        Date  year  month  day    weekday
10199301     8166406     2012-02-17    23237796        50      10997.50  2012-02-17  2012      2   17     Friday
16120223    36576499     2012-12-05    18800419        50       4997.50  2012-12-05  2012     12    5  Wednesday
25015419    10558171     2012-02-27    32836058         1       4482.23  2012-02-27  2012      2   27     Monday
20041945    29412637     2012-05-15    27441828       260       4147.00  2012-05-15  2012      5   15    Tuesday
46251672      122106     2012-05-17    39469603         1       3450.62  2012-05-17  2012      5   17   Thursday

Highest Sales for 2013

top.2013 <- sales.dat %>% filter(year == 2013) %>%
  slice_max(sales_amount, n=5)

kable(top.2013) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
order_id  contact_id  purchase_date  product_id  quantity  sales_amount        Date  year  month  day    weekday
28260999    36576499     2013-03-12    18800419        50       4997.50  2013-03-12  2013      3   12    Tuesday
31348848    27348661     2013-01-18     7796190         2       4223.92  2013-01-18  2013      1   18     Friday
 7857999    36576499     2013-01-25    18800419        40       3998.00  2013-01-25  2013      1   25     Friday
 9099385    16408693     2013-03-26     2034783         1       3503.03  2013-03-26  2013      3   26    Tuesday
42213345    19201239     2013-03-30     2034783         1       3483.21  2013-03-30  2013      3   30   Saturday
 5363792    22945018     2013-04-11     2034783         1       3483.21  2013-04-11  2013      4   11   Thursday

Best performing months of 2012 and 2013

best.month.2012 <- sales.dat %>%
  filter(year == 2012) %>%
  group_by(month) %>% 
  summarize(Total_sales = sum(sales_amount)) %>%
  arrange(desc(Total_sales))

kable(best.month.2012) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
month  Total_sales
   11      2242950
   12      2172004
    1      1767764
    6      1660848
    7      1655123
    5      1639885
    3      1620713
    8      1590792
   10      1587317
    4      1515504
    9      1500107
    2      1491336

best.month.2013 <- sales.dat%>%
  filter(year == 2013) %>%
  group_by(month) %>%
  summarize(Total_sales = sum(sales_amount)) %>%
  arrange(desc(Total_sales))

kable(best.month.2013) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
month  Total_sales
   12    1872236.2
   11    1647369.4
    1    1383460.4
   10    1372081.1
    6    1249496.1
    4    1240470.6
    5    1236225.7
    9    1202684.2
    8    1195198.4
    7    1184992.2
    3    1093032.6
    2     971850.8

Total revenue for 2012 and 2013

rev.2012 <- sales.dat%>%
  filter(year == 2012) %>%
  summarize(total = sum(sales_amount))

kable(rev.2012) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
total
20444342

rev.2013 <- sales.dat%>%
  filter(year == 2013) %>%
  summarize(total = sum(sales_amount))

kable(rev.2013) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
total
15649098

Percent change in revenue: compute 2013 revenue as a percentage of 2012 revenue (the shortfall from 100% is the drop in sales).

rev.change <- ((rev.2013 / rev.2012) * 100)

kable(rev.change) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
total
76.54488
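
The table shows 2013 revenue at about 76.5% of 2012 revenue, i.e. a drop of roughly 23.5%. A minimal sketch computing the drop directly from the rev.2012 and rev.2013 summaries above:

# Percent drop in revenue from 2012 to 2013 (complement of the ratio above)
rev.drop <- (1 - (rev.2013$total / rev.2012$total)) * 100
rev.drop # about 23.5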

Total number of products

prods <- sales.dat %>%
  select(product_id) %>%
  distinct() %>%
  summarise("No of Products" = n()) #there are 16471 product

kable(prods) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
No of Products
16471

Most popular products over the two years: products with the highest sales in terms of quantity.

popular_products <- sales.dat %>%
  group_by(product_id) %>%
  summarize(Prd.Cnt = sum(quantity)) %>%
  ungroup() %>%
  arrange(desc(Prd.Cnt))
options(scipen = 999)

# Plot top 20 popular products in 2012 and 2013 by sales
head(popular_products, 20) %>%
  ggplot(aes(
    x = reorder(as.factor(product_id), Prd.Cnt),
    y = Prd.Cnt, fill = as.factor(product_id)
  )) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  labs(y = "Total Sales", x = "Product ID", title = "Top 10 Highest Selling Products (2012-2013)") +
  coord_flip()

Visualize daily sales over time for 2012 and 2013. (Note that par(mfrow = c(2, 1)) only affects base graphics, not ggplot2, so the two charts are drawn separately.)
  
sales.dat %>% 
  filter(year == 2012) %>%
  ggplot(., aes(x = purchase_date, y = sales_amount)) +
  geom_col()+
  ggtitle("Sales Per week of 2012")

sales.dat %>% 
  filter(year == 2013) %>%
  ggplot(., aes(x = purchase_date, y = sales_amount)) +
  geom_col()+
  ggtitle("Sales Per week of 2013")

Monthly Sales 2012

sales.dat %>% 
  group_by(month) %>%
  filter(year == 2012) %>%
  ggplot(., aes(x = as.factor(month),
                y = sales_amount)) +
  geom_col() +
  ggtitle("Monthly Sales 2012") +
  coord_flip()

Monthly Sales 2013

sales.dat %>% 
  group_by(month) %>%
  filter(year == 2013) %>%
  ggplot(., aes(x = as.factor(month),
                y = sales_amount)) +
  geom_col() +
  ggtitle("Monthly Sales 2013") +
  coord_flip()

Product popularity: most popular products over the two years (2012-2013), i.e. products with the highest sales in terms of quantity.

popular_products <- sales.dat %>%
  group_by(product_id) %>%
  summarize(Prd.Cnt = sum(quantity)) %>%
  ungroup() %>%
  arrange(desc(Prd.Cnt))
options(scipen = 999)

Plot of the top 20 most popular products over the two years by quantity sold

head(popular_products, 20) %>%
  ggplot(aes(
    x = reorder(as.factor(product_id), Prd.Cnt),
    y = Prd.Cnt, fill = as.factor(product_id)
  )) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  labs(y = "Total Sales", x = "Products", title = "Highest Selling Products") +
  coord_flip()

Total number of contacts (customers)

sales.dat %>%
  select(contact_id) %>%
  distinct() %>%
  summarise("No of Contacts" = n())
##   No of Contacts
## 1         262294
## There were 262,294 unique customers over the two years (2012-2013)

Most popular products (by quantity) per customer, 2012-2013

pop.prod.per.cont <- sales.dat %>%
  group_by(contact_id, product_id) %>%
  summarize(Qty = sum(quantity)) %>%
  filter(Qty == max(Qty)) %>%
  arrange(desc(Qty)) %>%
  ungroup()
## `summarise()` has grouped output by 'contact_id'. You can override using the `.groups` argument.
ggplot(
  data = head(pop.prod.per.cont, 20),
  aes(x = reorder(as.factor(contact_id), Qty), y = Qty, fill = as.factor(product_id))
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Most Selling Product per Contact", x = "Contact ID", y = "Qty Sold", fill = "Product ID")

Most popular products in 2012 and 2013 respectively: products with the highest sales in terms of quantity.

pop.prods.2012 <- sales.dat %>%
  filter(year == 2012)%>%
  group_by(product_id) %>%
  summarize(Prd.Cnt = sum(quantity)) %>%
  ungroup() %>%
  arrange(desc(Prd.Cnt))
options(scipen = 999)

Plot of the top 20 most popular products of 2012 by quantity sold

head(pop.prods.2012, 20) %>%
  ggplot(aes(
    x = reorder(as.factor(product_id), Prd.Cnt),
    y = Prd.Cnt, fill = as.factor(product_id)
  )) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  labs(y = "Total Sales", x = "Product ID", title = "Highest Selling Products in 2012") +
  coord_flip()

Plot of the top 20 most popular products of 2013 by quantity sold

pop.prods.2013 <- sales.dat %>%
  filter(year == 2013)%>%
  group_by(product_id) %>%
  summarize(Prd.Cnt = sum(quantity)) %>%
  ungroup() %>%
  arrange(desc(Prd.Cnt))
options(scipen = 999)
head(pop.prods.2013, 20) %>%
  ggplot(aes(
    x = reorder(as.factor(product_id), Prd.Cnt),
    y = Prd.Cnt, fill = as.factor(product_id)
  )) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  labs(y = "Total Sales", x = "Product ID", title = "Highest Selling Products in 2013") +
  coord_flip()

Stacked yearly sales

sales.dat %>% 
  group_by(year) %>%
  ggplot(., aes(x = month,
                y = sales_amount,
                fill = as.factor(year))) +
  geom_col() +
  ggtitle("2012 to 2013 Monthly Sales")

CLEANING THE DATA FOR MODELING AND PREDICTION

Data Cleaning and Exploratory Data Analysis

Use a sample (a fraction of the original data) for modeling and easier visualization.

# Sample 1,000 data points; set a seed (arbitrary here) so the sample is
# reproducible, and order by date so the time series built below is chronological
set.seed(123)
small_data <- sample_n(sales.dat, 1000) %>% arrange(Date)

Plot of the sales field over time to show variance in this field

#Plot the sales field over time - show variance in this field
ggplot(small_data, aes(Date, sales_amount)) +
  geom_line() +
  scale_x_date("month") +
  ylab("Monthly Sales") +
  xlab("")

Plot of month over month to see the range and outliers

ggplot(small_data, aes(Date, sales_amount)) +
  geom_point(color = "navyblue") +
  facet_wrap( ~ month) +
  scale_x_date("month") +
  ylab("Monthly Sales") +
  xlab("")

# Create a time series object from the sales column and clean it with tsclean()
small.sales.ts <- ts(small_data[, c("sales_amount")])

small_data$clean_count <- tsclean(small.sales.ts)


# tsclean() identifies and replaces outliers, and fills in missing values if there are any

Plot of the cleaned data

# Plot the cleaned data
ggplot() +
  geom_line(data = small_data, aes(x = Date, y = clean_count)) + ylab("Cleaned Count")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

Monthly and Weekly Moving averages

# Get the monthly and weekly moving averages (ma) and compare to cleaned daily data which still has lots of variance 
small_data$cnt_ma <- ma(small_data$clean_count, order = 7) # weekly MA, using the cleaned (outlier-free) series
small_data$cnt_ma30 <- ma(small_data$clean_count, order = 30) # monthly ma


ggplot() +
  geom_line(data = small_data, aes(x = Date,y = clean_count, color = "Counts")) +
  geom_line(data = small_data, aes(x = Date, y = cnt_ma, color = "Weekly Moving Average")) +
  geom_line(data = small_data, aes(x = Date, y = cnt_ma30, color = "Monthly Moving Average")) +
  ylab("Sales")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
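
The "Don't know how to automatically pick scale" messages appear because tsclean() and ma() return ts objects; coercing them to plain numeric vectors avoids the warning. A minimal sketch (behavior is otherwise unchanged):

# Coerce the ts columns to numeric so ggplot2 picks scales silently
small_data$clean_count <- as.numeric(small_data$clean_count)
small_data$cnt_ma <- as.numeric(small_data$cnt_ma)
small_data$cnt_ma30 <- as.numeric(small_data$cnt_ma30)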

Decomposing the time series data

# Decompose the time series - taking seasonality, trend, and cycle into account
# calculate the seasonal component with stl()
small_count_ma <- ts(na.omit(small_data$cnt_ma), frequency = 30)
small_decomp <- stl(small_count_ma, s.window = "periodic")
small_deseasonal_cnt <- seasadj(small_decomp) # used later in the ARIMA model
plot(small_decomp)
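
To put rough numbers on what the decomposition shows, the variance of each component can be compared; this is a quick sketch using the seasonal, trend, and remainder columns that stl() stores in $time.series:

# Approximate share of variance explained by each STL component
comp.var <- apply(small_decomp$time.series, 2, var)
round(comp.var / sum(comp.var), 3)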

Test for stationarity

# Test for stationarity - first a visual check for stationary variance,
# then the Augmented Dickey-Fuller (ADF) test
adf.test(small_count_ma, alternative = "stationary") # more negative statistic = stronger evidence of stationarity
## Warning in adf.test(small_count_ma, alternative = "stationary"): p-value smaller
## than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  small_count_ma
## Dickey-Fuller = -7.7034, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
# the series is stationary (p-value = 0.01; reject the unit-root null)
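
As a cross-check, the KPSS test (from the already-loaded tseries package) reverses the hypotheses: its null is stationarity, so a large p-value here would agree with the ADF result. A minimal sketch:

# KPSS test - null hypothesis is level stationarity (the opposite of ADF)
kpss.test(small_count_ma, null = "Level")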

Autocorrelations and choosing the model order

#ACF plots display correlation between series and its lags
Acf(small_count_ma, main='')

#PACF plots display correlation between series and its lag that is explained by previous lags 
Pacf(small_count_ma, main='')

Differencing

small_count_d1 <- diff(small_deseasonal_cnt, differences = 1)

small_count_d2 <- diff(small_deseasonal_cnt, differences = 2)

# Run the ADF test on each level of differencing
small_adf0 <- adf.test(small_count_ma, alternative = "stationary") # the more negative the ADF statistic, the better; here ADF = -7.70 with a p-value of 0.01
## Warning in adf.test(small_count_ma, alternative = "stationary"): p-value smaller
## than printed p-value
small_adf1 <- adf.test(small_count_d1, alternative = "stationary")
## Warning in adf.test(small_count_d1, alternative = "stationary"): p-value smaller
## than printed p-value
small_adf2 <- adf.test(small_count_d2, alternative = "stationary")
## Warning in adf.test(small_count_d2, alternative = "stationary"): p-value smaller
## than printed p-value
small_adf0;small_adf1;small_adf2
## 
##  Augmented Dickey-Fuller Test
## 
## data:  small_count_ma
## Dickey-Fuller = -7.7034, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  small_count_d1
## Dickey-Fuller = -12.717, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  small_count_d2
## Dickey-Fuller = -19.619, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
# Check the ACF and PACF to identify the best-fitting order of differencing

Acf(small_count_ma, main='')

Acf(small_count_d1, main='') # this fits better

Pacf(small_count_ma, main='')

Pacf(small_count_d1, main='')
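
The forecast package can also suggest the order of differencing directly: ndiffs() applies a unit-root test and returns the minimum number of differences needed. A quick sketch on the deseasonalized series:

# Estimate how many differences are needed for stationarity
ndiffs(small_deseasonal_cnt)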

FITTING THE ARIMA MODEL

# Get auto fit p,d,q values

auto.arima(small_deseasonal_cnt, seasonal = FALSE)
## Series: small_deseasonal_cnt 
## ARIMA(3,0,2) with non-zero mean 
## 
## Coefficients:
##          ar1     ar2      ar3     ma1      ma2     mean
##       0.7085  0.7737  -0.6344  0.2622  -0.5231  53.9924
## s.e.  0.0686  0.0614   0.0525  0.0775   0.0682   1.3309
## 
## sigma^2 estimated as 75.23:  log likelihood=-3555.54
## AIC=7125.08   AICc=7125.19   BIC=7159.39

Model Evaluation

#Evaluate and iterate the model 
small_fit <- auto.arima(small_deseasonal_cnt, seasonal = FALSE)
tsdisplay(residuals(small_fit), lag.max = 45, main = 'ARIMA(3,0,2) Model Residuals')

small_fit2 <- arima(small_deseasonal_cnt, order = c(3,0,9))
## Warning in arima(small_deseasonal_cnt, order = c(3, 0, 9)): possible convergence
## problem: optim gave code = 1
tsdisplay(residuals(small_fit2), lag.max = 20, main = 'ARIMA(3,0,9) Model Residuals')
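
Beyond the visual residual check, a Ljung-Box test (Box.test() in base R) formally tests whether the residuals are uncorrelated; a large p-value is consistent with white-noise residuals. A sketch for both fits, with fitdf set to p + q of each model:

# Ljung-Box test for residual autocorrelation (null: no autocorrelation)
Box.test(residuals(small_fit), lag = 20, type = "Ljung-Box", fitdf = 5)   # ARIMA(3,0,2): p + q = 5
Box.test(residuals(small_fit2), lag = 20, type = "Ljung-Box", fitdf = 12) # ARIMA(3,0,9): p + q = 12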

Forecast with the fitted models

# Forecast 30 days ahead with each fitted model

small_fcast <- forecast(small_fit, h=30)
small_fcast2 <- forecast(small_fit2, h=30)
plot(small_fcast)

plot(small_fcast2)

Model Testing

#TEST MODEL PERFORMANCE WITH A HOLDOUT SET
hold.set <- window(ts(small_deseasonal_cnt), start = 350) # holdout window matches the rows excluded from training below
fit_no_holdset <- arima(ts(small_deseasonal_cnt[-c(350:994)]), order = c(3,0,9))
## Warning in log(s2): NaNs produced
## Warning in arima(ts(small_deseasonal_cnt[-c(350:994)]), order = c(3, 0, :
## possible convergence problem: optim gave code = 1
fcast_no_holdset <- forecast(fit_no_holdset, h=35)
plot(fcast_no_holdset, main = "")
lines(ts(small_deseasonal_cnt))
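
The holdout comparison can be quantified with forecast::accuracy(), which reports RMSE, MAE, and related measures for a forecast against held-out observations. A sketch comparing the 35-step forecast with the first 35 holdout values:

# Accuracy of the forecast against the start of the holdout set
accuracy(fcast_no_holdset, hold.set[1:35])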

# the model needs seasonality added back; the forecast above is too linear to be realistic
fit_w_seasonality <- auto.arima(small_deseasonal_cnt, seasonal = TRUE)
seasonality.fcast <- forecast(fit_w_seasonality, h=90)
plot(seasonality.fcast)

lines(ts(small_count_ma)) #plots the actual values
lines(ts(small_deseasonal_cnt)) #plots the deseasonalized values

# the moving averages stay within the 95% forecast interval, an indication that the model is good

FURTHER TESTING AND ANALYSIS OF THE MODEL

# Test against the auto.arima model

small_fit3 <- auto.arima(small_deseasonal_cnt, seasonal = FALSE)

tsdisplay(residuals(small_fit3), lag.max = 15, main = "auto.arima Model Residuals")

# There is evidence of a lag at 7; a higher-order q value (q = 7) might be necessary
small_fit4 <- arima(small_deseasonal_cnt, order = c(1,1,7))

tsdisplay(residuals(small_fit4), lag.max = 15, main = "ARIMA(1,1,7) Model Residuals")

# Try a baseline ARIMA(1,1,1) model
small_fit5 <- arima(small_deseasonal_cnt, order = c(1,1,1))

tsdisplay(residuals(small_fit5), lag.max = 15, main = "ARIMA(1,1,1) Model Residuals")

small_fit6 <- arima(small_deseasonal_cnt, order = c(3,0,7))
tsdisplay(residuals(small_fit6), lag.max = 20, main = 'ARIMA(3,0,7) Model Residuals')

# Plot the forecasts and select the best model

small_fcast <- forecast(small_fit, h=30)
small_fcast
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 34.13333       47.48162 36.36596 58.59728 30.48169 64.48155
## 34.16667       48.31492 32.82290 63.80693 24.62192 72.00791
## 34.20000       48.60771 29.93075 67.28466 20.04377 77.17165
## 34.23333       49.91492 29.31586 70.51399 18.41137 81.41848
## 34.26667       50.53902 28.65727 72.42077 17.07377 84.00426
## 34.30000       51.80689 29.28427 74.32950 17.36152 86.25226
## 34.33333       52.35877 29.48740 75.23014 17.38003 87.33751
## 34.36667       53.33484 30.35174 76.31794 18.18522 88.48446
## 34.40000       53.64906 30.62935 76.66878 18.44344 88.85469
## 34.43333       54.27679 31.25660 77.29697 19.07045 89.48312
## 34.46667       54.34543 31.32291 77.36795 19.13553 89.55534
## 34.50000       54.68040 31.63861 77.72220 19.44102 89.91979
## 34.53333       54.57261 31.50850 77.63672 19.29910 89.84613
## 34.56667       54.71186 31.61445 77.80927 19.38743 90.03630
## 34.60000       54.51461 31.39264 77.63658 19.15261 89.87661
## 34.63333       54.55098 31.40424 77.69772 19.15109 89.95087
## 34.66667       54.33578 31.17534 77.49622 18.91495 89.75662
## 34.70000       54.33659 31.16506 77.50811 18.89880 89.77438
## 34.73333       54.14758 30.97179 77.32338 18.70326 89.59190
## 34.76667       54.15081 30.97215 77.32947 18.70211 89.59952
## 34.80000       54.00636 30.82723 77.18549 18.55694 89.45578
## 34.83333       54.02641 30.84704 77.20577 18.57663 89.47619
## 34.86667       53.92680 30.74739 77.10620 18.47696 89.37664
## 34.90000       53.96338 30.78394 77.14283 18.51348 89.41329
## 34.93333       53.89951 30.71970 77.07932 18.44905 89.34997
## 34.96667       53.94576 30.76574 77.12578 18.49497 89.39654
## 35.00000       53.90590 30.72543 77.08636 18.45444 89.35735
## 35.03333       53.95396 30.77330 77.13461 18.50220 89.40571
## 35.06667       53.92783 30.74688 77.10877 18.47563 89.38002
## 35.10000       53.97179 30.79075 77.15282 18.51945 89.42412
plot(small_fcast)

small_fcast2 <- forecast(small_fit2, h=30)
plot(small_fcast2)

small_fcast3 <- forecast(small_fit3, h=30)
plot(small_fcast3)

small_fcast4 <- forecast(small_fit4, h=30)
plot(small_fcast4)

small_fcast5 <- forecast(small_fit5, h=30)
plot(small_fcast5)

small_fcast6 <- forecast(small_fit6, h=30)
plot(small_fcast6)

# Predict 90 days ahead with each model
small_fcast <- forecast(small_fit, h=90)
plot(small_fcast)

small_fcast2 <- forecast(small_fit2, h=90)
plot(small_fcast2)

small_fcast3 <- forecast(small_fit3, h=90)
plot(small_fcast3)

small_fcast4 <- forecast(small_fit4, h=90)
plot(small_fcast4)

small_fcast5 <- forecast(small_fit5, h=90)
plot(small_fcast5)

small_fcast6 <- forecast(small_fit6, h=90)
plot(small_fcast6)

Of all the models tested, small_fit, small_fit2, and small_fit3 appear to give the best predictions for the sales.
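
One hedged way to back this ranking with numbers is to compare information criteria across the candidates (lower AIC is better; note that models with different orders of differencing are not strictly comparable on AIC). A minimal sketch:

# Compare the candidate models by AIC (lower is better)
fits <- list(fit = small_fit, fit2 = small_fit2, fit3 = small_fit3,
             fit4 = small_fit4, fit5 = small_fit5, fit6 = small_fit6)
sapply(fits, AIC)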