In this project, I use various R packages to explore the sales data and build models that can predict future sales.

The analysis presents the ideas, the path that led to them, the results, and the accuracy of the model.

Load the required packages:

library(ggplot2)
library(forecast)
library(tseries)
library(tidyverse)
library("rio")
library(readxl)
library(lubridate)
library(dplyr)
library(forcats)
library(kableExtra)
sales.dat <- read.csv("training.csv")

Look at the first 10 rows of the sales data frame:

df <- head(sales.dat, 10)
kable(df) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
order_id  contact_id  purchase_date  product_id  quantity  sales_amount
 4852169    17846335     2012-01-01     3308032         1         18.71
 3570597    18759585     2012-01-01    37220513         2         83.50
32096674     5025194     2012-01-01    10259603         1         19.95
 2904402    29624779     2012-01-01     3308032         1         18.71
38369470    42287117     2012-01-01     3308032         1         18.71
10514626    34476502     2012-01-01    21913642         2         39.90
36513528     5025194     2012-01-01    30518406         1         45.56
 6595294    14603912     2012-01-01    16308207         4        159.80
18187529    10412066     2012-01-01    11543327         1        239.95
37889477    35117040     2012-01-01    21103908         1         37.95

Convert the purchase dates to Date format and add columns for year, month, day, and weekday.

# Convert the purchase dates to Date format
sales.dat$purchase_date <- ymd(sales.dat$purchase_date)
sales.dat$Date <- sales.dat$purchase_date
sales.dat$year <- as.factor(year(sales.dat$purchase_date))
sales.dat$month <- as.factor(month(sales.dat$purchase_date))
sales.dat$day <- as.factor(day(sales.dat$purchase_date))
sales.dat$weekday <- as.factor(weekdays(sales.dat$purchase_date))

Remove zero-sales rows from the data

# Remove zero sales in the data 
sales.dat <- sales.dat[sales.dat$sales_amount > 0, ]

HIGHS, LOWS, BEST AND WORST SALES

Overall highest sales

top.sales <- sales.dat %>% 
  slice_max(sales_amount, n=5)

kable(top.sales) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
order_id  contact_id  purchase_date  product_id  quantity  sales_amount        Date  year  month  day    weekday
10199301     8166406     2012-02-17    23237796        50      10997.50  2012-02-17  2012      2   17     Friday
16120223    36576499     2012-12-05    18800419        50       4997.50  2012-12-05  2012     12    5  Wednesday
28260999    36576499     2013-03-12    18800419        50       4997.50  2013-03-12  2013      3   12    Tuesday
25015419    10558171     2012-02-27    32836058         1       4482.23  2012-02-27  2012      2   27     Monday
31348848    27348661     2013-01-18     7796190         2       4223.92  2013-01-18  2013      1   18     Friday

Highest Sales for 2012

top.2012 <- sales.dat %>% filter(year == 2012) %>%
  slice_max(sales_amount, n=5)

kable(top.2012) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
order_id  contact_id  purchase_date  product_id  quantity  sales_amount        Date  year  month  day    weekday
10199301     8166406     2012-02-17    23237796        50      10997.50  2012-02-17  2012      2   17     Friday
16120223    36576499     2012-12-05    18800419        50       4997.50  2012-12-05  2012     12    5  Wednesday
25015419    10558171     2012-02-27    32836058         1       4482.23  2012-02-27  2012      2   27     Monday
20041945    29412637     2012-05-15    27441828       260       4147.00  2012-05-15  2012      5   15    Tuesday
46251672      122106     2012-05-17    39469603         1       3450.62  2012-05-17  2012      5   17   Thursday

Highest Sales for 2013

top.2013 <- sales.dat %>% filter(year == 2013) %>%
  slice_max(sales_amount, n=5)

kable(top.2013) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
order_id  contact_id  purchase_date  product_id  quantity  sales_amount        Date  year  month  day    weekday
28260999    36576499     2013-03-12    18800419        50       4997.50  2013-03-12  2013      3   12    Tuesday
31348848    27348661     2013-01-18     7796190         2       4223.92  2013-01-18  2013      1   18     Friday
 7857999    36576499     2013-01-25    18800419        40       3998.00  2013-01-25  2013      1   25     Friday
 9099385    16408693     2013-03-26     2034783         1       3503.03  2013-03-26  2013      3   26    Tuesday
42213345    19201239     2013-03-30     2034783         1       3483.21  2013-03-30  2013      3   30   Saturday
 5363792    22945018     2013-04-11     2034783         1       3483.21  2013-04-11  2013      4   11   Thursday

Best performing months of 2012 and 2013

best.month.2012 <- sales.dat %>%
  filter(year == 2012) %>%
  group_by(month) %>% 
  summarize(Total_sales = sum(sales_amount)) %>%
  arrange(desc(Total_sales))

kable(best.month.2012) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
month  Total_sales
   11      2242950
   12      2172004
    1      1767764
    6      1660848
    7      1655123
    5      1639885
    3      1620713
    8      1590792
   10      1587317
    4      1515504
    9      1500107
    2      1491336

best.month.2013 <- sales.dat%>%
  filter(year == 2013) %>%
  group_by(month) %>%
  summarize(Total_sales = sum(sales_amount)) %>%
  arrange(desc(Total_sales))

kable(best.month.2013) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
month  Total_sales
   12    1872236.2
   11    1647369.4
    1    1383460.4
   10    1372081.1
    6    1249496.1
    4    1240470.6
    5    1236225.7
    9    1202684.2
    8    1195198.4
    7    1184992.2
    3    1093032.6
    2     971850.8

Total revenue for 2012 and 2013

rev.2012 <- sales.dat%>%
  filter(year == 2012) %>%
  summarize(total = sum(sales_amount))

kable(rev.2012) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
total
20444342

rev.2013 <- sales.dat%>%
  filter(year == 2013) %>%
  summarize(total = sum(sales_amount))

kable(rev.2013) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
total
15649098

Percent change in revenue: compute 2013 revenue as a percentage of 2012 revenue (the shortfall from 100% is the drop in sales).

rev.change <- ((rev.2013 / rev.2012) * 100)

kable(rev.change) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
total
76.54488
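
The table shows 2013 revenue at about 76.5% of 2012 revenue, i.e. a drop of roughly 23.5%. A minimal sketch computing the drop directly from the rev.2012 and rev.2013 summaries above:

# Percent drop in revenue from 2012 to 2013 (complement of the ratio above)
rev.drop <- (1 - (rev.2013$total / rev.2012$total)) * 100
rev.drop # about 23.5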

Total number of products

prods <- sales.dat %>%
  select(product_id) %>%
  distinct() %>%
  summarise("No of Products" = n()) #there are 16471 product

kable(prods) %>%
  kable_styling(bootstrap_options = "striped", font_size = 12, full_width = F)
No of Products
16471

Most popular products over the two years: products with the highest sales in terms of quantity.

popular_products <- sales.dat %>%
  group_by(product_id) %>%
  summarize(Prd.Cnt = sum(quantity)) %>%
  ungroup() %>%
  arrange(desc(Prd.Cnt))
options(scipen = 999)

# Plot top 20 popular products in 2012 and 2013 by sales
head(popular_products, 20) %>%
  ggplot(aes(
    x = reorder(as.factor(product_id), Prd.Cnt),
    y = Prd.Cnt, fill = as.factor(product_id)
  )) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  labs(y = "Total Sales", x = "Product ID", title = "Top 10 Highest Selling Products (2012-2013)") +
  coord_flip()

Visualize daily sales over time for 2012 and 2013. (Note that par(mfrow = c(2, 1)) only affects base graphics, not ggplot2, so the two charts are drawn separately.)
  
sales.dat %>% 
  filter(year == 2012) %>%
  ggplot(., aes(x = purchase_date, y = sales_amount)) +
  geom_col()+
  ggtitle("Sales Per week of 2012")

sales.dat %>% 
  filter(year == 2013) %>%
  ggplot(., aes(x = purchase_date, y = sales_amount)) +
  geom_col()+
  ggtitle("Sales Per week of 2013")

Monthly Sales 2012

sales.dat %>% 
  group_by(month) %>%
  filter(year == 2012) %>%
  ggplot(., aes(x = as.factor(month),
                y = sales_amount)) +
  geom_col() +
  ggtitle("Monthly Sales 2012") +
  coord_flip()

Monthly Sales 2013

sales.dat %>% 
  group_by(month) %>%
  filter(year == 2013) %>%
  ggplot(., aes(x = as.factor(month),
                y = sales_amount)) +
  geom_col() +
  ggtitle("Monthly Sales 2013") +
  coord_flip()

Product popularity: most popular products over the two years (2012-2013), i.e. products with the highest sales in terms of quantity.

popular_products <- sales.dat %>%
  group_by(product_id) %>%
  summarize(Prd.Cnt = sum(quantity)) %>%
  ungroup() %>%
  arrange(desc(Prd.Cnt))
options(scipen = 999)

Plot of the top 20 most popular products over the two years by quantity sold

head(popular_products, 20) %>%
  ggplot(aes(
    x = reorder(as.factor(product_id), Prd.Cnt),
    y = Prd.Cnt, fill = as.factor(product_id)
  )) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  labs(y = "Total Sales", x = "Products", title = "Highest Selling Products") +
  coord_flip()

Total number of contacts (customers)

sales.dat %>%
  select(contact_id) %>%
  distinct() %>%
  summarise("No of Contacts" = n())
##   No of Contacts
## 1         262294
## There were 262,294 unique customers over the two years (2012-2013)

Most popular products (by quantity) per customer, 2012-2013

pop.prod.per.cont <- sales.dat %>%
  group_by(contact_id, product_id) %>%
  summarize(Qty = sum(quantity)) %>%
  filter(Qty == max(Qty)) %>%
  arrange(desc(Qty)) %>%
  ungroup()
## `summarise()` has grouped output by 'contact_id'. You can override using the `.groups` argument.
ggplot(
  data = head(pop.prod.per.cont, 20),
  aes(x = reorder(as.factor(contact_id), Qty), y = Qty, fill = as.factor(product_id))
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Most Selling Product per Contact", x = "Contact ID", y = "Qty Sold", fill = "Product ID")

Most popular products in 2012 and 2013 respectively: products with the highest sales in terms of quantity.

pop.prods.2012 <- sales.dat %>%
  filter(year == 2012)%>%
  group_by(product_id) %>%
  summarize(Prd.Cnt = sum(quantity)) %>%
  ungroup() %>%
  arrange(desc(Prd.Cnt))
options(scipen = 999)

Plot of the top 20 most popular products of 2012 by quantity sold

head(pop.prods.2012, 20) %>%
  ggplot(aes(
    x = reorder(as.factor(product_id), Prd.Cnt),
    y = Prd.Cnt, fill = as.factor(product_id)
  )) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  labs(y = "Total Sales", x = "Product ID", title = "Highest Selling Products in 2012") +
  coord_flip()

Plot of the top 20 most popular products of 2013 by quantity sold

pop.prods.2013 <- sales.dat %>%
  filter(year == 2013)%>%
  group_by(product_id) %>%
  summarize(Prd.Cnt = sum(quantity)) %>%
  ungroup() %>%
  arrange(desc(Prd.Cnt))
options(scipen = 999)
head(pop.prods.2013, 20) %>%
  ggplot(aes(
    x = reorder(as.factor(product_id), Prd.Cnt),
    y = Prd.Cnt, fill = as.factor(product_id)
  )) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  labs(y = "Total Sales", x = "Product ID", title = "Highest Selling Products in 2013") +
  coord_flip()

Stacked yearly sales

sales.dat %>% 
  group_by(year) %>%
  ggplot(., aes(x = month,
                y = sales_amount,
                fill = as.factor(year))) +
  geom_col() +
  ggtitle("2012 to 2013 Monthly Sales")

CLEANING THE DATA FOR MODELING AND PREDICTION

Data Cleaning and Exploratory Data Analysis

Use a sample (a fraction of the original data) for modeling and easier visualization.

# Sample 1,000 data points; set a seed (arbitrary here) so the sample is
# reproducible, and order by date so the time series built below is chronological
set.seed(123)
small_data <- sample_n(sales.dat, 1000) %>% arrange(Date)

Plot of the sales field over time to show variance in this field

#Plot the sales field over time - show variance in this field
ggplot(small_data, aes(Date, sales_amount)) +
  geom_line() +
  scale_x_date("month") +
  ylab("Monthly Sales") +
  xlab("")

Plot of month over month to see the range and outliers

ggplot(small_data, aes(Date, sales_amount)) +
  geom_point(color = "navyblue") +
  facet_wrap( ~ month) +
  scale_x_date("month") +
  ylab("Monthly Sales") +
  xlab("")

# Create a time series object from the sales column and clean it with tsclean()
small.sales.ts <- ts(small_data[, c("sales_amount")])

small_data$clean_count <- tsclean(small.sales.ts)


# tsclean() identifies and replaces outliers, and fills in missing values if there are any

Plot of the cleaned data

# Plot the cleaned data
ggplot() +
  geom_line(data = small_data, aes(x = Date, y = clean_count)) + ylab("Cleaned Count")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

Monthly and Weekly Moving averages

# Get the monthly and weekly moving averages (ma) and compare to cleaned daily data which still has lots of variance 
small_data$cnt_ma <- ma(small_data$clean_count, order = 7) # weekly MA, using the cleaned (outlier-free) series
small_data$cnt_ma30 <- ma(small_data$clean_count, order = 30) # monthly ma


ggplot() +
  geom_line(data = small_data, aes(x = Date,y = clean_count, color = "Counts")) +
  geom_line(data = small_data, aes(x = Date, y = cnt_ma, color = "Weekly Moving Average")) +
  geom_line(data = small_data, aes(x = Date, y = cnt_ma30, color = "Monthly Moving Average")) +
  ylab("Sales")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
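
The "Don't know how to automatically pick scale" messages appear because tsclean() and ma() return ts objects; coercing them to plain numeric vectors avoids the warning. A minimal sketch (behavior is otherwise unchanged):

# Coerce the ts columns to numeric so ggplot2 picks scales silently
small_data$clean_count <- as.numeric(small_data$clean_count)
small_data$cnt_ma <- as.numeric(small_data$cnt_ma)
small_data$cnt_ma30 <- as.numeric(small_data$cnt_ma30)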

Decomposing the time series data

# Decompose the time series - taking seasonality, trend, and cycle into account
# calculate the seasonal component with stl()
small_count_ma <- ts(na.omit(small_data$cnt_ma), frequency = 30)
small_decomp <- stl(small_count_ma, s.window = "periodic")
small_deseasonal_cnt <- seasadj(small_decomp) # used later in the ARIMA model
plot(small_decomp)
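
To put rough numbers on what the decomposition shows, the variance of each component can be compared; this is a quick sketch using the seasonal, trend, and remainder columns that stl() stores in $time.series:

# Approximate share of variance explained by each STL component
comp.var <- apply(small_decomp$time.series, 2, var)
round(comp.var / sum(comp.var), 3)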

Test for stationarity

# Test for stationarity - first a visual check for stationary variance,
# then the Augmented Dickey-Fuller (ADF) test
adf.test(small_count_ma, alternative = "stationary") # more negative statistic = stronger evidence of stationarity
## Warning in adf.test(small_count_ma, alternative = "stationary"): p-value smaller
## than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  small_count_ma
## Dickey-Fuller = -7.7034, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
# the series is stationary (p-value = 0.01; reject the unit-root null)
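
As a cross-check, the KPSS test (from the already-loaded tseries package) reverses the hypotheses: its null is stationarity, so a large p-value here would agree with the ADF result. A minimal sketch:

# KPSS test - null hypothesis is level stationarity (the opposite of ADF)
kpss.test(small_count_ma, null = "Level")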

Autocorrelations and choosing the model order

#ACF plots display correlation between series and its lags
Acf(small_count_ma, main='')

#PACF plots display correlation between series and its lag that is explained by previous lags 
Pacf(small_count_ma, main='')

Differencing

small_count_d1 <- diff(small_deseasonal_cnt, differences = 1)

small_count_d2 <- diff(small_deseasonal_cnt, differences = 2)

# Run the ADF test on each level of differencing
small_adf0 <- adf.test(small_count_ma, alternative = "stationary") # the more negative the ADF statistic, the better; here ADF = -7.70 with a p-value of 0.01
## Warning in adf.test(small_count_ma, alternative = "stationary"): p-value smaller
## than printed p-value
small_adf1 <- adf.test(small_count_d1, alternative = "stationary")
## Warning in adf.test(small_count_d1, alternative = "stationary"): p-value smaller
## than printed p-value
small_adf2 <- adf.test(small_count_d2, alternative = "stationary")
## Warning in adf.test(small_count_d2, alternative = "stationary"): p-value smaller
## than printed p-value
small_adf0;small_adf1;small_adf2
## 
##  Augmented Dickey-Fuller Test
## 
## data:  small_count_ma
## Dickey-Fuller = -7.7034, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  small_count_d1
## Dickey-Fuller = -12.717, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  small_count_d2
## Dickey-Fuller = -19.619, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
# Check the ACF and PACF to identify the best-fitting order of differencing

Acf(small_count_ma, main='')

Acf(small_count_d1, main='') # this fits better

Pacf(small_count_ma, main='')

Pacf(small_count_d1, main='')
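
The forecast package can also suggest the order of differencing directly: ndiffs() applies a unit-root test and returns the minimum number of differences needed. A quick sketch on the deseasonalized series:

# Estimate how many differences are needed for stationarity
ndiffs(small_deseasonal_cnt)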

FITTING THE ARIMA MODEL

# Get auto fit p,d,q values

auto.arima(small_deseasonal_cnt, seasonal = FALSE)
## Series: small_deseasonal_cnt 
## ARIMA(3,0,2) with non-zero mean 
## 
## Coefficients:
##          ar1     ar2      ar3     ma1      ma2     mean
##       0.7085  0.7737  -0.6344  0.2622  -0.5231  53.9924
## s.e.  0.0686  0.0614   0.0525  0.0775   0.0682   1.3309
## 
## sigma^2 estimated as 75.23:  log likelihood=-3555.54
## AIC=7125.08   AICc=7125.19   BIC=7159.39

Model Evaluation

#Evaluate and iterate the model 
small_fit <- auto.arima(small_deseasonal_cnt, seasonal = FALSE)
tsdisplay(residuals(small_fit), lag.max = 45, main = 'ARIMA(3,0,2) Model Residuals')

small_fit2 <- arima(small_deseasonal_cnt, order = c(3,0,9))
## Warning in arima(small_deseasonal_cnt, order = c(3, 0, 9)): possible convergence
## problem: optim gave code = 1
tsdisplay(residuals(small_fit2), lag.max = 20, main = 'ARIMA(3,0,9) Model Residuals')
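
Beyond the visual residual check, a Ljung-Box test (Box.test() in base R) formally tests whether the residuals are uncorrelated; a large p-value is consistent with white-noise residuals. A sketch for both fits, with fitdf set to p + q of each model:

# Ljung-Box test for residual autocorrelation (null: no autocorrelation)
Box.test(residuals(small_fit), lag = 20, type = "Ljung-Box", fitdf = 5)   # ARIMA(3,0,2): p + q = 5
Box.test(residuals(small_fit2), lag = 20, type = "Ljung-Box", fitdf = 12) # ARIMA(3,0,9): p + q = 12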

Forecast with the fitted models

# Forecast 30 days ahead with each fitted model

small_fcast <- forecast(small_fit, h=30)
small_fcast2 <- forecast(small_fit2, h=30)
plot(small_fcast)

plot(small_fcast2)

Model Testing

#TEST MODEL PERFORMANCE WITH A HOLDOUT SET
hold.set <- window(ts(small_deseasonal_cnt), start = 350) # holdout window matches the rows excluded from training below
fit_no_holdset <- arima(ts(small_deseasonal_cnt[-c(350:994)]), order = c(3,0,9))
## Warning in log(s2): NaNs produced
## Warning in arima(ts(small_deseasonal_cnt[-c(350:994)]), order = c(3, 0, :
## possible convergence problem: optim gave code = 1
fcast_no_holdset <- forecast(fit_no_holdset, h=35)
plot(fcast_no_holdset, main = "")
lines(ts(small_deseasonal_cnt))
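
The holdout comparison can be quantified with forecast::accuracy(), which reports RMSE, MAE, and related measures for a forecast against held-out observations. A sketch comparing the 35-step forecast with the first 35 holdout values:

# Accuracy of the forecast against the start of the holdout set
accuracy(fcast_no_holdset, hold.set[1:35])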

# the model needs seasonality added back; the forecast above is too linear to be realistic
fit_w_seasonality <- auto.arima(small_deseasonal_cnt, seasonal = TRUE)
seasonality.fcast <- forecast(fit_w_seasonality, h=90)
plot(seasonality.fcast)

lines(ts(small_count_ma)) #plots the actual values
lines(ts(small_deseasonal_cnt)) #plots the deseasonalized values

# the moving averages stay within the 95% forecast interval, an indication that the model is good

FURTHER TESTING AND ANALYSIS OF THE MODEL

# Test against the auto.arima model

small_fit3 <- auto.arima(small_deseasonal_cnt, seasonal = FALSE)

tsdisplay(residuals(small_fit3), lag.max = 15, main = "auto.arima Model Residuals")

# There is evidence of a lag at 7; a higher-order q value (q = 7) might be necessary
small_fit4 <- arima(small_deseasonal_cnt, order = c(1,1,7))

tsdisplay(residuals(small_fit4), lag.max = 15, main = "ARIMA(1,1,7) Model Residuals")

# Try a baseline ARIMA(1,1,1) model
small_fit5 <- arima(small_deseasonal_cnt, order = c(1,1,1))

tsdisplay(residuals(small_fit5), lag.max = 15, main = "ARIMA(1,1,1) Model Residuals")

small_fit6 <- arima(small_deseasonal_cnt, order = c(3,0,7))
tsdisplay(residuals(small_fit6), lag.max = 20, main = 'ARIMA(3,0,7) Model Residuals')

# Plot the forecasts and select the best model

small_fcast <- forecast(small_fit, h=30)
small_fcast
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 34.13333       47.48162 36.36596 58.59728 30.48169 64.48155
## 34.16667       48.31492 32.82290 63.80693 24.62192 72.00791
## 34.20000       48.60771 29.93075 67.28466 20.04377 77.17165
## 34.23333       49.91492 29.31586 70.51399 18.41137 81.41848
## 34.26667       50.53902 28.65727 72.42077 17.07377 84.00426
## 34.30000       51.80689 29.28427 74.32950 17.36152 86.25226
## 34.33333       52.35877 29.48740 75.23014 17.38003 87.33751
## 34.36667       53.33484 30.35174 76.31794 18.18522 88.48446
## 34.40000       53.64906 30.62935 76.66878 18.44344 88.85469
## 34.43333       54.27679 31.25660 77.29697 19.07045 89.48312
## 34.46667       54.34543 31.32291 77.36795 19.13553 89.55534
## 34.50000       54.68040 31.63861 77.72220 19.44102 89.91979
## 34.53333       54.57261 31.50850 77.63672 19.29910 89.84613
## 34.56667       54.71186 31.61445 77.80927 19.38743 90.03630
## 34.60000       54.51461 31.39264 77.63658 19.15261 89.87661
## 34.63333       54.55098 31.40424 77.69772 19.15109 89.95087
## 34.66667       54.33578 31.17534 77.49622 18.91495 89.75662
## 34.70000       54.33659 31.16506 77.50811 18.89880 89.77438
## 34.73333       54.14758 30.97179 77.32338 18.70326 89.59190
## 34.76667       54.15081 30.97215 77.32947 18.70211 89.59952
## 34.80000       54.00636 30.82723 77.18549 18.55694 89.45578
## 34.83333       54.02641 30.84704 77.20577 18.57663 89.47619
## 34.86667       53.92680 30.74739 77.10620 18.47696 89.37664
## 34.90000       53.96338 30.78394 77.14283 18.51348 89.41329
## 34.93333       53.89951 30.71970 77.07932 18.44905 89.34997
## 34.96667       53.94576 30.76574 77.12578 18.49497 89.39654
## 35.00000       53.90590 30.72543 77.08636 18.45444 89.35735
## 35.03333       53.95396 30.77330 77.13461 18.50220 89.40571
## 35.06667       53.92783 30.74688 77.10877 18.47563 89.38002
## 35.10000       53.97179 30.79075 77.15282 18.51945 89.42412
plot(small_fcast)

small_fcast2 <- forecast(small_fit2, h=30)
plot(small_fcast2)

small_fcast3 <- forecast(small_fit3, h=30)
plot(small_fcast3)

small_fcast4 <- forecast(small_fit4, h=30)
plot(small_fcast4)

small_fcast5 <- forecast(small_fit5, h=30)
plot(small_fcast5)

small_fcast6 <- forecast(small_fit6, h=30)
plot(small_fcast6)

# Predict 90 days ahead with each model
small_fcast <- forecast(small_fit, h=90)
plot(small_fcast)

small_fcast2 <- forecast(small_fit2, h=90)
plot(small_fcast2)

small_fcast3 <- forecast(small_fit3, h=90)
plot(small_fcast3)

small_fcast4 <- forecast(small_fit4, h=90)
plot(small_fcast4)

small_fcast5 <- forecast(small_fit5, h=90)
plot(small_fcast5)

small_fcast6 <- forecast(small_fit6, h=90)
plot(small_fcast6)

Of all the models tested, small_fit, small_fit2, and small_fit3 appear to give the best predictions for the sales.
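
One hedged way to back this ranking with numbers is to compare information criteria across the candidates (lower AIC is better; note that models with different orders of differencing are not strictly comparable on AIC). A minimal sketch:

# Compare the candidate models by AIC (lower is better)
fits <- list(fit = small_fit, fit2 = small_fit2, fit3 = small_fit3,
             fit4 = small_fit4, fit5 = small_fit5, fit6 = small_fit6)
sapply(fits, AIC)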