1 Objective

  1. Compare model performance between Linear model (OLS) vs Non-linear model (e.g. Support Vector Machine, Random Forest)
  2. Performance metrics considered: Root mean squared error (RMSE) and R2
  3. Select best model for Prediction

2 Clear Workspace

rm(list = ls())

3 Load library

library(ggplot2)
library(dplyr)
library(pastecs)
library(car)
library(caret)
library(gvlma)
library(readxl)

4 Load data

data <- raw.data

5 Data wrangling

5.1 Calculate Total spending

data$Total <- data$Quantity * data$UnitPrice

data <- data %>% filter(!Total <= 0)

5.2 Select column

cust_data <- select(data, c(InvoiceDate, CustomerID, Total))

5.3 RFM indicators

# rename column 
cust_data <- rename(cust_data,
                    customer_id = CustomerID, 
                    purchase_amount = Total, 
                    date_of_purchase= InvoiceDate)
# Convert purchase date to R date object
cust_data$date_of_purchase <- as.Date(cust_data$date_of_purchase, "%Y-%m-%d")
## Warning in as.POSIXlt.POSIXct(x, tz = tz): unknown timezone '%Y-%m-%d'
# recency
cust_data$days_since <- as.numeric(difftime(time1 = "2012-01-01",
                                            time2 = cust_data$date_of_purchase,
                                            units = "days"))

cust_data$year_of_purchase <- as.numeric(format(cust_data$date_of_purchase, "%Y"))

6 Exploratory Data Analysis

6.1 Dataset of customer active in 2010

customers_2010 <- cust_data %>% 
  filter(year_of_purchase == 2010) %>% 
    group_by(customer_id) %>% 
      summarise(recency = min(days_since) - 365,
                frequency = n(),
                first_purchase =  max(days_since) - 365,
                avg_amount = mean(purchase_amount),
                revenue =  sum(purchase_amount))

summary(customers_2010)
##   customer_id       recency         frequency        first_purchase  
##  Min.   :12347   Min.   : 8.708   Min.   :    1.00   Min.   : 8.708  
##  1st Qu.:14051   1st Qu.:15.708   1st Qu.:    9.00   1st Qu.:18.708  
##  Median :15545   Median :21.708   Median :   18.00   Median :23.708  
##  Mean   :15472   Mean   :20.884   Mean   :   46.82   Mean   :23.282  
##  3rd Qu.:17001   3rd Qu.:25.708   3rd Qu.:   38.00   3rd Qu.:28.708  
##  Max.   :18269   Max.   :30.708   Max.   :15323.00   Max.   :30.708  
##  NA's   :1                                                           
##    avg_amount          revenue         
##  Min.   :   1.621   Min.   :    12.45  
##  1st Qu.:  12.284   1st Qu.:   198.30  
##  Median :  19.148   Median :   324.31  
##  Mean   :  45.990   Mean   :   929.74  
##  3rd Qu.:  31.977   3rd Qu.:   589.42  
##  Max.   :3794.400   Max.   :251032.25  
## 

6.2 Dataset of customer active in 2011

customers_2011 <- cust_data %>%
                    filter(year_of_purchase == 2011) %>%
                       group_by(customer_id) %>%
                          summarise(recency = min(days_since), 
                             frequency = n(), 
                             first_purchase = max(days_since),
                             avg_amount = mean(purchase_amount),
                             revenue = sum(purchase_amount)) 

summary(customers_2011)
##   customer_id       recency         frequency        first_purchase  
##  Min.   :12346   Min.   : 22.71   Min.   :     1.0   Min.   : 22.71  
##  1st Qu.:13807   1st Qu.: 39.71   1st Qu.:    17.0   1st Qu.:115.46  
##  Median :15281   Median : 69.71   Median :    40.0   Median :249.71  
##  Mean   :15291   Mean   :107.04   Mean   :   115.8   Mean   :222.50  
##  3rd Qu.:16772   3rd Qu.:150.71   3rd Qu.:    97.0   3rd Qu.:316.71  
##  Max.   :18287   Max.   :361.71   Max.   :116897.0   Max.   :361.71  
##  NA's   :1                                                           
##    avg_amount          revenue         
##  Min.   :    2.24   Min.   :      3.8  
##  1st Qu.:   12.33   1st Qu.:    304.4  
##  Median :   17.69   Median :    665.3  
##  Mean   :   69.10   Mean   :   2332.4  
##  3rd Qu.:   24.85   3rd Qu.:   1615.9  
##  Max.   :77183.60   Max.   :1504244.4  
## 

6.3 Training set

# 2014 customers who stayed loyal in 2015
active_2010 <- semi_join(customers_2010, customers_2011, by = "customer_id")
summary(active_2010)
##   customer_id       recency         frequency        first_purchase  
##  Min.   :12347   Min.   : 8.708   Min.   :    1.00   Min.   : 8.708  
##  1st Qu.:14039   1st Qu.:15.708   1st Qu.:   10.00   1st Qu.:18.708  
##  Median :15528   Median :19.708   Median :   19.00   Median :23.708  
##  Mean   :15446   Mean   :20.523   Mean   :   50.59   Mean   :23.247  
##  3rd Qu.:17000   3rd Qu.:24.708   3rd Qu.:   38.00   3rd Qu.:28.708  
##  Max.   :18260   Max.   :30.708   Max.   :15323.00   Max.   :30.708  
##  NA's   :1                                                           
##    avg_amount          revenue         
##  Min.   :   1.621   Min.   :    12.45  
##  1st Qu.:  12.177   1st Qu.:   214.89  
##  Median :  19.327   Median :   349.15  
##  Mean   :  47.000   Mean   :  1022.52  
##  3rd Qu.:  32.023   3rd Qu.:   656.92  
##  Max.   :3794.400   Max.   :251032.25  
## 

Note: There is 1 NA’s row. We need to remove it to make the row.names. If not remove, it says: “Error in rowNamesDF<-(x, value = value) : missing values in ‘row.names’ are not allowed”

active_2010 <- active_2010[complete.cases(active_2010), ]
row.names(active_2010) <- active_2010$customer_id
## Warning: Setting row names on a tibble is deprecated.
active_2010 <- select(active_2010, -customer_id)

6.4 Test set

active_2011 <- semi_join(customers_2011, customers_2010, by = "customer_id")
summary(active_2011)
##   customer_id       recency         frequency        first_purchase  
##  Min.   :12347   Min.   : 22.71   Min.   :     1.0   Min.   : 25.71  
##  1st Qu.:14039   1st Qu.: 29.71   1st Qu.:    35.0   1st Qu.:257.21  
##  Median :15528   Median : 43.71   Median :    95.0   Median :321.71  
##  Mean   :15446   Mean   : 78.36   Mean   :   339.1   Mean   :283.46  
##  3rd Qu.:17000   3rd Qu.: 86.21   3rd Qu.:   203.0   3rd Qu.:349.21  
##  Max.   :18260   Max.   :359.71   Max.   :116897.0   Max.   :361.71  
##  NA's   :1                                                           
##    avg_amount         revenue         
##  Min.   :  2.833   Min.   :     10.0  
##  1st Qu.: 10.725   1st Qu.:    587.7  
##  Median : 17.933   Median :   1670.4  
##  Mean   : 35.613   Mean   :   7097.4  
##  3rd Qu.: 27.302   3rd Qu.:   3808.8  
##  Max.   :828.934   Max.   :1504244.4  
## 

7 Data exploration

7.1 Descriptive stats

options(digits = 2, scipen = 99)
stat.desc(active_2010)
##               recency frequency first_purchase avg_amount   revenue
## nbr.val        766.00     766.0         766.00      766.0     766.0
## nbr.null         0.00       0.0           0.00        0.0       0.0
## nbr.na           0.00       0.0           0.00        0.0       0.0
## min              8.71       1.0           8.71        1.6      12.4
## max             30.71     668.0          30.71     3794.4   27834.6
## range           22.00     667.0          22.00     3792.8   27822.2
## sum          15732.58   23476.0       17799.58    36032.5  533237.4
## median          19.71      19.0          23.71       19.4     348.1
## mean            20.54      30.6          23.24       47.0     696.1
## SE.mean          0.21       1.4           0.20        6.0      56.5
## CI.mean.0.95     0.42       2.8           0.40       11.9     110.9
## var             34.71    1574.0          31.09    28032.6 2443375.8
## std.dev          5.89      39.7           5.58      167.4    1563.1
## coef.var         0.29       1.3           0.24        3.6       2.2

7.2 Distributions

7.2.1 Revenue per customer

ggplot(active_2010, aes(x = revenue)) +
  geom_histogram(bins = 40, fill = "steelblue") +
  theme_classic() +
  labs(x = "revenue", title = "Annual revenue p/customer")

Note: The distribution is severely right skewed.

summary(active_2010$revenue)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12     215     348     696     656   27835

7.2.2 Frequency and Recency

ggplot(active_2010, aes(x = frequency)) +
  geom_histogram(bins = 20, fill = "steelblue") +
    theme_classic() +
      labs(x = "frequency", title = "Purchase frequency")

ggplot(active_2010, aes(x = recency)) +
   geom_histogram(bins = 20, fill = "steelblue") +
   theme_classic() +
   labs(x = "recency", title = "Recency in Days")

summary(active_2010$frequency)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1      10      19      31      38     668

7.2.3 First purchase and Average amount spent

ggplot(active_2010, aes(x = first_purchase)) +
   geom_histogram(fill = "steelblue") +
   theme_classic() +
   labs(x = "first purchase (no.of days since)", title = "Days since first purchase") 

ggplot(active_2010, aes(x = avg_amount)) +
   geom_histogram(bins = 40, fill = "steelblue") +
   theme_classic() +
   labs(x = "average amount", title = "Average amount spent per transaction")

7.3 Correlations

cor1 <- cor(active_2010[,-1])
cor1
##                frequency first_purchase avg_amount revenue
## frequency           1.00         0.1577    -0.1124    0.28
## first_purchase      0.16         1.0000     0.0088    0.10
## avg_amount         -0.11         0.0088     1.0000    0.35
## revenue             0.28         0.1033     0.3531    1.00

Univariate density plots are shown on the diagonal whilst pairwise plots are shown on either side. The green lines are non-parametric regression smoothers.

cor_plot <- scatterplotMatrix(active_2010[,-1], spread = FALSE, main = "Scatterplot Matrix")
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter

8 Modelling

8.1 Model fitting

8.1.1 Cross validation settings for all models

ctrl <- trainControl(method = "repeatedcv", 
                     number = 10, 
                     repeats = 5)

8.1.2 Non-linear models

Performance metrics; RMSE; R2

# Random Forests 

set.seed(32)

rF_fit <- train(revenue ~.,
                data = active_2010,
                method = "rf",
                metric = "RMSE",
                trControl = ctrl,
                importance =  TRUE)

rF_fit 
## Random Forest 
## 
## 766 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 690, 689, 689, 690, 689, 689, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE  Rsquared  MAE
##   2     956   0.69      223
##   3     943   0.73      183
##   4     944   0.74      170
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
# Multivariate adaptive regression splines (MARS) 

set.seed(32)

mars_fit <- train(revenue ~.,
                data = active_2010,
                method = "earth",
                metric = "RMSE",
                tuneLength = 10,
                trControl = ctrl)

mars_fit
## Multivariate Adaptive Regression Spline 
## 
## 766 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 690, 689, 689, 690, 689, 689, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE  Rsquared  MAE
##    2      1702  0.14      616
##    3      1585  0.35      627
##    4      1461  0.41      578
##    5      1473  0.45      571
##    6      1448  0.50      529
##    8      1413  0.53      524
##    9      1412  0.53      525
##   10      1429  0.52      529
##   11      1437  0.52      532
##   13      1438  0.52      531
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 9 and degree = 1.
# Support Vector Machines

set.seed(32)

svm_fit <- train(revenue~.,
                  data = active_2010,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 10,
                  trControl = ctrl)
svm_fit
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 766 samples
##   4 predictor
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 690, 689, 689, 690, 689, 689, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE  Rsquared  MAE
##     0.25  1130  0.46      360
##     0.50  1074  0.51      326
##     1.00  1012  0.56      290
##     2.00   968  0.60      271
##     4.00   925  0.63      260
##     8.00   903  0.65      253
##    16.00   893  0.65      247
##    32.00   888  0.65      246
##    64.00   886  0.65      245
##   128.00   886  0.65      245
## 
## Tuning parameter 'sigma' was held constant at a value of 0.94
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.94 and C = 128.

8.1.3 Linear Model

# Ordinary least squares 

lm_fit <- train(revenue ~.,
                data = active_2010,
                method = "lm",
                trControl = ctrl)

lm_fit 
## Linear Regression 
## 
## 766 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 690, 688, 689, 690, 690, 688, ... 
## Resampling results:
## 
##   RMSE  Rsquared  MAE
##   1418  0.37      490
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

9 Model evaluation and selection

9.1 Compare models

results <- resamples(list(RandomForest = rF_fit,
                          MARS = mars_fit,
                          SVM = svm_fit,
                          OLS = lm_fit))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: RandomForest, MARS, SVM, OLS 
## Number of resamples: 50 
## 
## MAE 
##              Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RandomForest   34      75    157  183     238  685    0
## MARS          253     416    463  525     592 1075    0
## SVM            82     154    214  245     295  766    0
## OLS           244     366    430  490     577 1044    0
## 
## RMSE 
##              Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RandomForest   57     225    590  943    1472 3361    0
## MARS          340     647    761 1412    1806 4621    0
## SVM           105     391    584  886    1080 3472    0
## OLS           422     634    857 1418    1967 4698    0
## 
## Rsquared 
##              Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RandomForest 0.22    0.51   0.85 0.73    0.91 0.99    0
## MARS         0.21    0.45   0.54 0.53    0.65 0.71    0
## SVM          0.18    0.55   0.70 0.65    0.79 0.95    0
## OLS          0.11    0.30   0.37 0.37    0.45 0.64    0

OLS performs much worse than all the other models on both RMSE and R2.

The random forest model performs best on both evaluation metrics: low RMSE of 943 and highest R2 of 0.73. This model seems to fit the data well. According to R2, 73% of the variation in the response is explained by the model.

The random forest model is selected for prediction model.

SVM is close to that of the MARS.

library(gridExtra)

b1 <- bwplot(results, metric = "RMSE", main = "Comparative Performance\nRoot Mean Squared Error")

b2 <- bwplot(results, metric = "Rsquared", main = "Comparative Performance\nR Squared")

grid.arrange(b1, b2, ncol=2)

Hypothesis tests to confirm that the performance differences observed above are statistically significant.

modelDiff <- diff(results)
summary(modelDiff)
## 
## Call:
## summary.diff.resamples(object = modelDiff)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## MAE 
##              RandomForest         MARS                SVM                
## RandomForest                      -341.3               -61.4             
## MARS         < 0.0000000000000002                      279.8             
## SVM          0.00000014022319065  0.00000000000000115                    
## OLS          0.00000000000011268  1                   0.00000000077357960
##              OLS   
## RandomForest -306.5
## MARS           34.8
## SVM          -245.1
## OLS                
## 
## RMSE 
##              RandomForest MARS    SVM     OLS   
## RandomForest              -468.4    57.6  -475.1
## MARS         0.00276               526.0    -6.7
## SVM          1.00000      0.01452         -532.7
## OLS          0.07056      1.00000 0.04342       
## 
## Rsquared 
##              RandomForest     MARS             SVM              OLS    
## RandomForest                   0.1941           0.0741           0.3576
## MARS         0.00000066415524                  -0.1199           0.1635
## SVM          0.19879          0.00088                            0.2834
## OLS          0.00000000000495 0.00000000045682 0.00000000011771

P-values are close to 0 for almost all pairwise comparisons confirming statistical differences between the models.

9.2 Prediction

Random forest model is used to predict 2015 spending.

9.2.1 Predict model

rF_pred <- predict(rF_fit, newdata = active_2011)
head(rF_pred)
##    1    2    3    4    5    6 
## 3243 1006 1865  642 1214  188

9.2.2 Prediction error

MSE - mean squared error

pred_MSE <- mean((rF_pred - active_2011$revenue)^2)
pred_MSE
## [1] 3216964654

RMSE

sqrt(pred_MSE)
## [1] 56718

9.3 Best model vs null model

9.3.1 Model

useless <- nullModel(x = active_2010[, -5], y = active_2010$revenue)
useless
## Null Classification Model
## 
## Call:
## nullModel.default(x = active_2010[, -5], y = active_2010$revenue)
## 
## Predicted Value: 696
uselessPred <- predict(useless, newdata = active_2011, type = "raw")

9.3.2 Null model predict error

MSE

uselessMSE <- mean((uselessPred - active_2011$revenue)^2)
uselessMSE
## [1] 3277424133

RMSE

sqrt(uselessMSE)
## [1] 57249

The random forest model prediction error is smaller than the null model prediction error.