rm(list = ls())library(ggplot2)
library(dplyr)
library(pastecs)
library(car)
library(caret)
library(gvlma)
library(readxl)data <- raw.datadata$Total <- data$Quantity * data$UnitPrice
data <- data %>% filter(!Total <= 0)cust_data <- select(data, c(InvoiceDate, CustomerID, Total))# rename column
cust_data <- rename(cust_data,
customer_id = CustomerID,
purchase_amount = Total,
date_of_purchase= InvoiceDate)# Convert purchase date to R date object
cust_data$date_of_purchase <- as.Date(cust_data$date_of_purchase, "%Y-%m-%d")## Warning in as.POSIXlt.POSIXct(x, tz = tz): unknown timezone '%Y-%m-%d'
# recency
cust_data$days_since <- as.numeric(difftime(time1 = "2012-01-01",
time2 = cust_data$date_of_purchase,
units = "days"))
cust_data$year_of_purchase <- as.numeric(format(cust_data$date_of_purchase, "%Y"))customers_2010 <- cust_data %>%
filter(year_of_purchase == 2010) %>%
group_by(customer_id) %>%
summarise(recency = min(days_since) - 365,
frequency = n(),
first_purchase = max(days_since) - 365,
avg_amount = mean(purchase_amount),
revenue = sum(purchase_amount))
summary(customers_2010)## customer_id recency frequency first_purchase
## Min. :12347 Min. : 8.708 Min. : 1.00 Min. : 8.708
## 1st Qu.:14051 1st Qu.:15.708 1st Qu.: 9.00 1st Qu.:18.708
## Median :15545 Median :21.708 Median : 18.00 Median :23.708
## Mean :15472 Mean :20.884 Mean : 46.82 Mean :23.282
## 3rd Qu.:17001 3rd Qu.:25.708 3rd Qu.: 38.00 3rd Qu.:28.708
## Max. :18269 Max. :30.708 Max. :15323.00 Max. :30.708
## NA's :1
## avg_amount revenue
## Min. : 1.621 Min. : 12.45
## 1st Qu.: 12.284 1st Qu.: 198.30
## Median : 19.148 Median : 324.31
## Mean : 45.990 Mean : 929.74
## 3rd Qu.: 31.977 3rd Qu.: 589.42
## Max. :3794.400 Max. :251032.25
##
customers_2011 <- cust_data %>%
filter(year_of_purchase == 2011) %>%
group_by(customer_id) %>%
summarise(recency = min(days_since),
frequency = n(),
first_purchase = max(days_since),
avg_amount = mean(purchase_amount),
revenue = sum(purchase_amount))
summary(customers_2011)## customer_id recency frequency first_purchase
## Min. :12346 Min. : 22.71 Min. : 1.0 Min. : 22.71
## 1st Qu.:13807 1st Qu.: 39.71 1st Qu.: 17.0 1st Qu.:115.46
## Median :15281 Median : 69.71 Median : 40.0 Median :249.71
## Mean :15291 Mean :107.04 Mean : 115.8 Mean :222.50
## 3rd Qu.:16772 3rd Qu.:150.71 3rd Qu.: 97.0 3rd Qu.:316.71
## Max. :18287 Max. :361.71 Max. :116897.0 Max. :361.71
## NA's :1
## avg_amount revenue
## Min. : 2.24 Min. : 3.8
## 1st Qu.: 12.33 1st Qu.: 304.4
## Median : 17.69 Median : 665.3
## Mean : 69.10 Mean : 2332.4
## 3rd Qu.: 24.85 3rd Qu.: 1615.9
## Max. :77183.60 Max. :1504244.4
##
# 2014 customers who stayed loyal in 2015
active_2010 <- semi_join(customers_2010, customers_2011, by = "customer_id")
summary(active_2010)## customer_id recency frequency first_purchase
## Min. :12347 Min. : 8.708 Min. : 1.00 Min. : 8.708
## 1st Qu.:14039 1st Qu.:15.708 1st Qu.: 10.00 1st Qu.:18.708
## Median :15528 Median :19.708 Median : 19.00 Median :23.708
## Mean :15446 Mean :20.523 Mean : 50.59 Mean :23.247
## 3rd Qu.:17000 3rd Qu.:24.708 3rd Qu.: 38.00 3rd Qu.:28.708
## Max. :18260 Max. :30.708 Max. :15323.00 Max. :30.708
## NA's :1
## avg_amount revenue
## Min. : 1.621 Min. : 12.45
## 1st Qu.: 12.177 1st Qu.: 214.89
## Median : 19.327 Median : 349.15
## Mean : 47.000 Mean : 1022.52
## 3rd Qu.: 32.023 3rd Qu.: 656.92
## Max. :3794.400 Max. :251032.25
##
Note: There is 1 NA’s row. We need to remove it to make the row.names. If not remove, it says: “Error in rowNamesDF<-(x, value = value) : missing values in ‘row.names’ are not allowed”
active_2010 <- active_2010[complete.cases(active_2010), ]row.names(active_2010) <- active_2010$customer_id## Warning: Setting row names on a tibble is deprecated.
active_2010 <- select(active_2010, -customer_id)active_2011 <- semi_join(customers_2011, customers_2010, by = "customer_id")
summary(active_2011)## customer_id recency frequency first_purchase
## Min. :12347 Min. : 22.71 Min. : 1.0 Min. : 25.71
## 1st Qu.:14039 1st Qu.: 29.71 1st Qu.: 35.0 1st Qu.:257.21
## Median :15528 Median : 43.71 Median : 95.0 Median :321.71
## Mean :15446 Mean : 78.36 Mean : 339.1 Mean :283.46
## 3rd Qu.:17000 3rd Qu.: 86.21 3rd Qu.: 203.0 3rd Qu.:349.21
## Max. :18260 Max. :359.71 Max. :116897.0 Max. :361.71
## NA's :1
## avg_amount revenue
## Min. : 2.833 Min. : 10.0
## 1st Qu.: 10.725 1st Qu.: 587.7
## Median : 17.933 Median : 1670.4
## Mean : 35.613 Mean : 7097.4
## 3rd Qu.: 27.302 3rd Qu.: 3808.8
## Max. :828.934 Max. :1504244.4
##
options(digits = 2, scipen = 99)
stat.desc(active_2010)## recency frequency first_purchase avg_amount revenue
## nbr.val 766.00 766.0 766.00 766.0 766.0
## nbr.null 0.00 0.0 0.00 0.0 0.0
## nbr.na 0.00 0.0 0.00 0.0 0.0
## min 8.71 1.0 8.71 1.6 12.4
## max 30.71 668.0 30.71 3794.4 27834.6
## range 22.00 667.0 22.00 3792.8 27822.2
## sum 15732.58 23476.0 17799.58 36032.5 533237.4
## median 19.71 19.0 23.71 19.4 348.1
## mean 20.54 30.6 23.24 47.0 696.1
## SE.mean 0.21 1.4 0.20 6.0 56.5
## CI.mean.0.95 0.42 2.8 0.40 11.9 110.9
## var 34.71 1574.0 31.09 28032.6 2443375.8
## std.dev 5.89 39.7 5.58 167.4 1563.1
## coef.var 0.29 1.3 0.24 3.6 2.2
ggplot(active_2010, aes(x = revenue)) +
geom_histogram(bins = 40, fill = "steelblue") +
theme_classic() +
labs(x = "revenue", title = "Annual revenue p/customer") Note: The distribution is severely right skewed.
summary(active_2010$revenue)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 215 348 696 656 27835
ggplot(active_2010, aes(x = frequency)) +
geom_histogram(bins = 20, fill = "steelblue") +
theme_classic() +
labs(x = "frequency", title = "Purchase frequency")ggplot(active_2010, aes(x = recency)) +
geom_histogram(bins = 20, fill = "steelblue") +
theme_classic() +
labs(x = "recency", title = "Recency in Days")summary(active_2010$frequency)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 10 19 31 38 668
ggplot(active_2010, aes(x = first_purchase)) +
geom_histogram(fill = "steelblue") +
theme_classic() +
labs(x = "first purchase (no.of days since)", title = "Days since first purchase") ggplot(active_2010, aes(x = avg_amount)) +
geom_histogram(bins = 40, fill = "steelblue") +
theme_classic() +
labs(x = "average amount", title = "Average amount spent per transaction")cor1 <- cor(active_2010[,-1])
cor1## frequency first_purchase avg_amount revenue
## frequency 1.00 0.1577 -0.1124 0.28
## first_purchase 0.16 1.0000 0.0088 0.10
## avg_amount -0.11 0.0088 1.0000 0.35
## revenue 0.28 0.1033 0.3531 1.00
Univariate density plots are shown on the diagonal whilst pairwise plots are shown on either side. The green lines are non-parametric regression smoothers.
cor_plot <- scatterplotMatrix(active_2010[,-1], spread = FALSE, main = "Scatterplot Matrix")## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is not a
## graphical parameter
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5)Performance metrics; RMSE; R2
# Random Forests
set.seed(32)
rF_fit <- train(revenue ~.,
data = active_2010,
method = "rf",
metric = "RMSE",
trControl = ctrl,
importance = TRUE)
rF_fit ## Random Forest
##
## 766 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 690, 689, 689, 690, 689, 689, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 956 0.69 223
## 3 943 0.73 183
## 4 944 0.74 170
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
# Multivariate adaptive regression splines (MARS)
set.seed(32)
mars_fit <- train(revenue ~.,
data = active_2010,
method = "earth",
metric = "RMSE",
tuneLength = 10,
trControl = ctrl)
mars_fit## Multivariate Adaptive Regression Spline
##
## 766 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 690, 689, 689, 690, 689, 689, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 1702 0.14 616
## 3 1585 0.35 627
## 4 1461 0.41 578
## 5 1473 0.45 571
## 6 1448 0.50 529
## 8 1413 0.53 524
## 9 1412 0.53 525
## 10 1429 0.52 529
## 11 1437 0.52 532
## 13 1438 0.52 531
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 9 and degree = 1.
# Support Vector Machines
set.seed(32)
svm_fit <- train(revenue~.,
data = active_2010,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
svm_fit## Support Vector Machines with Radial Basis Function Kernel
##
## 766 samples
## 4 predictor
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 690, 689, 689, 690, 689, 689, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1130 0.46 360
## 0.50 1074 0.51 326
## 1.00 1012 0.56 290
## 2.00 968 0.60 271
## 4.00 925 0.63 260
## 8.00 903 0.65 253
## 16.00 893 0.65 247
## 32.00 888 0.65 246
## 64.00 886 0.65 245
## 128.00 886 0.65 245
##
## Tuning parameter 'sigma' was held constant at a value of 0.94
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.94 and C = 128.
# Ordinary least squares
lm_fit <- train(revenue ~.,
data = active_2010,
method = "lm",
trControl = ctrl)
lm_fit ## Linear Regression
##
## 766 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 690, 688, 689, 690, 690, 688, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1418 0.37 490
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
results <- resamples(list(RandomForest = rF_fit,
MARS = mars_fit,
SVM = svm_fit,
OLS = lm_fit))
summary(results)##
## Call:
## summary.resamples(object = results)
##
## Models: RandomForest, MARS, SVM, OLS
## Number of resamples: 50
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RandomForest 34 75 157 183 238 685 0
## MARS 253 416 463 525 592 1075 0
## SVM 82 154 214 245 295 766 0
## OLS 244 366 430 490 577 1044 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RandomForest 57 225 590 943 1472 3361 0
## MARS 340 647 761 1412 1806 4621 0
## SVM 105 391 584 886 1080 3472 0
## OLS 422 634 857 1418 1967 4698 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RandomForest 0.22 0.51 0.85 0.73 0.91 0.99 0
## MARS 0.21 0.45 0.54 0.53 0.65 0.71 0
## SVM 0.18 0.55 0.70 0.65 0.79 0.95 0
## OLS 0.11 0.30 0.37 0.37 0.45 0.64 0
OLS performs much worse than all the other models on both RMSE and R2.
The random forest model performs best on both evaluation metrics: low RMSE of 943 and highest R2 of 0.73. This model seems to fit the data well. According to R2, 73% of the variation in the response is explained by the model.
The random forest model is selected for prediction model.
SVM is close to that of the MARS.
library(gridExtra)
b1 <- bwplot(results, metric = "RMSE", main = "Comparative Performance\nRoot Mean Squared Error")
b2 <- bwplot(results, metric = "Rsquared", main = "Comparative Performance\nR Squared")
grid.arrange(b1, b2, ncol=2)Hypothesis tests to confirm that the performance differences observed above are statistically significant.
modelDiff <- diff(results)
summary(modelDiff)##
## Call:
## summary.diff.resamples(object = modelDiff)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## MAE
## RandomForest MARS SVM
## RandomForest -341.3 -61.4
## MARS < 0.0000000000000002 279.8
## SVM 0.00000014022319065 0.00000000000000115
## OLS 0.00000000000011268 1 0.00000000077357960
## OLS
## RandomForest -306.5
## MARS 34.8
## SVM -245.1
## OLS
##
## RMSE
## RandomForest MARS SVM OLS
## RandomForest -468.4 57.6 -475.1
## MARS 0.00276 526.0 -6.7
## SVM 1.00000 0.01452 -532.7
## OLS 0.07056 1.00000 0.04342
##
## Rsquared
## RandomForest MARS SVM OLS
## RandomForest 0.1941 0.0741 0.3576
## MARS 0.00000066415524 -0.1199 0.1635
## SVM 0.19879 0.00088 0.2834
## OLS 0.00000000000495 0.00000000045682 0.00000000011771
P-values are close to 0 for almost all pairwise comparisons confirming statistical differences between the models.
Random forest model is used to predict 2015 spending.
rF_pred <- predict(rF_fit, newdata = active_2011)
head(rF_pred)## 1 2 3 4 5 6
## 3243 1006 1865 642 1214 188
MSE - mean squared error
pred_MSE <- mean((rF_pred - active_2011$revenue)^2)
pred_MSE## [1] 3216964654
RMSE
sqrt(pred_MSE)## [1] 56718
useless <- nullModel(x = active_2010[, -5], y = active_2010$revenue)
useless## Null Classification Model
##
## Call:
## nullModel.default(x = active_2010[, -5], y = active_2010$revenue)
##
## Predicted Value: 696
uselessPred <- predict(useless, newdata = active_2011, type = "raw")MSE
uselessMSE <- mean((uselessPred - active_2011$revenue)^2)
uselessMSE## [1] 3277424133
RMSE
sqrt(uselessMSE)## [1] 57249
The random forest model prediction error is smaller than the null model prediction error.