# --- COMPUTING PREDICTORS AND TARGET VARIABLES ------------
# Load text file into local variable called 'data'
data = read.delim(file = "C:\\Users\\Srirama Bonam\\Documents\\Mktg Data Analytics.txt", header = FALSE, sep = '\t', dec = '.')
# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since = as.numeric(difftime(time1 = "2016-01-01",
time2 = data$date_of_purchase,
units = "days"))
# Compute key marketing indicators using SQL language
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Compute RFM variables as of a year ago
customers_2014 = sqldf("SELECT customer_id,
MIN(days_since) - 365 AS 'recency',
MAX(days_since) - 365 AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'avg_amount',
MAX(purchase_amount) AS 'max_amount'
FROM data
WHERE days_since > 365
GROUP BY 1")
# Compute revenues generated by customers in 2015
revenue_2015 = sqldf("SELECT customer_id, SUM(purchase_amount) AS 'revenue_2015'
FROM data
WHERE year_of_purchase = 2015
GROUP BY 1")
# Merge 2015 customers and 2015 revenue
in_sample = merge(customers_2014, revenue_2015, all.x = TRUE)
in_sample$revenue_2015[is.na(in_sample$revenue_2015)] = 0
in_sample$active_2015 = as.numeric(in_sample$revenue_2015 > 0)
# Display calibration (in-sample) data
head(in_sample)
## customer_id recency first_purchase frequency avg_amount max_amount
## 1 10 3463.7708 3463.771 1 30.0 30
## 2 80 301.7708 3385.771 6 70.0 80
## 3 90 392.7708 3417.771 10 115.8 153
## 4 120 1035.7708 1035.771 1 20.0 20
## 5 130 2604.7708 3344.771 2 50.0 60
## 6 160 2597.7708 3211.771 2 30.0 30
## revenue_2015 active_2015
## 1 0 0
## 2 80 1
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
summary(in_sample)
## customer_id recency first_purchase frequency
## Min. : 10 Min. : 0.771 Min. : 0.771 Min. : 1.000
## 1st Qu.: 77710 1st Qu.: 257.771 1st Qu.: 795.771 1st Qu.: 1.000
## Median :127140 Median : 889.771 Median :1890.771 Median : 2.000
## Mean :127315 Mean :1122.357 Mean :1788.182 Mean : 2.665
## 3rd Qu.:181800 3rd Qu.:1867.771 3rd Qu.:2694.771 3rd Qu.: 3.000
## Max. :245840 Max. :3648.771 Max. :3650.771 Max. :40.000
## avg_amount max_amount revenue_2015 active_2015
## Min. : 5.00 Min. : 5.00 Min. : 0.00 Min. :0.0000
## 1st Qu.: 21.25 1st Qu.: 25.00 1st Qu.: 0.00 1st Qu.:0.0000
## Median : 30.00 Median : 30.00 Median : 0.00 Median :0.0000
## Mean : 55.53 Mean : 65.72 Mean : 21.22 Mean :0.2299
## 3rd Qu.: 50.00 3rd Qu.: 60.00 3rd Qu.: 0.00 3rd Qu.:0.0000
## Max. :4500.00 Max. :4500.00 Max. :4500.00 Max. :1.0000
# --- CALIBRATE THE MODELS ---------------------------------
# Calibrate probability model
library(nnet)
prob.model = multinom(formula = active_2015 ~ recency + first_purchase + frequency + avg_amount + max_amount,
data = in_sample)
## # weights: 7 (6 variable)
## initial value 11717.653087
## iter 10 value 6233.713394
## final value 6184.465956
## converged
coef = summary(prob.model)$coefficients
std = summary(prob.model)$standard.errors
print(coef)
## (Intercept) recency first_purchase frequency avg_amount
## -5.335883e-01 -1.965114e-03 -1.163259e-05 2.195100e-01 4.159636e-04
## max_amount
## -1.558103e-04
print(std)
## (Intercept) recency first_purchase frequency avg_amount
## 4.410041e-02 6.000518e-05 3.925582e-05 1.479178e-02 3.634840e-04
## max_amount
## 2.713806e-04
print(coef / std)
## (Intercept) recency first_purchase frequency avg_amount
## -12.0993962 -32.7490778 -0.2963278 14.8399941 1.1443792
## max_amount
## -0.5741393
# For the monetary model, select only those who made a purchase
z = which(in_sample$active_2015 == 1)
head(in_sample[z, ])
## customer_id recency first_purchase frequency avg_amount max_amount
## 2 80 301.77083 3385.771 6 70.00000 80
## 18 480 15.77083 3312.771 11 62.27273 235
## 30 830 266.77083 3373.771 6 48.33333 60
## 31 850 61.77083 3050.771 8 28.12500 30
## 32 860 266.77083 3642.771 9 53.33333 60
## 39 1020 1461.77083 1826.771 3 73.33333 100
## revenue_2015 active_2015
## 2 80 1
## 18 45 1
## 30 50 1
## 31 60 1
## 32 60 1
## 39 120 1
summary(in_sample[z, ])
## customer_id recency first_purchase frequency
## Min. : 80 Min. : 0.771 Min. : 0.771 Min. : 1.000
## 1st Qu.: 78590 1st Qu.: 22.771 1st Qu.: 649.521 1st Qu.: 2.000
## Median :143550 Median : 96.771 Median :1603.771 Median : 4.000
## Mean :134907 Mean : 306.117 Mean :1635.934 Mean : 4.741
## 3rd Qu.:194363 3rd Qu.: 327.771 3rd Qu.:2665.771 3rd Qu.: 7.000
## Max. :236660 Max. :3543.771 Max. :3646.771 Max. :40.000
## avg_amount max_amount revenue_2015 active_2015
## Min. : 5.00 Min. : 5.00 Min. : 5.0 Min. :1
## 1st Qu.: 30.00 1st Qu.: 30.00 1st Qu.: 30.0 1st Qu.:1
## Median : 40.00 Median : 50.00 Median : 50.0 Median :1
## Mean : 67.78 Mean : 88.33 Mean : 92.3 Mean :1
## 3rd Qu.: 60.00 3rd Qu.: 80.00 3rd Qu.: 100.0 3rd Qu.:1
## Max. :4500.00 Max. :4500.00 Max. :4500.0 Max. :1
# Calibrate the monetary model (version 1)
amount.model = lm(formula = revenue_2015 ~ avg_amount + max_amount, data = in_sample[z, ])
summary(amount.model)
##
## Call:
## lm(formula = revenue_2015 ~ avg_amount + max_amount, data = in_sample[z,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2138.5 -20.1 -16.6 0.1 3361.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.74709 2.38112 8.713 <2e-16 ***
## avg_amount 0.67486 0.03280 20.575 <2e-16 ***
## max_amount 0.29225 0.02363 12.367 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 136.6 on 3883 degrees of freedom
## Multiple R-squared: 0.6054, Adjusted R-squared: 0.6052
## F-statistic: 2979 on 2 and 3883 DF, p-value: < 2.2e-16
# Plot the results of the monetary model
plot(x = in_sample[z, ]$revenue_2015, y = amount.model$fitted.values)

# Re-calibrate the monetary model, using a log-transform (version 2)
amount.model = lm(formula = log(revenue_2015) ~ log(avg_amount) + log(max_amount), data = in_sample[z, ])
summary(amount.model)
##
## Call:
## lm(formula = log(revenue_2015) ~ log(avg_amount) + log(max_amount),
## data = in_sample[z, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7866 -0.1811 -0.0770 0.1852 3.5656
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.37000 0.04003 9.242 <2e-16 ***
## log(avg_amount) 0.54881 0.04167 13.171 <2e-16 ***
## log(max_amount) 0.38813 0.03796 10.224 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4781 on 3883 degrees of freedom
## Multiple R-squared: 0.6927, Adjusted R-squared: 0.6926
## F-statistic: 4377 on 2 and 3883 DF, p-value: < 2.2e-16
# Plot the results of this new monetary model
plot(x = log(in_sample[z, ]$revenue_2015), y = amount.model$fitted.values)

# --- APPLY THE MODELS TO TODAY'S DATA ---------------------
# Compute RFM variables as of today
customers_2015 = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
MAX(days_since) AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'avg_amount',
MAX(purchase_amount) AS 'max_amount'
FROM data GROUP BY 1")
# Predict the target variables based on today's data
customers_2015$prob_predicted = predict(object = prob.model, newdata = customers_2015, type = "probs")
customers_2015$revenue_predicted = exp(predict(object = amount.model, newdata = customers_2015))
customers_2015$score_predicted = customers_2015$prob_predicted * customers_2015$revenue_predicted
summary(customers_2015$prob_predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000263 0.012634 0.106176 0.224984 0.397811 0.999904
summary(customers_2015$revenue_predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.54 29.00 35.05 65.63 57.30 3832.95
summary(customers_2015$score_predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0033 0.4557 4.5559 18.8336 17.9572 2854.2611
hist(customers_2015$score_predicted)

# How many customers have an expected revenue of more than $50
z = which(customers_2015$score_predicted > 50)
print(length(z))
## [1] 1323