# --- COMPUTING RECENCY, FREQUENCY, MONETARY VALUE ---------
# Load text file into local variable called 'data'
data = read.delim(file = "C:\\Users\\Srirama Bonam\\Documents\\Mktg Data Analytics.txt", header = FALSE, sep = '\t', dec = '.')
# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since = as.numeric(difftime(time1 = "2016-01-01",
time2 = data$date_of_purchase,
units = "days"))
# Display the data after transformation
head(data)
## customer_id purchase_amount date_of_purchase year_of_purchase days_since
## 1 760 25 2009-11-06 2009 2246.7708
## 2 860 50 2012-09-28 2012 1189.7708
## 3 1200 100 2005-10-25 2005 3719.7708
## 4 1420 50 2009-07-09 2009 2366.7708
## 5 1940 70 2013-01-25 2013 1070.7708
## 6 1960 40 2013-10-29 2013 793.7708
summary(data)
## customer_id purchase_amount date_of_purchase year_of_purchase
## Min. : 10 Min. : 5.00 Min. :2005-01-02 Min. :2005
## 1st Qu.: 57720 1st Qu.: 25.00 1st Qu.:2009-01-17 1st Qu.:2009
## Median :102440 Median : 30.00 Median :2011-11-23 Median :2011
## Mean :108935 Mean : 62.34 Mean :2011-07-14 Mean :2011
## 3rd Qu.:160525 3rd Qu.: 60.00 3rd Qu.:2013-12-29 3rd Qu.:2013
## Max. :264200 Max. :4500.00 Max. :2015-12-31 Max. :2015
## days_since
## Min. : 0.771
## 1st Qu.: 732.771
## Median :1499.771
## Mean :1631.710
## 3rd Qu.:2539.771
## Max. :4015.771
# Compute key marketing indicators using SQL language
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Compute recency, frequency, and average purchase amount
customers_2015 = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
MAX(days_since) AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data GROUP BY 1")
# Explore the data
head(customers_2015)
## customer_id recency first_purchase frequency amount
## 1 10 3828.7708 3828.771 1 30.00000
## 2 80 342.7708 3750.771 7 71.42857
## 3 90 757.7708 3782.771 10 115.80000
## 4 120 1400.7708 1400.771 1 20.00000
## 5 130 2969.7708 3709.771 2 50.00000
## 6 160 2962.7708 3576.771 2 30.00000
summary(customers_2015)
## customer_id recency first_purchase frequency
## Min. : 10 Min. : 0.771 Min. : 0.771 Min. : 1.000
## 1st Qu.: 81990 1st Qu.: 243.771 1st Qu.: 987.771 1st Qu.: 1.000
## Median :136430 Median :1069.771 Median :2086.771 Median : 2.000
## Mean :137574 Mean :1252.809 Mean :1983.781 Mean : 2.782
## 3rd Qu.:195100 3rd Qu.:2129.771 3rd Qu.:2991.771 3rd Qu.: 3.000
## Max. :264200 Max. :4013.771 Max. :4015.771 Max. :45.000
## amount
## Min. : 5.00
## 1st Qu.: 21.67
## Median : 30.00
## Mean : 57.79
## 3rd Qu.: 50.00
## Max. :4500.00
hist(customers_2015$recency)

hist(customers_2015$frequency)

hist(customers_2015$amount)

hist(customers_2015$amount, breaks = 100)

# --- CODING A MANAGERIAL SEGMENTATION ---------------------
# Simple 2-segment solution based on recency alone
customers_2015$segment = ifelse(test = customers_2015$recency > 365*3, yes = "inactive", no = "NA")
table(customers_2015$segment)
##
## inactive NA
## 9158 9259
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 inactive 2177.8817 2545.939 1.814479 48.11277
## 2 NA 337.8268 1427.754 3.739713 67.36760
# A more complex 3-segment solution based on recency alone
customers_2015$segment = ifelse(test = customers_2015$recency > 365*3,
yes = "inactive",
no = ifelse(test = customers_2015$recency > 365*2,
yes = "cold",
no = "NA"))
table(customers_2015$segment)
##
## cold inactive NA
## 1903 9158 7356
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 cold 857.5522 1431.888 2.303205 51.73989
## 2 inactive 2177.8817 2545.939 1.814479 48.11277
## 3 NA 203.3736 1426.685 4.111338 71.41050
# Simple 2-segment solution using the which statement
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
table(customers_2015$segment)
##
## inactive NA
## 9158 9259
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 inactive 2177.8817 2545.939 1.814479 48.11277
## 2 NA 337.8268 1427.754 3.739713 67.36760
# More complex 4-segment solution using which
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
table(customers_2015$segment)
##
## active cold inactive warm
## 5398 1903 9158 1958
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 active 99.51148 1465.614 4.560763 72.08094
## 2 cold 857.55223 1431.888 2.303205 51.73989
## 3 inactive 2177.88167 2545.939 1.814479 48.11277
## 4 warm 489.71057 1319.361 2.872319 69.56215
# Complete segment solution using which, and exploiting previous test as input
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
table(customers_2015$segment)
##
## active high value active low value cold inactive
## 573 3313 1903 9158
## new active new warm warm high value warm low value
## 1512 938 119 901
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 active high value 88.59108 1985.68008 5.888307 240.04574
## 2 active low value 108.13184 2003.57283 5.935406 40.72452
## 3 cold 857.55223 1431.88802 2.303205 51.73989
## 4 inactive 2177.88167 2545.93921 1.814479 48.11277
## 5 new active 84.76157 89.78472 1.045635 77.13385
## 6 new warm 509.07574 516.39343 1.044776 66.59903
## 7 warm high value 454.89688 2015.12377 4.714286 327.40746
## 8 warm low value 474.14819 2063.41012 4.531632 38.59193
# Re-order factor in a way that makes sense
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold",
"warm high value", "warm low value", "new warm",
"active high value", "active low value", "new active"))
table(customers_2015$segment)
##
## inactive cold warm high value warm low value
## 9158 1903 119 901
## new warm active high value active low value new active
## 938 573 3313 1512
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 inactive 2177.88167 2545.93921 1.814479 48.11277
## 2 cold 857.55223 1431.88802 2.303205 51.73989
## 3 warm high value 454.89688 2015.12377 4.714286 327.40746
## 4 warm low value 474.14819 2063.41012 4.531632 38.59193
## 5 new warm 509.07574 516.39343 1.044776 66.59903
## 6 active high value 88.59108 1985.68008 5.888307 240.04574
## 7 active low value 108.13184 2003.57283 5.935406 40.72452
## 8 new active 84.76157 89.78472 1.045635 77.13385
# --- SEGMENTING A DATABASE RETROSPECTIVELY ----------------
# Compute key marketing indicators using SQL language
library(sqldf)
# Compute recency, frequency, and average purchase amount
customers_2014 = sqldf("SELECT customer_id,
MIN(days_since) - 365 AS 'recency',
MAX(days_since) - 365 AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data
WHERE days_since > 365
GROUP BY 1")
# Complete segment solution using which, and exploiting previous test as input
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"
# Re-order factor in a way that makes sense
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold",
"warm high value", "warm low value", "new warm",
"active high value", "active low value", "new active"))
# Show segmentation results
table(customers_2014$segment)
##
## inactive cold warm high value warm low value
## 7512 2153 111 956
## new warm active high value active low value new active
## 1250 475 3011 1437
pie(table(customers_2014$segment), col = rainbow(24))

aggregate(x = customers_2014[, 2:5], by = list(customers_2014$segment), mean)
## Group.1 recency first_purchase frequency amount
## 1 inactive 2058.21213 2352.8077 1.730964 48.11120
## 2 cold 866.38765 1565.2149 2.254064 51.11460
## 3 warm high value 460.96903 1878.5816 4.414414 187.84911
## 4 warm low value 470.43192 1945.1976 4.361925 37.38206
## 5 new warm 497.08843 504.6508 1.057600 51.36765
## 6 active high value 85.10978 1837.8214 5.696842 261.90216
## 7 active low value 97.86183 1796.1222 5.633677 40.45917
## 8 new active 131.85782 142.6421 1.070981 69.72516
# --- COMPUTING REVENUE GENERATION PER SEGMENT -------------
# Compute how much revenue is generated by segments
# Notice that people with no revenue in 2015 do NOT appear
revenue_2015 = sqldf("SELECT customer_id, SUM(purchase_amount) AS 'revenue_2015'
FROM data
WHERE year_of_purchase = 2015
GROUP BY 1")
summary(revenue_2015)
## customer_id revenue_2015
## Min. : 80 Min. : 5.00
## 1st Qu.:104105 1st Qu.: 30.00
## Median :185495 Median : 50.00
## Mean :167782 Mean : 88.62
## 3rd Qu.:246058 3rd Qu.: 85.00
## Max. :264200 Max. :4500.00
# Merge 2015 customers and 2015 revenue (the wrong way)
actual = merge(customers_2015, revenue_2015)
# Merge 2015 customers and 2015 revenue (correct)
actual = merge(customers_2015, revenue_2015, all.x = TRUE)
actual$revenue_2015[is.na(actual$revenue_2015)] = 0
# Show average revenue per customer and per segment
aggregate(x = actual$revenue_2015, by = list(customers_2015$segment), mean)
## Group.1 x
## 1 inactive 0.00000
## 2 cold 0.00000
## 3 warm high value 0.00000
## 4 warm low value 0.00000
## 5 new warm 0.00000
## 6 active high value 323.56894
## 7 active low value 52.30604
## 8 new active 79.16614
# Merge 2014 customers and 2015 revenue (correct)
forward = merge(customers_2014, revenue_2015, all.x = TRUE)
forward$revenue_2015[is.na(forward$revenue_2015)] = 0
# Show average revenue per customer and per segment
r = aggregate(x = forward$revenue_2015, by = list(customers_2014$segment), mean)
print(r)
## Group.1 x
## 1 inactive 2.949466
## 2 cold 6.108221
## 3 warm high value 114.459459
## 4 warm low value 13.494770
## 5 new warm 5.064000
## 6 active high value 254.077895
## 7 active low value 41.896556
## 8 new active 31.046625
# Re-order and display results
r = r[order(r$x, decreasing = TRUE), ]
print(r)
## Group.1 x
## 6 active high value 254.077895
## 3 warm high value 114.459459
## 7 active low value 41.896556
## 8 new active 31.046625
## 4 warm low value 13.494770
## 2 cold 6.108221
## 5 new warm 5.064000
## 1 inactive 2.949466
barplot(r$x, names.arg = r$Group.1)
