# --- COMPUTING RECENCY, FREQUENCY, MONETARY VALUE ---------
# Load text file into local variable called 'data'
data = read.delim(file = "C:\\Users\\Srirama Bonam\\Documents\\Mktg Data Analytics.txt", header = FALSE, sep = '\t', dec = '.')
# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$days_since = as.numeric(difftime(time1 = "2016-01-01",
time2 = data$date_of_purchase,
units = "days"))
# Display the data after transformation
head(data)
## customer_id purchase_amount date_of_purchase days_since
## 1 760 25 2009-11-06 2246.7708
## 2 860 50 2012-09-28 1189.7708
## 3 1200 100 2005-10-25 3719.7708
## 4 1420 50 2009-07-09 2366.7708
## 5 1940 70 2013-01-25 1070.7708
## 6 1960 40 2013-10-29 793.7708
summary(data)
## customer_id purchase_amount date_of_purchase
## Min. : 10 Min. : 5.00 Min. :2005-01-02
## 1st Qu.: 57720 1st Qu.: 25.00 1st Qu.:2009-01-17
## Median :102440 Median : 30.00 Median :2011-11-23
## Mean :108935 Mean : 62.34 Mean :2011-07-14
## 3rd Qu.:160525 3rd Qu.: 60.00 3rd Qu.:2013-12-29
## Max. :264200 Max. :4500.00 Max. :2015-12-31
## days_since
## Min. : 0.771
## 1st Qu.: 732.771
## Median :1499.771
## Mean :1631.710
## 3rd Qu.:2539.771
## Max. :4015.771
# Compute key marketing indicators using SQL language
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Compute recency, frequency, and average purchase amount
customers = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount'
FROM data GROUP BY 1")
# Explore the data
head(customers)
## customer_id recency frequency amount
## 1 10 3828.7708 1 30.00000
## 2 80 342.7708 7 71.42857
## 3 90 757.7708 10 115.80000
## 4 120 1400.7708 1 20.00000
## 5 130 2969.7708 2 50.00000
## 6 160 2962.7708 2 30.00000
summary(customers)
## customer_id recency frequency amount
## Min. : 10 Min. : 0.771 Min. : 1.000 Min. : 5.00
## 1st Qu.: 81990 1st Qu.: 243.771 1st Qu.: 1.000 1st Qu.: 21.67
## Median :136430 Median :1069.771 Median : 2.000 Median : 30.00
## Mean :137574 Mean :1252.809 Mean : 2.782 Mean : 57.79
## 3rd Qu.:195100 3rd Qu.:2129.771 3rd Qu.: 3.000 3rd Qu.: 50.00
## Max. :264200 Max. :4013.771 Max. :45.000 Max. :4500.00
hist(customers$recency)

hist(customers$frequency)

hist(customers$amount)

hist(customers$amount, breaks = 100)

# --- PREPARING AND TRANSFORMING DATA ----------------------
# Copy customer data into new data frame
new_data = customers
# Remove customer id as a variable, store it as row names
head(new_data)
## customer_id recency frequency amount
## 1 10 3828.7708 1 30.00000
## 2 80 342.7708 7 71.42857
## 3 90 757.7708 10 115.80000
## 4 120 1400.7708 1 20.00000
## 5 130 2969.7708 2 50.00000
## 6 160 2962.7708 2 30.00000
row.names(new_data) = new_data$customer_id
new_data$customer_id = NULL
head(new_data)
## recency frequency amount
## 10 3828.7708 1 30.00000
## 80 342.7708 7 71.42857
## 90 757.7708 10 115.80000
## 120 1400.7708 1 20.00000
## 130 2969.7708 2 50.00000
## 160 2962.7708 2 30.00000
# Take the log-transform of the amount, and plot
new_data$amount = log(new_data$amount)
hist(new_data$amount)

# Standardize variables
new_data = scale(new_data)
head(new_data)
## recency frequency amount
## 10 2.3819788 -0.6068923 -0.2357955
## 80 -0.8415073 1.4360863 0.8943622
## 90 -0.4577590 2.4575756 1.5238194
## 120 0.1368198 -0.6068923 -0.7640251
## 130 1.5876660 -0.2663959 0.4296952
## 160 1.5811931 -0.2663959 -0.2357955
# --- RUNNING A HIERARCHICAL SEGMENTATION ------------------
# Compute distance metrics on standardized data
# This will likely generate an error on most machines
# d = dist(new_data)
# Take a 10% sample
sample = seq(1, 18417, by = 10)
head(sample)
## [1] 1 11 21 31 41 51
customers_sample = customers[sample, ]
new_data_sample = new_data[sample, ]
# Compute distance metrics on standardized data
d = dist(new_data_sample)
# Perform hierarchical clustering on distance metrics
c = hclust(d, method="ward.D2")
# Plot de dendogram
plot(c)

# Cut at 9 segments
members = cutree(c, k = 9)
# Show 30 first customers, frequency table
members[1:30]
## 10 260 510 850 1040 1430 1860 2160 2380 2700 3000 3140 3650 3920 4240
## 1 2 1 3 4 1 4 5 6 7 7 5 3 1 2
## 4470 4710 4910 5230 5520 5710 5920 6080 6240 6410 6600 6750 6940 7100 7330
## 3 6 7 4 5 1 5 8 5 6 1 3 3 7 2
table(members)
## members
## 1 2 3 4 5 6 7 8 9
## 309 212 130 306 49 83 236 317 200
# Show profile of each segment
aggregate(customers_sample[, 2:4], by = list(members), mean)
## Group.1 recency frequency amount
## 1 1 2563.0071 1.284790 37.78474
## 2 2 2684.0633 1.311321 16.26336
## 3 3 193.4170 10.615385 42.01521
## 4 4 161.8754 2.398693 41.10641
## 5 5 2566.8116 1.857143 214.86197
## 6 6 448.3491 6.578313 261.21914
## 7 7 1143.4657 4.466102 29.85421
## 8 8 922.6005 1.280757 22.43357
## 9 9 799.1108 1.400000 87.42917