# --- COMPUTING RECENCY, FREQUENCY, MONETARY VALUE ---------


# Load text file into local variable called 'data'
data = read.delim(file = "C:\\Users\\Srirama Bonam\\Documents\\Mktg Data Analytics.txt", header = FALSE, sep = '\t', dec = '.')

# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since       = as.numeric(difftime(time1 = "2016-01-01",
                                            time2 = data$date_of_purchase,
                                            units = "days"))

# Display the data after transformation
head(data)
##   customer_id purchase_amount date_of_purchase year_of_purchase days_since
## 1         760              25       2009-11-06             2009  2246.7708
## 2         860              50       2012-09-28             2012  1189.7708
## 3        1200             100       2005-10-25             2005  3719.7708
## 4        1420              50       2009-07-09             2009  2366.7708
## 5        1940              70       2013-01-25             2013  1070.7708
## 6        1960              40       2013-10-29             2013   793.7708
summary(data)
##   customer_id     purchase_amount   date_of_purchase     year_of_purchase
##  Min.   :    10   Min.   :   5.00   Min.   :2005-01-02   Min.   :2005    
##  1st Qu.: 57720   1st Qu.:  25.00   1st Qu.:2009-01-17   1st Qu.:2009    
##  Median :102440   Median :  30.00   Median :2011-11-23   Median :2011    
##  Mean   :108935   Mean   :  62.34   Mean   :2011-07-14   Mean   :2011    
##  3rd Qu.:160525   3rd Qu.:  60.00   3rd Qu.:2013-12-29   3rd Qu.:2013    
##  Max.   :264200   Max.   :4500.00   Max.   :2015-12-31   Max.   :2015    
##    days_since      
##  Min.   :   0.771  
##  1st Qu.: 732.771  
##  Median :1499.771  
##  Mean   :1631.710  
##  3rd Qu.:2539.771  
##  Max.   :4015.771
# Compute key marketing indicators using SQL language
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Compute recency, frequency, and average purchase amount
customers_2015 = sqldf("SELECT customer_id,
                               MIN(days_since) AS 'recency',
                               MAX(days_since) AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data GROUP BY 1")

# Explore the data
head(customers_2015)
##   customer_id   recency first_purchase frequency    amount
## 1          10 3828.7708       3828.771         1  30.00000
## 2          80  342.7708       3750.771         7  71.42857
## 3          90  757.7708       3782.771        10 115.80000
## 4         120 1400.7708       1400.771         1  20.00000
## 5         130 2969.7708       3709.771         2  50.00000
## 6         160 2962.7708       3576.771         2  30.00000
summary(customers_2015)
##   customer_id        recency         first_purchase       frequency     
##  Min.   :    10   Min.   :   0.771   Min.   :   0.771   Min.   : 1.000  
##  1st Qu.: 81990   1st Qu.: 243.771   1st Qu.: 987.771   1st Qu.: 1.000  
##  Median :136430   Median :1069.771   Median :2086.771   Median : 2.000  
##  Mean   :137574   Mean   :1252.809   Mean   :1983.781   Mean   : 2.782  
##  3rd Qu.:195100   3rd Qu.:2129.771   3rd Qu.:2991.771   3rd Qu.: 3.000  
##  Max.   :264200   Max.   :4013.771   Max.   :4015.771   Max.   :45.000  
##      amount       
##  Min.   :   5.00  
##  1st Qu.:  21.67  
##  Median :  30.00  
##  Mean   :  57.79  
##  3rd Qu.:  50.00  
##  Max.   :4500.00
hist(customers_2015$recency)

hist(customers_2015$frequency)

hist(customers_2015$amount)

hist(customers_2015$amount, breaks = 100)

# --- CODING A MANAGERIAL SEGMENTATION ---------------------


# Simple 2-segment solution based on recency alone
customers_2015$segment = ifelse(test = customers_2015$recency > 365*3, yes = "inactive", no = "NA")
table(customers_2015$segment)
## 
## inactive       NA 
##     9158     9259
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##    Group.1   recency first_purchase frequency   amount
## 1 inactive 2177.8817       2545.939  1.814479 48.11277
## 2       NA  337.8268       1427.754  3.739713 67.36760
# A more complex 3-segment solution based on recency alone
customers_2015$segment = ifelse(test = customers_2015$recency > 365*3,
                                yes = "inactive",
                                no = ifelse(test = customers_2015$recency > 365*2,
                                            yes = "cold",
                                            no = "NA"))
table(customers_2015$segment)
## 
##     cold inactive       NA 
##     1903     9158     7356
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##    Group.1   recency first_purchase frequency   amount
## 1     cold  857.5522       1431.888  2.303205 51.73989
## 2 inactive 2177.8817       2545.939  1.814479 48.11277
## 3       NA  203.3736       1426.685  4.111338 71.41050
# Simple 2-segment solution using the which statement
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
table(customers_2015$segment)
## 
## inactive       NA 
##     9158     9259
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##    Group.1   recency first_purchase frequency   amount
## 1 inactive 2177.8817       2545.939  1.814479 48.11277
## 2       NA  337.8268       1427.754  3.739713 67.36760
# More complex 4-segment solution using which
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
table(customers_2015$segment)
## 
##   active     cold inactive     warm 
##     5398     1903     9158     1958
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##    Group.1    recency first_purchase frequency   amount
## 1   active   99.51148       1465.614  4.560763 72.08094
## 2     cold  857.55223       1431.888  2.303205 51.73989
## 3 inactive 2177.88167       2545.939  1.814479 48.11277
## 4     warm  489.71057       1319.361  2.872319 69.56215
# Complete segment solution using which, and exploiting previous test as input
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
table(customers_2015$segment)
## 
## active high value  active low value              cold          inactive 
##               573              3313              1903              9158 
##        new active          new warm   warm high value    warm low value 
##              1512               938               119               901
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##             Group.1    recency first_purchase frequency    amount
## 1 active high value   88.59108     1985.68008  5.888307 240.04574
## 2  active low value  108.13184     2003.57283  5.935406  40.72452
## 3              cold  857.55223     1431.88802  2.303205  51.73989
## 4          inactive 2177.88167     2545.93921  1.814479  48.11277
## 5        new active   84.76157       89.78472  1.045635  77.13385
## 6          new warm  509.07574      516.39343  1.044776  66.59903
## 7   warm high value  454.89688     2015.12377  4.714286 327.40746
## 8    warm low value  474.14819     2063.41012  4.531632  38.59193
# Re-order factor in a way that makes sense
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold",
                                                                       "warm high value", "warm low value", "new warm",
                                                                       "active high value", "active low value", "new active"))
table(customers_2015$segment)
## 
##          inactive              cold   warm high value    warm low value 
##              9158              1903               119               901 
##          new warm active high value  active low value        new active 
##               938               573              3313              1512
aggregate(x = customers_2015[, 2:5], by = list(customers_2015$segment), mean)
##             Group.1    recency first_purchase frequency    amount
## 1          inactive 2177.88167     2545.93921  1.814479  48.11277
## 2              cold  857.55223     1431.88802  2.303205  51.73989
## 3   warm high value  454.89688     2015.12377  4.714286 327.40746
## 4    warm low value  474.14819     2063.41012  4.531632  38.59193
## 5          new warm  509.07574      516.39343  1.044776  66.59903
## 6 active high value   88.59108     1985.68008  5.888307 240.04574
## 7  active low value  108.13184     2003.57283  5.935406  40.72452
## 8        new active   84.76157       89.78472  1.045635  77.13385
# --- SEGMENTING A DATABASE RETROSPECTIVELY ----------------


# Compute key marketing indicators using SQL language
library(sqldf)

# Compute recency, frequency, and average purchase amount
customers_2014 = sqldf("SELECT customer_id,
                               MIN(days_since) - 365 AS 'recency',
                               MAX(days_since) - 365 AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data
                        WHERE days_since > 365
                        GROUP BY 1")

# Complete segment solution using which, and exploiting previous test as input
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"

# Re-order factor in a way that makes sense
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold",
                                                                       "warm high value", "warm low value", "new warm",
                                                                       "active high value", "active low value", "new active"))

# Show segmentation results
table(customers_2014$segment)
## 
##          inactive              cold   warm high value    warm low value 
##              7512              2153               111               956 
##          new warm active high value  active low value        new active 
##              1250               475              3011              1437
pie(table(customers_2014$segment), col = rainbow(24))

aggregate(x = customers_2014[, 2:5], by = list(customers_2014$segment), mean)
##             Group.1    recency first_purchase frequency    amount
## 1          inactive 2058.21213      2352.8077  1.730964  48.11120
## 2              cold  866.38765      1565.2149  2.254064  51.11460
## 3   warm high value  460.96903      1878.5816  4.414414 187.84911
## 4    warm low value  470.43192      1945.1976  4.361925  37.38206
## 5          new warm  497.08843       504.6508  1.057600  51.36765
## 6 active high value   85.10978      1837.8214  5.696842 261.90216
## 7  active low value   97.86183      1796.1222  5.633677  40.45917
## 8        new active  131.85782       142.6421  1.070981  69.72516
# --- COMPUTING REVENUE GENERATION PER SEGMENT -------------


# Compute how much revenue is generated by segments
# Notice that people with no revenue in 2015 do NOT appear
revenue_2015 = sqldf("SELECT customer_id, SUM(purchase_amount) AS 'revenue_2015'
                      FROM data
                      WHERE year_of_purchase = 2015
                      GROUP BY 1")
summary(revenue_2015)
##   customer_id      revenue_2015    
##  Min.   :    80   Min.   :   5.00  
##  1st Qu.:104105   1st Qu.:  30.00  
##  Median :185495   Median :  50.00  
##  Mean   :167782   Mean   :  88.62  
##  3rd Qu.:246058   3rd Qu.:  85.00  
##  Max.   :264200   Max.   :4500.00
# Merge 2015 customers and 2015 revenue (the wrong way)
actual = merge(customers_2015, revenue_2015)

# Merge 2015 customers and 2015 revenue (correct)
actual = merge(customers_2015, revenue_2015, all.x = TRUE)
actual$revenue_2015[is.na(actual$revenue_2015)] = 0

# Show average revenue per customer and per segment
aggregate(x = actual$revenue_2015, by = list(customers_2015$segment), mean)
##             Group.1         x
## 1          inactive   0.00000
## 2              cold   0.00000
## 3   warm high value   0.00000
## 4    warm low value   0.00000
## 5          new warm   0.00000
## 6 active high value 323.56894
## 7  active low value  52.30604
## 8        new active  79.16614
# Merge 2014 customers and 2015 revenue (correct)
forward = merge(customers_2014, revenue_2015, all.x = TRUE)
forward$revenue_2015[is.na(forward$revenue_2015)] = 0

# Show average revenue per customer and per segment
r = aggregate(x = forward$revenue_2015, by = list(customers_2014$segment), mean)
print(r)
##             Group.1          x
## 1          inactive   2.949466
## 2              cold   6.108221
## 3   warm high value 114.459459
## 4    warm low value  13.494770
## 5          new warm   5.064000
## 6 active high value 254.077895
## 7  active low value  41.896556
## 8        new active  31.046625
# Re-order and display results
r = r[order(r$x, decreasing = TRUE), ]
print(r)
##             Group.1          x
## 6 active high value 254.077895
## 3   warm high value 114.459459
## 7  active low value  41.896556
## 8        new active  31.046625
## 4    warm low value  13.494770
## 2              cold   6.108221
## 5          new warm   5.064000
## 1          inactive   2.949466
barplot(r$x, names.arg = r$Group.1)