# --- COMPUTING RECENCY, FREQUENCY, MONETARY VALUE ---------


# Load the text file into a data frame called 'data'
data = read.delim(file = "C:\\Users\\Srirama Bonam\\Documents\\Mktg Data Analytics.txt", header = FALSE, sep = '\t', dec = '.')

# Add column names, parse the last column as a date, and compute the number of
# days since each purchase (relative to January 1, 2016)
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$days_since       = as.numeric(difftime(time1 = "2016-01-01",
                                            time2 = data$date_of_purchase,
                                            units = "days"))
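# Note: difftime() converts the character "2016-01-01" via as.POSIXct() in the
# local time zone while the Date column is taken at midnight UTC, which is why
# days_since carries the fractional .7708. An integer-valued alternative (left
# commented out because it would change the printed values below):
# data$days_since = as.numeric(as.Date("2016-01-01") - data$date_of_purchase)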

# Display the data after transformation
head(data)
##   customer_id purchase_amount date_of_purchase days_since
## 1         760              25       2009-11-06  2246.7708
## 2         860              50       2012-09-28  1189.7708
## 3        1200             100       2005-10-25  3719.7708
## 4        1420              50       2009-07-09  2366.7708
## 5        1940              70       2013-01-25  1070.7708
## 6        1960              40       2013-10-29   793.7708
summary(data)
##   customer_id     purchase_amount   date_of_purchase    
##  Min.   :    10   Min.   :   5.00   Min.   :2005-01-02  
##  1st Qu.: 57720   1st Qu.:  25.00   1st Qu.:2009-01-17  
##  Median :102440   Median :  30.00   Median :2011-11-23  
##  Mean   :108935   Mean   :  62.34   Mean   :2011-07-14  
##  3rd Qu.:160525   3rd Qu.:  60.00   3rd Qu.:2013-12-29  
##  Max.   :264200   Max.   :4500.00   Max.   :2015-12-31  
##    days_since      
##  Min.   :   0.771  
##  1st Qu.: 732.771  
##  Median :1499.771  
##  Mean   :1631.710  
##  3rd Qu.:2539.771  
##  Max.   :4015.771
# Compute key marketing indicators using SQL (via the sqldf package)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Compute recency, frequency, and average purchase amount
customers = sqldf("SELECT customer_id,
                          MIN(days_since)      AS recency,
                          COUNT(*)             AS frequency,
                          AVG(purchase_amount) AS amount
                   FROM data
                   GROUP BY customer_id")
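
# The same table can be built without SQL; a minimal dplyr sketch, assuming
# the dplyr package is installed (a cross-check, not part of the workflow)
library(dplyr)
customers_check = data %>%
  group_by(customer_id) %>%
  summarise(recency   = min(days_since),
            frequency = n(),
            amount    = mean(purchase_amount))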

# Explore the data
head(customers)
##   customer_id   recency frequency    amount
## 1          10 3828.7708         1  30.00000
## 2          80  342.7708         7  71.42857
## 3          90  757.7708        10 115.80000
## 4         120 1400.7708         1  20.00000
## 5         130 2969.7708         2  50.00000
## 6         160 2962.7708         2  30.00000
summary(customers)
##   customer_id        recency           frequency          amount       
##  Min.   :    10   Min.   :   0.771   Min.   : 1.000   Min.   :   5.00  
##  1st Qu.: 81990   1st Qu.: 243.771   1st Qu.: 1.000   1st Qu.:  21.67  
##  Median :136430   Median :1069.771   Median : 2.000   Median :  30.00  
##  Mean   :137574   Mean   :1252.809   Mean   : 2.782   Mean   :  57.79  
##  3rd Qu.:195100   3rd Qu.:2129.771   3rd Qu.: 3.000   3rd Qu.:  50.00  
##  Max.   :264200   Max.   :4013.771   Max.   :45.000   Max.   :4500.00
hist(customers$recency)

hist(customers$frequency)

hist(customers$amount)

hist(customers$amount, breaks = 100)
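
# All three distributions side by side; a sketch using base graphics only
# (saves and restores the plotting parameters)
op = par(mfrow = c(2, 2))
hist(customers$recency,   main = "Recency")
hist(customers$frequency, main = "Frequency")
hist(customers$amount,    main = "Amount")
hist(customers$amount, breaks = 100, main = "Amount (100 breaks)")
par(op)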

# --- PREPARING AND TRANSFORMING DATA ----------------------


# Copy the customer data into a new data frame
new_data = customers

# Remove customer_id as a column and store it in the row names
head(new_data)
##   customer_id   recency frequency    amount
## 1          10 3828.7708         1  30.00000
## 2          80  342.7708         7  71.42857
## 3          90  757.7708        10 115.80000
## 4         120 1400.7708         1  20.00000
## 5         130 2969.7708         2  50.00000
## 6         160 2962.7708         2  30.00000
row.names(new_data) = new_data$customer_id
new_data$customer_id = NULL
head(new_data)
##       recency frequency    amount
## 10  3828.7708         1  30.00000
## 80   342.7708         7  71.42857
## 90   757.7708        10 115.80000
## 120 1400.7708         1  20.00000
## 130 2969.7708         2  50.00000
## 160 2962.7708         2  30.00000
# Log-transform the amount (its distribution is heavily right-skewed), and plot
new_data$amount = log(new_data$amount)
hist(new_data$amount)

# Standardize variables
new_data = scale(new_data)
head(new_data)
##        recency  frequency     amount
## 10   2.3819788 -0.6068923 -0.2357955
## 80  -0.8415073  1.4360863  0.8943622
## 90  -0.4577590  2.4575756  1.5238194
## 120  0.1368198 -0.6068923 -0.7640251
## 130  1.5876660 -0.2663959  0.4296952
## 160  1.5811931 -0.2663959 -0.2357955
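# What scale() does, spelled out by hand for the amount column as a
# cross-check: center on the mean, then divide by the (n - 1) standard deviation
x = log(customers$amount)
head((x - mean(x)) / sd(x))   # matches new_data[1:6, "amount"] above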
# --- RUNNING A HIERARCHICAL SEGMENTATION ------------------


# Compute distance metrics on standardized data
# On the full data set the distance object alone needs about 1.3 GB
# (see the estimate below), which will likely exhaust memory on most machines
# d = dist(new_data)
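# Size of the full distance object: n * (n - 1) / 2 doubles at 8 bytes each
n = nrow(new_data)            # 18,417 customers
n * (n - 1) / 2 * 8 / 1024^3  # roughly 1.3 GB before any working copies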

# Take a 10% systematic sample: every 10th customer
# (named sample_idx so it does not mask base R's sample() function)
sample_idx = seq(1, nrow(new_data), by = 10)
head(sample_idx)
## [1]  1 11 21 31 41 51
customers_sample = customers[sample_idx, ]
new_data_sample  = new_data[sample_idx, ]
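
# An alternative sketch: a random 10% sample instead of every 10th row
# (the seed is arbitrary, set only so the draw is reproducible)
set.seed(1234)
random_idx = sample(nrow(new_data), size = nrow(new_data) %/% 10)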

# Compute the distance matrix on the standardized sample
d = dist(new_data_sample)

# Perform hierarchical clustering on the distance matrix
# (named hc so it does not mask base R's c() function)
hc = hclust(d, method = "ward.D2")

# Plot the dendrogram
plot(hc)

# Cut the tree into 9 segments
members = cutree(hc, k = 9)
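
# Optional sketch: outline the 9-segment cut on the dendrogram
plot(hc)
rect.hclust(hc, k = 9, border = "red")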

# Show the segment assignments of the first 30 customers, then a frequency table
members[1:30]
##   10  260  510  850 1040 1430 1860 2160 2380 2700 3000 3140 3650 3920 4240 
##    1    2    1    3    4    1    4    5    6    7    7    5    3    1    2 
## 4470 4710 4910 5230 5520 5710 5920 6080 6240 6410 6600 6750 6940 7100 7330 
##    3    6    7    4    5    1    5    8    5    6    1    3    3    7    2
table(members)
## members
##   1   2   3   4   5   6   7   8   9 
## 309 212 130 306  49  83 236 317 200
# Show the average profile of each segment
aggregate(customers_sample[, 2:4], by = list(members), mean)
##   Group.1   recency frequency    amount
## 1       1 2563.0071  1.284790  37.78474
## 2       2 2684.0633  1.311321  16.26336
## 3       3  193.4170 10.615385  42.01521
## 4       4  161.8754  2.398693  41.10641
## 5       5 2566.8116  1.857143 214.86197
## 6       6  448.3491  6.578313 261.21914
## 7       7 1143.4657  4.466102  29.85421
## 8       8  922.6005  1.280757  22.43357
## 9       9  799.1108  1.400000  87.42917
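# A sketch to read the profiles alongside segment sizes in one table
profile = aggregate(customers_sample[, 2:4], by = list(segment = members), mean)
profile$size = as.vector(table(members))
profile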