# --- SEGMENT CUSTOMERS IN 2014 AND 2015 -------------------


# Load text file into local variable called 'data'
data = read.delim(file = "C:\\Users\\Srirama Bonam\\Documents\\Mktg Data Analytics.txt", header = FALSE, sep = '\t', dec = '.')

# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since       = as.numeric(difftime(time1 = "2016-01-01",
                                            time2 = data$date_of_purchase,
                                            units = "days"))

# Invoke library to compute key marketing indicators using SQL language
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Segment customers in 2015
customers_2015 = sqldf("SELECT customer_id,
                               MIN(days_since) AS 'recency',
                               MAX(days_since) AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data GROUP BY 1")
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 & customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 & customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount < 100)] = "warm low value"
customers_2015$segment[which(customers_2015$segment == "warm" & customers_2015$amount >= 100)] = "warm high value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount < 100)] = "active low value"
customers_2015$segment[which(customers_2015$segment == "active" & customers_2015$amount >= 100)] = "active high value"
customers_2015$segment = factor(x = customers_2015$segment, levels = c("inactive", "cold",
                                                                       "warm high value", "warm low value", "new warm",
                                                                       "active high value", "active low value", "new active"))

# Segment customers in 2014
customers_2014 = sqldf("SELECT customer_id,
                               MIN(days_since) - 365 AS 'recency',
                               MAX(days_since) - 365 AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'amount'
                        FROM data
                        WHERE days_since > 365
                        GROUP BY 1")
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 & customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 & customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount < 100)] = "warm low value"
customers_2014$segment[which(customers_2014$segment == "warm" & customers_2014$amount >= 100)] = "warm high value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount < 100)] = "active low value"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >= 100)] = "active high value"
customers_2014$segment = factor(x = customers_2014$segment, levels = c("inactive", "cold",
                                                                       "warm high value", "warm low value", "new warm",
                                                                       "active high value", "active low value", "new active"))


# --- COMPUTE TRANSITION MATRIX ----------------------------


# Compute transition matrix
new_data = merge(x = customers_2014, y = customers_2015, by = "customer_id", all.x = TRUE)
head(new_data)
##   customer_id recency.x first_purchase.x frequency.x amount.x
## 1          10 3463.7708         3463.771           1     30.0
## 2          80  301.7708         3385.771           6     70.0
## 3          90  392.7708         3417.771          10    115.8
## 4         120 1035.7708         1035.771           1     20.0
## 5         130 2604.7708         3344.771           2     50.0
## 6         160 2597.7708         3211.771           2     30.0
##          segment.x recency.y first_purchase.y frequency.y  amount.y
## 1         inactive 3828.7708         3828.771           1  30.00000
## 2 active low value  342.7708         3750.771           7  71.42857
## 3  warm high value  757.7708         3782.771          10 115.80000
## 4             cold 1400.7708         1400.771           1  20.00000
## 5         inactive 2969.7708         3709.771           2  50.00000
## 6         inactive 2962.7708         3576.771           2  30.00000
##          segment.y
## 1         inactive
## 2 active low value
## 3             cold
## 4         inactive
## 5         inactive
## 6         inactive
transition = table(new_data$segment.x, new_data$segment.y)
print(transition)
##                    
##                     inactive cold warm high value warm low value new warm
##   inactive              7227    0               0              0        0
##   cold                  1931    0               0              0        0
##   warm high value          0   75               0              0        0
##   warm low value           0  689               0              0        0
##   new warm                 0 1139               0              0        0
##   active high value        0    0             119              0        0
##   active low value         0    0               0            901        0
##   new active               0    0               0              0      938
##                    
##                     active high value active low value new active
##   inactive                         35              250          0
##   cold                             22              200          0
##   warm high value                  35                1          0
##   warm low value                    1              266          0
##   new warm                         15               96          0
##   active high value               354                2          0
##   active low value                 22             2088          0
##   new active                       89              410          0
# Divide each row by its sum
transition = transition / rowSums(transition)
print(transition)
##                    
##                        inactive        cold warm high value warm low value
##   inactive          0.962060703 0.000000000     0.000000000    0.000000000
##   cold              0.896888063 0.000000000     0.000000000    0.000000000
##   warm high value   0.000000000 0.675675676     0.000000000    0.000000000
##   warm low value    0.000000000 0.720711297     0.000000000    0.000000000
##   new warm          0.000000000 0.911200000     0.000000000    0.000000000
##   active high value 0.000000000 0.000000000     0.250526316    0.000000000
##   active low value  0.000000000 0.000000000     0.000000000    0.299236134
##   new active        0.000000000 0.000000000     0.000000000    0.000000000
##                    
##                        new warm active high value active low value
##   inactive          0.000000000       0.004659212      0.033280085
##   cold              0.000000000       0.010218300      0.092893637
##   warm high value   0.000000000       0.315315315      0.009009009
##   warm low value    0.000000000       0.001046025      0.278242678
##   new warm          0.000000000       0.012000000      0.076800000
##   active high value 0.000000000       0.745263158      0.004210526
##   active low value  0.000000000       0.007306543      0.693457323
##   new active        0.652748782       0.061934586      0.285316632
##                    
##                      new active
##   inactive          0.000000000
##   cold              0.000000000
##   warm high value   0.000000000
##   warm low value    0.000000000
##   new warm          0.000000000
##   active high value 0.000000000
##   active low value  0.000000000
##   new active        0.000000000
# --- USE TRANSITION MATRIX TO MAKE PREDICTIONS ------------


# Initialize a matrix with the number of customers in each segment today and after 10 periods
segments = matrix(nrow = 8, ncol = 11)
segments[, 1] = table(customers_2015$segment)
colnames(segments) = 2015:2025
row.names(segments) = levels(customers_2015$segment)
print(segments)
##                   2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
## inactive          9158   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## cold              1903   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## warm high value    119   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## warm low value     901   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## new warm           938   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## active high value  573   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## active low value  3313   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## new active        1512   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
# Compute for each an every period
for (i in 2:11) {
  segments[, i] = segments[, i-1] %*% transition
}

# Plot inactive, active high value customers over time
barplot(segments[1, ])

barplot(segments[2, ])

# Display how segments will evolve over time
print(round(segments))
##                   2015  2016  2017  2018  2019  2020  2021  2022  2023
## inactive          9158 10517 11539 12636 12940 13186 13386 13542 13664
## cold              1903  1584  1711   874   821   782   740   709   684
## warm high value    119   144   165   160   156   152   149   146   143
## warm low value     901   991  1058   989   938   884   844   813   789
## new warm           938   987     0     0     0     0     0     0     0
## active high value  573   657   639   624   607   593   581   571   562
## active low value  3313  3537  3305  3134  2954  2820  2717  2637  2575
## new active        1512     0     0     0     0     0     0     0     0
##                    2024  2025
## inactive          13759 13834
## cold                665   650
## warm high value     141   139
## warm low value      771   756
## new warm              0     0
## active high value   554   547
## active low value   2527  2490
## new active            0     0
# --- COMPUTE THE (DISCOUNTED) CLV OF A DATABASE -----------


# Yearly revenue per segment
# This comes directly from module 2, lines 160-161
yearly_revenue = c(0, 0, 0, 0, 0, 323.57, 52.31, 79.17)

# Compute revenue per segment
revenue_per_segment = yearly_revenue * segments
print(revenue_per_segment)
##                       2015     2016     2017     2018     2019     2020
## inactive               0.0      0.0      0.0      0.0      0.0      0.0
## cold                   0.0      0.0      0.0      0.0      0.0      0.0
## warm high value        0.0      0.0      0.0      0.0      0.0      0.0
## warm low value         0.0      0.0      0.0      0.0      0.0      0.0
## new warm               0.0      0.0      0.0      0.0      0.0      0.0
## active high value 185405.6 212495.8 206634.8 202009.1 196555.6 191967.7
## active low value  173303.0 184999.5 172904.6 163925.2 154528.3 147531.6
## new active        119705.0      0.0      0.0      0.0      0.0      0.0
##                       2021     2022     2023     2024     2025
## inactive               0.0      0.0      0.0      0.0      0.0
## cold                   0.0      0.0      0.0      0.0      0.0
## warm high value        0.0      0.0      0.0      0.0      0.0
## warm low value         0.0      0.0      0.0      0.0      0.0
## new warm               0.0      0.0      0.0      0.0      0.0
## active high value 188022.8 184627.5 181717.4 179229.1 177106.1
## active low value  142127.7 137940.6 134702.6 132199.2 130264.3
## new active             0.0      0.0      0.0      0.0      0.0
# Compute yearly revenue
yearly_revenue = colSums(revenue_per_segment)
print(round(yearly_revenue))
##   2015   2016   2017   2018   2019   2020   2021   2022   2023   2024 
## 478414 397495 379539 365934 351084 339499 330150 322568 316420 311428 
##   2025 
## 307370
barplot(yearly_revenue)

# Compute cumulated revenue
cumulated_revenue = cumsum(yearly_revenue)
print(round(cumulated_revenue))
##    2015    2016    2017    2018    2019    2020    2021    2022    2023 
##  478414  875909 1255448 1621383 1972467 2311966 2642116 2964685 3281105 
##    2024    2025 
## 3592533 3899903
barplot(cumulated_revenue)

# Create a discount factor
discount_rate = 0.10
discount = 1 / ((1 + discount_rate) ^ ((1:11) - 1))
print(discount)
##  [1] 1.0000000 0.9090909 0.8264463 0.7513148 0.6830135 0.6209213 0.5644739
##  [8] 0.5131581 0.4665074 0.4240976 0.3855433
# Compute discounted yearly revenue
disc_yearly_revenue = yearly_revenue * discount
print(round(disc_yearly_revenue))
##   2015   2016   2017   2018   2019   2020   2021   2022   2023   2024 
## 478414 361359 313669 274932 239795 210802 186361 165528 147612 132076 
##   2025 
## 118505
barplot(disc_yearly_revenue)
lines(yearly_revenue)

# Compute discounted cumulated revenue
disc_cumulated_revenue = cumsum(disc_yearly_revenue)
print(round(disc_cumulated_revenue))
##    2015    2016    2017    2018    2019    2020    2021    2022    2023 
##  478414  839773 1153442 1428374 1668169 1878971 2065333 2230861 2378473 
##    2024    2025 
## 2510549 2629054
barplot(disc_cumulated_revenue)

# What is the database worth?
print(disc_cumulated_revenue[11] - yearly_revenue[1])
##    2025 
## 2150640