# --- COMPUTING PREDICTORS AND TARGET VARIABLES ------------


# Load text file into local variable called 'data'
data = read.delim(file = "C:\\Users\\Srirama Bonam\\Documents\\Mktg Data Analytics.txt", header = FALSE, sep = '\t', dec = '.')

# Add headers and interpret the last column as a date, extract year of purchase
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since       = as.numeric(difftime(time1 = "2016-01-01",
                                            time2 = data$date_of_purchase,
                                            units = "days"))

# Compute key marketing indicators using SQL language
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Compute RFM variables as of a year ago
customers_2014 = sqldf("SELECT customer_id,
                               MIN(days_since) - 365 AS 'recency',
                               MAX(days_since) - 365 AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'avg_amount',
                               MAX(purchase_amount) AS 'max_amount'
                        FROM data
                        WHERE days_since > 365
                        GROUP BY 1")

# Compute revenues generated by customers in 2015
revenue_2015 = sqldf("SELECT customer_id, SUM(purchase_amount) AS 'revenue_2015'
                      FROM data
                      WHERE year_of_purchase = 2015
                      GROUP BY 1")

# Merge 2015 customers and 2015 revenue
in_sample = merge(customers_2014, revenue_2015, all.x = TRUE)
in_sample$revenue_2015[is.na(in_sample$revenue_2015)] = 0
in_sample$active_2015 = as.numeric(in_sample$revenue_2015 > 0)

# Display calibration (in-sample) data
head(in_sample)
##   customer_id   recency first_purchase frequency avg_amount max_amount
## 1          10 3463.7708       3463.771         1       30.0         30
## 2          80  301.7708       3385.771         6       70.0         80
## 3          90  392.7708       3417.771        10      115.8        153
## 4         120 1035.7708       1035.771         1       20.0         20
## 5         130 2604.7708       3344.771         2       50.0         60
## 6         160 2597.7708       3211.771         2       30.0         30
##   revenue_2015 active_2015
## 1            0           0
## 2           80           1
## 3            0           0
## 4            0           0
## 5            0           0
## 6            0           0
summary(in_sample)
##   customer_id        recency         first_purchase       frequency     
##  Min.   :    10   Min.   :   0.771   Min.   :   0.771   Min.   : 1.000  
##  1st Qu.: 77710   1st Qu.: 257.771   1st Qu.: 795.771   1st Qu.: 1.000  
##  Median :127140   Median : 889.771   Median :1890.771   Median : 2.000  
##  Mean   :127315   Mean   :1122.357   Mean   :1788.182   Mean   : 2.665  
##  3rd Qu.:181800   3rd Qu.:1867.771   3rd Qu.:2694.771   3rd Qu.: 3.000  
##  Max.   :245840   Max.   :3648.771   Max.   :3650.771   Max.   :40.000  
##    avg_amount        max_amount       revenue_2015      active_2015    
##  Min.   :   5.00   Min.   :   5.00   Min.   :   0.00   Min.   :0.0000  
##  1st Qu.:  21.25   1st Qu.:  25.00   1st Qu.:   0.00   1st Qu.:0.0000  
##  Median :  30.00   Median :  30.00   Median :   0.00   Median :0.0000  
##  Mean   :  55.53   Mean   :  65.72   Mean   :  21.22   Mean   :0.2299  
##  3rd Qu.:  50.00   3rd Qu.:  60.00   3rd Qu.:   0.00   3rd Qu.:0.0000  
##  Max.   :4500.00   Max.   :4500.00   Max.   :4500.00   Max.   :1.0000
# --- CALIBRATE THE MODELS ---------------------------------


# Calibrate probability model
library(nnet)
prob.model = multinom(formula = active_2015 ~ recency + first_purchase + frequency + avg_amount + max_amount,
                      data = in_sample)
## # weights:  7 (6 variable)
## initial  value 11717.653087 
## iter  10 value 6233.713394
## final  value 6184.465956 
## converged
coef = summary(prob.model)$coefficients
std  = summary(prob.model)$standard.errors
print(coef)
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##  -5.335883e-01  -1.965114e-03  -1.163259e-05   2.195100e-01   4.159636e-04 
##     max_amount 
##  -1.558103e-04
print(std)
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##   4.410041e-02   6.000518e-05   3.925582e-05   1.479178e-02   3.634840e-04 
##     max_amount 
##   2.713806e-04
print(coef / std)
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##    -12.0993962    -32.7490778     -0.2963278     14.8399941      1.1443792 
##     max_amount 
##     -0.5741393
# For the monetary model, select only those who made a purchase
z = which(in_sample$active_2015 == 1)
head(in_sample[z, ])
##    customer_id    recency first_purchase frequency avg_amount max_amount
## 2           80  301.77083       3385.771         6   70.00000         80
## 18         480   15.77083       3312.771        11   62.27273        235
## 30         830  266.77083       3373.771         6   48.33333         60
## 31         850   61.77083       3050.771         8   28.12500         30
## 32         860  266.77083       3642.771         9   53.33333         60
## 39        1020 1461.77083       1826.771         3   73.33333        100
##    revenue_2015 active_2015
## 2            80           1
## 18           45           1
## 30           50           1
## 31           60           1
## 32           60           1
## 39          120           1
summary(in_sample[z, ])
##   customer_id        recency         first_purchase       frequency     
##  Min.   :    80   Min.   :   0.771   Min.   :   0.771   Min.   : 1.000  
##  1st Qu.: 78590   1st Qu.:  22.771   1st Qu.: 649.521   1st Qu.: 2.000  
##  Median :143550   Median :  96.771   Median :1603.771   Median : 4.000  
##  Mean   :134907   Mean   : 306.117   Mean   :1635.934   Mean   : 4.741  
##  3rd Qu.:194363   3rd Qu.: 327.771   3rd Qu.:2665.771   3rd Qu.: 7.000  
##  Max.   :236660   Max.   :3543.771   Max.   :3646.771   Max.   :40.000  
##    avg_amount        max_amount       revenue_2015     active_2015
##  Min.   :   5.00   Min.   :   5.00   Min.   :   5.0   Min.   :1   
##  1st Qu.:  30.00   1st Qu.:  30.00   1st Qu.:  30.0   1st Qu.:1   
##  Median :  40.00   Median :  50.00   Median :  50.0   Median :1   
##  Mean   :  67.78   Mean   :  88.33   Mean   :  92.3   Mean   :1   
##  3rd Qu.:  60.00   3rd Qu.:  80.00   3rd Qu.: 100.0   3rd Qu.:1   
##  Max.   :4500.00   Max.   :4500.00   Max.   :4500.0   Max.   :1
# Calibrate the monetary model (version 1)
amount.model = lm(formula = revenue_2015 ~ avg_amount + max_amount, data = in_sample[z, ])
summary(amount.model)
## 
## Call:
## lm(formula = revenue_2015 ~ avg_amount + max_amount, data = in_sample[z, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2138.5   -20.1   -16.6     0.1  3361.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 20.74709    2.38112   8.713   <2e-16 ***
## avg_amount   0.67486    0.03280  20.575   <2e-16 ***
## max_amount   0.29225    0.02363  12.367   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 136.6 on 3883 degrees of freedom
## Multiple R-squared:  0.6054, Adjusted R-squared:  0.6052 
## F-statistic:  2979 on 2 and 3883 DF,  p-value: < 2.2e-16
# Plot the results of the monetary model
plot(x = in_sample[z, ]$revenue_2015, y = amount.model$fitted.values)

# Re-calibrate the monetary model, using a log-transform (version 2)
amount.model = lm(formula = log(revenue_2015) ~ log(avg_amount) + log(max_amount), data = in_sample[z, ])
summary(amount.model)
## 
## Call:
## lm(formula = log(revenue_2015) ~ log(avg_amount) + log(max_amount), 
##     data = in_sample[z, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7866 -0.1811 -0.0770  0.1852  3.5656 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.37000    0.04003   9.242   <2e-16 ***
## log(avg_amount)  0.54881    0.04167  13.171   <2e-16 ***
## log(max_amount)  0.38813    0.03796  10.224   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4781 on 3883 degrees of freedom
## Multiple R-squared:  0.6927, Adjusted R-squared:  0.6926 
## F-statistic:  4377 on 2 and 3883 DF,  p-value: < 2.2e-16
# Plot the results of this new monetary model
plot(x = log(in_sample[z, ]$revenue_2015), y = amount.model$fitted.values)

# --- APPLY THE MODELS TO TODAY'S DATA ---------------------


# Compute RFM variables as of today
customers_2015 = sqldf("SELECT customer_id,
                               MIN(days_since) AS 'recency',
                               MAX(days_since) AS 'first_purchase',
                               COUNT(*) AS 'frequency',
                               AVG(purchase_amount) AS 'avg_amount',
                               MAX(purchase_amount) AS 'max_amount'
                        FROM data GROUP BY 1")

# Predict the target variables based on today's data
customers_2015$prob_predicted    = predict(object = prob.model, newdata = customers_2015, type = "probs")
customers_2015$revenue_predicted = exp(predict(object = amount.model, newdata = customers_2015))
customers_2015$score_predicted   = customers_2015$prob_predicted * customers_2015$revenue_predicted
summary(customers_2015$prob_predicted)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000263 0.012634 0.106176 0.224984 0.397811 0.999904
summary(customers_2015$revenue_predicted)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.54   29.00   35.05   65.63   57.30 3832.95
summary(customers_2015$score_predicted)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.0033    0.4557    4.5559   18.8336   17.9572 2854.2611
hist(customers_2015$score_predicted)

# How many customers have an expected revenue of more than $50
z = which(customers_2015$score_predicted > 50)
print(length(z))
## [1] 1323