Load open source dataset for customer purchase history for a retail outlet dataset into local variable called ‘data’

#setwd("~/coursera/Case studies_Accenture/Data Relevance project")

data = read.csv("OnlineRetail.csv")

Explore the dataset

str(data)
## 'data.frame':    541909 obs. of  8 variables:
##  $ InvoiceNo  : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
##  $ StockCode  : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ...
##  $ Description: Factor w/ 4224 levels ""," 4 PURPLE FLOCK DINNER CANDLES",..: 4027 4035 932 1959 2980 3235 1573 1698 1695 259 ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: Factor w/ 23260 levels "1/10/2011 10:04",..: 6839 6839 6839 6839 6839 6839 6839 6840 6840 6841 ...
##  $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
##  $ Country    : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...
summary(data)
##    InvoiceNo        StockCode     
##  573585 :  1114   85123A :  2313  
##  581219 :   749   22423  :  2203  
##  581492 :   731   85099B :  2159  
##  580729 :   721   47566  :  1727  
##  558475 :   705   20725  :  1639  
##  579777 :   687   84879  :  1502  
##  (Other):537202   (Other):530366  
##                              Description        Quantity        
##  WHITE HANGING HEART T-LIGHT HOLDER:  2369   Min.   :-80995.00  
##  REGENCY CAKESTAND 3 TIER          :  2200   1st Qu.:     1.00  
##  JUMBO BAG RED RETROSPOT           :  2159   Median :     3.00  
##  PARTY BUNTING                     :  1727   Mean   :     9.55  
##  LUNCH BAG RED RETROSPOT           :  1638   3rd Qu.:    10.00  
##  ASSORTED COLOUR BIRD ORNAMENT     :  1501   Max.   : 80995.00  
##  (Other)                           :530315                      
##            InvoiceDate       UnitPrice           CustomerID    
##  10/31/2011 14:41:  1114   Min.   :-11062.06   Min.   :12346   
##  12/8/2011 9:28  :   749   1st Qu.:     1.25   1st Qu.:13953   
##  12/9/2011 10:03 :   731   Median :     2.08   Median :15152   
##  12/5/2011 17:24 :   721   Mean   :     4.61   Mean   :15288   
##  6/29/2011 15:58 :   705   3rd Qu.:     4.13   3rd Qu.:16791   
##  11/30/2011 15:13:   687   Max.   : 38970.00   Max.   :18287   
##  (Other)         :537202                       NA's   :135080  
##            Country      
##  United Kingdom:495478  
##  Germany       :  9495  
##  France        :  8557  
##  EIRE          :  8196  
##  Spain         :  2533  
##  Netherlands   :  2371  
##  (Other)       : 15279

Create ‘testdata’ to keep original dataset intact

testdata = data

Compute days_since and convert date into datetime format

testdata$InvoiceDate = as.POSIXct(testdata$InvoiceDate, format = "%m/%d/%Y %H:%M")
testdata$days_since = as.numeric(difftime(time1 = "2011-07-19",
                                          time2 = testdata$InvoiceDate,
                                          units = "days"))

Create a new column Amount (QTY * Unit Price)

testdata$Amount = (testdata$Quantity * testdata$UnitPrice)

Create another dataset ‘compdata’ to keep testdata intact

compdata = testdata

Cleaning data by Removing “NA’s” from dataset

compdata = na.omit(compdata)
compdata = compdata[compdata$Amount >0,]
summary(compdata)
##    InvoiceNo        StockCode     
##  576339 :   542   85123A :  2035  
##  579196 :   533   22423  :  1723  
##  580727 :   529   85099B :  1618  
##  578270 :   442   84879  :  1408  
##  573576 :   435   47566  :  1396  
##  567656 :   421   20725  :  1317  
##  (Other):394982   (Other):388387  
##                              Description        Quantity       
##  WHITE HANGING HEART T-LIGHT HOLDER:  2028   Min.   :    1.00  
##  REGENCY CAKESTAND 3 TIER          :  1723   1st Qu.:    2.00  
##  JUMBO BAG RED RETROSPOT           :  1618   Median :    6.00  
##  ASSORTED COLOUR BIRD ORNAMENT     :  1408   Mean   :   12.99  
##  PARTY BUNTING                     :  1396   3rd Qu.:   12.00  
##  LUNCH BAG RED RETROSPOT           :  1316   Max.   :80995.00  
##  (Other)                           :388395                     
##   InvoiceDate                    UnitPrice          CustomerID   
##  Min.   :2010-12-01 08:26:00   Min.   :   0.001   Min.   :12346  
##  1st Qu.:2011-04-07 11:12:00   1st Qu.:   1.250   1st Qu.:13969  
##  Median :2011-07-31 14:39:00   Median :   1.950   Median :15159  
##  Mean   :2011-07-11 00:04:07   Mean   :   3.116   Mean   :15294  
##  3rd Qu.:2011-10-20 14:33:00   3rd Qu.:   3.750   3rd Qu.:16795  
##  Max.   :2011-12-09 12:50:00   Max.   :8142.750   Max.   :18287  
##                                                                  
##            Country         days_since           Amount         
##  United Kingdom:354321   Min.   :-143.576   Min.   :     0.00  
##  Germany       :  9040   1st Qu.: -93.606   1st Qu.:     4.68  
##  France        :  8341   Median : -12.610   Median :    11.80  
##  EIRE          :  7236   Mean   :   7.997   Mean   :    22.40  
##  Spain         :  2484   3rd Qu.: 102.533   3rd Qu.:    19.80  
##  Netherlands   :  2359   Max.   : 229.607   Max.   :168469.60  
##  (Other)       : 14103
str(compdata)
## 'data.frame':    397884 obs. of  10 variables:
##  $ InvoiceNo  : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
##  $ StockCode  : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ...
##  $ Description: Factor w/ 4224 levels ""," 4 PURPLE FLOCK DINNER CANDLES",..: 4027 4035 932 1959 2980 3235 1573 1698 1695 259 ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: POSIXct, format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
##  $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
##  $ Country    : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...
##  $ days_since : num  230 230 230 230 230 ...
##  $ Amount     : num  15.3 20.3 22 20.3 20.3 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:135080] 623 1444 1445 1446 1447 1448 1449 1450 1451 1452 ...
##   .. ..- attr(*, "names")= chr [1:135080] "623" "1444" "1445" "1446" ...

Computing Key Marketing Indicators

To analyze customer value through RFM method using SQL RFM stands for Recency - How recently did the customer purchase? Frequency - How often do they purchase? Monetary Value - How much do they spend?

#install.packages('sqldf')
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.3.2
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.3.2
## Loading required package: DBI
newcustomers = sqldf("SELECT CustomerID,
                  MIN(days_since) AS 'recency',
                  COUNT(*) AS 'frequency',
                  AVG(Amount) AS 'avg_amount'
                  FROM compdata GROUP BY 1")
## Loading required package: tcltk

Explore newcustomers dataset

head(newcustomers)
##   CustomerID    recency frequency  avg_amount
## 1      12346  181.54097         1 77183.60000
## 2      12347 -141.70278       182    23.68132
## 3      12348  -68.55069        31    57.97548
## 4      12349 -125.45208        73    24.07603
## 5      12350  166.29097        17    19.67059
## 6      12352 -107.60903        85    29.48282
str(newcustomers)
## 'data.frame':    4338 obs. of  4 variables:
##  $ CustomerID: int  12346 12347 12348 12349 12350 12352 12353 12354 12355 12356 ...
##  $ recency   : num  181.5 -141.7 -68.6 -125.5 166.3 ...
##  $ frequency : int  1 182 31 73 17 85 4 58 13 59 ...
##  $ avg_amount: num  77183.6 23.7 58 24.1 19.7 ...
summary(newcustomers)
##    CustomerID       recency           frequency         avg_amount      
##  Min.   :12346   Min.   :-143.576   Min.   :   1.00   Min.   :    2.10  
##  1st Qu.:13813   1st Qu.:-126.504   1st Qu.:  17.00   1st Qu.:   12.37  
##  Median :15300   Median : -93.445   Median :  41.00   Median :   17.72  
##  Mean   :15300   Mean   : -51.506   Mean   :  91.72   Mean   :   68.35  
##  3rd Qu.:16779   3rd Qu.:  -1.804   3rd Qu.: 100.00   3rd Qu.:   24.86  
##  Max.   :18287   Max.   : 229.547   Max.   :7847.00   Max.   :77183.60
summary(compdata$Amount)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      0.00      4.68     11.80     22.40     19.80 168500.00
summary(compdata)
##    InvoiceNo        StockCode     
##  576339 :   542   85123A :  2035  
##  579196 :   533   22423  :  1723  
##  580727 :   529   85099B :  1618  
##  578270 :   442   84879  :  1408  
##  573576 :   435   47566  :  1396  
##  567656 :   421   20725  :  1317  
##  (Other):394982   (Other):388387  
##                              Description        Quantity       
##  WHITE HANGING HEART T-LIGHT HOLDER:  2028   Min.   :    1.00  
##  REGENCY CAKESTAND 3 TIER          :  1723   1st Qu.:    2.00  
##  JUMBO BAG RED RETROSPOT           :  1618   Median :    6.00  
##  ASSORTED COLOUR BIRD ORNAMENT     :  1408   Mean   :   12.99  
##  PARTY BUNTING                     :  1396   3rd Qu.:   12.00  
##  LUNCH BAG RED RETROSPOT           :  1316   Max.   :80995.00  
##  (Other)                           :388395                     
##   InvoiceDate                    UnitPrice          CustomerID   
##  Min.   :2010-12-01 08:26:00   Min.   :   0.001   Min.   :12346  
##  1st Qu.:2011-04-07 11:12:00   1st Qu.:   1.250   1st Qu.:13969  
##  Median :2011-07-31 14:39:00   Median :   1.950   Median :15159  
##  Mean   :2011-07-11 00:04:07   Mean   :   3.116   Mean   :15294  
##  3rd Qu.:2011-10-20 14:33:00   3rd Qu.:   3.750   3rd Qu.:16795  
##  Max.   :2011-12-09 12:50:00   Max.   :8142.750   Max.   :18287  
##                                                                  
##            Country         days_since           Amount         
##  United Kingdom:354321   Min.   :-143.576   Min.   :     0.00  
##  Germany       :  9040   1st Qu.: -93.606   1st Qu.:     4.68  
##  France        :  8341   Median : -12.610   Median :    11.80  
##  EIRE          :  7236   Mean   :   7.997   Mean   :    22.40  
##  Spain         :  2484   3rd Qu.: 102.533   3rd Qu.:    19.80  
##  Netherlands   :  2359   Max.   : 229.607   Max.   :168469.60  
##  (Other)       : 14103
hist(newcustomers$recency)

hist(newcustomers$frequency)

hist(newcustomers$avg_amount)

hist(newcustomers$avg_amount, breaks = 10)

Preparing and Transforming Data

Copy customer data into new data frame ‘customers’

customers = newcustomers

Remove customer id as a variable, store it as row names

head(customers)
##   CustomerID    recency frequency  avg_amount
## 1      12346  181.54097         1 77183.60000
## 2      12347 -141.70278       182    23.68132
## 3      12348  -68.55069        31    57.97548
## 4      12349 -125.45208        73    24.07603
## 5      12350  166.29097        17    19.67059
## 6      12352 -107.60903        85    29.48282
row.names(customers) = customers$CustomerID
customers$CustomerID = 0
head(customers)
##       CustomerID    recency frequency  avg_amount
## 12346          0  181.54097         1 77183.60000
## 12347          0 -141.70278       182    23.68132
## 12348          0  -68.55069        31    57.97548
## 12349          0 -125.45208        73    24.07603
## 12350          0  166.29097        17    19.67059
## 12352          0 -107.60903        85    29.48282

Take the log-transform of the amount, and plot

customers$avg_amount = log(customers$avg_amount)
hist(customers$avg_amount)

Standardize variables through Scaling

customers = scale(customers)
head(customers)
##       CustomerID    recency   frequency avg_amount
## 12346        NaN  2.3300500 -0.39653199  9.2259949
## 12347        NaN -0.9018018  0.39460347  0.2805031
## 12348        NaN -0.1704135 -0.26540457  1.2706066
## 12349        NaN -0.7393243 -0.08182617  0.2987830
## 12350        NaN  2.1775777 -0.32659736  0.0752997
## 12352        NaN -0.5609260 -0.02937520  0.5228181

RUNNING A HIERARCHICAL SEGMENTATION

Compute distance metrics on standardized data

d = dist(customers)

Take a 10% sample

sample = seq(1, 4338, by = 10)
head(sample)
## [1]  1 11 21 31 41 51
customers_sample = customers[sample, ]
new_data_sample  = customers[sample, ]

Compute distance metrics on standardized data

d = dist(new_data_sample)

Perform hierarchical clustering on distance metrics

c = hclust(d, method="ward.D2")

Plot dendogram

plot(c)

Cut at 5 segments

members = cutree(c, k = 5)

Show 30 first customers, frequency table

members[1:30]
## 12346 12357 12370 12381 12397 12409 12422 12432 12445 12455 12471 12481 
##     1     2     2     2     2     2     3     2     2     2     2     2 
## 12497 12510 12521 12534 12547 12560 12573 12584 12594 12607 12618 12628 
##     2     3     3     3     3     2     3     2     2     2     2     2 
## 12642 12652 12665 12679 12690 12704 
##     2     3     3     2     3     2
table(members)
## members
##   1   2   3   4   5 
##   7 166 162  98   1

Show profile of each segment

aggregate(customers_sample[, 2:4], by = list(members), mean)
##   Group.1    recency   frequency avg_amount
## 1       1  1.4119515 -0.38903899  4.2445490
## 2       2 -0.6049432 -0.06823948  0.4543241
## 3       3  1.0214686 -0.24252472 -0.1120100
## 4       4 -0.6765733  0.48585745 -0.9734574
## 5       5 -0.9102377 33.89766029 -1.3909332

COMPUTING PREDICTORS AND TARGET VARIABLES

Compute RFM variables as of 6months ago -

Extract all the predictors: then these will be used what happend over next 6 months

testdata$InvoiceDate = as.POSIXct(testdata$InvoiceDate, format = "%m/%d/%Y %H:%M")
testdata$days_since = as.numeric(difftime(time1 = "2011-07-19",
                                          time2 = testdata$InvoiceDate,
                                          units = "days"))
library(sqldf)
customers_6monthsago = sqldf("SELECT customerID,InvoiceNo,
                             MIN(days_since) - 180 AS 'recency',
                             MAX(days_since) - 180 AS 'first_purchase',
                             COUNT(*) AS 'frequency',
                             AVG(Amount) AS 'avg_amount',
                             MAX(Amount) AS 'max_amount'
                             FROM compdata
                             WHERE days_since < -12   
                             GROUP BY 1,2")

summary(customers_6monthsago)
##    CustomerID      InvoiceNo       recency       first_purchase  
##  Min.   :12347   561861 :   1   Min.   :-323.6   Min.   :-323.6  
##  1st Qu.:13791   561862 :   1   1st Qu.:-300.6   1st Qu.:-300.6  
##  Median :15152   561863 :   1   Median :-272.5   Median :-272.5  
##  Mean   :15251   561864 :   1   Mean   :-268.1   Mean   :-268.1  
##  3rd Qu.:16749   561865 :   1   3rd Qu.:-238.5   3rd Qu.:-238.5  
##  Max.   :18287   561866 :   1   Max.   :-192.4   Max.   :-192.4  
##                  (Other):8434                                    
##    frequency        avg_amount          max_amount       
##  Min.   :  1.00   Min.   :     0.38   Min.   :     0.38  
##  1st Qu.:  7.00   1st Qu.:    12.66   1st Qu.:    25.50  
##  Median : 16.00   Median :    18.79   Median :    41.60  
##  Mean   : 23.66   Mean   :    70.46   Mean   :   119.35  
##  3rd Qu.: 29.00   3rd Qu.:    31.57   3rd Qu.:    85.92  
##  Max.   :542.00   Max.   :168469.60   Max.   :168469.60  
## 
summary(compdata)
##    InvoiceNo        StockCode     
##  576339 :   542   85123A :  2035  
##  579196 :   533   22423  :  1723  
##  580727 :   529   85099B :  1618  
##  578270 :   442   84879  :  1408  
##  573576 :   435   47566  :  1396  
##  567656 :   421   20725  :  1317  
##  (Other):394982   (Other):388387  
##                              Description        Quantity       
##  WHITE HANGING HEART T-LIGHT HOLDER:  2028   Min.   :    1.00  
##  REGENCY CAKESTAND 3 TIER          :  1723   1st Qu.:    2.00  
##  JUMBO BAG RED RETROSPOT           :  1618   Median :    6.00  
##  ASSORTED COLOUR BIRD ORNAMENT     :  1408   Mean   :   12.99  
##  PARTY BUNTING                     :  1396   3rd Qu.:   12.00  
##  LUNCH BAG RED RETROSPOT           :  1316   Max.   :80995.00  
##  (Other)                           :388395                     
##   InvoiceDate                    UnitPrice          CustomerID   
##  Min.   :2010-12-01 08:26:00   Min.   :   0.001   Min.   :12346  
##  1st Qu.:2011-04-07 11:12:00   1st Qu.:   1.250   1st Qu.:13969  
##  Median :2011-07-31 14:39:00   Median :   1.950   Median :15159  
##  Mean   :2011-07-11 00:04:07   Mean   :   3.116   Mean   :15294  
##  3rd Qu.:2011-10-20 14:33:00   3rd Qu.:   3.750   3rd Qu.:16795  
##  Max.   :2011-12-09 12:50:00   Max.   :8142.750   Max.   :18287  
##                                                                  
##            Country         days_since           Amount         
##  United Kingdom:354321   Min.   :-143.576   Min.   :     0.00  
##  Germany       :  9040   1st Qu.: -93.606   1st Qu.:     4.68  
##  France        :  8341   Median : -12.610   Median :    11.80  
##  EIRE          :  7236   Mean   :   7.997   Mean   :    22.40  
##  Spain         :  2484   3rd Qu.: 102.533   3rd Qu.:    19.80  
##  Netherlands   :  2359   Max.   : 229.607   Max.   :168469.60  
##  (Other)       : 14103
hist(compdata$days_since)

Compute revenues generated by customers 6 months later

revenues_6months = sqldf("SELECT customerID, SUM(Amount) AS 'revenue_6months'
                              FROM compdata
                              WHERE days_since >  -12
                         GROUP BY 1")

head(revenues_6months)
##   CustomerID revenue_6months
## 1      12346        77183.60
## 2      12347         2205.95
## 3      12348         1487.24
## 4      12350          334.40
## 5      12352         1561.81
## 6      12353           89.00
head(customers_6monthsago)
##   CustomerID InvoiceNo   recency first_purchase frequency avg_amount
## 1      12347    562032 -194.3667      -194.3667        22   26.58682
## 2      12347    573511 -284.5174      -284.5174        47   27.53872
## 3      12347    581180 -321.7028      -321.7028        11   20.43818
## 4      12348    568172 -248.5507      -248.5507         3  103.33333
## 5      12349    577609 -305.4521      -305.4521        73   24.07603
## 6      12352    567505 -243.6069      -243.6069        18   20.34722
##   max_amount
## 1     106.20
## 2     106.20
## 3      42.96
## 4     150.00
## 5     300.00
## 6      80.00
revenues_6months[revenues_6months$InvoiceNo == 541431,]
## [1] CustomerID      revenue_6months
## <0 rows> (or 0-length row.names)
customers_6monthsago[customers_6monthsago$InvoiceNo == 541431,]
## [1] CustomerID     InvoiceNo      recency        first_purchase
## [5] frequency      avg_amount     max_amount    
## <0 rows> (or 0-length row.names)

Merge 6months ago customers and 6months revenue later

we will run in sample predictions then out of sample predictions

in.sample = merge(customers_6monthsago, revenues_6months,all.x = TRUE)
head(in.sample)
##   CustomerID InvoiceNo   recency first_purchase frequency avg_amount
## 1      12347    581180 -321.7028      -321.7028        11   20.43818
## 2      12347    562032 -194.3667      -194.3667        22   26.58682
## 3      12347    573511 -284.5174      -284.5174        47   27.53872
## 4      12348    568172 -248.5507      -248.5507         3  103.33333
## 5      12349    577609 -305.4521      -305.4521        73   24.07603
## 6      12352    574275 -287.6090      -287.6090        15   20.78200
##   max_amount revenue_6months
## 1      42.96         2205.95
## 2     106.20         2205.95
## 3     106.20         2205.95
## 4     150.00         1487.24
## 5     300.00              NA
## 6      80.00         1561.81
in.sample$revenue_6months[is.na(in.sample$revenue_6months)] = 0

New variable active_6months to see whether customers have spent anything in next 6 months 0 = no, 1 = yes

in.sample$active_6months = as.numeric(in.sample$revenue_6months > 0)
head(in.sample$revenue_6months)
## [1] 2205.95 2205.95 2205.95 1487.24    0.00 1561.81

Multinomial model:To predict outcomes that either be 0 or 1

Calibrate probability model to check whether customer will be active in 6 months or not

library(nnet)
prob.model = multinom(formula = active_6months ~ recency + first_purchase + frequency + avg_amount + max_amount,data = in.sample)
## # weights:  7 (6 variable)
## initial  value 5850.162204 
## iter  10 value 4594.716649
## final  value 4593.278815 
## converged
prob.model
## Call:
## multinom(formula = active_6months ~ recency + first_purchase + 
##     frequency + avg_amount + max_amount, data = in.sample)
## 
## Coefficients:
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##   2.8464826700   0.0004390637   0.0056723996  -0.0070378114  -0.0023956770 
##     max_amount 
##   0.0025267350 
## 
## Residual Deviance: 9186.558 
## AIC: 9196.558
# We see -ve recency means larger the recency lower the probability: more days have passed between the last purchase and today, so less likely u will make nxt purchase and vice versa frequency is +ve meaning more frequency more probablity of purchase
coef = summary(prob.model)$coefficients 

std = summary(prob.model)$standard.errors
print(coef)
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##   2.8464826700   0.0004390637   0.0056723996  -0.0070378114  -0.0023956770 
##     max_amount 
##   0.0025267350
print(std)
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##   0.1964625674   0.0003552645   0.0003554752   0.0008602611   0.0004591174 
##     max_amount 
##   0.0003968547
# To indicate which parameters are significant to what extent: good sign = above 2 and below -2
print(coef/std)
##    (Intercept)        recency first_purchase      frequency     avg_amount 
##      14.488677       1.235878      15.957229      -8.181018      -5.218006 
##     max_amount 
##       6.366903

For the monetary model, select only those who made a purchase = 1,they only fit the model

z = which(in.sample$active_6months ==1)
head(in.sample[z,])
##   CustomerID InvoiceNo   recency first_purchase frequency avg_amount
## 1      12347    581180 -321.7028      -321.7028        11   20.43818
## 2      12347    562032 -194.3667      -194.3667        22   26.58682
## 3      12347    573511 -284.5174      -284.5174        47   27.53872
## 4      12348    568172 -248.5507      -248.5507         3  103.33333
## 6      12352    574275 -287.6090      -287.6090        15   20.78200
## 7      12352    567505 -243.6069      -243.6069        18   20.34722
##   max_amount revenue_6months active_6months
## 1      42.96         2205.95              1
## 2     106.20         2205.95              1
## 3     106.20         2205.95              1
## 4     150.00         1487.24              1
## 6      80.00         1561.81              1
## 7      80.00         1561.81              1
summary(in.sample[z,])
##    CustomerID      InvoiceNo       recency       first_purchase  
##  Min.   :12347   561861 :   1   Min.   :-323.6   Min.   :-323.6  
##  1st Qu.:13784   561862 :   1   1st Qu.:-300.5   1st Qu.:-300.5  
##  Median :15114   561863 :   1   Median :-270.5   Median :-270.5  
##  Mean   :15244   561864 :   1   Mean   :-266.1   Mean   :-266.1  
##  3rd Qu.:16746   561865 :   1   3rd Qu.:-234.6   3rd Qu.:-234.6  
##  Max.   :18287   561866 :   1   Max.   :-192.4   Max.   :-192.4  
##                  (Other):6379                                    
##    frequency        avg_amount          max_amount       
##  Min.   :  1.00   Min.   :     0.38   Min.   :     0.38  
##  1st Qu.:  7.00   1st Qu.:    13.41   1st Qu.:    29.50  
##  Median : 15.00   Median :    19.73   Median :    45.00  
##  Mean   : 22.36   Mean   :    80.37   Mean   :   132.94  
##  3rd Qu.: 29.00   3rd Qu.:    34.78   3rd Qu.:    95.20  
##  Max.   :399.00   Max.   :168469.60   Max.   :168469.60  
##                                                          
##  revenue_6months    active_6months
##  Min.   :     2.9   Min.   :1     
##  1st Qu.:   597.1   1st Qu.:1     
##  Median :  1560.8   Median :1     
##  Mean   :  7107.1   Mean   :1     
##  3rd Qu.:  4265.8   3rd Qu.:1     
##  Max.   :136028.1   Max.   :1     
## 

Calibrate the monetary model (version 1)

amount.model = lm(formula = revenue_6months~ avg_amount + max_amount + recency + first_purchase + frequency, data = in.sample[z,])
amount.model$coefficients
##    (Intercept)     avg_amount     max_amount        recency first_purchase 
##    7435.801479     -38.046607      38.171295       7.785148             NA 
##      frequency 
##     -12.229979
summary(in.sample[z,])
##    CustomerID      InvoiceNo       recency       first_purchase  
##  Min.   :12347   561861 :   1   Min.   :-323.6   Min.   :-323.6  
##  1st Qu.:13784   561862 :   1   1st Qu.:-300.5   1st Qu.:-300.5  
##  Median :15114   561863 :   1   Median :-270.5   Median :-270.5  
##  Mean   :15244   561864 :   1   Mean   :-266.1   Mean   :-266.1  
##  3rd Qu.:16746   561865 :   1   3rd Qu.:-234.6   3rd Qu.:-234.6  
##  Max.   :18287   561866 :   1   Max.   :-192.4   Max.   :-192.4  
##                  (Other):6379                                    
##    frequency        avg_amount          max_amount       
##  Min.   :  1.00   Min.   :     0.38   Min.   :     0.38  
##  1st Qu.:  7.00   1st Qu.:    13.41   1st Qu.:    29.50  
##  Median : 15.00   Median :    19.73   Median :    45.00  
##  Mean   : 22.36   Mean   :    80.37   Mean   :   132.94  
##  3rd Qu.: 29.00   3rd Qu.:    34.78   3rd Qu.:    95.20  
##  Max.   :399.00   Max.   :168469.60   Max.   :168469.60  
##                                                          
##  revenue_6months    active_6months
##  Min.   :     2.9   Min.   :1     
##  1st Qu.:   597.1   1st Qu.:1     
##  Median :  1560.8   Median :1     
##  Mean   :  7107.1   Mean   :1     
##  3rd Qu.:  4265.8   3rd Qu.:1     
##  Max.   :136028.1   Max.   :1     
## 
a = table(in.sample[z, ]$revenue_6months, amount.model$fitted.values)

summary(amount.model$fitted.values)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2956    5510    6007    7107    7089  177600
summary(in.sample[z, ]$revenue_6months)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      2.9    597.1   1561.0   7107.0   4266.0 136000.0

Plot the results of the monetary model

plot(x = in.sample[z, ]$revenue_6months, y = amount.model$fitted.values)

qqplot(x = in.sample[z, ]$revenue_6months, y = amount.model$fitted.values)

plot(x = in.sample[z,]$CustomerID, y=in.sample[z, ]$revenue_6months )

plot(x = in.sample[z,]$CustomerID, y=amount.model$fitted.values )

Re-calibrate the monetary model, using a log-transform (version 2)

amount.model = lm(formula = log(revenue_6months) ~ log(avg_amount) + log(max_amount), data = in.sample[z, ])
summary(amount.model)
## 
## Call:
## lm(formula = log(revenue_6months) ~ log(avg_amount) + log(max_amount), 
##     data = in.sample[z, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.4912  -0.9647  -0.0475   0.8456   6.2929 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.07231    0.07555  67.137  < 2e-16 ***
## log(avg_amount)  0.10579    0.03253   3.252  0.00115 ** 
## log(max_amount)  0.51606    0.03513  14.688  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.439 on 6382 degrees of freedom
## Multiple R-squared:  0.1601, Adjusted R-squared:  0.1598 
## F-statistic: 608.1 on 2 and 6382 DF,  p-value: < 2.2e-16

Plot the results of this new monetary model

plot(x = log(in.sample[z, ]$revenue_6months), y = amount.model$fitted.values)

# APPLY THE MODELS TO TODAY’S DATA

Compute RFM variables as of today

library(sqldf)
customers_2012 = sqldf("SELECT customerID,
                       MIN(days_since) AS 'recency',
                       MAX(days_since) AS 'first_purchase',
                       COUNT(*) AS 'frequency',
                       AVG(Amount) AS 'avg_amount',
                       MAX(Amount) AS 'max_amount'
                       FROM compdata GROUP BY 1")

head(customers_2012)
##   CustomerID    recency first_purchase frequency  avg_amount max_amount
## 1      12346  181.54097       181.5410         1 77183.60000    77183.6
## 2      12347 -141.70278       223.3354       182    23.68132      249.6
## 3      12348  -68.55069       214.1604        31    57.97548      240.0
## 4      12349 -125.45208      -125.4521        73    24.07603      300.0
## 5      12350  166.29097       166.2910        17    19.67059       40.0
## 6      12352 -107.60903       152.4354        85    29.48282      376.5

Predict the target variables based on today’s data

customers_2012$prob_predicted    = predict(object = prob.model, newdata = customers_2012, type = "probs")
customers_2012$revenue_predicted = exp(predict(object = amount.model, newdata = customers_2012))
customers_2012$score_predicted   = customers_2012$prob_predicted * customers_2012$revenue_predicted
summary(customers_2012$prob_predicted)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.9074  0.9507  0.9231  0.9714  1.0000
summary(customers_2012$revenue_predicted)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    362.9   1263.0   1669.0   2374.0   2499.0 252600.0
summary(customers_2012$score_predicted)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1151    1543    2217    2317  252600
hist(customers_2012$score_predicted)

hist(log(customers_2012$score_predicted))

customers_2012_new = customers_2012[customers_2012$score_predicted < 10000,]
d = dist(customers_2012_new)

plot(customers_2012_new$score_predicted)

hist(customers_2012_new$score_predicted)

Perform hierarchical clustering on distance metrics

c = hclust(d, method="ward.D2")

Plot the dendogram

plot(c)

Cut at 5 segments

members = cutree(c, k = 5)
customers_2012_new$cust_segement = members

Show 30 first customers, frequency table

members[1:30]
##  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  1  1  1  2  3  2  2  1  3  1  1  1  1  2  2  2  2  1  2  1  3  2  2  1  2 
## 27 28 29 30 31 
##  2  2  2  1  1
table(members)
## members
##    1    2    3    4    5 
##  592 1103  166 1397 1029
write.csv(file = "customers_2012_new.csv", x = customers_2012_new)

How many customers have an expected revenue of more than $2200

so we can target and customize our marketing promotions and offers to specific customers based on their predicted score

z = which(customers_2012$score_predicted > 2200)
print(length(z))
## [1] 1198