Load open source dataset for customer purchase history for a retail outlet dataset into local variable called ‘data’
#setwd("~/coursera/Case studies_Accenture/Data Relevance project")
data = read.csv("OnlineRetail.csv")
Explore the dataset
str(data)
## 'data.frame': 541909 obs. of 8 variables:
## $ InvoiceNo : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
## $ StockCode : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ...
## $ Description: Factor w/ 4224 levels ""," 4 PURPLE FLOCK DINNER CANDLES",..: 4027 4035 932 1959 2980 3235 1573 1698 1695 259 ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: Factor w/ 23260 levels "1/10/2011 10:04",..: 6839 6839 6839 6839 6839 6839 6839 6840 6840 6841 ...
## $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...
summary(data)
## InvoiceNo StockCode
## 573585 : 1114 85123A : 2313
## 581219 : 749 22423 : 2203
## 581492 : 731 85099B : 2159
## 580729 : 721 47566 : 1727
## 558475 : 705 20725 : 1639
## 579777 : 687 84879 : 1502
## (Other):537202 (Other):530366
## Description Quantity
## WHITE HANGING HEART T-LIGHT HOLDER: 2369 Min. :-80995.00
## REGENCY CAKESTAND 3 TIER : 2200 1st Qu.: 1.00
## JUMBO BAG RED RETROSPOT : 2159 Median : 3.00
## PARTY BUNTING : 1727 Mean : 9.55
## LUNCH BAG RED RETROSPOT : 1638 3rd Qu.: 10.00
## ASSORTED COLOUR BIRD ORNAMENT : 1501 Max. : 80995.00
## (Other) :530315
## InvoiceDate UnitPrice CustomerID
## 10/31/2011 14:41: 1114 Min. :-11062.06 Min. :12346
## 12/8/2011 9:28 : 749 1st Qu.: 1.25 1st Qu.:13953
## 12/9/2011 10:03 : 731 Median : 2.08 Median :15152
## 12/5/2011 17:24 : 721 Mean : 4.61 Mean :15288
## 6/29/2011 15:58 : 705 3rd Qu.: 4.13 3rd Qu.:16791
## 11/30/2011 15:13: 687 Max. : 38970.00 Max. :18287
## (Other) :537202 NA's :135080
## Country
## United Kingdom:495478
## Germany : 9495
## France : 8557
## EIRE : 8196
## Spain : 2533
## Netherlands : 2371
## (Other) : 15279
Create ‘testdata’ to keep original dataset intact
testdata = data
testdata$InvoiceDate = as.POSIXct(testdata$InvoiceDate, format = "%m/%d/%Y %H:%M")
testdata$days_since = as.numeric(difftime(time1 = "2011-07-19",
time2 = testdata$InvoiceDate,
units = "days"))
Create a new column Amount (QTY * Unit Price)
testdata$Amount = (testdata$Quantity * testdata$UnitPrice)
Create another dataset ‘compdata’ to keep testdata intact
compdata = testdata
Cleaning data by Removing “NA’s” from dataset
compdata = na.omit(compdata)
compdata = compdata[compdata$Amount >0,]
summary(compdata)
## InvoiceNo StockCode
## 576339 : 542 85123A : 2035
## 579196 : 533 22423 : 1723
## 580727 : 529 85099B : 1618
## 578270 : 442 84879 : 1408
## 573576 : 435 47566 : 1396
## 567656 : 421 20725 : 1317
## (Other):394982 (Other):388387
## Description Quantity
## WHITE HANGING HEART T-LIGHT HOLDER: 2028 Min. : 1.00
## REGENCY CAKESTAND 3 TIER : 1723 1st Qu.: 2.00
## JUMBO BAG RED RETROSPOT : 1618 Median : 6.00
## ASSORTED COLOUR BIRD ORNAMENT : 1408 Mean : 12.99
## PARTY BUNTING : 1396 3rd Qu.: 12.00
## LUNCH BAG RED RETROSPOT : 1316 Max. :80995.00
## (Other) :388395
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Min. : 0.001 Min. :12346
## 1st Qu.:2011-04-07 11:12:00 1st Qu.: 1.250 1st Qu.:13969
## Median :2011-07-31 14:39:00 Median : 1.950 Median :15159
## Mean :2011-07-11 00:04:07 Mean : 3.116 Mean :15294
## 3rd Qu.:2011-10-20 14:33:00 3rd Qu.: 3.750 3rd Qu.:16795
## Max. :2011-12-09 12:50:00 Max. :8142.750 Max. :18287
##
## Country days_since Amount
## United Kingdom:354321 Min. :-143.576 Min. : 0.00
## Germany : 9040 1st Qu.: -93.606 1st Qu.: 4.68
## France : 8341 Median : -12.610 Median : 11.80
## EIRE : 7236 Mean : 7.997 Mean : 22.40
## Spain : 2484 3rd Qu.: 102.533 3rd Qu.: 19.80
## Netherlands : 2359 Max. : 229.607 Max. :168469.60
## (Other) : 14103
str(compdata)
## 'data.frame': 397884 obs. of 10 variables:
## $ InvoiceNo : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
## $ StockCode : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ...
## $ Description: Factor w/ 4224 levels ""," 4 PURPLE FLOCK DINNER CANDLES",..: 4027 4035 932 1959 2980 3235 1573 1698 1695 259 ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: POSIXct, format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...
## $ days_since : num 230 230 230 230 230 ...
## $ Amount : num 15.3 20.3 22 20.3 20.3 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:135080] 623 1444 1445 1446 1447 1448 1449 1450 1451 1452 ...
## .. ..- attr(*, "names")= chr [1:135080] "623" "1444" "1445" "1446" ...
To analyze customer value through RFM method using SQL RFM stands for Recency - How recently did the customer purchase? Frequency - How often do they purchase? Monetary Value - How much do they spend?
#install.packages('sqldf')
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.3.2
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.3.2
## Loading required package: DBI
newcustomers = sqldf("SELECT CustomerID,
MIN(days_since) AS 'recency',
COUNT(*) AS 'frequency',
AVG(Amount) AS 'avg_amount'
FROM compdata GROUP BY 1")
## Loading required package: tcltk
head(newcustomers)
## CustomerID recency frequency avg_amount
## 1 12346 181.54097 1 77183.60000
## 2 12347 -141.70278 182 23.68132
## 3 12348 -68.55069 31 57.97548
## 4 12349 -125.45208 73 24.07603
## 5 12350 166.29097 17 19.67059
## 6 12352 -107.60903 85 29.48282
str(newcustomers)
## 'data.frame': 4338 obs. of 4 variables:
## $ CustomerID: int 12346 12347 12348 12349 12350 12352 12353 12354 12355 12356 ...
## $ recency : num 181.5 -141.7 -68.6 -125.5 166.3 ...
## $ frequency : int 1 182 31 73 17 85 4 58 13 59 ...
## $ avg_amount: num 77183.6 23.7 58 24.1 19.7 ...
summary(newcustomers)
## CustomerID recency frequency avg_amount
## Min. :12346 Min. :-143.576 Min. : 1.00 Min. : 2.10
## 1st Qu.:13813 1st Qu.:-126.504 1st Qu.: 17.00 1st Qu.: 12.37
## Median :15300 Median : -93.445 Median : 41.00 Median : 17.72
## Mean :15300 Mean : -51.506 Mean : 91.72 Mean : 68.35
## 3rd Qu.:16779 3rd Qu.: -1.804 3rd Qu.: 100.00 3rd Qu.: 24.86
## Max. :18287 Max. : 229.547 Max. :7847.00 Max. :77183.60
summary(compdata$Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 4.68 11.80 22.40 19.80 168500.00
summary(compdata)
## InvoiceNo StockCode
## 576339 : 542 85123A : 2035
## 579196 : 533 22423 : 1723
## 580727 : 529 85099B : 1618
## 578270 : 442 84879 : 1408
## 573576 : 435 47566 : 1396
## 567656 : 421 20725 : 1317
## (Other):394982 (Other):388387
## Description Quantity
## WHITE HANGING HEART T-LIGHT HOLDER: 2028 Min. : 1.00
## REGENCY CAKESTAND 3 TIER : 1723 1st Qu.: 2.00
## JUMBO BAG RED RETROSPOT : 1618 Median : 6.00
## ASSORTED COLOUR BIRD ORNAMENT : 1408 Mean : 12.99
## PARTY BUNTING : 1396 3rd Qu.: 12.00
## LUNCH BAG RED RETROSPOT : 1316 Max. :80995.00
## (Other) :388395
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Min. : 0.001 Min. :12346
## 1st Qu.:2011-04-07 11:12:00 1st Qu.: 1.250 1st Qu.:13969
## Median :2011-07-31 14:39:00 Median : 1.950 Median :15159
## Mean :2011-07-11 00:04:07 Mean : 3.116 Mean :15294
## 3rd Qu.:2011-10-20 14:33:00 3rd Qu.: 3.750 3rd Qu.:16795
## Max. :2011-12-09 12:50:00 Max. :8142.750 Max. :18287
##
## Country days_since Amount
## United Kingdom:354321 Min. :-143.576 Min. : 0.00
## Germany : 9040 1st Qu.: -93.606 1st Qu.: 4.68
## France : 8341 Median : -12.610 Median : 11.80
## EIRE : 7236 Mean : 7.997 Mean : 22.40
## Spain : 2484 3rd Qu.: 102.533 3rd Qu.: 19.80
## Netherlands : 2359 Max. : 229.607 Max. :168469.60
## (Other) : 14103
hist(newcustomers$recency)
hist(newcustomers$frequency)
hist(newcustomers$avg_amount)
hist(newcustomers$avg_amount, breaks = 10)
Copy customer data into new data frame ‘customers’
customers = newcustomers
Remove customer id as a variable, store it as row names
head(customers)
## CustomerID recency frequency avg_amount
## 1 12346 181.54097 1 77183.60000
## 2 12347 -141.70278 182 23.68132
## 3 12348 -68.55069 31 57.97548
## 4 12349 -125.45208 73 24.07603
## 5 12350 166.29097 17 19.67059
## 6 12352 -107.60903 85 29.48282
row.names(customers) = customers$CustomerID
customers$CustomerID = 0
head(customers)
## CustomerID recency frequency avg_amount
## 12346 0 181.54097 1 77183.60000
## 12347 0 -141.70278 182 23.68132
## 12348 0 -68.55069 31 57.97548
## 12349 0 -125.45208 73 24.07603
## 12350 0 166.29097 17 19.67059
## 12352 0 -107.60903 85 29.48282
Take the log-transform of the amount, and plot
customers$avg_amount = log(customers$avg_amount)
hist(customers$avg_amount)
Standardize variables through Scaling
customers = scale(customers)
head(customers)
## CustomerID recency frequency avg_amount
## 12346 NaN 2.3300500 -0.39653199 9.2259949
## 12347 NaN -0.9018018 0.39460347 0.2805031
## 12348 NaN -0.1704135 -0.26540457 1.2706066
## 12349 NaN -0.7393243 -0.08182617 0.2987830
## 12350 NaN 2.1775777 -0.32659736 0.0752997
## 12352 NaN -0.5609260 -0.02937520 0.5228181
Compute distance metrics on standardized data
d = dist(customers)
Take a 10% sample
sample = seq(1, 4338, by = 10)
head(sample)
## [1] 1 11 21 31 41 51
customers_sample = customers[sample, ]
new_data_sample = customers[sample, ]
Compute distance metrics on standardized data
d = dist(new_data_sample)
Perform hierarchical clustering on distance metrics
c = hclust(d, method="ward.D2")
plot(c)
Cut at 5 segments
members = cutree(c, k = 5)
Show 30 first customers, frequency table
members[1:30]
## 12346 12357 12370 12381 12397 12409 12422 12432 12445 12455 12471 12481
## 1 2 2 2 2 2 3 2 2 2 2 2
## 12497 12510 12521 12534 12547 12560 12573 12584 12594 12607 12618 12628
## 2 3 3 3 3 2 3 2 2 2 2 2
## 12642 12652 12665 12679 12690 12704
## 2 3 3 2 3 2
table(members)
## members
## 1 2 3 4 5
## 7 166 162 98 1
aggregate(customers_sample[, 2:4], by = list(members), mean)
## Group.1 recency frequency avg_amount
## 1 1 1.4119515 -0.38903899 4.2445490
## 2 2 -0.6049432 -0.06823948 0.4543241
## 3 3 1.0214686 -0.24252472 -0.1120100
## 4 4 -0.6765733 0.48585745 -0.9734574
## 5 5 -0.9102377 33.89766029 -1.3909332
Extract all the predictors: then these will be used what happend over next 6 months
testdata$InvoiceDate = as.POSIXct(testdata$InvoiceDate, format = "%m/%d/%Y %H:%M")
testdata$days_since = as.numeric(difftime(time1 = "2011-07-19",
time2 = testdata$InvoiceDate,
units = "days"))
library(sqldf)
customers_6monthsago = sqldf("SELECT customerID,InvoiceNo,
MIN(days_since) - 180 AS 'recency',
MAX(days_since) - 180 AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(Amount) AS 'avg_amount',
MAX(Amount) AS 'max_amount'
FROM compdata
WHERE days_since < -12
GROUP BY 1,2")
summary(customers_6monthsago)
## CustomerID InvoiceNo recency first_purchase
## Min. :12347 561861 : 1 Min. :-323.6 Min. :-323.6
## 1st Qu.:13791 561862 : 1 1st Qu.:-300.6 1st Qu.:-300.6
## Median :15152 561863 : 1 Median :-272.5 Median :-272.5
## Mean :15251 561864 : 1 Mean :-268.1 Mean :-268.1
## 3rd Qu.:16749 561865 : 1 3rd Qu.:-238.5 3rd Qu.:-238.5
## Max. :18287 561866 : 1 Max. :-192.4 Max. :-192.4
## (Other):8434
## frequency avg_amount max_amount
## Min. : 1.00 Min. : 0.38 Min. : 0.38
## 1st Qu.: 7.00 1st Qu.: 12.66 1st Qu.: 25.50
## Median : 16.00 Median : 18.79 Median : 41.60
## Mean : 23.66 Mean : 70.46 Mean : 119.35
## 3rd Qu.: 29.00 3rd Qu.: 31.57 3rd Qu.: 85.92
## Max. :542.00 Max. :168469.60 Max. :168469.60
##
summary(compdata)
## InvoiceNo StockCode
## 576339 : 542 85123A : 2035
## 579196 : 533 22423 : 1723
## 580727 : 529 85099B : 1618
## 578270 : 442 84879 : 1408
## 573576 : 435 47566 : 1396
## 567656 : 421 20725 : 1317
## (Other):394982 (Other):388387
## Description Quantity
## WHITE HANGING HEART T-LIGHT HOLDER: 2028 Min. : 1.00
## REGENCY CAKESTAND 3 TIER : 1723 1st Qu.: 2.00
## JUMBO BAG RED RETROSPOT : 1618 Median : 6.00
## ASSORTED COLOUR BIRD ORNAMENT : 1408 Mean : 12.99
## PARTY BUNTING : 1396 3rd Qu.: 12.00
## LUNCH BAG RED RETROSPOT : 1316 Max. :80995.00
## (Other) :388395
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Min. : 0.001 Min. :12346
## 1st Qu.:2011-04-07 11:12:00 1st Qu.: 1.250 1st Qu.:13969
## Median :2011-07-31 14:39:00 Median : 1.950 Median :15159
## Mean :2011-07-11 00:04:07 Mean : 3.116 Mean :15294
## 3rd Qu.:2011-10-20 14:33:00 3rd Qu.: 3.750 3rd Qu.:16795
## Max. :2011-12-09 12:50:00 Max. :8142.750 Max. :18287
##
## Country days_since Amount
## United Kingdom:354321 Min. :-143.576 Min. : 0.00
## Germany : 9040 1st Qu.: -93.606 1st Qu.: 4.68
## France : 8341 Median : -12.610 Median : 11.80
## EIRE : 7236 Mean : 7.997 Mean : 22.40
## Spain : 2484 3rd Qu.: 102.533 3rd Qu.: 19.80
## Netherlands : 2359 Max. : 229.607 Max. :168469.60
## (Other) : 14103
hist(compdata$days_since)
revenues_6months = sqldf("SELECT customerID, SUM(Amount) AS 'revenue_6months'
FROM compdata
WHERE days_since > -12
GROUP BY 1")
head(revenues_6months)
## CustomerID revenue_6months
## 1 12346 77183.60
## 2 12347 2205.95
## 3 12348 1487.24
## 4 12350 334.40
## 5 12352 1561.81
## 6 12353 89.00
head(customers_6monthsago)
## CustomerID InvoiceNo recency first_purchase frequency avg_amount
## 1 12347 562032 -194.3667 -194.3667 22 26.58682
## 2 12347 573511 -284.5174 -284.5174 47 27.53872
## 3 12347 581180 -321.7028 -321.7028 11 20.43818
## 4 12348 568172 -248.5507 -248.5507 3 103.33333
## 5 12349 577609 -305.4521 -305.4521 73 24.07603
## 6 12352 567505 -243.6069 -243.6069 18 20.34722
## max_amount
## 1 106.20
## 2 106.20
## 3 42.96
## 4 150.00
## 5 300.00
## 6 80.00
revenues_6months[revenues_6months$InvoiceNo == 541431,]
## [1] CustomerID revenue_6months
## <0 rows> (or 0-length row.names)
customers_6monthsago[customers_6monthsago$InvoiceNo == 541431,]
## [1] CustomerID InvoiceNo recency first_purchase
## [5] frequency avg_amount max_amount
## <0 rows> (or 0-length row.names)
we will run in sample predictions then out of sample predictions
in.sample = merge(customers_6monthsago, revenues_6months,all.x = TRUE)
head(in.sample)
## CustomerID InvoiceNo recency first_purchase frequency avg_amount
## 1 12347 581180 -321.7028 -321.7028 11 20.43818
## 2 12347 562032 -194.3667 -194.3667 22 26.58682
## 3 12347 573511 -284.5174 -284.5174 47 27.53872
## 4 12348 568172 -248.5507 -248.5507 3 103.33333
## 5 12349 577609 -305.4521 -305.4521 73 24.07603
## 6 12352 574275 -287.6090 -287.6090 15 20.78200
## max_amount revenue_6months
## 1 42.96 2205.95
## 2 106.20 2205.95
## 3 106.20 2205.95
## 4 150.00 1487.24
## 5 300.00 NA
## 6 80.00 1561.81
in.sample$revenue_6months[is.na(in.sample$revenue_6months)] = 0
New variable active_6months to see whether customers have spent anything in next 6 months 0 = no, 1 = yes
in.sample$active_6months = as.numeric(in.sample$revenue_6months > 0)
head(in.sample$revenue_6months)
## [1] 2205.95 2205.95 2205.95 1487.24 0.00 1561.81
Calibrate probability model to check whether customer will be active in 6 months or not
library(nnet)
prob.model = multinom(formula = active_6months ~ recency + first_purchase + frequency + avg_amount + max_amount,data = in.sample)
## # weights: 7 (6 variable)
## initial value 5850.162204
## iter 10 value 4594.716649
## final value 4593.278815
## converged
prob.model
## Call:
## multinom(formula = active_6months ~ recency + first_purchase +
## frequency + avg_amount + max_amount, data = in.sample)
##
## Coefficients:
## (Intercept) recency first_purchase frequency avg_amount
## 2.8464826700 0.0004390637 0.0056723996 -0.0070378114 -0.0023956770
## max_amount
## 0.0025267350
##
## Residual Deviance: 9186.558
## AIC: 9196.558
# We see -ve recency means larger the recency lower the probability: more days have passed between the last purchase and today, so less likely u will make nxt purchase and vice versa frequency is +ve meaning more frequency more probablity of purchase
coef = summary(prob.model)$coefficients
std = summary(prob.model)$standard.errors
print(coef)
## (Intercept) recency first_purchase frequency avg_amount
## 2.8464826700 0.0004390637 0.0056723996 -0.0070378114 -0.0023956770
## max_amount
## 0.0025267350
print(std)
## (Intercept) recency first_purchase frequency avg_amount
## 0.1964625674 0.0003552645 0.0003554752 0.0008602611 0.0004591174
## max_amount
## 0.0003968547
# To indicate which parameters are significant to what extent: good sign = above 2 and below -2
print(coef/std)
## (Intercept) recency first_purchase frequency avg_amount
## 14.488677 1.235878 15.957229 -8.181018 -5.218006
## max_amount
## 6.366903
For the monetary model, select only those who made a purchase = 1,they only fit the model
z = which(in.sample$active_6months ==1)
head(in.sample[z,])
## CustomerID InvoiceNo recency first_purchase frequency avg_amount
## 1 12347 581180 -321.7028 -321.7028 11 20.43818
## 2 12347 562032 -194.3667 -194.3667 22 26.58682
## 3 12347 573511 -284.5174 -284.5174 47 27.53872
## 4 12348 568172 -248.5507 -248.5507 3 103.33333
## 6 12352 574275 -287.6090 -287.6090 15 20.78200
## 7 12352 567505 -243.6069 -243.6069 18 20.34722
## max_amount revenue_6months active_6months
## 1 42.96 2205.95 1
## 2 106.20 2205.95 1
## 3 106.20 2205.95 1
## 4 150.00 1487.24 1
## 6 80.00 1561.81 1
## 7 80.00 1561.81 1
summary(in.sample[z,])
## CustomerID InvoiceNo recency first_purchase
## Min. :12347 561861 : 1 Min. :-323.6 Min. :-323.6
## 1st Qu.:13784 561862 : 1 1st Qu.:-300.5 1st Qu.:-300.5
## Median :15114 561863 : 1 Median :-270.5 Median :-270.5
## Mean :15244 561864 : 1 Mean :-266.1 Mean :-266.1
## 3rd Qu.:16746 561865 : 1 3rd Qu.:-234.6 3rd Qu.:-234.6
## Max. :18287 561866 : 1 Max. :-192.4 Max. :-192.4
## (Other):6379
## frequency avg_amount max_amount
## Min. : 1.00 Min. : 0.38 Min. : 0.38
## 1st Qu.: 7.00 1st Qu.: 13.41 1st Qu.: 29.50
## Median : 15.00 Median : 19.73 Median : 45.00
## Mean : 22.36 Mean : 80.37 Mean : 132.94
## 3rd Qu.: 29.00 3rd Qu.: 34.78 3rd Qu.: 95.20
## Max. :399.00 Max. :168469.60 Max. :168469.60
##
## revenue_6months active_6months
## Min. : 2.9 Min. :1
## 1st Qu.: 597.1 1st Qu.:1
## Median : 1560.8 Median :1
## Mean : 7107.1 Mean :1
## 3rd Qu.: 4265.8 3rd Qu.:1
## Max. :136028.1 Max. :1
##
amount.model = lm(formula = revenue_6months~ avg_amount + max_amount + recency + first_purchase + frequency, data = in.sample[z,])
amount.model$coefficients
## (Intercept) avg_amount max_amount recency first_purchase
## 7435.801479 -38.046607 38.171295 7.785148 NA
## frequency
## -12.229979
summary(in.sample[z,])
## CustomerID InvoiceNo recency first_purchase
## Min. :12347 561861 : 1 Min. :-323.6 Min. :-323.6
## 1st Qu.:13784 561862 : 1 1st Qu.:-300.5 1st Qu.:-300.5
## Median :15114 561863 : 1 Median :-270.5 Median :-270.5
## Mean :15244 561864 : 1 Mean :-266.1 Mean :-266.1
## 3rd Qu.:16746 561865 : 1 3rd Qu.:-234.6 3rd Qu.:-234.6
## Max. :18287 561866 : 1 Max. :-192.4 Max. :-192.4
## (Other):6379
## frequency avg_amount max_amount
## Min. : 1.00 Min. : 0.38 Min. : 0.38
## 1st Qu.: 7.00 1st Qu.: 13.41 1st Qu.: 29.50
## Median : 15.00 Median : 19.73 Median : 45.00
## Mean : 22.36 Mean : 80.37 Mean : 132.94
## 3rd Qu.: 29.00 3rd Qu.: 34.78 3rd Qu.: 95.20
## Max. :399.00 Max. :168469.60 Max. :168469.60
##
## revenue_6months active_6months
## Min. : 2.9 Min. :1
## 1st Qu.: 597.1 1st Qu.:1
## Median : 1560.8 Median :1
## Mean : 7107.1 Mean :1
## 3rd Qu.: 4265.8 3rd Qu.:1
## Max. :136028.1 Max. :1
##
a = table(in.sample[z, ]$revenue_6months, amount.model$fitted.values)
summary(amount.model$fitted.values)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2956 5510 6007 7107 7089 177600
summary(in.sample[z, ]$revenue_6months)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.9 597.1 1561.0 7107.0 4266.0 136000.0
plot(x = in.sample[z, ]$revenue_6months, y = amount.model$fitted.values)
qqplot(x = in.sample[z, ]$revenue_6months, y = amount.model$fitted.values)
plot(x = in.sample[z,]$CustomerID, y=in.sample[z, ]$revenue_6months )
plot(x = in.sample[z,]$CustomerID, y=amount.model$fitted.values )
amount.model = lm(formula = log(revenue_6months) ~ log(avg_amount) + log(max_amount), data = in.sample[z, ])
summary(amount.model)
##
## Call:
## lm(formula = log(revenue_6months) ~ log(avg_amount) + log(max_amount),
## data = in.sample[z, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.4912 -0.9647 -0.0475 0.8456 6.2929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.07231 0.07555 67.137 < 2e-16 ***
## log(avg_amount) 0.10579 0.03253 3.252 0.00115 **
## log(max_amount) 0.51606 0.03513 14.688 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.439 on 6382 degrees of freedom
## Multiple R-squared: 0.1601, Adjusted R-squared: 0.1598
## F-statistic: 608.1 on 2 and 6382 DF, p-value: < 2.2e-16
plot(x = log(in.sample[z, ]$revenue_6months), y = amount.model$fitted.values)
# APPLY THE MODELS TO TODAY’S DATA
library(sqldf)
customers_2012 = sqldf("SELECT customerID,
MIN(days_since) AS 'recency',
MAX(days_since) AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(Amount) AS 'avg_amount',
MAX(Amount) AS 'max_amount'
FROM compdata GROUP BY 1")
head(customers_2012)
## CustomerID recency first_purchase frequency avg_amount max_amount
## 1 12346 181.54097 181.5410 1 77183.60000 77183.6
## 2 12347 -141.70278 223.3354 182 23.68132 249.6
## 3 12348 -68.55069 214.1604 31 57.97548 240.0
## 4 12349 -125.45208 -125.4521 73 24.07603 300.0
## 5 12350 166.29097 166.2910 17 19.67059 40.0
## 6 12352 -107.60903 152.4354 85 29.48282 376.5
customers_2012$prob_predicted = predict(object = prob.model, newdata = customers_2012, type = "probs")
customers_2012$revenue_predicted = exp(predict(object = amount.model, newdata = customers_2012))
customers_2012$score_predicted = customers_2012$prob_predicted * customers_2012$revenue_predicted
summary(customers_2012$prob_predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.9074 0.9507 0.9231 0.9714 1.0000
summary(customers_2012$revenue_predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 362.9 1263.0 1669.0 2374.0 2499.0 252600.0
summary(customers_2012$score_predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1151 1543 2217 2317 252600
hist(customers_2012$score_predicted)
hist(log(customers_2012$score_predicted))
customers_2012_new = customers_2012[customers_2012$score_predicted < 10000,]
d = dist(customers_2012_new)
plot(customers_2012_new$score_predicted)
hist(customers_2012_new$score_predicted)
Perform hierarchical clustering on distance metrics
c = hclust(d, method="ward.D2")
Plot the dendogram
plot(c)
Cut at 5 segments
members = cutree(c, k = 5)
customers_2012_new$cust_segement = members
members[1:30]
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 1 1 1 2 3 2 2 1 3 1 1 1 1 2 2 2 2 1 2 1 3 2 2 1 2
## 27 28 29 30 31
## 2 2 2 1 1
table(members)
## members
## 1 2 3 4 5
## 592 1103 166 1397 1029
write.csv(file = "customers_2012_new.csv", x = customers_2012_new)
so we can target and customize our marketing promotions and offers to specific customers based on their predicted score
z = which(customers_2012$score_predicted > 2200)
print(length(z))
## [1] 1198