Here are the libraries we will use
library(data.table)
## Warning: package 'data.table' was built under R version 3.4.3
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
Here is a preview of the dataset with the dates formatted correctly
transactions <- fread('transactions.csv')
transactions[, TransDate:=dmy(TransDate, tz="UTC")]
head(transactions, 5)
We first need to calculate the max transaction date to calculate the recency score
(max.Date <- max(transactions[,TransDate]))
## [1] "2012-12-09 UTC"
Now we can calculate the recency, frequency, and monetary scores like so
rfm <- transactions[,list(
recency = as.numeric(max.Date - max(TransDate)),
frequency = .N,
value = mean(PurchAmount)
), by=Customer]
The cut2() function will allow us to create the rfm scores based on the data
rfm.scores <- rfm[,list(
Customer,
recency = as.numeric(cut2(-recency, g=3)),
frequency = as.numeric(cut2(frequency, g=3)),
value = as.numeric(cut2(value, g=3))
)]
Finally we can calculate the unweighted and weighted RFM scores. Weights can be specified in a vector, in this case we used \((.2, .2, .6)\)
rfm.scores[, UnweightedRFM := mean(c(recency, frequency, value)), by=Customer]
rfm.scores[, WeightedRFM := mean(c(recency, frequency, value) * c(.2,.2,.6)), by=Customer]
head(rfm.scores, 5)