This is a transnational data set which contains all the transactions occurring between 12/01/2010 and 12/09/2011 for a UK-based and registered non-store online retail.The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.
The raw data includes 541909 observations of 8 variables. I omitted 136534 missing observations and the new data set has 406829 observations.
Create an aggregated variable named Amount, by multiplying Quantity with Price, which gives the total amount of money spent per product / item in each transaction. The amount variable is assume to represent sales revenue. Separate the variable InvoiceDate into two variables Date and Time . This allows different transactions created by the same consumer on the same day but at different times to be treated separately. Create SKU variable that encodes the first 3 digits of Stockcode to indicate different stock keeping unit, which is related to later calculation of "Breadth" customer behavior indicator.
#omit NA
eRetail<-na.omit(eRetail)
#create Amount variable#
eRetail$Amount <- eRetail$Quantity * eRetail$UnitPrice
#create SKU variable
eRetail$SKU <- substr(eRetail$StockCode,1,3)
#separate date & time#
eRetail$InvoiceDate<-strptime(eRetail$InvoiceDate,"%m/%d/%Y %H:%M")
eRetail$InvoiceTime = format(eRetail$InvoiceDate,"%H")
eRetail$InvoiceDate<-as.Date(eRetail$InvoiceDate,"%m/%d/%Y")
#look at internal structure#
str(eRetail)## 'data.frame': 406829 obs. of 11 variables:
## $ InvoiceNo : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
## $ StockCode : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ...
## $ Description: Factor w/ 4223 levels " 4 PURPLE FLOCK DINNER CANDLES",..: 4026 4034 931 1958 2979 3234 1572 1697 1694 258 ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: Date, format: "2010-12-01" "2010-12-01" ...
## $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...
## $ Amount : num 15.3 20.3 22 20.3 20.3 ...
## $ SKU : chr "851" "710" "844" "840" ...
## $ InvoiceTime: chr "08" "08" "08" "08" ...
## - attr(*, "na.action")=Class 'omit' Named int [1:135080] 623 1444 1445 1446 1447 1448 1449 1450 1451 1452 ...
## .. ..- attr(*, "names")= chr [1:135080] "623" "1444" "1445" "1446" ...
#View a summary#
summary(eRetail)## InvoiceNo StockCode
## 576339 : 542 85123A : 2077
## 579196 : 533 22423 : 1905
## 580727 : 529 85099B : 1662
## 578270 : 442 84879 : 1418
## 573576 : 435 47566 : 1416
## 567656 : 421 20725 : 1359
## (Other):403927 (Other):396992
## Description Quantity
## WHITE HANGING HEART T-LIGHT HOLDER: 2070 Min. :-80995.00
## REGENCY CAKESTAND 3 TIER : 1905 1st Qu.: 2.00
## JUMBO BAG RED RETROSPOT : 1662 Median : 5.00
## ASSORTED COLOUR BIRD ORNAMENT : 1418 Mean : 12.06
## PARTY BUNTING : 1416 3rd Qu.: 12.00
## LUNCH BAG RED RETROSPOT : 1358 Max. : 80995.00
## (Other) :397000
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 Min. : 0.00 Min. :12346
## 1st Qu.:2011-04-06 1st Qu.: 1.25 1st Qu.:13953
## Median :2011-07-31 Median : 1.95 Median :15152
## Mean :2011-07-10 Mean : 3.46 Mean :15288
## 3rd Qu.:2011-10-20 3rd Qu.: 3.75 3rd Qu.:16791
## Max. :2011-12-09 Max. :38970.00 Max. :18287
##
## Country Amount SKU
## United Kingdom:361878 Min. :-168469.6 Length:406829
## Germany : 9495 1st Qu.: 4.2 Class :character
## France : 8491 Median : 11.1 Mode :character
## EIRE : 7485 Mean : 20.4
## Spain : 2533 3rd Qu.: 19.5
## Netherlands : 2371 Max. : 168469.6
## (Other) : 14576
## InvoiceTime
## Length:406829
## Class :character
## Mode :character
##
##
##
##
#View the top#
head(eRetail)## InvoiceNo StockCode Description Quantity
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
## 2 536365 71053 WHITE METAL LANTERN 6
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2
## InvoiceDate UnitPrice CustomerID Country Amount SKU InvoiceTime
## 1 2010-12-01 2.55 17850 United Kingdom 15.30 851 08
## 2 2010-12-01 3.39 17850 United Kingdom 20.34 710 08
## 3 2010-12-01 2.75 17850 United Kingdom 22.00 844 08
## 4 2010-12-01 3.39 17850 United Kingdom 20.34 840 08
## 5 2010-12-01 3.39 17850 United Kingdom 20.34 840 08
## 6 2010-12-01 7.65 17850 United Kingdom 15.30 227 08
## boxplot of Amount
boxplot(eRetail$Amount)$stats[c(1, 5), ]## [1] -18.75 42.45
#cutoff outliner
Retail<-subset(eRetail,eRetail$Amount>= -18.75 & eRetail$Amount<= 42.45 )Retail1 <- ddply(Retail, .(StockCode), summarize, sumAmount= sum(Amount), sumQuantity= sum(Quantity), nCustomer= length(unique(CustomerID)), nPurchase= length(unique(InvoiceNo)) )
head(Retail1[order(-Retail1$sumQuantity),] )## StockCode sumAmount sumQuantity nCustomer nPurchase
## 2793 84077 6728.97 23023 291 430
## 426 21212 9062.59 16441 610 967
## 3049 84879 22962.03 13587 628 1134
## 3014 84826 236.38 13522 26 28
## 1088 22197 10393.55 12165 390 920
## 3213 85099B 24659.24 12035 582 1339
head(Retail1[order(-Retail1$nCustomer),] )## StockCode sumAmount sumQuantity nCustomer nPurchase
## 3227 85123A 26939.51 9137 756 1583
## 1287 22423 21520.21 1697 719 1132
## 3049 84879 22962.03 13587 628 1134
## 2583 47566 22632.99 4585 615 1089
## 426 21212 9062.59 16441 610 967
## 1567 22720 14978.48 3018 610 1114
head(Retail1[order(-Retail1$sumAmount),] )## StockCode sumAmount sumQuantity nCustomer nPurchase
## 3227 85123A 26939.51 9137 756 1583
## 3213 85099B 24659.24 12035 582 1339
## 3049 84879 22962.03 13587 628 1134
## 2583 47566 22632.99 4585 615 1089
## 1287 22423 21520.21 1697 719 1132
## 2023 23203 18458.88 8906 483 989
head(Retail1[order(-Retail1$nPurchase),] )## StockCode sumAmount sumQuantity nCustomer nPurchase
## 3227 85123A 26939.51 9137 756 1583
## 3213 85099B 24659.24 12035 582 1339
## 171 20725 16063.55 9695 518 1216
## 3049 84879 22962.03 13587 628 1134
## 1287 22423 21520.21 1697 719 1132
## 1567 22720 14978.48 3018 610 1114
5 top selling products by sales volume: Product StockCode 3825 WORLD WAR 2 GLIDERS ASSTD DESIGNS 84077 1779 JUMBO BAG RED RETROSPOT 85099B 218 ASSORTED COLOUR BIRD ORNAMENT 84879 3736 WHITE HANGING HEART T-LIGHT HOLDER 85123A 2289 PACK OF 72 RETROSPOT CAKE CASES 21212 2625 POPCORN HOLDER 22197 sumAmount sumQuantity nCustomer nPurchase 3825 13332.33 53215 307 477 1779 83236.76 45066 636 1643 218 56499.22 35314 679 1385 3736 93823.85 34147 858 2013 2289 16247.95 33409 636 1041 2625 23098.30 30504 296 668
5 top selling products by number of customers: Product StockCode 2793 REGENCY CAKESTAND 3 TIER 22423 3736 WHITE HANGING HEART T-LIGHT HOLDER 85123A 2365 PARTY BUNTING 47566 218 ASSORTED COLOUR BIRD ORNAMENT 84879 3031 SET OF 3 CAKE TINS PANTRY DESIGN 22720 1779 JUMBO BAG RED RETROSPOT 85099B sumAmount sumQuantity nCustomer nPurchase 2793 132870.40 11555 887 1884 3736 93823.85 34147 858 2013 2365 67687.53 15027 708 1399 218 56499.22 35314 679 1385 3031 32607.80 6864 640 1218 1779 83236.76 45066 636 1643
5 top selling products by value of sales revenue: Product StockCode 2793 REGENCY CAKESTAND 3 TIER 22423 3736 WHITE HANGING HEART T-LIGHT HOLDER 85123A 1779 JUMBO BAG RED RETROSPOT 85099B 2365 PARTY BUNTING 47566 2637 POSTAGE POST 218 ASSORTED COLOUR BIRD ORNAMENT 84879 sumAmount sumQuantity nCustomer nPurchase 2793 132870.40 11555 887 1884 3736 93823.85 34147 858 2013 1779 83236.76 45066 636 1643 2365 67687.53 15027 708 1399 2637 66710.24 3002 378 1192 218 56499.22 35314 679 1385
5 top selling products by number of purchases: Product StockCode 3736 WHITE HANGING HEART T-LIGHT HOLDER 85123A 2793 REGENCY CAKESTAND 3 TIER 22423 1779 JUMBO BAG RED RETROSPOT 85099B 2365 PARTY BUNTING 47566 218 ASSORTED COLOUR BIRD ORNAMENT 84879 1961 LUNCH BAG RED RETROSPOT 20725 sumAmount sumQuantity nCustomer nPurchase 3736 93823.85 34147 858 2013 2793 132870.40 11555 887 1884 1779 83236.76 45066 636 1643 2365 67687.53 15027 708 1399 218 56499.22 35314 679 1385 1961 27230.05 17145 532 1329
Retail2 <- subset(Retail, Description%in%c("MEDIUM CERAMIC TOP STORAGE JAR","JUMBO BAG RED RETROSPOT","REGENCY CAKESTAND 3 TIER","WHITE HANGING HEART T-LIGHT HOLDER","PARTY BUNTING","WORLD WAR 2 GLIDERS ASSTD DESIGNS"), select = c(Description,InvoiceDate,InvoiceTime,Quantity,CustomerID,Amount,InvoiceNo))
Retail2$Invoice_month<-month(Retail2$InvoiceDate)
Retail2$Decription<-as.character(Retail2$Description)ggplot(Retail2, aes(x=Invoice_month, y= Quantity))+ facet_wrap(~Description, ncol=2) +
geom_bar(stat="identity") +
labs(title = "Sales by month", x = "Month", y = "Sales Volume")ggplot(Retail2, aes(x=Invoice_month, y= length(unique(CustomerID)) )) + facet_wrap(~Description, ncol=2) +
geom_bar(stat="identity") +
labs(title = "Sales by month", x = "Month", y = "Number of Customer") ggplot(Retail2, aes(x=Invoice_month, y= Amount )) + facet_wrap(~Description, ncol=2) +
geom_bar(stat="identity") +
labs(title = "Sales by month", x = "Month", y = "Sales Revenue") ggplot(Retail2, aes(x=Invoice_month, y= length(unique(InvoiceNo)) )) + facet_wrap(~Description, ncol=2) +
geom_bar(stat="identity") +
labs(title = "Sales by month", x = "Month", y = "Number of Purchases") *The sales of products changes with time.
Retail3<-ddply(Retail, .(InvoiceTime), summarize, sumAmount=sum(Amount), sumQuantity=sum(Quantity), nCustomer=length(unique(CustomerID)))
names(Retail3) [1] <-"InvoiceHour"ggplot(Retail3, aes(x=InvoiceHour, y= sumQuantity)) +
geom_bar(stat="identity") +
labs(title = "Sales by hours", x = "Hours", y = "Sales Volume")ggplot(Retail3, aes(x=InvoiceHour, y= nCustomer)) +
geom_bar(stat="identity") +
labs(title = "Sales by hours", x = "Hours", y = "Number of customer") *The busiest hour of the day is around 12 pm for sales volume, and 12 pm for number of customers.
getRFMdf<-function (RFM_raw){
RFM_raw <- RFM_raw[!duplicated(RFM_raw$CustomerID),]
RFM_raw <- cbind(RFM_raw, First_date = with(df,
as.Date(as.integer(by(InvoiceDate, CustomerID, min)), "1970/01/01")))
RFM_raw <- cbind(RFM_raw, Last_date = with(df,
as.Date(as.integer(by(InvoiceDate, CustomerID, max)), "1970/01/01")))
#Recency
AsOfDate <- max(RFM_raw$Last_date)
RFM_raw <- cbind(RFM_raw, Recency = with(df,
as.numeric(difftime(AsOfDate,RFM_raw$Last_date,units="days")))/30)
#First_purchase
RFM_raw <- cbind(RFM_raw, First_purchase = with(df,
as.numeric(difftime(AsOfDate,RFM_raw$First_date,units="days")))/30)
#Frequency
RFM_raw <- cbind(RFM_raw, Frequency = with(df,
as.numeric(by(InvoiceNo, CustomerID, function(x) length(unique(x))))))
#Monetary & related
RFM_raw <- cbind(RFM_raw, Monetary = with(df,
as.numeric(by(Amount, CustomerID, sum))))
RFM_raw <- cbind(RFM_raw, AvgM = with(df,
as.numeric(by(Amount, CustomerID, mean))))
RFM_raw <- cbind(RFM_raw, maxM = with(df,
as.numeric(by(Amount, CustomerID, max))))
#Breadth
RFM_raw <- cbind(RFM_raw, Breadth = with(df,
as.numeric(by(SKU, CustomerID, function(x) length(unique(x))))))
#Tenure
RFM_raw <- cbind(RFM_raw, Tenure = with(df, as.numeric(difftime(RFM_raw$Last_date,RFM_raw$First_date,units="days")))/30)
#sum Quantity
RFM_raw <- cbind(RFM_raw, sumQuant = with(df,
as.numeric(by(Quantity, CustomerID, mean))))
}getRFMnor<-function (RFMn){
RFMn<- as.data.frame(scale(df2[14:22], center= TRUE))
RFMn<- cbind(df2[,c(1:13)],RFMn)
RFMn<- rename(RFMn, c("Recency" = "R", "Frequency" = "Fq", "Monetary" = "M", "Breadth" = "B" , "Tenure" = "Ten", "sumQuant" = "Q" ) )
}#score 1 to 5
score15<-function(x){
ceiling((rank(x))/(length(x))*5)
}
getRFMscore<-function (RFMs){
RFMs <- as.data.frame(lapply(df3[,c(15:22)], score15))
RFMs <- cbind(df3[,c(1:13)], R= ceiling((rank(-df3$R))/(length(df3$R))*5), RFMs)
RFMs <- cbind(RFMs,RFMScore = 100*RFMs$R + 10*RFMs$Fq+RFMs$M)
}df<- eRetail
rawRFM<-as.data.frame(getRFMdf(df))#take a look at disturbution
par(mfrow = c(1,3))
boxplot(rawRFM$Recency)$stats[c(1, 5), ]## [1] 0.0 11.1
boxplot(rawRFM$Frequency)$stats[c(1, 5), ]## [1] 1 11
boxplot(rawRFM$Monetary)$stats[c(1, 5), ]## [1] -1592.49 3580.13
Strong left-skewness for Recency, Frequency, TotalAmount,Monetary,Breadth, and Tenure.
RFM<-subset(rawRFM,rawRFM$Recency<= 12 & rawRFM$Frequency<= 25 & rawRFM$Monetary>= 0 & rawRFM$Monetary<= 10000)
summary(rawRFM$Monetary)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4288.0 293.4 648.1 1898.0 1612.0 279500.0
par(mfrow = c(1,3))
hist(RFM$Recency)
hist(RFM$Frequency)
hist(RFM$Monetary)Now, the Left-skewness is better.
#data normalization
df2<- RFM
nRFM<-as.data.frame(getRFMnor(df2))
#score
df3 <- nRFM
RFMs<-as.data.frame(getRFMscore(df3))
par(mfrow = c(1,3))
hist(RFMs$R)
hist(RFMs$Fq)
hist(RFMs$M)In hope to find a trend / association among the scores identify customer segment with high value high value customer is defined as high Monetary score M is Monetary score, which is score upon the sum purchase value of the different purchases of an individual customer
#draw histogram
drawHistograms <- function(df,r=5,f=5,m=5){
#set the layout plot window
par(mfrow = c(m,f))
names <-rep("",times=r)
for(i in 1:r) names[i]<-paste("R",i)
for (i in 1:m){
for (j in 1:f){
c <- rep(0,times=r)
for(k in 1:f){
tmpdf <-df[df$Fq==j & df$M==i & df$R==k,]
c[k]<- dim(tmpdf)[1]
}
if (i==1 & j==1)
barplot(c,col="lightblue",names.arg=names)
else
barplot(c,col="lightblue")
if (j==1) title(ylab=paste("M",i))
if (i==1) title(main=paste("Fq",j))
}
}
par(mfrow = c(5,5))
} # end of drawHistograms function#drawHistograms(RFMscore)RFM_cluster <- data.frame(RFM$Recency,RFM$Frequency,RFM$Monetary,RFMs$RFMScore)
ggplot(RFM_cluster,aes(x=RFM.Recency,y=RFM.Monetary))+geom_point()d <- dist(RFM_cluster)
km <- kmeans(d,centers=5)
RFM_cluster$cluster <- km$cluster
RFM_cluster$CustomerID <- RFM$CustomerIDRFM_cluster1<-RFM_cluster[which(RFM_cluster$cluster==1),]
RFM_cluster2<-RFM_cluster[which(RFM_cluster$cluster==2),]
RFM_cluster3<-RFM_cluster[which(RFM_cluster$cluster==3),]
RFM_cluster4<-RFM_cluster[which(RFM_cluster$cluster==4),]
RFM_cluster5<-RFM_cluster[which(RFM_cluster$cluster==5),]