Online Retail

DATA SET DESCRIPTION

This is a transnational data set which contains all the transactions occurring between 12/01/2010 and 12/09/2011 for a UK-based and registered non-store online retail.The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.

Load the data

The raw data includes 541909 observations of 8 variables. I omitted 136534 missing observations and the new data set has 406829 observations.

Data pre-processing

Create an aggregated variable named Amount, by multiplying Quantity with Price, which gives the total amount of money spent per product / item in each transaction. The amount variable is assume to represent sales revenue. Separate the variable InvoiceDate into two variables Date and Time . This allows different transactions created by the same consumer on the same day but at different times to be treated separately. Create SKU variable that encodes the first 3 digits of Stockcode to indicate different stock keeping unit, which is related to later calculation of "Breadth" customer behavior indicator.

#omit NA
eRetail<-na.omit(eRetail)
#create Amount variable#
eRetail$Amount <- eRetail$Quantity * eRetail$UnitPrice
#create SKU variable
eRetail$SKU <- substr(eRetail$StockCode,1,3)
#separate date & time#
eRetail$InvoiceDate<-strptime(eRetail$InvoiceDate,"%m/%d/%Y %H:%M")
eRetail$InvoiceTime = format(eRetail$InvoiceDate,"%H")
eRetail$InvoiceDate<-as.Date(eRetail$InvoiceDate,"%m/%d/%Y")
#look at internal structure#
str(eRetail)
## 'data.frame':    406829 obs. of  11 variables:
##  $ InvoiceNo  : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
##  $ StockCode  : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ...
##  $ Description: Factor w/ 4223 levels " 4 PURPLE FLOCK DINNER CANDLES",..: 4026 4034 931 1958 2979 3234 1572 1697 1694 258 ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: Date, format: "2010-12-01" "2010-12-01" ...
##  $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
##  $ Country    : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...
##  $ Amount     : num  15.3 20.3 22 20.3 20.3 ...
##  $ SKU        : chr  "851" "710" "844" "840" ...
##  $ InvoiceTime: chr  "08" "08" "08" "08" ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:135080] 623 1444 1445 1446 1447 1448 1449 1450 1451 1452 ...
##   .. ..- attr(*, "names")= chr [1:135080] "623" "1444" "1445" "1446" ...
#View a summary#
summary(eRetail)
##    InvoiceNo        StockCode     
##  576339 :   542   85123A :  2077  
##  579196 :   533   22423  :  1905  
##  580727 :   529   85099B :  1662  
##  578270 :   442   84879  :  1418  
##  573576 :   435   47566  :  1416  
##  567656 :   421   20725  :  1359  
##  (Other):403927   (Other):396992  
##                              Description        Quantity        
##  WHITE HANGING HEART T-LIGHT HOLDER:  2070   Min.   :-80995.00  
##  REGENCY CAKESTAND 3 TIER          :  1905   1st Qu.:     2.00  
##  JUMBO BAG RED RETROSPOT           :  1662   Median :     5.00  
##  ASSORTED COLOUR BIRD ORNAMENT     :  1418   Mean   :    12.06  
##  PARTY BUNTING                     :  1416   3rd Qu.:    12.00  
##  LUNCH BAG RED RETROSPOT           :  1358   Max.   : 80995.00  
##  (Other)                           :397000                      
##   InvoiceDate           UnitPrice          CustomerID   
##  Min.   :2010-12-01   Min.   :    0.00   Min.   :12346  
##  1st Qu.:2011-04-06   1st Qu.:    1.25   1st Qu.:13953  
##  Median :2011-07-31   Median :    1.95   Median :15152  
##  Mean   :2011-07-10   Mean   :    3.46   Mean   :15288  
##  3rd Qu.:2011-10-20   3rd Qu.:    3.75   3rd Qu.:16791  
##  Max.   :2011-12-09   Max.   :38970.00   Max.   :18287  
##                                                         
##            Country           Amount              SKU           
##  United Kingdom:361878   Min.   :-168469.6   Length:406829     
##  Germany       :  9495   1st Qu.:      4.2   Class :character  
##  France        :  8491   Median :     11.1   Mode  :character  
##  EIRE          :  7485   Mean   :     20.4                     
##  Spain         :  2533   3rd Qu.:     19.5                     
##  Netherlands   :  2371   Max.   : 168469.6                     
##  (Other)       : 14576                                         
##  InvoiceTime       
##  Length:406829     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
#View the top#
head(eRetail)
##   InvoiceNo StockCode                         Description Quantity
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6
## 2    536365     71053                 WHITE METAL LANTERN        6
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2
##   InvoiceDate UnitPrice CustomerID        Country Amount SKU InvoiceTime
## 1  2010-12-01      2.55      17850 United Kingdom  15.30 851          08
## 2  2010-12-01      3.39      17850 United Kingdom  20.34 710          08
## 3  2010-12-01      2.75      17850 United Kingdom  22.00 844          08
## 4  2010-12-01      3.39      17850 United Kingdom  20.34 840          08
## 5  2010-12-01      3.39      17850 United Kingdom  20.34 840          08
## 6  2010-12-01      7.65      17850 United Kingdom  15.30 227          08

SIMPLE EXPLORATION OF DATA

Outliers

## boxplot of Amount
boxplot(eRetail$Amount)$stats[c(1, 5), ]

## [1] -18.75  42.45
#cutoff outliner
Retail<-subset(eRetail,eRetail$Amount>= -18.75 & eRetail$Amount<= 42.45 )

What are top 5 selling products accross all times?

Retail1 <- ddply(Retail, .(StockCode), summarize, sumAmount= sum(Amount), sumQuantity= sum(Quantity), nCustomer= length(unique(CustomerID)), nPurchase= length(unique(InvoiceNo)) )

head(Retail1[order(-Retail1$sumQuantity),] )
##      StockCode sumAmount sumQuantity nCustomer nPurchase
## 2793     84077   6728.97       23023       291       430
## 426      21212   9062.59       16441       610       967
## 3049     84879  22962.03       13587       628      1134
## 3014     84826    236.38       13522        26        28
## 1088     22197  10393.55       12165       390       920
## 3213    85099B  24659.24       12035       582      1339
head(Retail1[order(-Retail1$nCustomer),] )
##      StockCode sumAmount sumQuantity nCustomer nPurchase
## 3227    85123A  26939.51        9137       756      1583
## 1287     22423  21520.21        1697       719      1132
## 3049     84879  22962.03       13587       628      1134
## 2583     47566  22632.99        4585       615      1089
## 426      21212   9062.59       16441       610       967
## 1567     22720  14978.48        3018       610      1114
head(Retail1[order(-Retail1$sumAmount),] )
##      StockCode sumAmount sumQuantity nCustomer nPurchase
## 3227    85123A  26939.51        9137       756      1583
## 3213    85099B  24659.24       12035       582      1339
## 3049     84879  22962.03       13587       628      1134
## 2583     47566  22632.99        4585       615      1089
## 1287     22423  21520.21        1697       719      1132
## 2023     23203  18458.88        8906       483       989
head(Retail1[order(-Retail1$nPurchase),] )
##      StockCode sumAmount sumQuantity nCustomer nPurchase
## 3227    85123A  26939.51        9137       756      1583
## 3213    85099B  24659.24       12035       582      1339
## 171      20725  16063.55        9695       518      1216
## 3049     84879  22962.03       13587       628      1134
## 1287     22423  21520.21        1697       719      1132
## 1567     22720  14978.48        3018       610      1114

5 top selling products by sales volume: Product StockCode 3825 WORLD WAR 2 GLIDERS ASSTD DESIGNS 84077 1779 JUMBO BAG RED RETROSPOT 85099B 218 ASSORTED COLOUR BIRD ORNAMENT 84879 3736 WHITE HANGING HEART T-LIGHT HOLDER 85123A 2289 PACK OF 72 RETROSPOT CAKE CASES 21212 2625 POPCORN HOLDER 22197 sumAmount sumQuantity nCustomer nPurchase 3825 13332.33 53215 307 477 1779 83236.76 45066 636 1643 218 56499.22 35314 679 1385 3736 93823.85 34147 858 2013 2289 16247.95 33409 636 1041 2625 23098.30 30504 296 668

5 top selling products by number of customers: Product StockCode 2793 REGENCY CAKESTAND 3 TIER 22423 3736 WHITE HANGING HEART T-LIGHT HOLDER 85123A 2365 PARTY BUNTING 47566 218 ASSORTED COLOUR BIRD ORNAMENT 84879 3031 SET OF 3 CAKE TINS PANTRY DESIGN 22720 1779 JUMBO BAG RED RETROSPOT 85099B sumAmount sumQuantity nCustomer nPurchase 2793 132870.40 11555 887 1884 3736 93823.85 34147 858 2013 2365 67687.53 15027 708 1399 218 56499.22 35314 679 1385 3031 32607.80 6864 640 1218 1779 83236.76 45066 636 1643

5 top selling products by value of sales revenue: Product StockCode 2793 REGENCY CAKESTAND 3 TIER 22423 3736 WHITE HANGING HEART T-LIGHT HOLDER 85123A 1779 JUMBO BAG RED RETROSPOT 85099B 2365 PARTY BUNTING 47566 2637 POSTAGE POST 218 ASSORTED COLOUR BIRD ORNAMENT 84879 sumAmount sumQuantity nCustomer nPurchase 2793 132870.40 11555 887 1884 3736 93823.85 34147 858 2013 1779 83236.76 45066 636 1643 2365 67687.53 15027 708 1399 2637 66710.24 3002 378 1192 218 56499.22 35314 679 1385

5 top selling products by number of purchases: Product StockCode 3736 WHITE HANGING HEART T-LIGHT HOLDER 85123A 2793 REGENCY CAKESTAND 3 TIER 22423 1779 JUMBO BAG RED RETROSPOT 85099B 2365 PARTY BUNTING 47566 218 ASSORTED COLOUR BIRD ORNAMENT 84879 1961 LUNCH BAG RED RETROSPOT 20725 sumAmount sumQuantity nCustomer nPurchase 3736 93823.85 34147 858 2013 2793 132870.40 11555 887 1884 1779 83236.76 45066 636 1643 2365 67687.53 15027 708 1399 218 56499.22 35314 679 1385 1961 27230.05 17145 532 1329

Do these sales of top selling products change with time (months)?Any seasonality?

Retail2 <- subset(Retail, Description%in%c("MEDIUM CERAMIC TOP STORAGE JAR","JUMBO BAG RED RETROSPOT","REGENCY CAKESTAND 3 TIER","WHITE HANGING HEART T-LIGHT HOLDER","PARTY BUNTING","WORLD WAR 2 GLIDERS ASSTD DESIGNS"), select = c(Description,InvoiceDate,InvoiceTime,Quantity,CustomerID,Amount,InvoiceNo))
Retail2$Invoice_month<-month(Retail2$InvoiceDate)
Retail2$Decription<-as.character(Retail2$Description)
ggplot(Retail2, aes(x=Invoice_month, y= Quantity))+ facet_wrap(~Description, ncol=2) + 
  geom_bar(stat="identity") + 
  labs(title = "Sales by month", x = "Month", y = "Sales Volume")

ggplot(Retail2, aes(x=Invoice_month, y= length(unique(CustomerID)) )) + facet_wrap(~Description, ncol=2) + 
  geom_bar(stat="identity") + 
  labs(title = "Sales by month", x = "Month", y = "Number of Customer") 

ggplot(Retail2, aes(x=Invoice_month, y= Amount )) + facet_wrap(~Description, ncol=2) + 
  geom_bar(stat="identity") + 
  labs(title = "Sales by month", x = "Month", y = "Sales Revenue") 

ggplot(Retail2, aes(x=Invoice_month, y= length(unique(InvoiceNo)) )) + facet_wrap(~Description, ncol=2) + 
  geom_bar(stat="identity") + 
  labs(title = "Sales by month", x = "Month", y = "Number of Purchases") 

*The sales of products changes with time.

What are the busiest hours of a day?

Retail3<-ddply(Retail, .(InvoiceTime), summarize, sumAmount=sum(Amount), sumQuantity=sum(Quantity), nCustomer=length(unique(CustomerID)))
names(Retail3) [1] <-"InvoiceHour"
ggplot(Retail3, aes(x=InvoiceHour, y= sumQuantity)) + 
  geom_bar(stat="identity") + 
  labs(title = "Sales by hours", x = "Hours", y = "Sales Volume")

ggplot(Retail3, aes(x=InvoiceHour, y= nCustomer)) + 
  geom_bar(stat="identity") + 
  labs(title = "Sales by hours", x = "Hours", y = "Number of customer")

*The busiest hour of the day is around 12 pm for sales volume, and 12 pm for number of customers.

START RFM ANALYSIS

Building dataset for RFM analysis

getRFMdf<-function (RFM_raw){
RFM_raw <- RFM_raw[!duplicated(RFM_raw$CustomerID),]
RFM_raw <- cbind(RFM_raw, First_date = with(df,
as.Date(as.integer(by(InvoiceDate, CustomerID, min)), "1970/01/01")))
RFM_raw <- cbind(RFM_raw, Last_date = with(df,
as.Date(as.integer(by(InvoiceDate, CustomerID, max)), "1970/01/01")))
#Recency
AsOfDate <- max(RFM_raw$Last_date)
RFM_raw <- cbind(RFM_raw, Recency = with(df,
as.numeric(difftime(AsOfDate,RFM_raw$Last_date,units="days")))/30)
#First_purchase
RFM_raw <- cbind(RFM_raw, First_purchase = with(df,
as.numeric(difftime(AsOfDate,RFM_raw$First_date,units="days")))/30)
#Frequency
RFM_raw <- cbind(RFM_raw, Frequency = with(df,
as.numeric(by(InvoiceNo, CustomerID, function(x) length(unique(x))))))
#Monetary & related
RFM_raw <- cbind(RFM_raw, Monetary = with(df,
as.numeric(by(Amount, CustomerID, sum))))
RFM_raw <- cbind(RFM_raw, AvgM = with(df,
as.numeric(by(Amount, CustomerID, mean))))
RFM_raw <- cbind(RFM_raw, maxM = with(df,
as.numeric(by(Amount, CustomerID, max))))
#Breadth
RFM_raw <- cbind(RFM_raw, Breadth = with(df,
as.numeric(by(SKU, CustomerID, function(x) length(unique(x))))))
#Tenure
RFM_raw <- cbind(RFM_raw, Tenure = with(df, as.numeric(difftime(RFM_raw$Last_date,RFM_raw$First_date,units="days")))/30)
#sum Quantity
RFM_raw <- cbind(RFM_raw, sumQuant = with(df,
as.numeric(by(Quantity, CustomerID, mean))))
}

define getRFMnor

getRFMnor<-function (RFMn){
RFMn<- as.data.frame(scale(df2[14:22], center= TRUE))
RFMn<- cbind(df2[,c(1:13)],RFMn)
RFMn<- rename(RFMn, c("Recency" = "R", "Frequency" = "Fq", "Monetary" = "M", "Breadth" = "B" , "Tenure" = "Ten", "sumQuant" = "Q" ) )
}

define getRFMscore function

#score 1 to 5
score15<-function(x){
  ceiling((rank(x))/(length(x))*5)
}
getRFMscore<-function (RFMs){
RFMs <- as.data.frame(lapply(df3[,c(15:22)], score15))
RFMs <- cbind(df3[,c(1:13)], R= ceiling((rank(-df3$R))/(length(df3$R))*5), RFMs)
RFMs <- cbind(RFMs,RFMScore = 100*RFMs$R + 10*RFMs$Fq+RFMs$M)
}

RFM score on the whole data (12/2010 to 12/2011)

df<- eRetail
rawRFM<-as.data.frame(getRFMdf(df))

some rawRFM score EDA

#take a look at disturbution
 par(mfrow = c(1,3))
boxplot(rawRFM$Recency)$stats[c(1, 5), ]
## [1]  0.0 11.1
boxplot(rawRFM$Frequency)$stats[c(1, 5), ]
## [1]  1 11
boxplot(rawRFM$Monetary)$stats[c(1, 5), ]

## [1] -1592.49  3580.13

Strong left-skewness for Recency, Frequency, TotalAmount,Monetary,Breadth, and Tenure.

Exclude outliners

RFM<-subset(rawRFM,rawRFM$Recency<= 12 & rawRFM$Frequency<= 25 & rawRFM$Monetary>= 0 & rawRFM$Monetary<= 10000)
summary(rawRFM$Monetary)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  -4288.0    293.4    648.1   1898.0   1612.0 279500.0
 par(mfrow = c(1,3))
 hist(RFM$Recency)
 hist(RFM$Frequency)
 hist(RFM$Monetary)

Now, the Left-skewness is better.

#data normalization
df2<- RFM
nRFM<-as.data.frame(getRFMnor(df2))
#score
df3 <- nRFM
RFMs<-as.data.frame(getRFMscore(df3))
 par(mfrow = c(1,3))
 hist(RFMs$R)
 hist(RFMs$Fq)
 hist(RFMs$M)

RFM visualization

In hope to find a trend / association among the scores identify customer segment with high value high value customer is defined as high Monetary score M is Monetary score, which is score upon the sum purchase value of the different purchases of an individual customer

#draw histogram
drawHistograms <- function(df,r=5,f=5,m=5){

#set the layout plot window
par(mfrow = c(m,f))

names <-rep("",times=r)
for(i in 1:r) names[i]<-paste("R",i)


for (i in 1:m){
    for (j in 1:f){
        c <- rep(0,times=r)
        for(k in 1:f){
            tmpdf <-df[df$Fq==j & df$M==i & df$R==k,]
            c[k]<- dim(tmpdf)[1]

        }
        if (i==1 & j==1) 
            barplot(c,col="lightblue",names.arg=names)
        else
            barplot(c,col="lightblue")
        if (j==1) title(ylab=paste("M",i))  
        if (i==1) title(main=paste("Fq",j)) 

    }

}

par(mfrow = c(5,5))

} # end of drawHistograms function
#drawHistograms(RFMscore)

K-means cluster analysis

Visulize the clusters

RFM_cluster <- data.frame(RFM$Recency,RFM$Frequency,RFM$Monetary,RFMs$RFMScore)
ggplot(RFM_cluster,aes(x=RFM.Recency,y=RFM.Monetary))+geom_point()

d  <- dist(RFM_cluster) 
km <- kmeans(d,centers=5)
RFM_cluster$cluster <- km$cluster 
RFM_cluster$CustomerID <- RFM$CustomerID

separate clusters

RFM_cluster1<-RFM_cluster[which(RFM_cluster$cluster==1),]

RFM_cluster2<-RFM_cluster[which(RFM_cluster$cluster==2),]

RFM_cluster3<-RFM_cluster[which(RFM_cluster$cluster==3),]

RFM_cluster4<-RFM_cluster[which(RFM_cluster$cluster==4),]

RFM_cluster5<-RFM_cluster[which(RFM_cluster$cluster==5),]