customer segmentation on online-retailer data using RFM analysis.
library(readxl)
data<-read_excel("C:\\Users\\badal\\Desktop\\datset_\\Online_Retail.xlsx")
head(data)
str(data)
Classes âtbl_dfâ, âtblâ and 'data.frame': 541909 obs. of 8 variables:
$ InvoiceNo : chr "536365" "536365" "536365" "536365" ...
$ StockCode : chr "85123A" "71053" "84406B" "84029G" ...
$ Description: chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
$ Quantity : num 6 6 8 6 6 2 6 6 6 32 ...
$ InvoiceDate: POSIXct, format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
$ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
$ CustomerID : num 17850 17850 17850 17850 17850 ...
$ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
any(is.na(data))
[1] TRUE
summary(data)
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
Length:541909 Length:541909 Length:541909 Min. :-80995.00 Min. :2010-12-01 08:26:00 Min. :-11062.06 Min. :12346 Length:541909
Class :character Class :character Class :character 1st Qu.: 1.00 1st Qu.:2011-03-28 11:34:00 1st Qu.: 1.25 1st Qu.:13953 Class :character
Mode :character Mode :character Mode :character Median : 3.00 Median :2011-07-19 17:17:00 Median : 2.08 Median :15152 Mode :character
Mean : 9.55 Mean :2011-07-04 13:34:57 Mean : 4.61 Mean :15288
3rd Qu.: 10.00 3rd Qu.:2011-10-19 11:27:00 3rd Qu.: 4.13 3rd Qu.:16791
Max. : 80995.00 Max. :2011-12-09 12:50:00 Max. : 38970.00 Max. :18287
NA's :135080
data <- mutate(data, InvoiceDate = as.Date(InvoiceDate,"%m/%d/%Y"))
unknown timezone '%m/%d/%Y'
data$Amount <- data$Quantity*data$UnitPrice
sapply(data, function(k) sum(is.na(k)))
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country Amount
0 0 1454 0 0 0 135080 0 0
nrow(data)
[1] 541909
data <- data[!is.na(data$CustomerID),]
data <- data[!is.na(data$Description),]
nrow(data)
[1] 406829
data$Day <- format(as.Date(data$InvoiceDate),"%d")
data$Month <- format(as.Date(data$InvoiceDate),"%m")
data$Year <- format(as.Date(data$InvoiceDate),"%Y")
cncldSalesData <- salesData[startsWith(salesData$InvoiceNo,“C”),] salesData <- salesData[!startsWith(salesData$InvoiceNo,“C”),]
cncldata <- data[startsWith(data$InvoiceNo,"C"),]
data <- data[!startsWith(data$InvoiceNo,"C"),]
summary(data)
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country Amount
Length:397924 Length:397924 Length:397924 Min. : 1.00 Min. :2010-12-01 Min. : 0.000 Min. :12346 Length:397924 Min. : 0.00
Class :character Class :character Class :character 1st Qu.: 2.00 1st Qu.:2011-04-07 1st Qu.: 1.250 1st Qu.:13969 Class :character 1st Qu.: 4.68
Mode :character Mode :character Mode :character Median : 6.00 Median :2011-07-31 Median : 1.950 Median :15159 Mode :character Median : 11.80
Mean : 13.02 Mean :2011-07-10 Mean : 3.116 Mean :15294 Mean : 22.39
3rd Qu.: 12.00 3rd Qu.:2011-10-20 3rd Qu.: 3.750 3rd Qu.:16795 3rd Qu.: 19.80
Max. :80995.00 Max. :2011-12-09 Max. :8142.750 Max. :18287 Max. :168469.60
Day Month Year
Length:397924 Length:397924 Length:397924
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
library(ggplot2)
package 㤼㸱ggplot2㤼㸲 was built under R version 3.6.1
ggplot(data, aes(y= Amount , fill= Amount))+ geom_boxplot()
data <- data[data$Amount<20000,]
data is ready to be analyzed. we will check for top 10 Products sold by Month-Year and Year. Top 5 Selling products by sales revenue in 2010 and 2011
Top_2010 <- aggregate(Amount~Description+Year,data,sum)
Top_2011 <- subset(Top_2010, Year == "2011")
Top_2010 <- subset(Top_2010, Year =="2010")
Top_2010 <- head(Top_2010[with(Top_2010,order(-Amount)),])
Top_2011 <- head(Top_2011[with(Top_2011,order(-Amount)),])
Top_2010$Description <- factor(Top_2010$Description ,levels = Top_2010$Description[order(Top_2010$Amount)])
Top_2011$Description <- factor(Top_2011$Description ,levels = Top_2011$Description[order(Top_2010$Amount)])
ggplot(Top_2010,aes(x = Description, y = Amount )) +
geom_bar(stat = "identity", fill ="darkred") + coord_flip()
ggplot(Top_2011,aes(x=Description,y=Amount)) +
geom_bar(stat = "identity", fill = "darkblue") + coord_flip()
How did these Top 5 Selling Products perfrom monthly in 2010 and 2011
Top_monthly_sold_2011 <- subset(data,Description %in% c(as.vector(Top_2011$Description)),
select = c(InvoiceNo,Description,Quantity,InvoiceDate,CustomerID,Amount,Month))
Top_monthly_sold_2010 <- subset(data,Description %in% c(as.vector(Top_2010$Description)),
select = c(InvoiceNo,Description,Quantity,InvoiceDate,CustomerID,Amount,Month))
ggplot(Top_monthly_sold_2010, aes(x=Month, y= Amount )) + facet_wrap(~Description, ncol=2) +
geom_bar(stat="identity", fill = "lightblue") + labs(title = "Sales by month", x = "Month", y = "Sales Revenue 2010")
ggplot(Top_monthly_sold_2011, aes(x=Month, y= Amount )) + facet_wrap(~Description, ncol=2) +
geom_bar(stat="identity",fill = "lightblue") + labs(title = "Sales by month", x = "Month", y = "Sales Revenue 2011")
Top_qnty_2010 <- aggregate(Quantity~Description+Year,data,sum)
Top_qnty_2011 <- subset(Top_qnty_2010,Year=="2011")
Top_qnty_2010 <- subset(Top_qnty_2010,Year=="2010")
Top_qnty_2010 <- head(Top_qnty_2010[with(Top_qnty_2010,order(-Quantity)),])
Top_qnty_2011 <- head(Top_qnty_2011[with(Top_qnty_2011,order(-Quantity)),])
Top_qnty_2010$Description <- factor(Top_qnty_2010$Description,levels=Top_qnty_2010$Description[order(Top_qnty_2010$Quantity)])
Top_qnty_2011$Description <- factor(Top_qnty_2011$Description,levels=Top_qnty_2011$Description[order(Top_qnty_2011$Quantity)])
ggplot(Top_qnty_2010,aes(x=Description,y=Quantity)) + geom_bar(stat = "identity") + coord_flip()
ggplot(Top_qnty_2011,aes(x=Description,y=Quantity)) + geom_bar(stat = "identity") + coord_flip()
Top_monthly_qnty_2011 <- subset(data,Description %in% c(as.vector(Top_qnty_2011$Description)), select = c(InvoiceNo,Description,Quantity,InvoiceDate,CustomerID,Amount,Month))
Top_monthly_qnty_2010 <- subset(data,Description %in% c(as.vector(Top_qnty_2010$Description)), select = c( InvoiceNo,Description,Quantity,InvoiceDate,CustomerID,Amount,Month))
ggplot(Top_monthly_qnty_2011, aes(x=Month, y= Quantity )) + facet_wrap(~Description, ncol=2) +
geom_bar(stat="identity") +
labs(title = "Sales by month", x = "Month", y = "Quantity")
ggplot(Top_monthly_qnty_2010, aes(x=Month, y= Quantity )) + facet_wrap(~Description, ncol=2) +
geom_bar(stat="identity") +
labs(title = "Sales by month", x = "Month", y = "Quantity")
CountryWiseAggregation2010 <- aggregate(Amount~Country+Year,data,sum)
CountryWiseAggregation2011 <- subset(CountryWiseAggregation2010, Year=="2011")
CountryWiseAggregation2010 <- subset(CountryWiseAggregation2010,Year=="2010")
CountryWiseAggregation2010 <- CountryWiseAggregation2010[with(CountryWiseAggregation2010,order(-Amount)),]
CountryWiseAggregation2011 <- CountryWiseAggregation2011[with(CountryWiseAggregation2011,order(-Amount)),]
CountryWiseAggregation2010$Country <- factor(CountryWiseAggregation2010$Country,levels=CountryWiseAggregation2010$Country[order(CountryWiseAggregation2010$Amount)])
CountryWiseAggregation2011$Country <- factor(CountryWiseAggregation2011$Country,levels=CountryWiseAggregation2011$Country[order(CountryWiseAggregation2011$Amount)])
ggplot(CountryWiseAggregation2010,aes(x=Country,y=Amount)) + geom_bar(stat = "identity") + coord_flip() + labs(title="2010")
ggplot(CountryWiseAggregation2011,aes(x=Country,y=Amount)) + geom_bar(stat = "identity") + coord_flip() + labs(title="2011")
CountryProducts2010 <- subset(data,Country %in% c(as.vector(head(CountryWiseAggregation2010$Country))) & Year=="2010" , select = c(InvoiceNo,Description,Quantity,InvoiceDate,CustomerID,Country,Amount,Month,Year))
CountryProducts2011 <- subset(data,Country %in% c(as.vector(head(CountryWiseAggregation2011$Country))) & Year=="2011", select = c(InvoiceNo,Description,Quantity,InvoiceDate,CustomerID,Country,Amount,Month,Year))
TopProdCountryAgg2010 <- aggregate(Quantity~Country+Description+Year,CountryProducts2010,sum)
TopProdCountryAgg2011 <- aggregate(Quantity~Country+Description+Year,CountryProducts2011,sum)
TopProdCountryAgg2010 <- TopProdCountryAgg2010[with(TopProdCountryAgg2010,order(Country,-Quantity)),]
TopProdCountryAgg2011 <- TopProdCountryAgg2011[with(TopProdCountryAgg2011,order(Country,-Quantity)),]
TopProdTopCountry2010 <- TopProdCountryAgg2010[!duplicated(TopProdCountryAgg2010$Country),]
TopProdTopCountry2011 <- TopProdCountryAgg2011[!duplicated(TopProdCountryAgg2011$Country),]
print(TopProdTopCountry2010)
print(TopProdTopCountry2011)
Here I will be preparing the data for the Recency, Frequency and Monetory analysis for Customer classification.There is nothing much to prepare other than formating dates for the analysis. We will need the following columns for the analysis InvoiceNo, Quantity, InvoiceDate, UnitPrice, CustomerId, Country, and Amount. Here I have included cancelled transactions as they can effect how the customer is classified.
dataRFM <- data[,-c(2,3)]
dataRFM$Amount <- dataRFM$Quantity*dataRFM$UnitPrice
dataRFM <- na.omit(dataRFM)
dataRFM<-aggregate(Amount~InvoiceNo+InvoiceDate+CustomerID,dataRFM,sum)
#install.packages("didrooRFM")
library(didrooRFM)
package 㤼㸱didrooRFM㤼㸲 was built under R version 3.6.1
dataRFM <- dataRFM[,c(1,3,2,4)]
RFM <- findRFM(dataRFM,recencyWeight = 5,frequencyWeight = 5,monetoryWeight = 5)
CustClass <- table(RFM$FinalCustomerClass)
CustClass
Class-1 Class-2 Class-3 Class-4 Class-5
790 1395 1249 805 99
barplot(CustClass)
We can see that most of the customers belong to class 2 and 3. Customers belonging to Class 5 and 4 are our most valuebale Customers. Customer Class by Country
dataRFMCustomer <- data[!duplicated(data$CustomerID),c(7,8)]
RFMCustContry <- merge(RFM,dataRFMCustomer,by="CustomerID")
CustclassAndCntry <- table(RFMCustContry$Country,RFMCustContry$FinalCustomerClass)
print(CustclassAndCntry)
Class-1 Class-2 Class-3 Class-4 Class-5
Australia 1 3 2 3 0
Austria 1 4 3 1 0
Bahrain 1 1 0 0 0
Belgium 3 4 9 7 1
Brazil 0 1 0 0 0
Canada 2 1 1 0 0
Channel Islands 0 4 3 2 0
Cyprus 1 1 3 2 0
Czech Republic 0 0 1 0 0
Denmark 0 3 4 1 0
EIRE 0 0 1 0 2
European Community 0 0 1 0 0
Finland 0 4 4 3 1
France 14 21 23 24 5
Germany 6 33 22 29 4
Greece 0 2 2 0 0
Iceland 0 0 0 0 1
Israel 0 2 1 0 0
Italy 0 7 5 2 0
Japan 1 4 1 2 0
Lebanon 0 1 0 0 0
Lithuania 0 0 1 0 0
Malta 0 1 0 1 0
Netherlands 3 1 4 0 1
Norway 1 3 2 3 1
Poland 1 2 2 1 0
Portugal 2 6 5 5 1
RSA 0 0 1 0 0
Saudi Arabia 1 0 0 0 0
Singapore 0 0 0 1 0
Spain 4 8 11 5 0
Sweden 1 2 3 1 1
Switzerland 0 9 8 3 0
United Arab Emirates 0 1 1 0 0
United Kingdom 747 1262 1122 708 81
Unspecified 0 3 1 0 0
USA 0 1 2 1 0