Email: addepalligowri@gmail.com College: VESIT(2012-2016)
This is the Capstone project of Data Analytics with managerial applications under professor Dr. Sameer Mathur which aims to do data analysis of customer purchase patterns through an Ecommerce dataset containing the records of the transactions.
The dataset can be found at: https://www.kaggle.com/carrie1/ecommerce-data/data
The dataset has the following columns with their description: 1. InvoiceNo - The invoice number of each product when the product was purchased. 2. StockCode - The unique code of the product. 3. Description - The product description. 4. Quantity - The quantity of the product purchased. 5. InvoiceDate - The date on which the product was purchased. 6. UnitPrice - The unit price of the product. 7. CustomerID - The unique customer ID of customer buying the product. 8. Country - The country where the purchase has been made.
This will attempt to do an analysis by understanding and cleaning the data to later cater to our needs of doing an indepth analysis and experimentation to draw out valuable insights relevant to the Ecommerce industry and actionable insights for a manager.
# Day 1:
# Task 1a: Initial Analysis of Marketing Analysis of customer purchase patterns in an Ecommerce Industry.
# Ques 1: Read your dataset in R and visualize the length and breadth of your dataset.
setwd("C:/Users/GOWRI/Desktop/iim_internship/Capstone_project")
Ecommdata <- read.csv(file="data.csv",head=TRUE,sep=",")
#View(Ecommdata)
head(Ecommdata)
## InvoiceNo StockCode Description Quantity
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
## 2 536365 71053 WHITE METAL LANTERN 6
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2
## InvoiceDate UnitPrice CustomerID Country
## 1 12/1/2010 8:26 2.55 17850 United Kingdom
## 2 12/1/2010 8:26 3.39 17850 United Kingdom
## 3 12/1/2010 8:26 2.75 17850 United Kingdom
## 4 12/1/2010 8:26 3.39 17850 United Kingdom
## 5 12/1/2010 8:26 3.39 17850 United Kingdom
## 6 12/1/2010 8:26 7.65 17850 United Kingdom
nrow(Ecommdata)
## [1] 541909
# The number of rows is 541909.
ncol(Ecommdata)
## [1] 8
# The number of columns is 8.
# Cleaning the data and removing null/missing/ negative values and creating subset and converting to factors for variables like InvoiceNo,StockCode, InvoiceDate, CustomerID, Country.
EcommDataClean <- Ecommdata[which(Ecommdata$Quantity>=0 & !is.na(Ecommdata$Quantity) & Ecommdata$UnitPrice >=0 & !is.na(Ecommdata$UnitPrice) & !is.na(Ecommdata$CustomerID)),]
EcommDataClean$InvoiceNo<- factor(EcommDataClean$InvoiceNo)
EcommDataClean$StockCode<- factor(EcommDataClean$StockCode)
EcommDataClean$CustomerID<- factor(EcommDataClean$CustomerID)
EcommDataClean$Country<- factor(EcommDataClean$Country)
EcommDataClean$InvoiceDate<- as.Date(EcommDataClean$InvoiceDate, '%m/%d/%Y %H:%M')
# Adding a new column of total purchase price i.e( Unit price * Quantity)
EcommDataClean$TotalPrice <- EcommDataClean$UnitPrice * EcommDataClean$Quantity
#Adding three columns Day, month and year by splitting the invoice date.
EcommDataClean$DateYear <- as.numeric(format(EcommDataClean$InvoiceDate, format = "%Y"))
EcommDataClean$DateMonth <- as.numeric(format(EcommDataClean$InvoiceDate, format = "%m"))
EcommDataClean$DateDay <- as.numeric(format(EcommDataClean$InvoiceDate, format = "%d"))
# Adding additional Quarter Column to specify financial quarter of purchase.
EcommDataClean$Quarter[EcommDataClean$DateMonth < 7] <- 1
EcommDataClean$Quarter[EcommDataClean$DateMonth > 6] <- 2
# Converting them to factors.
EcommDataClean$DateYear<- factor(EcommDataClean$DateYear)
EcommDataClean$DateMonth<- factor(EcommDataClean$DateMonth)
EcommDataClean$DateDay<- factor(EcommDataClean$DateDay)
EcommDataClean$Quarter<- factor(EcommDataClean$Quarter)
#View(EcommDataClean)
nrow(EcommDataClean)
## [1] 397924
# The number of useful rows (Ecommerce purchases) is 397924.
ncol(EcommDataClean)
## [1] 13
# The number of useful columns is 13.
# Ques 2: Create a descriptive statistics (min, max, median etc) of each variable.
# Summary of the original dataset.
summary(Ecommdata)
## InvoiceNo StockCode
## 573585 : 1114 85123A : 2313
## 581219 : 749 22423 : 2203
## 581492 : 731 85099B : 2159
## 580729 : 721 47566 : 1727
## 558475 : 705 20725 : 1639
## 579777 : 687 84879 : 1502
## (Other):537202 (Other):530366
## Description Quantity
## WHITE HANGING HEART T-LIGHT HOLDER: 2369 Min. :-80995.00
## REGENCY CAKESTAND 3 TIER : 2200 1st Qu.: 1.00
## JUMBO BAG RED RETROSPOT : 2159 Median : 3.00
## PARTY BUNTING : 1727 Mean : 9.55
## LUNCH BAG RED RETROSPOT : 1638 3rd Qu.: 10.00
## ASSORTED COLOUR BIRD ORNAMENT : 1501 Max. : 80995.00
## (Other) :530315
## InvoiceDate UnitPrice CustomerID
## 10/31/2011 14:41: 1114 Min. :-11062.06 Min. :12346
## 12/8/2011 9:28 : 749 1st Qu.: 1.25 1st Qu.:13953
## 12/9/2011 10:03 : 731 Median : 2.08 Median :15152
## 12/5/2011 17:24 : 721 Mean : 4.61 Mean :15288
## 6/29/2011 15:58 : 705 3rd Qu.: 4.13 3rd Qu.:16791
## 11/30/2011 15:13: 687 Max. : 38970.00 Max. :18287
## (Other) :537202 NA's :135080
## Country
## United Kingdom:495478
## Germany : 9495
## France : 8557
## EIRE : 8196
## Spain : 2533
## Netherlands : 2371
## (Other) : 15279
# Summary of the cleaned dataset.
summary(EcommDataClean)
## InvoiceNo StockCode
## 576339 : 542 85123A : 2035
## 579196 : 533 22423 : 1724
## 580727 : 529 85099B : 1618
## 578270 : 442 84879 : 1408
## 573576 : 435 47566 : 1397
## 567656 : 421 20725 : 1317
## (Other):395022 (Other):388425
## Description Quantity
## WHITE HANGING HEART T-LIGHT HOLDER: 2028 Min. : 1.00
## REGENCY CAKESTAND 3 TIER : 1724 1st Qu.: 2.00
## JUMBO BAG RED RETROSPOT : 1618 Median : 6.00
## ASSORTED COLOUR BIRD ORNAMENT : 1408 Mean : 13.02
## PARTY BUNTING : 1397 3rd Qu.: 12.00
## LUNCH BAG RED RETROSPOT : 1316 Max. :80995.00
## (Other) :388433
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 Min. : 0.000 17841 : 7847
## 1st Qu.:2011-04-07 1st Qu.: 1.250 14911 : 5677
## Median :2011-07-31 Median : 1.950 14096 : 5111
## Mean :2011-07-10 Mean : 3.116 12748 : 4596
## 3rd Qu.:2011-10-20 3rd Qu.: 3.750 14606 : 2700
## Max. :2011-12-09 Max. :8142.750 15311 : 2379
## (Other):369614
## Country TotalPrice DateYear
## United Kingdom:354345 Min. : 0.00 2010: 26160
## Germany : 9042 1st Qu.: 4.68 2011:371764
## France : 8342 Median : 11.80
## EIRE : 7238 Mean : 22.39
## Spain : 2485 3rd Qu.: 19.80
## Netherlands : 2363 Max. :168469.60
## (Other) : 14109
## DateMonth DateDay Quarter
## 11 : 64545 6 : 18349 1:146488
## 10 : 49557 5 : 16410 2:251436
## 12 : 43464 8 : 15854
## 9 : 40030 7 : 15604
## 5 : 28322 17 : 14913
## 6 : 27185 4 : 14882
## (Other):144821 (Other):301912
# Ques 3: Create one-way contingency tables for the categorical variables in your dataset.
# The categorical variables in the dataset are Description, Country, InvoiceNo, StockCode, CustomerID.
# One-way contingency tables for Description.
CountDescription <- table(EcommDataClean$Description)
head(CountDescription)
##
## 4 PURPLE FLOCK DINNER CANDLES
## 0 39
## 50'S CHRISTMAS GIFT BAG LARGE DOLLY GIRL BEAKER
## 109 138
## I LOVE LONDON MINI BACKPACK I LOVE LONDON MINI RUCKSACK
## 70 1
# One-way contingency tables for Country.
CountCountry <- table(EcommDataClean$Country)
head(CountCountry)
##
## Australia Austria Bahrain Belgium Brazil Canada
## 1185 398 17 2031 32 151
# One-way contingency tables for InvoiceNo.
CountInvoice <- table(EcommDataClean$InvoiceNo)
head(CountInvoice)
##
## 536365 536366 536367 536368 536369 536370
## 7 2 12 4 1 20
# One-way contingency tables for StockCode.
CountStockCode <- table(EcommDataClean$StockCode)
head(CountStockCode)
##
## 10002 10080 10120 10123C 10124A 10124G
## 49 21 30 3 5 4
# One-way contingency tables for CustomerID.
CountCustomerID <- table(EcommDataClean$CustomerID)
head(CountCustomerID)
##
## 12346 12347 12348 12349 12350 12352
## 1 182 31 73 17 85
# One-way contingency tables for DateYear.
CountDateYear <- table(EcommDataClean$DateYear)
head(CountDateYear)
##
## 2010 2011
## 26160 371764
# One-way contingency tables for DateMonth.
CountDateMonth <- table(EcommDataClean$DateMonth)
head(CountDateMonth)
##
## 1 2 3 4 5 6
## 21232 19928 27177 22644 28322 27185
# One-way contingency tables for DateDay.
CountDateDay <- table(EcommDataClean$DateDay)
head(CountDateDay)
##
## 1 2 3 4 5 6
## 13630 12101 10878 14882 16410 18349
# One-way contingency tables for Quarter.
CountQuarter <- table(EcommDataClean$Quarter)
head(CountQuarter)
##
## 1 2
## 146488 251436
# Ques 4: Create two-way contingency tables for the categorical variables in your dataset.
# Two-way contingency tables for CustomerID and Country.
CountCountryCustomerID <- table(EcommDataClean$CustomerID, EcommDataClean$Country)
head(prop.table(CountCountryCustomerID))
##
## Australia Austria Bahrain Belgium Brazil Canada Channel Islands
## 12346 0 0 0 0 0 0 0
## 12347 0 0 0 0 0 0 0
## 12348 0 0 0 0 0 0 0
## 12349 0 0 0 0 0 0 0
## 12350 0 0 0 0 0 0 0
## 12352 0 0 0 0 0 0 0
##
## Cyprus Czech Republic Denmark EIRE European Community Finland
## 12346 0 0 0 0 0 0.000000e+00
## 12347 0 0 0 0 0 0.000000e+00
## 12348 0 0 0 0 0 7.790432e-05
## 12349 0 0 0 0 0 0.000000e+00
## 12350 0 0 0 0 0 0.000000e+00
## 12352 0 0 0 0 0 0.000000e+00
##
## France Germany Greece Iceland Israel Italy Japan
## 12346 0 0 0 0.0000000000 0 0.0000000000 0
## 12347 0 0 0 0.0004573738 0 0.0000000000 0
## 12348 0 0 0 0.0000000000 0 0.0000000000 0
## 12349 0 0 0 0.0000000000 0 0.0001834521 0
## 12350 0 0 0 0.0000000000 0 0.0000000000 0
## 12352 0 0 0 0.0000000000 0 0.0000000000 0
##
## Lebanon Lithuania Malta Netherlands Norway Poland Portugal
## 12346 0 0 0 0 0.000000e+00 0 0
## 12347 0 0 0 0 0.000000e+00 0 0
## 12348 0 0 0 0 0.000000e+00 0 0
## 12349 0 0 0 0 0.000000e+00 0 0
## 12350 0 0 0 0 4.272173e-05 0 0
## 12352 0 0 0 0 2.136086e-04 0 0
##
## RSA Saudi Arabia Singapore Spain Sweden Switzerland
## 12346 0 0 0 0 0 0
## 12347 0 0 0 0 0 0
## 12348 0 0 0 0 0 0
## 12349 0 0 0 0 0 0
## 12350 0 0 0 0 0 0
## 12352 0 0 0 0 0 0
##
## United Arab Emirates United Kingdom Unspecified USA
## 12346 0 2.513043e-06 0 0
## 12347 0 0.000000e+00 0 0
## 12348 0 0.000000e+00 0 0
## 12349 0 0.000000e+00 0 0
## 12350 0 0.000000e+00 0 0
## 12352 0 0.000000e+00 0 0
# Two-way contingency tables for StockCode and Country.
CountCountryStockCode <- table(EcommDataClean$StockCode, EcommDataClean$Country)
head(prop.table(CountCountryStockCode))
##
## Australia Austria Bahrain Belgium Brazil Canada Channel Islands
## 10002 0 0 0 0 0 0 0
## 10080 0 0 0 0 0 0 0
## 10120 0 0 0 0 0 0 0
## 10123C 0 0 0 0 0 0 0
## 10124A 0 0 0 0 0 0 0
## 10124G 0 0 0 0 0 0 0
##
## Cyprus Czech Republic Denmark EIRE European Community
## 10002 0 0 0 2.513043e-06 0
## 10080 0 0 0 0.000000e+00 0
## 10120 0 0 0 0.000000e+00 0
## 10123C 0 0 0 0.000000e+00 0
## 10124A 0 0 0 0.000000e+00 0
## 10124G 0 0 0 0.000000e+00 0
##
## Finland France Germany Greece Iceland Israel Italy
## 10002 0 2.010434e-05 2.513043e-06 0 0 0 0
## 10080 0 0.000000e+00 0.000000e+00 0 0 0 0
## 10120 0 2.513043e-06 0.000000e+00 0 0 0 0
## 10123C 0 0.000000e+00 0.000000e+00 0 0 0 0
## 10124A 0 0.000000e+00 0.000000e+00 0 0 0 0
## 10124G 0 0.000000e+00 0.000000e+00 0 0 0 0
##
## Japan Lebanon Lithuania Malta Netherlands Norway Poland
## 10002 2.513043e-06 0 0 0 0 0 0
## 10080 0.000000e+00 0 0 0 0 0 0
## 10120 0.000000e+00 0 0 0 0 0 0
## 10123C 0.000000e+00 0 0 0 0 0 0
## 10124A 0.000000e+00 0 0 0 0 0 0
## 10124G 0.000000e+00 0 0 0 0 0 0
##
## Portugal RSA Saudi Arabia Singapore Spain Sweden
## 10002 0 0 0 0 2.513043e-06 0
## 10080 0 0 0 0 0.000000e+00 0
## 10120 0 0 0 0 0.000000e+00 0
## 10123C 0 0 0 0 0.000000e+00 0
## 10124A 0 0 0 0 0.000000e+00 0
## 10124G 0 0 0 0 0.000000e+00 0
##
## Switzerland United Arab Emirates United Kingdom Unspecified USA
## 10002 2.513043e-06 0 9.046954e-05 0 0
## 10080 0.000000e+00 0 5.277390e-05 0 0
## 10120 0.000000e+00 0 7.287824e-05 0 0
## 10123C 0.000000e+00 0 7.539128e-06 0 0
## 10124A 0.000000e+00 0 1.256521e-05 0 0
## 10124G 0.000000e+00 0 1.005217e-05 0 0
# Two-way contingency tables for Description and Country.
#CountCountryDescription <- table(EcommDataClean$Description, EcommDataClean$Country)
#prop.table(CountCountryDescription)
# Two-way contingency tables for InvoiceNo and Country.
#CountCountryInvoiceNo <- table(EcommDataClean$InvoiceNo, EcommDataClean$Country)
#prop.table(CountCountryInvoiceNo)
# Ques 5: Draw a boxplot of the variables that belong to your study.
library(lattice)
# Effect of Total Price on Country.
boxplot(EcommDataClean$TotalPrice ~EcommDataClean$Country ,col = "Brown",
xlab = "Country", ylab ="Total price", main = " Effect of Total Price on Country.")
# Effect of unit price on Country.
boxplot(EcommDataClean$UnitPrice ~EcommDataClean$Country ,col = "Cyan",
xlab = "Country", ylab ="Unit price", main = "Effect of unit price on Country.")
# Effect of Quantity on Country.
boxplot(EcommDataClean$Quantity ~EcommDataClean$Country ,col = "yellow",
xlab = "Country", ylab ="Quantity", main = "Effect of Quantity on Country.")
# Effect of Quantity on Month.
boxplot(EcommDataClean$Quantity ~EcommDataClean$DateMonth ,col = "grey",
xlab = "Month", ylab ="Quantity", main = "Effect of Quantity on Month.")
# Effect of Month on Total Price.
boxplot(EcommDataClean$TotalPrice ~EcommDataClean$DateMonth ,col = "yellow",
xlab = "Month", ylab ="Total Price", main = "Effect of Month on Total Price.")
# Effect of Year on Total Price.
boxplot(EcommDataClean$TotalPrice ~EcommDataClean$DateYear ,col = "Red",
xlab = "year", ylab ="Total Price", main = "Effect of Year on Total Price.")
# Effect of Quantity on year.
boxplot(EcommDataClean$Quantity ~EcommDataClean$DateYear ,col = "grey",
xlab = "year", ylab ="Quantity", main = "Effect of Quantity on year.")
# Effect of Quantity on Financial Quarter.
boxplot(EcommDataClean$Quantity ~EcommDataClean$Quarter ,col = "Pink",
xlab = "Quarter", ylab ="Quantity", main = "Effect of Quantity on Financial Quarter.")
# Effect of Total Price on Financial Quarter.
boxplot(EcommDataClean$TotalPrice ~EcommDataClean$Quarter ,col = "Pink",
xlab = "Quarter", ylab ="Total Price", main = "Effect of Total Price on Financial Quarter.")
# Ques 6: Draw Histograms for your suitable data fields.
# Histogram of Unit Price.
hist(EcommDataClean$UnitPrice, col ="pink")
# Histogram of Quantity.
hist(EcommDataClean$Quantity, col ="brown")
# Histogram of Total Price.
hist(EcommDataClean$TotalPrice, col ="green")
# Ques 7: Draw suitable plot for your data fields.
# Distribution of products according to country in log scale.
CountCountry <- table(EcommDataClean$Country )
barplot(CountCountry, main="Distribution of products according to country in log scale.",log='y',
names.arg=rownames(CountCountry), cex.names=0.8, col="Red")
# Distribution of products according to Month in log scale.
CountDateMonth <- table(EcommDataClean$DateMonth )
barplot(CountDateMonth, main="Distribution of products according to Month in log scale.",log='y',
names.arg=rownames(CountDateMonth), cex.names=0.8, col="Orange")
# Distribution of products according to Year in log scale.
CountDateYear <- table(EcommDataClean$DateYear )
barplot(CountDateYear, main="Distribution of products according to Year in log scale.",log='y',
names.arg=rownames(CountDateYear), cex.names=0.8, col="Blue")
# Distribution of products according to Financial Quarter in log scale.
CountQuarter <- table(EcommDataClean$Quarter )
barplot(CountQuarter, main="Distribution of products according to Financial Quarter in log scale.",log='y', names.arg=rownames(CountQuarter), cex.names=0.8, col="Orange")
# Distribution of products according to Date in log scale.
CountDateDay <- table(EcommDataClean$DateDay )
barplot(CountDateDay, main="Distribution of products according to Date in log scale.",log='y',
names.arg=rownames(CountDateDay), cex.names=0.8, col="Cyan")
# Distribution of products according to StockCode in log scale.
CountStockCode <- table(EcommDataClean$StockCode )
barplot(CountStockCode, main="Distribution of products according to StockCode in log scale.",log='y',names.arg=rownames(CountStockCode), cex.names=0.4, col="Blue")
# Distribution of products according to Customer ID in log scale.
CountCustomerID <- table(EcommDataClean$CustomerID )
barplot(CountCustomerID, main="Distribution of products according to Customer ID in log scale.",log='y', names.arg=rownames(CountCustomerID), cex.names=0.4, col="Green")
library(car)
# Plot of log of Total Price vs. log of Quantity.
plot(EcommDataClean$TotalPrice~EcommDataClean$Quantity,spread=FALSE, smoother.args=list(lty=2), pch=19, main="plot of log of Total Price vs. log of Quantity ", log="xy", col = "Orange",xlab="Quantity", ylab="Total Price")
# Plot of log of Total Price vs. log of Unit Price.
plot(EcommDataClean$TotalPrice~EcommDataClean$UnitPrice,spread=FALSE, smoother.args=list(lty=2), pch=19, main ="plot of log of Total Price vs. log of Unit Price ", log ="xy", col = "Pink", xlab ="Unit Price", ylab ="Total Price")
# Ques 8: Create a correlation matrix.
# Create a Variance-Covariance Matrix.
# creating a subset of numeric values.
Ecomm <- EcommDataClean[,c(4,6,9)]
covEcomm <-cov(Ecomm)
head(covEcomm)
## Quantity UnitPrice TotalPrice
## Quantity 32551.45215 -18.18357 50675.227
## UnitPrice -18.18357 488.26804 557.572
## TotalPrice 50675.22664 557.57198 95515.357
#Transform covariance to correlation matrix
covCorrEcomm <- cov2cor(covEcomm)
head(covCorrEcomm)
## Quantity UnitPrice TotalPrice
## Quantity 1.00000000 -0.00456105 0.90881171
## UnitPrice -0.00456105 1.00000000 0.08164605
## TotalPrice 0.90881171 0.08164605 1.00000000
# Ques 9: Visualize your correlation matrix using corrgram.
library("corrgram")
# Corrgram of Ecommerce variables.
corrgram(EcommDataClean, order=FALSE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Corrgram of Ecommerce variables.")
# Ques 10: Create a scatter plot matrix for your data set.
library(car)
# As the dataset is large, taking a sample of it.
EcommSample <- EcommDataClean[sample(nrow(EcommDataClean), 3000), ]
# Effect of Quantity and Unit price on Total pricing.
scatterplotMatrix(formula = ~ EcommSample$TotalPrice + EcommSample$Quantity + EcommSample$UnitPrice, cex=0.6,
data=EcommSample, main = "Effect of Quantity and Unit price on Total pricing." )
# Scatterplot Matrix of the Ecommerce transactions.
pairs(~InvoiceNo + StockCode + CustomerID + Country+Quantity + UnitPrice + TotalPrice +DateDay + DateYear + DateMonth + Quarter, data= EcommSample, main=" Scatterplot Matrix of the Ecommerce transactions.")
# Ques 11: Run a suitable test to check your hypothesis for your suitable assumptions.
# Ques 12: Run a t-test to analyse your hypothesis.
# Null Hypothesis- "The total price does not depend on Quarter variable.
# T-Test to check correlation between TotalPrice and Quarter
t.test(EcommDataClean$TotalPrice ~ EcommDataClean$Quarter)
##
## Welch Two Sample t-test
##
## data: EcommDataClean$TotalPrice by EcommDataClean$Quarter
## t = 1.6395, df = 386090, p-value = 0.1011
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2967951 3.3332664
## sample estimates:
## mean in group 1 mean in group 2
## 23.35408 21.83584
# As p-value=0.1011(>0.05).We can accept the null hypothesis for Quarter column. Therefore There is no strong correlation between Total price and Financial Quarter.
# Day 2:
# Task 1- Formulate a Regression Model. Fit Linear Regression Models using lm(). Articulate a Hypothesis (or two) that you could test using a Regression Model.Run T-Tests appropriate, to test your Hypotheses. Prepare a list of insights based on your Regression Analysis.
# The linear regression model we are trying to make has total_price as the dependent variable and the rest as indeoendent variables.
# Finding the correlation between the independent variables and the numeric dependent variables.
# correlation between Total Price and Quantity.
cor(EcommDataClean$TotalPrice, EcommDataClean$Quantity)
## [1] 0.9088117
# [1] 0.9088117
# correlation between Total Price and Unit Price.
cor(EcommDataClean$TotalPrice, EcommDataClean$UnitPrice)
## [1] 0.08164605
# [1] 0.08164605
# Create Training and Test data -
set.seed(100) # setting seed to reproduce results of random sampling
# Row indices for training data. Splitting the data into 80:20 ratio.
trainingRowIndex <- sample(1:nrow(EcommDataClean), 0.8*nrow(EcommDataClean))
# Model training data.
trainingData <- EcommDataClean[trainingRowIndex, ]
# Test data.
testData <- EcommDataClean[-trainingRowIndex, ]
# Developing a model on the training data of the Ecommerce data.
EcommModel <- lm(TotalPrice ~
+Quantity
+InvoiceDate
+UnitPrice
+Quarter, data=trainingData)
# View the Coeffecients.
EcommModel
##
## Call:
## lm(formula = TotalPrice ~ +Quantity + InvoiceDate + UnitPrice +
## Quarter, data = trainingData)
##
## Coefficients:
## (Intercept) Quantity InvoiceDate UnitPrice Quarter2
## -32.165212 1.562834 0.002004 1.202539 0.056894
# Hence, the model is => Total_Price = 1.562834*Quantity + 0.002004* InvoiceDate + 1.202539* UnitPrice + 0.056894*Quarter2 -32.165212
# Summary of the model.
summary(EcommModel)
##
## Call:
## lm(formula = TotalPrice ~ +Quantity + InvoiceDate + UnitPrice +
## Quarter, data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38801 -2 0 5 41887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32.165212 42.984323 -0.748 0.454
## Quantity 1.562834 0.001221 1279.673 <2e-16 ***
## InvoiceDate 0.002004 0.002852 0.703 0.482
## UnitPrice 1.202539 0.010375 115.905 <2e-16 ***
## Quarter2 0.056894 0.666664 0.085 0.932
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 138.2 on 318334 degrees of freedom
## Multiple R-squared: 0.8383, Adjusted R-squared: 0.8383
## F-statistic: 4.125e+05 on 4 and 318334 DF, p-value: < 2.2e-16
# As, we can see through the P-value, only the variables Quantity and UnitPrice are significant that contribute to the linear regression model which can be tested through hypothesis.
# Predicting the model.
ecommTotalPricePred <- predict(EcommModel, testData)
# View the predictions.
head(ecommTotalPricePred)
## 2 6 11 13 18 27
## 11.291815 10.163295 9.740541 14.850397 9.681813 39.855735
# Diagnostics of prediction.
actuals_preds <- data.frame(cbind(actuals=testData$TotalPrice, predicteds=ecommTotalPricePred))
# make actuals_predicteds dataframe.
correlation_accuracy <- cor(actuals_preds) # 67.4%
head(actuals_preds)
## actuals predicteds
## 2 20.34 11.291815
## 6 15.30 10.163295
## 11 12.60 9.740541
## 13 30.00 14.850397
## 18 17.85 9.681813
## 27 90.00 39.855735
# Null Hypothesis- "The total price does not depend on the Financial Quarter variable".
# T-Test to check correlation between TotalPrice and Quarter.
t.test(trainingData$TotalPrice ~ trainingData$Quarter)
##
## Welch Two Sample t-test
##
## data: trainingData$TotalPrice by trainingData$Quarter
## t = 1.2799, df = 309830, p-value = 0.2006
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.7819126 3.7252551
## sample estimates:
## mean in group 1 mean in group 2
## 23.45549 21.98382
# As p-value=0.2006(>0.05).We can accept the null hypothesis for Quarter column.Therefore, there is no correlation between Total price and Quarter which can be seen from the linear model too.
# Customer and Product Insights.
# 1. Finding the total price and transactions of the purchases per customer.
OverallPricePerCustomer <- aggregate(EcommDataClean$TotalPrice, by=list(CustomerID=EcommDataClean$CustomerID), FUN=sum)
TotalTransactionsPerCustomer <- aggregate(EcommDataClean$TotalPrice, by=list(CustomerID=EcommDataClean$CustomerID), FUN=length)
colnames(OverallPricePerCustomer)[2] <- 'Total_price_per_customer'
colnames(TotalTransactionsPerCustomer)[2] <- 'Total_transactions_per_customer'
head(OverallPricePerCustomer)
## CustomerID Total_price_per_customer
## 1 12346 77183.60
## 2 12347 4310.00
## 3 12348 1797.24
## 4 12349 1757.55
## 5 12350 334.40
## 6 12352 2506.04
head(TotalTransactionsPerCustomer)
## CustomerID Total_transactions_per_customer
## 1 12346 1
## 2 12347 182
## 3 12348 31
## 4 12349 73
## 5 12350 17
## 6 12352 85
UniqueCustomerTransactionsDetails <- merge(OverallPricePerCustomer,TotalTransactionsPerCustomer,by="CustomerID")
head(UniqueCustomerTransactionsDetails)
## CustomerID Total_price_per_customer Total_transactions_per_customer
## 1 12346 77183.60 1
## 2 12347 4310.00 182
## 3 12348 1797.24 31
## 4 12349 1757.55 73
## 5 12350 334.40 17
## 6 12352 2506.04 85
summary(UniqueCustomerTransactionsDetails)
## CustomerID Total_price_per_customer Total_transactions_per_customer
## 12346 : 1 Min. : 0.0 Min. : 1.00
## 12347 : 1 1st Qu.: 307.2 1st Qu.: 17.00
## 12348 : 1 Median : 674.5 Median : 41.00
## 12349 : 1 Mean : 2053.8 Mean : 91.71
## 12350 : 1 3rd Qu.: 1661.6 3rd Qu.: 100.00
## 12352 : 1 Max. :280206.0 Max. :7847.00
## (Other):4333
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
describe(UniqueCustomerTransactionsDetails)
## vars n mean sd median trimmed
## CustomerID* 1 4339 2170.00 1252.71 2170.00 2170.00
## Total_price_per_customer 2 4339 2053.79 8988.25 674.45 978.95
## Total_transactions_per_customer 3 4339 91.71 228.79 41.00 57.88
## mad min max range skew kurtosis
## CustomerID* 1608.62 1 4339 4338 0.00 -1.20
## Total_price_per_customer 693.81 0 280206 280206 19.31 477.38
## Total_transactions_per_customer 44.48 1 7847 7846 18.09 482.64
## se
## CustomerID* 19.02
## Total_price_per_customer 136.45
## Total_transactions_per_customer 3.47
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library("tabplot")
## Loading required package: bit
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
##
## Attaching package: 'bit'
## The following object is masked from 'package:psych':
##
## keysort
## The following object is masked from 'package:base':
##
## xor
## Loading required package: ff
## Attaching package ff
## - getOption("fftempdir")=="C:/Users/GOWRI/AppData/Local/Temp/RtmpCWOxOa"
## - getOption("ffextension")=="ff"
## - getOption("ffdrop")==TRUE
## - getOption("fffinonexit")==TRUE
## - getOption("ffpagesize")==65536
## - getOption("ffcaching")=="mmnoflush" -- consider "ffeachflush" if your system stalls on large writes
## - getOption("ffbatchbytes")==84955627.52 -- consider a different value for tuning your system
## - getOption("ffmaxbytes")==4247781376 -- consider a different value for tuning your system
##
## Attaching package: 'ff'
## The following objects are masked from 'package:bit':
##
## clone, clone.default, clone.list
## The following objects are masked from 'package:utils':
##
## write.csv, write.csv2
## The following objects are masked from 'package:base':
##
## is.factor, is.ordered
## Loading required package: ffbase
##
## Attaching package: 'ffbase'
## The following objects are masked from 'package:ff':
##
## [.ff, [.ffdf, [<-.ff, [<-.ffdf
## The following objects are masked from 'package:base':
##
## %in%, table
# Visualization between the relationship of the columns.
tableplot(UniqueCustomerTransactionsDetails)
# Concentrating on the Top 0.5% price customer.
tableplot(UniqueCustomerTransactionsDetails, sortCol = Total_price_per_customer,
from = 0, to = 0.5)
# Concentrating on the Top 2% transaction customer.
tableplot(UniqueCustomerTransactionsDetails, sortCol = Total_transactions_per_customer,
from = 0, to = 2)
# 2. Finding the total price and transactions of the purchases per country of customers.
OverallPricePerCustomerCountry <- aggregate(EcommDataClean$TotalPrice, by=list(CustomerID=EcommDataClean$CustomerID, Country=EcommDataClean$Country), FUN=sum)
TotalTransactionsPerCustomerCountry <- aggregate(EcommDataClean$TotalPrice, by=list(CustomerID=EcommDataClean$CustomerID, Country=EcommDataClean$Country), FUN=length)
colnames(OverallPricePerCustomerCountry)[3] <- 'Total_price_per_customer_country'
colnames(TotalTransactionsPerCustomerCountry)[3] <- 'Total_transactions_per_customer_country'
head(OverallPricePerCustomerCountry)
## CustomerID Country Total_price_per_customer_country
## 1 12386 Australia 401.90
## 2 12388 Australia 2780.66
## 3 12393 Australia 1582.60
## 4 12415 Australia 124914.53
## 5 12422 Australia 386.20
## 6 12424 Australia 1760.96
head(TotalTransactionsPerCustomerCountry)
## CustomerID Country Total_transactions_per_customer_country
## 1 12386 Australia 10
## 2 12388 Australia 100
## 3 12393 Australia 64
## 4 12415 Australia 716
## 5 12422 Australia 21
## 6 12424 Australia 30
UniqueCustomerCountryTransactionsDetails <- merge(OverallPricePerCustomerCountry,TotalTransactionsPerCustomerCountry,by=c("CustomerID","Country"))
head(UniqueCustomerCountryTransactionsDetails)
## CustomerID Country Total_price_per_customer_country
## 1 12346 United Kingdom 77183.60
## 2 12347 Iceland 4310.00
## 3 12348 Finland 1797.24
## 4 12349 Italy 1757.55
## 5 12350 Norway 334.40
## 6 12352 Norway 2506.04
## Total_transactions_per_customer_country
## 1 1
## 2 182
## 3 31
## 4 73
## 5 17
## 6 85
summary(UniqueCustomerCountryTransactionsDetails)
## CustomerID Country Total_price_per_customer_country
## 12370 : 2 United Kingdom:3921 Min. : 0.0
## 12394 : 2 Germany : 94 1st Qu.: 307.4
## 12417 : 2 France : 87 Median : 672.0
## 12422 : 2 Spain : 30 Mean : 2050.0
## 12429 : 2 Belgium : 25 3rd Qu.: 1661.2
## 12431 : 2 Switzerland : 21 Max. :280206.0
## (Other):4335 (Other) : 169
## Total_transactions_per_customer_country
## Min. : 1.00
## 1st Qu.: 17.00
## Median : 41.00
## Mean : 91.54
## 3rd Qu.: 100.00
## Max. :7847.00
##
library(psych)
describe(UniqueCustomerCountryTransactionsDetails)
## vars n mean sd median
## CustomerID* 1 4347 2166.12 1254.81 2166.00
## Country* 2 4347 33.26 5.99 35.00
## Total_price_per_customer_country 3 4347 2050.01 8980.10 671.96
## Total_transactions_per_customer_country 4 4347 91.54 228.60 41.00
## trimmed mad min max range
## CustomerID* 2166.00 1611.59 1 4339 4338
## Country* 35.00 0.00 1 37 36
## Total_price_per_customer_country 976.48 690.52 0 280206 280206
## Total_transactions_per_customer_country 57.73 44.48 1 7847 7846
## skew kurtosis se
## CustomerID* 0.00 -1.20 19.03
## Country* -3.51 11.40 0.09
## Total_price_per_customer_country 19.33 478.27 136.20
## Total_transactions_per_customer_country 18.11 483.46 3.47
library(ggplot2)
library("tabplot")
# Visualization between the relationship of the columns.
tableplot(UniqueCustomerCountryTransactionsDetails)
# taking summary
summary(tableplot(UniqueCustomerCountryTransactionsDetails))
## general
## dataset :UniqueCustomerCountryTransactionsDetails
## variables :4
## sortCol :1
## decreasing :TRUE
## from :0%
## to :100%
## objects.sample :4347
## objects.full.data:4347
## bins :100
## variable1 variable2
## name :CustomerID name :Country
## type :categorical type :categorical
## categories:51 categories:38
##
##
##
##
##
##
## variable3
## name :Total_price_per_customer_country
## type :numeric
## scale_init :auto
## scale_final:log
##
##
##
##
##
## variable4
## name :Total_transactions_per_customer_country
## type :numeric
## scale_init :auto
## scale_final:log
##
##
##
##
##
# Concentrating on the Top 0.5% price customer per country.
tableplot(UniqueCustomerCountryTransactionsDetails, sortCol = Total_price_per_customer_country,
from = 0, to = 0.5)
# Concentrating on the Top 2% transaction customer per country.
tableplot(UniqueCustomerCountryTransactionsDetails, sortCol = Total_transactions_per_customer_country,
from = 0, to = 2)
# plotting on the world map.
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
n <- joinCountryData2Map(UniqueCustomerCountryTransactionsDetails, joinCode="NAME", nameJoinColumn="Country")
## 4329 codes from your data successfully matched countries in the map
## 18 codes from your data failed to match with a country code in the map
## 211 codes from the map weren't represented in your data
# Total price per customer in respective countries.
mapCountryData(n, nameColumnToPlot="Total_price_per_customer_country", mapTitle="Total price per customer in respective countries.")
# Total transactions per customer in respective countries.
mapCountryData(n, nameColumnToPlot="Total_transactions_per_customer_country", mapTitle="Total transactions per customer in respective countries.", colourPalette="terrain")
# Distribution of customer in respective countries.
mapCountryData(n, nameColumnToPlot="CustomerID", mapTitle="Distribution of customer in respective countries.", catMethod="categorical")
# 3. Finding the total price and transactions of the purchases per product(stock).
OverallPricePerStock <- aggregate(EcommDataClean$TotalPrice, by=list(StockCode=EcommDataClean$StockCode), FUN=sum)
TotalTransactionsPerStock <- aggregate(EcommDataClean$TotalPrice, by=list(StockCode=EcommDataClean$StockCode), FUN=length)
colnames(OverallPricePerStock)[2] <- 'Total_price_per_stock'
colnames(TotalTransactionsPerStock)[2] <- 'Total_transactions_per_stock'
head(OverallPricePerStock)
## StockCode Total_price_per_stock
## 1 10002 699.55
## 2 10080 114.41
## 3 10120 40.53
## 4 10123C 3.25
## 5 10124A 6.72
## 6 10124G 7.14
head(TotalTransactionsPerStock)
## StockCode Total_transactions_per_stock
## 1 10002 49
## 2 10080 21
## 3 10120 30
## 4 10123C 3
## 5 10124A 5
## 6 10124G 4
UniqueStockTransactionsDetails <- merge(OverallPricePerStock,TotalTransactionsPerStock,by="StockCode")
head(UniqueStockTransactionsDetails)
## StockCode Total_price_per_stock Total_transactions_per_stock
## 1 10002 699.55 49
## 2 10080 114.41 21
## 3 10120 40.53 30
## 4 10123C 3.25 3
## 5 10124A 6.72 5
## 6 10124G 7.14 4
summary(UniqueStockTransactionsDetails)
## StockCode Total_price_per_stock Total_transactions_per_stock
## 10002 : 1 Min. : 0.0 Min. : 1.0
## 10080 : 1 1st Qu.: 129.9 1st Qu.: 12.0
## 10120 : 1 Median : 634.0 Median : 49.0
## 10123C : 1 Mean : 2431.5 Mean : 108.6
## 10124A : 1 3rd Qu.: 2075.4 3rd Qu.: 135.0
## 10124G : 1 Max. :168469.6 Max. :2035.0
## (Other):3659
library(psych)
describe(UniqueStockTransactionsDetails)
## vars n mean sd median trimmed
## StockCode* 1 3665 1833.00 1058.14 1833.00 1833.00
## Total_price_per_stock 2 3665 2431.49 6579.43 633.96 1173.55
## Total_transactions_per_stock 3 3665 108.57 163.77 49.00 73.60
## mad min max range skew kurtosis
## StockCode* 1358.06 1 3665.0 3664.0 0.00 -1.20
## Total_price_per_stock 877.00 0 168469.6 168469.6 11.24 205.63
## Total_transactions_per_stock 63.75 1 2035.0 2034.0 3.55 19.61
## se
## StockCode* 17.48
## Total_price_per_stock 108.68
## Total_transactions_per_stock 2.71
library(ggplot2)
library("tabplot")
# Visualization between the relationship of the columns.
tableplot(UniqueStockTransactionsDetails)
# Concentrating on the Top 0.5% price stock.
tableplot(UniqueStockTransactionsDetails, sortCol = Total_price_per_stock,
from = 0, to = 0.5)
# Concentrating on the Top 2% transaction products(Stocks).
tableplot(UniqueStockTransactionsDetails, sortCol = Total_transactions_per_stock,
from = 0, to = 2)
# 4. Finding the total price and transactions of the purchases of stocks per country.
OverallPricePerStockCountry <- aggregate(EcommDataClean$TotalPrice, by=list(StockCode=EcommDataClean$StockCode, Country=EcommDataClean$Country), FUN=sum)
TotalTransactionsPerStockCountry <- aggregate(EcommDataClean$TotalPrice, by=list(StockCode=EcommDataClean$StockCode, Country=EcommDataClean$Country), FUN=length)
colnames(OverallPricePerStockCountry)[3] <- 'Total_price_per_stock_country'
colnames(TotalTransactionsPerStockCountry)[3] <- 'Total_transactions_per_stock_country'
head(OverallPricePerStockCountry)
## StockCode Country Total_price_per_stock_country
## 1 15036 Australia 432.00
## 2 15056BL Australia 17.85
## 3 16161P Australia 136.00
## 4 16169E Australia 10.50
## 5 20665 Australia 17.70
## 6 20675 Australia 228.96
head(TotalTransactionsPerStockCountry)
## StockCode Country Total_transactions_per_stock_country
## 1 15036 Australia 1
## 2 15056BL Australia 1
## 3 16161P Australia 1
## 4 16169E Australia 1
## 5 20665 Australia 1
## 6 20675 Australia 2
UniqueStockCountryTransactionsDetails <- merge(OverallPricePerStockCountry,TotalTransactionsPerStockCountry,by=c("StockCode","Country"))
head(UniqueStockCountryTransactionsDetails)
## StockCode Country Total_price_per_stock_country
## 1 10002 EIRE 10.20
## 2 10002 France 316.20
## 3 10002 Germany 0.85
## 4 10002 Japan 0.85
## 5 10002 Spain 20.40
## 6 10002 Switzerland 10.20
## Total_transactions_per_stock_country
## 1 1
## 2 8
## 3 1
## 4 1
## 5 1
## 6 1
summary(UniqueStockCountryTransactionsDetails)
## StockCode Country Total_price_per_stock_country
## 22423 : 29 United Kingdom:3645 Min. : 0.0
## 22960 : 25 EIRE :1943 1st Qu.: 16.6
## 22961 : 25 Germany :1665 Median : 37.8
## 23240 : 24 France :1522 Mean : 470.6
## 22138 : 23 Spain :1091 3rd Qu.: 153.1
## 23245 : 23 Switzerland : 947 Max. :168469.6
## (Other):18788 (Other) :8124
## Total_transactions_per_stock_country
## Min. : 1.00
## 1st Qu.: 1.00
## Median : 2.00
## Mean : 21.01
## 3rd Qu.: 5.00
## Max. :1947.00
##
library(psych)
describe(UniqueStockCountryTransactionsDetails)
## vars n mean sd median
## StockCode* 1 18937 1615.24 881.41 1579.0
## Country* 2 18937 20.91 11.14 19.0
## Total_price_per_stock_country 3 18937 470.58 2656.60 37.8
## Total_transactions_per_stock_country 4 18937 21.01 74.74 2.0
## trimmed mad min max range
## StockCode* 1587.47 904.39 1 3665.0 3664.0
## Country* 21.37 13.34 1 37.0 36.0
## Total_price_per_stock_country 97.84 40.92 0 168469.6 168469.6
## Total_transactions_per_stock_country 4.31 1.48 1 1947.0 1946.0
## skew kurtosis se
## StockCode* 0.22 -0.68 6.41
## Country* -0.06 -1.38 0.08
## Total_price_per_stock_country 26.83 1223.98 19.31
## Total_transactions_per_stock_country 7.88 94.06 0.54
library(ggplot2)
library("tabplot")
# Visualization between the relationship of the columns.
tableplot(UniqueStockCountryTransactionsDetails)
# taking summary
summary(tableplot(UniqueStockCountryTransactionsDetails))
## general
## dataset :UniqueStockCountryTransactionsDetails
## variables :4
## sortCol :1
## decreasing :TRUE
## from :0%
## to :100%
## objects.sample :18937
## objects.full.data:18937
## bins :100
## variable1 variable2
## name :StockCode name :Country
## type :categorical type :categorical
## categories:51 categories:38
##
##
##
##
##
##
## variable3
## name :Total_price_per_stock_country
## type :numeric
## scale_init :auto
## scale_final:lin
##
##
##
##
##
## variable4
## name :Total_transactions_per_stock_country
## type :numeric
## scale_init :auto
## scale_final:lin
##
##
##
##
##
# Concentrating on the Top 0.5% price stock per country.
tableplot(UniqueStockCountryTransactionsDetails, sortCol = Total_price_per_stock_country,
from = 0, to = 0.5)
# Concentrating on the Top 2% transaction stock per country.
tableplot(UniqueStockCountryTransactionsDetails, sortCol = Total_transactions_per_stock_country,
from = 0, to = 2)
# plotting on the world map.
library(rworldmap)
n1 <- joinCountryData2Map(UniqueStockCountryTransactionsDetails, joinCode="NAME", nameJoinColumn="Country")
## 16243 codes from your data successfully matched countries in the map
## 2694 codes from your data failed to match with a country code in the map
## 211 codes from the map weren't represented in your data
# Total price per stock in respective countries.
mapCountryData(n1, nameColumnToPlot="Total_price_per_stock_country", mapTitle="Total price per stock in respective countries.", colourPalette="terrain")
# Total transactions per stock in respective countries.
mapCountryData(n1, nameColumnToPlot="Total_transactions_per_stock_country", mapTitle="Total transactions per stock in respective countries.", catMethod="fixedWidth")
# Distribution of stocks in respective countries.
mapCountryData(n1, nameColumnToPlot="StockCode", mapTitle="Distribution of stocks in respective countries.", catMethod="categorical")
Kmeans function gives below statistics as standard output.
Cluster: A vector of integers (from 1:k) indicating the cluster to which each point is allocated. Centers: A matrix of cluster centres. Totss: The total sum of squares. Withinss: Vector of within-cluster sum of squares, one component per cluster. tot.withinss: Total within-cluster sum of squares, i.e., sum(withinss). Betweenss: The between-cluster sum of squares, i.e. totss-tot.withinss. Size: The number of points in each cluster. Iter: The number of (outer) iterations. Ifault: Integer: indicator of a possible algorithm problem Â- for experts.
# Customer Segmentations using K-means clusters in the dataset on the basis of unit_price, Quantity and total_price.
Ecommkm <- kmeans(EcommDataClean[,c(4,6,9)],
centers= 8)
Ecommkm$centers
## Quantity UnitPrice TotalPrice
## 1 74215.000000 1.040000 77183.60000
## 2 8.063907 2.866211 13.55322
## 3 70.812403 4.873887 125.76849
## 4 1313.175676 388.252297 3301.01649
## 5 521.045845 72.582722 1275.25017
## 6 286.969255 10.036363 404.16013
## 7 80995.000000 2.080000 168469.60000
## 8 60.000000 649.500000 38970.00000
# Visualization
## Plots on clustering variables
TotalPriceClus <- aggregate(EcommDataClean$TotalPrice, by=list(Ecommkm$cluster), mean)
names(TotalPriceClus) <- c("Cluster","TotalPrice")
TotalPriceClus
## Cluster TotalPrice
## 1 1 77183.60000
## 2 2 13.55322
## 3 3 125.76849
## 4 4 3301.01649
## 5 5 1275.25017
## 6 6 404.16013
## 7 7 168469.60000
## 8 8 38970.00000
## Bar Plot: TotalPrice
cp <-barplot(height=TotalPriceClus$TotalPrice ,
names.arg=TotalPriceClus$Cluster,
xlab="Cluster",
ylab="Avg Total Price",
main="Total Price by Clusters",
ylim= c(0,max(TotalPriceClus$TotalPrice)+10 ),
border=NA
)
## Plots on clustering variables
QuantityClus <- aggregate(EcommDataClean$Quantity, by=list(Ecommkm$cluster), mean)
names(QuantityClus) <- c("Cluster","Quantity")
QuantityClus
## Cluster Quantity
## 1 1 74215.000000
## 2 2 8.063907
## 3 3 70.812403
## 4 4 1313.175676
## 5 5 521.045845
## 6 6 286.969255
## 7 7 80995.000000
## 8 8 60.000000
## Bar Plot: Quantity
cp1 <-barplot(height=QuantityClus$Quantity ,
names.arg=QuantityClus$Cluster,
xlab="Cluster",
ylab="Avg Total Quantity",
main="Total Quantity by Clusters",
ylim= c(0,max(QuantityClus$Quantity)+10 ),
border=NA
)
## Plots on clustering variables
UnitPriceClus <- aggregate(EcommDataClean$UnitPrice, by=list(Ecommkm$cluster), mean)
names(UnitPriceClus) <- c("Cluster","Unit_Price")
UnitPriceClus
## Cluster Unit_Price
## 1 1 1.040000
## 2 2 2.866211
## 3 3 4.873887
## 4 4 388.252297
## 5 5 72.582722
## 6 6 10.036363
## 7 7 2.080000
## 8 8 649.500000
## Bar Plot: UnitPrice
cp2 <-barplot(height=UnitPriceClus$Unit_Price,
names.arg=UnitPriceClus$Cluster,
xlab="Cluster",
ylab="Avg Total Unit Price",
main="Total Unit price by Clusters",
ylim= c(0,max(UnitPriceClus$Unit_Price)+10 ),
border=NA
)
RFM stands for the three dimensions:
Recency - How recently did the customer purchase? Frequency - How often do they purchase? Monetary Value - How much do they spend?
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:bit':
##
## setattr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
#library(stringr)
#library(DT)
library(tidyr)
library(knitr)
library(rmarkdown)
# RFM analysis.
EcommRFMData <- EcommDataClean[,c(1:9)]
glimpse(EcommRFMData)
## Observations: 397,924
## Variables: 9
## $ InvoiceNo <fct> 536365, 536365, 536365, 536365, 536365, 536365, 53...
## $ StockCode <fct> 85123A, 71053, 84406B, 84029G, 84029E, 22752, 2173...
## $ Description <fct> WHITE HANGING HEART T-LIGHT HOLDER, WHITE METAL LA...
## $ Quantity <int> 6, 6, 8, 6, 6, 2, 6, 6, 6, 32, 6, 6, 8, 6, 6, 3, 2...
## $ InvoiceDate <date> 2010-12-01, 2010-12-01, 2010-12-01, 2010-12-01, 2...
## $ UnitPrice <dbl> 2.55, 3.39, 2.75, 3.39, 3.39, 7.65, 4.25, 1.85, 1....
## $ CustomerID <fct> 17850, 17850, 17850, 17850, 17850, 17850, 17850, 1...
## $ Country <fct> United Kingdom, United Kingdom, United Kingdom, Un...
## $ TotalPrice <dbl> 15.30, 20.34, 22.00, 20.34, 20.34, 15.30, 25.50, 1...
Ecomm_RFM <- EcommRFMData %>%
group_by(CustomerID) %>%
summarise(recency=as.numeric(as.Date("2012-01-01")-max(InvoiceDate)),
frequency=n_distinct(InvoiceNo), monitery= sum(TotalPrice)/n_distinct(InvoiceNo))
summary(Ecomm_RFM)
## CustomerID recency frequency monitery
## 12346 : 1 Min. : 23.0 Min. : 1.000 Min. : 0.0
## 12347 : 1 1st Qu.: 40.0 1st Qu.: 1.000 1st Qu.: 178.5
## 12348 : 1 Median : 73.0 Median : 2.000 Median : 293.7
## 12349 : 1 Mean :115.0 Mean : 4.272 Mean : 419.1
## 12350 : 1 3rd Qu.:164.5 3rd Qu.: 5.000 3rd Qu.: 430.0
## 12352 : 1 Max. :396.0 Max. :210.000 Max. :84236.2
## (Other):4333
kable(head(Ecomm_RFM))
| CustomerID | recency | frequency | monitery |
|---|---|---|---|
| 12346 | 348 | 1 | 77183.6000 |
| 12347 | 25 | 7 | 615.7143 |
| 12348 | 98 | 4 | 449.3100 |
| 12349 | 41 | 1 | 1757.5500 |
| 12350 | 333 | 1 | 334.4000 |
| 12352 | 59 | 8 | 313.2550 |
# Recency - How recently did the customer purchase?
hist(Ecomm_RFM$recency)
# Frequency - How often do they purchase?
hist(Ecomm_RFM$frequency, breaks = 30)
# As the data is skewed.
Ecomm_RFM$frequency <- log(Ecomm_RFM$frequency)
hist(Ecomm_RFM$frequency)
# Monetary Value - How much do they spend?
hist(Ecomm_RFM$monitery, breaks = 100)
# As data is skewed.
Ecomm_RFM$monitery <- log(Ecomm_RFM$monitery)
hist(Ecomm_RFM$monitery)
# Reference: https://www.kaggle.com/hendraherviawan/customer-segmentation-using-rfm-analysis-r
# Time series analysis of the product purchases by the customers on the ecommerce sites.
#Total purchase price with time.
library(ggplot2)
ggplot(data = EcommDataClean, aes(x = InvoiceDate, y = TotalPrice)) +
geom_bar(stat = "identity", fill = "purple") +
labs(title = "Total purchase price with time.",
x = "Date", y = "Total Price")
# Analysis of purchases by month.
OverallPricePerMonth <- aggregate(EcommDataClean$TotalPrice, by=list(DateMonth=EcommDataClean$DateMonth), FUN=sum)
TotalTransactionsPerMonth <- aggregate(EcommDataClean$TotalPrice, by=list(DateMonth=EcommDataClean$DateMonth), FUN=length)
colnames(OverallPricePerMonth)[2] <- 'Total_price_per_month'
colnames(TotalTransactionsPerMonth)[2] <- 'Total_transactions_per_month'
head(OverallPricePerMonth)
## DateMonth Total_price_per_month
## 1 1 569445.0
## 2 2 447137.3
## 3 3 595500.8
## 4 4 469200.4
## 5 5 678594.6
## 6 6 661213.7
head(TotalTransactionsPerMonth)
## DateMonth Total_transactions_per_month
## 1 1 21232
## 2 2 19928
## 3 3 27177
## 4 4 22644
## 5 5 28322
## 6 6 27185
UniqueMonthTransactionsDetails <- merge(OverallPricePerMonth,TotalTransactionsPerMonth,by="DateMonth")
head(UniqueMonthTransactionsDetails)
## DateMonth Total_price_per_month Total_transactions_per_month
## 1 1 569445.0 21232
## 2 10 1039318.8 49557
## 3 11 1161817.4 64545
## 4 12 1090906.7 43464
## 5 2 447137.3 19928
## 6 3 595500.8 27177
summary(UniqueMonthTransactionsDetails)
## DateMonth Total_price_per_month Total_transactions_per_month
## 1 :1 Min. : 447137 Min. :19928
## 2 :1 1st Qu.: 588987 1st Qu.:25781
## 3 :1 Median : 653279 Median :27181
## 4 :1 Mean : 742617 Mean :33160
## 5 :1 3rd Qu.: 974459 3rd Qu.:40889
## 6 :1 Max. :1161817 Max. :64545
## (Other):6
library(psych)
describe(UniqueMonthTransactionsDetails)
## vars n mean sd median trimmed
## DateMonth* 1 12 6.50 3.61 6.5 6.5
## Total_price_per_month 2 12 742617.33 249263.88 653278.8 730245.3
## Total_transactions_per_month 3 12 33160.33 13510.92 27181.0 31345.1
## mad min max range skew
## DateMonth* 4.45 1.0 12 11 0.00
## Total_price_per_month 198603.31 447137.3 1161817 714680 0.50
## Total_transactions_per_month 7773.27 19928.0 64545 44617 1.03
## kurtosis se
## DateMonth* -1.50 1.04
## Total_price_per_month -1.48 71956.29
## Total_transactions_per_month -0.19 3900.27
barplot(UniqueMonthTransactionsDetails$Total_price_per_month, names.arg=rownames(CountDateMonth), cex.names=0.8,col = "Red", xlab = "Month", ylab ="Total price per Month ", main = "Effect of Month on Total Price")
barplot(UniqueMonthTransactionsDetails$Total_transactions_per_month, main="Transactions of products according to Month.", xlab = "Month", ylab ="Transactions per Month ",
names.arg=rownames(CountDateMonth), cex.names=0.8, col="Orange")
# Analysis of purchases by year.
OverallPricePerYear <- aggregate(EcommDataClean$TotalPrice, by=list(DateYear=EcommDataClean$DateYear), FUN=sum)
TotalTransactionsPerYear <- aggregate(EcommDataClean$TotalPrice, by=list(DateYear=EcommDataClean$DateYear), FUN=length)
colnames(OverallPricePerYear)[2] <- 'Total_price_per_year'
colnames(TotalTransactionsPerYear)[2] <- 'Total_transactions_per_year'
head(OverallPricePerYear)
## DateYear Total_price_per_year
## 1 2010 572713.9
## 2 2011 8338694.0
head(TotalTransactionsPerYear)
## DateYear Total_transactions_per_year
## 1 2010 26160
## 2 2011 371764
UniqueYearTransactionsDetails <- merge(OverallPricePerYear,TotalTransactionsPerYear,by="DateYear")
head(UniqueYearTransactionsDetails)
## DateYear Total_price_per_year Total_transactions_per_year
## 1 2010 572713.9 26160
## 2 2011 8338694.0 371764
summary(UniqueYearTransactionsDetails)
## DateYear Total_price_per_year Total_transactions_per_year
## 2010:1 Min. : 572714 Min. : 26160
## 2011:1 1st Qu.:2514209 1st Qu.:112561
## Median :4455704 Median :198962
## Mean :4455704 Mean :198962
## 3rd Qu.:6397199 3rd Qu.:285363
## Max. :8338694 Max. :371764
library(psych)
describe(UniqueYearTransactionsDetails)
## vars n mean sd median
## DateYear* 1 2 1.5 0.71 1.5
## Total_price_per_year 2 2 4455704.0 5491377.21 4455704.0
## Total_transactions_per_year 3 2 198962.0 244378.93 198962.0
## trimmed mad min max range
## DateYear* 1.5 0.74 1.0 2 1
## Total_price_per_year 4455704.0 5756921.07 572713.9 8338694 7765980
## Total_transactions_per_year 198962.0 256196.25 26160.0 371764 345604
## skew kurtosis se
## DateYear* 0 -2.75 0.5
## Total_price_per_year 0 -2.75 3882990.1
## Total_transactions_per_year 0 -2.75 172802.0
barplot(UniqueYearTransactionsDetails$Total_price_per_year, names.arg=rownames(CountDateYear), cex.names=0.8,col = "Green", xlab = "year", ylab ="Total price per year ", main = " Effect of Year on Total Price" )
barplot(UniqueYearTransactionsDetails$Total_transactions_per_year, main=" Transactions of products according to year.", xlab = "Year", ylab ="Transactions per Year ", names.arg = rownames(CountDateYear), cex.names=0.8, col="blue")
# Calculating the days as Customer.
EarliestTransactionByCustomer <- aggregate(EcommDataClean$InvoiceDate, by=list(CustomerID=EcommDataClean$CustomerID), FUN=min)
LatestTransactionByCustomer <- aggregate(EcommDataClean$InvoiceDate, by=list(CustomerID=EcommDataClean$CustomerID), FUN=max)
colnames(EarliestTransactionByCustomer)[2] <- 'Earliest_invoice'
colnames(LatestTransactionByCustomer)[2] <- 'Latest_invoice'
head(EarliestTransactionByCustomer)
## CustomerID Earliest_invoice
## 1 12346 2011-01-18
## 2 12347 2010-12-07
## 3 12348 2010-12-16
## 4 12349 2011-11-21
## 5 12350 2011-02-02
## 6 12352 2011-02-16
head(LatestTransactionByCustomer)
## CustomerID Latest_invoice
## 1 12346 2011-01-18
## 2 12347 2011-12-07
## 3 12348 2011-09-25
## 4 12349 2011-11-21
## 5 12350 2011-02-02
## 6 12352 2011-11-03
InvoiceTransactionsDetails <- merge(EarliestTransactionByCustomer,LatestTransactionByCustomer,by="CustomerID")
head(InvoiceTransactionsDetails)
## CustomerID Earliest_invoice Latest_invoice
## 1 12346 2011-01-18 2011-01-18
## 2 12347 2010-12-07 2011-12-07
## 3 12348 2010-12-16 2011-09-25
## 4 12349 2011-11-21 2011-11-21
## 5 12350 2011-02-02 2011-02-02
## 6 12352 2011-02-16 2011-11-03
summary(InvoiceTransactionsDetails)
## CustomerID Earliest_invoice Latest_invoice
## 12346 : 1 Min. :2010-12-01 Min. :2010-12-01
## 12347 : 1 1st Qu.:2011-01-17 1st Qu.:2011-07-20
## 12348 : 1 Median :2011-04-05 Median :2011-10-20
## 12349 : 1 Mean :2011-04-30 Mean :2011-09-07
## 12350 : 1 3rd Qu.:2011-08-19 3rd Qu.:2011-11-22
## 12352 : 1 Max. :2011-12-09 Max. :2011-12-09
## (Other):4333
library(psych)
describe(InvoiceTransactionsDetails)
## vars n mean sd median trimmed mad min max
## CustomerID* 1 4339 2170 1252.71 2170 2170 1608.62 1 4339
## Earliest_invoice* 2 4339 NaN NA NA NaN NA Inf -Inf
## Latest_invoice* 3 4339 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## CustomerID* 4338 0 -1.2 19.02
## Earliest_invoice* -Inf NA NA NA
## Latest_invoice* -Inf NA NA NA
InvoiceTransactionsDetails['Days_as_customer'] <- (InvoiceTransactionsDetails$Latest_invoice - InvoiceTransactionsDetails$Earliest_invoice) +1
library(ggplot2)
library("tabplot")
# Visualization between the relationship of the columns.
tableplot(InvoiceTransactionsDetails)
# Plotting the days as customer.
DaysCustomer <- as.numeric(InvoiceTransactionsDetails$Days_as_customer)
hist(DaysCustomer)
# Market Basket Analysis using Apriori Algorithm.
# preparing data for Apriori Algorithm.
if(sessionInfo()['basePkgs']=="dplyr" | sessionInfo()['otherPkgs']=="dplyr"){
detach(package:dplyr, unload=TRUE)
}
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:corrgram':
##
## baseball
df_Ecommbasket <- EcommDataClean[,c(3,5,7)]
head(df_Ecommbasket)
## Description InvoiceDate CustomerID
## 1 WHITE HANGING HEART T-LIGHT HOLDER 2010-12-01 17850
## 2 WHITE METAL LANTERN 2010-12-01 17850
## 3 CREAM CUPID HEARTS COAT HANGER 2010-12-01 17850
## 4 KNITTED UNION FLAG HOT WATER BOTTLE 2010-12-01 17850
## 5 RED WOOLLY HOTTIE WHITE HEART. 2010-12-01 17850
## 6 SET 7 BABUSHKA NESTING BOXES 2010-12-01 17850
df_Ecommbasket$CustomerID <- as.numeric(df_Ecommbasket$CustomerID)
itemList <- ddply(df_Ecommbasket,c("CustomerID","InvoiceDate"),
function(df1)paste(df1$Description,
collapse = ","))
colnames(itemList)[3] <- 'Item_List'
head(itemList)
## CustomerID InvoiceDate
## 1 1 2011-01-18
## 2 2 2010-12-07
## 3 2 2011-01-26
## 4 2 2011-04-07
## 5 2 2011-06-09
## 6 2 2011-08-02
## Item_List
## 1 MEDIUM CERAMIC TOP STORAGE JAR
## 2 BLACK CANDELABRA T-LIGHT HOLDER,AIRLINE BAG VINTAGE JET SET BROWN,COLOUR GLASS. STAR T-LIGHT HOLDER,MINI PAINT SET VINTAGE ,CLEAR DRAWER KNOB ACRYLIC EDWARDIAN,PINK DRAWER KNOB ACRYLIC EDWARDIAN,GREEN DRAWER KNOB ACRYLIC EDWARDIAN,RED DRAWER KNOB ACRYLIC EDWARDIAN,PURPLE DRAWERKNOB ACRYLIC EDWARDIAN,BLUE DRAWER KNOB ACRYLIC EDWARDIAN,ALARM CLOCK BAKELIKE CHOCOLATE,ALARM CLOCK BAKELIKE GREEN,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE ORANGE,FOUR HOOK WHITE LOVEBIRDS,BLACK GRAND BAROQUE PHOTO FRAME,BATHROOM METAL SIGN ,LARGE HEART MEASURING SPOONS,BOX OF 6 ASSORTED COLOUR TEASPOONS,BLUE 3 PIECE POLKADOT CUTLERY SET,RED 3 PIECE RETROSPOT CUTLERY SET,PINK 3 PIECE POLKADOT CUTLERY SET,EMERGENCY FIRST AID TIN ,SET OF 2 TINS VINTAGE BATHROOM ,SET/3 DECOUPAGE STACKING TINS,BOOM BOX SPEAKER BOYS,RED TOADSTOOL LED NIGHT LIGHT,3D DOG PICTURE PLAYING CARDS,BLACK EAR MUFF HEADPHONES,CAMOUFLAGE EAR MUFF HEADPHONES
## 3 PINK NEW BAROQUECANDLESTICK CANDLE,BLUE NEW BAROQUE CANDLESTICK CANDLE,BLACK CANDELABRA T-LIGHT HOLDER,WOODLAND CHARLOTTE BAG,AIRLINE BAG VINTAGE JET SET BROWN,AIRLINE BAG VINTAGE JET SET WHITE,SANDWICH BATH SPONGE,ALARM CLOCK BAKELIKE CHOCOLATE,ALARM CLOCK BAKELIKE GREEN,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE ORANGE,SMALL HEART MEASURING SPOONS,72 SWEETHEART FAIRY CAKE CASES,60 TEATIME FAIRY CAKE CASES,PACK OF 60 MUSHROOM CAKE CASES,PACK OF 60 SPACEBOY CAKE CASES,TEA TIME OVEN GLOVE,RED RETROSPOT OVEN GLOVE ,RED RETROSPOT OVEN GLOVE DOUBLE,SET/2 RED RETROSPOT TEA TOWELS ,REGENCY CAKESTAND 3 TIER,BOX OF 6 ASSORTED COLOUR TEASPOONS,MINI LADLE LOVE HEART RED ,CHOCOLATE CALCULATOR,TOOTHPASTE TUBE PEN,SET OF 2 TINS VINTAGE BATHROOM ,RED TOADSTOOL LED NIGHT LIGHT,3D DOG PICTURE PLAYING CARDS
## 4 AIRLINE BAG VINTAGE JET SET WHITE,AIRLINE BAG VINTAGE JET SET RED,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE JET SET BROWN,RED RETROSPOT PURSE ,ICE CREAM SUNDAE LIP GLOSS,VINTAGE HEADS AND TAILS CARD GAME ,HOLIDAY FUN LUDO,TREASURE ISLAND BOOK BOX,WATERING CAN PINK BUNNY,RED DRAWER KNOB ACRYLIC EDWARDIAN,LARGE HEART MEASURING SPOONS,SMALL HEART MEASURING SPOONS,PACK OF 60 DINOSAUR CAKE CASES,RED RETROSPOT OVEN GLOVE DOUBLE,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER ,RED TOADSTOOL LED NIGHT LIGHT,MINI PAINT SET VINTAGE ,3D SHEET OF DOG STICKERS,3D SHEET OF CAT STICKERS,SMALL FOLDING SCISSOR(POINTED EDGE),GIFT BAG PSYCHEDELIC APPLES,SET OF 2 TINS VINTAGE BATHROOM
## 5 RABBIT NIGHT LIGHT,REGENCY TEA STRAINER,REGENCY TEA PLATE GREEN ,REGENCY TEA PLATE PINK,REGENCY TEA PLATE ROSES ,REGENCY TEAPOT ROSES ,REGENCY SUGAR BOWL GREEN,REGENCY MILK JUG PINK ,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE JET SET BROWN,VICTORIAN SEWING KIT,NAMASTE SWAGAT INCENSE,TRIPLE HOOK ANTIQUE IVORY ROSE,SMALL HEART MEASURING SPOONS,3D DOG PICTURE PLAYING CARDS,FEATHER PEN,COAL BLACK,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE CHOCOLATE
## 6 SET OF 60 VINTAGE LEAF CAKE CASES ,SET 40 HEART SHAPE PETIT FOUR CASES,AIRLINE BAG VINTAGE JET SET BROWN,AIRLINE BAG VINTAGE JET SET RED,AIRLINE BAG VINTAGE JET SET WHITE,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE WORLD CHAMPION ,WOODLAND DESIGN COTTON TOTE BAG,WOODLAND CHARLOTTE BAG,ALARM CLOCK BAKELIKE RED ,TRIPLE HOOK ANTIQUE IVORY ROSE,SINGLE ANTIQUE ROSE HOOK IVORY,TEA TIME OVEN GLOVE,72 SWEETHEART FAIRY CAKE CASES,60 TEATIME FAIRY CAKE CASES,PACK OF 60 DINOSAUR CAKE CASES,REGENCY CAKESTAND 3 TIER,REGENCY MILK JUG PINK ,3D DOG PICTURE PLAYING CARDS,REVOLVER WOODEN RULER ,VINTAGE HEADS AND TAILS CARD GAME ,RED REFECTORY CLOCK
itemList$CustomerID <- NULL
itemList$InvoiceDate <- NULL
#Rename column headers for ease of use
colnames(itemList) <- c("itemList")
head(itemList)
## itemList
## 1 MEDIUM CERAMIC TOP STORAGE JAR
## 2 BLACK CANDELABRA T-LIGHT HOLDER,AIRLINE BAG VINTAGE JET SET BROWN,COLOUR GLASS. STAR T-LIGHT HOLDER,MINI PAINT SET VINTAGE ,CLEAR DRAWER KNOB ACRYLIC EDWARDIAN,PINK DRAWER KNOB ACRYLIC EDWARDIAN,GREEN DRAWER KNOB ACRYLIC EDWARDIAN,RED DRAWER KNOB ACRYLIC EDWARDIAN,PURPLE DRAWERKNOB ACRYLIC EDWARDIAN,BLUE DRAWER KNOB ACRYLIC EDWARDIAN,ALARM CLOCK BAKELIKE CHOCOLATE,ALARM CLOCK BAKELIKE GREEN,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE ORANGE,FOUR HOOK WHITE LOVEBIRDS,BLACK GRAND BAROQUE PHOTO FRAME,BATHROOM METAL SIGN ,LARGE HEART MEASURING SPOONS,BOX OF 6 ASSORTED COLOUR TEASPOONS,BLUE 3 PIECE POLKADOT CUTLERY SET,RED 3 PIECE RETROSPOT CUTLERY SET,PINK 3 PIECE POLKADOT CUTLERY SET,EMERGENCY FIRST AID TIN ,SET OF 2 TINS VINTAGE BATHROOM ,SET/3 DECOUPAGE STACKING TINS,BOOM BOX SPEAKER BOYS,RED TOADSTOOL LED NIGHT LIGHT,3D DOG PICTURE PLAYING CARDS,BLACK EAR MUFF HEADPHONES,CAMOUFLAGE EAR MUFF HEADPHONES
## 3 PINK NEW BAROQUECANDLESTICK CANDLE,BLUE NEW BAROQUE CANDLESTICK CANDLE,BLACK CANDELABRA T-LIGHT HOLDER,WOODLAND CHARLOTTE BAG,AIRLINE BAG VINTAGE JET SET BROWN,AIRLINE BAG VINTAGE JET SET WHITE,SANDWICH BATH SPONGE,ALARM CLOCK BAKELIKE CHOCOLATE,ALARM CLOCK BAKELIKE GREEN,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE ORANGE,SMALL HEART MEASURING SPOONS,72 SWEETHEART FAIRY CAKE CASES,60 TEATIME FAIRY CAKE CASES,PACK OF 60 MUSHROOM CAKE CASES,PACK OF 60 SPACEBOY CAKE CASES,TEA TIME OVEN GLOVE,RED RETROSPOT OVEN GLOVE ,RED RETROSPOT OVEN GLOVE DOUBLE,SET/2 RED RETROSPOT TEA TOWELS ,REGENCY CAKESTAND 3 TIER,BOX OF 6 ASSORTED COLOUR TEASPOONS,MINI LADLE LOVE HEART RED ,CHOCOLATE CALCULATOR,TOOTHPASTE TUBE PEN,SET OF 2 TINS VINTAGE BATHROOM ,RED TOADSTOOL LED NIGHT LIGHT,3D DOG PICTURE PLAYING CARDS
## 4 AIRLINE BAG VINTAGE JET SET WHITE,AIRLINE BAG VINTAGE JET SET RED,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE JET SET BROWN,RED RETROSPOT PURSE ,ICE CREAM SUNDAE LIP GLOSS,VINTAGE HEADS AND TAILS CARD GAME ,HOLIDAY FUN LUDO,TREASURE ISLAND BOOK BOX,WATERING CAN PINK BUNNY,RED DRAWER KNOB ACRYLIC EDWARDIAN,LARGE HEART MEASURING SPOONS,SMALL HEART MEASURING SPOONS,PACK OF 60 DINOSAUR CAKE CASES,RED RETROSPOT OVEN GLOVE DOUBLE,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER ,RED TOADSTOOL LED NIGHT LIGHT,MINI PAINT SET VINTAGE ,3D SHEET OF DOG STICKERS,3D SHEET OF CAT STICKERS,SMALL FOLDING SCISSOR(POINTED EDGE),GIFT BAG PSYCHEDELIC APPLES,SET OF 2 TINS VINTAGE BATHROOM
## 5 RABBIT NIGHT LIGHT,REGENCY TEA STRAINER,REGENCY TEA PLATE GREEN ,REGENCY TEA PLATE PINK,REGENCY TEA PLATE ROSES ,REGENCY TEAPOT ROSES ,REGENCY SUGAR BOWL GREEN,REGENCY MILK JUG PINK ,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE JET SET BROWN,VICTORIAN SEWING KIT,NAMASTE SWAGAT INCENSE,TRIPLE HOOK ANTIQUE IVORY ROSE,SMALL HEART MEASURING SPOONS,3D DOG PICTURE PLAYING CARDS,FEATHER PEN,COAL BLACK,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE CHOCOLATE
## 6 SET OF 60 VINTAGE LEAF CAKE CASES ,SET 40 HEART SHAPE PETIT FOUR CASES,AIRLINE BAG VINTAGE JET SET BROWN,AIRLINE BAG VINTAGE JET SET RED,AIRLINE BAG VINTAGE JET SET WHITE,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE WORLD CHAMPION ,WOODLAND DESIGN COTTON TOTE BAG,WOODLAND CHARLOTTE BAG,ALARM CLOCK BAKELIKE RED ,TRIPLE HOOK ANTIQUE IVORY ROSE,SINGLE ANTIQUE ROSE HOOK IVORY,TEA TIME OVEN GLOVE,72 SWEETHEART FAIRY CAKE CASES,60 TEATIME FAIRY CAKE CASES,PACK OF 60 DINOSAUR CAKE CASES,REGENCY CAKESTAND 3 TIER,REGENCY MILK JUG PINK ,3D DOG PICTURE PLAYING CARDS,REVOLVER WOODEN RULER ,VINTAGE HEADS AND TAILS CARD GAME ,RED REFECTORY CLOCK
write.csv(itemList,"ItemList.csv", row.names = TRUE)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:ffbase':
##
## %in%
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:base':
##
## %in%, abbreviate, write
txn = read.transactions(file="ItemList.csv", rm.duplicates= TRUE, format="basket",sep=",",cols=1);
txn@itemInfo$labels <- gsub("\"","",txn@itemInfo$labels)
basket_rules <- apriori(txn,parameter = list(sup = 0.001, conf = 0.0,target="rules"));
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0 0.1 1 none FALSE TRUE 0.001 1 10
## target ext
## rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 16
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[16330 item(s), 16767 transaction(s)] done [0.01s].
## sorting and recoding items ... [4 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 done [0.00s].
## writing ... [4 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
df_basket <- as(basket_rules,"data.frame")
head(df_basket)
## rules support confidence lift
## 1 {} => {REGENCY CAKESTAND 3 TIER} 0.001013896 0.001013896 1
## 2 {} => {CHILLI LIGHTS} 0.001312101 0.001312101 1
## 3 {} => {Manual} 0.001848870 0.001848870 1
## 4 {} => {POSTAGE} 0.002087434 0.002087434 1
# reference: https://www.r-bloggers.com/implementing-apriori-algorithm-in-r/
# Word cloud of the products.
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
##
## Attaching package: 'tm'
## The following object is masked from 'package:arules':
##
## inspect
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(EcommDataClean$Description, min.freq=2000)
# Word cloud of the Countries.
wordcloud(EcommDataClean$Country, min.freq=15)