Email: addepalligowri@gmail.com College: VESIT(2012-2016)

This is the Capstone project of Data Analytics with managerial applications under professor Dr. Sameer Mathur which aims to do data analysis of customer purchase patterns through an Ecommerce dataset containing the records of the transactions.

The dataset can be found at: https://www.kaggle.com/carrie1/ecommerce-data/data

The dataset has the following columns with their description: 1. InvoiceNo - The invoice number of each product when the product was purchased. 2. StockCode - The unique code of the product. 3. Description - The product description. 4. Quantity - The quantity of the product purchased. 5. InvoiceDate - The date on which the product was purchased. 6. UnitPrice - The unit price of the product. 7. CustomerID - The unique customer ID of customer buying the product. 8. Country - The country where the purchase has been made.

This will attempt to do an analysis by understanding and cleaning the data to later cater to our needs of doing an indepth analysis and experimentation to draw out valuable insights relevant to the Ecommerce industry and actionable insights for a manager.

# Day 1: 

# Task 1a: Initial Analysis of Marketing Analysis of customer purchase patterns in an Ecommerce Industry.

# Ques 1: Read your dataset in R and visualize the length and breadth of your dataset.

setwd("C:/Users/GOWRI/Desktop/iim_internship/Capstone_project")
Ecommdata <- read.csv(file="data.csv",head=TRUE,sep=",")

#View(Ecommdata)

head(Ecommdata)
##   InvoiceNo StockCode                         Description Quantity
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6
## 2    536365     71053                 WHITE METAL LANTERN        6
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2
##      InvoiceDate UnitPrice CustomerID        Country
## 1 12/1/2010 8:26      2.55      17850 United Kingdom
## 2 12/1/2010 8:26      3.39      17850 United Kingdom
## 3 12/1/2010 8:26      2.75      17850 United Kingdom
## 4 12/1/2010 8:26      3.39      17850 United Kingdom
## 5 12/1/2010 8:26      3.39      17850 United Kingdom
## 6 12/1/2010 8:26      7.65      17850 United Kingdom
nrow(Ecommdata)
## [1] 541909
# The number of rows is 541909.

ncol(Ecommdata)
## [1] 8
# The number of columns is 8.
# Cleaning the data and removing null/missing/ negative values and creating subset and converting to factors for variables like InvoiceNo,StockCode, InvoiceDate, CustomerID, Country.

EcommDataClean <- Ecommdata[which(Ecommdata$Quantity>=0 & !is.na(Ecommdata$Quantity) &  Ecommdata$UnitPrice >=0 & !is.na(Ecommdata$UnitPrice) & !is.na(Ecommdata$CustomerID)),]

EcommDataClean$InvoiceNo<- factor(EcommDataClean$InvoiceNo)
EcommDataClean$StockCode<- factor(EcommDataClean$StockCode)
EcommDataClean$CustomerID<- factor(EcommDataClean$CustomerID)
EcommDataClean$Country<- factor(EcommDataClean$Country)
EcommDataClean$InvoiceDate<- as.Date(EcommDataClean$InvoiceDate, '%m/%d/%Y %H:%M')

# Adding a new column of total purchase price i.e( Unit price * Quantity)

EcommDataClean$TotalPrice <- EcommDataClean$UnitPrice * EcommDataClean$Quantity

#Adding three columns Day, month and year by splitting the invoice date.

EcommDataClean$DateYear <- as.numeric(format(EcommDataClean$InvoiceDate, format = "%Y"))
EcommDataClean$DateMonth <- as.numeric(format(EcommDataClean$InvoiceDate, format = "%m"))
EcommDataClean$DateDay <- as.numeric(format(EcommDataClean$InvoiceDate, format = "%d"))

# Adding additional Quarter Column to specify financial quarter of purchase.

EcommDataClean$Quarter[EcommDataClean$DateMonth < 7] <- 1
EcommDataClean$Quarter[EcommDataClean$DateMonth > 6] <- 2

# Converting them to factors.

EcommDataClean$DateYear<- factor(EcommDataClean$DateYear)
EcommDataClean$DateMonth<- factor(EcommDataClean$DateMonth)
EcommDataClean$DateDay<- factor(EcommDataClean$DateDay)
EcommDataClean$Quarter<- factor(EcommDataClean$Quarter)

#View(EcommDataClean)

nrow(EcommDataClean)
## [1] 397924
# The number of useful rows (Ecommerce purchases) is 397924.

ncol(EcommDataClean)
## [1] 13
# The number of useful columns is 13.
# Ques 2: Create a descriptive statistics (min, max, median etc) of each variable.

# Summary of the original dataset.
summary(Ecommdata)
##    InvoiceNo        StockCode     
##  573585 :  1114   85123A :  2313  
##  581219 :   749   22423  :  2203  
##  581492 :   731   85099B :  2159  
##  580729 :   721   47566  :  1727  
##  558475 :   705   20725  :  1639  
##  579777 :   687   84879  :  1502  
##  (Other):537202   (Other):530366  
##                              Description        Quantity        
##  WHITE HANGING HEART T-LIGHT HOLDER:  2369   Min.   :-80995.00  
##  REGENCY CAKESTAND 3 TIER          :  2200   1st Qu.:     1.00  
##  JUMBO BAG RED RETROSPOT           :  2159   Median :     3.00  
##  PARTY BUNTING                     :  1727   Mean   :     9.55  
##  LUNCH BAG RED RETROSPOT           :  1638   3rd Qu.:    10.00  
##  ASSORTED COLOUR BIRD ORNAMENT     :  1501   Max.   : 80995.00  
##  (Other)                           :530315                      
##            InvoiceDate       UnitPrice           CustomerID    
##  10/31/2011 14:41:  1114   Min.   :-11062.06   Min.   :12346   
##  12/8/2011 9:28  :   749   1st Qu.:     1.25   1st Qu.:13953   
##  12/9/2011 10:03 :   731   Median :     2.08   Median :15152   
##  12/5/2011 17:24 :   721   Mean   :     4.61   Mean   :15288   
##  6/29/2011 15:58 :   705   3rd Qu.:     4.13   3rd Qu.:16791   
##  11/30/2011 15:13:   687   Max.   : 38970.00   Max.   :18287   
##  (Other)         :537202                       NA's   :135080  
##            Country      
##  United Kingdom:495478  
##  Germany       :  9495  
##  France        :  8557  
##  EIRE          :  8196  
##  Spain         :  2533  
##  Netherlands   :  2371  
##  (Other)       : 15279
# Summary of the cleaned dataset.
summary(EcommDataClean)
##    InvoiceNo        StockCode     
##  576339 :   542   85123A :  2035  
##  579196 :   533   22423  :  1724  
##  580727 :   529   85099B :  1618  
##  578270 :   442   84879  :  1408  
##  573576 :   435   47566  :  1397  
##  567656 :   421   20725  :  1317  
##  (Other):395022   (Other):388425  
##                              Description        Quantity       
##  WHITE HANGING HEART T-LIGHT HOLDER:  2028   Min.   :    1.00  
##  REGENCY CAKESTAND 3 TIER          :  1724   1st Qu.:    2.00  
##  JUMBO BAG RED RETROSPOT           :  1618   Median :    6.00  
##  ASSORTED COLOUR BIRD ORNAMENT     :  1408   Mean   :   13.02  
##  PARTY BUNTING                     :  1397   3rd Qu.:   12.00  
##  LUNCH BAG RED RETROSPOT           :  1316   Max.   :80995.00  
##  (Other)                           :388433                     
##   InvoiceDate           UnitPrice          CustomerID    
##  Min.   :2010-12-01   Min.   :   0.000   17841  :  7847  
##  1st Qu.:2011-04-07   1st Qu.:   1.250   14911  :  5677  
##  Median :2011-07-31   Median :   1.950   14096  :  5111  
##  Mean   :2011-07-10   Mean   :   3.116   12748  :  4596  
##  3rd Qu.:2011-10-20   3rd Qu.:   3.750   14606  :  2700  
##  Max.   :2011-12-09   Max.   :8142.750   15311  :  2379  
##                                          (Other):369614  
##            Country         TotalPrice        DateYear     
##  United Kingdom:354345   Min.   :     0.00   2010: 26160  
##  Germany       :  9042   1st Qu.:     4.68   2011:371764  
##  France        :  8342   Median :    11.80                
##  EIRE          :  7238   Mean   :    22.39                
##  Spain         :  2485   3rd Qu.:    19.80                
##  Netherlands   :  2363   Max.   :168469.60                
##  (Other)       : 14109                                    
##    DateMonth         DateDay       Quarter   
##  11     : 64545   6      : 18349   1:146488  
##  10     : 49557   5      : 16410   2:251436  
##  12     : 43464   8      : 15854             
##  9      : 40030   7      : 15604             
##  5      : 28322   17     : 14913             
##  6      : 27185   4      : 14882             
##  (Other):144821   (Other):301912
# Ques 3: Create one-way contingency tables for the categorical variables in your dataset.
# The categorical variables in the dataset are Description, Country, InvoiceNo, StockCode, CustomerID.

# One-way contingency tables for Description.
CountDescription <- table(EcommDataClean$Description)
head(CountDescription)
## 
##                                 4 PURPLE FLOCK DINNER CANDLES 
##                              0                             39 
##  50'S CHRISTMAS GIFT BAG LARGE              DOLLY GIRL BEAKER 
##                            109                            138 
##    I LOVE LONDON MINI BACKPACK    I LOVE LONDON MINI RUCKSACK 
##                             70                              1
# One-way contingency tables for Country.
CountCountry <- table(EcommDataClean$Country)
head(CountCountry)
## 
## Australia   Austria   Bahrain   Belgium    Brazil    Canada 
##      1185       398        17      2031        32       151
# One-way contingency tables for InvoiceNo.
CountInvoice <- table(EcommDataClean$InvoiceNo)
head(CountInvoice)
## 
## 536365 536366 536367 536368 536369 536370 
##      7      2     12      4      1     20
# One-way contingency tables for StockCode.
CountStockCode <- table(EcommDataClean$StockCode)
head(CountStockCode)
## 
##  10002  10080  10120 10123C 10124A 10124G 
##     49     21     30      3      5      4
# One-way contingency tables for CustomerID.
CountCustomerID <- table(EcommDataClean$CustomerID)
head(CountCustomerID)
## 
## 12346 12347 12348 12349 12350 12352 
##     1   182    31    73    17    85
# One-way contingency tables for DateYear.
CountDateYear <- table(EcommDataClean$DateYear)
head(CountDateYear)
## 
##   2010   2011 
##  26160 371764
# One-way contingency tables for DateMonth.
CountDateMonth <- table(EcommDataClean$DateMonth)
head(CountDateMonth)
## 
##     1     2     3     4     5     6 
## 21232 19928 27177 22644 28322 27185
# One-way contingency tables for DateDay.
CountDateDay <- table(EcommDataClean$DateDay)
head(CountDateDay)
## 
##     1     2     3     4     5     6 
## 13630 12101 10878 14882 16410 18349
# One-way contingency tables for Quarter.
CountQuarter <- table(EcommDataClean$Quarter)
head(CountQuarter)
## 
##      1      2 
## 146488 251436
# Ques 4: Create two-way contingency tables for the categorical variables in your dataset.

# Two-way contingency tables for CustomerID and Country.
CountCountryCustomerID <- table(EcommDataClean$CustomerID, EcommDataClean$Country)
head(prop.table(CountCountryCustomerID))
##        
##         Australia Austria Bahrain Belgium Brazil Canada Channel Islands
##   12346         0       0       0       0      0      0               0
##   12347         0       0       0       0      0      0               0
##   12348         0       0       0       0      0      0               0
##   12349         0       0       0       0      0      0               0
##   12350         0       0       0       0      0      0               0
##   12352         0       0       0       0      0      0               0
##        
##         Cyprus Czech Republic Denmark EIRE European Community      Finland
##   12346      0              0       0    0                  0 0.000000e+00
##   12347      0              0       0    0                  0 0.000000e+00
##   12348      0              0       0    0                  0 7.790432e-05
##   12349      0              0       0    0                  0 0.000000e+00
##   12350      0              0       0    0                  0 0.000000e+00
##   12352      0              0       0    0                  0 0.000000e+00
##        
##         France Germany Greece      Iceland Israel        Italy Japan
##   12346      0       0      0 0.0000000000      0 0.0000000000     0
##   12347      0       0      0 0.0004573738      0 0.0000000000     0
##   12348      0       0      0 0.0000000000      0 0.0000000000     0
##   12349      0       0      0 0.0000000000      0 0.0001834521     0
##   12350      0       0      0 0.0000000000      0 0.0000000000     0
##   12352      0       0      0 0.0000000000      0 0.0000000000     0
##        
##         Lebanon Lithuania Malta Netherlands       Norway Poland Portugal
##   12346       0         0     0           0 0.000000e+00      0        0
##   12347       0         0     0           0 0.000000e+00      0        0
##   12348       0         0     0           0 0.000000e+00      0        0
##   12349       0         0     0           0 0.000000e+00      0        0
##   12350       0         0     0           0 4.272173e-05      0        0
##   12352       0         0     0           0 2.136086e-04      0        0
##        
##         RSA Saudi Arabia Singapore Spain Sweden Switzerland
##   12346   0            0         0     0      0           0
##   12347   0            0         0     0      0           0
##   12348   0            0         0     0      0           0
##   12349   0            0         0     0      0           0
##   12350   0            0         0     0      0           0
##   12352   0            0         0     0      0           0
##        
##         United Arab Emirates United Kingdom Unspecified USA
##   12346                    0   2.513043e-06           0   0
##   12347                    0   0.000000e+00           0   0
##   12348                    0   0.000000e+00           0   0
##   12349                    0   0.000000e+00           0   0
##   12350                    0   0.000000e+00           0   0
##   12352                    0   0.000000e+00           0   0
# Two-way contingency tables for StockCode and Country.
CountCountryStockCode <- table(EcommDataClean$StockCode, EcommDataClean$Country)
head(prop.table(CountCountryStockCode))
##         
##          Australia Austria Bahrain Belgium Brazil Canada Channel Islands
##   10002          0       0       0       0      0      0               0
##   10080          0       0       0       0      0      0               0
##   10120          0       0       0       0      0      0               0
##   10123C         0       0       0       0      0      0               0
##   10124A         0       0       0       0      0      0               0
##   10124G         0       0       0       0      0      0               0
##         
##          Cyprus Czech Republic Denmark         EIRE European Community
##   10002       0              0       0 2.513043e-06                  0
##   10080       0              0       0 0.000000e+00                  0
##   10120       0              0       0 0.000000e+00                  0
##   10123C      0              0       0 0.000000e+00                  0
##   10124A      0              0       0 0.000000e+00                  0
##   10124G      0              0       0 0.000000e+00                  0
##         
##          Finland       France      Germany Greece Iceland Israel Italy
##   10002        0 2.010434e-05 2.513043e-06      0       0      0     0
##   10080        0 0.000000e+00 0.000000e+00      0       0      0     0
##   10120        0 2.513043e-06 0.000000e+00      0       0      0     0
##   10123C       0 0.000000e+00 0.000000e+00      0       0      0     0
##   10124A       0 0.000000e+00 0.000000e+00      0       0      0     0
##   10124G       0 0.000000e+00 0.000000e+00      0       0      0     0
##         
##                 Japan Lebanon Lithuania Malta Netherlands Norway Poland
##   10002  2.513043e-06       0         0     0           0      0      0
##   10080  0.000000e+00       0         0     0           0      0      0
##   10120  0.000000e+00       0         0     0           0      0      0
##   10123C 0.000000e+00       0         0     0           0      0      0
##   10124A 0.000000e+00       0         0     0           0      0      0
##   10124G 0.000000e+00       0         0     0           0      0      0
##         
##          Portugal RSA Saudi Arabia Singapore        Spain Sweden
##   10002         0   0            0         0 2.513043e-06      0
##   10080         0   0            0         0 0.000000e+00      0
##   10120         0   0            0         0 0.000000e+00      0
##   10123C        0   0            0         0 0.000000e+00      0
##   10124A        0   0            0         0 0.000000e+00      0
##   10124G        0   0            0         0 0.000000e+00      0
##         
##           Switzerland United Arab Emirates United Kingdom Unspecified USA
##   10002  2.513043e-06                    0   9.046954e-05           0   0
##   10080  0.000000e+00                    0   5.277390e-05           0   0
##   10120  0.000000e+00                    0   7.287824e-05           0   0
##   10123C 0.000000e+00                    0   7.539128e-06           0   0
##   10124A 0.000000e+00                    0   1.256521e-05           0   0
##   10124G 0.000000e+00                    0   1.005217e-05           0   0
# Two-way contingency tables for Description and Country.
#CountCountryDescription <- table(EcommDataClean$Description, EcommDataClean$Country)
#prop.table(CountCountryDescription)

# Two-way contingency tables for InvoiceNo and Country.
#CountCountryInvoiceNo <- table(EcommDataClean$InvoiceNo, EcommDataClean$Country)
#prop.table(CountCountryInvoiceNo)
# Ques 5: Draw a  boxplot of the variables that belong to your study.

library(lattice)

# Effect of Total Price on Country.
boxplot(EcommDataClean$TotalPrice ~EcommDataClean$Country ,col = "Brown",
        xlab = "Country", ylab ="Total price", main = " Effect of Total Price on Country.")

# Effect of unit price on Country.
boxplot(EcommDataClean$UnitPrice ~EcommDataClean$Country ,col = "Cyan",
        xlab = "Country", ylab ="Unit price", main = "Effect of unit price on Country.")

# Effect of Quantity on Country.
boxplot(EcommDataClean$Quantity ~EcommDataClean$Country ,col = "yellow",
        xlab = "Country", ylab ="Quantity", main = "Effect of Quantity on Country.")

# Effect of Quantity on Month.
boxplot(EcommDataClean$Quantity ~EcommDataClean$DateMonth ,col = "grey",
        xlab = "Month", ylab ="Quantity", main = "Effect of Quantity on Month.")

# Effect of Month on Total Price.
boxplot(EcommDataClean$TotalPrice ~EcommDataClean$DateMonth ,col = "yellow",
        xlab = "Month", ylab ="Total Price", main = "Effect of Month on Total Price.")

# Effect of Year on Total Price.
boxplot(EcommDataClean$TotalPrice ~EcommDataClean$DateYear ,col = "Red",
        xlab = "year", ylab ="Total Price", main = "Effect of Year on Total Price.")

# Effect of Quantity on year.
boxplot(EcommDataClean$Quantity ~EcommDataClean$DateYear ,col = "grey",
        xlab = "year", ylab ="Quantity", main = "Effect of Quantity on year.")

# Effect of Quantity on Financial Quarter.
boxplot(EcommDataClean$Quantity ~EcommDataClean$Quarter ,col = "Pink",
        xlab = "Quarter", ylab ="Quantity", main = "Effect of Quantity on Financial Quarter.")

# Effect of Total Price on Financial Quarter.
boxplot(EcommDataClean$TotalPrice ~EcommDataClean$Quarter ,col = "Pink",
        xlab = "Quarter", ylab ="Total Price", main = "Effect of Total Price on Financial Quarter.")

# Ques 6: Draw Histograms for your suitable data fields.

# Histogram of Unit Price.
hist(EcommDataClean$UnitPrice, col ="pink")

# Histogram of Quantity.
hist(EcommDataClean$Quantity, col ="brown")

# Histogram of Total Price.
hist(EcommDataClean$TotalPrice, col ="green")

# Ques 7: Draw suitable plot for your data fields.

# Distribution of products according to country in log scale.
CountCountry <- table(EcommDataClean$Country )
barplot(CountCountry, main="Distribution of products according to country in log scale.",log='y',
        names.arg=rownames(CountCountry), cex.names=0.8, col="Red")

# Distribution of products according to Month in log scale.
CountDateMonth <- table(EcommDataClean$DateMonth )
barplot(CountDateMonth, main="Distribution of products according to Month in log scale.",log='y',
        names.arg=rownames(CountDateMonth), cex.names=0.8, col="Orange")

# Distribution of products according to Year in log scale.
CountDateYear <- table(EcommDataClean$DateYear )
barplot(CountDateYear, main="Distribution of products according to Year in log scale.",log='y',
        names.arg=rownames(CountDateYear), cex.names=0.8, col="Blue")

# Distribution of products according to Financial Quarter in log scale.
CountQuarter <- table(EcommDataClean$Quarter )
barplot(CountQuarter, main="Distribution of products according to Financial Quarter in log scale.",log='y', names.arg=rownames(CountQuarter), cex.names=0.8, col="Orange")

# Distribution of products according to Date in log scale.
CountDateDay <- table(EcommDataClean$DateDay )
barplot(CountDateDay, main="Distribution of products according to Date in log scale.",log='y',
        names.arg=rownames(CountDateDay), cex.names=0.8, col="Cyan")

# Distribution of products according to StockCode in log scale.
CountStockCode <- table(EcommDataClean$StockCode )
barplot(CountStockCode, main="Distribution of products according to StockCode in log scale.",log='y',names.arg=rownames(CountStockCode), cex.names=0.4, col="Blue")

# Distribution of products according to Customer ID in log scale.
CountCustomerID <- table(EcommDataClean$CustomerID )
barplot(CountCustomerID, main="Distribution of products according to Customer ID in log scale.",log='y', names.arg=rownames(CountCustomerID), cex.names=0.4, col="Green")

library(car)

# Plot of log of Total Price vs. log of Quantity.
plot(EcommDataClean$TotalPrice~EcommDataClean$Quantity,spread=FALSE, smoother.args=list(lty=2), pch=19, main="plot of log of Total Price vs. log of Quantity ", log="xy", col = "Orange",xlab="Quantity", ylab="Total Price")

# Plot of log of Total Price vs. log of Unit Price.
plot(EcommDataClean$TotalPrice~EcommDataClean$UnitPrice,spread=FALSE, smoother.args=list(lty=2), pch=19, main ="plot of log of Total Price vs. log of Unit Price ", log ="xy", col = "Pink", xlab ="Unit Price", ylab ="Total Price")

# Ques 8: Create a correlation matrix.

# Create a Variance-Covariance Matrix.
# creating a subset of numeric values.

Ecomm <- EcommDataClean[,c(4,6,9)]
covEcomm <-cov(Ecomm)
head(covEcomm)
##               Quantity UnitPrice TotalPrice
## Quantity   32551.45215 -18.18357  50675.227
## UnitPrice    -18.18357 488.26804    557.572
## TotalPrice 50675.22664 557.57198  95515.357
#Transform covariance to correlation matrix
covCorrEcomm <- cov2cor(covEcomm)
head(covCorrEcomm)
##               Quantity   UnitPrice TotalPrice
## Quantity    1.00000000 -0.00456105 0.90881171
## UnitPrice  -0.00456105  1.00000000 0.08164605
## TotalPrice  0.90881171  0.08164605 1.00000000
# Ques 9: Visualize your correlation matrix using corrgram.

library("corrgram") 

# Corrgram of Ecommerce variables.
corrgram(EcommDataClean, order=FALSE, lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="Corrgram of Ecommerce variables.")

# Ques 10: Create a scatter plot matrix for your data set.

library(car)

# As the dataset is large, taking a sample of it.
EcommSample <- EcommDataClean[sample(nrow(EcommDataClean), 3000), ]

# Effect of Quantity and Unit price on Total pricing.
scatterplotMatrix(formula = ~ EcommSample$TotalPrice + EcommSample$Quantity + EcommSample$UnitPrice, cex=0.6,
                  data=EcommSample, main = "Effect of Quantity and Unit price on Total pricing." )

# Scatterplot Matrix of the Ecommerce transactions.
pairs(~InvoiceNo + StockCode + CustomerID + Country+Quantity + UnitPrice + TotalPrice +DateDay + DateYear + DateMonth + Quarter, data= EcommSample, main=" Scatterplot Matrix of the Ecommerce transactions.")

# Ques 11: Run a suitable test to check your hypothesis for your suitable assumptions.

# Ques 12: Run a t-test to analyse your hypothesis.

# Null Hypothesis- "The total price does not depend on Quarter variable.

# T-Test to check correlation between TotalPrice and Quarter
t.test(EcommDataClean$TotalPrice ~ EcommDataClean$Quarter)
## 
##  Welch Two Sample t-test
## 
## data:  EcommDataClean$TotalPrice by EcommDataClean$Quarter
## t = 1.6395, df = 386090, p-value = 0.1011
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2967951  3.3332664
## sample estimates:
## mean in group 1 mean in group 2 
##        23.35408        21.83584
# As p-value=0.1011(>0.05).We can accept the null hypothesis for Quarter column. Therefore There is no strong correlation between Total price and Financial Quarter.
# Day 2:

# Task 1- Formulate a Regression Model. Fit Linear Regression Models using lm(). Articulate a Hypothesis (or two) that you could test using a Regression Model.Run T-Tests appropriate, to test your Hypotheses. Prepare a list of insights based on your Regression Analysis.

# The linear regression model we are trying to make has total_price as the dependent variable and the rest as indeoendent variables.

# Finding the correlation between the independent variables and the numeric dependent variables.

# correlation between Total Price and Quantity.
 cor(EcommDataClean$TotalPrice, EcommDataClean$Quantity)
## [1] 0.9088117
# [1] 0.9088117

# correlation between Total Price and Unit Price.
 cor(EcommDataClean$TotalPrice, EcommDataClean$UnitPrice)
## [1] 0.08164605
# [1] 0.08164605

# Create Training and Test data -
  set.seed(100)  # setting seed to reproduce results of random sampling
  
# Row indices for training data. Splitting the data into 80:20 ratio.
  trainingRowIndex <- sample(1:nrow(EcommDataClean), 0.8*nrow(EcommDataClean)) 
  
# Model training data.
  trainingData <- EcommDataClean[trainingRowIndex, ]  
  
# Test data.
  testData  <- EcommDataClean[-trainingRowIndex, ] 
  
# Developing a model on the training data of the Ecommerce data.
 EcommModel <- lm(TotalPrice ~
+Quantity
+InvoiceDate
+UnitPrice
+Quarter,  data=trainingData)
 
 # View the Coeffecients.
 EcommModel
## 
## Call:
## lm(formula = TotalPrice ~ +Quantity + InvoiceDate + UnitPrice + 
##     Quarter, data = trainingData)
## 
## Coefficients:
## (Intercept)     Quantity  InvoiceDate    UnitPrice     Quarter2  
##  -32.165212     1.562834     0.002004     1.202539     0.056894
 # Hence, the model is => Total_Price = 1.562834*Quantity + 0.002004* InvoiceDate + 1.202539* UnitPrice + 0.056894*Quarter2 -32.165212
 
 # Summary of the model.
  summary(EcommModel)
## 
## Call:
## lm(formula = TotalPrice ~ +Quantity + InvoiceDate + UnitPrice + 
##     Quarter, data = trainingData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38801     -2      0      5  41887 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) -32.165212  42.984323   -0.748    0.454    
## Quantity      1.562834   0.001221 1279.673   <2e-16 ***
## InvoiceDate   0.002004   0.002852    0.703    0.482    
## UnitPrice     1.202539   0.010375  115.905   <2e-16 ***
## Quarter2      0.056894   0.666664    0.085    0.932    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 138.2 on 318334 degrees of freedom
## Multiple R-squared:  0.8383, Adjusted R-squared:  0.8383 
## F-statistic: 4.125e+05 on 4 and 318334 DF,  p-value: < 2.2e-16
  # As, we can see through the P-value, only the variables Quantity and UnitPrice are significant that contribute to the linear regression model which can be tested through hypothesis.
  
  # Predicting the model.
  ecommTotalPricePred <- predict(EcommModel, testData)
  
  # View the predictions.
  head(ecommTotalPricePred)
##         2         6        11        13        18        27 
## 11.291815 10.163295  9.740541 14.850397  9.681813 39.855735
  # Diagnostics of prediction.
  actuals_preds <- data.frame(cbind(actuals=testData$TotalPrice, predicteds=ecommTotalPricePred)) 
  
  # make actuals_predicteds dataframe.
  correlation_accuracy <- cor(actuals_preds)  # 67.4%
  
  head(actuals_preds)
##    actuals predicteds
## 2    20.34  11.291815
## 6    15.30  10.163295
## 11   12.60   9.740541
## 13   30.00  14.850397
## 18   17.85   9.681813
## 27   90.00  39.855735
  # Null Hypothesis- "The total price does not depend on the Financial Quarter variable".

  # T-Test to check correlation between TotalPrice and Quarter.
  t.test(trainingData$TotalPrice ~ trainingData$Quarter)
## 
##  Welch Two Sample t-test
## 
## data:  trainingData$TotalPrice by trainingData$Quarter
## t = 1.2799, df = 309830, p-value = 0.2006
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.7819126  3.7252551
## sample estimates:
## mean in group 1 mean in group 2 
##        23.45549        21.98382
  # As p-value=0.2006(>0.05).We can accept the null hypothesis for Quarter column.Therefore, there is no correlation between Total price and Quarter which can be seen from the linear model too.
# Customer and Product Insights.

# 1. Finding the total price  and transactions of the purchases per customer.

OverallPricePerCustomer <-  aggregate(EcommDataClean$TotalPrice, by=list(CustomerID=EcommDataClean$CustomerID), FUN=sum)
  
TotalTransactionsPerCustomer <-  aggregate(EcommDataClean$TotalPrice, by=list(CustomerID=EcommDataClean$CustomerID), FUN=length)

colnames(OverallPricePerCustomer)[2] <- 'Total_price_per_customer'
colnames(TotalTransactionsPerCustomer)[2] <- 'Total_transactions_per_customer'

head(OverallPricePerCustomer)
##   CustomerID Total_price_per_customer
## 1      12346                 77183.60
## 2      12347                  4310.00
## 3      12348                  1797.24
## 4      12349                  1757.55
## 5      12350                   334.40
## 6      12352                  2506.04
head(TotalTransactionsPerCustomer)
##   CustomerID Total_transactions_per_customer
## 1      12346                               1
## 2      12347                             182
## 3      12348                              31
## 4      12349                              73
## 5      12350                              17
## 6      12352                              85
UniqueCustomerTransactionsDetails <- merge(OverallPricePerCustomer,TotalTransactionsPerCustomer,by="CustomerID")

head(UniqueCustomerTransactionsDetails)
##   CustomerID Total_price_per_customer Total_transactions_per_customer
## 1      12346                 77183.60                               1
## 2      12347                  4310.00                             182
## 3      12348                  1797.24                              31
## 4      12349                  1757.55                              73
## 5      12350                   334.40                              17
## 6      12352                  2506.04                              85
summary(UniqueCustomerTransactionsDetails)
##    CustomerID   Total_price_per_customer Total_transactions_per_customer
##  12346  :   1   Min.   :     0.0         Min.   :   1.00                
##  12347  :   1   1st Qu.:   307.2         1st Qu.:  17.00                
##  12348  :   1   Median :   674.5         Median :  41.00                
##  12349  :   1   Mean   :  2053.8         Mean   :  91.71                
##  12350  :   1   3rd Qu.:  1661.6         3rd Qu.: 100.00                
##  12352  :   1   Max.   :280206.0         Max.   :7847.00                
##  (Other):4333
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
describe(UniqueCustomerTransactionsDetails)
##                                 vars    n    mean      sd  median trimmed
## CustomerID*                        1 4339 2170.00 1252.71 2170.00 2170.00
## Total_price_per_customer           2 4339 2053.79 8988.25  674.45  978.95
## Total_transactions_per_customer    3 4339   91.71  228.79   41.00   57.88
##                                     mad min    max  range  skew kurtosis
## CustomerID*                     1608.62   1   4339   4338  0.00    -1.20
## Total_price_per_customer         693.81   0 280206 280206 19.31   477.38
## Total_transactions_per_customer   44.48   1   7847   7846 18.09   482.64
##                                     se
## CustomerID*                      19.02
## Total_price_per_customer        136.45
## Total_transactions_per_customer   3.47
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library("tabplot")
## Loading required package: bit
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
## 
## Attaching package: 'bit'
## The following object is masked from 'package:psych':
## 
##     keysort
## The following object is masked from 'package:base':
## 
##     xor
## Loading required package: ff
## Attaching package ff
## - getOption("fftempdir")=="C:/Users/GOWRI/AppData/Local/Temp/RtmpCWOxOa"
## - getOption("ffextension")=="ff"
## - getOption("ffdrop")==TRUE
## - getOption("fffinonexit")==TRUE
## - getOption("ffpagesize")==65536
## - getOption("ffcaching")=="mmnoflush"  -- consider "ffeachflush" if your system stalls on large writes
## - getOption("ffbatchbytes")==84955627.52 -- consider a different value for tuning your system
## - getOption("ffmaxbytes")==4247781376 -- consider a different value for tuning your system
## 
## Attaching package: 'ff'
## The following objects are masked from 'package:bit':
## 
##     clone, clone.default, clone.list
## The following objects are masked from 'package:utils':
## 
##     write.csv, write.csv2
## The following objects are masked from 'package:base':
## 
##     is.factor, is.ordered
## Loading required package: ffbase
## 
## Attaching package: 'ffbase'
## The following objects are masked from 'package:ff':
## 
##     [.ff, [.ffdf, [<-.ff, [<-.ffdf
## The following objects are masked from 'package:base':
## 
##     %in%, table
# Visualization between the relationship of the columns.
tableplot(UniqueCustomerTransactionsDetails)

# Concentrating on the Top 0.5% price customer.
tableplot(UniqueCustomerTransactionsDetails, sortCol = Total_price_per_customer, 
    from = 0, to = 0.5)

# Concentrating on the Top 2% transaction customer.
tableplot(UniqueCustomerTransactionsDetails, sortCol = Total_transactions_per_customer, 
    from = 0, to = 2)

# 2. Finding the total price and transactions of the purchases per country of customers.

OverallPricePerCustomerCountry <-  aggregate(EcommDataClean$TotalPrice, by=list(CustomerID=EcommDataClean$CustomerID, Country=EcommDataClean$Country), FUN=sum)
  
TotalTransactionsPerCustomerCountry <-  aggregate(EcommDataClean$TotalPrice, by=list(CustomerID=EcommDataClean$CustomerID, Country=EcommDataClean$Country), FUN=length)

colnames(OverallPricePerCustomerCountry)[3] <- 'Total_price_per_customer_country'
colnames(TotalTransactionsPerCustomerCountry)[3] <- 'Total_transactions_per_customer_country'

head(OverallPricePerCustomerCountry)
##   CustomerID   Country Total_price_per_customer_country
## 1      12386 Australia                           401.90
## 2      12388 Australia                          2780.66
## 3      12393 Australia                          1582.60
## 4      12415 Australia                        124914.53
## 5      12422 Australia                           386.20
## 6      12424 Australia                          1760.96
head(TotalTransactionsPerCustomerCountry)
##   CustomerID   Country Total_transactions_per_customer_country
## 1      12386 Australia                                      10
## 2      12388 Australia                                     100
## 3      12393 Australia                                      64
## 4      12415 Australia                                     716
## 5      12422 Australia                                      21
## 6      12424 Australia                                      30
UniqueCustomerCountryTransactionsDetails <- merge(OverallPricePerCustomerCountry,TotalTransactionsPerCustomerCountry,by=c("CustomerID","Country"))

head(UniqueCustomerCountryTransactionsDetails)
##   CustomerID        Country Total_price_per_customer_country
## 1      12346 United Kingdom                         77183.60
## 2      12347        Iceland                          4310.00
## 3      12348        Finland                          1797.24
## 4      12349          Italy                          1757.55
## 5      12350         Norway                           334.40
## 6      12352         Norway                          2506.04
##   Total_transactions_per_customer_country
## 1                                       1
## 2                                     182
## 3                                      31
## 4                                      73
## 5                                      17
## 6                                      85
summary(UniqueCustomerCountryTransactionsDetails)
##    CustomerID             Country     Total_price_per_customer_country
##  12370  :   2   United Kingdom:3921   Min.   :     0.0                
##  12394  :   2   Germany       :  94   1st Qu.:   307.4                
##  12417  :   2   France        :  87   Median :   672.0                
##  12422  :   2   Spain         :  30   Mean   :  2050.0                
##  12429  :   2   Belgium       :  25   3rd Qu.:  1661.2                
##  12431  :   2   Switzerland   :  21   Max.   :280206.0                
##  (Other):4335   (Other)       : 169                                   
##  Total_transactions_per_customer_country
##  Min.   :   1.00                        
##  1st Qu.:  17.00                        
##  Median :  41.00                        
##  Mean   :  91.54                        
##  3rd Qu.: 100.00                        
##  Max.   :7847.00                        
## 
library(psych)
describe(UniqueCustomerCountryTransactionsDetails)
##                                         vars    n    mean      sd  median
## CustomerID*                                1 4347 2166.12 1254.81 2166.00
## Country*                                   2 4347   33.26    5.99   35.00
## Total_price_per_customer_country           3 4347 2050.01 8980.10  671.96
## Total_transactions_per_customer_country    4 4347   91.54  228.60   41.00
##                                         trimmed     mad min    max  range
## CustomerID*                             2166.00 1611.59   1   4339   4338
## Country*                                  35.00    0.00   1     37     36
## Total_price_per_customer_country         976.48  690.52   0 280206 280206
## Total_transactions_per_customer_country   57.73   44.48   1   7847   7846
##                                          skew kurtosis     se
## CustomerID*                              0.00    -1.20  19.03
## Country*                                -3.51    11.40   0.09
## Total_price_per_customer_country        19.33   478.27 136.20
## Total_transactions_per_customer_country 18.11   483.46   3.47
library(ggplot2)
library("tabplot")

# Visualization between the relationship of the columns.

tableplot(UniqueCustomerCountryTransactionsDetails)

# taking summary
summary(tableplot(UniqueCustomerCountryTransactionsDetails))

##               general                                        
##  dataset          :UniqueCustomerCountryTransactionsDetails  
##  variables        :4                                         
##  sortCol          :1                                         
##  decreasing       :TRUE                                      
##  from             :0%                                        
##  to               :100%                                      
##  objects.sample   :4347                                      
##  objects.full.data:4347                                      
##  bins             :100                                       
##       variable1                variable2          
##  name      :CustomerID    name      :Country      
##  type      :categorical   type      :categorical  
##  categories:51            categories:38           
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##        variable3                               
##  name       :Total_price_per_customer_country  
##  type       :numeric                           
##  scale_init :auto                              
##  scale_final:log                               
##                                                
##                                                
##                                                
##                                                
##                                                
##        variable4                                      
##  name       :Total_transactions_per_customer_country  
##  type       :numeric                                  
##  scale_init :auto                                     
##  scale_final:log                                      
##                                                       
##                                                       
##                                                       
##                                                       
## 
# Concentrating on the Top 0.5% price customer per country.
tableplot(UniqueCustomerCountryTransactionsDetails, sortCol = Total_price_per_customer_country, 
    from = 0, to = 0.5)

# Concentrating on the Top 2% transaction customer per country.
tableplot(UniqueCustomerCountryTransactionsDetails, sortCol = Total_transactions_per_customer_country, 
    from = 0, to = 2)

# plotting on the world map.

library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
n <- joinCountryData2Map(UniqueCustomerCountryTransactionsDetails, joinCode="NAME", nameJoinColumn="Country")
## 4329 codes from your data successfully matched countries in the map
## 18 codes from your data failed to match with a country code in the map
## 211 codes from the map weren't represented in your data
# Total price per customer in respective countries.
mapCountryData(n, nameColumnToPlot="Total_price_per_customer_country", mapTitle="Total price per customer in respective countries.")

# Total transactions per customer in respective countries.
mapCountryData(n, nameColumnToPlot="Total_transactions_per_customer_country", mapTitle="Total transactions per customer in respective countries.", colourPalette="terrain")

# Distribution of customer in respective countries.
mapCountryData(n, nameColumnToPlot="CustomerID", mapTitle="Distribution of customer in respective countries.", catMethod="categorical")

# 3. Finding the total price  and transactions of the purchases per product(stock).

OverallPricePerStock <-  aggregate(EcommDataClean$TotalPrice, by=list(StockCode=EcommDataClean$StockCode), FUN=sum)
  
TotalTransactionsPerStock <-  aggregate(EcommDataClean$TotalPrice, by=list(StockCode=EcommDataClean$StockCode), FUN=length)

colnames(OverallPricePerStock)[2] <- 'Total_price_per_stock'
colnames(TotalTransactionsPerStock)[2] <- 'Total_transactions_per_stock'

head(OverallPricePerStock)
##   StockCode Total_price_per_stock
## 1     10002                699.55
## 2     10080                114.41
## 3     10120                 40.53
## 4    10123C                  3.25
## 5    10124A                  6.72
## 6    10124G                  7.14
head(TotalTransactionsPerStock)
##   StockCode Total_transactions_per_stock
## 1     10002                           49
## 2     10080                           21
## 3     10120                           30
## 4    10123C                            3
## 5    10124A                            5
## 6    10124G                            4
UniqueStockTransactionsDetails <- merge(OverallPricePerStock,TotalTransactionsPerStock,by="StockCode")

head(UniqueStockTransactionsDetails)
##   StockCode Total_price_per_stock Total_transactions_per_stock
## 1     10002                699.55                           49
## 2     10080                114.41                           21
## 3     10120                 40.53                           30
## 4    10123C                  3.25                            3
## 5    10124A                  6.72                            5
## 6    10124G                  7.14                            4
summary(UniqueStockTransactionsDetails)
##    StockCode    Total_price_per_stock Total_transactions_per_stock
##  10002  :   1   Min.   :     0.0      Min.   :   1.0              
##  10080  :   1   1st Qu.:   129.9      1st Qu.:  12.0              
##  10120  :   1   Median :   634.0      Median :  49.0              
##  10123C :   1   Mean   :  2431.5      Mean   : 108.6              
##  10124A :   1   3rd Qu.:  2075.4      3rd Qu.: 135.0              
##  10124G :   1   Max.   :168469.6      Max.   :2035.0              
##  (Other):3659
library(psych)
describe(UniqueStockTransactionsDetails)
##                              vars    n    mean      sd  median trimmed
## StockCode*                      1 3665 1833.00 1058.14 1833.00 1833.00
## Total_price_per_stock           2 3665 2431.49 6579.43  633.96 1173.55
## Total_transactions_per_stock    3 3665  108.57  163.77   49.00   73.60
##                                  mad min      max    range  skew kurtosis
## StockCode*                   1358.06   1   3665.0   3664.0  0.00    -1.20
## Total_price_per_stock         877.00   0 168469.6 168469.6 11.24   205.63
## Total_transactions_per_stock   63.75   1   2035.0   2034.0  3.55    19.61
##                                  se
## StockCode*                    17.48
## Total_price_per_stock        108.68
## Total_transactions_per_stock   2.71
library(ggplot2)
library("tabplot")

# Visualization between the relationship of the columns.
tableplot(UniqueStockTransactionsDetails)

# Concentrating on the Top 0.5% price stock.
tableplot(UniqueStockTransactionsDetails, sortCol = Total_price_per_stock, 
    from = 0, to = 0.5)

# Concentrating on the Top 2% transaction products(Stocks).
tableplot(UniqueStockTransactionsDetails, sortCol = Total_transactions_per_stock, 
    from = 0, to = 2)

# 4. Finding the total price and transactions of the purchases of stocks per country.

OverallPricePerStockCountry <-  aggregate(EcommDataClean$TotalPrice, by=list(StockCode=EcommDataClean$StockCode, Country=EcommDataClean$Country), FUN=sum)
  
TotalTransactionsPerStockCountry <-  aggregate(EcommDataClean$TotalPrice, by=list(StockCode=EcommDataClean$StockCode, Country=EcommDataClean$Country), FUN=length)

colnames(OverallPricePerStockCountry)[3] <- 'Total_price_per_stock_country'
colnames(TotalTransactionsPerStockCountry)[3] <- 'Total_transactions_per_stock_country'

head(OverallPricePerStockCountry)
##   StockCode   Country Total_price_per_stock_country
## 1     15036 Australia                        432.00
## 2   15056BL Australia                         17.85
## 3    16161P Australia                        136.00
## 4    16169E Australia                         10.50
## 5     20665 Australia                         17.70
## 6     20675 Australia                        228.96
head(TotalTransactionsPerStockCountry)
##   StockCode   Country Total_transactions_per_stock_country
## 1     15036 Australia                                    1
## 2   15056BL Australia                                    1
## 3    16161P Australia                                    1
## 4    16169E Australia                                    1
## 5     20665 Australia                                    1
## 6     20675 Australia                                    2
UniqueStockCountryTransactionsDetails <- merge(OverallPricePerStockCountry,TotalTransactionsPerStockCountry,by=c("StockCode","Country"))

head(UniqueStockCountryTransactionsDetails)
##   StockCode     Country Total_price_per_stock_country
## 1     10002        EIRE                         10.20
## 2     10002      France                        316.20
## 3     10002     Germany                          0.85
## 4     10002       Japan                          0.85
## 5     10002       Spain                         20.40
## 6     10002 Switzerland                         10.20
##   Total_transactions_per_stock_country
## 1                                    1
## 2                                    8
## 3                                    1
## 4                                    1
## 5                                    1
## 6                                    1
summary(UniqueStockCountryTransactionsDetails)
##    StockCode               Country     Total_price_per_stock_country
##  22423  :   29   United Kingdom:3645   Min.   :     0.0             
##  22960  :   25   EIRE          :1943   1st Qu.:    16.6             
##  22961  :   25   Germany       :1665   Median :    37.8             
##  23240  :   24   France        :1522   Mean   :   470.6             
##  22138  :   23   Spain         :1091   3rd Qu.:   153.1             
##  23245  :   23   Switzerland   : 947   Max.   :168469.6             
##  (Other):18788   (Other)       :8124                                
##  Total_transactions_per_stock_country
##  Min.   :   1.00                     
##  1st Qu.:   1.00                     
##  Median :   2.00                     
##  Mean   :  21.01                     
##  3rd Qu.:   5.00                     
##  Max.   :1947.00                     
## 
library(psych)
describe(UniqueStockCountryTransactionsDetails)
##                                      vars     n    mean      sd median
## StockCode*                              1 18937 1615.24  881.41 1579.0
## Country*                                2 18937   20.91   11.14   19.0
## Total_price_per_stock_country           3 18937  470.58 2656.60   37.8
## Total_transactions_per_stock_country    4 18937   21.01   74.74    2.0
##                                      trimmed    mad min      max    range
## StockCode*                           1587.47 904.39   1   3665.0   3664.0
## Country*                               21.37  13.34   1     37.0     36.0
## Total_price_per_stock_country          97.84  40.92   0 168469.6 168469.6
## Total_transactions_per_stock_country    4.31   1.48   1   1947.0   1946.0
##                                       skew kurtosis    se
## StockCode*                            0.22    -0.68  6.41
## Country*                             -0.06    -1.38  0.08
## Total_price_per_stock_country        26.83  1223.98 19.31
## Total_transactions_per_stock_country  7.88    94.06  0.54
library(ggplot2)
library("tabplot")

# Visualization between the relationship of the columns.

tableplot(UniqueStockCountryTransactionsDetails)

# taking summary
summary(tableplot(UniqueStockCountryTransactionsDetails))

##               general                                     
##  dataset          :UniqueStockCountryTransactionsDetails  
##  variables        :4                                      
##  sortCol          :1                                      
##  decreasing       :TRUE                                   
##  from             :0%                                     
##  to               :100%                                   
##  objects.sample   :18937                                  
##  objects.full.data:18937                                  
##  bins             :100                                    
##       variable1                variable2          
##  name      :StockCode     name      :Country      
##  type      :categorical   type      :categorical  
##  categories:51            categories:38           
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##        variable3                            
##  name       :Total_price_per_stock_country  
##  type       :numeric                        
##  scale_init :auto                           
##  scale_final:lin                            
##                                             
##                                             
##                                             
##                                             
##                                             
##        variable4                                   
##  name       :Total_transactions_per_stock_country  
##  type       :numeric                               
##  scale_init :auto                                  
##  scale_final:lin                                   
##                                                    
##                                                    
##                                                    
##                                                    
## 
# Concentrating on the Top 0.5% price stock per country.
tableplot(UniqueStockCountryTransactionsDetails, sortCol = Total_price_per_stock_country, 
    from = 0, to = 0.5)

# Concentrating on the Top 2% transaction stock per country.
tableplot(UniqueStockCountryTransactionsDetails, sortCol = Total_transactions_per_stock_country, 
    from = 0, to = 2)

# plotting on the world map.

library(rworldmap)
n1 <- joinCountryData2Map(UniqueStockCountryTransactionsDetails, joinCode="NAME", nameJoinColumn="Country")
## 16243 codes from your data successfully matched countries in the map
## 2694 codes from your data failed to match with a country code in the map
## 211 codes from the map weren't represented in your data
# Total price per stock in respective countries.
mapCountryData(n1, nameColumnToPlot="Total_price_per_stock_country", mapTitle="Total price per stock in respective countries.", colourPalette="terrain")

# Total transactions per stock in respective countries.
mapCountryData(n1, nameColumnToPlot="Total_transactions_per_stock_country", mapTitle="Total transactions per stock in respective countries.", catMethod="fixedWidth")

# Distribution of stocks in respective countries.
mapCountryData(n1, nameColumnToPlot="StockCode", mapTitle="Distribution of stocks in respective countries.", catMethod="categorical")

Kmeans function gives below statistics as standard output.

Cluster: A vector of integers (from 1:k) indicating the cluster to which each point is allocated. Centers: A matrix of cluster centres. Totss: The total sum of squares. Withinss: Vector of within-cluster sum of squares, one component per cluster. tot.withinss: Total within-cluster sum of squares, i.e., sum(withinss). Betweenss: The between-cluster sum of squares, i.e. totss-tot.withinss. Size: The number of points in each cluster. Iter: The number of (outer) iterations. Ifault: Integer: indicator of a possible algorithm problem Â- for experts.

# Customer Segmentations using K-means clusters in the dataset on the basis of unit_price, Quantity and total_price.

Ecommkm <- kmeans(EcommDataClean[,c(4,6,9)],
                   centers= 8)

Ecommkm$centers
##       Quantity  UnitPrice   TotalPrice
## 1 74215.000000   1.040000  77183.60000
## 2     8.063907   2.866211     13.55322
## 3    70.812403   4.873887    125.76849
## 4  1313.175676 388.252297   3301.01649
## 5   521.045845  72.582722   1275.25017
## 6   286.969255  10.036363    404.16013
## 7 80995.000000   2.080000 168469.60000
## 8    60.000000 649.500000  38970.00000
# Visualization

## Plots on clustering variables
TotalPriceClus <- aggregate(EcommDataClean$TotalPrice, by=list(Ecommkm$cluster), mean)
names(TotalPriceClus) <- c("Cluster","TotalPrice") 

TotalPriceClus
##   Cluster   TotalPrice
## 1       1  77183.60000
## 2       2     13.55322
## 3       3    125.76849
## 4       4   3301.01649
## 5       5   1275.25017
## 6       6    404.16013
## 7       7 168469.60000
## 8       8  38970.00000
## Bar Plot: TotalPrice
cp <-barplot(height=TotalPriceClus$TotalPrice ,
        names.arg=TotalPriceClus$Cluster,
        xlab="Cluster",
        ylab="Avg Total Price",
        main="Total Price by Clusters",
        ylim= c(0,max(TotalPriceClus$TotalPrice)+10 ), 
        border=NA 
)

## Plots on clustering variables
QuantityClus <- aggregate(EcommDataClean$Quantity, by=list(Ecommkm$cluster), mean)
names(QuantityClus) <- c("Cluster","Quantity") 

QuantityClus
##   Cluster     Quantity
## 1       1 74215.000000
## 2       2     8.063907
## 3       3    70.812403
## 4       4  1313.175676
## 5       5   521.045845
## 6       6   286.969255
## 7       7 80995.000000
## 8       8    60.000000
## Bar Plot: Quantity
cp1 <-barplot(height=QuantityClus$Quantity ,
        names.arg=QuantityClus$Cluster,
        xlab="Cluster",
        ylab="Avg Total Quantity",
        main="Total Quantity by Clusters",
        ylim= c(0,max(QuantityClus$Quantity)+10 ), 
        border=NA 
)

## Plots on clustering variables
UnitPriceClus <- aggregate(EcommDataClean$UnitPrice, by=list(Ecommkm$cluster), mean)
names(UnitPriceClus) <- c("Cluster","Unit_Price") 

UnitPriceClus
##   Cluster Unit_Price
## 1       1   1.040000
## 2       2   2.866211
## 3       3   4.873887
## 4       4 388.252297
## 5       5  72.582722
## 6       6  10.036363
## 7       7   2.080000
## 8       8 649.500000
## Bar Plot: UnitPrice
cp2 <-barplot(height=UnitPriceClus$Unit_Price,
        names.arg=UnitPriceClus$Cluster,
        xlab="Cluster",
        ylab="Avg Total Unit Price",
        main="Total Unit price by Clusters",
        ylim= c(0,max(UnitPriceClus$Unit_Price)+10 ), 
        border=NA 
)

RFM stands for the three dimensions:

Recency - How recently did the customer purchase? Frequency - How often do they purchase? Monetary Value - How much do they spend?

library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:bit':
## 
##     setattr
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
#library(stringr)
#library(DT)
library(tidyr)
library(knitr)
library(rmarkdown)

# RFM analysis.

EcommRFMData  <- EcommDataClean[,c(1:9)]
glimpse(EcommRFMData)
## Observations: 397,924
## Variables: 9
## $ InvoiceNo   <fct> 536365, 536365, 536365, 536365, 536365, 536365, 53...
## $ StockCode   <fct> 85123A, 71053, 84406B, 84029G, 84029E, 22752, 2173...
## $ Description <fct> WHITE HANGING HEART T-LIGHT HOLDER, WHITE METAL LA...
## $ Quantity    <int> 6, 6, 8, 6, 6, 2, 6, 6, 6, 32, 6, 6, 8, 6, 6, 3, 2...
## $ InvoiceDate <date> 2010-12-01, 2010-12-01, 2010-12-01, 2010-12-01, 2...
## $ UnitPrice   <dbl> 2.55, 3.39, 2.75, 3.39, 3.39, 7.65, 4.25, 1.85, 1....
## $ CustomerID  <fct> 17850, 17850, 17850, 17850, 17850, 17850, 17850, 1...
## $ Country     <fct> United Kingdom, United Kingdom, United Kingdom, Un...
## $ TotalPrice  <dbl> 15.30, 20.34, 22.00, 20.34, 20.34, 15.30, 25.50, 1...
Ecomm_RFM <- EcommRFMData %>% 
  group_by(CustomerID) %>% 
  summarise(recency=as.numeric(as.Date("2012-01-01")-max(InvoiceDate)),
            frequency=n_distinct(InvoiceNo), monitery= sum(TotalPrice)/n_distinct(InvoiceNo)) 

summary(Ecomm_RFM)
##    CustomerID      recency        frequency          monitery      
##  12346  :   1   Min.   : 23.0   Min.   :  1.000   Min.   :    0.0  
##  12347  :   1   1st Qu.: 40.0   1st Qu.:  1.000   1st Qu.:  178.5  
##  12348  :   1   Median : 73.0   Median :  2.000   Median :  293.7  
##  12349  :   1   Mean   :115.0   Mean   :  4.272   Mean   :  419.1  
##  12350  :   1   3rd Qu.:164.5   3rd Qu.:  5.000   3rd Qu.:  430.0  
##  12352  :   1   Max.   :396.0   Max.   :210.000   Max.   :84236.2  
##  (Other):4333
kable(head(Ecomm_RFM))
CustomerID recency frequency monitery
12346 348 1 77183.6000
12347 25 7 615.7143
12348 98 4 449.3100
12349 41 1 1757.5500
12350 333 1 334.4000
12352 59 8 313.2550
# Recency - How recently did the customer purchase?

hist(Ecomm_RFM$recency)

# Frequency - How often do they purchase?

hist(Ecomm_RFM$frequency, breaks = 30)

# As the data is skewed.
Ecomm_RFM$frequency <- log(Ecomm_RFM$frequency)
hist(Ecomm_RFM$frequency)

# Monetary Value - How much do they spend?
hist(Ecomm_RFM$monitery, breaks = 100)

# As data is skewed.
Ecomm_RFM$monitery <- log(Ecomm_RFM$monitery)
hist(Ecomm_RFM$monitery)

# Reference: https://www.kaggle.com/hendraherviawan/customer-segmentation-using-rfm-analysis-r
# Time series analysis of the product purchases by the customers on the ecommerce sites.

#Total purchase price with time.
library(ggplot2)
ggplot(data = EcommDataClean, aes(x = InvoiceDate, y = TotalPrice)) +
      geom_bar(stat = "identity", fill = "purple") +
      labs(title = "Total purchase price with time.",
           x = "Date", y = "Total Price")

# Analysis of purchases by month.
OverallPricePerMonth <-  aggregate(EcommDataClean$TotalPrice, by=list(DateMonth=EcommDataClean$DateMonth), FUN=sum)
  
TotalTransactionsPerMonth <-  aggregate(EcommDataClean$TotalPrice, by=list(DateMonth=EcommDataClean$DateMonth), FUN=length)

colnames(OverallPricePerMonth)[2] <- 'Total_price_per_month'
colnames(TotalTransactionsPerMonth)[2] <- 'Total_transactions_per_month'

head(OverallPricePerMonth)
##   DateMonth Total_price_per_month
## 1         1              569445.0
## 2         2              447137.3
## 3         3              595500.8
## 4         4              469200.4
## 5         5              678594.6
## 6         6              661213.7
head(TotalTransactionsPerMonth)
##   DateMonth Total_transactions_per_month
## 1         1                        21232
## 2         2                        19928
## 3         3                        27177
## 4         4                        22644
## 5         5                        28322
## 6         6                        27185
UniqueMonthTransactionsDetails <- merge(OverallPricePerMonth,TotalTransactionsPerMonth,by="DateMonth")

head(UniqueMonthTransactionsDetails)
##   DateMonth Total_price_per_month Total_transactions_per_month
## 1         1              569445.0                        21232
## 2        10             1039318.8                        49557
## 3        11             1161817.4                        64545
## 4        12             1090906.7                        43464
## 5         2              447137.3                        19928
## 6         3              595500.8                        27177
summary(UniqueMonthTransactionsDetails)
##    DateMonth Total_price_per_month Total_transactions_per_month
##  1      :1   Min.   : 447137       Min.   :19928               
##  2      :1   1st Qu.: 588987       1st Qu.:25781               
##  3      :1   Median : 653279       Median :27181               
##  4      :1   Mean   : 742617       Mean   :33160               
##  5      :1   3rd Qu.: 974459       3rd Qu.:40889               
##  6      :1   Max.   :1161817       Max.   :64545               
##  (Other):6
library(psych)
describe(UniqueMonthTransactionsDetails)
##                              vars  n      mean        sd   median  trimmed
## DateMonth*                      1 12      6.50      3.61      6.5      6.5
## Total_price_per_month           2 12 742617.33 249263.88 653278.8 730245.3
## Total_transactions_per_month    3 12  33160.33  13510.92  27181.0  31345.1
##                                    mad      min     max  range skew
## DateMonth*                        4.45      1.0      12     11 0.00
## Total_price_per_month        198603.31 447137.3 1161817 714680 0.50
## Total_transactions_per_month   7773.27  19928.0   64545  44617 1.03
##                              kurtosis       se
## DateMonth*                      -1.50     1.04
## Total_price_per_month           -1.48 71956.29
## Total_transactions_per_month    -0.19  3900.27
barplot(UniqueMonthTransactionsDetails$Total_price_per_month, names.arg=rownames(CountDateMonth), cex.names=0.8,col = "Red", xlab = "Month", ylab ="Total price per Month ", main = "Effect of Month on Total Price")

barplot(UniqueMonthTransactionsDetails$Total_transactions_per_month, main="Transactions of products according to Month.", xlab = "Month", ylab ="Transactions per Month ",
        names.arg=rownames(CountDateMonth), cex.names=0.8, col="Orange")

# Analysis of purchases by year.
OverallPricePerYear <-  aggregate(EcommDataClean$TotalPrice, by=list(DateYear=EcommDataClean$DateYear), FUN=sum)
  
TotalTransactionsPerYear <-  aggregate(EcommDataClean$TotalPrice, by=list(DateYear=EcommDataClean$DateYear), FUN=length)

colnames(OverallPricePerYear)[2] <- 'Total_price_per_year'
colnames(TotalTransactionsPerYear)[2] <- 'Total_transactions_per_year'

head(OverallPricePerYear)
##   DateYear Total_price_per_year
## 1     2010             572713.9
## 2     2011            8338694.0
head(TotalTransactionsPerYear)
##   DateYear Total_transactions_per_year
## 1     2010                       26160
## 2     2011                      371764
UniqueYearTransactionsDetails <- merge(OverallPricePerYear,TotalTransactionsPerYear,by="DateYear")

head(UniqueYearTransactionsDetails)
##   DateYear Total_price_per_year Total_transactions_per_year
## 1     2010             572713.9                       26160
## 2     2011            8338694.0                      371764
summary(UniqueYearTransactionsDetails)
##  DateYear Total_price_per_year Total_transactions_per_year
##  2010:1   Min.   : 572714      Min.   : 26160             
##  2011:1   1st Qu.:2514209      1st Qu.:112561             
##           Median :4455704      Median :198962             
##           Mean   :4455704      Mean   :198962             
##           3rd Qu.:6397199      3rd Qu.:285363             
##           Max.   :8338694      Max.   :371764
library(psych)
describe(UniqueYearTransactionsDetails)
##                             vars n      mean         sd    median
## DateYear*                      1 2       1.5       0.71       1.5
## Total_price_per_year           2 2 4455704.0 5491377.21 4455704.0
## Total_transactions_per_year    3 2  198962.0  244378.93  198962.0
##                               trimmed        mad      min     max   range
## DateYear*                         1.5       0.74      1.0       2       1
## Total_price_per_year        4455704.0 5756921.07 572713.9 8338694 7765980
## Total_transactions_per_year  198962.0  256196.25  26160.0  371764  345604
##                             skew kurtosis        se
## DateYear*                      0    -2.75       0.5
## Total_price_per_year           0    -2.75 3882990.1
## Total_transactions_per_year    0    -2.75  172802.0
barplot(UniqueYearTransactionsDetails$Total_price_per_year, names.arg=rownames(CountDateYear), cex.names=0.8,col = "Green", xlab = "year", ylab ="Total price per year ", main = " Effect of Year on Total Price" )

barplot(UniqueYearTransactionsDetails$Total_transactions_per_year, main=" Transactions of products according to year.", xlab = "Year", ylab ="Transactions per Year ",  names.arg = rownames(CountDateYear), cex.names=0.8, col="blue")

# Calculating the days as Customer.

EarliestTransactionByCustomer <-  aggregate(EcommDataClean$InvoiceDate, by=list(CustomerID=EcommDataClean$CustomerID), FUN=min)
  
LatestTransactionByCustomer <-  aggregate(EcommDataClean$InvoiceDate, by=list(CustomerID=EcommDataClean$CustomerID), FUN=max)

colnames(EarliestTransactionByCustomer)[2] <- 'Earliest_invoice'
colnames(LatestTransactionByCustomer)[2] <- 'Latest_invoice'

head(EarliestTransactionByCustomer)
##   CustomerID Earliest_invoice
## 1      12346       2011-01-18
## 2      12347       2010-12-07
## 3      12348       2010-12-16
## 4      12349       2011-11-21
## 5      12350       2011-02-02
## 6      12352       2011-02-16
head(LatestTransactionByCustomer)
##   CustomerID Latest_invoice
## 1      12346     2011-01-18
## 2      12347     2011-12-07
## 3      12348     2011-09-25
## 4      12349     2011-11-21
## 5      12350     2011-02-02
## 6      12352     2011-11-03
InvoiceTransactionsDetails <- merge(EarliestTransactionByCustomer,LatestTransactionByCustomer,by="CustomerID")

head(InvoiceTransactionsDetails)
##   CustomerID Earliest_invoice Latest_invoice
## 1      12346       2011-01-18     2011-01-18
## 2      12347       2010-12-07     2011-12-07
## 3      12348       2010-12-16     2011-09-25
## 4      12349       2011-11-21     2011-11-21
## 5      12350       2011-02-02     2011-02-02
## 6      12352       2011-02-16     2011-11-03
summary(InvoiceTransactionsDetails)
##    CustomerID   Earliest_invoice     Latest_invoice      
##  12346  :   1   Min.   :2010-12-01   Min.   :2010-12-01  
##  12347  :   1   1st Qu.:2011-01-17   1st Qu.:2011-07-20  
##  12348  :   1   Median :2011-04-05   Median :2011-10-20  
##  12349  :   1   Mean   :2011-04-30   Mean   :2011-09-07  
##  12350  :   1   3rd Qu.:2011-08-19   3rd Qu.:2011-11-22  
##  12352  :   1   Max.   :2011-12-09   Max.   :2011-12-09  
##  (Other):4333
library(psych)
describe(InvoiceTransactionsDetails)
##                   vars    n mean      sd median trimmed     mad min  max
## CustomerID*          1 4339 2170 1252.71   2170    2170 1608.62   1 4339
## Earliest_invoice*    2 4339  NaN      NA     NA     NaN      NA Inf -Inf
## Latest_invoice*      3 4339  NaN      NA     NA     NaN      NA Inf -Inf
##                   range skew kurtosis    se
## CustomerID*        4338    0     -1.2 19.02
## Earliest_invoice*  -Inf   NA       NA    NA
## Latest_invoice*    -Inf   NA       NA    NA
InvoiceTransactionsDetails['Days_as_customer'] <- (InvoiceTransactionsDetails$Latest_invoice - InvoiceTransactionsDetails$Earliest_invoice) +1

library(ggplot2)
library("tabplot")

# Visualization between the relationship of the columns.
tableplot(InvoiceTransactionsDetails)

# Plotting the days as customer.
DaysCustomer <- as.numeric(InvoiceTransactionsDetails$Days_as_customer)
hist(DaysCustomer)

# Market Basket Analysis using Apriori Algorithm.

# preparing data for Apriori Algorithm.

if(sessionInfo()['basePkgs']=="dplyr" | sessionInfo()['otherPkgs']=="dplyr"){
  detach(package:dplyr, unload=TRUE)
}

library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:corrgram':
## 
##     baseball
df_Ecommbasket <- EcommDataClean[,c(3,5,7)]
head(df_Ecommbasket)
##                           Description InvoiceDate CustomerID
## 1  WHITE HANGING HEART T-LIGHT HOLDER  2010-12-01      17850
## 2                 WHITE METAL LANTERN  2010-12-01      17850
## 3      CREAM CUPID HEARTS COAT HANGER  2010-12-01      17850
## 4 KNITTED UNION FLAG HOT WATER BOTTLE  2010-12-01      17850
## 5      RED WOOLLY HOTTIE WHITE HEART.  2010-12-01      17850
## 6        SET 7 BABUSHKA NESTING BOXES  2010-12-01      17850
df_Ecommbasket$CustomerID <- as.numeric(df_Ecommbasket$CustomerID)

itemList <- ddply(df_Ecommbasket,c("CustomerID","InvoiceDate"), 
                       function(df1)paste(df1$Description, 
                       collapse = ","))
colnames(itemList)[3] <- 'Item_List'
head(itemList)
##   CustomerID InvoiceDate
## 1          1  2011-01-18
## 2          2  2010-12-07
## 3          2  2011-01-26
## 4          2  2011-04-07
## 5          2  2011-06-09
## 6          2  2011-08-02
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          Item_List
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   MEDIUM CERAMIC TOP STORAGE JAR
## 2 BLACK CANDELABRA T-LIGHT HOLDER,AIRLINE BAG VINTAGE JET SET BROWN,COLOUR GLASS. STAR T-LIGHT HOLDER,MINI PAINT SET VINTAGE ,CLEAR DRAWER KNOB ACRYLIC EDWARDIAN,PINK DRAWER KNOB ACRYLIC EDWARDIAN,GREEN DRAWER KNOB ACRYLIC EDWARDIAN,RED DRAWER KNOB ACRYLIC EDWARDIAN,PURPLE DRAWERKNOB ACRYLIC EDWARDIAN,BLUE DRAWER KNOB ACRYLIC EDWARDIAN,ALARM CLOCK BAKELIKE CHOCOLATE,ALARM CLOCK BAKELIKE GREEN,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE ORANGE,FOUR HOOK  WHITE LOVEBIRDS,BLACK GRAND BAROQUE PHOTO FRAME,BATHROOM METAL SIGN ,LARGE HEART MEASURING SPOONS,BOX OF 6 ASSORTED COLOUR TEASPOONS,BLUE 3 PIECE POLKADOT CUTLERY SET,RED 3 PIECE RETROSPOT CUTLERY SET,PINK 3 PIECE POLKADOT CUTLERY SET,EMERGENCY FIRST AID TIN ,SET OF 2 TINS VINTAGE BATHROOM ,SET/3 DECOUPAGE STACKING TINS,BOOM BOX SPEAKER BOYS,RED TOADSTOOL LED NIGHT LIGHT,3D DOG PICTURE PLAYING CARDS,BLACK EAR MUFF HEADPHONES,CAMOUFLAGE EAR MUFF HEADPHONES
## 3                                                                                                                  PINK NEW BAROQUECANDLESTICK CANDLE,BLUE NEW BAROQUE CANDLESTICK CANDLE,BLACK CANDELABRA T-LIGHT HOLDER,WOODLAND CHARLOTTE BAG,AIRLINE BAG VINTAGE JET SET BROWN,AIRLINE BAG VINTAGE JET SET WHITE,SANDWICH BATH SPONGE,ALARM CLOCK BAKELIKE CHOCOLATE,ALARM CLOCK BAKELIKE GREEN,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE ORANGE,SMALL HEART MEASURING SPOONS,72 SWEETHEART FAIRY CAKE CASES,60 TEATIME FAIRY CAKE CASES,PACK OF 60 MUSHROOM CAKE CASES,PACK OF 60 SPACEBOY CAKE CASES,TEA TIME OVEN GLOVE,RED RETROSPOT OVEN GLOVE ,RED RETROSPOT OVEN GLOVE DOUBLE,SET/2 RED RETROSPOT TEA TOWELS ,REGENCY CAKESTAND 3 TIER,BOX OF 6 ASSORTED COLOUR TEASPOONS,MINI LADLE LOVE HEART RED ,CHOCOLATE CALCULATOR,TOOTHPASTE TUBE PEN,SET OF 2 TINS VINTAGE BATHROOM ,RED TOADSTOOL LED NIGHT LIGHT,3D DOG PICTURE PLAYING CARDS
## 4                                                                                                                                                                                                                                                               AIRLINE BAG VINTAGE JET SET WHITE,AIRLINE BAG VINTAGE JET SET RED,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE JET SET BROWN,RED RETROSPOT PURSE ,ICE CREAM SUNDAE LIP GLOSS,VINTAGE HEADS AND TAILS CARD GAME ,HOLIDAY FUN LUDO,TREASURE ISLAND BOOK BOX,WATERING CAN PINK BUNNY,RED DRAWER KNOB ACRYLIC EDWARDIAN,LARGE HEART MEASURING SPOONS,SMALL HEART MEASURING SPOONS,PACK OF 60 DINOSAUR CAKE CASES,RED RETROSPOT OVEN GLOVE DOUBLE,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER ,RED TOADSTOOL LED NIGHT LIGHT,MINI PAINT SET VINTAGE ,3D SHEET OF DOG STICKERS,3D SHEET OF CAT STICKERS,SMALL FOLDING SCISSOR(POINTED EDGE),GIFT BAG PSYCHEDELIC APPLES,SET OF 2 TINS VINTAGE BATHROOM 
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       RABBIT NIGHT LIGHT,REGENCY TEA STRAINER,REGENCY TEA PLATE GREEN ,REGENCY TEA PLATE PINK,REGENCY TEA PLATE ROSES ,REGENCY TEAPOT ROSES ,REGENCY SUGAR BOWL GREEN,REGENCY MILK JUG PINK ,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE JET SET BROWN,VICTORIAN SEWING KIT,NAMASTE SWAGAT INCENSE,TRIPLE HOOK ANTIQUE IVORY ROSE,SMALL HEART MEASURING SPOONS,3D DOG PICTURE PLAYING CARDS,FEATHER PEN,COAL BLACK,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE CHOCOLATE
## 6                                                                                                                                                                                                                                                                                                            SET OF 60 VINTAGE LEAF CAKE CASES ,SET 40 HEART SHAPE PETIT FOUR CASES,AIRLINE BAG VINTAGE JET SET BROWN,AIRLINE BAG VINTAGE JET SET RED,AIRLINE BAG VINTAGE JET SET WHITE,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE WORLD CHAMPION ,WOODLAND DESIGN  COTTON TOTE BAG,WOODLAND CHARLOTTE BAG,ALARM CLOCK BAKELIKE RED ,TRIPLE HOOK ANTIQUE IVORY ROSE,SINGLE ANTIQUE ROSE HOOK IVORY,TEA TIME OVEN GLOVE,72 SWEETHEART FAIRY CAKE CASES,60 TEATIME FAIRY CAKE CASES,PACK OF 60 DINOSAUR CAKE CASES,REGENCY CAKESTAND 3 TIER,REGENCY MILK JUG PINK ,3D DOG PICTURE PLAYING CARDS,REVOLVER WOODEN RULER ,VINTAGE HEADS AND TAILS CARD GAME ,RED REFECTORY CLOCK
itemList$CustomerID <- NULL
itemList$InvoiceDate <- NULL

#Rename column headers for ease of use
colnames(itemList) <- c("itemList")
head(itemList)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           itemList
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   MEDIUM CERAMIC TOP STORAGE JAR
## 2 BLACK CANDELABRA T-LIGHT HOLDER,AIRLINE BAG VINTAGE JET SET BROWN,COLOUR GLASS. STAR T-LIGHT HOLDER,MINI PAINT SET VINTAGE ,CLEAR DRAWER KNOB ACRYLIC EDWARDIAN,PINK DRAWER KNOB ACRYLIC EDWARDIAN,GREEN DRAWER KNOB ACRYLIC EDWARDIAN,RED DRAWER KNOB ACRYLIC EDWARDIAN,PURPLE DRAWERKNOB ACRYLIC EDWARDIAN,BLUE DRAWER KNOB ACRYLIC EDWARDIAN,ALARM CLOCK BAKELIKE CHOCOLATE,ALARM CLOCK BAKELIKE GREEN,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE ORANGE,FOUR HOOK  WHITE LOVEBIRDS,BLACK GRAND BAROQUE PHOTO FRAME,BATHROOM METAL SIGN ,LARGE HEART MEASURING SPOONS,BOX OF 6 ASSORTED COLOUR TEASPOONS,BLUE 3 PIECE POLKADOT CUTLERY SET,RED 3 PIECE RETROSPOT CUTLERY SET,PINK 3 PIECE POLKADOT CUTLERY SET,EMERGENCY FIRST AID TIN ,SET OF 2 TINS VINTAGE BATHROOM ,SET/3 DECOUPAGE STACKING TINS,BOOM BOX SPEAKER BOYS,RED TOADSTOOL LED NIGHT LIGHT,3D DOG PICTURE PLAYING CARDS,BLACK EAR MUFF HEADPHONES,CAMOUFLAGE EAR MUFF HEADPHONES
## 3                                                                                                                  PINK NEW BAROQUECANDLESTICK CANDLE,BLUE NEW BAROQUE CANDLESTICK CANDLE,BLACK CANDELABRA T-LIGHT HOLDER,WOODLAND CHARLOTTE BAG,AIRLINE BAG VINTAGE JET SET BROWN,AIRLINE BAG VINTAGE JET SET WHITE,SANDWICH BATH SPONGE,ALARM CLOCK BAKELIKE CHOCOLATE,ALARM CLOCK BAKELIKE GREEN,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE ORANGE,SMALL HEART MEASURING SPOONS,72 SWEETHEART FAIRY CAKE CASES,60 TEATIME FAIRY CAKE CASES,PACK OF 60 MUSHROOM CAKE CASES,PACK OF 60 SPACEBOY CAKE CASES,TEA TIME OVEN GLOVE,RED RETROSPOT OVEN GLOVE ,RED RETROSPOT OVEN GLOVE DOUBLE,SET/2 RED RETROSPOT TEA TOWELS ,REGENCY CAKESTAND 3 TIER,BOX OF 6 ASSORTED COLOUR TEASPOONS,MINI LADLE LOVE HEART RED ,CHOCOLATE CALCULATOR,TOOTHPASTE TUBE PEN,SET OF 2 TINS VINTAGE BATHROOM ,RED TOADSTOOL LED NIGHT LIGHT,3D DOG PICTURE PLAYING CARDS
## 4                                                                                                                                                                                                                                                               AIRLINE BAG VINTAGE JET SET WHITE,AIRLINE BAG VINTAGE JET SET RED,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE JET SET BROWN,RED RETROSPOT PURSE ,ICE CREAM SUNDAE LIP GLOSS,VINTAGE HEADS AND TAILS CARD GAME ,HOLIDAY FUN LUDO,TREASURE ISLAND BOOK BOX,WATERING CAN PINK BUNNY,RED DRAWER KNOB ACRYLIC EDWARDIAN,LARGE HEART MEASURING SPOONS,SMALL HEART MEASURING SPOONS,PACK OF 60 DINOSAUR CAKE CASES,RED RETROSPOT OVEN GLOVE DOUBLE,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER ,RED TOADSTOOL LED NIGHT LIGHT,MINI PAINT SET VINTAGE ,3D SHEET OF DOG STICKERS,3D SHEET OF CAT STICKERS,SMALL FOLDING SCISSOR(POINTED EDGE),GIFT BAG PSYCHEDELIC APPLES,SET OF 2 TINS VINTAGE BATHROOM 
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       RABBIT NIGHT LIGHT,REGENCY TEA STRAINER,REGENCY TEA PLATE GREEN ,REGENCY TEA PLATE PINK,REGENCY TEA PLATE ROSES ,REGENCY TEAPOT ROSES ,REGENCY SUGAR BOWL GREEN,REGENCY MILK JUG PINK ,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE JET SET BROWN,VICTORIAN SEWING KIT,NAMASTE SWAGAT INCENSE,TRIPLE HOOK ANTIQUE IVORY ROSE,SMALL HEART MEASURING SPOONS,3D DOG PICTURE PLAYING CARDS,FEATHER PEN,COAL BLACK,ALARM CLOCK BAKELIKE RED ,ALARM CLOCK BAKELIKE CHOCOLATE
## 6                                                                                                                                                                                                                                                                                                            SET OF 60 VINTAGE LEAF CAKE CASES ,SET 40 HEART SHAPE PETIT FOUR CASES,AIRLINE BAG VINTAGE JET SET BROWN,AIRLINE BAG VINTAGE JET SET RED,AIRLINE BAG VINTAGE JET SET WHITE,AIRLINE BAG VINTAGE TOKYO 78,AIRLINE BAG VINTAGE WORLD CHAMPION ,WOODLAND DESIGN  COTTON TOTE BAG,WOODLAND CHARLOTTE BAG,ALARM CLOCK BAKELIKE RED ,TRIPLE HOOK ANTIQUE IVORY ROSE,SINGLE ANTIQUE ROSE HOOK IVORY,TEA TIME OVEN GLOVE,72 SWEETHEART FAIRY CAKE CASES,60 TEATIME FAIRY CAKE CASES,PACK OF 60 DINOSAUR CAKE CASES,REGENCY CAKESTAND 3 TIER,REGENCY MILK JUG PINK ,3D DOG PICTURE PLAYING CARDS,REVOLVER WOODEN RULER ,VINTAGE HEADS AND TAILS CARD GAME ,RED REFECTORY CLOCK
write.csv(itemList,"ItemList.csv", row.names = TRUE)

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:ffbase':
## 
##     %in%
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     %in%, abbreviate, write
txn = read.transactions(file="ItemList.csv", rm.duplicates= TRUE, format="basket",sep=",",cols=1);
txn@itemInfo$labels <- gsub("\"","",txn@itemInfo$labels)
basket_rules <- apriori(txn,parameter = list(sup = 0.001, conf = 0.0,target="rules"));
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##           0    0.1    1 none FALSE            TRUE   0.001      1     10
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 16 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[16330 item(s), 16767 transaction(s)] done [0.01s].
## sorting and recoding items ... [4 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 done [0.00s].
## writing ... [4 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
df_basket <- as(basket_rules,"data.frame")
head(df_basket)
##                              rules     support  confidence lift
## 1 {} => {REGENCY CAKESTAND 3 TIER} 0.001013896 0.001013896    1
## 2            {} => {CHILLI LIGHTS} 0.001312101 0.001312101    1
## 3                   {} => {Manual} 0.001848870 0.001848870    1
## 4                  {} => {POSTAGE} 0.002087434 0.002087434    1
# reference: https://www.r-bloggers.com/implementing-apriori-algorithm-in-r/
# Word cloud of the products.
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## Attaching package: 'tm'
## The following object is masked from 'package:arules':
## 
##     inspect
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(EcommDataClean$Description, min.freq=2000) 

# Word cloud of the Countries.
wordcloud(EcommDataClean$Country, min.freq=15)