Customer dataset

cust <- read.table("RetailCustomers.csv", header = TRUE, sep = ";")
names(cust)

##  [1] "Last_Active"          "First.Name"           "Surname"             
##  [4] "ID.Type"              "ID.Number"            "Telephone"           
##  [7] "Cell.Number"          "SMSOptOut"            "NoticeFlag"          
## [10] "Email"                "Address.1"            "Address.2"           
## [13] "Suburb"               "City"                 "TV.Licence"          
## [16] "Gender"               "Status"               "Status.Reason"       
## [19] "Status.Store"         "Old.TradeSoft.ID"     "Total.SSB.Count"     
## [22] "Total.SSB.Value"      "Avg.SSB.Value"        "Total.Outright.Count"
## [25] "Total.Outright.Value" "Avg.Outright.Value"   "Retail.Sales.Count"  
## [28] "Retail.Sales.Value"   "Avg.Retail.Sales"     "Active.SSB.Items"    
## [31] "Active.SSB.Capital"   "Capita.Paid"          "Initiation.Paid"     
## [34] "Interest.Paid"

cust <- cust[-c(4,5,6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20, 30, 31)]
str(cust)

## 'data.frame':    4883 obs. of  17 variables:
##  $ Last_Active         : Factor w/ 4813 levels "2016/08/03 13:00",..: 4341 2165 4580 4380 2046 3794 3210 1848 3259 2717 ...
##  $ First.Name          : Factor w/ 3626 levels " ashley"," G L",..: 2561 3051 1307 2994 2896 1300 264 116 151 2177 ...
##  $ Surname             : Factor w/ 3865 levels " THINNIES"," YUMA",..: 2780 2594 944 1489 2781 596 592 2474 525 2718 ...
##  $ City                : Factor w/ 12 levels "Alberton","Bassonia",..: 4 4 4 4 4 5 4 4 4 4 ...
##  $ Gender              : Factor w/ 2 levels "F","M": 1 1 2 2 2 2 2 2 2 2 ...
##  $ Total.SSB.Count     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Total.SSB.Value     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Avg.SSB.Value       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Total.Outright.Count: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Total.Outright.Value: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Avg.Outright.Value  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Retail.Sales.Count  : int  3 7 4 1 3 17 1 2 1 5 ...
##  $ Retail.Sales.Value  : num  27329 27059 22768 21041 20580 ...
##  $ Avg.Retail.Sales    : num  9110 3866 5692 21041 6860 ...
##  $ Capita.Paid         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Initiation.Paid     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Interest.Paid       : num  0 0 0 0 0 0 0 0 0 0 ...

Customer <- paste0(as.character(cust$First.Name), " ",as.character(cust$Surname),sep = " ")
summary(cust)

##            Last_Active      First.Name      Surname    
##  2016/09/10 12:32:   2   david   :  23   ndlovu :  37  
##  2016/09/12 13:10:   2   DAVID   :  16   khumalo:  33  
##  2016/09/17 12:17:   2   sibusiso:  12   dube   :  31  
##  2016/09/22 12:40:   2   james   :  11   NDLOVU :  31  
##  2016/09/27 11:31:   2   samuel  :  11   moyo   :  28  
##  2016/09/30 9:59 :   2   themba  :  11   ncube  :  27  
##  (Other)         :4871   (Other) :4799   (Other):4696  
##               City      Gender   Total.SSB.Count  Total.SSB.Value  
##  Jhb & Surrounds:4444   F: 916   Min.   : 0.000   Min.   :    0.0  
##  kennilworth    : 351   M:3967   1st Qu.: 0.000   1st Qu.:    0.0  
##  Rosettenville  :  44            Median : 0.000   Median :    0.0  
##  Glen Vista     :  15            Mean   : 0.367   Mean   :  351.1  
##  Turffontein    :  14            3rd Qu.: 0.000   3rd Qu.:    0.0  
##  The Hill       :   5            Max.   :91.000   Max.   :87651.7  
##  (Other)        :  10                                              
##  Avg.SSB.Value     Total.Outright.Count Total.Outright.Value
##  Min.   :   0.00   Min.   : 0.0000      Min.   :    0.00    
##  1st Qu.:   0.00   1st Qu.: 0.0000      1st Qu.:    0.00    
##  Median :   0.00   Median : 0.0000      Median :    0.00    
##  Mean   :  39.16   Mean   : 0.0727      Mean   :   40.41    
##  3rd Qu.:   0.00   3rd Qu.: 0.0000      3rd Qu.:    0.00    
##  Max.   :4292.72   Max.   :30.0000      Max.   :27200.00    
##                                                             
##  Avg.Outright.Value Retail.Sales.Count Retail.Sales.Value
##  Min.   :   0.00    Min.   : 1.000     Min.   :  500.6   
##  1st Qu.:   0.00    1st Qu.: 1.000     1st Qu.:  800.0   
##  Median :   0.00    Median : 1.000     Median : 1300.0   
##  Mean   :  16.41    Mean   : 1.287     Mean   : 1912.6   
##  3rd Qu.:   0.00    3rd Qu.: 1.000     3rd Qu.: 2249.9   
##  Max.   :6800.00    Max.   :19.000     Max.   :45988.8   
##                                                          
##  Avg.Retail.Sales   Capita.Paid      Initiation.Paid    Interest.Paid    
##  Min.   :  500.6   Min.   :    0.0   Min.   :    0.00   Min.   :   0.00  
##  1st Qu.:  749.2   1st Qu.:    0.0   1st Qu.:    0.00   1st Qu.:   0.00  
##  Median : 1099.9   Median :    0.0   Median :    0.00   Median :   0.00  
##  Mean   : 1511.0   Mean   :  133.4   Mean   :   51.72   Mean   :   9.62  
##  3rd Qu.: 1829.8   3rd Qu.:    0.0   3rd Qu.:    0.00   3rd Qu.:   0.00  
##  Max.   :21041.1   Max.   :32806.6   Max.   :12579.40   Max.   :3023.63  
##

cust$Customer <- Customer

cust <- cust[-c(2,3)]

plot(cust$City, main = "Customers' City of Residence", las = 2)

plot(cust$Gender, main = "Customer Gender", col = c("pink","blue"))

plot(main= "Borrowing vs Buying Customers",x=cust$Total.SSB.Count, y=cust$Retail.Sales.Count, xlab = "Total number of SSB transactions", ylab = "Total number of purchase transactions", type = "p", col = as.factor(cust$Customer),pch=19)
abline(lm(cust$Retail.Sales.Count~cust$Total.SSB.Count))

plot(main = "Selling vs Buying Customers",x=cust$Total.Outright.Count, y=cust$Retail.Sales.Count, ylab = "Total number of purchase transactions", xlab = "Total number of Outright Transactions", type = "p", col = as.factor(cust$Customer),pch=19)
abline(lm(cust$Retail.Sales.Count~cust$Total.Outright.Count))

plot(main = "Buyshop vs Retail Customers",x=cust$Total.SSB.Count+cust$Total.Outright.Count, y=cust$Retail.Sales.Count, xlab = "Total number of buyshop transactions", ylab="Total number of retail transactions", type = "p", col = as.factor(cust$Customer),pch=19)
abline(lm(cust$Retail.Sales.Count~cust$Total.SSB.Count+cust$Total.Outright.Count))

## Warning in abline(lm(cust$Retail.Sales.Count ~ cust$Total.SSB.Count +
## cust$Total.Outright.Count)): only using the first two of 3 regression
## coefficients

plot(main= "Buyshop customer types",y=cust$Total.SSB.Count, x=cust$Total.Outright.Count, xlab = "Total outright transactions", ylab="Total buyback transactions", type = "p", col = as.factor(cust$Customer),pch=19)
abline(lm(cust$Total.SSB.Count ~ cust$Total.Outright.Count), v=mean(cust$Total.Outright.Count[which(cust$Total.Outright.Count!=0)]),h=mean(cust$Total.SSB.Count[which(cust$Total.SSB.Count!=0)]),col=c("blue","red","red"))

hist(cust$Retail.Sales.Value, main = "Retail Sales Value Histogram")

hist(cust$Total.SSB.Value, main = "Total SSB Value Histogram")

hist(cust$Total.Outright.Value,main = "Total Outright Value Histogram")

##Some background on variables: * SSB’s are items a customer borrows money against, using the item as security for the 3-month loan. When the interest payment on an item fails, the item becomes retail stock * Outrights are items the customer sells to the shop for resale * Capita is the amount loaned, initiation is a base-fee charged when the loan is initiated,interest is charged monthly on a loan

Total Sales and Detailed Sales Report

t.sales <- read.table("Sales_T_January2017.csv", header = TRUE, sep = ";")
head(t.sales)

##      Type          Created        Completed Transaction.Number Total
## 1 Lay Bye 2017/01/02 10:46 2017/01/02 10:49    LB020117LLHO101   200
## 2 Lay Bye 2017/01/02 12:10 2017/01/02 12:13    LB020117LLHO102   950
## 3 Lay Bye 2017/01/03 12:10 2017/01/03 12:17    LB030117LLHO101   270
## 4 Lay Bye 2017/01/03 12:17 2017/01/03 12:21    LB030117LLHO102  1000
## 5 Lay Bye 2017/01/03 15:45 2017/01/03 15:47    LB030117LLHO103   550
## 6 Lay Bye 2017/01/03 16:15 2017/01/03 16:23    LB030117LLHO104  1000
##      Vat Tendered                        Cashier Sales.Status
## 1  73.68      200 02/01/2017 Till 2 Lhoka Lerato       Closed
## 2 343.85      950 02/01/2017 Till 2 Lhoka Lerato       Closed
## 3  98.24      270 03/01/2017 Till 2 Lhoka Lerato       Closed
## 4 380.69     1000 03/01/2017 Till 2 Lhoka Lerato       Closed
## 5 159.64      600 03/01/2017 Till 2 Lhoka Lerato       Closed
## 6 233.33     1000 03/01/2017 Till 2 Lhoka Lerato       Closed
##           Sales.Person           Customer  X X.1 X.2 X.3
## 1      Sithole Philane  masengo, richard  NA  NA  NA  NA
## 2      MOLOI ANASTACIA    bataso, collin  NA  NA  NA  NA
## 3 NDLOVU NSUKUZANAMHLA nhlanhla, teddy    NA  NA  NA  NA
## 4      zondi thokozani        DAVIDS, AJ  NA  NA  NA  NA
## 5            Zulu Eric     cuna, donavin  NA  NA  NA  NA
## 6       Binedell Ethan watkins, samantha  NA  NA  NA  NA
##                    X.4
## 1      Philane Sithole
## 2      ANASTACIA MOLOI
## 3 NSUKUZANAMHLA NDLOVU
## 4      thokozani zondi
## 5            Eric Zulu
## 6       Ethan Binedell

names(t.sales)

##  [1] "Type"               "Created"            "Completed"         
##  [4] "Transaction.Number" "Total"              "Vat"               
##  [7] "Tendered"           "Cashier"            "Sales.Status"      
## [10] "Sales.Person"       "Customer"           "X"                 
## [13] "X.1"                "X.2"                "X.3"               
## [16] "X.4"

t.sales <- t.sales[-c(2,5,6,8,9)]

d.sales <- read.table("Det_Sales_2017.csv", header = TRUE, sep = ";")
str(t.sales)

## 'data.frame':    22934 obs. of  11 variables:
##  $ Type              : Factor w/ 3 levels "","Lay Bye","Standard": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Completed         : Factor w/ 19448 levels "","2017/01/02 10:49",..: 2 3 4 5 6 7 8 9 10 11 ...
##  $ Transaction.Number: Factor w/ 22929 levels "","LB010217CSPA101",..: 51 52 111 112 113 114 157 158 159 160 ...
##  $ Tendered          : Factor w/ 2439 levels "","0,01","0,05",..: 712 2400 1072 16 1958 16 720 426 1201 426 ...
##  $ Sales.Person      : Factor w/ 37 levels ""," La grance Erica",..: 26 17 23 36 37 3 17 17 26 17 ...
##  $ Customer          : Factor w/ 5680 levels ""," DENNISON, CHARMAIN 6912150247088",..: 2571 309 4019 808 762 5507 514 3962 2845 1170 ...
##  $ X                 : logi  NA NA NA NA NA NA ...
##  $ X.1               : logi  NA NA NA NA NA NA ...
##  $ X.2               : logi  NA NA NA NA NA NA ...
##  $ X.3               : logi  NA NA NA NA NA NA ...
##  $ X.4               : Factor w/ 37 levels "","aaliyah harris",..: 32 3 29 35 17 18 3 3 32 3 ...

names(d.sales)

##  [1] "Created"           "Completed"         "Slip.No"          
##  [4] "Sales.Status"      "Sales.Person"      "Category"         
##  [7] "Division"          "Make"              "Model"            
## [10] "Stock.Code"        "Description"       "Quantity"         
## [13] "Unit.Price"        "Serial.Number"     "Unit.Cst.Incl.Vat"
## [16] "Vat.Incl"          "Vat.Excl"          "Vat"

d.sales <- d.sales[-c(1,3,4,9,10,11,12,14,16,17,18)]

str(d.sales)

## 'data.frame':    55527 obs. of  7 variables:
##  $ Completed        : Factor w/ 28984 levels "","2017/01/02 10:00",..: 102 103 105 105 103 104 106 107 108 108 ...
##  $ Sales.Person     : Factor w/ 43 levels "","aaliyah  harris",..: 37 26 29 29 26 1 30 33 30 30 ...
##  $ Category         : Factor w/ 761 levels "","12 Inch","15 Inch",..: 717 21 249 141 191 497 514 50 497 497 ...
##  $ Division         : Factor w/ 79 levels "","A/V Accessories",..: 8 15 30 26 57 69 2 31 52 69 ...
##  $ Make             : Factor w/ 3958 levels ""," BEJAMIN DUDE",..: 878 1048 1988 3225 2402 2402 2402 2544 2402 2402 ...
##  $ Unit.Price       : Factor w/ 605 levels "","0,00","0.5",..: 357 462 277 566 297 245 512 341 576 165 ...
##  $ Unit.Cst.Incl.Vat: Factor w/ 4107 levels "","0,00","0.4788",..: 834 2360 558 2990 1247 807 2519 1690 1327 784 ...

str(t.sales)

## 'data.frame':    22934 obs. of  11 variables:
##  $ Type              : Factor w/ 3 levels "","Lay Bye","Standard": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Completed         : Factor w/ 19448 levels "","2017/01/02 10:49",..: 2 3 4 5 6 7 8 9 10 11 ...
##  $ Transaction.Number: Factor w/ 22929 levels "","LB010217CSPA101",..: 51 52 111 112 113 114 157 158 159 160 ...
##  $ Tendered          : Factor w/ 2439 levels "","0,01","0,05",..: 712 2400 1072 16 1958 16 720 426 1201 426 ...
##  $ Sales.Person      : Factor w/ 37 levels ""," La grance Erica",..: 26 17 23 36 37 3 17 17 26 17 ...
##  $ Customer          : Factor w/ 5680 levels ""," DENNISON, CHARMAIN 6912150247088",..: 2571 309 4019 808 762 5507 514 3962 2845 1170 ...
##  $ X                 : logi  NA NA NA NA NA NA ...
##  $ X.1               : logi  NA NA NA NA NA NA ...
##  $ X.2               : logi  NA NA NA NA NA NA ...
##  $ X.3               : logi  NA NA NA NA NA NA ...
##  $ X.4               : Factor w/ 37 levels "","aaliyah harris",..: 32 3 29 35 17 18 3 3 32 3 ...

as.numeric.factor <- function(x) {as.numeric(levels(x))[x]}

d.sales$Unit.Price <- as.numeric(d.sales$Unit.Price)
d.sales$Unit.Cst.Incl.Vat <- as.numeric(d.sales$Unit.Cst.Incl.Vat)
t.sales$Tendered <- as.numeric(t.sales$Tendered)

t.sales <- t.sales[-c(3)]

t.sales$Sales.Person <- t.sales$X.4

t.sales <- t.sales[c(1:5)]

Self-organzing maps

#install.packages("kohonen")
#install.packages("tidyverse")

require(kohonen)
require(tidyverse)

data_train <- cust[1:4000,]
data_test <- cust[4001:4883,]

all_data <- scale(cust[c(4:15)])

som_grid <- somgrid(xdim = 5, ydim=4, topo="hexagonal")

som_model <- som(all_data, 
        grid=som_grid, 
        rlen=100, 
        alpha=c(0.05,0.01), 
        keep.data = TRUE)

par(mfrow=c(2,2))

plot(som_model, type="changes")

#the model requires a lot of iterations to decrease neighbour distance

plot(som_model, type="count")

#there appears to be two clusters that have higher numbers of customers, one of which contains around 2000 people

plot(som_model, type="dist.neighbours")

#there are larger within-group distances in the top right corner of this graph

plot(som_model, type="codes")

#the node with the highest amount of customers appears to be fairly inactive, thus once-off customers, who haven't been very active; this is to be expected in a business with such a variety of products and services
#the very top-right node shows customers that appear to be very busy in the buyshop (SSB, Outright, etc., but not too fond of buying goods on the retail side)
#the firsat node from top left shows customers that have spent a lot of money, but haven't visited the shop often (once-off big buys)
#the third node in the top row shows customers that have spent a lot of money, and are regular retail customers
#the last node in the second row appear to be customers that sell their goods outright, but don't borrow or shop all that much
#the node below this shows customers who borrow money often, but not large amounts on average

all_data1 <- as.data.frame(all_data)

par(mfrow=c(3,3))

var <- 1 #define the variable to plot 
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length

var <- 2 #define the variable to plot 
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length

var <- 3 #define the variable to plot 
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length

var <- 4 #define the variable to plot 
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length

var <- 5 #define the variable to plot 
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length

var <- 6 #define the variable to plot 
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length

var <- 7 #define the variable to plot 
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length

var <- 8 #define the variable to plot 
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length

var <- 9 #define the variable to plot 
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length

## use hierarchical clustering to cluster the codebook vectors
som_m <- as.data.frame(som_model$codes)

som_cluster <- cutree(hclust(dist(som_m)), 6)
# plot these results:
plot(som_model, type="mapping", bgcol = rainbow(6)[som_cluster], main = "Clusters",col="white") 
add.cluster.boundaries(som_model, som_cluster)

# Nonmetric MDS
# N rows (objects) x p columns (variables)
# each row identified by a unique row name

library(MASS)
all_data1 <- unique(cust)
d <- dist(all_data1) #  distances between the rows

## Warning in dist(all_data1): NAs introduced by coercion

# fit <- isoMDS(d, k=3) # k is the number of dim
# fit # view results
# 
# # plot solution
# x <- fit$points[,1]
# y <- fit$points[,2]
# plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2", main="Nonmetric MDS", type="p")
# text(x, y, labels = row.names(all_data))

# Classical MDS
# N rows (objects) x p columns (variables)
# each row identified by a unique row name

fit1 <- cmdscale(d,eig=TRUE, k=2) # k is the number of dim
#fit1 # view results

# plot solution
x <- fit1$points[,1]
y <- fit1$points[,2]
plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2", main="Metric MDS", type="p")

MSc Data Science, Department of Statistical Sciences, University of Cape Town

STA5077Z: Unsupervised Learning Module Project

Gerhard Viljoen [VLJCHR004]

26 July 2017

Customer dataset

Total Sales and Detailed Sales Report

Self-organzing maps