cust <- read.table("RetailCustomers.csv", header = TRUE, sep = ";")
names(cust)
## [1] "Last_Active" "First.Name" "Surname"
## [4] "ID.Type" "ID.Number" "Telephone"
## [7] "Cell.Number" "SMSOptOut" "NoticeFlag"
## [10] "Email" "Address.1" "Address.2"
## [13] "Suburb" "City" "TV.Licence"
## [16] "Gender" "Status" "Status.Reason"
## [19] "Status.Store" "Old.TradeSoft.ID" "Total.SSB.Count"
## [22] "Total.SSB.Value" "Avg.SSB.Value" "Total.Outright.Count"
## [25] "Total.Outright.Value" "Avg.Outright.Value" "Retail.Sales.Count"
## [28] "Retail.Sales.Value" "Avg.Retail.Sales" "Active.SSB.Items"
## [31] "Active.SSB.Capital" "Capita.Paid" "Initiation.Paid"
## [34] "Interest.Paid"
cust <- cust[-c(4,5,6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20, 30, 31)]
str(cust)
## 'data.frame': 4883 obs. of 17 variables:
## $ Last_Active : Factor w/ 4813 levels "2016/08/03 13:00",..: 4341 2165 4580 4380 2046 3794 3210 1848 3259 2717 ...
## $ First.Name : Factor w/ 3626 levels " ashley"," G L",..: 2561 3051 1307 2994 2896 1300 264 116 151 2177 ...
## $ Surname : Factor w/ 3865 levels " THINNIES"," YUMA",..: 2780 2594 944 1489 2781 596 592 2474 525 2718 ...
## $ City : Factor w/ 12 levels "Alberton","Bassonia",..: 4 4 4 4 4 5 4 4 4 4 ...
## $ Gender : Factor w/ 2 levels "F","M": 1 1 2 2 2 2 2 2 2 2 ...
## $ Total.SSB.Count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Total.SSB.Value : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Avg.SSB.Value : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Total.Outright.Count: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Total.Outright.Value: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Avg.Outright.Value : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Retail.Sales.Count : int 3 7 4 1 3 17 1 2 1 5 ...
## $ Retail.Sales.Value : num 27329 27059 22768 21041 20580 ...
## $ Avg.Retail.Sales : num 9110 3866 5692 21041 6860 ...
## $ Capita.Paid : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Initiation.Paid : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Interest.Paid : num 0 0 0 0 0 0 0 0 0 0 ...
Customer <- paste0(as.character(cust$First.Name), " ",as.character(cust$Surname),sep = " ")
summary(cust)
## Last_Active First.Name Surname
## 2016/09/10 12:32: 2 david : 23 ndlovu : 37
## 2016/09/12 13:10: 2 DAVID : 16 khumalo: 33
## 2016/09/17 12:17: 2 sibusiso: 12 dube : 31
## 2016/09/22 12:40: 2 james : 11 NDLOVU : 31
## 2016/09/27 11:31: 2 samuel : 11 moyo : 28
## 2016/09/30 9:59 : 2 themba : 11 ncube : 27
## (Other) :4871 (Other) :4799 (Other):4696
## City Gender Total.SSB.Count Total.SSB.Value
## Jhb & Surrounds:4444 F: 916 Min. : 0.000 Min. : 0.0
## kennilworth : 351 M:3967 1st Qu.: 0.000 1st Qu.: 0.0
## Rosettenville : 44 Median : 0.000 Median : 0.0
## Glen Vista : 15 Mean : 0.367 Mean : 351.1
## Turffontein : 14 3rd Qu.: 0.000 3rd Qu.: 0.0
## The Hill : 5 Max. :91.000 Max. :87651.7
## (Other) : 10
## Avg.SSB.Value Total.Outright.Count Total.Outright.Value
## Min. : 0.00 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 0.00 Median : 0.0000 Median : 0.00
## Mean : 39.16 Mean : 0.0727 Mean : 40.41
## 3rd Qu.: 0.00 3rd Qu.: 0.0000 3rd Qu.: 0.00
## Max. :4292.72 Max. :30.0000 Max. :27200.00
##
## Avg.Outright.Value Retail.Sales.Count Retail.Sales.Value
## Min. : 0.00 Min. : 1.000 Min. : 500.6
## 1st Qu.: 0.00 1st Qu.: 1.000 1st Qu.: 800.0
## Median : 0.00 Median : 1.000 Median : 1300.0
## Mean : 16.41 Mean : 1.287 Mean : 1912.6
## 3rd Qu.: 0.00 3rd Qu.: 1.000 3rd Qu.: 2249.9
## Max. :6800.00 Max. :19.000 Max. :45988.8
##
## Avg.Retail.Sales Capita.Paid Initiation.Paid Interest.Paid
## Min. : 500.6 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 749.2 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 1099.9 Median : 0.0 Median : 0.00 Median : 0.00
## Mean : 1511.0 Mean : 133.4 Mean : 51.72 Mean : 9.62
## 3rd Qu.: 1829.8 3rd Qu.: 0.0 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :21041.1 Max. :32806.6 Max. :12579.40 Max. :3023.63
##
cust$Customer <- Customer
cust <- cust[-c(2,3)]
plot(cust$City, main = "Customers' City of Residence", las = 2)
plot(cust$Gender, main = "Customer Gender", col = c("pink","blue"))
plot(main= "Borrowing vs Buying Customers",x=cust$Total.SSB.Count, y=cust$Retail.Sales.Count, xlab = "Total number of SSB transactions", ylab = "Total number of purchase transactions", type = "p", col = as.factor(cust$Customer),pch=19)
abline(lm(cust$Retail.Sales.Count~cust$Total.SSB.Count))
plot(main = "Selling vs Buying Customers",x=cust$Total.Outright.Count, y=cust$Retail.Sales.Count, ylab = "Total number of purchase transactions", xlab = "Total number of Outright Transactions", type = "p", col = as.factor(cust$Customer),pch=19)
abline(lm(cust$Retail.Sales.Count~cust$Total.Outright.Count))
plot(main = "Buyshop vs Retail Customers",x=cust$Total.SSB.Count+cust$Total.Outright.Count, y=cust$Retail.Sales.Count, xlab = "Total number of buyshop transactions", ylab="Total number of retail transactions", type = "p", col = as.factor(cust$Customer),pch=19)
abline(lm(cust$Retail.Sales.Count~cust$Total.SSB.Count+cust$Total.Outright.Count))
## Warning in abline(lm(cust$Retail.Sales.Count ~ cust$Total.SSB.Count +
## cust$Total.Outright.Count)): only using the first two of 3 regression
## coefficients
plot(main= "Buyshop customer types",y=cust$Total.SSB.Count, x=cust$Total.Outright.Count, xlab = "Total outright transactions", ylab="Total buyback transactions", type = "p", col = as.factor(cust$Customer),pch=19)
abline(lm(cust$Total.SSB.Count ~ cust$Total.Outright.Count), v=mean(cust$Total.Outright.Count[which(cust$Total.Outright.Count!=0)]),h=mean(cust$Total.SSB.Count[which(cust$Total.SSB.Count!=0)]),col=c("blue","red","red"))
hist(cust$Retail.Sales.Value, main = "Retail Sales Value Histogram")
hist(cust$Total.SSB.Value, main = "Total SSB Value Histogram")
hist(cust$Total.Outright.Value,main = "Total Outright Value Histogram")
##Some background on variables: * SSB’s are items a customer borrows money against, using the item as security for the 3-month loan. When the interest payment on an item fails, the item becomes retail stock * Outrights are items the customer sells to the shop for resale * Capita is the amount loaned, initiation is a base-fee charged when the loan is initiated,interest is charged monthly on a loan
t.sales <- read.table("Sales_T_January2017.csv", header = TRUE, sep = ";")
head(t.sales)
## Type Created Completed Transaction.Number Total
## 1 Lay Bye 2017/01/02 10:46 2017/01/02 10:49 LB020117LLHO101 200
## 2 Lay Bye 2017/01/02 12:10 2017/01/02 12:13 LB020117LLHO102 950
## 3 Lay Bye 2017/01/03 12:10 2017/01/03 12:17 LB030117LLHO101 270
## 4 Lay Bye 2017/01/03 12:17 2017/01/03 12:21 LB030117LLHO102 1000
## 5 Lay Bye 2017/01/03 15:45 2017/01/03 15:47 LB030117LLHO103 550
## 6 Lay Bye 2017/01/03 16:15 2017/01/03 16:23 LB030117LLHO104 1000
## Vat Tendered Cashier Sales.Status
## 1 73.68 200 02/01/2017 Till 2 Lhoka Lerato Closed
## 2 343.85 950 02/01/2017 Till 2 Lhoka Lerato Closed
## 3 98.24 270 03/01/2017 Till 2 Lhoka Lerato Closed
## 4 380.69 1000 03/01/2017 Till 2 Lhoka Lerato Closed
## 5 159.64 600 03/01/2017 Till 2 Lhoka Lerato Closed
## 6 233.33 1000 03/01/2017 Till 2 Lhoka Lerato Closed
## Sales.Person Customer X X.1 X.2 X.3
## 1 Sithole Philane masengo, richard NA NA NA NA
## 2 MOLOI ANASTACIA bataso, collin NA NA NA NA
## 3 NDLOVU NSUKUZANAMHLA nhlanhla, teddy NA NA NA NA
## 4 zondi thokozani DAVIDS, AJ NA NA NA NA
## 5 Zulu Eric cuna, donavin NA NA NA NA
## 6 Binedell Ethan watkins, samantha NA NA NA NA
## X.4
## 1 Philane Sithole
## 2 ANASTACIA MOLOI
## 3 NSUKUZANAMHLA NDLOVU
## 4 thokozani zondi
## 5 Eric Zulu
## 6 Ethan Binedell
names(t.sales)
## [1] "Type" "Created" "Completed"
## [4] "Transaction.Number" "Total" "Vat"
## [7] "Tendered" "Cashier" "Sales.Status"
## [10] "Sales.Person" "Customer" "X"
## [13] "X.1" "X.2" "X.3"
## [16] "X.4"
t.sales <- t.sales[-c(2,5,6,8,9)]
d.sales <- read.table("Det_Sales_2017.csv", header = TRUE, sep = ";")
str(t.sales)
## 'data.frame': 22934 obs. of 11 variables:
## $ Type : Factor w/ 3 levels "","Lay Bye","Standard": 2 2 2 2 2 2 2 2 2 2 ...
## $ Completed : Factor w/ 19448 levels "","2017/01/02 10:49",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ Transaction.Number: Factor w/ 22929 levels "","LB010217CSPA101",..: 51 52 111 112 113 114 157 158 159 160 ...
## $ Tendered : Factor w/ 2439 levels "","0,01","0,05",..: 712 2400 1072 16 1958 16 720 426 1201 426 ...
## $ Sales.Person : Factor w/ 37 levels ""," La grance Erica",..: 26 17 23 36 37 3 17 17 26 17 ...
## $ Customer : Factor w/ 5680 levels ""," DENNISON, CHARMAIN 6912150247088",..: 2571 309 4019 808 762 5507 514 3962 2845 1170 ...
## $ X : logi NA NA NA NA NA NA ...
## $ X.1 : logi NA NA NA NA NA NA ...
## $ X.2 : logi NA NA NA NA NA NA ...
## $ X.3 : logi NA NA NA NA NA NA ...
## $ X.4 : Factor w/ 37 levels "","aaliyah harris",..: 32 3 29 35 17 18 3 3 32 3 ...
names(d.sales)
## [1] "Created" "Completed" "Slip.No"
## [4] "Sales.Status" "Sales.Person" "Category"
## [7] "Division" "Make" "Model"
## [10] "Stock.Code" "Description" "Quantity"
## [13] "Unit.Price" "Serial.Number" "Unit.Cst.Incl.Vat"
## [16] "Vat.Incl" "Vat.Excl" "Vat"
d.sales <- d.sales[-c(1,3,4,9,10,11,12,14,16,17,18)]
str(d.sales)
## 'data.frame': 55527 obs. of 7 variables:
## $ Completed : Factor w/ 28984 levels "","2017/01/02 10:00",..: 102 103 105 105 103 104 106 107 108 108 ...
## $ Sales.Person : Factor w/ 43 levels "","aaliyah harris",..: 37 26 29 29 26 1 30 33 30 30 ...
## $ Category : Factor w/ 761 levels "","12 Inch","15 Inch",..: 717 21 249 141 191 497 514 50 497 497 ...
## $ Division : Factor w/ 79 levels "","A/V Accessories",..: 8 15 30 26 57 69 2 31 52 69 ...
## $ Make : Factor w/ 3958 levels ""," BEJAMIN DUDE",..: 878 1048 1988 3225 2402 2402 2402 2544 2402 2402 ...
## $ Unit.Price : Factor w/ 605 levels "","0,00","0.5",..: 357 462 277 566 297 245 512 341 576 165 ...
## $ Unit.Cst.Incl.Vat: Factor w/ 4107 levels "","0,00","0.4788",..: 834 2360 558 2990 1247 807 2519 1690 1327 784 ...
str(t.sales)
## 'data.frame': 22934 obs. of 11 variables:
## $ Type : Factor w/ 3 levels "","Lay Bye","Standard": 2 2 2 2 2 2 2 2 2 2 ...
## $ Completed : Factor w/ 19448 levels "","2017/01/02 10:49",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ Transaction.Number: Factor w/ 22929 levels "","LB010217CSPA101",..: 51 52 111 112 113 114 157 158 159 160 ...
## $ Tendered : Factor w/ 2439 levels "","0,01","0,05",..: 712 2400 1072 16 1958 16 720 426 1201 426 ...
## $ Sales.Person : Factor w/ 37 levels ""," La grance Erica",..: 26 17 23 36 37 3 17 17 26 17 ...
## $ Customer : Factor w/ 5680 levels ""," DENNISON, CHARMAIN 6912150247088",..: 2571 309 4019 808 762 5507 514 3962 2845 1170 ...
## $ X : logi NA NA NA NA NA NA ...
## $ X.1 : logi NA NA NA NA NA NA ...
## $ X.2 : logi NA NA NA NA NA NA ...
## $ X.3 : logi NA NA NA NA NA NA ...
## $ X.4 : Factor w/ 37 levels "","aaliyah harris",..: 32 3 29 35 17 18 3 3 32 3 ...
as.numeric.factor <- function(x) {as.numeric(levels(x))[x]}
d.sales$Unit.Price <- as.numeric(d.sales$Unit.Price)
d.sales$Unit.Cst.Incl.Vat <- as.numeric(d.sales$Unit.Cst.Incl.Vat)
t.sales$Tendered <- as.numeric(t.sales$Tendered)
t.sales <- t.sales[-c(3)]
t.sales$Sales.Person <- t.sales$X.4
t.sales <- t.sales[c(1:5)]
#install.packages("kohonen")
#install.packages("tidyverse")
require(kohonen)
require(tidyverse)
data_train <- cust[1:4000,]
data_test <- cust[4001:4883,]
all_data <- scale(cust[c(4:15)])
som_grid <- somgrid(xdim = 5, ydim=4, topo="hexagonal")
som_model <- som(all_data,
grid=som_grid,
rlen=100,
alpha=c(0.05,0.01),
keep.data = TRUE)
par(mfrow=c(2,2))
plot(som_model, type="changes")
#the model requires a lot of iterations to decrease neighbour distance
plot(som_model, type="count")
#there appears to be two clusters that have higher numbers of customers, one of which contains around 2000 people
plot(som_model, type="dist.neighbours")
#there are larger within-group distances in the top right corner of this graph
plot(som_model, type="codes")
#the node with the highest amount of customers appears to be fairly inactive, thus once-off customers, who haven't been very active; this is to be expected in a business with such a variety of products and services
#the very top-right node shows customers that appear to be very busy in the buyshop (SSB, Outright, etc., but not too fond of buying goods on the retail side)
#the firsat node from top left shows customers that have spent a lot of money, but haven't visited the shop often (once-off big buys)
#the third node in the top row shows customers that have spent a lot of money, and are regular retail customers
#the last node in the second row appear to be customers that sell their goods outright, but don't borrow or shop all that much
#the node below this shows customers who borrow money often, but not large amounts on average
all_data1 <- as.data.frame(all_data)
par(mfrow=c(3,3))
var <- 1 #define the variable to plot
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length
var <- 2 #define the variable to plot
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length
var <- 3 #define the variable to plot
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length
var <- 4 #define the variable to plot
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length
var <- 5 #define the variable to plot
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length
var <- 6 #define the variable to plot
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length
var <- 7 #define the variable to plot
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length
var <- 8 #define the variable to plot
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length
var <- 9 #define the variable to plot
var_unscaled <- aggregate(as.numeric(all_data[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(som_model, type = "property", property=var_unscaled, main=names(all_data1)[var], palette.name=rainbow)
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of
## replacement length
## use hierarchical clustering to cluster the codebook vectors
som_m <- as.data.frame(som_model$codes)
som_cluster <- cutree(hclust(dist(som_m)), 6)
# plot these results:
plot(som_model, type="mapping", bgcol = rainbow(6)[som_cluster], main = "Clusters",col="white")
add.cluster.boundaries(som_model, som_cluster)
# Nonmetric MDS
# N rows (objects) x p columns (variables)
# each row identified by a unique row name
library(MASS)
all_data1 <- unique(cust)
d <- dist(all_data1) # distances between the rows
## Warning in dist(all_data1): NAs introduced by coercion
# fit <- isoMDS(d, k=3) # k is the number of dim
# fit # view results
#
# # plot solution
# x <- fit$points[,1]
# y <- fit$points[,2]
# plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2", main="Nonmetric MDS", type="p")
# text(x, y, labels = row.names(all_data))
# Classical MDS
# N rows (objects) x p columns (variables)
# each row identified by a unique row name
fit1 <- cmdscale(d,eig=TRUE, k=2) # k is the number of dim
#fit1 # view results
# plot solution
x <- fit1$points[,1]
y <- fit1$points[,2]
plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2", main="Metric MDS", type="p")