Load the data and remove the customer with ID (CODIGO) 1981

# Read the semicolon-separated transaction file into a data frame.
df <- read.csv("dataset3.csv", sep = ";")
# Keep every row whose customer code (CODIGO) is not 1981.
df <- subset(df, CODIGO != "1981")
# Convert the transaction date text (day-month-two-digit-year) to Date.
df$Transaction_Date <- as.Date(df$Transaction_Date, format = "%d-%m-%y")

The structure of the dataset is the following:

# Print a compact overview of df: one line per column with its type and first values.
str(df)
## 'data.frame':    15187 obs. of  13 variables:
##  $ Transaction_Date       : Date, format: "2015-10-05" "2015-10-06" ...
##  $ CODIGO                 : Factor w/ 1884 levels "10","1010","1017",..: 1162 1162 1162 1162 1162 1162 1162 1162 1162 1162 ...
##  $ Client_Sub_Type        : Factor w/ 49 levels "ACCIONISTA ( NEGOCIO / PROPIO)",..: 45 45 45 45 45 45 45 45 45 45 ...
##  $ Business_Type          : Factor w/ 188 levels "","Administrative Management and General Management Consulting Services",..: 1 1 1 1 1 1 1 1 1 140 ...
##  $ Customer_Type          : Factor w/ 2 levels "B","P": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Customer_Country       : Factor w/ 29 levels " ","AE","AR",..: 28 28 28 28 28 28 28 28 28 28 ...
##  $ Account_Risk           : int  NA NA NA NA NA NA NA NA NA 1 ...
##  $ Customer_Risk          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ TransactionReference   : Factor w/ 17311 levels "146- TRANSF. ENVIADA A 3ERO EN ARCA (47897)",..: 4283 12566 12201 4453 12975 13156 13204 5006 838 5261 ...
##  $ Transaction.Description: Factor w/ 5 levels "Checks Deposited",..: 2 5 5 2 5 5 5 2 3 2 ...
##  $ Amount                 : int  941 1000 15059 200000 28000 1100 1400 18000 31413 20000 ...
##  $ Source.Reference       : Factor w/ 7689 levels "","(56543) SALDO A SU FAVOR",..: 6527 2061 5193 1994 2250 6523 2121 6520 2087 2865 ...
##  $ Originator_Country     : Factor w/ 58 levels "","AE","AI","AN",..: 8 56 56 57 56 56 56 53 56 56 ...

Choose the number of customer clusters based on the customers' transactions

# One row per customer (CODIGO) with summary statistics of its transaction
# amounts: count, total, mean, median, quartiles, maximum and minimum.
c_df <- df %>%
  group_by(CODIGO) %>%
  summarise(
    N_Transactions = n(),
    Total_Amount = sum(Amount),
    Average_Amount = mean(Amount),
    Median_Amount = median(Amount),
    Q1 = quantile(Amount, 0.25),
    Q3 = quantile(Amount, 0.75),
    Max = max(Amount),
    Min = min(Amount)
  )

# Clustering features: every per-customer summary column except the first
# (the CODIGO key).
# NOTE(review): the features are used unscaled; columns with large magnitudes
# will dominate the Euclidean distance. Consider scale() -- confirm first,
# since it would change the resulting clusters.
data <- c_df[, 2:ncol(c_df)]
k.max <- 15 # Maximal number of clusters

# k-means uses random starting centers; fix the seed so the elbow curve is
# reproducible from run to run.
set.seed(123)

# Total within-cluster sum of squares for k = 1..k.max (best of 10 random
# starts each). vapply() guarantees a numeric vector of length k.max.
wss <- vapply(
  seq_len(k.max),
  function(k) kmeans(data, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)

# Elbow plot; the dashed vertical line marks the chosen k = 3.
plot(seq_len(k.max), wss,
  type = "b", pch = 19, frame.plot = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
abline(v = 3, lty = 2)

Represent the 3 clusters

# k-means starts from random centers, so without a seed the cluster labels
# (and possibly the partition itself) change between runs.
set.seed(123)
# Fit the final 3-cluster model; use 10 random starts, like the elbow
# computation, and keep the best solution instead of a single random start.
clusters <- kmeans(data, centers = 3, nstart = 10)

# Show the cluster centers (mean of each feature per cluster) as a table.
kable(clusters$centers, format = "markdown")
N_Transactions Total_Amount Average_Amount Median_Amount Q1 Q3 Max Min
45.800000 9815891.80 368775.941 201398.300 83294.000 447225.200 2014081.70 7218.30
40.129032 2388711.81 201974.839 135286.081 88611.315 233286.347 725743.82 56989.39
6.759249 49840.84 8369.098 7166.998 5589.042 9811.988 17909.04 4428.99

Sizes of the 3 clusters

# Cluster labels and the share of customers (in percent, 2 decimals) that
# falls into each cluster.
k <- c("1", "2", "3")
proportions <- round(prop.table(clusters$size) * 100, digits = 2)
data.frame(Cluster = k, Proportions = proportions)
##   Cluster Proportions
## 1       1        0.53
## 2       2        3.29
## 3       3       96.18
# Same cluster-share table, rendered as a markdown table.
kable(
  data.frame(Cluster = k, Proportions = proportions),
  format = "markdown"
)
Cluster Proportions
1 0.53
2 3.29
3 96.18

Plot the 3 clusters

# Draw the cluster plot for the feature table, coloured and shaded by the
# k-means assignment.
clusplot(
  data,
  clusters$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 1,
  lines = 0
)

Histogram of the Transaction Amount by Cluster

# Attach each customer's cluster label to the per-customer summary ...
c_df$Cluster <- clusters$cluster
# ... keep only the customer key and its cluster ...
c_df2 <- c_df[, c("CODIGO", "Cluster")]
# ... and join the label onto every transaction (merge uses the columns the
# two tables have in common).
df <- merge(df, c_df2)
# Treat the cluster as a categorical variable for plotting.
df$Cluster <- as.factor(df$Cluster)

# Density histogram of transaction amounts, one panel per cluster, with the
# x axis zoomed to 0-10000 and the tick labels rotated.
ggplot(df, aes(x = Amount)) +
  geom_histogram(aes(y = ..density..), bins = 1000) +
  facet_grid(. ~ Cluster) +
  coord_cartesian(xlim = c(0, 10000)) +
  ggtitle("Histogram of Transaction Amount by Cluster") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Boxplot of the Transaction Amount by Cluster

# Box plot of transaction amounts per cluster; the y axis is zoomed to
# 0-500000 without dropping the points outside that window.
ggplot(df, aes(x = Cluster, y = Amount)) +
  geom_boxplot(aes(fill = Cluster)) +
  coord_cartesian(ylim = c(0, 500000)) +
  ggtitle("Box Plot of Transactions by Cluster")

Scatter Plot of the Transaction Amount by Cluster

# Every transaction plotted by date and amount, coloured by cluster.
ggplot(df, aes(x = Transaction_Date, y = Amount)) +
  geom_point(aes(color = Cluster)) +
  xlab("Transaction Date") +
  ylab("Amount") +
  ggtitle("Scatter Plot of Transaction Amount by Cluster")

## Same scatter plot, with the point shape encoding the customer type
ggplot(df, aes(x = Transaction_Date, y = Amount)) +
  geom_point(aes(color = Cluster, shape = Customer_Type)) +
  xlab("Transaction Date") +
  ylab("Amount") +
  ggtitle("Scatter Plot of Transaction Amount by Cluster and Customer Type")

## Scatter plot by cluster with a smoothed trend line per cluster
ggplot(df, aes(x = Transaction_Date, y = Amount, group = Cluster)) +
  geom_point(aes(color = Cluster)) +
  stat_smooth(aes(color = Cluster)) +
  xlab("Transaction Date") +
  ylab("Amount") +
  ggtitle("Scatter Plot of Transaction Amount by Cluster")

Time Series Plot of the Transaction Amount by Cluster

# Average transaction amount per date and cluster ...
tdf <- df %>%
  group_by(Transaction_Date, Cluster) %>%
  summarise(AvgAmount = mean(Amount))
# ... drawn as one line per cluster over time.
ggplot(tdf, aes(x = Transaction_Date, y = AvgAmount, group = Cluster)) +
  geom_line(aes(color = Cluster)) +
  xlab("Transaction Date") +
  ylab("Amount") +
  ggtitle("Time Series of Transaction Amount by Cluster")