Bank Transactions

Load the data and remove the customer with ID (CODIGO) 1981

# Read the semicolon-delimited transaction file.
df <- read.csv("dataset3.csv", sep = ";")
# Keep every row whose customer code is not 1981 (rows with a missing code
# are dropped as well, as subset() would do).
df <- df[which(df$CODIGO != "1981"), ]
# Parse the transaction date, given as day-month-two-digit-year.
df[["Transaction_Date"]] <- as.Date(
  df[["Transaction_Date"]],
  format = "%d-%m-%y"
)

The structure of the dataset is the following:

# Print a compact overview of the data frame: one line per column with its
# type and first few values.
str(df)

## 'data.frame':    15187 obs. of  13 variables:
##  $ Transaction_Date       : Date, format: "2015-10-05" "2015-10-06" ...
##  $ CODIGO                 : Factor w/ 1884 levels "10","1010","1017",..: 1162 1162 1162 1162 1162 1162 1162 1162 1162 1162 ...
##  $ Client_Sub_Type        : Factor w/ 49 levels "ACCIONISTA ( NEGOCIO / PROPIO)",..: 45 45 45 45 45 45 45 45 45 45 ...
##  $ Business_Type          : Factor w/ 188 levels "","Administrative Management and General Management Consulting Services",..: 1 1 1 1 1 1 1 1 1 140 ...
##  $ Customer_Type          : Factor w/ 2 levels "B","P": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Customer_Country       : Factor w/ 29 levels " ","AE","AR",..: 28 28 28 28 28 28 28 28 28 28 ...
##  $ Account_Risk           : int  NA NA NA NA NA NA NA NA NA 1 ...
##  $ Customer_Risk          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ TransactionReference   : Factor w/ 17311 levels "146- TRANSF. ENVIADA A 3ERO EN ARCA (47897)",..: 4283 12566 12201 4453 12975 13156 13204 5006 838 5261 ...
##  $ Transaction.Description: Factor w/ 5 levels "Checks Deposited",..: 2 5 5 2 5 5 5 2 3 2 ...
##  $ Amount                 : int  941 1000 15059 200000 28000 1100 1400 18000 31413 20000 ...
##  $ Source.Reference       : Factor w/ 7689 levels "","(56543) SALDO A SU FAVOR",..: 6527 2061 5193 1994 2250 6523 2121 6520 2087 2865 ...
##  $ Originator_Country     : Factor w/ 58 levels "","AE","AI","AN",..: 8 56 56 57 56 56 56 53 56 56 ...

Determine the number of customer clusters based on their transactions

# Collapse the transactions to one row per customer (CODIGO) holding the
# transaction count and summary statistics of the transaction amounts.
# The column order matters: it is the feature order used for clustering below.
c_df <- df %>%
  group_by(CODIGO) %>%
  summarise(
    N_Transactions = n(),
    Total_Amount = sum(Amount),
    Average_Amount = mean(Amount),
    Median_Amount = median(Amount),
    Q1 = quantile(Amount, 0.25),
    Q3 = quantile(Amount, 0.75),
    Max = max(Amount),
    Min = min(Amount)
  )

# Elbow plot: total within-cluster sum of squares for k = 1..k.max.

# Feature matrix for clustering: every column of c_df except the first
# (the customer code CODIGO).
data <- c_df[, 2:ncol(c_df)]
k.max <- 15 # Maximal number of clusters

# kmeans() starts from randomly chosen centres; fix the seed so the curve
# (and the elbow read off it) is the same on every run.
set.seed(123)

# NOTE(review): the features are clustered unscaled although they live on very
# different scales (transaction counts vs. total amounts), so the large-valued
# columns dominate the distances. Consider scale(data); this would change the
# clusters reported further down, so it is left as-is here.

# vapply() guarantees one numeric value per k (sapply() would silently change
# its return type on unexpected output).
wss <- vapply(
  seq_len(k.max),
  function(k) kmeans(data, k, nstart = 10)$tot.withinss,
  numeric(1)
)

plot(
  seq_len(k.max), wss,
  type = "b", pch = 19, frame.plot = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
# Mark the chosen number of clusters.
abline(v = 3, lty = 2)

Represent the 3 clusters

# Fit the final 3-cluster solution.
# kmeans() uses random initial centres, so without a fixed seed the cluster
# numbering (and, with a single start, possibly the partition itself) changes
# between runs. Seed the RNG and use several random starts, matching the
# nstart = 10 used for the elbow curve.
# NOTE(review): cluster numbers are arbitrary labels; after this change they
# may not match the numbering in previously rendered output.
set.seed(123)
clusters <- kmeans(data, 3, nstart = 10)

# Table of cluster centres (one row per cluster, one column per feature).
kable(clusters$centers, format = "markdown")

N_Transactions	Total_Amount	Average_Amount	Median_Amount	Q1	Q3	Max	Min
45.800000	9815891.80	368775.941	201398.300	83294.000	447225.200	2014081.70	7218.30
40.129032	2388711.81	201974.839	135286.081	88611.315	233286.347	725743.82	56989.39
6.759249	49840.84	8369.098	7166.998	5589.042	9811.988	17909.04	4428.99

Sizes of 3 clusters

# Share of customers in each cluster, in percent rounded to two decimals.
# Labels are derived from the fitted model rather than hard-coded, so the
# table stays correct if the number of clusters is changed.
k <- as.character(seq_along(clusters$size))
proportions <- round(clusters$size / sum(clusters$size) * 100, 2)
data.frame(Cluster = k, Proportions = proportions)

##   Cluster Proportions
## 1       1        0.53
## 2       2        3.29
## 3       3       96.18

# Same cluster-share table, rendered as a markdown table.
data.frame(Cluster = k, Proportions = proportions) %>%
  kable(format = "markdown")

Cluster	Proportions
1	0.53
2	3.29
3	96.18

Plot the 3 clusters

# Bivariate cluster plot of the customer features, coloured and shaded per
# cluster.
clusplot(
  data,
  clus = clusters$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 1,
  lines = 0
)

Histogram of the Transaction Amount by Cluster

# Attach each customer's cluster label to every one of its transactions.
# c_df rows are in the same order as the rows of `data` passed to kmeans(),
# so the assignment vector lines up row by row.
c_df$Cluster <- clusters$cluster
c_df2 <- c_df[, c("CODIGO", "Cluster")]
# Join explicitly on the customer code. Without `by`, merge() joins on every
# shared column name, so re-running this chunk (when df already carries a
# Cluster column) would silently join on Cluster as well.
df <- merge(df, c_df2, by = "CODIGO")
df$Cluster <- as.factor(df$Cluster)

# Density histogram of transaction amounts, one panel per cluster; the view is
# zoomed to amounts between 0 and 10000 without dropping data.
ggplot(df, aes(x = Amount)) +
  geom_histogram(aes(y = ..density..), bins = 1000) +
  facet_grid(. ~ Cluster) +
  coord_cartesian(xlim = c(0, 10000)) +
  labs(title = "Histogram of Transaction Amount by Cluster") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Boxplot of the Transaction Amount by Cluster

# Box plot of transaction amounts per cluster; the y axis is zoomed to
# 0-500000 without dropping data.
ggplot(df, aes(x = Cluster, y = Amount)) +
  geom_boxplot(aes(fill = Cluster)) +
  coord_cartesian(ylim = c(0, 500000)) +
  labs(title = "Box Plot of Transactions by Cluster")

Scatter Plot of the Transaction Amount by Cluster

# Transaction amount over time, points coloured by cluster.
ggplot(df, aes(x = Transaction_Date, y = Amount)) +
  geom_point(aes(color = Cluster)) +
  labs(
    x = "Transaction Date",
    y = "Amount",
    title = "Scatter Plot of Transaction Amount by Cluster"
  )

## Same plot, with the point shape encoding the customer type
ggplot(df, aes(x = Transaction_Date, y = Amount)) +
  geom_point(aes(color = Cluster, shape = Customer_Type)) +
  labs(
    x = "Transaction Date",
    y = "Amount",
    title = "Scatter Plot of Transaction Amount by Cluster and Customer Type"
  )

## Same plot, with a smoothed trend line per cluster
ggplot(df, aes(x = Transaction_Date, y = Amount, group = Cluster)) +
  geom_point(aes(color = Cluster)) +
  stat_smooth(aes(color = Cluster)) +
  labs(
    x = "Transaction Date",
    y = "Amount",
    title = "Scatter Plot of Transaction Amount by Cluster"
  )

Time Series Plot of the Transaction Amount by Cluster

# Mean transaction amount per date and cluster.
tdf <- df %>%
  group_by(Transaction_Date, Cluster) %>%
  summarise(AvgAmount = mean(Amount))

# One line per cluster showing the mean amount over time.
ggplot(tdf, aes(x = Transaction_Date, y = AvgAmount, group = Cluster)) +
  geom_line(aes(color = Cluster)) +
  labs(
    x = "Transaction Date",
    y = "Amount",
    title = "Time Series of Transaction Amount by Cluster"
  )