Load the data and remove the customer with CODIGO = 1981
# Read the semicolon-delimited transaction file.
df <- read.csv("dataset3.csv", sep = ";")
# Keep every row whose CODIGO is known and is not customer 1981.
df <- df[!is.na(df$CODIGO) & df$CODIGO != "1981", ]
# Transaction dates are parsed as day-month-two-digit-year.
df[["Transaction_Date"]] <- as.Date(
  df[["Transaction_Date"]],
  format = "%d-%m-%y"
)
The structure of the dataset is the following:
# Print a compact overview of df: one line per column with its type and first values.
str(df)
## 'data.frame': 15187 obs. of 13 variables:
## $ Transaction_Date : Date, format: "2015-10-05" "2015-10-06" ...
## $ CODIGO : Factor w/ 1884 levels "10","1010","1017",..: 1162 1162 1162 1162 1162 1162 1162 1162 1162 1162 ...
## $ Client_Sub_Type : Factor w/ 49 levels "ACCIONISTA ( NEGOCIO / PROPIO)",..: 45 45 45 45 45 45 45 45 45 45 ...
## $ Business_Type : Factor w/ 188 levels "","Administrative Management and General Management Consulting Services",..: 1 1 1 1 1 1 1 1 1 140 ...
## $ Customer_Type : Factor w/ 2 levels "B","P": 2 2 2 2 2 2 2 2 2 2 ...
## $ Customer_Country : Factor w/ 29 levels " ","AE","AR",..: 28 28 28 28 28 28 28 28 28 28 ...
## $ Account_Risk : int NA NA NA NA NA NA NA NA NA 1 ...
## $ Customer_Risk : int 1 1 1 1 1 1 1 1 1 1 ...
## $ TransactionReference : Factor w/ 17311 levels "146- TRANSF. ENVIADA A 3ERO EN ARCA (47897)",..: 4283 12566 12201 4453 12975 13156 13204 5006 838 5261 ...
## $ Transaction.Description: Factor w/ 5 levels "Checks Deposited",..: 2 5 5 2 5 5 5 2 3 2 ...
## $ Amount : int 941 1000 15059 200000 28000 1100 1400 18000 31413 20000 ...
## $ Source.Reference : Factor w/ 7689 levels "","(56543) SALDO A SU FAVOR",..: 6527 2061 5193 1994 2250 6523 2121 6520 2087 2865 ...
## $ Originator_Country : Factor w/ 58 levels "","AE","AI","AN",..: 8 56 56 57 56 56 56 53 56 56 ...
Choose the number of clusters from per-customer transaction summaries (elbow method)
# Summarise each customer's (CODIGO) transactions into numeric features that
# k-means can cluster on: count, total, mean, median, quartiles, max and min
# of Amount.
c_df <- df %>%
  group_by(CODIGO) %>%
  summarise(
    N_Transactions = n(),
    Total_Amount = sum(Amount),
    Average_Amount = mean(Amount),
    Median_Amount = median(Amount),
    Q1 = quantile(Amount, 0.25),
    Q3 = quantile(Amount, 0.75),
    Max = max(Amount),
    Min = min(Amount)
  )

# Feature table: every summary column except the leading CODIGO identifier.
data <- c_df[, 2:ncol(c_df)]

k.max <- 15 # Maximal number of clusters

# kmeans() starts from randomly chosen centers; fixing the seed makes the
# elbow curve reproducible from run to run.
set.seed(123)

# Total within-cluster sum of squares for k = 1, ..., k.max.
# vapply() guarantees a plain numeric vector of length k.max, whereas
# sapply() can silently change its return type.
wss <- vapply(
  seq_len(k.max),
  function(k) kmeans(data, k, nstart = 10)$tot.withinss,
  numeric(1)
)

# Elbow plot; the dashed vertical line marks the chosen k = 3.
plot(seq_len(k.max), wss,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
abline(v = 3, lty = 2)

Fit the 3 clusters and show their centers
# Fit the final 3-cluster model on the per-customer features.
# The seed makes the fit (and therefore the cluster numbering used by the
# tables and plots that follow) reproducible; nstart = 10 keeps the best of
# ten random starts, matching the setting used for the elbow curve instead of
# relying on a single random start.
set.seed(123)
clusters <- kmeans(data, centers = 3, nstart = 10)

# One row per cluster, one column per feature.
kable(clusters$centers, format = "markdown")
| N_Transactions | Total_Amount | Average_Amount | Median_Amount | Q1 | Q3 | Max | Min |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 45.800000 | 9815891.80 | 368775.941 | 201398.300 | 83294.000 | 447225.200 | 2014081.70 | 7218.30 |
| 40.129032 | 2388711.81 | 201974.839 | 135286.081 | 88611.315 | 233286.347 | 725743.82 | 56989.39 |
| 6.759249 | 49840.84 | 8369.098 | 7166.998 | 5589.042 | 9811.988 | 17909.04 | 4428.99 |
Sizes of 3 clusters
# Cluster labels "1", "2", ... derived from the fitted model, so the table
# stays valid if the number of clusters is changed.
k <- as.character(seq_along(clusters$size))
# Share of customers in each cluster, as a percentage rounded to 2 decimals.
proportions <- round(clusters$size / sum(clusters$size) * 100, 2)
data.frame(Cluster = k, Proportions = proportions)
## Cluster Proportions
## 1 1 0.53
## 2 2 3.29
## 3 3 96.18
# Render the cluster/proportion table as a markdown table.
kable(
  data.frame(Cluster = k, Proportions = proportions),
  format = "markdown"
)
Plot the 3 clusters
# 2-D cluster plot of the customer features: coloured, shaded ellipses,
# labels = 1, and no connecting lines between clusters (lines = 0).
clusplot(
  data,
  clusters$cluster,
  lines = 0,
  shade = TRUE,
  color = TRUE,
  labels = 1
)

Histogram of the Transaction Amount by Cluster
# Record each customer's cluster assignment next to its summary row.
c_df$Cluster <- clusters$cluster
# Lookup table: customer id -> cluster.
c_df2 <- c_df[, c("CODIGO", "Cluster")]
# Attach the cluster label to every transaction. The join key is named
# explicitly so the merge cannot silently pick up any other column name the
# two tables might happen to share.
df <- merge(df, c_df2, by = "CODIGO")
# Store the label as a factor so the plots treat it as a discrete group.
df$Cluster <- as.factor(df$Cluster)
# Density histogram of Amount, one panel per cluster; the view is zoomed to
# amounts between 0 and 10000 without dropping data (coord_cartesian).
ggplot(df, aes(x = Amount)) +
  geom_histogram(aes(y = ..density..), bins = 1000) +
  facet_grid(. ~ Cluster) +
  coord_cartesian(xlim = c(0, 10000)) +
  ggtitle("Histogram of Transaction Amount by Cluster") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Boxplot of the Transaction Amount by Cluster
# Box plot of Amount per cluster, filled by cluster; the y-axis view is zoomed
# to 0-500000 without dropping data.
ggplot(df, aes(x = Cluster, y = Amount)) +
  geom_boxplot(aes(fill = Cluster)) +
  coord_cartesian(ylim = c(0, 500000)) +
  ggtitle("Box Plot of Transactions by Cluster")

Scatter Plot of the Transaction Amount by Cluster
# Every transaction plotted by date and amount, coloured by cluster.
ggplot(df, aes(x = Transaction_Date, y = Amount)) +
  geom_point(aes(color = Cluster)) +
  xlab("Transaction Date") +
  ylab("Amount") +
  ggtitle("Scatter Plot of Transaction Amount by Cluster")

## Adding the Customer Type
# Same scatter plot, with point shape encoding Customer_Type in addition to
# colour encoding the cluster.
ggplot(df, aes(x = Transaction_Date, y = Amount)) +
  geom_point(aes(color = Cluster, shape = Customer_Type)) +
  xlab("Transaction Date") +
  ylab("Amount") +
  ggtitle("Scatter Plot of Transaction Amount by Cluster and Customer Type")

## Adding a Smooth Line
# Scatter plot by cluster with one smoothed trend line per cluster
# (grouping and colour both follow Cluster).
ggplot(df, aes(x = Transaction_Date, y = Amount, group = Cluster)) +
  geom_point(aes(color = Cluster)) +
  stat_smooth(aes(color = Cluster)) +
  xlab("Transaction Date") +
  ylab("Amount") +
  ggtitle("Scatter Plot of Transaction Amount by Cluster")

Time Series Plot of the Transaction Amount by Cluster
# Mean transaction amount per date and cluster.
# summarise() only peels off the last grouping variable, so the result would
# stay grouped by Transaction_Date; ungroup() returns a plain table so later
# operations on tdf are not silently applied per date.
tdf <- df %>%
  group_by(Transaction_Date, Cluster) %>%
  summarise(AvgAmount = mean(Amount)) %>%
  ungroup()

# One line per cluster. The y-axis shows the daily mean, so it is labelled
# "Average Amount" rather than "Amount".
ggplot(tdf, aes(x = Transaction_Date, y = AvgAmount, group = Cluster)) +
  geom_line(aes(color = Cluster)) +
  xlab("Transaction Date") +
  ylab("Average Amount") +
  ggtitle("Time Series of Transaction Amount by Cluster")
