#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, pagin=TRUE, fixedHeader = TRUE, searchHighlight = TRUE))

Introduction

Please read this Kaggle

Data Dictionary

# CUSTID : Identification of Credit Card holder (Categorical)
# 
# BALANCE : Balance amount left in their account to make purchases (
# 
# BALANCEFREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
# 
# PURCHASES : Amount of purchases made from account
# 
# ONEOFFPURCHASES : Maximum purchase amount done in one-go
# 
# INSTALLMENTSPURCHASES : Amount of purchase done in installment
# 
# CASHADVANCE : Cash in advance given by the user
# 
# PURCHASESFREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
# 
# ONEOFFPURCHASESFREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
# 
# PURCHASESINSTALLMENTSFREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
# 
# CASHADVANCEFREQUENCY : How frequently the cash in advance being paid
# 
# CASHADVANCETRX : Number of Transactions made with "Cash in Advanced"
# 
# PURCHASESTRX : Numbe of purchase transactions made
# 
# CREDITLIMIT : Limit of Credit Card for user
# 
# PAYMENTS : Amount of Payment done by user
# 
# MINIMUM_PAYMENTS : Minimum amount of payments made by user
# 
# PRCFULLPAYMENT : Percent of full payment paid by user
# 
# TENURE : Tenure of credit card service for user

In one piped statement:

read in data
rename all colnames lowercase
convert char to factor vars
order cols by name: alphabetically
order cols by datatype: nominal, then numeric

Get Data

a = read_csv('CreditCard_Customers.csv') %>% 
  clean_names() %>% 
  mutate(across(where(is.character),factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(where(is.factor),where(is.numeric))

quick skim

a %>% head %>% DT::datatable()

a %>% glimpse

## Rows: 8,950
## Columns: 18
## $ cust_id                          <fct> C10001, C10002, C10003, C10004, C1...
## $ balance                          <dbl> 40.90075, 3202.46742, 2495.14886, ...
## $ balance_frequency                <dbl> 0.818182, 0.909091, 1.000000, 0.63...
## $ cash_advance                     <dbl> 0.0000, 6442.9455, 0.0000, 205.788...
## $ cash_advance_frequency           <dbl> 0.000000, 0.250000, 0.000000, 0.08...
## $ cash_advance_trx                 <dbl> 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ credit_limit                     <dbl> 1000, 7000, 7500, 7500, 1200, 1800...
## $ installments_purchases           <dbl> 95.40, 0.00, 0.00, 0.00, 0.00, 133...
## $ minimum_payments                 <dbl> 139.50979, 1072.34022, 627.28479, ...
## $ oneoff_purchases                 <dbl> 0.00, 0.00, 773.17, 1499.00, 16.00...
## $ oneoff_purchases_frequency       <dbl> 0.000000, 0.000000, 1.000000, 0.08...
## $ payments                         <dbl> 201.8021, 4103.0326, 622.0667, 0.0...
## $ prc_full_payment                 <dbl> 0.000000, 0.222222, 0.000000, 0.00...
## $ purchases                        <dbl> 95.40, 0.00, 773.17, 1499.00, 16.0...
## $ purchases_frequency              <dbl> 0.166667, 0.000000, 1.000000, 0.08...
## $ purchases_installments_frequency <dbl> 0.083333, 0.000000, 0.000000, 0.00...
## $ purchases_trx                    <dbl> 2, 0, 12, 1, 1, 8, 64, 12, 5, 3, 1...
## $ tenure                           <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12...

There appears to be outliers for male annual income

XXX

remove unecessary col

a = a %>% select(-cust_id)

EDA: num vars

check missing values

miss_var_summary(a)

distribution: viz

DataExplorer::plot_histogram(a, nrow = 3, ncol = 3)

dlookr vignette

outliers: viz

#find names of cols with outliers_ratio > 10%
(a %>% diagnose_outlier() %>% filter(outliers_ratio > 5))

outlier.cols = a %>% diagnose_outlier() %>% filter(outliers_ratio > 5) %>% select(variables) %>% pull()

a %>% plot_outlier(all_of(outlier.cols))

normality: viz

a %>% plot_normality(
  colnames(a)[1],
  colnames(a)[2],
  colnames(a)[3],
  colnames(a)[4],
  colnames(a)[5],
  colnames(a)[6],
  colnames(a)[7],
  colnames(a)[8],
  colnames(a)[9],
  colnames(a)[10],
  colnames(a)[11],
  colnames(a)[12],
  colnames(a)[13],
  colnames(a)[14],
  colnames(a)[15],
  colnames(a)[16],
  colnames(a)[17]
  )

correlations: viz

dlookr::plot_correlate(a)

pairwise correlations: viz

a %>% GGally::ggcorr(low = '#990000', mid = '#E0E0E0', high = '#009900', label = TRUE)

Preprocessing

remove missing rows

a = a %>% na.omit()
abak = a

I feel fine removing only 3.5% of the dataset

transformation: log

#a = abak
a = a %>% map_df(~ifelse(.x == 0, 0.00000001 , .x + 0.00000001))
a.log = a %>% map_df(log)

Because we have zeroes in our data, you need at add a small constant before logging; if you don’t, you’ll get infinite values’’

Sanity Check

DataExplorer::plot_histogram(a.log, nrow = 3, ncol = 3)

create matrix

a.matrix = a.log %>% as.matrix

Determine Optimal Number of Clusters

Reference

silhouette analysis with kmeans and euclidean distancing

factoextra::fviz_nbclust(
  a.matrix,
  diss = dist(a.matrix, method = "euclidean"),
  FUNcluster=kmeans,
  method="silhouette"
  ) +
  theme_classic()

While 2 is what is suggested, it likely makes little business sense to only have 2 clusters; will go with 3, 4, or 5

Clustering via Hierarchial Methods

Reference 1 Reference 2

dist.a = dist(a, method = 'euclidean') #Distance Matrix Computation

#ward.D2, creates groups such that variance is minimized within clusters, dissimilarities are squared before clustering
hclust.a = hclust(dist.a, method = 'ward.D2') #agglomeration method, how to group points
#hclust.a = hclust(del.distanced, method = 'complete') 

plot(hclust.a, hang = -1, cex = 0.8, main = NULL, xlab = NULL)

plot(hclust.a, hang = -1, cex = 0.8, main = NULL, xlab = NULL);abline(h = 220000, col = 'blue', lty = 2) #4 clusters at this height

#At a given height cutoff of x
#Clusters below height x  have a [dist.method] [hclust.method] distance <= height cutoff

hclust.colors = RColorBrewer::brewer.pal(5, 'Paired') #choose # of cols corresponding an optimal number of clusters

hclust.clusters = cutree(hclust.a, 5)

plot(as.phylo(hclust.a), type = 'fan', tip.color = hclust.colors[hclust.clusters],
     label.offset = 2, cex = 1.0)

plot(ape::as.phylo(hclust.a), type = 'unrooted', tip.color = hclust.colors[hclust.clusters],
     label.offset = 2, cex = 1.0, no.margin = TRUE)

Clusters below a height cutoff of 210000 have a kmeans euclidean distance <= height cutoff

Looking at the dendogram, 9 clusters might be ‘over-clustering’; will stick with 5 clusters

Clustering via Centroid Based Methods: Kmeans

Kmeans

km = eclust(
  a.matrix,
  FUNcluster="kmeans",
  k=5,
  hc_metric = "euclidean",
  hc_method = "ward.D2",
  )

Summarize Clustering Results

summary: data

a = a %>% mutate(cluster = factor(km$cluster, levels = 1:5))

a %>%
  group_by(cluster) %>%
  summarise(
      mean.balance = mean(balance, na.rm = TRUE),
      mean.balance_frequency = mean(balance_frequency, na.rm = TRUE),
      mean.cash_advance = mean(cash_advance, na.rm = TRUE),
      mean.cash_advance_frequency = mean(cash_advance_frequency, na.rm = TRUE),
      mean.cash_advance_trx = mean(cash_advance_trx, na.rm = TRUE),
      mean.credit_limit = mean(credit_limit, na.rm = TRUE),
      mean.installments_purchases = mean(installments_purchases, na.rm = TRUE),
      mean.minimum_payments = mean(minimum_payments, na.rm = TRUE),
      mean.oneoff_purchases = mean(oneoff_purchases, na.rm = TRUE),
      mean.oneoff_purchases_frequency = mean(oneoff_purchases_frequency, na.rm = TRUE),
      mean.payments = mean(payments, na.rm = TRUE),
      mean.prc_full_payment = mean(prc_full_payment, na.rm = TRUE),
      mean.purchases = mean(purchases, na.rm = TRUE),
      mean.purchases_frequency = mean(purchases_frequency, na.rm = TRUE),
      mean.purchases_installments_frequency = mean(purchases_installments_frequency, na.rm = TRUE),
      mean.purchases_trx = mean(purchases_trx, na.rm = TRUE),
      mean.tenure = mean(tenure, na.rm = TRUE),
      count = n()
  )

summary: viz

library(ggthemes)
DataExplorer::plot_boxplot(a, by = 'cluster', nrow = 3, ncol = 1)

summary: viz

xvars = a %>% select(-cluster) %>% names %>% as.character()

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[2])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[2],' by cluster'), xaxis = list(title = xvars[2]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[3])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[3],' by cluster'), xaxis = list(title = xvars[3]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[4])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[4],' by cluster'), xaxis = list(title = xvars[4]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[5])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[5],' by cluster'), xaxis = list(title = xvars[5]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[6])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[6],' by cluster'), xaxis = list(title = xvars[6]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[7])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[7],' by cluster'), xaxis = list(title = xvars[7]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[8])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[8],' by cluster'), xaxis = list(title = xvars[8]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[9])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[9],' by cluster'), xaxis = list(title = xvars[9]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[10])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[10],' by cluster'), xaxis = list(title = xvars[10]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[11])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[11],' by cluster'), xaxis = list(title = xvars[11]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[12])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[12],' by cluster'), xaxis = list(title = xvars[12]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[13])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[13],' by cluster'), xaxis = list(title = xvars[13]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[14])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[14],' by cluster'), xaxis = list(title = xvars[14]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[15])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[15],' by cluster'), xaxis = list(title = xvars[15]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[16])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[16],' by cluster'), xaxis = list(title = xvars[16]))

a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[17])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[17],' by cluster'), xaxis = list(title = xvars[17]))

Credit Card Customer Clustering

Jeremiah W

October 29, 2020

Introduction

Data Dictionary

Get Data

quick skim

There appears to be outliers for male annual income

XXX

remove unecessary col

EDA: num vars

check missing values

distribution: viz

outliers: viz

normality: viz

correlations: viz

pairwise correlations: viz

Preprocessing

remove missing rows

transformation: log

Sanity Check

create matrix

Determine Optimal Number of Clusters

silhouette analysis with kmeans and euclidean distancing

While 2 is what is suggested, it likely makes little business sense to only have 2 clusters; will go with 3, 4, or 5

Clustering via Hierarchial Methods

Clusters below a height cutoff of 210000 have a kmeans euclidean distance <= height cutoff

Looking at the dendogram, 9 clusters might be ‘over-clustering’; will stick with 5 clusters

Clustering via Centroid Based Methods: Kmeans

Kmeans

Summarize Clustering Results

summary: data

summary: viz

summary: viz