#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, pagin=TRUE, fixedHeader = TRUE, searchHighlight = TRUE))
# CUSTID : Identification of Credit Card holder (Categorical)
#
# BALANCE : Balance amount left in their account to make purchases (
#
# BALANCEFREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
#
# PURCHASES : Amount of purchases made from account
#
# ONEOFFPURCHASES : Maximum purchase amount done in one-go
#
# INSTALLMENTSPURCHASES : Amount of purchase done in installment
#
# CASHADVANCE : Cash in advance given by the user
#
# PURCHASESFREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
#
# ONEOFFPURCHASESFREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
#
# PURCHASESINSTALLMENTSFREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
#
# CASHADVANCEFREQUENCY : How frequently the cash in advance being paid
#
# CASHADVANCETRX : Number of Transactions made with "Cash in Advanced"
#
# PURCHASESTRX : Numbe of purchase transactions made
#
# CREDITLIMIT : Limit of Credit Card for user
#
# PAYMENTS : Amount of Payment done by user
#
# MINIMUM_PAYMENTS : Minimum amount of payments made by user
#
# PRCFULLPAYMENT : Percent of full payment paid by user
#
# TENURE : Tenure of credit card service for user
In one piped statement:
a = read_csv('CreditCard_Customers.csv') %>%
clean_names() %>%
mutate(across(where(is.character),factor)) %>%
select(sort(tidyselect::peek_vars())) %>%
select(where(is.factor),where(is.numeric))
a %>% head %>% DT::datatable()
a %>% glimpse
## Rows: 8,950
## Columns: 18
## $ cust_id <fct> C10001, C10002, C10003, C10004, C1...
## $ balance <dbl> 40.90075, 3202.46742, 2495.14886, ...
## $ balance_frequency <dbl> 0.818182, 0.909091, 1.000000, 0.63...
## $ cash_advance <dbl> 0.0000, 6442.9455, 0.0000, 205.788...
## $ cash_advance_frequency <dbl> 0.000000, 0.250000, 0.000000, 0.08...
## $ cash_advance_trx <dbl> 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ credit_limit <dbl> 1000, 7000, 7500, 7500, 1200, 1800...
## $ installments_purchases <dbl> 95.40, 0.00, 0.00, 0.00, 0.00, 133...
## $ minimum_payments <dbl> 139.50979, 1072.34022, 627.28479, ...
## $ oneoff_purchases <dbl> 0.00, 0.00, 773.17, 1499.00, 16.00...
## $ oneoff_purchases_frequency <dbl> 0.000000, 0.000000, 1.000000, 0.08...
## $ payments <dbl> 201.8021, 4103.0326, 622.0667, 0.0...
## $ prc_full_payment <dbl> 0.000000, 0.222222, 0.000000, 0.00...
## $ purchases <dbl> 95.40, 0.00, 773.17, 1499.00, 16.0...
## $ purchases_frequency <dbl> 0.166667, 0.000000, 1.000000, 0.08...
## $ purchases_installments_frequency <dbl> 0.083333, 0.000000, 0.000000, 0.00...
## $ purchases_trx <dbl> 2, 0, 12, 1, 1, 8, 64, 12, 5, 3, 1...
## $ tenure <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12...
a = a %>% select(-cust_id)
miss_var_summary(a)
#find names of cols with outliers_ratio > 10%
(a %>% diagnose_outlier() %>% filter(outliers_ratio > 5))
outlier.cols = a %>% diagnose_outlier() %>% filter(outliers_ratio > 5) %>% select(variables) %>% pull()
a %>% plot_outlier(all_of(outlier.cols))
a %>% plot_normality(
colnames(a)[1],
colnames(a)[2],
colnames(a)[3],
colnames(a)[4],
colnames(a)[5],
colnames(a)[6],
colnames(a)[7],
colnames(a)[8],
colnames(a)[9],
colnames(a)[10],
colnames(a)[11],
colnames(a)[12],
colnames(a)[13],
colnames(a)[14],
colnames(a)[15],
colnames(a)[16],
colnames(a)[17]
)
dlookr::plot_correlate(a)
a %>% GGally::ggcorr(low = '#990000', mid = '#E0E0E0', high = '#009900', label = TRUE)
a = a %>% na.omit()
abak = a
#a = abak
a = a %>% map_df(~ifelse(.x == 0, 0.00000001 , .x + 0.00000001))
a.log = a %>% map_df(log)
DataExplorer::plot_histogram(a.log, nrow = 3, ncol = 3)
a.matrix = a.log %>% as.matrix
factoextra::fviz_nbclust(
a.matrix,
diss = dist(a.matrix, method = "euclidean"),
FUNcluster=kmeans,
method="silhouette"
) +
theme_classic()
dist.a = dist(a, method = 'euclidean') #Distance Matrix Computation
#ward.D2, creates groups such that variance is minimized within clusters, dissimilarities are squared before clustering
hclust.a = hclust(dist.a, method = 'ward.D2') #agglomeration method, how to group points
#hclust.a = hclust(del.distanced, method = 'complete')
plot(hclust.a, hang = -1, cex = 0.8, main = NULL, xlab = NULL)
plot(hclust.a, hang = -1, cex = 0.8, main = NULL, xlab = NULL);abline(h = 220000, col = 'blue', lty = 2) #4 clusters at this height
#At a given height cutoff of x
#Clusters below height x have a [dist.method] [hclust.method] distance <= height cutoff
hclust.colors = RColorBrewer::brewer.pal(5, 'Paired') #choose # of cols corresponding an optimal number of clusters
hclust.clusters = cutree(hclust.a, 5)
plot(as.phylo(hclust.a), type = 'fan', tip.color = hclust.colors[hclust.clusters],
label.offset = 2, cex = 1.0)
plot(ape::as.phylo(hclust.a), type = 'unrooted', tip.color = hclust.colors[hclust.clusters],
label.offset = 2, cex = 1.0, no.margin = TRUE)
km = eclust(
a.matrix,
FUNcluster="kmeans",
k=5,
hc_metric = "euclidean",
hc_method = "ward.D2",
)
a = a %>% mutate(cluster = factor(km$cluster, levels = 1:5))
a %>%
group_by(cluster) %>%
summarise(
mean.balance = mean(balance, na.rm = TRUE),
mean.balance_frequency = mean(balance_frequency, na.rm = TRUE),
mean.cash_advance = mean(cash_advance, na.rm = TRUE),
mean.cash_advance_frequency = mean(cash_advance_frequency, na.rm = TRUE),
mean.cash_advance_trx = mean(cash_advance_trx, na.rm = TRUE),
mean.credit_limit = mean(credit_limit, na.rm = TRUE),
mean.installments_purchases = mean(installments_purchases, na.rm = TRUE),
mean.minimum_payments = mean(minimum_payments, na.rm = TRUE),
mean.oneoff_purchases = mean(oneoff_purchases, na.rm = TRUE),
mean.oneoff_purchases_frequency = mean(oneoff_purchases_frequency, na.rm = TRUE),
mean.payments = mean(payments, na.rm = TRUE),
mean.prc_full_payment = mean(prc_full_payment, na.rm = TRUE),
mean.purchases = mean(purchases, na.rm = TRUE),
mean.purchases_frequency = mean(purchases_frequency, na.rm = TRUE),
mean.purchases_installments_frequency = mean(purchases_installments_frequency, na.rm = TRUE),
mean.purchases_trx = mean(purchases_trx, na.rm = TRUE),
mean.tenure = mean(tenure, na.rm = TRUE),
count = n()
)
library(ggthemes)
DataExplorer::plot_boxplot(a, by = 'cluster', nrow = 3, ncol = 1)
xvars = a %>% select(-cluster) %>% names %>% as.character()
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[2])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[2],' by cluster'), xaxis = list(title = xvars[2]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[3])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[3],' by cluster'), xaxis = list(title = xvars[3]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[4])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[4],' by cluster'), xaxis = list(title = xvars[4]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[5])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[5],' by cluster'), xaxis = list(title = xvars[5]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[6])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[6],' by cluster'), xaxis = list(title = xvars[6]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[7])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[7],' by cluster'), xaxis = list(title = xvars[7]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[8])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[8],' by cluster'), xaxis = list(title = xvars[8]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[9])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[9],' by cluster'), xaxis = list(title = xvars[9]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[10])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[10],' by cluster'), xaxis = list(title = xvars[10]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[11])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[11],' by cluster'), xaxis = list(title = xvars[11]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[12])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[12],' by cluster'), xaxis = list(title = xvars[12]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[13])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[13],' by cluster'), xaxis = list(title = xvars[13]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[14])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[14],' by cluster'), xaxis = list(title = xvars[14]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[15])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[15],' by cluster'), xaxis = list(title = xvars[15]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[16])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[16],' by cluster'), xaxis = list(title = xvars[16]))
a %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[17])), color = ~cluster, colors = hclust.colors) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[17],' by cluster'), xaxis = list(title = xvars[17]))