Clustering

# Packages Preparation
library( ggplot2 )
library(flexclust)

## Loading required package: grid

## Loading required package: lattice

## Loading required package: modeltools

## Loading required package: stats4

library( rpart )
library( rpart.plot )

# Step 1 - Reading Data
wk7 <- read.csv(file.choose())
str(wk7)

## 'data.frame':    5960 obs. of  29 variables:
##  $ TARGET_BAD_FLAG    : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ TARGET_LOSS_AMT    : int  641 1109 767 1425 0 335 1841 373 1217 1523 ...
##  $ LOAN               : int  1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
##  $ IMP_MORTDUE        : num  25860 70053 13500 65000 97800 ...
##  $ M_MORTDUE          : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ IMP_VALUE          : num  39025 68400 16700 89000 112000 ...
##  $ M_VALUE            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_YOJ            : num  10.5 7 4 7 3 9 5 11 3 16 ...
##  $ M_YOJ              : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DEROG          : int  0 0 0 1 0 0 3 0 0 0 ...
##  $ M_DEROG            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DELINQ         : int  0 2 0 1 0 0 2 0 2 0 ...
##  $ M_DELINQ           : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_CLAGE          : num  94.4 121.8 149.5 174 93.3 ...
##  $ M_CLAGE            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_NINQ           : int  1 0 1 1 0 1 1 0 1 0 ...
##  $ M_NINQ             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_CLNO           : int  9 14 10 20 14 8 17 8 12 13 ...
##  $ M_CLNO             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DEBTINC        : num  35 35 35 35 35 ...
##  $ M_DEBTINC          : int  1 1 1 1 1 0 1 0 1 1 ...
##  $ FLAG.Job.Mgr       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Job.Office    : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ FLAG.Job.Other     : int  1 1 1 0 0 1 1 1 1 0 ...
##  $ FLAG.Job.ProfExe   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Job.Sales     : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ FLAG.Job.Self      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Reason.DebtCon: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Reason.HomeImp: int  1 1 1 0 1 1 1 1 1 1 ...

summary(wk7)

##  TARGET_BAD_FLAG  TARGET_LOSS_AMT      LOAN        IMP_MORTDUE    
##  Min.   :0.0000   Min.   :    0   Min.   : 1100   Min.   :  2063  
##  1st Qu.:0.0000   1st Qu.:    0   1st Qu.:11100   1st Qu.: 48139  
##  Median :0.0000   Median :    0   Median :16300   Median : 65000  
##  Mean   :0.1995   Mean   : 2676   Mean   :18608   Mean   : 72999  
##  3rd Qu.:0.0000   3rd Qu.:    0   3rd Qu.:23300   3rd Qu.: 88200  
##  Max.   :1.0000   Max.   :78987   Max.   :89900   Max.   :399550  
##    M_MORTDUE         IMP_VALUE         M_VALUE           IMP_YOJ      
##  Min.   :0.00000   Min.   :  8000   Min.   :0.00000   Min.   : 0.000  
##  1st Qu.:0.00000   1st Qu.: 66490   1st Qu.:0.00000   1st Qu.: 3.000  
##  Median :0.00000   Median : 89000   Median :0.00000   Median : 7.000  
##  Mean   :0.08691   Mean   :101536   Mean   :0.01879   Mean   : 8.756  
##  3rd Qu.:0.00000   3rd Qu.:119005   3rd Qu.:0.00000   3rd Qu.:12.000  
##  Max.   :1.00000   Max.   :855909   Max.   :1.00000   Max.   :41.000  
##      M_YOJ           IMP_DEROG          M_DEROG         IMP_DELINQ    
##  Min.   :0.00000   Min.   : 0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.: 0.000  
##  Median :0.00000   Median : 0.0000   Median :0.0000   Median : 0.000  
##  Mean   :0.08641   Mean   : 0.3431   Mean   :0.1188   Mean   : 0.503  
##  3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.: 1.000  
##  Max.   :1.00000   Max.   :10.0000   Max.   :1.0000   Max.   :15.000  
##     M_DELINQ         IMP_CLAGE         M_CLAGE           IMP_NINQ    
##  Min.   :0.00000   Min.   :   0.0   Min.   :0.00000   Min.   : 0.00  
##  1st Qu.:0.00000   1st Qu.: 117.4   1st Qu.:0.00000   1st Qu.: 0.00  
##  Median :0.00000   Median : 174.0   Median :0.00000   Median : 1.00  
##  Mean   :0.09732   Mean   : 179.5   Mean   :0.05168   Mean   : 1.17  
##  3rd Qu.:0.00000   3rd Qu.: 227.1   3rd Qu.:0.00000   3rd Qu.: 2.00  
##  Max.   :1.00000   Max.   :1168.2   Max.   :1.00000   Max.   :17.00  
##      M_NINQ           IMP_CLNO         M_CLNO         IMP_DEBTINC      
##  Min.   :0.00000   Min.   : 0.00   Min.   :0.00000   Min.   :  0.5245  
##  1st Qu.:0.00000   1st Qu.:15.00   1st Qu.:0.00000   1st Qu.: 30.7632  
##  Median :0.00000   Median :20.00   Median :0.00000   Median : 35.0000  
##  Mean   :0.08557   Mean   :21.25   Mean   :0.03725   Mean   : 34.0393  
##  3rd Qu.:0.00000   3rd Qu.:26.00   3rd Qu.:0.00000   3rd Qu.: 37.9499  
##  Max.   :1.00000   Max.   :71.00   Max.   :1.00000   Max.   :203.3122  
##    M_DEBTINC       FLAG.Job.Mgr    FLAG.Job.Office  FLAG.Job.Other  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2126   Mean   :0.1287   Mean   :0.1591   Mean   :0.4007  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  FLAG.Job.ProfExe FLAG.Job.Sales    FLAG.Job.Self     FLAG.Reason.DebtCon
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000     
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000     
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :1.0000     
##  Mean   :0.2141   Mean   :0.01829   Mean   :0.03238   Mean   :0.6591     
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000     
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000     
##  FLAG.Reason.HomeImp
##  Min.   :0.0000     
##  1st Qu.:0.0000     
##  Median :0.0000     
##  Mean   :0.2987     
##  3rd Qu.:1.0000     
##  Max.   :1.0000

head(wk7)

# STEP 2 - PCA Analysis
df_pca = wk7
df_pca$TARGET_BAD_FLAG = NULL
df_pca$TARGET_LOSS_AMT = NULL

pca = prcomp(df_pca[,c(1,2,4,6,8,10,12,14,16,18)],center=TRUE, scale=TRUE)
summary(pca)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.4905 1.2085 1.1163 1.0009 0.97918 0.91572 0.86520
## Proportion of Variance 0.2222 0.1461 0.1246 0.1002 0.09588 0.08385 0.07486
## Cumulative Proportion  0.2222 0.3682 0.4928 0.5930 0.68889 0.77274 0.84760
##                            PC8     PC9    PC10
## Standard deviation     0.83568 0.79387 0.44203
## Proportion of Variance 0.06984 0.06302 0.01954
## Cumulative Proportion  0.91744 0.98046 1.00000

# Scree Plot
plot(pca, type = "l")

df_new = data.frame(predict( pca, df_pca ))

# Based on the Scree plot, using 4 PCs as the eigenvalues are over 1
# The first 4 PC are in a decreasing weights. (PC1>PC2>PC3>PC4)

# Scatter plot using the first two Principal Components. 
df_kmeans = df_new[1:2]
print( head( df_kmeans ) )

##          PC1        PC2
## 1 -2.4361630 -0.2914953
## 2 -1.2657133  0.3930930
## 3 -2.6621119 -0.1696773
## 4 -0.7828377  0.8659403
## 5 -0.5746093 -0.2924981
## 6 -2.3178901 -0.2111695

plot( df_kmeans$PC1, df_kmeans$PC2 )

# STEP 3 - Cluster Analysis - Find the Number of Clusters
df_kmeans = df_new[1:2]
# Maximum Clusters To Search
MAX_N = 10

# Set up an array to hold the Sum of Square Errors
WSS = numeric( MAX_N )

for ( N in 1:MAX_N ) 
{
  km = kmeans( df_kmeans, centers=N, nstart=20  )
  WSS[N] = km$tot.withinss
}

## Warning: did not converge in 10 iterations

df_wss = as.data.frame( WSS )
df_wss$clusters = 1:MAX_N

scree_plot = ggplot( df_wss, aes( x=clusters, y=WSS, group=1 )) +
  geom_point( size=4 ) +
  geom_line() +
  scale_x_continuous( breaks=c(2,4,6,8,10)) +
  xlab("Number of Clusters")

scree_plot

# According to the scree plot, 4 clusters would be optimum

# STEP 4 - Cluster Analysis

BEST_N = 4
km = kmeans( df_kmeans, centers=BEST_N, nstart=20  )

print( km$size )

## [1] 2671  694 1991  604

print( km$centers )

##          PC1        PC2
## 1 -1.0891828 -0.3035157
## 2 -0.1673616  2.3202858
## 3  0.5570299 -0.3656450
## 4  3.1726984 -0.1185246

# Convert into "flexclust" clusters and print bar chart
kf = as.kcca( object=km, data=df_kmeans, save.data=TRUE )
kfi = kcca2df( kf )
agg = aggregate( kfi$value, list( kfi$variable, kfi$group ), FUN=mean )

barplot(kf)

# Cluster 2 & 3 have most of the data where PC1 and PC2 are similar.
# Cluster 1 has most large PC2 data and Cluster 4 has most of the large PC1 data.

clus = predict( kf, df_kmeans )
plot( df_kmeans$PC1, df_kmeans$PC2, col=clus )
legend( x="topleft", legend=c(1:BEST_N), fill=c(1:BEST_N) )

wk7$CLUSTER = clus
agg = aggregate( wk7$TARGET_BAD_FLAG, list( wk7$CLUSTER ), FUN=mean )
agg

# Yes, the clusters predict loan default. Based on the clusters plot,
# when PC1 and PC2 are low, which means they sit in Cluster 1, and 
# they are mostly like to default on loan.


# Step 5 - Describe the Clusters Using Decision Trees
df_tree = df_pca
df_tree$CLUSTER = as.factor(clus)

dt = rpart( CLUSTER ~ . , data=df_tree )
dt = rpart( CLUSTER ~ . , data=df_tree, maxdepth=3 )

rpart.plot( dt )

# Cluster 3 and 4 tend to be good loan as they have more expensive house,
# and more mortgage, potentially meaning they are wealthy and could afford
# better and plan well for their loan. On the other hand, cluster 1 demonstrated
# that people with cheaper houses, more credits line, and more inquires tend
# to be bad loaners.

# STEP 6 - Comment

# Based on the decision tree on clusters, we can tell that people 
# have more expensive house- and more mortgage, 
# potentially meaning they are wealthy and could afford
# better and they tend to plan well for their loan. 
# On the other hand, people with cheaper houses, more credits line, 
# and more inquires tend to be bad loaners, which makes sense as
# these factors demonstrated higher risk of people being not financial healthy.