# Packages Preparation
library( ggplot2 )
library(flexclust)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: modeltools
## Loading required package: stats4
library( rpart )
library( rpart.plot )
# Step 1 - Reading Data
wk7 <- read.csv(file.choose())
str(wk7)
## 'data.frame': 5960 obs. of 29 variables:
## $ TARGET_BAD_FLAG : int 1 1 1 1 0 1 1 1 1 1 ...
## $ TARGET_LOSS_AMT : int 641 1109 767 1425 0 335 1841 373 1217 1523 ...
## $ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
## $ IMP_MORTDUE : num 25860 70053 13500 65000 97800 ...
## $ M_MORTDUE : int 0 0 0 1 0 0 0 0 0 1 ...
## $ IMP_VALUE : num 39025 68400 16700 89000 112000 ...
## $ M_VALUE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_YOJ : num 10.5 7 4 7 3 9 5 11 3 16 ...
## $ M_YOJ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEROG : int 0 0 0 1 0 0 3 0 0 0 ...
## $ M_DEROG : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DELINQ : int 0 2 0 1 0 0 2 0 2 0 ...
## $ M_DELINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLAGE : num 94.4 121.8 149.5 174 93.3 ...
## $ M_CLAGE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_NINQ : int 1 0 1 1 0 1 1 0 1 0 ...
## $ M_NINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLNO : int 9 14 10 20 14 8 17 8 12 13 ...
## $ M_CLNO : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEBTINC : num 35 35 35 35 35 ...
## $ M_DEBTINC : int 1 1 1 1 1 0 1 0 1 1 ...
## $ FLAG.Job.Mgr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Office : int 0 0 0 0 1 0 0 0 0 0 ...
## $ FLAG.Job.Other : int 1 1 1 0 0 1 1 1 1 0 ...
## $ FLAG.Job.ProfExe : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Sales : int 0 0 0 0 0 0 0 0 0 1 ...
## $ FLAG.Job.Self : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.DebtCon: int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.HomeImp: int 1 1 1 0 1 1 1 1 1 1 ...
summary(wk7)
## TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN IMP_MORTDUE
## Min. :0.0000 Min. : 0 Min. : 1100 Min. : 2063
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:11100 1st Qu.: 48139
## Median :0.0000 Median : 0 Median :16300 Median : 65000
## Mean :0.1995 Mean : 2676 Mean :18608 Mean : 72999
## 3rd Qu.:0.0000 3rd Qu.: 0 3rd Qu.:23300 3rd Qu.: 88200
## Max. :1.0000 Max. :78987 Max. :89900 Max. :399550
## M_MORTDUE IMP_VALUE M_VALUE IMP_YOJ
## Min. :0.00000 Min. : 8000 Min. :0.00000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 66490 1st Qu.:0.00000 1st Qu.: 3.000
## Median :0.00000 Median : 89000 Median :0.00000 Median : 7.000
## Mean :0.08691 Mean :101536 Mean :0.01879 Mean : 8.756
## 3rd Qu.:0.00000 3rd Qu.:119005 3rd Qu.:0.00000 3rd Qu.:12.000
## Max. :1.00000 Max. :855909 Max. :1.00000 Max. :41.000
## M_YOJ IMP_DEROG M_DEROG IMP_DELINQ
## Min. :0.00000 Min. : 0.0000 Min. :0.0000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.000
## Median :0.00000 Median : 0.0000 Median :0.0000 Median : 0.000
## Mean :0.08641 Mean : 0.3431 Mean :0.1188 Mean : 0.503
## 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.: 1.000
## Max. :1.00000 Max. :10.0000 Max. :1.0000 Max. :15.000
## M_DELINQ IMP_CLAGE M_CLAGE IMP_NINQ
## Min. :0.00000 Min. : 0.0 Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.: 117.4 1st Qu.:0.00000 1st Qu.: 0.00
## Median :0.00000 Median : 174.0 Median :0.00000 Median : 1.00
## Mean :0.09732 Mean : 179.5 Mean :0.05168 Mean : 1.17
## 3rd Qu.:0.00000 3rd Qu.: 227.1 3rd Qu.:0.00000 3rd Qu.: 2.00
## Max. :1.00000 Max. :1168.2 Max. :1.00000 Max. :17.00
## M_NINQ IMP_CLNO M_CLNO IMP_DEBTINC
## Min. :0.00000 Min. : 0.00 Min. :0.00000 Min. : 0.5245
## 1st Qu.:0.00000 1st Qu.:15.00 1st Qu.:0.00000 1st Qu.: 30.7632
## Median :0.00000 Median :20.00 Median :0.00000 Median : 35.0000
## Mean :0.08557 Mean :21.25 Mean :0.03725 Mean : 34.0393
## 3rd Qu.:0.00000 3rd Qu.:26.00 3rd Qu.:0.00000 3rd Qu.: 37.9499
## Max. :1.00000 Max. :71.00 Max. :1.00000 Max. :203.3122
## M_DEBTINC FLAG.Job.Mgr FLAG.Job.Office FLAG.Job.Other
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2126 Mean :0.1287 Mean :0.1591 Mean :0.4007
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## FLAG.Job.ProfExe FLAG.Job.Sales FLAG.Job.Self FLAG.Reason.DebtCon
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :1.0000
## Mean :0.2141 Mean :0.01829 Mean :0.03238 Mean :0.6591
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## FLAG.Reason.HomeImp
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2987
## 3rd Qu.:1.0000
## Max. :1.0000
head(wk7)
# STEP 2 - PCA Analysis
df_pca = wk7
df_pca$TARGET_BAD_FLAG = NULL
df_pca$TARGET_LOSS_AMT = NULL
pca = prcomp(df_pca[,c(1,2,4,6,8,10,12,14,16,18)],center=TRUE, scale=TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4905 1.2085 1.1163 1.0009 0.97918 0.91572 0.86520
## Proportion of Variance 0.2222 0.1461 0.1246 0.1002 0.09588 0.08385 0.07486
## Cumulative Proportion 0.2222 0.3682 0.4928 0.5930 0.68889 0.77274 0.84760
## PC8 PC9 PC10
## Standard deviation 0.83568 0.79387 0.44203
## Proportion of Variance 0.06984 0.06302 0.01954
## Cumulative Proportion 0.91744 0.98046 1.00000
# Scree Plot
plot(pca, type = "l")

df_new = data.frame(predict( pca, df_pca ))
# Based on the Scree plot, using 4 PCs as the eigenvalues are over 1
# The first 4 PC are in a decreasing weights. (PC1>PC2>PC3>PC4)
# Scatter plot using the first two Principal Components.
df_kmeans = df_new[1:2]
print( head( df_kmeans ) )
## PC1 PC2
## 1 -2.4361630 -0.2914953
## 2 -1.2657133 0.3930930
## 3 -2.6621119 -0.1696773
## 4 -0.7828377 0.8659403
## 5 -0.5746093 -0.2924981
## 6 -2.3178901 -0.2111695
plot( df_kmeans$PC1, df_kmeans$PC2 )

# STEP 3 - Cluster Analysis - Find the Number of Clusters
df_kmeans = df_new[1:2]
# Maximum Clusters To Search
MAX_N = 10
# Set up an array to hold the Sum of Square Errors
WSS = numeric( MAX_N )
for ( N in 1:MAX_N )
{
km = kmeans( df_kmeans, centers=N, nstart=20 )
WSS[N] = km$tot.withinss
}
## Warning: did not converge in 10 iterations
df_wss = as.data.frame( WSS )
df_wss$clusters = 1:MAX_N
scree_plot = ggplot( df_wss, aes( x=clusters, y=WSS, group=1 )) +
geom_point( size=4 ) +
geom_line() +
scale_x_continuous( breaks=c(2,4,6,8,10)) +
xlab("Number of Clusters")
scree_plot

# According to the scree plot, 4 clusters would be optimum
# STEP 4 - Cluster Analysis
BEST_N = 4
km = kmeans( df_kmeans, centers=BEST_N, nstart=20 )
print( km$size )
## [1] 2671 694 1991 604
print( km$centers )
## PC1 PC2
## 1 -1.0891828 -0.3035157
## 2 -0.1673616 2.3202858
## 3 0.5570299 -0.3656450
## 4 3.1726984 -0.1185246
# Convert into "flexclust" clusters and print bar chart
kf = as.kcca( object=km, data=df_kmeans, save.data=TRUE )
kfi = kcca2df( kf )
agg = aggregate( kfi$value, list( kfi$variable, kfi$group ), FUN=mean )
barplot(kf)

# Cluster 2 & 3 have most of the data where PC1 and PC2 are similar.
# Cluster 1 has most large PC2 data and Cluster 4 has most of the large PC1 data.
clus = predict( kf, df_kmeans )
plot( df_kmeans$PC1, df_kmeans$PC2, col=clus )
legend( x="topleft", legend=c(1:BEST_N), fill=c(1:BEST_N) )

wk7$CLUSTER = clus
agg = aggregate( wk7$TARGET_BAD_FLAG, list( wk7$CLUSTER ), FUN=mean )
agg
# Yes, the clusters predict loan default. Based on the clusters plot,
# when PC1 and PC2 are low, which means they sit in Cluster 1, and
# they are mostly like to default on loan.
# Step 5 - Describe the Clusters Using Decision Trees
df_tree = df_pca
df_tree$CLUSTER = as.factor(clus)
dt = rpart( CLUSTER ~ . , data=df_tree )
dt = rpart( CLUSTER ~ . , data=df_tree, maxdepth=3 )
rpart.plot( dt )

# Cluster 3 and 4 tend to be good loan as they have more expensive house,
# and more mortgage, potentially meaning they are wealthy and could afford
# better and plan well for their loan. On the other hand, cluster 1 demonstrated
# that people with cheaper houses, more credits line, and more inquires tend
# to be bad loaners.
# STEP 6 - Comment
# Based on the decision tree on clusters, we can tell that people
# have more expensive house- and more mortgage,
# potentially meaning they are wealthy and could afford
# better and they tend to plan well for their loan.
# On the other hand, people with cheaper houses, more credits line,
# and more inquires tend to be bad loaners, which makes sense as
# these factors demonstrated higher risk of people being not financial healthy.