R Markdown

The German credit data contains attributes and outcomes on 1,000 loan applications.

**Load data**

# change the working directory to where germancredit.csv is located
# setwd("~/..")

# load the German credit data (comma-separated, first row holds column names);
# spell out TRUE because the shorthand `T` is an ordinary, reassignable variable
german <- read.csv("germancredit.csv", sep = ",", header = TRUE)

# preview the first three rows of the German credit data
head(german, 3)
##   Default checkingstatus1 duration history purpose amount savings employ
## 1       0             A11        6     A34     A43   1169     A65    A75
## 2       1             A12       48     A32     A43   5951     A61    A73
## 3       0             A14       12     A34     A46   2096     A61    A74
##   installment status others residence property age otherplans housing
## 1           4    A93   A101         4     A121  67       A143    A152
## 2           2    A92   A101         2     A121  22       A143    A152
## 3           2    A93   A101         3     A121  49       A143    A152
##   cards  job liable tele foreign
## 1     2 A173      1 A192    A201
## 2     1 A173      1 A191    A201
## 3     1 A172      2 A191    A201
## Q1) Limit the columns to duration, amount, installment, and age in this analysis, along with loan history, purpose, and housing (rent)
# keep the four numeric attributes, three categorical attributes, and the label
german <- german[c(
  "duration", "amount", "installment", "age",
  "history", "purpose", "housing", "Default"
)]

# show the first rows of the reduced data frame
head(german)
##   duration amount installment age history purpose housing Default
## 1        6   1169           4  67     A34     A43    A152       0
## 2       48   5951           2  22     A32     A43    A152       1
## 3       12   2096           2  49     A34     A46    A152       0
## 4       42   7882           2  45     A32     A42    A153       0
## 5       24   4870           3  53     A33     A40    A153       1
## 6       36   9055           2  35     A32     A46    A153       0

**Data engineering for k-means**

# select only duration, amount, and installment (plus the Default label)
german <- german[, c("duration", "amount", "installment", "Default")]

# K-means cannot handle missing values, so drop any row with a missing feature.
# The rows are removed from `german` itself (not just from the feature matrix)
# so that row i of `german` always corresponds to cluster assignment i.
feature_cols <- c("duration", "amount", "installment")
german <- german[complete.cases(german[, feature_cols]), ]

# unlike KNN, K-means is unsupervised learning, so the label is left out of the
# feature matrix; standardize each column (mean 0, sd 1) so that no single
# variable dominates the distance computation
features <- scale(german[, feature_cols])

# `features` is already standardized; this name is kept as an alias so any code
# that refers to `standard.features` keeps working
standard.features <- features

k-means clustering with three continuous variables: duration, amount, and installment

# fix the RNG seed: kmeans() chooses its starting centers at random, so without
# a seed the cluster numbering (1 vs 2) can differ from run to run
set.seed(123)

# fit k-means with 2 clusters on the feature matrix, using 25 random starting
# sets of centers (nstart = 25)
k.means <- kmeans(features, centers = 2, nstart = 25)

# inspect the structure of the fitted object
str(k.means)
## List of 9
##  $ cluster     : int [1:1000] 1 2 1 2 1 2 1 2 1 2 ...
##  $ centers     : num [1:2, 1:3] -0.369 1.387 -0.396 1.491 0.083 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:3] "duration" "amount" "installment"
##  $ totss       : num 2997
##  $ withinss    : num [1:2] 1215 653
##  $ tot.withinss: num 1869
##  $ betweenss   : num 1128
##  $ size        : int [1:2] 790 210
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

**Plot k-means**

# install factoextra only when it is not already available;
# requireNamespace() checks for the package without attaching it
if (!requireNamespace("factoextra", quietly = TRUE)) {
  install.packages("factoextra", dependencies = TRUE)
}
## Loading required package: factoextra
## Loading required package: ggplot2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# attach factoextra, which provides fviz_cluster()
library(factoextra)

# draw the k-means clusters over the feature matrix
fviz_cluster(object = k.means, data = features)

Then you will use cross-validation with k = 5 for the nearest neighbor

# 2-D cluster plot of the k-means solution (uses the cluster package)
library(cluster)

# plot the same three variables that were clustered; selecting them by name
# avoids positional indexing, which here would drop `duration` and keep the
# `Default` label because `Default` is the last column of `german`
plot_data <- german[, c("duration", "amount", "installment")]

# every plotted row needs exactly one cluster assignment
stopifnot(nrow(plot_data) == length(k.means$cluster))

clusplot(
  plot_data,
  k.means$cluster,
  main = 'Compare between original lebel and k-means clusters',
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)