Data

setwd("~/Dropbox/Works/Class/Data_Science/R.WD/zmPDSwR")
load("~/Dropbox/Works/Class/Data_Science/R.WD/zmPDSwR/GCDData.RData")
ls()
## [1] "creditdata"  "d"           "model"       "resultframe" "rtab"       
## [6] "tab1"        "tab2"        "vars"
options(width=180)
str(creditdata)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Status.of.existing.checking.account                     : Factor w/ 4 levels "... < 0 DM","... >= 200 DM / salary assignments for at least 1 year",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ Duration.in.month                                       : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ Credit.history                                          : Factor w/ 5 levels "all credits at this bank paid back duly",..: 2 4 2 4 3 4 4 4 4 2 ...
##  $ Purpose                                                 : Factor w/ 10 levels "business","car (new)",..: 8 8 5 6 2 5 6 3 8 2 ...
##  $ Credit.amount                                           : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ Savings.account.bonds                                   : Factor w/ 5 levels ".. >= 1000 DM",..: 5 2 2 2 2 5 4 2 1 2 ...
##  $ Present.employment.since                                : Factor w/ 5 levels ".. >= 7 years",..: 1 3 4 4 3 3 1 3 4 5 ...
##  $ Installment.rate.in.percentage.of.disposable.income     : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ Personal.status.and.sex                                 : Factor w/ 4 levels "female : divorced/separated/married",..: 4 1 4 4 4 4 4 4 2 3 ...
##  $ Other.debtors.guarantors                                : Factor w/ 3 levels "co-applicant",..: 3 3 3 2 3 3 3 3 3 3 ...
##  $ Present.residence.since                                 : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ Property                                                : Factor w/ 4 levels "if not A121 : building society savings agreement/life insurance",..: 3 3 3 1 4 4 1 2 3 2 ...
##  $ Age.in.years                                            : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ Other.installment.plans                                 : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Housing                                                 : Factor w/ 3 levels "for free","own",..: 2 2 2 1 1 1 2 3 2 2 ...
##  $ Number.of.existing.credits.at.this.bank                 : int  2 1 1 1 2 1 1 1 1 2 ...
##  $ Job                                                     : Factor w/ 4 levels "management/ self-employed/highly qualified employee/ officer",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ Number.of.people.being.liable.to.provide.maintenance.for: int  1 1 2 2 2 2 1 1 1 1 ...
##  $ Telephone                                               : Factor w/ 2 levels "none","yes, registered under the customers name": 2 1 1 1 1 2 1 2 1 1 ...
##  $ foreign.worker                                          : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Good.Loan                                               : Factor w/ 2 levels "BadLoan","GoodLoan": 2 1 2 2 1 2 2 2 2 1 ...
attach(creditdata)
head(Good.Loan, n=10)
##  [1] GoodLoan BadLoan  GoodLoan GoodLoan BadLoan  GoodLoan GoodLoan GoodLoan GoodLoan BadLoan 
## Levels: BadLoan GoodLoan
table(Good.Loan)
## Good.Loan
##  BadLoan GoodLoan 
##      300      700
options(digits=2)
prop.table(table(Good.Loan))
## Good.Loan
##  BadLoan GoodLoan 
##      0.3      0.7
table(Credit.history, Good.Loan)
##                                                             Good.Loan
## Credit.history                                               BadLoan GoodLoan
##   all credits at this bank paid back duly                         28       21
##   critical account/other credits existing (not at this bank)      50      243
##   delay in paying off in the past                                 28       60
##   existing credits paid back duly till now                       169      361
##   no credits taken/all credits paid back duly                     25       15
prop.table(table(Credit.history, Good.Loan), margin=1)
##                                                             Good.Loan
## Credit.history                                               BadLoan GoodLoan
##   all credits at this bank paid back duly                       0.57     0.43
##   critical account/other credits existing (not at this bank)    0.17     0.83
##   delay in paying off in the past                               0.32     0.68
##   existing credits paid back duly till now                      0.32     0.68
##   no credits taken/all credits paid back duly                   0.62     0.38
par(pin=c(4, 4), mai=c(1.0, 4.5, 1.0, 0.5))
CG.percentage <- prop.table(table(Credit.history, Good.Loan), margin=1)
o.GL <- order(CG.percentage[,1])
barplot(t(CG.percentage[o.GL,]),las=1, horiz=TRUE)
title(xlab="Fraction of Defaulted Loans", ylab="")

library(ggplot2)
ggplot(creditdata) + geom_bar(aes(x=Credit.history, fill=Good.Loan), position="fill") + coord_flip()

library(rpart)
model <- rpart(Good.Loan ~ Duration.in.month +
                 Installment.rate.in.percentage.of.disposable.income +
                 Credit.amount +
                 Other.installment.plans,
               data=d, 
               control=rpart.control(maxdepth=4),
               method="class")
model
## n= 1000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 1000 300 GoodLoan (0.30 0.70)  
##    2) Duration.in.month>=34 170  82 GoodLoan (0.48 0.52)  
##      4) Credit.amount< 2.2e+03 8   1 BadLoan (0.88 0.12) *
##      5) Credit.amount>=2.2e+03 162  75 GoodLoan (0.46 0.54)  
##       10) Duration.in.month>=44 69  30 BadLoan (0.57 0.43)  
##         20) Credit.amount< 7.4e+03 37  12 BadLoan (0.68 0.32) *
##         21) Credit.amount>=7.4e+03 32  14 GoodLoan (0.44 0.56) *
##       11) Duration.in.month< 44 93  36 GoodLoan (0.39 0.61) *
##    3) Duration.in.month< 34 830 220 GoodLoan (0.26 0.74)  
##      6) Credit.amount>=1.1e+04 9   0 BadLoan (1.00 0.00) *
##      7) Credit.amount< 1.1e+04 821 210 GoodLoan (0.25 0.75) *
resultframe <- data.frame(Good.Loan=creditdata$Good.Loan,
                          pred=predict(model, type="class"))
rtab <- table(resultframe)
rtab
##           pred
## Good.Loan  BadLoan GoodLoan
##   BadLoan       41      259
##   GoodLoan      13      687
sum(diag(rtab))/sum(rtab)
## [1] 0.73
sum(rtab[1,1])/sum(rtab[,1])
## [1] 0.76
sum(rtab[1,1])/sum(rtab[1,])
## [1] 0.14
sum(rtab[2,1])/sum(rtab[2,])
## [1] 0.019
tab1 <- as.table(matrix(data=c(50,6,0,44),nrow=2,ncol=2))
dimnames(tab1) <- list('loan.as.pct.disposable.income'=
      c('LT.15pct','GT.15pct'),
   'loan.quality.pop1'=
      c('goodloan','badloan'))
tab1
##                              loan.quality.pop1
## loan.as.pct.disposable.income goodloan badloan
##                      LT.15pct       50       0
##                      GT.15pct        6      44
tab2 <- as.table(matrix(data=c(34,18,16,32),nrow=2,ncol=2))
dimnames(tab2) <- list('loan.as.pct.disposable.income'=
      c('LT.15pct','GT.15pct'),
   'loan.quality.pop2'=
      c('goodloan','badloan'))
tab2
##                              loan.quality.pop2
## loan.as.pct.disposable.income goodloan badloan
##                      LT.15pct       34      16
##                      GT.15pct       18      32