This is an R Markdown document demonstrating exploratory analysis on German credit data, a free download available from the UCI website here . On the data exploration side, On the data exploration section, we walk through a few visualization examples using popular visualization packages in R and Python.
The dataset has 21 columns showing the financial and demographic information for use in credit scoring examples. The 21st column (goodBad) is the indicator variable on loan defaults, which can be used as the target in supervised learning. The following are the descriptive tags given to these variables:
#Read in the data
cdata <- read.table("german.data", h=F, sep = ' ')
#Add readable column names
colnames(cdata) <- c("chkngAcctStatus", "durationMonths", "creditHistory", "loanPurpose", "creditAmount", "savingsTotal", "crrntEmplmtSince", "instllmtPct", "persnlStatus", "othrDebtorGuaranters", "crrntResidenceSince", "propertyType", "age", "otherInstllmtType", "housingType", "existingCredits","jobStatus", "numDependents", "registeredPhone", "foriegnWorker", "goodBad")
#Change the target variable into categorical with values 'Good' and 'Bad'
cdata$goodBad<-as.factor(ifelse(cdata$goodBad == 1, "Good", "Bad"))
The distribution of the target variable is shown below. The chart is created using Plotly and has elements of interactivity in it. Howering over the bars will display the values:
cdata %>% plot_ly(x = cdata$goodBad, type = "histogram") %>% layout(title = "Histogram of 'Good' vs 'Bad'")
A base plot across Total savings and the target:
plot(cdata$savingsTotal, cdata$goodBad,
main="Savings account/bonds ~ Good-Bad",
xlab="Savings/Bonds",
ylab="Good-Bad", legend = T)
Similar plot using ggplot:
ctable <- as.data.frame(table(cdata$savingsTotal, cdata$goodBad))
names(ctable) <- c('savingsTotal', 'goodBad', 'counts')
ggplot(data=ctable, aes(x=savingsTotal, y=counts, fill=goodBad)) +
geom_bar(stat="identity") + ylab(" ") + ggtitle("goodBad vs savingsTotal")
Again with Plotly - this one shows the median loan duration in months against defaults:
cdata_df <- tbl_df(cdata)
cdata_df %>% group_by(goodBad) %>% summarise(meanMonths = mean(durationMonths), medianMonths = median(durationMonths)) %>% plot_ly(x = ~goodBad, y = ~medianMonths, type = "bar") %>% layout(title = " median loan duration in months vs goodBad")
Scatter plot of Credit Amount and Age with color separation of Good and Bad loans:
cdata_df %>% plot_ly(x = ~age, y = ~creditAmount, color = ~goodBad, colors = c("red", "blue"), type = "scatter", mode = "markers" ) %>% layout(title = "Age vs creditAmount by goodBad")
3D Plots using rgl package - Loan duration, Age, Credit Amount with color separation of Good and Bad loans:
plot3d(cdata$durationMonths, y=cdata$age, z=cdata$creditAmount, type="p", size = 5, col=rainbow(3)[cdata$goodBad], radius=1, xlab = "Months", ylab = "Age",zlab="Credit Amount ")
You must enable Javascript to view this page properly.
Python:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_table("german.data",header = None, sep=' ',names = ['chkngAcctStatus', 'durationMonths', 'creditHistory', 'loanPurpose', 'creditAmount', 'savingsTotal', 'crrntEmplmtSince', 'instllmtPct', 'persnlStatus', 'othrDebtorGuaranters', 'crrntResidenceSince', 'propertyType', 'age', 'otherInstllmtType', 'housingType', 'existingCredits','jobStatus', 'numDependents', 'registeredPhone', 'foriegnWorker', 'goodBad'])
temp = df.pivot_table(values='goodBad',index=['creditHistory'],aggfunc=lambda x: x.map({1:0,2:1}).mean())
fig = plt.figure(figsize=(7,4))
ax2 = fig.add_subplot(111)
temp.plot(kind = 'bar')
ax2.set_xlabel('Credit_History')
ax2.set_ylabel('Bad%')
ax2.set_title("Probability of bad loans by credit history")
#not yet figured out a way to directly display Python plots on markdown document. So using a crude method of first
#saving the plot as image and then displaying that image (outside this chunk)
fig.savefig('credHist.jpg')
Here we explore approaches to fit a logistic regression model using two methods, stepwise and cross validation. First we use two stepwise functions step().
First we split the data into test and train (70:30):
set.seed(252)
split <- createDataPartition(y = cdata$goodBad, p = 0.7, list = F)
train <- cdata[split,]
test <- cdata[-split,]
startMdl <- glm(goodBad ~., train, family = binomial)
stepMdl <- step(startMdl, trace = FALSE, steps = 5000, k= log(nrow(train)))
#display summary
summary(stepMdl)
##
## Call:
## glm(formula = goodBad ~ chkngAcctStatus + durationMonths + foriegnWorker,
## family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3060 -1.0308 0.4640 0.8507 1.6178
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.653301 0.219312 2.979 0.00289 **
## chkngAcctStatusA12 0.638683 0.213375 2.993 0.00276 **
## chkngAcctStatusA13 0.947266 0.389701 2.431 0.01507 *
## chkngAcctStatusA14 2.241761 0.251144 8.926 < 2e-16 ***
## durationMonths -0.034312 0.007512 -4.568 4.93e-06 ***
## foriegnWorkerA202 1.695924 0.768367 2.207 0.02730 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 855.21 on 699 degrees of freedom
## Residual deviance: 723.46 on 694 degrees of freedom
## AIC: 735.46
##
## Number of Fisher Scoring iterations: 5
# variable importance
varImp(stepMdl, scale = FALSE)
## Overall
## chkngAcctStatusA12 2.993250
## chkngAcctStatusA13 2.430750
## chkngAcctStatusA14 8.926185
## durationMonths 4.567550
## foriegnWorkerA202 2.207180
#Plot the ROCR and calculate the AUC
stepPr = predict(stepMdl, newdata = test, type = "response")
stepPred = prediction(stepPr, test$goodBad)
stepPerf = performance(stepPred, "tpr", "fpr")
plot(stepPerf, colorize =TRUE, main = "ROC Curve", col = 2, lwd = 2, print.cutoffs.at=seq(0,1,by=0.1), text.adj=c(-0.2,1.7))
abline( a =0, b = 1, lwd = 2, lty = 2, col = "gray")
round(auc(test$goodBad, stepPr), digits = 2)
## [1] 0.73
set.seed(123)
ctrl <- trainControl(method = "repeatedcv",repeats = 3,classProbs = TRUE,summaryFunction = twoClassSummary)
cvFit <- train(goodBad ~ .,data = train, method = "glm", family = binomial, tuneLength = 5, trControl = ctrl,
metric = "ROC")
#display summary
summary(cvFit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6594 -0.6999 0.3306 0.6836 1.9887
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.355e-01 1.460e+00 0.572 0.567223
## chkngAcctStatusA12 5.901e-01 2.594e-01 2.275 0.022928 *
## chkngAcctStatusA13 5.786e-01 4.441e-01 1.303 0.192644
## chkngAcctStatusA14 2.066e+00 3.010e-01 6.863 6.73e-12 ***
## durationMonths -3.078e-02 1.175e-02 -2.620 0.008783 **
## creditHistoryA31 -6.416e-01 6.503e-01 -0.987 0.323853
## creditHistoryA32 1.312e-01 5.020e-01 0.261 0.793768
## creditHistoryA33 3.509e-01 5.509e-01 0.637 0.524220
## creditHistoryA34 9.396e-01 5.158e-01 1.822 0.068522 .
## loanPurposeA41 1.661e+00 4.299e-01 3.863 0.000112 ***
## loanPurposeA410 1.718e+00 8.719e-01 1.971 0.048742 *
## loanPurposeA42 1.156e+00 3.282e-01 3.524 0.000426 ***
## loanPurposeA43 1.043e+00 3.045e-01 3.426 0.000613 ***
## loanPurposeA44 4.183e-01 8.107e-01 0.516 0.605856
## loanPurposeA45 7.249e-02 6.094e-01 0.119 0.905313
## loanPurposeA46 2.033e-01 4.812e-01 0.422 0.672691
## loanPurposeA48 1.640e+00 1.351e+00 1.214 0.224846
## loanPurposeA49 9.085e-01 4.050e-01 2.243 0.024876 *
## creditAmount -9.327e-05 5.346e-05 -1.745 0.081058 .
## savingsTotalA62 1.691e-01 3.386e-01 0.500 0.617410
## savingsTotalA63 1.655e-01 4.932e-01 0.336 0.737185
## savingsTotalA64 1.343e+00 6.874e-01 1.955 0.050639 .
## savingsTotalA65 8.565e-01 3.206e-01 2.671 0.007557 **
## crrntEmplmtSinceA72 -8.132e-02 5.331e-01 -0.153 0.878760
## crrntEmplmtSinceA73 1.505e-01 5.101e-01 0.295 0.767936
## crrntEmplmtSinceA74 7.490e-01 5.510e-01 1.359 0.174042
## crrntEmplmtSinceA75 3.022e-01 5.171e-01 0.584 0.558941
## instllmtPct -3.519e-01 1.086e-01 -3.242 0.001189 **
## persnlStatusA92 2.674e-01 4.395e-01 0.608 0.542882
## persnlStatusA93 8.388e-01 4.279e-01 1.960 0.049972 *
## persnlStatusA94 3.253e-01 5.354e-01 0.608 0.543413
## othrDebtorGuarantersA102 -8.184e-01 4.792e-01 -1.708 0.087666 .
## othrDebtorGuarantersA103 8.766e-01 5.111e-01 1.715 0.086324 .
## crrntResidenceSince -3.937e-02 1.043e-01 -0.377 0.705926
## propertyTypeA122 3.162e-02 3.123e-01 0.101 0.919334
## propertyTypeA123 -3.441e-02 2.882e-01 -0.119 0.904961
## propertyTypeA124 -5.552e-01 5.238e-01 -1.060 0.289200
## age 2.589e-02 1.149e-02 2.252 0.024291 *
## otherInstllmtTypeA142 3.426e-01 5.055e-01 0.678 0.497837
## otherInstllmtTypeA143 9.257e-01 2.893e-01 3.199 0.001378 **
## housingTypeA152 1.497e-01 2.851e-01 0.525 0.599638
## housingTypeA153 1.706e-01 5.909e-01 0.289 0.772858
## existingCredits -5.076e-01 2.328e-01 -2.180 0.029238 *
## jobStatusA172 -1.440e+00 1.027e+00 -1.402 0.160964
## jobStatusA173 -1.648e+00 1.002e+00 -1.645 0.099956 .
## jobStatusA174 -1.818e+00 1.023e+00 -1.777 0.075493 .
## numDependents -1.121e-01 3.137e-01 -0.358 0.720710
## registeredPhoneA192 4.606e-02 2.415e-01 0.191 0.848757
## foriegnWorkerA202 1.722e+00 8.420e-01 2.045 0.040809 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 855.21 on 699 degrees of freedom
## Residual deviance: 612.27 on 651 degrees of freedom
## AIC: 710.27
##
## Number of Fisher Scoring iterations: 5
# Variable Importance
varImp(cvFit, scale = FALSE)
## glm variable importance
##
## only 20 most important variables shown (out of 48)
##
## Overall
## chkngAcctStatusA14 6.863
## loanPurposeA41 3.863
## loanPurposeA42 3.524
## loanPurposeA43 3.426
## instllmtPct 3.242
## otherInstllmtTypeA143 3.199
## savingsTotalA65 2.671
## durationMonths 2.620
## chkngAcctStatusA12 2.275
## age 2.252
## loanPurposeA49 2.243
## existingCredits 2.180
## foriegnWorkerA202 2.045
## loanPurposeA410 1.971
## persnlStatusA93 1.960
## savingsTotalA64 1.955
## creditHistoryA34 1.822
## jobStatusA174 1.777
## creditAmount 1.745
## othrDebtorGuarantersA103 1.715
#Plot the ROCR and calculate the AUC
cvPr = predict(cvFit, newdata = test, type = "prob")
cvPred = prediction(cvPr[,2], test$goodBad)
cvPerf = performance(cvPred, "tpr", "fpr")
plot(cvPerf, colorize =TRUE, main = "ROC Curve", col = 2, lwd = 2, print.cutoffs.at=seq(0,1,by=0.1), text.adj=c(-0.2,1.7))
abline( a =0, b = 1, lwd = 2, lty = 2, col = "gray")
round(auc(test$goodBad, cvPr[,2]), digits = 2)
## [1] 0.78
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
# Feature Importance
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
#Function for one-hot encoding (dummy variable) of categorical variables
def dummyEncode(df):
columnsToEncode = list(df.select_dtypes(include=['category','object']))
le = LabelEncoder()
for feature in columnsToEncode:
try:
df[feature] = le.fit_transform(df[feature])
except:
print('Error encoding '+feature)
return df
df = pd.read_table("german.data",header = None, sep=' ',names = ['chkngAcctStatus', 'durationMonths', 'creditHistory', 'loanPurpose', 'creditAmount', 'savingsTotal', 'crrntEmplmtSince', 'instllmtPct', 'persnlStatus', 'othrDebtorGuaranters', 'crrntResidenceSince', 'propertyType', 'age', 'otherInstllmtType', 'housingType', 'existingCredits','jobStatus', 'numDependents', 'registeredPhone', 'foriegnWorker', 'goodBad'])
df['goodBad']=df["goodBad"] - 1
dfPred = dummyEncode(df)
predictors = dfPred.drop('goodBad', 1)
targets = df['goodBad']
np.random.seed(123)
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)
model = GaussianNB()
model.fit(pred_train, tar_train)
# Train the model using Naive Bayesian classifier
predNB=model.predict(pred_test)
# Confustion Matrix
print "Confustion Matrix:"
print sklearn.metrics.confusion_matrix(tar_test,predNB)
print "\n"
print "Accuracy: " + str(sklearn.metrics.accuracy_score(tar_test, predNB))
## Confustion Matrix:
## [[224 49]
## [ 45 82]]
##
##
## Accuracy: 0.765
The deployment of the step model on Shiny can be viewed in the link below: