1. Introduction

This notebook examines credit default data. I build a predictive model of a client's payment status: by examining client characteristics, we hope to predict whether a client will have payment difficulties or will repay the credit on time.

2. Retrieving Data

There are seven different source files, but this time I only use application_train as the data source for the analysis; it is still small enough for my current tools to handle.

library(readr)        #reading data
library(tidyverse)    #data manipulation and plotting
## Warning: package 'ggplot2' was built under R version 3.5.3
library(randomForest) #random forest models
library(caret)        #dummyVars and model evaluation helpers
library(DataExplorer) #missing-data visualisation
library(xgboost)      #gradient boosting
library(ROCR)         #prediction/performance objects
library(pROC)         #ROC curves and AUC


#Read Data
data <- read.csv("application_train.csv")
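
Since readr is already loaded, read_csv() is typically much faster than base read.csv() on a file of this size (about 307k rows and 122 columns). A hedged, optional sketch follows; note that read_csv() keeps strings as character and treats empty strings as NA, so the character columns would need converting to factors and the missing-value counts below would differ slightly from the read.csv() version.

#Optional, faster alternative: read with readr, then restore factor columns
#data <- read_csv("application_train.csv") %>%
#  mutate_if(is.character, as.factor) %>%
#  as.data.frame()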

3. Glimpse of the Data

In addition to having a firm theory of the risk, examining the data helps us refine our hypotheses. We can use summary() to inspect the mean, median, max, min, and so on for each variable.

head(data) #show the first six records
str(data)  #check each variable's type
## 'data.frame':    307511 obs. of  122 variables:
##  $ SK_ID_CURR                  : int  100002 100003 100004 100006 100007 100008 100009 100010 100011 100012 ...
##  $ TARGET                      : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ NAME_CONTRACT_TYPE          : Factor w/ 2 levels "Cash loans","Revolving loans": 1 1 2 1 1 1 1 1 1 2 ...
##  $ CODE_GENDER                 : Factor w/ 3 levels "F","M","XNA": 2 1 2 1 2 2 1 2 1 2 ...
##  $ FLAG_OWN_CAR                : Factor w/ 2 levels "N","Y": 1 1 2 1 1 1 2 2 1 1 ...
##  $ FLAG_OWN_REALTY             : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 2 2 2 ...
##  $ CNT_CHILDREN                : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ AMT_INCOME_TOTAL            : num  202500 270000 67500 135000 121500 ...
##  $ AMT_CREDIT                  : num  406598 1293503 135000 312683 513000 ...
##  $ AMT_ANNUITY                 : num  24701 35699 6750 29687 21866 ...
##  $ AMT_GOODS_PRICE             : num  351000 1129500 135000 297000 513000 ...
##  $ NAME_TYPE_SUITE             : Factor w/ 8 levels "","Children",..: 8 3 8 8 8 7 8 8 2 8 ...
##  $ NAME_INCOME_TYPE            : Factor w/ 8 levels "Businessman",..: 8 5 8 8 8 5 2 5 4 8 ...
##  $ NAME_EDUCATION_TYPE         : Factor w/ 5 levels "Academic degree",..: 5 2 5 5 5 5 2 2 5 5 ...
##  $ NAME_FAMILY_STATUS          : Factor w/ 6 levels "Civil marriage",..: 4 2 4 1 4 2 2 2 2 4 ...
##  $ NAME_HOUSING_TYPE           : Factor w/ 6 levels "Co-op apartment",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ REGION_POPULATION_RELATIVE  : num  0.0188 0.00354 0.01003 0.00802 0.02866 ...
##  $ DAYS_BIRTH                  : int  -9461 -16765 -19046 -19005 -19932 -16941 -13778 -18850 -20099 -14469 ...
##  $ DAYS_EMPLOYED               : int  -637 -1188 -225 -3039 -3038 -1588 -3130 -449 365243 -2019 ...
##  $ DAYS_REGISTRATION           : num  -3648 -1186 -4260 -9833 -4311 ...
##  $ DAYS_ID_PUBLISH             : int  -2120 -291 -2531 -2437 -3458 -477 -619 -2379 -3514 -3992 ...
##  $ OWN_CAR_AGE                 : num  NA NA 26 NA NA NA 17 8 NA NA ...
##  $ FLAG_MOBIL                  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FLAG_EMP_PHONE              : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ FLAG_WORK_PHONE             : int  0 0 1 0 0 1 0 1 0 0 ...
##  $ FLAG_CONT_MOBILE            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FLAG_PHONE                  : int  1 1 1 0 0 1 1 0 0 0 ...
##  $ FLAG_EMAIL                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OCCUPATION_TYPE             : Factor w/ 19 levels "","Accountants",..: 10 5 10 10 5 10 2 12 1 10 ...
##  $ CNT_FAM_MEMBERS             : num  1 2 1 2 1 2 3 2 2 1 ...
##  $ REGION_RATING_CLIENT        : int  2 1 2 2 2 2 2 3 2 2 ...
##  $ REGION_RATING_CLIENT_W_CITY : int  2 1 2 2 2 2 2 3 2 2 ...
##  $ WEEKDAY_APPR_PROCESS_START  : Factor w/ 7 levels "FRIDAY","MONDAY",..: 7 2 2 7 5 7 4 2 7 5 ...
##  $ HOUR_APPR_PROCESS_START     : int  10 11 9 17 11 16 16 16 14 8 ...
##  $ REG_REGION_NOT_LIVE_REGION  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ REG_REGION_NOT_WORK_REGION  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LIVE_REGION_NOT_WORK_REGION : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ REG_CITY_NOT_LIVE_CITY      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ REG_CITY_NOT_WORK_CITY      : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ LIVE_CITY_NOT_WORK_CITY     : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ ORGANIZATION_TYPE           : Factor w/ 58 levels "Advertising",..: 6 40 12 6 38 34 6 34 58 10 ...
##  $ EXT_SOURCE_1                : num  0.083 0.311 NA NA NA ...
##  $ EXT_SOURCE_2                : num  0.263 0.622 0.556 0.65 0.323 ...
##  $ EXT_SOURCE_3                : num  0.139 NA 0.73 NA NA ...
##  $ APARTMENTS_AVG              : num  0.0247 0.0959 NA NA NA NA NA NA NA NA ...
##  $ BASEMENTAREA_AVG            : num  0.0369 0.0529 NA NA NA NA NA NA NA NA ...
##  $ YEARS_BEGINEXPLUATATION_AVG : num  0.972 0.985 NA NA NA ...
##  $ YEARS_BUILD_AVG             : num  0.619 0.796 NA NA NA ...
##  $ COMMONAREA_AVG              : num  0.0143 0.0605 NA NA NA NA NA NA NA NA ...
##  $ ELEVATORS_AVG               : num  0 0.08 NA NA NA NA NA NA NA NA ...
##  $ ENTRANCES_AVG               : num  0.069 0.0345 NA NA NA NA NA NA NA NA ...
##  $ FLOORSMAX_AVG               : num  0.0833 0.2917 NA NA NA ...
##  $ FLOORSMIN_AVG               : num  0.125 0.333 NA NA NA ...
##  $ LANDAREA_AVG                : num  0.0369 0.013 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAPARTMENTS_AVG        : num  0.0202 0.0773 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAREA_AVG              : num  0.019 0.0549 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAPARTMENTS_AVG     : num  0 0.0039 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAREA_AVG           : num  0 0.0098 NA NA NA NA NA NA NA NA ...
##  $ APARTMENTS_MODE             : num  0.0252 0.0924 NA NA NA NA NA NA NA NA ...
##  $ BASEMENTAREA_MODE           : num  0.0383 0.0538 NA NA NA NA NA NA NA NA ...
##  $ YEARS_BEGINEXPLUATATION_MODE: num  0.972 0.985 NA NA NA ...
##  $ YEARS_BUILD_MODE            : num  0.634 0.804 NA NA NA ...
##  $ COMMONAREA_MODE             : num  0.0144 0.0497 NA NA NA NA NA NA NA NA ...
##  $ ELEVATORS_MODE              : num  0 0.0806 NA NA NA NA NA NA NA NA ...
##  $ ENTRANCES_MODE              : num  0.069 0.0345 NA NA NA NA NA NA NA NA ...
##  $ FLOORSMAX_MODE              : num  0.0833 0.2917 NA NA NA ...
##  $ FLOORSMIN_MODE              : num  0.125 0.333 NA NA NA ...
##  $ LANDAREA_MODE               : num  0.0377 0.0128 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAPARTMENTS_MODE       : num  0.022 0.079 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAREA_MODE             : num  0.0198 0.0554 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAPARTMENTS_MODE    : num  0 0 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAREA_MODE          : num  0 0 NA NA NA NA NA NA NA NA ...
##  $ APARTMENTS_MEDI             : num  0.025 0.0968 NA NA NA NA NA NA NA NA ...
##  $ BASEMENTAREA_MEDI           : num  0.0369 0.0529 NA NA NA NA NA NA NA NA ...
##  $ YEARS_BEGINEXPLUATATION_MEDI: num  0.972 0.985 NA NA NA ...
##  $ YEARS_BUILD_MEDI            : num  0.624 0.799 NA NA NA ...
##  $ COMMONAREA_MEDI             : num  0.0144 0.0608 NA NA NA NA NA NA NA NA ...
##  $ ELEVATORS_MEDI              : num  0 0.08 NA NA NA NA NA NA NA NA ...
##  $ ENTRANCES_MEDI              : num  0.069 0.0345 NA NA NA NA NA NA NA NA ...
##  $ FLOORSMAX_MEDI              : num  0.0833 0.2917 NA NA NA ...
##  $ FLOORSMIN_MEDI              : num  0.125 0.333 NA NA NA ...
##  $ LANDAREA_MEDI               : num  0.0375 0.0132 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAPARTMENTS_MEDI       : num  0.0205 0.0787 NA NA NA NA NA NA NA NA ...
##  $ LIVINGAREA_MEDI             : num  0.0193 0.0558 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAPARTMENTS_MEDI    : num  0 0.0039 NA NA NA NA NA NA NA NA ...
##  $ NONLIVINGAREA_MEDI          : num  0 0.01 NA NA NA NA NA NA NA NA ...
##  $ FONDKAPREMONT_MODE          : Factor w/ 5 levels "","not specified",..: 4 4 1 1 1 1 1 1 1 1 ...
##  $ HOUSETYPE_MODE              : Factor w/ 4 levels "","block of flats",..: 2 2 1 1 1 1 1 1 1 1 ...
##  $ TOTALAREA_MODE              : num  0.0149 0.0714 NA NA NA NA NA NA NA NA ...
##  $ WALLSMATERIAL_MODE          : Factor w/ 8 levels "","Block","Mixed",..: 7 2 1 1 1 1 1 1 1 1 ...
##  $ EMERGENCYSTATE_MODE         : Factor w/ 3 levels "","No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
##  $ OBS_30_CNT_SOCIAL_CIRCLE    : num  2 1 0 2 0 0 1 2 1 2 ...
##  $ DEF_30_CNT_SOCIAL_CIRCLE    : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ OBS_60_CNT_SOCIAL_CIRCLE    : num  2 1 0 2 0 0 1 2 1 2 ...
##  $ DEF_60_CNT_SOCIAL_CIRCLE    : num  2 0 0 0 0 0 0 0 0 0 ...
##  $ DAYS_LAST_PHONE_CHANGE      : num  -1134 -828 -815 -617 -1106 ...
##  $ FLAG_DOCUMENT_2             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG_DOCUMENT_3             : int  1 1 0 1 0 1 0 1 1 0 ...
##  $ FLAG_DOCUMENT_4             : int  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
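
The prose above also mentions summary(); on all 122 columns it prints a very long table, so a sketch restricted to a handful of columns (an arbitrary selection, just for illustration) is easier to read:

#min, max, quartiles, mean and NA counts for a few key columns
summary(data[, c("TARGET", "AMT_INCOME_TOTAL", "AMT_CREDIT",
                 "AMT_ANNUITY", "DAYS_BIRTH", "DAYS_EMPLOYED")])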

4. Checking Missing Data

Missing data can introduce noise, especially when it changes the distribution of the data. Below I count how many observations are missing in each variable; the handy DataExplorer package can plot the same information (see the commented-out plot_missing() call after the table).

a1 <- as.data.frame(colSums(is.na(data)))                        #NA count per column
b1 <- round(as.data.frame(colMeans(is.na(data))*100), digits=2)  #NA percentage per column
miss <- data.frame(cbind(setNames(a1,c("count")),
                         setNames(b1,c("percentage"))))
remove(a1,b1)
print(miss)
##                               count percentage
## SK_ID_CURR                        0       0.00
## TARGET                            0       0.00
## NAME_CONTRACT_TYPE                0       0.00
## CODE_GENDER                       0       0.00
## FLAG_OWN_CAR                      0       0.00
## FLAG_OWN_REALTY                   0       0.00
## CNT_CHILDREN                      0       0.00
## AMT_INCOME_TOTAL                  0       0.00
## AMT_CREDIT                        0       0.00
## AMT_ANNUITY                      12       0.00
## AMT_GOODS_PRICE                 278       0.09
## NAME_TYPE_SUITE                   0       0.00
## NAME_INCOME_TYPE                  0       0.00
## NAME_EDUCATION_TYPE               0       0.00
## NAME_FAMILY_STATUS                0       0.00
## NAME_HOUSING_TYPE                 0       0.00
## REGION_POPULATION_RELATIVE        0       0.00
## DAYS_BIRTH                        0       0.00
## DAYS_EMPLOYED                     0       0.00
## DAYS_REGISTRATION                 0       0.00
## DAYS_ID_PUBLISH                   0       0.00
## OWN_CAR_AGE                  202929      65.99
## FLAG_MOBIL                        0       0.00
## FLAG_EMP_PHONE                    0       0.00
## FLAG_WORK_PHONE                   0       0.00
## FLAG_CONT_MOBILE                  0       0.00
## FLAG_PHONE                        0       0.00
## FLAG_EMAIL                        0       0.00
## OCCUPATION_TYPE                   0       0.00
## CNT_FAM_MEMBERS                   2       0.00
## REGION_RATING_CLIENT              0       0.00
## REGION_RATING_CLIENT_W_CITY       0       0.00
## WEEKDAY_APPR_PROCESS_START        0       0.00
## HOUR_APPR_PROCESS_START           0       0.00
## REG_REGION_NOT_LIVE_REGION        0       0.00
## REG_REGION_NOT_WORK_REGION        0       0.00
## LIVE_REGION_NOT_WORK_REGION       0       0.00
## REG_CITY_NOT_LIVE_CITY            0       0.00
## REG_CITY_NOT_WORK_CITY            0       0.00
## LIVE_CITY_NOT_WORK_CITY           0       0.00
## ORGANIZATION_TYPE                 0       0.00
## EXT_SOURCE_1                 173378      56.38
## EXT_SOURCE_2                    660       0.21
## EXT_SOURCE_3                  60965      19.83
## APARTMENTS_AVG               156061      50.75
## BASEMENTAREA_AVG             179943      58.52
## YEARS_BEGINEXPLUATATION_AVG  150007      48.78
## YEARS_BUILD_AVG              204488      66.50
## COMMONAREA_AVG               214865      69.87
## ELEVATORS_AVG                163891      53.30
## ENTRANCES_AVG                154828      50.35
## FLOORSMAX_AVG                153020      49.76
## FLOORSMIN_AVG                208642      67.85
## LANDAREA_AVG                 182590      59.38
## LIVINGAPARTMENTS_AVG         210199      68.35
## LIVINGAREA_AVG               154350      50.19
## NONLIVINGAPARTMENTS_AVG      213514      69.43
## NONLIVINGAREA_AVG            169682      55.18
## APARTMENTS_MODE              156061      50.75
## BASEMENTAREA_MODE            179943      58.52
## YEARS_BEGINEXPLUATATION_MODE 150007      48.78
## YEARS_BUILD_MODE             204488      66.50
## COMMONAREA_MODE              214865      69.87
## ELEVATORS_MODE               163891      53.30
## ENTRANCES_MODE               154828      50.35
## FLOORSMAX_MODE               153020      49.76
## FLOORSMIN_MODE               208642      67.85
## LANDAREA_MODE                182590      59.38
## LIVINGAPARTMENTS_MODE        210199      68.35
## LIVINGAREA_MODE              154350      50.19
## NONLIVINGAPARTMENTS_MODE     213514      69.43
## NONLIVINGAREA_MODE           169682      55.18
## APARTMENTS_MEDI              156061      50.75
## BASEMENTAREA_MEDI            179943      58.52
## YEARS_BEGINEXPLUATATION_MEDI 150007      48.78
## YEARS_BUILD_MEDI             204488      66.50
## COMMONAREA_MEDI              214865      69.87
## ELEVATORS_MEDI               163891      53.30
## ENTRANCES_MEDI               154828      50.35
## FLOORSMAX_MEDI               153020      49.76
## FLOORSMIN_MEDI               208642      67.85
## LANDAREA_MEDI                182590      59.38
## LIVINGAPARTMENTS_MEDI        210199      68.35
## LIVINGAREA_MEDI              154350      50.19
## NONLIVINGAPARTMENTS_MEDI     213514      69.43
## NONLIVINGAREA_MEDI           169682      55.18
## FONDKAPREMONT_MODE                0       0.00
## HOUSETYPE_MODE                    0       0.00
## TOTALAREA_MODE               148431      48.27
## WALLSMATERIAL_MODE                0       0.00
## EMERGENCYSTATE_MODE               0       0.00
## OBS_30_CNT_SOCIAL_CIRCLE       1021       0.33
## DEF_30_CNT_SOCIAL_CIRCLE       1021       0.33
## OBS_60_CNT_SOCIAL_CIRCLE       1021       0.33
## DEF_60_CNT_SOCIAL_CIRCLE       1021       0.33
## DAYS_LAST_PHONE_CHANGE            1       0.00
## FLAG_DOCUMENT_2                   0       0.00
## FLAG_DOCUMENT_3                   0       0.00
## FLAG_DOCUMENT_4                   0       0.00
## FLAG_DOCUMENT_5                   0       0.00
## FLAG_DOCUMENT_6                   0       0.00
## FLAG_DOCUMENT_7                   0       0.00
## FLAG_DOCUMENT_8                   0       0.00
## FLAG_DOCUMENT_9                   0       0.00
## FLAG_DOCUMENT_10                  0       0.00
## FLAG_DOCUMENT_11                  0       0.00
## FLAG_DOCUMENT_12                  0       0.00
## FLAG_DOCUMENT_13                  0       0.00
## FLAG_DOCUMENT_14                  0       0.00
## FLAG_DOCUMENT_15                  0       0.00
## FLAG_DOCUMENT_16                  0       0.00
## FLAG_DOCUMENT_17                  0       0.00
## FLAG_DOCUMENT_18                  0       0.00
## FLAG_DOCUMENT_19                  0       0.00
## FLAG_DOCUMENT_20                  0       0.00
## FLAG_DOCUMENT_21                  0       0.00
## AMT_REQ_CREDIT_BUREAU_HOUR    41519      13.50
## AMT_REQ_CREDIT_BUREAU_DAY     41519      13.50
## AMT_REQ_CREDIT_BUREAU_WEEK    41519      13.50
## AMT_REQ_CREDIT_BUREAU_MON     41519      13.50
## AMT_REQ_CREDIT_BUREAU_QRT     41519      13.50
## AMT_REQ_CREDIT_BUREAU_YEAR    41519      13.50
#plot_missing(data, title="missing values") #DataExplorer plot of the same table
#I ran the plot separately and resized it to make it easier to read
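
With the miss table available, a simple follow-up sketch is to list the variables whose missingness exceeds some cutoff (40% here is an arbitrary choice); these are candidates for dropping or for very careful imputation:

#variables with more than 40% missing values, worst first
high_miss <- miss[miss$percentage > 40, , drop = FALSE]
high_miss[order(-high_miss$percentage), ]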

5. Recoding and Anomalous Data

Some categorical variables need to be recoded into dummy (0/1) indicator columns before they can be used in the correlation and modelling steps below. I also check the data for anomalies: DAYS_BIRTH holds negative values, which looks like an anomaly at first, but the client's age in years can be recovered from it by dividing by -365.

#Recoding Data
dmy1 <- dummyVars("~NAME_INCOME_TYPE", data, fullRank = T)
trsf1 <- data.frame(predict(dmy1, newdata = data))

dmy2 <- dummyVars("~OCCUPATION_TYPE", data, fullRank = T)
trsf2 <- data.frame(predict(dmy2, newdata = data))

dmy3 <- dummyVars("~NAME_EDUCATION_TYPE", data, fullRank = T)
trsf3 <- data.frame(predict(dmy3, newdata = data))

data <- cbind(data, trsf1, trsf2, trsf3)

remove(dmy1, dmy2, dmy3, trsf1, trsf2, trsf3) #clean up helper objects

head(data)
#Correct the anomaly in DAYS_BIRTH: convert negative day counts to age in years
data$DAYS_BIRTH <- data$DAYS_BIRTH/-365
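
As a quick sanity check (a sketch, not part of the original pipeline), DAYS_BIRTH should now hold ages in years, and its summary should show plausible adult ages:

#DAYS_BIRTH now holds age in years; check that the range looks plausible
summary(data$DAYS_BIRTH)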

6. Data Exploration

This time I use Tableau for the exploratory data analysis, since my current tool cannot handle running it on this data set; it is too computationally demanding.

Link_Tableau

7. Correlation

Correlation checks the direction and strength of the relationship between pairs of variables (Spearman's rank correlation is used below). It is a good starting point before building the model: it tells us the direction of each relationship and gives a first signal of whether our hypotheses are plausible.

library(reshape2)
cormat <- as.data.frame(cor(data[,c("TARGET","AMT_ANNUITY","AMT_CREDIT","AMT_GOODS_PRICE","EXT_SOURCE_1",
  "EXT_SOURCE_2","EXT_SOURCE_3","NAME_EDUCATION_TYPE.Higher.education",
  "NAME_EDUCATION_TYPE.Incomplete.higher","DAYS_BIRTH",
  "NAME_EDUCATION_TYPE.Lower.secondary",
  "NAME_INCOME_TYPE.Maternity.leave",
  "NAME_INCOME_TYPE.Pensioner",
  "NAME_INCOME_TYPE.Student","NAME_INCOME_TYPE.Unemployed",
  "NAME_INCOME_TYPE.Working","OCCUPATION_TYPE.Accountants",
  "OCCUPATION_TYPE.Cleaning.staff",
  "OCCUPATION_TYPE.High.skill.tech.staff",
  "OCCUPATION_TYPE.IT.staff","OCCUPATION_TYPE.Laborers",
  "OCCUPATION_TYPE.Low.skill.Laborers","OCCUPATION_TYPE.Managers",
  "OCCUPATION_TYPE.Realty.agents","OCCUPATION_TYPE.Sales.staff",
  "OCCUPATION_TYPE.Secretaries","OCCUPATION_TYPE.Security.staff",
  "OCCUPATION_TYPE.Waiters.barmen.staff")], method="spearman", 
  use="complete"))

#Ideally we would check the relationships among all variables, but again that is too computationally expensive here

cormat
#Plot heatmap of correlations between selected variables
#rename NAME_EDUCATION_TYPE.Higher.education to something shorter (by name, not by position)
colnames(data)[colnames(data) == "NAME_EDUCATION_TYPE.Higher.education"] <- "HIGHER_ED"

cor <- cor(data[,c("TARGET","HIGHER_ED","AMT_GOODS_PRICE",
  "DAYS_BIRTH","EXT_SOURCE_1","EXT_SOURCE_2","EXT_SOURCE_3")], use="complete", method="spearman")

melted_cormat <- melt(cor)
head(melted_cormat)
heatmap <- ggplot(data=melted_cormat,aes(x=Var1, y=Var2, fill=value)) + geom_tile() + 
  xlab("") + ylab("")
#print(heatmap)
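
The heatmap is only printed separately; as a small sketch, a diverging fill scale centred at zero makes positive and negative correlations easier to tell apart:

#print the heatmap with a diverging palette centred at zero
#print(heatmap +
#        scale_fill_gradient2(low="blue", mid="white", high="red",
#                             midpoint=0, limits=c(-1,1)))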

8. Logistic Regression Model

There are many ways to build a predictive model. Here I use logistic regression, where the dependent variable is binary: 0 = repaid on time and 1 = payment difficulties. Ideally we would fit several candidate models and keep the one with the lowest AIC as the base for model training. Comparing the two fits below, model2 has the lower AIC.

data$TARGET <- as.factor(data$TARGET)
data$TARGET <- relevel(data$TARGET, ref = "0")

model1 <- glm(TARGET~AMT_CREDIT+AMT_GOODS_PRICE+NAME_EDUCATION_TYPE+
               DAYS_BIRTH+EXT_SOURCE_1+EXT_SOURCE_2+EXT_SOURCE_3, 
              family =binomial(link="logit"),data=data, 
              na.action = na.omit)

summary(model1$aic)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50752   50752   50752   50752   50752   50752
model2 <- glm(TARGET~AMT_CREDIT+AMT_GOODS_PRICE+DAYS_BIRTH+
               EXT_SOURCE_1+EXT_SOURCE_2+EXT_SOURCE_3+
               NAME_EDUCATION_TYPE+
               NAME_INCOME_TYPE+
               OCCUPATION_TYPE, 
               family =binomial(link="logit"),
               data=data,na.action = na.omit)

summary(model2$aic)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50709   50709   50709   50709   50709   50709
#model 2 is better (lower AIC)
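
Wrapping a single AIC value in summary() is a roundabout way to print it; base R's AIC() compares both fits in one small table, which is a cleaner way to make the same point (a sketch):

#AIC for both models side by side; the lower value is preferred
AIC(model1, model2)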

9. Performance of Logistic Regression

We train on the model specification with the lowest AIC. Its AUC on the held-out test set is about 0.74, which is reasonably good. Next we check whether other algorithms can improve on it.

#partition and create training data, testing data
datafix <- subset(data, select = c(
             "TARGET",
             "AMT_CREDIT",
             "AMT_GOODS_PRICE",
             "DAYS_BIRTH",
             "EXT_SOURCE_1",
             "EXT_SOURCE_2",
             "EXT_SOURCE_3",
             "NAME_EDUCATION_TYPE",
             "NAME_INCOME_TYPE",
             "OCCUPATION_TYPE"))
set.seed(100)
train <- sample(nrow(datafix), 0.7*nrow(datafix)) #indices for a 70/30 train/test split
TrainSet <- datafix[train,]
TestSet <- datafix[-train,]
modelRL <- glm(TARGET~., 
               family =binomial(link="logit"),
               data=TrainSet,na.action = na.omit)

RL_p <- predict(modelRL, newdata = TestSet, type="response")

auc(TestSet$TARGET, RL_p)
## Area under the curve: 0.7428
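
AUC only summarises how well the model ranks clients; to see actual classification errors, one sketch is to pick a probability cutoff and build a confusion matrix with caret (already loaded). The 0.5 cutoff is arbitrary, and with only roughly 8% of clients in class 1 a lower cutoff is usually more informative:

#confusion matrix at an (arbitrary) 0.5 cutoff
#predict() returns NA where predictors are missing, so keep the complete predictions
ok <- !is.na(RL_p)
RL_class <- factor(ifelse(RL_p[ok] >= 0.5, "1", "0"), levels = c("0", "1"))
confusionMatrix(RL_class, TestSet$TARGET[ok], positive = "1")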

10. Improving the Model

First, I try a random forest; besides a prediction, it shows which variables are most relevant. Its validation AUC turns out not to beat logistic regression. Unfortunately, my current machine cannot run the random forest with ntree >= 500 (the default setting); increasing ntree might improve the model's performance somewhat. Second, XGBoost also shows a slightly lower validation AUC than logistic regression.

set.seed(100)
rf <- randomForest(TARGET~., data=TrainSet, strata = TARGET,
            importance=TRUE, na.action = na.omit,
            ntree=300
            )

rf_p <- predict(rf,TestSet,type="prob")[,2]
rf_pr <- prediction(rf_p, TestSet$TARGET)

rf
## 
## Call:
##  randomForest(formula = TARGET ~ ., data = TrainSet, strata = TARGET,      importance = TRUE, ntree = 300, na.action = na.omit) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 7.34%
## Confusion matrix:
##       0   1 class.error
## 0 70952 175 0.002460388
## 1  5457 142 0.974638328
print(importance(rf))
##                             0          1 MeanDecreaseAccuracy
## AMT_CREDIT          59.810548 -45.522654             58.58072
## AMT_GOODS_PRICE     59.562378 -43.222295             58.10881
## DAYS_BIRTH          79.497820 -34.919023             73.06177
## EXT_SOURCE_1        64.049337  30.607921             73.16163
## EXT_SOURCE_2         3.872229  39.974773             16.92502
## EXT_SOURCE_3         7.983979  67.765172             31.14029
## NAME_EDUCATION_TYPE 17.985347   4.764355             18.90114
## NAME_INCOME_TYPE    25.498428  -7.218595             23.37970
## OCCUPATION_TYPE     28.556848  -1.751988             26.88850
##                     MeanDecreaseGini
## AMT_CREDIT                 1231.5313
## AMT_GOODS_PRICE             940.8110
## DAYS_BIRTH                 1579.6090
## EXT_SOURCE_1               1821.2064
## EXT_SOURCE_2               1763.9157
## EXT_SOURCE_3               1712.6514
## NAME_EDUCATION_TYPE         190.9026
## NAME_INCOME_TYPE            261.7019
## OCCUPATION_TYPE             862.7551
varImpPlot(rf, main="Variable importance", type=2)
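
The OOB confusion matrix above shows the forest almost never predicts class 1 (class error around 0.97), a typical symptom of the strong class imbalance. A hedged sketch of one way to address it: pair strata with sampsize so that every tree is grown on a balanced sample (on its own, strata = TARGET changes nothing). The per-class sample size is set to the minority-class count purely for illustration, not a tuned value, and the fit is restricted to complete cases:

#balanced stratified sampling per tree: an illustrative sketch, not a tuned model
cc <- na.omit(TrainSet)                 #complete cases only
n_pos <- sum(cc$TARGET == "1")          #size of the minority class
set.seed(100)
rf_bal <- randomForest(TARGET~., data=cc, strata=cc$TARGET,
                       sampsize=c("0"=n_pos, "1"=n_pos),
                       importance=TRUE, ntree=300)
test_cc <- na.omit(TestSet)
rf_bal_p <- predict(rf_bal, test_cc, type="prob")[,2]
auc(test_cc$TARGET, rf_bal_p)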

#build numeric matrices for xgboost (factor columns become integer codes)
train_mat <- data.matrix(TrainSet)
test_mat <- data.matrix(TestSet)
#keep the label as 0/1; data.matrix() would encode the factor TARGET as 1/2
y <- as.numeric(as.character(TrainSet$TARGET))
#no objective is specified, so xgboost falls back to its default regression objective;
#for a 0/1 target, objective = "binary:logistic" would be the usual choice
xgb <- xgboost(data=train_mat[,-1], label = y, booster = 'gbtree',
               verbose = 0, 
               nrounds = 25,
               eta = 0.1,
               max_depth = 5,
               subsample = 0.5
               )

xgb_p <- predict(xgb, test_mat[,-1])

xgb$feature_names
## [1] "AMT_CREDIT"          "AMT_GOODS_PRICE"     "DAYS_BIRTH"         
## [4] "EXT_SOURCE_1"        "EXT_SOURCE_2"        "EXT_SOURCE_3"       
## [7] "NAME_EDUCATION_TYPE" "NAME_INCOME_TYPE"    "OCCUPATION_TYPE"
plot(roc(TestSet$TARGET, xgb_p, direction="<"), col="red", lwd=2, main="ROC Plot")
lines(roc(TestSet$TARGET, rf_p, direction="<"), col="purple", lwd=2)
lines(roc(TestSet$TARGET, RL_p, direction="<"), col="blue", lwd=2)
legend(0.0, 0.4, legend=c('XGB','RF','LR'), lty=1, lwd=2.5, col=c('red','purple','blue'))

paste("Logistic Regression:", round(auc(TestSet$TARGET, RL_p),5))
## [1] "Logistic Regression: 0.74282"
paste("Random Forest:", round(auc(TestSet$TARGET, rf_p),5))
## [1] "Random Forest: 0.71204"
paste("XGBoost:", round(auc(TestSet$TARGET, xgb_p),5))
## [1] "XGBoost: 0.73241"