Logistic_Regression

##Step 1 - setwd

setwd("C:/Users/rpandey/Desktop/Classes")

##Step 2 - Load Data

logdata <- read.csv("Default_On_Payment.csv")

##Step 3 - Explore Data- where are the missing values? why some are NAs and others no values?

dim(logdata)

## [1] 40121    22

names(logdata)

##  [1] "Customer_ID"                 "Status_Checking_Acc"        
##  [3] "Duration_in_Months"          "Credit_History"             
##  [5] "Purposre_Credit_Taken"       "Credit_Amount"              
##  [7] "Savings_Acc"                 "Years_At_Present_Employment"
##  [9] "Inst_Rt_Income"              "Marital_Status_Gender"      
## [11] "Other_Debtors_Guarantors"    "Current_Address_Yrs"        
## [13] "Property"                    "Age"                        
## [15] "Other_Inst_Plans"            "Housing"                    
## [17] "Num_CC"                      "Job"                        
## [19] "Dependents"                  "Telephone"                  
## [21] "Foreign_Worker"              "Default_On_Payment"

head(logdata)

##   Customer_ID Status_Checking_Acc Duration_in_Months Credit_History
## 1      100015                 A14                 27            A32
## 2      100031                 A11                 12            A34
## 3      100046                 A12                 13            A34
## 4      100103                 A14                 24            A32
## 5      100104                 A11                 24            A32
## 6      100128                 A11                 10            A32
##   Purposre_Credit_Taken Credit_Amount Savings_Acc
## 1                   A40          2570         A61
## 2                   A43           385         A61
## 3                   A43           882         A61
## 4                   A40          1393         A61
## 5                   A40          3123         A61
## 6                   A43          2315         A61
##   Years_At_Present_Employment Inst_Rt_Income Marital_Status_Gender
## 1                         A73              3                   A92
## 2                         A74              4                   A92
## 3                         A72              4                   A93
## 4                         A73              2                   A93
## 5                         A72              4                   A92
## 6                         A75              3                   A93
##   Other_Debtors_Guarantors Current_Address_Yrs Property Age
## 1                     A101                   3     A121  21
## 2                     A101                   3     A121  58
## 3                     A103                   4     A121  23
## 4                     A103                   2     A121  31
## 5                     A101                   1     A122  27
## 6                     A101                   4     A121  52
##   Other_Inst_Plans Housing Num_CC  Job Dependents Telephone Foreign_Worker
## 1             A143    A151      1 A173          1      A191           A201
## 2             A143    A152      4 A172          1      A192           A201
## 3             A143    A152      2 A173          1      A191           A201
## 4             A143    A152      1 A173          1      A192           A201
## 5             A143    A152      1 A173          1      A191           A201
## 6             A143    A152      1 A172          1      A191           A201
##   Default_On_Payment
## 1                  1
## 2                  0
## 3                  0
## 4                  0
## 5                  1
## 6                  0

tail(logdata)

##       Customer_ID Status_Checking_Acc Duration_in_Months Credit_History
## 40116      986907                 A11                 24            A32
## 40117      986936                 A12                 24            A34
## 40118      986939                 A12                 60            A32
## 40119      986944                 A12                 36            A30
## 40120      986974                 A11                 20            A34
## 40121      987000                 A14                 36            A34
##       Purposre_Credit_Taken Credit_Amount Savings_Acc
## 40116                   A40          1285         A65
## 40117                   A49          1935         A61
## 40118                   A40         14027         A61
## 40119                   A43          3804         A61
## 40120                   A40          2235         A61
## 40121                                                
##       Years_At_Present_Employment Inst_Rt_Income Marital_Status_Gender
## 40116                         A74              4                   A92
## 40117                         A75              4                   A91
## 40118                         A74              4                   A93
## 40119                         A73              4                   A92
## 40120                         A73              4                   A94
## 40121                                         NA                      
##       Other_Debtors_Guarantors Current_Address_Yrs Property Age
## 40116                     A101                   4     A124  32
## 40117                     A101                   4     A121  31
## 40118                     A101                   2     A124  27
## 40119                     A101                   1     A123  42
## 40120                     A103                   2     A122  33
## 40121                                           NA           NA
##       Other_Inst_Plans Housing Num_CC  Job Dependents Telephone
## 40116             A143    A151      1 A173          1      A191
## 40117             A143    A152      2 A173          1      A192
## 40118             A143    A152      1 A174          1      A192
## 40119             A143    A152      1 A173          1      A192
## 40120             A141    A151      2 A173          1      A191
## 40121                              NA              NA          
##       Foreign_Worker Default_On_Payment
## 40116           A201                  1
## 40117           A201                  1
## 40118           A201                  1
## 40119           A201                  1
## 40120           A202                  1
## 40121                                NA

str(logdata)

## 'data.frame':    40121 obs. of  22 variables:
##  $ Customer_ID                : int  100015 100031 100046 100103 100104 100128 100148 100164 100182 100230 ...
##  $ Status_Checking_Acc        : Factor w/ 5 levels "2","A11","A12",..: 5 2 3 5 2 2 2 5 3 5 ...
##  $ Duration_in_Months         : Factor w/ 34 levels "10","11","12",..: 14 3 4 12 12 1 2 12 8 3 ...
##  $ Credit_History             : Factor w/ 6 levels "46","A30","A31",..: 4 6 6 4 4 4 6 6 4 4 ...
##  $ Purposre_Credit_Taken      : Factor w/ 12 levels "","A143","A40",..: 3 7 7 3 3 7 3 7 7 3 ...
##  $ Credit_Amount              : Factor w/ 923 levels "","1007","10127",..: 419 602 877 153 511 373 615 353 875 150 ...
##  $ Savings_Acc                : Factor w/ 7 levels "","2","A61","A62",..: 3 3 3 3 3 3 3 4 3 5 ...
##  $ Years_At_Present_Employment: Factor w/ 7 levels "","A172","A71",..: 5 6 4 5 4 7 5 7 5 5 ...
##  $ Inst_Rt_Income             : int  3 4 4 2 4 3 1 4 4 2 ...
##  $ Marital_Status_Gender      : Factor w/ 6 levels "","A192","A91",..: 4 4 5 5 4 5 5 5 6 4 ...
##  $ Other_Debtors_Guarantors   : Factor w/ 5 levels "","A101","A102",..: 2 2 4 4 2 2 2 2 4 2 ...
##  $ Current_Address_Yrs        : int  3 3 4 2 1 4 2 4 2 2 ...
##  $ Property                   : Factor w/ 5 levels "","A121","A122",..: 2 2 2 2 3 2 2 3 2 3 ...
##  $ Age                        : int  21 58 23 31 27 52 40 52 25 26 ...
##  $ Other_Inst_Plans           : Factor w/ 4 levels "","A141","A142",..: 4 4 4 4 4 4 4 2 4 4 ...
##  $ Housing                    : Factor w/ 4 levels "","A151","A152",..: 2 3 3 3 3 3 3 3 3 3 ...
##  $ Num_CC                     : int  1 4 2 1 1 1 2 2 1 1 ...
##  $ Job                        : Factor w/ 5 levels "","A171","A172",..: 4 3 4 4 4 3 3 4 3 4 ...
##  $ Dependents                 : int  1 1 1 1 1 1 2 1 1 1 ...
##  $ Telephone                  : Factor w/ 3 levels "","A191","A192": 2 3 2 3 2 2 2 2 2 2 ...
##  $ Foreign_Worker             : Factor w/ 3 levels "","A201","A202": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Default_On_Payment         : int  1 0 0 0 1 0 0 0 0 1 ...

summary(as.factor(logdata$Default_On_Payment))

##     0     1  NA's 
## 28118 12001     2

summary_logdata = summary(logdata)
write.csv(summary_logdata,"summary_logdata.csv",row.names=F)

##Step 4 - Remove any rows with missing values and removed customer_id

logdata <- logdata[complete.cases(logdata),]

logdata <- logdata[,c(-1)]

# Step 5- Bivariate Analyis 

library(gmodels)

CrossTable(logdata$Status_Checking_Acc,logdata$Default_On_Payment)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40119 
## 
##  
##                             | logdata$Default_On_Payment 
## logdata$Status_Checking_Acc |         0 |         1 | Row Total | 
## ----------------------------|-----------|-----------|-----------|
##                         A11 |      5573 |      5417 |     10990 | 
##                             |   588.743 |  1379.407 |           | 
##                             |     0.507 |     0.493 |     0.274 | 
##                             |     0.198 |     0.451 |           | 
##                             |     0.139 |     0.135 |           | 
## ----------------------------|-----------|-----------|-----------|
##                         A12 |      6624 |      4175 |     10799 | 
##                             |   117.900 |   276.237 |           | 
##                             |     0.613 |     0.387 |     0.269 | 
##                             |     0.236 |     0.348 |           | 
##                             |     0.165 |     0.104 |           | 
## ----------------------------|-----------|-----------|-----------|
##                         A13 |      1967 |       564 |      2531 | 
##                             |    21.023 |    49.255 |           | 
##                             |     0.777 |     0.223 |     0.063 | 
##                             |     0.070 |     0.047 |           | 
##                             |     0.049 |     0.014 |           | 
## ----------------------------|-----------|-----------|-----------|
##                         A14 |     13954 |      1845 |     15799 | 
##                             |   749.606 |  1756.306 |           | 
##                             |     0.883 |     0.117 |     0.394 | 
##                             |     0.496 |     0.154 |           | 
##                             |     0.348 |     0.046 |           | 
## ----------------------------|-----------|-----------|-----------|
##                Column Total |     28118 |     12001 |     40119 | 
##                             |     0.701 |     0.299 |           | 
## ----------------------------|-----------|-----------|-----------|
## 
##

CrossTable(logdata$Status_Checking_Acc,logdata$Default_On_Payment,expected=FALSE,prop.r=FALSE, prop.c=FALSE,prop.t=FALSE, prop.chisq=FALSE,chisq=FALSE)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  40119 
## 
##  
##                             | logdata$Default_On_Payment 
## logdata$Status_Checking_Acc |         0 |         1 | Row Total | 
## ----------------------------|-----------|-----------|-----------|
##                         A11 |      5573 |      5417 |     10990 | 
## ----------------------------|-----------|-----------|-----------|
##                         A12 |      6624 |      4175 |     10799 | 
## ----------------------------|-----------|-----------|-----------|
##                         A13 |      1967 |       564 |      2531 | 
## ----------------------------|-----------|-----------|-----------|
##                         A14 |     13954 |      1845 |     15799 | 
## ----------------------------|-----------|-----------|-----------|
##                Column Total |     28118 |     12001 |     40119 | 
## ----------------------------|-----------|-----------|-----------|
## 
##

CrossTable(logdata$Status_Checking_Acc,logdata$Default_On_Payment,expected=FALSE, prop.c=FALSE,prop.t=FALSE, prop.chisq=FALSE,chisq=FALSE)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40119 
## 
##  
##                             | logdata$Default_On_Payment 
## logdata$Status_Checking_Acc |         0 |         1 | Row Total | 
## ----------------------------|-----------|-----------|-----------|
##                         A11 |      5573 |      5417 |     10990 | 
##                             |     0.507 |     0.493 |     0.274 | 
## ----------------------------|-----------|-----------|-----------|
##                         A12 |      6624 |      4175 |     10799 | 
##                             |     0.613 |     0.387 |     0.269 | 
## ----------------------------|-----------|-----------|-----------|
##                         A13 |      1967 |       564 |      2531 | 
##                             |     0.777 |     0.223 |     0.063 | 
## ----------------------------|-----------|-----------|-----------|
##                         A14 |     13954 |      1845 |     15799 | 
##                             |     0.883 |     0.117 |     0.394 | 
## ----------------------------|-----------|-----------|-----------|
##                Column Total |     28118 |     12001 |     40119 | 
## ----------------------------|-----------|-----------|-----------|
## 
##

# Step 6- WOE & IV

library(Information)
library(riv)

## Loading required package: MASS

## Loading required package: rrcov

## Loading required package: robustbase

## Scalable Robust Estimators with High Breakdown Point (version 1.4-3)

## Loading required package: quantreg

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

library(devtools)
library(woe)
library(gridExtra)


# Step 6a Generate IV of each independent factor- what variables we want to select for building model?

stat <- create_infotables(data=logdata, y = "Default_On_Payment")
grid.table(stat$Summary, rows=NULL)

write.csv(stat$Summary,"IV_summary.csv",row.names=F)


# Step 6b- subset the data to select only significant variables

newdata <- subset(logdata, select = c(Status_Checking_Acc, Duration_in_Months, Credit_History, Savings_Acc, Purposre_Credit_Taken, Age, Property, Years_At_Present_Employment, Housing, Other_Inst_Plans, Default_On_Payment))


# 6c Generate WOE table for each independed factor

# stat <- create_infotables(data=newdata, y = "Default_On_Payment")
# 
# grid.table(new$Tables$Status_Checking_Acc, rows=NULL)
# 
# grid.table(stat$Tables$Duration_in_Months, rows=NULL)

# Step 7 - Build Linear Reg model

library(car)

linreg = lm(Default_On_Payment~., data = newdata)
summary(linreg)

## 
## Call:
## lm(formula = Default_On_Payment ~ ., data = newdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.86382 -0.28676 -0.09219  0.32246  1.00918 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.7697585  0.0218420  35.242  < 2e-16 ***
## Status_Checking_AccA12         -0.1002857  0.0056839 -17.644  < 2e-16 ***
## Status_Checking_AccA13         -0.1982932  0.0089160 -22.240  < 2e-16 ***
## Status_Checking_AccA14         -0.2719186  0.0053806 -50.537  < 2e-16 ***
## Duration_in_Months11           -0.0883551  0.0241493  -3.659 0.000254 ***
## Duration_in_Months12            0.1316079  0.0127738  10.303  < 2e-16 ***
## Duration_in_Months13           -0.1679345  0.0336398  -4.992 6.00e-07 ***
## Duration_in_Months14           -0.0310201  0.0334909  -0.926 0.354335    
## Duration_in_Months15            0.0651237  0.0142697   4.564 5.04e-06 ***
## Duration_in_Months16            0.1752283  0.0463289   3.782 0.000156 ***
## Duration_in_Months18            0.2004975  0.0134166  14.944  < 2e-16 ***
## Duration_in_Months20           -0.0221322  0.0254798  -0.869 0.385061    
## Duration_in_Months21            0.1846304  0.0165940  11.126  < 2e-16 ***
## Duration_in_Months22            0.0164428  0.0457009   0.360 0.719005    
## Duration_in_Months24            0.1777103  0.0128954  13.781  < 2e-16 ***
## Duration_in_Months26           -0.1471282  0.0637199  -2.309 0.020950 *  
## Duration_in_Months27            0.2374538  0.0215329  11.028  < 2e-16 ***
## Duration_in_Months28            0.1920215  0.0379099   5.065 4.10e-07 ***
## Duration_in_Months30            0.2005126  0.0157673  12.717  < 2e-16 ***
## Duration_in_Months33            0.3675467  0.0381566   9.633  < 2e-16 ***
## Duration_in_Months36            0.2938116  0.0139703  21.031  < 2e-16 ***
## Duration_in_Months39            0.1831788  0.0306500   5.976 2.30e-09 ***
## Duration_in_Months4             0.0600658  0.0282814   2.124 0.033688 *  
## Duration_in_Months40            0.4745183  0.0640407   7.410 1.29e-13 ***
## Duration_in_Months42            0.1325365  0.0226113   5.862 4.62e-09 ***
## Duration_in_Months45            0.5373842  0.0307741  17.462  < 2e-16 ***
## Duration_in_Months47           -0.4267245  0.0633595  -6.735 1.66e-11 ***
## Duration_in_Months48            0.3796413  0.0154894  24.510  < 2e-16 ***
## Duration_in_Months5             0.1641571  0.0641030   2.561 0.010446 *  
## Duration_in_Months54            0.2297115  0.0470660   4.881 1.06e-06 ***
## Duration_in_Months6            -0.0284747  0.0138936  -2.049 0.040421 *  
## Duration_in_Months60            0.2800291  0.0212741  13.163  < 2e-16 ***
## Duration_in_Months7            -0.0453094  0.0304958  -1.486 0.137351    
## Duration_in_Months72            0.8438571  0.0636178  13.264  < 2e-16 ***
## Duration_in_Months8            -0.0262472  0.0267354  -0.982 0.326235    
## Duration_in_Months9             0.1470090  0.0149335   9.844  < 2e-16 ***
## Credit_HistoryA31              -0.0168741  0.0139103  -1.213 0.225113    
## Credit_HistoryA32              -0.1712827  0.0109288 -15.673  < 2e-16 ***
## Credit_HistoryA33              -0.1722833  0.0123986 -13.895  < 2e-16 ***
## Credit_HistoryA34              -0.2559607  0.0113152 -22.621  < 2e-16 ***
## Savings_AccA62                 -0.0331820  0.0069638  -4.765 1.90e-06 ***
## Savings_AccA63                 -0.0727308  0.0085281  -8.528  < 2e-16 ***
## Savings_AccA64                 -0.1124998  0.0096075 -11.710  < 2e-16 ***
## Savings_AccA65                 -0.1105122  0.0055340 -19.970  < 2e-16 ***
## Purposre_Credit_TakenA41       -0.2385023  0.0078033 -30.564  < 2e-16 ***
## Purposre_Credit_TakenA410      -0.1952226  0.0191432 -10.198  < 2e-16 ***
## Purposre_Credit_TakenA42       -0.1275840  0.0063874 -19.974  < 2e-16 ***
## Purposre_Credit_TakenA43       -0.1372409  0.0057771 -23.756  < 2e-16 ***
## Purposre_Credit_TakenA44       -0.0593060  0.0185287  -3.201 0.001372 ** 
## Purposre_Credit_TakenA45       -0.0251342  0.0140787  -1.785 0.074226 .  
## Purposre_Credit_TakenA46        0.0183318  0.0100066   1.832 0.066963 .  
## Purposre_Credit_TakenA48       -0.2157282  0.0213985 -10.081  < 2e-16 ***
## Purposre_Credit_TakenA49       -0.1231291  0.0080966 -15.208  < 2e-16 ***
## Age                            -0.0012711  0.0002051  -6.198 5.76e-10 ***
## PropertyA122                    0.0611509  0.0058654  10.426  < 2e-16 ***
## PropertyA123                    0.0528360  0.0054169   9.754  < 2e-16 ***
## PropertyA124                    0.1489449  0.0097867  15.219  < 2e-16 ***
## Years_At_Present_EmploymentA72  0.0252472  0.0097476   2.590 0.009599 ** 
## Years_At_Present_EmploymentA73 -0.0414445  0.0090543  -4.577 4.72e-06 ***
## Years_At_Present_EmploymentA74 -0.1210120  0.0096866 -12.493  < 2e-16 ***
## Years_At_Present_EmploymentA75 -0.0506770  0.0091716  -5.525 3.31e-08 ***
## HousingA152                    -0.0798273  0.0055000 -14.514  < 2e-16 ***
## HousingA153                    -0.1272980  0.0112635 -11.302  < 2e-16 ***
## Other_Inst_PlansA142            0.0045000  0.0107968   0.417 0.676837    
## Other_Inst_PlansA143           -0.0726811  0.0061262 -11.864  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3908 on 40054 degrees of freedom
## Multiple R-squared:  0.2726, Adjusted R-squared:  0.2715 
## F-statistic: 234.6 on 64 and 40054 DF,  p-value: < 2.2e-16

plot(predict(linreg)) #predicted values lie outside 0 to 1 range - hence improper model fit

plot(linreg)

# Step 8- Transformmation/Dummy coding of variables for getting the best model
write.csv(stat$Tables$Duration_in_Months,"Duration_summary.csv",row.names=F)
newdata$Duration_Category <- ifelse(newdata$Duration_in_Months %in% c("4","5","6", "7", "8", "9", "10", "11","12", "13", "14", "15", "16", "17", "18"),"lessthan20", ifelse(newdata$Duration_in_Months %in% c("20","21","22", "24", "26", "27", "28", "30","33", "36", "39"), "20to40", "morethan40"))
table(newdata$Duration_in_Months, newdata$Duration_Category)

##       
##        20to40 lessthan20 morethan40
##   10        0       1123          0
##   11        0        361          0
##   12        0       7179          0
##   13        0        161          0
##   14        0        161          0
##   15        0       2567          0
##   16        0         80          0
##   18        0       4536          0
##   20      322          0          0
##   21     1205          0          0
##   22       80          0          0
##   24     7386          0          0
##   26       40          0          0
##   27      521          0          0
##   28      120          0          0
##   30     1604          0          0
##   33      121          0          0
##   36     3326          0          0
##   39      200          0          0
##   4         0        240          0
##   40        0          0         41
##   42        0          0        440
##   45        0          0        200
##   47        0          0         40
##   48        0          0       1924
##   5         0         40          0
##   54        0          0         80
##   6         0       3009          0
##   60        0          0        524
##   7         0        200          0
##   72        0          0         40
##   8         0        280          0
##   9         0       1968          0
##   A122      0          0          0

stat <- create_infotables(data=newdata, y = "Default_On_Payment")
newdata$Duration_Dummy_20 <- ifelse(newdata$Duration_Category == "lessthan20", 1,0)
newdata$Duration_Dummy_40 <- ifelse(newdata$Duration_Category == "20to40", 1,0)
newdata$Status_Checking_Acc_A11 <- ifelse(newdata$Status_Checking_Acc == "A11", 1,0)
newdata$Status_Checking_Acc_A12 <- ifelse(newdata$Status_Checking_Acc == "A12", 1,0)
newdata$Status_Checking_Acc_A13 <- ifelse(newdata$Status_Checking_Acc == "A13", 1,0)
newdata$Default_On_Payment1 <- as.factor(ifelse(newdata$Default_On_Payment == 1,"1","0"))

#Step 9 - Build logistic regression

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

logreg <- glm(Default_On_Payment ~ Duration_Dummy_20+Duration_Dummy_40+Status_Checking_Acc_A11+Status_Checking_Acc_A12+Status_Checking_Acc_A13+Age, family = binomial("logit"),data = newdata)
summary(logreg)

## 
## Call:
## glm(formula = Default_On_Payment ~ Duration_Dummy_20 + Duration_Dummy_40 + 
##     Status_Checking_Acc_A11 + Status_Checking_Acc_A12 + Status_Checking_Acc_A13 + 
##     Age, family = binomial("logit"), data = newdata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6125  -0.8832  -0.4819   1.0449   2.3970  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.595650   0.059126  -10.07   <2e-16 ***
## Duration_Dummy_20       -1.154614   0.041206  -28.02   <2e-16 ***
## Duration_Dummy_40       -0.677572   0.042070  -16.11   <2e-16 ***
## Status_Checking_Acc_A11  1.984595   0.031684   62.64   <2e-16 ***
## Status_Checking_Acc_A12  1.495144   0.032235   46.38   <2e-16 ***
## Status_Checking_Acc_A13  0.865074   0.054460   15.88   <2e-16 ***
## Age                     -0.015651   0.001076  -14.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 48956  on 40118  degrees of freedom
## Residual deviance: 42581  on 40112  degrees of freedom
## AIC: 42595
## 
## Number of Fisher Scoring iterations: 4

plot(predict(logreg,type="response")) #note plot option has type to get inverse of log_odds

newdata$predicted = predict(logreg,type="response")
write.csv(newdata,"output_logreg.csv", row.names = F)

capture.output(summary(logreg), file = "summary_logreg.csv")
summary_residuals_model.csv<-residuals(logreg, type="deviance")
write.csv(summary_residuals_model.csv, "summary_residuals_model.csv")

#Step 10 - Model Diagnostics

# ROCR

newdata$predicted = predict(logreg,type="response")
pred<-prediction(newdata$predicted,newdata$Default_On_Payment)
perf <- performance(pred,"tpr","fpr")
plot(perf)
abline(a=0, b=1, col="Red")

# AUC
auc.perf = performance(pred, measure = "auc")
auc.perf@y.values

## [[1]]
## [1] 0.7429535

# Create Decile by scorebands

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

newdata$decile <- ntile(-newdata$predicted,10)
write.csv(newdata, "newdata.csv")

# KS

max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])

## [1] 0.3810604

# Lift
lift.obj <- performance(pred, "lift", x.measure = "rpp")
plot(lift.obj,
     main="Cross-Sell - Lift Chart",
     xlab="% Populations",
     ylab="Lift",
     col="blue")
abline(1,0,col="grey")

# Lorenz and Gini
library(ineq)
# Gini Index
ineq(newdata$predicted,type="Gini")

## [1] 0.3388472

## Lorenz Curve
plot(Lc(newdata$predicted),col="darkred",lwd=2)

##Get Concordance/Pairs Stats

# High-Low Ratio of bad rate by decile in Excel. 
# Confidence interval for ROC, KS, Gini
#  mydata[ which(mydata$gender=='F' & mydata$age > 65), ]



Concordance = function(y,yhat) 
{
  outcome_and_fitted_col<-data.frame(y, yhat)
  colnames(outcome_and_fitted_col)<-c("Responder","fitted.values")
  # get a subset of outcomes where the event actually happened
  ones = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 1,]
  # get a subset of outcomes where the event didn't actually happen
  zeros = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 0,]
  # Equate the length of the event and non-event tables
  if (length(ones[,1])>length(zeros[,1])) {ones = ones[1:length(zeros[,1]),]}
  else {zeros = zeros[1:length(ones[,1]),]}
  # Following will be c(ones_outcome, ones_fitted, zeros_outcome, zeros_fitted)
  ones_and_zeros = data.frame(ones, zeros)
  # initiate columns to store concordant, discordant, and tie pair evaluations
  conc = rep(NA, length(ones_and_zeros[,1]))
  disc = rep(NA, length(ones_and_zeros[,1]))
  ties = rep(NA, length(ones_and_zeros[,1]))
  for (i in 1:length(ones_and_zeros[,1])) {
    # This tests for concordance
    if (ones_and_zeros[i,2] > ones_and_zeros[i,4])
    {conc[i] = 1
    disc[i] = 0
    ties[i] = 0
    }
    # This tests for a tie
    else if (ones_and_zeros[i,2] == ones_and_zeros[i,4])
    {
      conc[i] = 0
      disc[i] = 0
      ties[i] = 1
    }
    # This should catch discordant pairs.
    else if (ones_and_zeros[i,2] < ones_and_zeros[i,4])
    {
      conc[i] = 0
      disc[i] = 1
      ties[i] = 0 
    }
  }
  # Here we save the various rates
  conc_rate = mean(conc, na.rm=TRUE)
  disc_rate = mean(disc, na.rm=TRUE)
  tie_rate = mean(ties, na.rm=TRUE)
  return(list(concordance=conc_rate, num_concordant=sum(conc), discordance=disc_rate, num_discordant=sum(disc), tie_rate=tie_rate,num_tied=sum(ties)))
}

Concordance_test<-Concordance(newdata$Default_On_Payment,newdata$predicted)

Concordance_test

## $concordance
## [1] 0.7421882
## 
## $num_concordant
## [1] 8907
## 
## $discordance
## [1] 0.2553121
## 
## $num_discordant
## [1] 3064
## 
## $tie_rate
## [1] 0.002499792
## 
## $num_tied
## [1] 30

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Logistic_Regression_R

Ratnakar_Pandey

September 21, 2017

R Markdown

Including Plots