##Step 1 - setwd
setwd("C:/Users/rpandey/Desktop/Classes")
##Step 2 - Load Data
logdata <- read.csv("Default_On_Payment.csv")
##Step 3 - Explore Data- where are the missing values? why some are NAs and others no values?
dim(logdata)
## [1] 40121 22
names(logdata)
## [1] "Customer_ID" "Status_Checking_Acc"
## [3] "Duration_in_Months" "Credit_History"
## [5] "Purposre_Credit_Taken" "Credit_Amount"
## [7] "Savings_Acc" "Years_At_Present_Employment"
## [9] "Inst_Rt_Income" "Marital_Status_Gender"
## [11] "Other_Debtors_Guarantors" "Current_Address_Yrs"
## [13] "Property" "Age"
## [15] "Other_Inst_Plans" "Housing"
## [17] "Num_CC" "Job"
## [19] "Dependents" "Telephone"
## [21] "Foreign_Worker" "Default_On_Payment"
head(logdata)
## Customer_ID Status_Checking_Acc Duration_in_Months Credit_History
## 1 100015 A14 27 A32
## 2 100031 A11 12 A34
## 3 100046 A12 13 A34
## 4 100103 A14 24 A32
## 5 100104 A11 24 A32
## 6 100128 A11 10 A32
## Purposre_Credit_Taken Credit_Amount Savings_Acc
## 1 A40 2570 A61
## 2 A43 385 A61
## 3 A43 882 A61
## 4 A40 1393 A61
## 5 A40 3123 A61
## 6 A43 2315 A61
## Years_At_Present_Employment Inst_Rt_Income Marital_Status_Gender
## 1 A73 3 A92
## 2 A74 4 A92
## 3 A72 4 A93
## 4 A73 2 A93
## 5 A72 4 A92
## 6 A75 3 A93
## Other_Debtors_Guarantors Current_Address_Yrs Property Age
## 1 A101 3 A121 21
## 2 A101 3 A121 58
## 3 A103 4 A121 23
## 4 A103 2 A121 31
## 5 A101 1 A122 27
## 6 A101 4 A121 52
## Other_Inst_Plans Housing Num_CC Job Dependents Telephone Foreign_Worker
## 1 A143 A151 1 A173 1 A191 A201
## 2 A143 A152 4 A172 1 A192 A201
## 3 A143 A152 2 A173 1 A191 A201
## 4 A143 A152 1 A173 1 A192 A201
## 5 A143 A152 1 A173 1 A191 A201
## 6 A143 A152 1 A172 1 A191 A201
## Default_On_Payment
## 1 1
## 2 0
## 3 0
## 4 0
## 5 1
## 6 0
tail(logdata)
## Customer_ID Status_Checking_Acc Duration_in_Months Credit_History
## 40116 986907 A11 24 A32
## 40117 986936 A12 24 A34
## 40118 986939 A12 60 A32
## 40119 986944 A12 36 A30
## 40120 986974 A11 20 A34
## 40121 987000 A14 36 A34
## Purposre_Credit_Taken Credit_Amount Savings_Acc
## 40116 A40 1285 A65
## 40117 A49 1935 A61
## 40118 A40 14027 A61
## 40119 A43 3804 A61
## 40120 A40 2235 A61
## 40121
## Years_At_Present_Employment Inst_Rt_Income Marital_Status_Gender
## 40116 A74 4 A92
## 40117 A75 4 A91
## 40118 A74 4 A93
## 40119 A73 4 A92
## 40120 A73 4 A94
## 40121 NA
## Other_Debtors_Guarantors Current_Address_Yrs Property Age
## 40116 A101 4 A124 32
## 40117 A101 4 A121 31
## 40118 A101 2 A124 27
## 40119 A101 1 A123 42
## 40120 A103 2 A122 33
## 40121 NA NA
## Other_Inst_Plans Housing Num_CC Job Dependents Telephone
## 40116 A143 A151 1 A173 1 A191
## 40117 A143 A152 2 A173 1 A192
## 40118 A143 A152 1 A174 1 A192
## 40119 A143 A152 1 A173 1 A192
## 40120 A141 A151 2 A173 1 A191
## 40121 NA NA
## Foreign_Worker Default_On_Payment
## 40116 A201 1
## 40117 A201 1
## 40118 A201 1
## 40119 A201 1
## 40120 A202 1
## 40121 NA
str(logdata)
## 'data.frame': 40121 obs. of 22 variables:
## $ Customer_ID : int 100015 100031 100046 100103 100104 100128 100148 100164 100182 100230 ...
## $ Status_Checking_Acc : Factor w/ 5 levels "2","A11","A12",..: 5 2 3 5 2 2 2 5 3 5 ...
## $ Duration_in_Months : Factor w/ 34 levels "10","11","12",..: 14 3 4 12 12 1 2 12 8 3 ...
## $ Credit_History : Factor w/ 6 levels "46","A30","A31",..: 4 6 6 4 4 4 6 6 4 4 ...
## $ Purposre_Credit_Taken : Factor w/ 12 levels "","A143","A40",..: 3 7 7 3 3 7 3 7 7 3 ...
## $ Credit_Amount : Factor w/ 923 levels "","1007","10127",..: 419 602 877 153 511 373 615 353 875 150 ...
## $ Savings_Acc : Factor w/ 7 levels "","2","A61","A62",..: 3 3 3 3 3 3 3 4 3 5 ...
## $ Years_At_Present_Employment: Factor w/ 7 levels "","A172","A71",..: 5 6 4 5 4 7 5 7 5 5 ...
## $ Inst_Rt_Income : int 3 4 4 2 4 3 1 4 4 2 ...
## $ Marital_Status_Gender : Factor w/ 6 levels "","A192","A91",..: 4 4 5 5 4 5 5 5 6 4 ...
## $ Other_Debtors_Guarantors : Factor w/ 5 levels "","A101","A102",..: 2 2 4 4 2 2 2 2 4 2 ...
## $ Current_Address_Yrs : int 3 3 4 2 1 4 2 4 2 2 ...
## $ Property : Factor w/ 5 levels "","A121","A122",..: 2 2 2 2 3 2 2 3 2 3 ...
## $ Age : int 21 58 23 31 27 52 40 52 25 26 ...
## $ Other_Inst_Plans : Factor w/ 4 levels "","A141","A142",..: 4 4 4 4 4 4 4 2 4 4 ...
## $ Housing : Factor w/ 4 levels "","A151","A152",..: 2 3 3 3 3 3 3 3 3 3 ...
## $ Num_CC : int 1 4 2 1 1 1 2 2 1 1 ...
## $ Job : Factor w/ 5 levels "","A171","A172",..: 4 3 4 4 4 3 3 4 3 4 ...
## $ Dependents : int 1 1 1 1 1 1 2 1 1 1 ...
## $ Telephone : Factor w/ 3 levels "","A191","A192": 2 3 2 3 2 2 2 2 2 2 ...
## $ Foreign_Worker : Factor w/ 3 levels "","A201","A202": 2 2 2 2 2 2 2 2 2 2 ...
## $ Default_On_Payment : int 1 0 0 0 1 0 0 0 0 1 ...
summary(as.factor(logdata$Default_On_Payment))
## 0 1 NA's
## 28118 12001 2
summary_logdata = summary(logdata)
write.csv(summary_logdata,"summary_logdata.csv",row.names=F)
##Step 4 - Remove any rows with missing values and removed customer_id
logdata <- logdata[complete.cases(logdata),]
logdata <- logdata[,c(-1)]
# Step 5- Bivariate Analyis
library(gmodels)
CrossTable(logdata$Status_Checking_Acc,logdata$Default_On_Payment)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 40119
##
##
## | logdata$Default_On_Payment
## logdata$Status_Checking_Acc | 0 | 1 | Row Total |
## ----------------------------|-----------|-----------|-----------|
## A11 | 5573 | 5417 | 10990 |
## | 588.743 | 1379.407 | |
## | 0.507 | 0.493 | 0.274 |
## | 0.198 | 0.451 | |
## | 0.139 | 0.135 | |
## ----------------------------|-----------|-----------|-----------|
## A12 | 6624 | 4175 | 10799 |
## | 117.900 | 276.237 | |
## | 0.613 | 0.387 | 0.269 |
## | 0.236 | 0.348 | |
## | 0.165 | 0.104 | |
## ----------------------------|-----------|-----------|-----------|
## A13 | 1967 | 564 | 2531 |
## | 21.023 | 49.255 | |
## | 0.777 | 0.223 | 0.063 |
## | 0.070 | 0.047 | |
## | 0.049 | 0.014 | |
## ----------------------------|-----------|-----------|-----------|
## A14 | 13954 | 1845 | 15799 |
## | 749.606 | 1756.306 | |
## | 0.883 | 0.117 | 0.394 |
## | 0.496 | 0.154 | |
## | 0.348 | 0.046 | |
## ----------------------------|-----------|-----------|-----------|
## Column Total | 28118 | 12001 | 40119 |
## | 0.701 | 0.299 | |
## ----------------------------|-----------|-----------|-----------|
##
##
CrossTable(logdata$Status_Checking_Acc,logdata$Default_On_Payment,expected=FALSE,prop.r=FALSE, prop.c=FALSE,prop.t=FALSE, prop.chisq=FALSE,chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## |-------------------------|
##
##
## Total Observations in Table: 40119
##
##
## | logdata$Default_On_Payment
## logdata$Status_Checking_Acc | 0 | 1 | Row Total |
## ----------------------------|-----------|-----------|-----------|
## A11 | 5573 | 5417 | 10990 |
## ----------------------------|-----------|-----------|-----------|
## A12 | 6624 | 4175 | 10799 |
## ----------------------------|-----------|-----------|-----------|
## A13 | 1967 | 564 | 2531 |
## ----------------------------|-----------|-----------|-----------|
## A14 | 13954 | 1845 | 15799 |
## ----------------------------|-----------|-----------|-----------|
## Column Total | 28118 | 12001 | 40119 |
## ----------------------------|-----------|-----------|-----------|
##
##
CrossTable(logdata$Status_Checking_Acc,logdata$Default_On_Payment,expected=FALSE, prop.c=FALSE,prop.t=FALSE, prop.chisq=FALSE,chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 40119
##
##
## | logdata$Default_On_Payment
## logdata$Status_Checking_Acc | 0 | 1 | Row Total |
## ----------------------------|-----------|-----------|-----------|
## A11 | 5573 | 5417 | 10990 |
## | 0.507 | 0.493 | 0.274 |
## ----------------------------|-----------|-----------|-----------|
## A12 | 6624 | 4175 | 10799 |
## | 0.613 | 0.387 | 0.269 |
## ----------------------------|-----------|-----------|-----------|
## A13 | 1967 | 564 | 2531 |
## | 0.777 | 0.223 | 0.063 |
## ----------------------------|-----------|-----------|-----------|
## A14 | 13954 | 1845 | 15799 |
## | 0.883 | 0.117 | 0.394 |
## ----------------------------|-----------|-----------|-----------|
## Column Total | 28118 | 12001 | 40119 |
## ----------------------------|-----------|-----------|-----------|
##
##
# Step 6- WOE & IV
library(Information)
library(riv)
## Loading required package: MASS
## Loading required package: rrcov
## Loading required package: robustbase
## Scalable Robust Estimators with High Breakdown Point (version 1.4-3)
## Loading required package: quantreg
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(devtools)
library(woe)
library(gridExtra)
# Step 6a Generate IV of each independent factor- what variables we want to select for building model?
stat <- create_infotables(data=logdata, y = "Default_On_Payment")
grid.table(stat$Summary, rows=NULL)
write.csv(stat$Summary,"IV_summary.csv",row.names=F)
# Step 6b- subset the data to select only significant variables
newdata <- subset(logdata, select = c(Status_Checking_Acc, Duration_in_Months, Credit_History, Savings_Acc, Purposre_Credit_Taken, Age, Property, Years_At_Present_Employment, Housing, Other_Inst_Plans, Default_On_Payment))
# 6c Generate WOE table for each independed factor
# stat <- create_infotables(data=newdata, y = "Default_On_Payment")
#
# grid.table(new$Tables$Status_Checking_Acc, rows=NULL)
#
# grid.table(stat$Tables$Duration_in_Months, rows=NULL)
# Step 7 - Build Linear Reg model
library(car)
linreg = lm(Default_On_Payment~., data = newdata)
summary(linreg)
##
## Call:
## lm(formula = Default_On_Payment ~ ., data = newdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.86382 -0.28676 -0.09219 0.32246 1.00918
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7697585 0.0218420 35.242 < 2e-16 ***
## Status_Checking_AccA12 -0.1002857 0.0056839 -17.644 < 2e-16 ***
## Status_Checking_AccA13 -0.1982932 0.0089160 -22.240 < 2e-16 ***
## Status_Checking_AccA14 -0.2719186 0.0053806 -50.537 < 2e-16 ***
## Duration_in_Months11 -0.0883551 0.0241493 -3.659 0.000254 ***
## Duration_in_Months12 0.1316079 0.0127738 10.303 < 2e-16 ***
## Duration_in_Months13 -0.1679345 0.0336398 -4.992 6.00e-07 ***
## Duration_in_Months14 -0.0310201 0.0334909 -0.926 0.354335
## Duration_in_Months15 0.0651237 0.0142697 4.564 5.04e-06 ***
## Duration_in_Months16 0.1752283 0.0463289 3.782 0.000156 ***
## Duration_in_Months18 0.2004975 0.0134166 14.944 < 2e-16 ***
## Duration_in_Months20 -0.0221322 0.0254798 -0.869 0.385061
## Duration_in_Months21 0.1846304 0.0165940 11.126 < 2e-16 ***
## Duration_in_Months22 0.0164428 0.0457009 0.360 0.719005
## Duration_in_Months24 0.1777103 0.0128954 13.781 < 2e-16 ***
## Duration_in_Months26 -0.1471282 0.0637199 -2.309 0.020950 *
## Duration_in_Months27 0.2374538 0.0215329 11.028 < 2e-16 ***
## Duration_in_Months28 0.1920215 0.0379099 5.065 4.10e-07 ***
## Duration_in_Months30 0.2005126 0.0157673 12.717 < 2e-16 ***
## Duration_in_Months33 0.3675467 0.0381566 9.633 < 2e-16 ***
## Duration_in_Months36 0.2938116 0.0139703 21.031 < 2e-16 ***
## Duration_in_Months39 0.1831788 0.0306500 5.976 2.30e-09 ***
## Duration_in_Months4 0.0600658 0.0282814 2.124 0.033688 *
## Duration_in_Months40 0.4745183 0.0640407 7.410 1.29e-13 ***
## Duration_in_Months42 0.1325365 0.0226113 5.862 4.62e-09 ***
## Duration_in_Months45 0.5373842 0.0307741 17.462 < 2e-16 ***
## Duration_in_Months47 -0.4267245 0.0633595 -6.735 1.66e-11 ***
## Duration_in_Months48 0.3796413 0.0154894 24.510 < 2e-16 ***
## Duration_in_Months5 0.1641571 0.0641030 2.561 0.010446 *
## Duration_in_Months54 0.2297115 0.0470660 4.881 1.06e-06 ***
## Duration_in_Months6 -0.0284747 0.0138936 -2.049 0.040421 *
## Duration_in_Months60 0.2800291 0.0212741 13.163 < 2e-16 ***
## Duration_in_Months7 -0.0453094 0.0304958 -1.486 0.137351
## Duration_in_Months72 0.8438571 0.0636178 13.264 < 2e-16 ***
## Duration_in_Months8 -0.0262472 0.0267354 -0.982 0.326235
## Duration_in_Months9 0.1470090 0.0149335 9.844 < 2e-16 ***
## Credit_HistoryA31 -0.0168741 0.0139103 -1.213 0.225113
## Credit_HistoryA32 -0.1712827 0.0109288 -15.673 < 2e-16 ***
## Credit_HistoryA33 -0.1722833 0.0123986 -13.895 < 2e-16 ***
## Credit_HistoryA34 -0.2559607 0.0113152 -22.621 < 2e-16 ***
## Savings_AccA62 -0.0331820 0.0069638 -4.765 1.90e-06 ***
## Savings_AccA63 -0.0727308 0.0085281 -8.528 < 2e-16 ***
## Savings_AccA64 -0.1124998 0.0096075 -11.710 < 2e-16 ***
## Savings_AccA65 -0.1105122 0.0055340 -19.970 < 2e-16 ***
## Purposre_Credit_TakenA41 -0.2385023 0.0078033 -30.564 < 2e-16 ***
## Purposre_Credit_TakenA410 -0.1952226 0.0191432 -10.198 < 2e-16 ***
## Purposre_Credit_TakenA42 -0.1275840 0.0063874 -19.974 < 2e-16 ***
## Purposre_Credit_TakenA43 -0.1372409 0.0057771 -23.756 < 2e-16 ***
## Purposre_Credit_TakenA44 -0.0593060 0.0185287 -3.201 0.001372 **
## Purposre_Credit_TakenA45 -0.0251342 0.0140787 -1.785 0.074226 .
## Purposre_Credit_TakenA46 0.0183318 0.0100066 1.832 0.066963 .
## Purposre_Credit_TakenA48 -0.2157282 0.0213985 -10.081 < 2e-16 ***
## Purposre_Credit_TakenA49 -0.1231291 0.0080966 -15.208 < 2e-16 ***
## Age -0.0012711 0.0002051 -6.198 5.76e-10 ***
## PropertyA122 0.0611509 0.0058654 10.426 < 2e-16 ***
## PropertyA123 0.0528360 0.0054169 9.754 < 2e-16 ***
## PropertyA124 0.1489449 0.0097867 15.219 < 2e-16 ***
## Years_At_Present_EmploymentA72 0.0252472 0.0097476 2.590 0.009599 **
## Years_At_Present_EmploymentA73 -0.0414445 0.0090543 -4.577 4.72e-06 ***
## Years_At_Present_EmploymentA74 -0.1210120 0.0096866 -12.493 < 2e-16 ***
## Years_At_Present_EmploymentA75 -0.0506770 0.0091716 -5.525 3.31e-08 ***
## HousingA152 -0.0798273 0.0055000 -14.514 < 2e-16 ***
## HousingA153 -0.1272980 0.0112635 -11.302 < 2e-16 ***
## Other_Inst_PlansA142 0.0045000 0.0107968 0.417 0.676837
## Other_Inst_PlansA143 -0.0726811 0.0061262 -11.864 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3908 on 40054 degrees of freedom
## Multiple R-squared: 0.2726, Adjusted R-squared: 0.2715
## F-statistic: 234.6 on 64 and 40054 DF, p-value: < 2.2e-16
plot(predict(linreg)) #predicted values lie outside 0 to 1 range - hence improper model fit
plot(linreg)
# Step 8- Transformmation/Dummy coding of variables for getting the best model
write.csv(stat$Tables$Duration_in_Months,"Duration_summary.csv",row.names=F)
newdata$Duration_Category <- ifelse(newdata$Duration_in_Months %in% c("4","5","6", "7", "8", "9", "10", "11","12", "13", "14", "15", "16", "17", "18"),"lessthan20", ifelse(newdata$Duration_in_Months %in% c("20","21","22", "24", "26", "27", "28", "30","33", "36", "39"), "20to40", "morethan40"))
table(newdata$Duration_in_Months, newdata$Duration_Category)
##
## 20to40 lessthan20 morethan40
## 10 0 1123 0
## 11 0 361 0
## 12 0 7179 0
## 13 0 161 0
## 14 0 161 0
## 15 0 2567 0
## 16 0 80 0
## 18 0 4536 0
## 20 322 0 0
## 21 1205 0 0
## 22 80 0 0
## 24 7386 0 0
## 26 40 0 0
## 27 521 0 0
## 28 120 0 0
## 30 1604 0 0
## 33 121 0 0
## 36 3326 0 0
## 39 200 0 0
## 4 0 240 0
## 40 0 0 41
## 42 0 0 440
## 45 0 0 200
## 47 0 0 40
## 48 0 0 1924
## 5 0 40 0
## 54 0 0 80
## 6 0 3009 0
## 60 0 0 524
## 7 0 200 0
## 72 0 0 40
## 8 0 280 0
## 9 0 1968 0
## A122 0 0 0
stat <- create_infotables(data=newdata, y = "Default_On_Payment")
newdata$Duration_Dummy_20 <- ifelse(newdata$Duration_Category == "lessthan20", 1,0)
newdata$Duration_Dummy_40 <- ifelse(newdata$Duration_Category == "20to40", 1,0)
newdata$Status_Checking_Acc_A11 <- ifelse(newdata$Status_Checking_Acc == "A11", 1,0)
newdata$Status_Checking_Acc_A12 <- ifelse(newdata$Status_Checking_Acc == "A12", 1,0)
newdata$Status_Checking_Acc_A13 <- ifelse(newdata$Status_Checking_Acc == "A13", 1,0)
newdata$Default_On_Payment1 <- as.factor(ifelse(newdata$Default_On_Payment == 1,"1","0"))
#Step 9 - Build logistic regression
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
logreg <- glm(Default_On_Payment ~ Duration_Dummy_20+Duration_Dummy_40+Status_Checking_Acc_A11+Status_Checking_Acc_A12+Status_Checking_Acc_A13+Age, family = binomial("logit"),data = newdata)
summary(logreg)
##
## Call:
## glm(formula = Default_On_Payment ~ Duration_Dummy_20 + Duration_Dummy_40 +
## Status_Checking_Acc_A11 + Status_Checking_Acc_A12 + Status_Checking_Acc_A13 +
## Age, family = binomial("logit"), data = newdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6125 -0.8832 -0.4819 1.0449 2.3970
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.595650 0.059126 -10.07 <2e-16 ***
## Duration_Dummy_20 -1.154614 0.041206 -28.02 <2e-16 ***
## Duration_Dummy_40 -0.677572 0.042070 -16.11 <2e-16 ***
## Status_Checking_Acc_A11 1.984595 0.031684 62.64 <2e-16 ***
## Status_Checking_Acc_A12 1.495144 0.032235 46.38 <2e-16 ***
## Status_Checking_Acc_A13 0.865074 0.054460 15.88 <2e-16 ***
## Age -0.015651 0.001076 -14.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 48956 on 40118 degrees of freedom
## Residual deviance: 42581 on 40112 degrees of freedom
## AIC: 42595
##
## Number of Fisher Scoring iterations: 4
plot(predict(logreg,type="response")) #note plot option has type to get inverse of log_odds
newdata$predicted = predict(logreg,type="response")
write.csv(newdata,"output_logreg.csv", row.names = F)
capture.output(summary(logreg), file = "summary_logreg.csv")
summary_residuals_model.csv<-residuals(logreg, type="deviance")
write.csv(summary_residuals_model.csv, "summary_residuals_model.csv")
#Step 10 - Model Diagnostics
# ROCR
newdata$predicted = predict(logreg,type="response")
pred<-prediction(newdata$predicted,newdata$Default_On_Payment)
perf <- performance(pred,"tpr","fpr")
plot(perf)
abline(a=0, b=1, col="Red")
# AUC
auc.perf = performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.7429535
# Create Decile by scorebands
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
newdata$decile <- ntile(-newdata$predicted,10)
write.csv(newdata, "newdata.csv")
# KS
max(attr(perf,'y.values')[[1]]-attr(perf,'x.values')[[1]])
## [1] 0.3810604
# Lift
lift.obj <- performance(pred, "lift", x.measure = "rpp")
plot(lift.obj,
main="Cross-Sell - Lift Chart",
xlab="% Populations",
ylab="Lift",
col="blue")
abline(1,0,col="grey")
# Lorenz and Gini
library(ineq)
# Gini Index
ineq(newdata$predicted,type="Gini")
## [1] 0.3388472
## Lorenz Curve
plot(Lc(newdata$predicted),col="darkred",lwd=2)
##Get Concordance/Pairs Stats
# High-Low Ratio of bad rate by decile in Excel.
# Confidence interval for ROC, KS, Gini
# mydata[ which(mydata$gender=='F' & mydata$age > 65), ]
Concordance = function(y,yhat)
{
outcome_and_fitted_col<-data.frame(y, yhat)
colnames(outcome_and_fitted_col)<-c("Responder","fitted.values")
# get a subset of outcomes where the event actually happened
ones = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 1,]
# get a subset of outcomes where the event didn't actually happen
zeros = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 0,]
# Equate the length of the event and non-event tables
if (length(ones[,1])>length(zeros[,1])) {ones = ones[1:length(zeros[,1]),]}
else {zeros = zeros[1:length(ones[,1]),]}
# Following will be c(ones_outcome, ones_fitted, zeros_outcome, zeros_fitted)
ones_and_zeros = data.frame(ones, zeros)
# initiate columns to store concordant, discordant, and tie pair evaluations
conc = rep(NA, length(ones_and_zeros[,1]))
disc = rep(NA, length(ones_and_zeros[,1]))
ties = rep(NA, length(ones_and_zeros[,1]))
for (i in 1:length(ones_and_zeros[,1])) {
# This tests for concordance
if (ones_and_zeros[i,2] > ones_and_zeros[i,4])
{conc[i] = 1
disc[i] = 0
ties[i] = 0
}
# This tests for a tie
else if (ones_and_zeros[i,2] == ones_and_zeros[i,4])
{
conc[i] = 0
disc[i] = 0
ties[i] = 1
}
# This should catch discordant pairs.
else if (ones_and_zeros[i,2] < ones_and_zeros[i,4])
{
conc[i] = 0
disc[i] = 1
ties[i] = 0
}
}
# Here we save the various rates
conc_rate = mean(conc, na.rm=TRUE)
disc_rate = mean(disc, na.rm=TRUE)
tie_rate = mean(ties, na.rm=TRUE)
return(list(concordance=conc_rate, num_concordant=sum(conc), discordance=disc_rate, num_discordant=sum(disc), tie_rate=tie_rate,num_tied=sum(ties)))
}
Concordance_test<-Concordance(newdata$Default_On_Payment,newdata$predicted)
Concordance_test
## $concordance
## [1] 0.7421882
##
## $num_concordant
## [1] 8907
##
## $discordance
## [1] 0.2553121
##
## $num_discordant
## [1] 3064
##
## $tie_rate
## [1] 0.002499792
##
## $num_tied
## [1] 30
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.