Load

ccd <- CreditCardDefault <- read.csv("~/Downloads/CreditCardDefault.csv")
 head(ccd, 3)

Data Wrangling Part 1

ccd <- CreditCardDefault <- read.csv("~/Downloads/CreditCardDefault.csv")

#subset data to only the well defined dataset
#The original dataset contained information outside of the defined descriptions provided by the source. To handle these, we've narrowed those variables to content and observations that we understand.
ccd<- subset(ccd, (ccd$EDUCATION %in% c(1, 2, 3, 4)))
ccd<- subset(ccd, (ccd$MARRIAGE %in% c(1, 2)))

#Setting categorical variables as factors
#The original dataset was nearly entirely integers. Some categorical factors were converted to factor data type to enable use with R data visualizations and built in linear modeling tools. 
ccd$DEFAULT <- as.factor(ccd$default.payment.next.month)
ccd$SEX <- as.factor(ccd$SEX)
ccd$EDUCATION <- as.factor(ccd$EDUCATION)
ccd$MARRIAGE <- as.factor(ccd$MARRIAGE)

#handle level naming conventions
#To reduce confusion in visualization or reporting and interpretation the levels for the converted factors were renamed
levels(ccd$DEFAULT) <- c("No Default Next Month", "Defaults Next Month")
levels(ccd$SEX) <- c("Male", "Female")
levels(ccd$MARRIAGE) <- c('Married','Single')
levels(ccd$EDUCATION) <- c('Graduate School', 'University','High School', 'Other')

#Feature Generation
#These three features were created after some initial modeling with the belief that inclusion of these three variables may improve the model in some capacity. We speculate that including information about credit user average long run behavior over the five months preceding may reveal useful information in understanding they future bill amounts and accordingly their predicted probability of defaulting in the next month. 

#credit utilization, numerical variable with the average bill amount divided by the limit balance per user
ccd$AVG_UTILIZATION_RATE =(ccd$BILL_AMT2+ccd$BILL_AMT3+ccd$BILL_AMT4+ccd$BILL_AMT5+ccd$BILL_AMT6)/5/ccd$LIMIT_BAL

#Create a logical binary variable for if the average monthly excess balance over the past five months was greater than the limit balance. 
ccd$OVERSPENDER<-as.factor((ccd$BILL_AMT2+ccd$BILL_AMT3+ccd$BILL_AMT4+ccd$BILL_AMT5+ccd$BILL_AMT6)/5 > ccd$LIMIT_BAL)

#Create a numerical variable which captures how much excess balance over the past three months is being carried. The idea here is that users who make payments greater than their statement balance will have a negative excess balance average over three months. Those who are having trouble paying or are carrying revolving credit will 
ccd$TOTAL_EXCESS_BALANCE<-(((ccd$BILL_AMT2-ccd$PAY_AMT2+ccd$BILL_AMT3-ccd$PAY_AMT3+ccd$BILL_AMT4-ccd$PAY_AMT4+ccd$BILL_AMT5-ccd$PAY_AMT5+ccd$BILL_AMT6-ccd$PAY_AMT6)/5)-ccd$LIMIT_BAL)

#For whatever the reason the dataset is unevenly split with different shares of males and females. We do not attempt to speculate why, but simply split the population by gender and resample to head off potential unobserved built in bias.
#handle uneven Male and female data set, split sample by Male and Female and sample from each
ccd_male<- subset(ccd, (ccd$SEX == 'Male'))
ccd_female<- subset(ccd, (ccd$SEX == 'Female'))

#this way everyone gets the same output
set.seed((1234))

#sample for even 50/50 split between males and females, size is determined by the smallest available subset, in this case males with 11616 observations
ccd_male_sample <- sample_n(ccd_male, min(nrow(ccd_male), nrow(ccd_female)))
ccd_female_sample <- sample_n(ccd_female, min(nrow(ccd_male), nrow(ccd_female)))
ccd <- data.frame(rbind(ccd_male_sample,ccd_female_sample))

#handle NANs by only selecting complete observations
ccd<-ccd[complete.cases(ccd),]

#Workspace cleanup
rm(CreditCardDefault)
rm(ccd_female)
rm(ccd_male)
rm(ccd_female_sample)
rm(ccd_male_sample)

#narrow our working dataset to features we will specifically use here
#drop PAY_0, we would not have this information yet
#drop BILL_AMT1, we are modeling this,...but retain for model accuracy checks
#drop PAY_AMT1, we would not have this information yet...but retain for inclusion into logistic Regression.
ccd <- ccd %>% dplyr::select(c(-PAY_0))

#Because we are working with a relatively large dataset,  we choose to split the training and test data 50/50.
set.seed((1234))
#Split Data into train and test, ccd will remain train,  ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd)), rep(1, 0.5 * nrow(ccd))))
ccd.train<- ccd[Split_Data==0,]
ccd.test<- ccd[Split_Data==1,]

write.csv(ccd, "ccd.csv", row.names = FALSE)
write.csv(ccd.train,"ccd.train.csv", row.names = FALSE)
write.csv(ccd.test,"ccd.test.csv", row.names = FALSE)

Data Visualization

Plot 1

Data Visualization: Using proportion bar charts, we explore the categorical features within our data with intent to identify impactful predictors. Splitting by individuals that default, our response variable and what we are attempting to model predictions for with the logistic regression, we note possible differences based on Sex, Education, Marital Status, and the Overspender Flag.

The Overspender Flag and Other Education immediately stick out as features with a relative large difference in proportion of defaults in comparison the population average default proportion. Males appear to have a larger proportion of defaults within the data, but the magnitude of the effect is difficult to discern from visualization alone. Marriage also appears to have a slightly larger share of the population than which results in default.

Plot 2

Data Visualization: We explore some of our numeric predictor variables using boxplots with outliers depicted in red. Limit Balance on average appears to be larger for those who do not default versus those that default. As we would expect with Total_Excess_Balance, the greater the balance over the limit balance, the more defaults on average. Average Credit Card Utilization Rates appear to have an a postive impact of the share of the population subset which defaults within the next month.

Plot 3

Based on density plot visualization and comparison, there appears to be a difference in the distributions of Average Utilization Rate between those who default and those who do not default. Increases the average utilization rate appears to be associated with increased defaults.

Plot 4

Plot 5


Linear Regression Model (Place Holder)

linear<-lm(BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+AGE+SEX+EDUCATION+MARRIAGE, data = ccd)
summary(linear)
## 
## Call:
## lm(formula = BILL_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 + 
##     PAY_AMT2 + PAY_AMT3 + AGE + SEX + EDUCATION + MARRIAGE, data = ccd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -489733   -4391   -2143     705  414124 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.499e+02  8.647e+02  -0.405  0.68572    
## LIMIT_BAL             1.442e-02  1.286e-03  11.213  < 2e-16 ***
## BILL_AMT2             8.842e-01  6.878e-03 128.545  < 2e-16 ***
## BILL_AMT3             1.059e-01  7.361e-03  14.386  < 2e-16 ***
## PAY_AMT2             -8.000e-02  8.673e-03  -9.224  < 2e-16 ***
## PAY_AMT3              6.934e-02  9.511e-03   7.290 3.19e-13 ***
## AGE                   5.287e+00  1.907e+01   0.277  0.78162    
## SEXFemale            -6.792e+02  2.997e+02  -2.266  0.02346 *  
## EDUCATIONUniversity   1.673e+03  3.434e+02   4.871 1.12e-06 ***
## EDUCATIONHigh School  1.485e+03  4.715e+02   3.151  0.00163 ** 
## EDUCATIONOther        6.791e+03  2.308e+03   2.942  0.00326 ** 
## MARRIAGESingle       -1.845e+00  3.439e+02  -0.005  0.99572    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22590 on 23220 degrees of freedom
## Multiple R-squared:  0.9081, Adjusted R-squared:  0.9081 
## F-statistic: 2.086e+04 on 11 and 23220 DF,  p-value: < 2.2e-16

T-tests seem to indicate that AGE and MARRIAGESSingle are not significant, given the other variables in the model. Let’s see if we can remove them.

\(H_{0} : B_{1} = B_{2} = B_{3} = B_{4} = ... B_{p} = 0\)

\(H_{a}\) : at least one of the coefficients in \(H_{0}\) is not zero.

reduced<-lm(BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION, data = ccd)
anova(reduced,linear)

Our F statistic is 0.0494 with a p-value of 0.9518. We do not reject the null hypothesis. Our data suggests we can drop the predictors AGE and MARRIAGE and go with the reduced model.

summary(reduced)
## 
## Call:
## lm(formula = BILL_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 + 
##     PAY_AMT2 + PAY_AMT3 + SEX + EDUCATION, data = ccd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -489738   -4386   -2141     710  414107 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.806e+02  3.794e+02  -0.476 0.634085    
## LIMIT_BAL             1.449e-02  1.258e-03  11.526  < 2e-16 ***
## BILL_AMT2             8.842e-01  6.876e-03 128.583  < 2e-16 ***
## BILL_AMT3             1.059e-01  7.360e-03  14.390  < 2e-16 ***
## PAY_AMT2             -8.003e-02  8.672e-03  -9.228  < 2e-16 ***
## PAY_AMT3              6.933e-02  9.509e-03   7.291 3.19e-13 ***
## SEXFemale            -6.890e+02  2.971e+02  -2.319 0.020395 *  
## EDUCATIONUniversity   1.680e+03  3.393e+02   4.952 7.39e-07 ***
## EDUCATIONHigh School  1.523e+03  4.547e+02   3.349 0.000812 ***
## EDUCATIONOther        6.787e+03  2.308e+03   2.941 0.003279 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22590 on 23222 degrees of freedom
## Multiple R-squared:  0.9081, Adjusted R-squared:  0.9081 
## F-statistic: 2.55e+04 on 9 and 23222 DF,  p-value: < 2.2e-16

Let’s double check our reduced model by doing the regsubsets() frunction to run all possible regressions on our original (full) predictor set.

library(leaps)
allreg<-regsubsets(BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+AGE+SEX+EDUCATION+MARRIAGE, data = ccd, nbest = 2)
summary(allreg)
## Subset selection object
## Call: regsubsets.formula(BILL_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 + 
##     PAY_AMT2 + PAY_AMT3 + AGE + SEX + EDUCATION + MARRIAGE, data = ccd, 
##     nbest = 2)
## 11 Variables  (and intercept)
##                      Forced in Forced out
## LIMIT_BAL                FALSE      FALSE
## BILL_AMT2                FALSE      FALSE
## BILL_AMT3                FALSE      FALSE
## PAY_AMT2                 FALSE      FALSE
## PAY_AMT3                 FALSE      FALSE
## AGE                      FALSE      FALSE
## SEXFemale                FALSE      FALSE
## EDUCATIONUniversity      FALSE      FALSE
## EDUCATIONHigh School     FALSE      FALSE
## EDUCATIONOther           FALSE      FALSE
## MARRIAGESingle           FALSE      FALSE
## 2 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          LIMIT_BAL BILL_AMT2 BILL_AMT3 PAY_AMT2 PAY_AMT3 AGE SEXFemale
## 1  ( 1 ) " "       "*"       " "       " "      " "      " " " "      
## 1  ( 2 ) " "       " "       "*"       " "      " "      " " " "      
## 2  ( 1 ) " "       "*"       "*"       " "      " "      " " " "      
## 2  ( 2 ) "*"       "*"       " "       " "      " "      " " " "      
## 3  ( 1 ) "*"       "*"       "*"       " "      " "      " " " "      
## 3  ( 2 ) " "       "*"       "*"       " "      "*"      " " " "      
## 4  ( 1 ) "*"       "*"       "*"       "*"      " "      " " " "      
## 4  ( 2 ) "*"       "*"       "*"       " "      "*"      " " " "      
## 5  ( 1 ) "*"       "*"       "*"       "*"      "*"      " " " "      
## 5  ( 2 ) "*"       "*"       "*"       "*"      " "      " " " "      
## 6  ( 1 ) "*"       "*"       "*"       "*"      "*"      " " " "      
## 6  ( 2 ) "*"       "*"       "*"       "*"      "*"      " " " "      
## 7  ( 1 ) "*"       "*"       "*"       "*"      "*"      " " " "      
## 7  ( 2 ) "*"       "*"       "*"       "*"      "*"      " " " "      
## 8  ( 1 ) "*"       "*"       "*"       "*"      "*"      " " " "      
## 8  ( 2 ) "*"       "*"       "*"       "*"      "*"      " " "*"      
##          EDUCATIONUniversity EDUCATIONHigh School EDUCATIONOther MARRIAGESingle
## 1  ( 1 ) " "                 " "                  " "            " "           
## 1  ( 2 ) " "                 " "                  " "            " "           
## 2  ( 1 ) " "                 " "                  " "            " "           
## 2  ( 2 ) " "                 " "                  " "            " "           
## 3  ( 1 ) " "                 " "                  " "            " "           
## 3  ( 2 ) " "                 " "                  " "            " "           
## 4  ( 1 ) " "                 " "                  " "            " "           
## 4  ( 2 ) " "                 " "                  " "            " "           
## 5  ( 1 ) " "                 " "                  " "            " "           
## 5  ( 2 ) "*"                 " "                  " "            " "           
## 6  ( 1 ) "*"                 " "                  " "            " "           
## 6  ( 2 ) " "                 " "                  "*"            " "           
## 7  ( 1 ) "*"                 "*"                  " "            " "           
## 7  ( 2 ) "*"                 " "                  "*"            " "           
## 8  ( 1 ) "*"                 "*"                  "*"            " "           
## 8  ( 2 ) "*"                 "*"                  " "            " "

Let’s see which model has the best (highest) adjusted \(R^{2}\).

coef(allreg, which.max(summary(allreg)$adjr2))
##          (Intercept)            LIMIT_BAL            BILL_AMT2 
##        -499.38363102           0.01436234           0.88453542 
##            BILL_AMT3             PAY_AMT2             PAY_AMT3 
##           0.10572357          -0.07988742           0.06967691 
##  EDUCATIONUniversity EDUCATIONHigh School       EDUCATIONOther 
##        1651.13782180        1510.39547423        6721.23012387

This model dropped AGE and MARRIAGE like our reduced model did. It also dropped SEX.

Let’s see which model is has the best (lowest) Mallow’s \(C_p\).

coef(allreg, which.min(summary(allreg)$cp))
##          (Intercept)            LIMIT_BAL            BILL_AMT2 
##        -499.38363102           0.01436234           0.88453542 
##            BILL_AMT3             PAY_AMT2             PAY_AMT3 
##           0.10572357          -0.07988742           0.06967691 
##  EDUCATIONUniversity EDUCATIONHigh School       EDUCATIONOther 
##        1651.13782180        1510.39547423        6721.23012387

This has the same predictors as the model which maximizes adjusted \(R^2\). It drops AGE, MARRIAGE, and SEX.

Let’s see which model has the best (lowest) BIC.

coef(allreg, which.min(summary(allreg)$bic))
##         (Intercept)           LIMIT_BAL           BILL_AMT2           BILL_AMT3 
##        171.58094718          0.01344336          0.88485507          0.10581910 
##            PAY_AMT2            PAY_AMT3 EDUCATIONUniversity 
##         -0.07990731          0.06988625       1092.26986352

This model removes some of the education factors. It’s only a predictor for clients who attended university.

Lastly, let’s try a stepwise regression to see which model will be the best. We will start with an intercept-only model.

regnull<- lm(BILL_AMT1 ~ 1, data = ccd)
step(regnull, scope = list(lower=regnull, upper=linear), direction = "both")
## Start:  AIC=521270.2
## BILL_AMT1 ~ 1
## 
##             Df  Sum of Sq        RSS    AIC
## + BILL_AMT2  1 1.1693e+14 1.2063e+13 466222
## + BILL_AMT3  1 1.0194e+14 2.7054e+13 484985
## + LIMIT_BAL  1 1.0629e+13 1.1837e+14 519274
## + PAY_AMT3   1 2.8306e+12 1.2616e+14 520757
## + PAY_AMT2   1 1.3590e+12 1.2764e+14 521026
## + AGE        1 4.5774e+11 1.2854e+14 521190
## + SEX        1 1.4178e+11 1.2885e+14 521247
## + EDUCATION  3 1.2562e+11 1.2887e+14 521254
## + MARRIAGE   1 6.5190e+10 1.2893e+14 521260
## <none>                    1.2899e+14 521270
## 
## Step:  AIC=466221.5
## BILL_AMT1 ~ BILL_AMT2
## 
##             Df  Sum of Sq        RSS    AIC
## + BILL_AMT3  1 7.1095e+10 1.1992e+13 466086
## + LIMIT_BAL  1 6.8674e+10 1.1995e+13 466091
## + PAY_AMT3   1 2.9881e+10 1.2033e+13 466166
## + PAY_AMT2   1 4.4609e+09 1.2059e+13 466215
## + EDUCATION  3 6.0505e+09 1.2057e+13 466216
## + AGE        1 3.8989e+09 1.2059e+13 466216
## + MARRIAGE   1 2.8508e+09 1.2060e+13 466218
## + SEX        1 1.3719e+09 1.2062e+13 466221
## <none>                    1.2063e+13 466222
## - BILL_AMT2  1 1.1693e+14 1.2899e+14 521270
## 
## Step:  AIC=466086.2
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3
## 
##             Df  Sum of Sq        RSS    AIC
## + LIMIT_BAL  1 5.9404e+10 1.1933e+13 465973
## + PAY_AMT3   1 2.9098e+10 1.1963e+13 466032
## + PAY_AMT2   1 2.0111e+10 1.1972e+13 466049
## + EDUCATION  3 6.4398e+09 1.1986e+13 466080
## + AGE        1 3.5813e+09 1.1989e+13 466081
## + MARRIAGE   1 2.3350e+09 1.1990e+13 466084
## + SEX        1 1.5964e+09 1.1991e+13 466085
## <none>                    1.1992e+13 466086
## - BILL_AMT3  1 7.1095e+10 1.2063e+13 466222
## - BILL_AMT2  1 1.5062e+13 2.7054e+13 484985
## 
## Step:  AIC=465972.8
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL
## 
##             Df  Sum of Sq        RSS    AIC
## + PAY_AMT2   1 3.2457e+10 1.1900e+13 465912
## + PAY_AMT3   1 1.6290e+10 1.1917e+13 465943
## + EDUCATION  3 1.6371e+10 1.1916e+13 465947
## + SEX        1 2.3673e+09 1.1930e+13 465970
## <none>                    1.1933e+13 465973
## + AGE        1 6.6674e+08 1.1932e+13 465974
## + MARRIAGE   1 4.8953e+08 1.1932e+13 465974
## - LIMIT_BAL  1 5.9404e+10 1.1992e+13 466086
## - BILL_AMT3  1 6.1825e+10 1.1995e+13 466091
## - BILL_AMT2  1 1.4947e+13 2.6879e+13 484837
## 
## Step:  AIC=465911.5
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL + PAY_AMT2
## 
##             Df  Sum of Sq        RSS    AIC
## + PAY_AMT3   1 2.7423e+10 1.1873e+13 465860
## + EDUCATION  3 1.6135e+10 1.1884e+13 465886
## + SEX        1 2.5566e+09 1.1898e+13 465909
## <none>                    1.1900e+13 465912
## + AGE        1 5.8576e+08 1.1900e+13 465912
## + MARRIAGE   1 3.8415e+08 1.1900e+13 465913
## - PAY_AMT2   1 3.2457e+10 1.1933e+13 465973
## - LIMIT_BAL  1 7.1750e+10 1.1972e+13 466049
## - BILL_AMT3  1 9.3635e+10 1.1994e+13 466092
## - BILL_AMT2  1 8.8123e+12 2.0713e+13 478784
## 
## Step:  AIC=465859.9
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL + PAY_AMT2 + PAY_AMT3
## 
##             Df  Sum of Sq        RSS    AIC
## + EDUCATION  3 1.6126e+10 1.1857e+13 465834
## + SEX        1 2.2972e+09 1.1871e+13 465857
## <none>                    1.1873e+13 465860
## + AGE        1 6.1621e+08 1.1872e+13 465861
## + MARRIAGE   1 5.0231e+08 1.1872e+13 465861
## - PAY_AMT3   1 2.7423e+10 1.1900e+13 465912
## - PAY_AMT2   1 4.3590e+10 1.1917e+13 465943
## - LIMIT_BAL  1 5.6741e+10 1.1930e+13 465969
## - BILL_AMT3  1 1.0566e+11 1.1979e+13 466064
## - BILL_AMT2  1 8.4736e+12 2.0347e+13 478372
## 
## Step:  AIC=465834.4
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL + PAY_AMT2 + PAY_AMT3 + 
##     EDUCATION
## 
##             Df  Sum of Sq        RSS    AIC
## + SEX        1 2.7455e+09 1.1854e+13 465831
## <none>                    1.1857e+13 465834
## + AGE        1 1.5878e+08 1.1857e+13 465836
## + MARRIAGE   1 4.4906e+06 1.1857e+13 465836
## - EDUCATION  3 1.6126e+10 1.1873e+13 465860
## - PAY_AMT3   1 2.7413e+10 1.1884e+13 465886
## - PAY_AMT2   1 4.3318e+10 1.1900e+13 465917
## - LIMIT_BAL  1 6.6721e+10 1.1924e+13 465963
## - BILL_AMT3  1 1.0534e+11 1.1962e+13 466038
## - BILL_AMT2  1 8.4517e+12 2.0309e+13 478334
## 
## Step:  AIC=465831
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL + PAY_AMT2 + PAY_AMT3 + 
##     EDUCATION + SEX
## 
##             Df  Sum of Sq        RSS    AIC
## <none>                    1.1854e+13 465831
## + AGE        1 5.0380e+07 1.1854e+13 465833
## + MARRIAGE   1 1.1165e+07 1.1854e+13 465833
## - SEX        1 2.7455e+09 1.1857e+13 465834
## - EDUCATION  3 1.6574e+10 1.1871e+13 465857
## - PAY_AMT3   1 2.7132e+10 1.1881e+13 465882
## - PAY_AMT2   1 4.3469e+10 1.1898e+13 465914
## - LIMIT_BAL  1 6.7817e+10 1.1922e+13 465962
## - BILL_AMT3  1 1.0570e+11 1.1960e+13 466035
## - BILL_AMT2  1 8.4399e+12 2.0294e+13 478320
## 
## Call:
## lm(formula = BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL + 
##     PAY_AMT2 + PAY_AMT3 + EDUCATION + SEX, data = ccd)
## 
## Coefficients:
##          (Intercept)             BILL_AMT2             BILL_AMT3  
##           -180.60583               0.88416               0.10591  
##            LIMIT_BAL              PAY_AMT2              PAY_AMT3  
##              0.01449              -0.08003               0.06933  
##  EDUCATIONUniversity  EDUCATIONHigh School        EDUCATIONOther  
##           1680.03227            1522.86782            6786.68206  
##            SEXFemale  
##           -688.99111

The stepwise regression ends with the same predictors as our reduced model above. We remove only AGE and MARRIAGE.


Data Wrangling Part 2

#Chose to standardize because of application scenario to Logistic Regression. If there are outliers in the data, it is a reasonable conclusion that they would contain important information. Standardization retains the impact of outliers.
#endstate is numeric features only
noscale <- function(x){x}
standardscale <- function(x){(x-mean(x))/(sd(x))}
minmaxscale <- function(x){(x-min(x))/(max(x)-min(x))}
robustscale <- function(x){
  Q <- quantile(x, probs=c(.25, .75), na.rm = FALSE)
  return (x-median(x))/(Q[2]-Q[1])}
  
standardscale<-noscale

ccd.scaled <- data.frame(ccd$LIMIT_BAL)
ccd.scaled$LIMIT_BAL<- standardscale(ccd$LIMIT_BAL)
ccd.scaled <- ccd.scaled %>% dplyr::select(-ccd.LIMIT_BAL)
ccd.scaled$AGE<- standardscale(ccd$AGE)
ccd.scaled$PAY_2<- standardscale(ccd$PAY_2)
ccd.scaled$PAY_3<- standardscale(ccd$PAY_3)
ccd.scaled$PAY_4<- standardscale(ccd$PAY_4)
ccd.scaled$PAY_5<- standardscale(ccd$PAY_5)
ccd.scaled$PAY_6<- standardscale(ccd$PAY_6)
ccd.scaled$BILL_AMT1<- standardscale(ccd$BILL_AMT1)
ccd.scaled$BILL_AMT2<- standardscale(ccd$BILL_AMT2)
ccd.scaled$BILL_AMT3<- standardscale(ccd$BILL_AMT3)
ccd.scaled$BILL_AMT4<- standardscale(ccd$BILL_AMT4)
ccd.scaled$BILL_AMT5<- standardscale(ccd$BILL_AMT5)
ccd.scaled$BILL_AMT6<- standardscale(ccd$BILL_AMT6)
ccd.scaled$PAY_AMT1<- standardscale(ccd$PAY_AMT1)
ccd.scaled$PAY_AMT2<- standardscale(ccd$PAY_AMT2)
ccd.scaled$PAY_AMT3<- standardscale(ccd$PAY_AMT3)
ccd.scaled$PAY_AMT4<- standardscale(ccd$PAY_AMT4)
ccd.scaled$PAY_AMT5<- standardscale(ccd$PAY_AMT5)
ccd.scaled$PAY_AMT6<- standardscale(ccd$PAY_AMT6)
ccd.scaled$DEFAULT <- as.numeric(ifelse(ccd$default.payment.next.month == 1,1,0))
ccd.scaled$AVG_UTILIZATION_RATE <- standardscale(ccd$AVG_UTILIZATION_RATE)
ccd.scaled$OVERSPENDER <- ifelse(ccd$OVERSPENDER == 1, 1,0)
ccd.scaled$TOTAL_EXCESS_BALANCE <- standardscale(ccd$TOTAL_EXCESS_BALANCE)
ccd.scaled$MARRIAGE <- ccd$MARRIAGE
ccd.scaled$EDUCATION <- ccd$EDUCATION
ccd.scaled$SEX <- ccd$SEX

#May not be necessary...yet
#ccd.scaled$SEX_MALE <- ifelse(ccd$SEX == 'Male', 1, 0)
#ccd.scaled$SEX_FEMALE <- ifelse(ccd$SEX == 'Female', 1, 0)
#ccd.scaled$EDU_GRAD <- ifelse(ccd$EDUCATION == 'Graduate School', 1, 0)
#ccd.scaled$EDU_UNIVERSITY <- ifelse(ccd$EDUCATION == 'University', 1, 0)
#ccd.scaled$EDU_HIGH_SCHOOL <- ifelse(ccd$EDUCATION == 'High School', 1, 0)
#ccd.scaled$EDU_OTHER <- ifelse(ccd$EDUCATION == 'Other', 1, 0)
#ccd.scaled$MARRIAGE_M <- ifelse(ccd$MARRIAGE == 'Married', 1, 0)
#ccd.scaled$MARRIAGE_S <- ifelse(ccd$MARRIAGE == 'Single', 1, 0)

#Feature Scaling References
#REF: https://www.oreilly.com/library/view/hands-on-machine-learning/9781788393485/fd5b8a44-e9d3-4c19-bebb-c2fa5a5ebfee.xhtml
#REF: https://vitalflux.com/minmaxscaler-standardscaler-python-examples/
#REF: https://becominghuman.ai/what-does-feature-scaling-mean-when-to-normalize-data-and-when-to-standardize-data-c3de654405ed
#REF: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
#REF: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

#Handling an imbalanced dataset, may need to resample the unbalanced dataset to account for fewer defaults
#keep all of the defaults, randomly sample an equal number of non defaults
ccd_defaults<- subset(ccd, (ccd$default.payment.next.month == 1))
ccd_no_defaults <- subset(ccd, (ccd$default.payment.next.month == 0))

#this way everyone gets the same output
set.seed((1234))

#unbalanced dataset of defaults and no defaults, resample for maximum number of defaults and equivalent no defaults.  
#down sampling:
ccd_no_defaults_downsample <- sample_n(ccd_no_defaults, nrow(ccd_defaults), replace = F)
ccd_down_balanced <- data.frame(rbind(ccd_defaults,ccd_no_defaults_downsample))

#up sampling:
ccd_defaults_upsample <- sample_n(ccd_defaults, nrow(ccd_no_defaults), replace = T)
ccd_up_balanced <- data.frame(rbind(ccd_no_defaults,ccd_defaults_upsample))
set.seed((1234))
#Split Data into train and test, ccd will remain train,  ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_down_balanced)), rep(1, 0.5 *nrow(ccd_down_balanced))))
ccd.down.train<- ccd_down_balanced[Split_Data==0,]
ccd.down.test<- ccd_down_balanced[Split_Data==1,]
set.seed((1234))
#Split Data into train and test, ccd will remain train,  ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_up_balanced)), rep(1, 0.5 *nrow(ccd_up_balanced))))
ccd.up.train<- ccd_up_balanced[Split_Data==0,]
ccd.up.test<- ccd_up_balanced[Split_Data==1,]
#Handling an imbalanced dataset, may need to resample the unbalanced dataset to account for fewer defaults
#keep all of the defaults, randomly sample an equal number of non defaults
ccd_defaults<- subset(ccd.scaled, (ccd.scaled$DEFAULT == 1))
ccd_no_defaults <- subset(ccd.scaled, (ccd.scaled$DEFAULT == 0))

#this way everyone gets the same output
set.seed((1234))

#unbalanced dataset of defaults and no defaults, resample for maximum number of defaults and equivalent no defaults.  
#down sampling:
ccd_no_defaults_downsample <- sample_n(ccd_no_defaults, nrow(ccd_defaults), replace = F)
ccd_down_balanced <- data.frame(rbind(ccd_defaults,ccd_no_defaults_downsample))

#up sampling:
ccd_defaults_upsample <- sample_n(ccd_defaults, nrow(ccd_no_defaults), replace = T)
ccd_up_balanced <- data.frame(rbind(ccd_no_defaults,ccd_defaults_upsample))

set.seed((1234))
#Split Data into train and test, ccd will remain train,  ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_up_balanced)), rep(1, 0.5 *nrow(ccd_up_balanced))))
ccd.scaled.up.train<- ccd_up_balanced[Split_Data==0,]
ccd.scaled.up.test<- ccd_up_balanced[Split_Data==1,]

#clean workspace
rm(ccd_defaults)
rm(ccd_no_defaults)
rm(ccd_no_defaults_downsample)
rm(ccd_down_balanced)
rm(ccd_defaults_upsample)
rm(ccd_up_balanced)
#scaled up
ccd.test = ccd.scaled.up.test
ccd.train = ccd.scaled.up.train

Logistic Regression Model Building

Model 1: All Variables

As a starting point, our first logistic regression model simply includes every feature available in the dataset.

mod1 = glm(formula = DEFAULT~LIMIT_BAL+BILL_AMT1+BILL_AMT2+BILL_AMT3+BILL_AMT4+BILL_AMT5+BILL_AMT6+PAY_AMT1+PAY_AMT2+PAY_AMT3+PAY_AMT4+PAY_AMT5+PAY_AMT6+AVG_UTILIZATION_RATE+PAY_2+PAY_3+PAY_4+PAY_5+PAY_6+SEX+MARRIAGE+EDUCATION+AGE, family = binomial(link = "logit"), data=ccd.train)                                                     
summary(mod1)
## 
## Call:
## glm(formula = DEFAULT ~ LIMIT_BAL + BILL_AMT1 + BILL_AMT2 + BILL_AMT3 + 
##     BILL_AMT4 + BILL_AMT5 + BILL_AMT6 + PAY_AMT1 + PAY_AMT2 + 
##     PAY_AMT3 + PAY_AMT4 + PAY_AMT5 + PAY_AMT6 + AVG_UTILIZATION_RATE + 
##     PAY_2 + PAY_3 + PAY_4 + PAY_5 + PAY_6 + SEX + MARRIAGE + 
##     EDUCATION + AGE, family = binomial(link = "logit"), data = ccd.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5278  -1.1127   0.2719   1.0918   3.7838  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           4.740e-01  9.859e-02   4.807 1.53e-06 ***
## LIMIT_BAL            -1.411e-06  1.949e-07  -7.238 4.55e-13 ***
## BILL_AMT1            -6.595e-06  1.142e-06  -5.777 7.62e-09 ***
## BILL_AMT2             6.355e-06  1.481e-06   4.291 1.78e-05 ***
## BILL_AMT3             1.135e-06  1.274e-06   0.891 0.372863    
## BILL_AMT4            -2.011e-06  1.392e-06  -1.445 0.148524    
## BILL_AMT5            -1.677e-06  1.596e-06  -1.051 0.293159    
## BILL_AMT6             4.482e-06  1.256e-06   3.568 0.000360 ***
## PAY_AMT1             -1.536e-05  2.208e-06  -6.957 3.48e-12 ***
## PAY_AMT2             -9.856e-06  2.003e-06  -4.919 8.69e-07 ***
## PAY_AMT3             -2.565e-06  1.548e-06  -1.657 0.097424 .  
## PAY_AMT4             -5.754e-06  1.736e-06  -3.314 0.000919 ***
## PAY_AMT5             -8.599e-06  1.868e-06  -4.603 4.17e-06 ***
## PAY_AMT6             -3.001e-06  1.381e-06  -2.173 0.029788 *  
## AVG_UTILIZATION_RATE -2.490e-01  7.590e-02  -3.280 0.001036 ** 
## PAY_2                 3.099e-01  2.000e-02  15.495  < 2e-16 ***
## PAY_3                 6.277e-02  2.379e-02   2.639 0.008317 ** 
## PAY_4                 3.118e-02  2.608e-02   1.195 0.231946    
## PAY_5                 5.516e-02  2.806e-02   1.966 0.049348 *  
## PAY_6                 2.102e-02  2.308e-02   0.911 0.362388    
## SEXFemale            -9.410e-02  3.217e-02  -2.925 0.003441 ** 
## MARRIAGESingle       -1.634e-01  3.687e-02  -4.430 9.43e-06 ***
## EDUCATIONUniversity  -1.945e-02  3.721e-02  -0.523 0.601195    
## EDUCATIONHigh School -3.927e-02  5.071e-02  -0.774 0.438728    
## EDUCATIONOther       -9.365e-01  3.457e-01  -2.709 0.006752 ** 
## AGE                   3.566e-03  2.029e-03   1.757 0.078877 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24878  on 17945  degrees of freedom
## Residual deviance: 22749  on 17920  degrees of freedom
## AIC: 22801
## 
## Number of Fisher Scoring iterations: 5
anova(mod1, test="Chisq")
equatiomatic::extract_eq(mod1)

\[ \log\left[ \frac { P( \operatorname{DEFAULT} = \operatorname{1} ) }{ 1 - P( \operatorname{DEFAULT} = \operatorname{1} ) } \right] = \alpha + \beta_{1}(\operatorname{LIMIT\_BAL}) + \beta_{2}(\operatorname{BILL\_AMT1}) + \beta_{3}(\operatorname{BILL\_AMT2}) + \beta_{4}(\operatorname{BILL\_AMT3}) + \beta_{5}(\operatorname{BILL\_AMT4}) + \beta_{6}(\operatorname{BILL\_AMT5}) + \beta_{7}(\operatorname{BILL\_AMT6}) + \beta_{8}(\operatorname{PAY\_AMT1}) + \beta_{9}(\operatorname{PAY\_AMT2}) + \beta_{10}(\operatorname{PAY\_AMT3}) + \beta_{11}(\operatorname{PAY\_AMT4}) + \beta_{12}(\operatorname{PAY\_AMT5}) + \beta_{13}(\operatorname{PAY\_AMT6}) + \beta_{14}(\operatorname{AVG\_UTILIZATION\_RATE}) + \beta_{15}(\operatorname{PAY\_2}) + \beta_{16}(\operatorname{PAY\_3}) + \beta_{17}(\operatorname{PAY\_4}) + \beta_{18}(\operatorname{PAY\_5}) + \beta_{19}(\operatorname{PAY\_6}) + \beta_{20}(\operatorname{SEX}_{\operatorname{Female}}) + \beta_{21}(\operatorname{MARRIAGE}_{\operatorname{Single}}) + \beta_{22}(\operatorname{EDUCATION}_{\operatorname{University}}) + \beta_{23}(\operatorname{EDUCATION}_{\operatorname{High\ School}}) + \beta_{24}(\operatorname{EDUCATION}_{\operatorname{Other}}) + \beta_{25}(\operatorname{AGE}) \]

Model 2: With Interactions

mod2 = glm(formula = DEFAULT~
             BILL_AMT1+BILL_AMT2+BILL_AMT3+BILL_AMT4+BILL_AMT5+BILL_AMT6+
             PAY_AMT1+PAY_AMT2+PAY_AMT3+PAY_AMT4+PAY_AMT5+PAY_AMT6+
             PAY_2+PAY_3+PAY_4+PAY_5+PAY_6+
             SEX*MARRIAGE+EDUCATION*AGE+LIMIT_BAL*AGE+AVG_UTILIZATION_RATE*AGE, family = binomial(link = "logit"), data=ccd.train)                                       
summary(mod2)
## 
## Call:
## glm(formula = DEFAULT ~ BILL_AMT1 + BILL_AMT2 + BILL_AMT3 + BILL_AMT4 + 
##     BILL_AMT5 + BILL_AMT6 + PAY_AMT1 + PAY_AMT2 + PAY_AMT3 + 
##     PAY_AMT4 + PAY_AMT5 + PAY_AMT6 + PAY_2 + PAY_3 + PAY_4 + 
##     PAY_5 + PAY_6 + SEX * MARRIAGE + EDUCATION * AGE + LIMIT_BAL * 
##     AGE + AVG_UTILIZATION_RATE * AGE, family = binomial(link = "logit"), 
##     data = ccd.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5517  -1.1129   0.2706   1.0918   3.8062  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               6.253e-01  1.933e-01   3.235 0.001218 ** 
## BILL_AMT1                -6.484e-06  1.139e-06  -5.690 1.27e-08 ***
## BILL_AMT2                 6.479e-06  1.480e-06   4.379 1.19e-05 ***
## BILL_AMT3                 1.013e-06  1.267e-06   0.799 0.424122    
## BILL_AMT4                -1.843e-06  1.383e-06  -1.333 0.182591    
## BILL_AMT5                -1.682e-06  1.596e-06  -1.054 0.292027    
## BILL_AMT6                 4.422e-06  1.258e-06   3.517 0.000437 ***
## PAY_AMT1                 -1.547e-05  2.207e-06  -7.010 2.38e-12 ***
## PAY_AMT2                 -9.607e-06  2.001e-06  -4.801 1.58e-06 ***
## PAY_AMT3                 -2.688e-06  1.526e-06  -1.762 0.078057 .  
## PAY_AMT4                 -5.579e-06  1.731e-06  -3.223 0.001267 ** 
## PAY_AMT5                 -8.512e-06  1.865e-06  -4.563 5.04e-06 ***
## PAY_AMT6                 -3.033e-06  1.380e-06  -2.197 0.027990 *  
## PAY_2                     3.125e-01  2.005e-02  15.591  < 2e-16 ***
## PAY_3                     6.009e-02  2.383e-02   2.521 0.011690 *  
## PAY_4                     3.501e-02  2.612e-02   1.341 0.180012    
## PAY_5                     5.464e-02  2.808e-02   1.946 0.051687 .  
## PAY_6                     2.139e-02  2.310e-02   0.926 0.354548    
## SEXFemale                -1.255e-01  4.739e-02  -2.648 0.008085 ** 
## MARRIAGESingle           -1.877e-01  4.945e-02  -3.796 0.000147 ***
## EDUCATIONUniversity       1.521e-01  1.512e-01   1.006 0.314388    
## EDUCATIONHigh School      3.042e-01  1.983e-01   1.534 0.125041    
## EDUCATIONOther           -2.008e+00  1.477e+00  -1.359 0.173998    
## AGE                       3.373e-04  5.182e-03   0.065 0.948105    
## LIMIT_BAL                -4.474e-06  6.130e-07  -7.298 2.92e-13 ***
## AVG_UTILIZATION_RATE     -8.223e-02  1.924e-01  -0.427 0.669058    
## SEXFemale:MARRIAGESingle  8.801e-02  6.412e-02   1.373 0.169875    
## EDUCATIONUniversity:AGE  -4.897e-03  4.269e-03  -1.147 0.251383    
## EDUCATIONHigh School:AGE -8.808e-03  5.137e-03  -1.715 0.086435 .  
## EDUCATIONOther:AGE        2.990e-02  3.960e-02   0.755 0.450344    
## AGE:LIMIT_BAL             7.888e-08  1.569e-08   5.027 4.98e-07 ***
## AGE:AVG_UTILIZATION_RATE -5.961e-03  5.230e-03  -1.140 0.254323    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24878  on 17945  degrees of freedom
## Residual deviance: 22700  on 17914  degrees of freedom
## AIC: 22764
## 
## Number of Fisher Scoring iterations: 5
anova(mod2, test="Chisq")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
equatiomatic::extract_eq(mod2)

\[ \log\left[ \frac { P( \operatorname{DEFAULT} = \operatorname{1} ) }{ 1 - P( \operatorname{DEFAULT} = \operatorname{1} ) } \right] = \alpha + \beta_{1}(\operatorname{BILL\_AMT1}) + \beta_{2}(\operatorname{BILL\_AMT2}) + \beta_{3}(\operatorname{BILL\_AMT3}) + \beta_{4}(\operatorname{BILL\_AMT4}) + \beta_{5}(\operatorname{BILL\_AMT5}) + \beta_{6}(\operatorname{BILL\_AMT6}) + \beta_{7}(\operatorname{PAY\_AMT1}) + \beta_{8}(\operatorname{PAY\_AMT2}) + \beta_{9}(\operatorname{PAY\_AMT3}) + \beta_{10}(\operatorname{PAY\_AMT4}) + \beta_{11}(\operatorname{PAY\_AMT5}) + \beta_{12}(\operatorname{PAY\_AMT6}) + \beta_{13}(\operatorname{PAY\_2}) + \beta_{14}(\operatorname{PAY\_3}) + \beta_{15}(\operatorname{PAY\_4}) + \beta_{16}(\operatorname{PAY\_5}) + \beta_{17}(\operatorname{PAY\_6}) + \beta_{18}(\operatorname{SEX}_{\operatorname{Female}}) + \beta_{19}(\operatorname{MARRIAGE}_{\operatorname{Single}}) + \beta_{20}(\operatorname{EDUCATION}_{\operatorname{University}}) + \beta_{21}(\operatorname{EDUCATION}_{\operatorname{High\ School}}) + \beta_{22}(\operatorname{EDUCATION}_{\operatorname{Other}}) + \beta_{23}(\operatorname{AGE}) + \beta_{24}(\operatorname{LIMIT\_BAL}) + \beta_{25}(\operatorname{AVG\_UTILIZATION\_RATE}) + \beta_{26}(\operatorname{SEX}_{\operatorname{Female}} \times \operatorname{MARRIAGE}_{\operatorname{Single}}) + \beta_{27}(\operatorname{EDUCATION}_{\operatorname{University}} \times \operatorname{AGE}) + \beta_{28}(\operatorname{EDUCATION}_{\operatorname{High\ School}} \times \operatorname{AGE}) + \beta_{29}(\operatorname{EDUCATION}_{\operatorname{Other}} \times \operatorname{AGE}) + \beta_{30}(\operatorname{AGE} \times \operatorname{LIMIT\_BAL}) + \beta_{31}(\operatorname{AGE} \times \operatorname{AVG\_UTILIZATION\_RATE}) \]

Model 3: Two Stage Model

Predict Limit Balance, then incorporate those predictions into the logistic regression. Possible opportunity to demonstrate causality via two stage regression techniques that BILL_AMT1 and PAY_AMT1 directly impact the next month’s default.

Bill_Amt_Lin_Reg = lm(formula = BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE, data=ccd.train)

summary(Bill_Amt_Lin_Reg) 
## 
## Call:
## lm(formula = BILL_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 + 
##     PAY_AMT2 + PAY_AMT3 + SEX + EDUCATION + AVG_UTILIZATION_RATE, 
##     data = ccd.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -303245   -4161   -2004    1140  416262 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.007e+03  4.734e+02   4.240 2.24e-05 ***
## LIMIT_BAL             4.172e-03  1.632e-03   2.557   0.0106 *  
## BILL_AMT2             9.035e-01  6.971e-03 129.596  < 2e-16 ***
## BILL_AMT3             1.142e-01  7.737e-03  14.754  < 2e-16 ***
## PAY_AMT2             -1.095e-01  8.755e-03 -12.503  < 2e-16 ***
## PAY_AMT3              5.679e-02  9.847e-03   5.767 8.18e-09 ***
## SEXFemale            -3.534e+02  2.933e+02  -1.205   0.2283    
## EDUCATIONUniversity   1.732e+03  3.388e+02   5.112 3.22e-07 ***
## EDUCATIONHigh School  1.991e+03  4.515e+02   4.411 1.04e-05 ***
## EDUCATIONOther        2.070e+03  2.677e+03   0.773   0.4394    
## AVG_UTILIZATION_RATE -5.753e+03  6.370e+02  -9.031  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19580 on 17935 degrees of freedom
## Multiple R-squared:  0.9292, Adjusted R-squared:  0.9292 
## F-statistic: 2.356e+04 on 10 and 17935 DF,  p-value: < 2.2e-16
anova(Bill_Amt_Lin_Reg)
Pay_Amt_Lin_Reg = lm( formula = PAY_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE, data=ccd.train)

summary(Pay_Amt_Lin_Reg)
## 
## Call:
## lm(formula = PAY_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 + PAY_AMT2 + 
##     PAY_AMT3 + SEX + EDUCATION + AVG_UTILIZATION_RATE, data = ccd.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -138373   -2543    -967     276  296088 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.787e+01  3.049e+02   0.255    0.798    
## LIMIT_BAL             1.046e-02  1.051e-03   9.951   <2e-16 ***
## BILL_AMT2             1.572e-01  4.490e-03  35.020   <2e-16 ***
## BILL_AMT3            -1.281e-01  4.983e-03 -25.707   <2e-16 ***
## PAY_AMT2              2.049e-01  5.639e-03  36.344   <2e-16 ***
## PAY_AMT3              6.336e-02  6.342e-03   9.990   <2e-16 ***
## SEXFemale             2.211e+02  1.889e+02   1.171    0.242    
## EDUCATIONUniversity  -3.577e+02  2.182e+02  -1.639    0.101    
## EDUCATIONHigh School -3.237e+02  2.908e+02  -1.113    0.266    
## EDUCATIONOther        1.042e+03  1.724e+03   0.605    0.545    
## AVG_UTILIZATION_RATE  3.939e+02  4.103e+02   0.960    0.337    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12610 on 17935 degrees of freedom
## Multiple R-squared:  0.1829, Adjusted R-squared:  0.1824 
## F-statistic: 401.4 on 10 and 17935 DF,  p-value: < 2.2e-16
anova(Pay_Amt_Lin_Reg)
ccd.train$BILL_AMT1_PRED<- predict(Bill_Amt_Lin_Reg, ccd.test)
ccd.test$BILL_AMT1_PRED<- predict(Bill_Amt_Lin_Reg, ccd.test)

ccd.train$PAY_AMT1_PRED<- predict(Pay_Amt_Lin_Reg, ccd.test)
ccd.test$PAY_AMT1_PRED<- predict(Pay_Amt_Lin_Reg, ccd.test)

mod3 = glm(formula = DEFAULT~LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE+BILL_AMT1_PRED+PAY_AMT1_PRED, family = binomial(link = "logit"), data=ccd.train)    

summary(mod3)
## 
## Call:
## glm(formula = DEFAULT ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 + PAY_AMT2 + 
##     PAY_AMT3 + SEX + EDUCATION + AVG_UTILIZATION_RATE + BILL_AMT1_PRED + 
##     PAY_AMT1_PRED, family = binomial(link = "logit"), data = ccd.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4479  -1.1364   0.6582   1.0616   4.0625  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           5.516e-01  5.307e-02  10.394  < 2e-16 ***
## LIMIT_BAL            -2.220e-06  1.814e-07 -12.241  < 2e-16 ***
## BILL_AMT2            -2.292e-06  8.817e-07  -2.599 0.009336 ** 
## BILL_AMT3             3.403e-06  9.739e-07   3.494 0.000476 ***
## PAY_AMT2             -1.883e-05  2.165e-06  -8.701  < 2e-16 ***
## PAY_AMT3             -6.547e-06  1.451e-06  -4.512 6.43e-06 ***
## SEXFemale            -1.300e-01  3.116e-02  -4.173 3.01e-05 ***
## EDUCATIONUniversity   4.005e-02  3.594e-02   1.114 0.265098    
## EDUCATIONHigh School  4.175e-02  4.790e-02   0.872 0.383389    
## EDUCATIONOther       -1.173e+00  3.481e-01  -3.369 0.000756 ***
## AVG_UTILIZATION_RATE  4.255e-01  6.886e-02   6.180 6.42e-10 ***
## BILL_AMT1_PRED        5.046e-06  3.706e-07  13.614  < 2e-16 ***
## PAY_AMT1_PRED        -1.134e-04  5.981e-06 -18.962  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24878  on 17945  degrees of freedom
## Residual deviance: 23408  on 17933  degrees of freedom
## AIC: 23434
## 
## Number of Fisher Scoring iterations: 5
anova(mod3, test="Chisq")
equatiomatic::extract_eq(mod3)

\[ \log\left[ \frac { P( \operatorname{DEFAULT} = \operatorname{1} ) }{ 1 - P( \operatorname{DEFAULT} = \operatorname{1} ) } \right] = \alpha + \beta_{1}(\operatorname{LIMIT\_BAL}) + \beta_{2}(\operatorname{BILL\_AMT2}) + \beta_{3}(\operatorname{BILL\_AMT3}) + \beta_{4}(\operatorname{PAY\_AMT2}) + \beta_{5}(\operatorname{PAY\_AMT3}) + \beta_{6}(\operatorname{SEX}_{\operatorname{Female}}) + \beta_{7}(\operatorname{EDUCATION}_{\operatorname{University}}) + \beta_{8}(\operatorname{EDUCATION}_{\operatorname{High\ School}}) + \beta_{9}(\operatorname{EDUCATION}_{\operatorname{Other}}) + \beta_{10}(\operatorname{AVG\_UTILIZATION\_RATE}) + \beta_{11}(\operatorname{BILL\_AMT1\_PRED}) + \beta_{12}(\operatorname{PAY\_AMT1\_PRED}) \]

Model 4: Logistic Transformation

mod4 = glm(formula = DEFAULT~log(LIMIT_BAL+.01)+SEX+AGE+log(BILL_AMT1_PRED+.01)+log(BILL_AMT2+.01)+log(BILL_AMT3+.01)+log(PAY_AMT1_PRED+.01)+log(PAY_AMT2+.01)+log(PAY_AMT3+.01)+log(AVG_UTILIZATION_RATE+10)+PAY_2+PAY_3, family = binomial(link = "logit"), data=ccd.train)
summary(mod4)
## 
## Call:
## glm(formula = DEFAULT ~ log(LIMIT_BAL + 0.01) + SEX + AGE + log(BILL_AMT1_PRED + 
##     0.01) + log(BILL_AMT2 + 0.01) + log(BILL_AMT3 + 0.01) + log(PAY_AMT1_PRED + 
##     0.01) + log(PAY_AMT2 + 0.01) + log(PAY_AMT3 + 0.01) + log(AVG_UTILIZATION_RATE + 
##     10) + PAY_2 + PAY_3, family = binomial(link = "logit"), data = ccd.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0864  -0.9870   0.1906   1.0204   2.4550  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -8.784575   1.711273  -5.133 2.85e-07 ***
## log(LIMIT_BAL + 0.01)          -0.149109   0.020383  -7.315 2.57e-13 ***
## SEXFemale                      -0.039514   0.034037  -1.161 0.245678    
## AGE                             0.005638   0.001856   3.038 0.002381 ** 
## log(BILL_AMT1_PRED + 0.01)      0.222171   0.016368  13.573  < 2e-16 ***
## log(BILL_AMT2 + 0.01)          -0.074134   0.006665 -11.122  < 2e-16 ***
## log(BILL_AMT3 + 0.01)          -0.002808   0.009235  -0.304 0.761105    
## log(PAY_AMT1_PRED + 0.01)      -0.610441   0.024556 -24.859  < 2e-16 ***
## log(PAY_AMT2 + 0.01)           -0.025402   0.006731  -3.774 0.000161 ***
## log(PAY_AMT3 + 0.01)           -0.045630   0.004185 -10.904  < 2e-16 ***
## log(AVG_UTILIZATION_RATE + 10)  5.990430   0.694560   8.625  < 2e-16 ***
## PAY_2                           0.395521   0.021204  18.654  < 2e-16 ***
## PAY_3                           0.185561   0.028576   6.494 8.38e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23904  on 17242  degrees of freedom
## Residual deviance: 20519  on 17230  degrees of freedom
##   (703 observations deleted due to missingness)
## AIC: 20545
## 
## Number of Fisher Scoring iterations: 4
anova(mod4, test="Chisq")

Generate Model Predictions

#Make the Predictions
ccd.test$DEFAULT_PROB_PREDICT = predict(mod4, ccd.test, type="response")
## Warning in log(BILL_AMT1_PRED + 0.01): NaNs produced
## Warning in log(BILL_AMT2 + 0.01): NaNs produced
## Warning in log(BILL_AMT3 + 0.01): NaNs produced
## Warning in log(PAY_AMT1_PRED + 0.01): NaNs produced
#Define the prediction threshold
threshold = .275

#Generate Prediction 
ccd.test$DEFAULT_PREDICTION <- ifelse(ccd.test$DEFAULT_PROB_PREDICT >= threshold, 1, 0 )

table(factor(ccd.test$DEFAULT_PREDICTION),factor(ccd.test$DEFAULT))
##    
##        0    1
##   0 2437  919
##   1 6198 7727

Iterative Evaluation of Model Performance

#This was for the single use case.
model_names = c("mod1", "mod2", "mod3", "mod4")
description = c("Everything", "Interactions", "Two-Stage", "Log Transformation")

Model_Performance<-as.data.frame(matrix(nrow=length(model_names),ncol=12))


for(i in 1:length(model_names)){

#Make the Predictions
ccd.test$DEFAULT_PROB_PREDICT = predict(eval(parse(text = paste('mod',i,sep=""))), ccd.test, type="response")

#Define the prediction threshold
threshold = .25

#Generate Prediction 
ccd.test$DEFAULT_PREDICTION <- ifelse(ccd.test$DEFAULT_PROB_PREDICT >= threshold, 1, 0 )

#Confusion Matrix Build
table(factor(ccd.test$DEFAULT_PREDICTION),factor(ccd.test$DEFAULT))

confusion_table = table(factor(ccd.test$DEFAULT_PREDICTION),factor(ccd.test$DEFAULT))
confusion_values = c(confusion_table[1,1], confusion_table[2,1], confusion_table[1,2], confusion_table[2,2])
Label = c('TN','FP','FN','TP')
Prediction = c(0,1,0,1)
Actual = c(0,0,1,1)
confusion_frame = data.frame(Prediction, Actual, confusion_values, Label)
colnames(confusion_frame)<- c("Prediction", "Actual","Frequency", "Label")
confusion_frame

colnames(Model_Performance)<-c("Name","Accuracy","Precision","Recall","Specificity","F1_Score","FPR", "TP", "TN", "FP", "FN", "Prediction_Threshold")

#True Positives
TP = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="TP"])
Model_Performance$TP[i] = TP

#True Negatives
TN = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="TN"])
Model_Performance$TN[i] = TN 

#False Positive
FP = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="FP"])
Model_Performance$FP[i] = FP

#False Negatives
FN = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="FN"])
Model_Performance$FN[i] = FN

#Model Name
Model_Performance$Name[i] = paste("Model ",i ,": ",description[i], sep = "")

#Accuracy
Model_Performance$Accuracy[i] = round((TP+TN)/(TP+TN+FP+FN),3)

#Precision
Model_Performance$Precision[i] = round(TP / (TP + FP),3)

#Recall, Sensitivity, True Positive Rate, We likely want to optimize for recall 
Model_Performance$Recall[i] = round(TP / (TP + FN),3)

#Specificity 
Model_Performance$Specificity[i] = round(TN / (TN + FP),3)

# F1 Score
Model_Performance$F1_Score[i] = round(2*TP / (2*TP + FP + FN),3)

#False Positive Rate
Model_Performance$FPR[i] = round(FP / (FP+TN),3)

#Model Prediction Threshold
Model_Performance$Prediction_Threshold[i] = threshold

}


Model_Performance

ROC Curve Build Model Comparison

ccd <- read.csv("~/Documents/Academic/UVA_MSDS/STAT6021/Project_2/ccd.csv")

model_names = c("mod1", "mod2", "mod3", "mod4")
description = c("Everything", "Interactions", "Two-Stage", "Log Transformation")
scaling_type = c('No Scaling', 'MinMaxScaler', 'StandardScaler', 'RobustScaler')
balancing_type = c('No Adjustment', 'Over-Sample', 'Under-Sample')

Model_Performance<-as.data.frame(matrix(nrow=length(model_names)*length(seq(0,1,.025))*length(scaling_type)*length(balancing_type),ncol=15))

#i represent a row entry
i=1
s=1
b=1
#loop through scaling, 'No Scaling', 'MinMaxScaler', 'StandardScaler', 'RobustScaler'

for(s in scaling_type){ 
  if(s == 'No Scaling'){
    noscale <- function(x){x}
    scale <- noscale
    
  }else if(s == 'MinMaxScaler'){

    minmaxscale<-function(x){(x-min(x))/(max(x)-min(x))}
    scale <- minmaxscale
  }else if(s == 'StandardScaler'){
    
    standardscale<-function(x){(x-mean(x))/(sd(x))}
    scale <- standardscale
  }else if(s == 'RobustScaler'){
  
    robustscale<-function(x){
          Q <- quantile(x, probs=c(.25, .75), na.rm = FALSE)
          return (x-median(x))/(Q[2]-Q[1])}
    scale <- robustscale
  }

ccd$DEFAULT <- as.numeric(ccd$default.payment.next.month)
ccd.scaled <- data.frame(ccd$LIMIT_BAL)
ccd.scaled$LIMIT_BAL<- scale(ccd$LIMIT_BAL)
ccd.scaled <- ccd.scaled %>% dplyr::select(-ccd.LIMIT_BAL)
ccd.scaled$AGE<- scale(ccd$AGE)
ccd.scaled$PAY_2<- scale(ccd$PAY_2)
ccd.scaled$PAY_3<- scale(ccd$PAY_3)
ccd.scaled$PAY_4<- scale(ccd$PAY_4)
ccd.scaled$PAY_5<- scale(ccd$PAY_5)
ccd.scaled$PAY_6<- scale(ccd$PAY_6)
ccd.scaled$BILL_AMT1<- scale(ccd$BILL_AMT1)
ccd.scaled$BILL_AMT2<- scale(ccd$BILL_AMT2)
ccd.scaled$BILL_AMT3<- scale(ccd$BILL_AMT3)
ccd.scaled$BILL_AMT4<- scale(ccd$BILL_AMT4)
ccd.scaled$BILL_AMT5<- scale(ccd$BILL_AMT5)
ccd.scaled$BILL_AMT6<- scale(ccd$BILL_AMT6)
ccd.scaled$PAY_AMT1<- scale(ccd$PAY_AMT1)
ccd.scaled$PAY_AMT2<- scale(ccd$PAY_AMT2)
ccd.scaled$PAY_AMT3<- scale(ccd$PAY_AMT3)
ccd.scaled$PAY_AMT4<- scale(ccd$PAY_AMT4)
ccd.scaled$PAY_AMT5<- scale(ccd$PAY_AMT5)
ccd.scaled$PAY_AMT6<- scale(ccd$PAY_AMT6)
ccd.scaled$DEFAULT <- as.numeric(ifelse(ccd$default.payment.next.month == 1,1,0))
ccd.scaled$AVG_UTILIZATION_RATE <- scale(ccd$AVG_UTILIZATION_RATE)
ccd.scaled$OVERSPENDER <- ifelse(ccd$OVERSPENDER == 1, 1,0)
ccd.scaled$TOTAL_EXCESS_BALANCE <- scale(ccd$TOTAL_EXCESS_BALANCE)
ccd.scaled$MARRIAGE <- ccd$MARRIAGE
ccd.scaled$EDUCATION <- ccd$EDUCATION
ccd.scaled$SEX <- ccd$SEX

#Handling an imbalanced dataset, may need to resample the unbalanced dataset to account for fewer defaults
#keep all of the defaults, randomly sample an equal number of non defaults
ccd_defaults<- subset(ccd.scaled, (ccd.scaled$DEFAULT == 1))
ccd_no_defaults <- subset(ccd.scaled, (ccd.scaled$DEFAULT == 0))

#loop through sampling, 'No Adjustment', 'Upsample', 'Downsample'
for(c in balancing_type){ 

  if(c == 'No Adjustment'){
    #Split Data into train and test, ccd will remain train,  ccd.test will be test data
    Split_Data = sample(c(rep(0, 0.5 * nrow(ccd.scaled)), rep(1, 0.5 *nrow(ccd.scaled))))
    ccd.train<- ccd.scaled[Split_Data==0,]
    ccd.test<- ccd.scaled[Split_Data==1,]
    
  }else if(c == 'Over-Sample'){
    
    #up sampling:
    ccd_defaults_upsample <- sample_n(ccd_defaults, nrow(ccd_no_defaults), replace = T)
    ccd_up_balanced <- data.frame(rbind(ccd_no_defaults,ccd_defaults_upsample))
    
    #Split Data into train and test, ccd will remain train,  ccd.test will be test data
    Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_up_balanced)), rep(1, 0.5 *nrow(ccd_up_balanced))))
    ccd.up.train<- ccd_up_balanced[Split_Data==0,]
    ccd.up.test<- ccd_up_balanced[Split_Data==1,]

    #synchronize
    ccd.train <- ccd.up.train
    ccd.test <- ccd.up.test
    
  }else if(c == 'Under-Sample'){

    #unbalanced dataset of defaults and no defaults, resample for maximum number of defaults and equivalent no defaults.  
    #down sampling:
    ccd_no_defaults_downsample <- sample_n(ccd_no_defaults, nrow(ccd_defaults), replace = F)
    ccd_down_balanced <- data.frame(rbind(ccd_defaults,ccd_no_defaults_downsample))
    
    #Split Data into train and test, ccd will remain train,  ccd.test will be test data
    Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_down_balanced)), rep(1, 0.5 *nrow(ccd_down_balanced))))
    ccd.down.train<- ccd_down_balanced[Split_Data==0,]
    ccd.down.test<- ccd_down_balanced[Split_Data==1,]
    
    #synchronize
    ccd.train <- ccd.down.train
    ccd.test <- ccd.down.test

  }

  #Mod 1
  mod1 = glm(formula = DEFAULT~LIMIT_BAL+BILL_AMT1+BILL_AMT2+BILL_AMT3+BILL_AMT4+BILL_AMT5+BILL_AMT6+PAY_AMT1+PAY_AMT2+PAY_AMT3+PAY_AMT4+PAY_AMT5+PAY_AMT6+AVG_UTILIZATION_RATE+PAY_2+PAY_3+PAY_4+PAY_5+PAY_6+SEX+MARRIAGE+EDUCATION+AGE, family = binomial(link = "logit"), data=ccd.train)  
  
  #Mod2
  mod2 = glm(formula = DEFAULT~
             BILL_AMT1+BILL_AMT2+BILL_AMT3+BILL_AMT4+BILL_AMT5+BILL_AMT6+
             PAY_AMT1+PAY_AMT2+PAY_AMT3+PAY_AMT4+PAY_AMT5+PAY_AMT6+
             PAY_2+PAY_3+PAY_4+PAY_5+PAY_6+
             MARRIAGE+LIMIT_BAL*AGE+AVG_UTILIZATION_RATE, family = binomial(link = "logit"), data=ccd.train)
  
  #Mod 3
  Bill_Amt_Lin_Reg = lm(formula = BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE, data=ccd.train)

summary(Bill_Amt_Lin_Reg) 
anova(Bill_Amt_Lin_Reg)

Pay_Amt_Lin_Reg = lm( formula = PAY_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE, data=ccd.train)

summary(Pay_Amt_Lin_Reg)
anova(Pay_Amt_Lin_Reg)

ccd.train$BILL_AMT1_PRED<- predict(Bill_Amt_Lin_Reg, ccd.test)
ccd.test$BILL_AMT1_PRED<- predict(Bill_Amt_Lin_Reg, ccd.test)

ccd.train$PAY_AMT1_PRED<- predict(Pay_Amt_Lin_Reg, ccd.test)
ccd.test$PAY_AMT1_PRED<- predict(Pay_Amt_Lin_Reg, ccd.test)

mod3 = glm(formula = DEFAULT~LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE+BILL_AMT1_PRED+PAY_AMT1_PRED, family = binomial(link = "logit"), data=ccd.train)    

#Mod 4
mod4 = glm(formula = DEFAULT~log(LIMIT_BAL+.01)+SEX+AGE+log(BILL_AMT1_PRED+.01)+log(BILL_AMT2+.01)+log(BILL_AMT3+.01)+log(PAY_AMT1_PRED+.01)+log(PAY_AMT2+.01)+log(PAY_AMT3+.01)+log(AVG_UTILIZATION_RATE+.01)+PAY_2+PAY_3, family = binomial(link = "logit"), data=ccd.train)

#loop through prediction interval
for(j in seq(0,1,.025)){ 
  
#loop through models
for(k in 1:length(model_names)){ 

#Make the Predictions
ccd.test$DEFAULT_PROB_PREDICT = predict(eval(parse(text = paste('mod',k,sep=""))), ccd.test, type="response")

#Define the prediction threshold
threshold = j

#Generate Prediction 
ccd.test$DEFAULT_PREDICTION <- ifelse(ccd.test$DEFAULT_PROB_PREDICT >= threshold, 1, 0 )

#Confusion Matrix Build
confusion_table = table(factor(ccd.test$DEFAULT_PREDICTION),factor(ccd.test$DEFAULT))

if(length(confusion_table) == 4){
confusion_values = c(confusion_table[1,1], confusion_table[2,1], confusion_table[1,2], confusion_table[2,2])
Label = c('TN','FP','FN','TP')
Prediction = c(0,1,0,1)
Actual = c(0,0,1,1)
confusion_frame = data.frame(Prediction, Actual, confusion_values, Label)
colnames(confusion_frame)<- c("Prediction", "Actual","Frequency", "Label")
confusion_frame
}else if(length(confusion_table) == 2){
confusion_values = c(confusion_table[1,1], 0, confusion_table[1,2],0)
Label = c('TN','FP','FN','TP')
Prediction = c(0,1,0,1)
Actual = c(0,0,1,1)
confusion_frame = data.frame(Prediction, Actual, confusion_values, Label)
colnames(confusion_frame)<- c("Prediction", "Actual","Frequency", "Label")
confusion_frame
}

colnames(Model_Performance)<-c("Name","Scaling", "Sampling", "Accuracy","Precision","Recall","Specificity","F1_Score","FPR", "TP", "TN", "FP", "FN", "Prediction_Threshold", "AUC")

#True Positives
TP = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="TP"])
Model_Performance$TP[i] = TP

#True Negatives
TN = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="TN"])
Model_Performance$TN[i] = TN 

#False Positive
FP = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="FP"])
Model_Performance$FP[i] = FP

#False Negatives
FN = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="FN"])
Model_Performance$FN[i] = FN

#Model Name
Model_Performance$Name[i] = paste("Model ",k ,": ",description[k], sep = "")

#Accuracy
Model_Performance$Accuracy[i] = round((TP+TN)/(TP+TN+FP+FN),3)

#Precision
Model_Performance$Precision[i] = round(TP / (TP + FP),3)

#Recall, Sensitivity, True Positive Rate, We likely want to optimize for recall 
Model_Performance$Recall[i] = round(TP / (TP + FN),3)

#Specificity 
Model_Performance$Specificity[i] = round(TN / (TN + FP),3)

# F1 Score
Model_Performance$F1_Score[i] = round(2*TP / (2*TP + FP + FN),3)

#False Positive Rate
Model_Performance$FPR[i] = round(FP / (FP+TN),3)

#Model Prediction Threshold
Model_Performance$Prediction_Threshold[i] = threshold

#Area Under the Curve
Model_Performance$AUC[i] = auc(ccd.test$DEFAULT, ccd.test$DEFAULT_PREDICTION)[1]

#Record the scaling method
Model_Performance$Scaling[i] = s

#Record the sampling method for balancing an imbalanced dataset
Model_Performance$Sampling[i] = c

i=i+1
}
  
}

}

}

Model_Performance %>% arrange(-Accuracy)
write.csv(Model_Performance, "model_performance.csv", row.names = FALSE)
Model_Performance <- read.csv("~/Documents/Academic/UVA_MSDS/STAT6021/Project_2/model_performance.csv")

first = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 1: Everything"& Scaling == "No Scaling" & Sampling == "No Adjustment")

second = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 1: Everything"& Scaling == "RobustScaler" & Sampling == "Over-Sample")

third = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 2: Interactions"& Scaling == "MinMaxScaler" & Sampling == "Over-Sample")

fourth = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 2: Interactions"& Scaling == "No Scaling" & Sampling == "No Adjustment")

fifth = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 3: Two-Stage"& Scaling == "MinMaxScaler" & Sampling == "Over-Sample")

sixth = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 3: Two-Stage"& Scaling == "No Scaling" & Sampling == "Over-Sample")

seventh = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 4: Log Transformation"& Scaling == "No Scaling" & Sampling == "No Adjustment")

eighth = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 4: Log Transformation"& Scaling == "RobustScaler" & Sampling == "No Adjustment")

modper = rbind(first, second, third, fourth, fifth, sixth, seventh, eighth)

modper %>% group_by(combined) %>% summarise(max_AUC = max(AUC), Average_Accuracy = mean(Accuracy)) %>% arrange(-max_AUC)
seventh %>% arrange(-AUC) %>% filter(Prediction_Threshold == .275)