ccd <- CreditCardDefault <- read.csv("~/Downloads/CreditCardDefault.csv")
head(ccd, 3)
ccd <- CreditCardDefault <- read.csv("~/Downloads/CreditCardDefault.csv")
#subset data to only the well defined dataset
#The original dataset contained information outside of the defined descriptions provided by the source. To handle these, we've narrowed those variables to content and observations that we understand.
ccd<- subset(ccd, (ccd$EDUCATION %in% c(1, 2, 3, 4)))
ccd<- subset(ccd, (ccd$MARRIAGE %in% c(1, 2)))
#Setting categorical variables as factors
#The original dataset was nearly entirely integers. Some categorical factors were converted to factor data type to enable use with R data visualizations and built in linear modeling tools.
ccd$DEFAULT <- as.factor(ccd$default.payment.next.month)
ccd$SEX <- as.factor(ccd$SEX)
ccd$EDUCATION <- as.factor(ccd$EDUCATION)
ccd$MARRIAGE <- as.factor(ccd$MARRIAGE)
#handle level naming conventions
#To reduce confusion in visualization or reporting and interpretation the levels for the converted factors were renamed
levels(ccd$DEFAULT) <- c("No Default Next Month", "Defaults Next Month")
levels(ccd$SEX) <- c("Male", "Female")
levels(ccd$MARRIAGE) <- c('Married','Single')
levels(ccd$EDUCATION) <- c('Graduate School', 'University','High School', 'Other')
#Feature Generation
#These three features were created after some initial modeling with the belief that inclusion of these three variables may improve the model in some capacity. We speculate that including information about credit user average long run behavior over the five months preceding may reveal useful information in understanding they future bill amounts and accordingly their predicted probability of defaulting in the next month.
#credit utilization, numerical variable with the average bill amount divided by the limit balance per user
ccd$AVG_UTILIZATION_RATE =(ccd$BILL_AMT2+ccd$BILL_AMT3+ccd$BILL_AMT4+ccd$BILL_AMT5+ccd$BILL_AMT6)/5/ccd$LIMIT_BAL
#Create a logical binary variable for if the average monthly excess balance over the past five months was greater than the limit balance.
ccd$OVERSPENDER<-as.factor((ccd$BILL_AMT2+ccd$BILL_AMT3+ccd$BILL_AMT4+ccd$BILL_AMT5+ccd$BILL_AMT6)/5 > ccd$LIMIT_BAL)
#Create a numerical variable which captures how much excess balance over the past three months is being carried. The idea here is that users who make payments greater than their statement balance will have a negative excess balance average over three months. Those who are having trouble paying or are carrying revolving credit will
ccd$TOTAL_EXCESS_BALANCE<-(((ccd$BILL_AMT2-ccd$PAY_AMT2+ccd$BILL_AMT3-ccd$PAY_AMT3+ccd$BILL_AMT4-ccd$PAY_AMT4+ccd$BILL_AMT5-ccd$PAY_AMT5+ccd$BILL_AMT6-ccd$PAY_AMT6)/5)-ccd$LIMIT_BAL)
#For whatever the reason the dataset is unevenly split with different shares of males and females. We do not attempt to speculate why, but simply split the population by gender and resample to head off potential unobserved built in bias.
#handle uneven Male and female data set, split sample by Male and Female and sample from each
ccd_male<- subset(ccd, (ccd$SEX == 'Male'))
ccd_female<- subset(ccd, (ccd$SEX == 'Female'))
#this way everyone gets the same output
set.seed((1234))
#sample for even 50/50 split between males and females, size is determined by the smallest available subset, in this case males with 11616 observations
ccd_male_sample <- sample_n(ccd_male, min(nrow(ccd_male), nrow(ccd_female)))
ccd_female_sample <- sample_n(ccd_female, min(nrow(ccd_male), nrow(ccd_female)))
ccd <- data.frame(rbind(ccd_male_sample,ccd_female_sample))
#handle NANs by only selecting complete observations
ccd<-ccd[complete.cases(ccd),]
#Workspace cleanup
rm(CreditCardDefault)
rm(ccd_female)
rm(ccd_male)
rm(ccd_female_sample)
rm(ccd_male_sample)
#narrow our working dataset to features we will specifically use here
#drop PAY_0, we would not have this information yet
#drop BILL_AMT1, we are modeling this,...but retain for model accuracy checks
#drop PAY_AMT1, we would not have this information yet...but retain for inclusion into logistic Regression.
ccd <- ccd %>% dplyr::select(c(-PAY_0))
#Because we are working with a relatively large dataset, we choose to split the training and test data 50/50.
set.seed((1234))
#Split Data into train and test, ccd will remain train, ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd)), rep(1, 0.5 * nrow(ccd))))
ccd.train<- ccd[Split_Data==0,]
ccd.test<- ccd[Split_Data==1,]
write.csv(ccd, "ccd.csv", row.names = FALSE)
write.csv(ccd.train,"ccd.train.csv", row.names = FALSE)
write.csv(ccd.test,"ccd.test.csv", row.names = FALSE)
Data Visualization: Using proportion bar charts, we explore the
categorical features within our data with intent to identify impactful
predictors. Splitting by individuals that default, our response variable
and what we are attempting to model predictions for with the logistic
regression, we note possible differences based on Sex, Education,
Marital Status, and the Overspender Flag.
The Overspender Flag and Other Education immediately stick out as features with a relative large difference in proportion of defaults in comparison the population average default proportion. Males appear to have a larger proportion of defaults within the data, but the magnitude of the effect is difficult to discern from visualization alone. Marriage also appears to have a slightly larger share of the population than which results in default.
Data Visualization: We explore some of our numeric predictor variables
using boxplots with outliers depicted in red. Limit Balance on average
appears to be larger for those who do not default versus those that
default. As we would expect with Total_Excess_Balance, the greater the
balance over the limit balance, the more defaults on average. Average
Credit Card Utilization Rates appear to have an a postive impact of the
share of the population subset which defaults within the next month.
Based on density plot visualization and comparison, there appears to be
a difference in the distributions of Average Utilization Rate between
those who default and those who do not default. Increases the average
utilization rate appears to be associated with increased defaults.
linear<-lm(BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+AGE+SEX+EDUCATION+MARRIAGE, data = ccd)
summary(linear)
##
## Call:
## lm(formula = BILL_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 +
## PAY_AMT2 + PAY_AMT3 + AGE + SEX + EDUCATION + MARRIAGE, data = ccd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -489733 -4391 -2143 705 414124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.499e+02 8.647e+02 -0.405 0.68572
## LIMIT_BAL 1.442e-02 1.286e-03 11.213 < 2e-16 ***
## BILL_AMT2 8.842e-01 6.878e-03 128.545 < 2e-16 ***
## BILL_AMT3 1.059e-01 7.361e-03 14.386 < 2e-16 ***
## PAY_AMT2 -8.000e-02 8.673e-03 -9.224 < 2e-16 ***
## PAY_AMT3 6.934e-02 9.511e-03 7.290 3.19e-13 ***
## AGE 5.287e+00 1.907e+01 0.277 0.78162
## SEXFemale -6.792e+02 2.997e+02 -2.266 0.02346 *
## EDUCATIONUniversity 1.673e+03 3.434e+02 4.871 1.12e-06 ***
## EDUCATIONHigh School 1.485e+03 4.715e+02 3.151 0.00163 **
## EDUCATIONOther 6.791e+03 2.308e+03 2.942 0.00326 **
## MARRIAGESingle -1.845e+00 3.439e+02 -0.005 0.99572
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22590 on 23220 degrees of freedom
## Multiple R-squared: 0.9081, Adjusted R-squared: 0.9081
## F-statistic: 2.086e+04 on 11 and 23220 DF, p-value: < 2.2e-16
T-tests seem to indicate that AGE and MARRIAGESSingle are not significant, given the other variables in the model. Let’s see if we can remove them.
\(H_{0} : B_{1} = B_{2} = B_{3} = B_{4} = ... B_{p} = 0\)
\(H_{a}\) : at least one of the coefficients in \(H_{0}\) is not zero.
reduced<-lm(BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION, data = ccd)
anova(reduced,linear)
Our F statistic is 0.0494 with a p-value of 0.9518. We do not reject the null hypothesis. Our data suggests we can drop the predictors AGE and MARRIAGE and go with the reduced model.
summary(reduced)
##
## Call:
## lm(formula = BILL_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 +
## PAY_AMT2 + PAY_AMT3 + SEX + EDUCATION, data = ccd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -489738 -4386 -2141 710 414107
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.806e+02 3.794e+02 -0.476 0.634085
## LIMIT_BAL 1.449e-02 1.258e-03 11.526 < 2e-16 ***
## BILL_AMT2 8.842e-01 6.876e-03 128.583 < 2e-16 ***
## BILL_AMT3 1.059e-01 7.360e-03 14.390 < 2e-16 ***
## PAY_AMT2 -8.003e-02 8.672e-03 -9.228 < 2e-16 ***
## PAY_AMT3 6.933e-02 9.509e-03 7.291 3.19e-13 ***
## SEXFemale -6.890e+02 2.971e+02 -2.319 0.020395 *
## EDUCATIONUniversity 1.680e+03 3.393e+02 4.952 7.39e-07 ***
## EDUCATIONHigh School 1.523e+03 4.547e+02 3.349 0.000812 ***
## EDUCATIONOther 6.787e+03 2.308e+03 2.941 0.003279 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22590 on 23222 degrees of freedom
## Multiple R-squared: 0.9081, Adjusted R-squared: 0.9081
## F-statistic: 2.55e+04 on 9 and 23222 DF, p-value: < 2.2e-16
Let’s double check our reduced model by doing the regsubsets() frunction to run all possible regressions on our original (full) predictor set.
library(leaps)
allreg<-regsubsets(BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+AGE+SEX+EDUCATION+MARRIAGE, data = ccd, nbest = 2)
summary(allreg)
## Subset selection object
## Call: regsubsets.formula(BILL_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 +
## PAY_AMT2 + PAY_AMT3 + AGE + SEX + EDUCATION + MARRIAGE, data = ccd,
## nbest = 2)
## 11 Variables (and intercept)
## Forced in Forced out
## LIMIT_BAL FALSE FALSE
## BILL_AMT2 FALSE FALSE
## BILL_AMT3 FALSE FALSE
## PAY_AMT2 FALSE FALSE
## PAY_AMT3 FALSE FALSE
## AGE FALSE FALSE
## SEXFemale FALSE FALSE
## EDUCATIONUniversity FALSE FALSE
## EDUCATIONHigh School FALSE FALSE
## EDUCATIONOther FALSE FALSE
## MARRIAGESingle FALSE FALSE
## 2 subsets of each size up to 8
## Selection Algorithm: exhaustive
## LIMIT_BAL BILL_AMT2 BILL_AMT3 PAY_AMT2 PAY_AMT3 AGE SEXFemale
## 1 ( 1 ) " " "*" " " " " " " " " " "
## 1 ( 2 ) " " " " "*" " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " " " "
## 2 ( 2 ) "*" "*" " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " "
## 3 ( 2 ) " " "*" "*" " " "*" " " " "
## 4 ( 1 ) "*" "*" "*" "*" " " " " " "
## 4 ( 2 ) "*" "*" "*" " " "*" " " " "
## 5 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 5 ( 2 ) "*" "*" "*" "*" " " " " " "
## 6 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 6 ( 2 ) "*" "*" "*" "*" "*" " " " "
## 7 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 7 ( 2 ) "*" "*" "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 8 ( 2 ) "*" "*" "*" "*" "*" " " "*"
## EDUCATIONUniversity EDUCATIONHigh School EDUCATIONOther MARRIAGESingle
## 1 ( 1 ) " " " " " " " "
## 1 ( 2 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 2 ( 2 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 3 ( 2 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 4 ( 2 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 5 ( 2 ) "*" " " " " " "
## 6 ( 1 ) "*" " " " " " "
## 6 ( 2 ) " " " " "*" " "
## 7 ( 1 ) "*" "*" " " " "
## 7 ( 2 ) "*" " " "*" " "
## 8 ( 1 ) "*" "*" "*" " "
## 8 ( 2 ) "*" "*" " " " "
Let’s see which model has the best (highest) adjusted \(R^{2}\).
coef(allreg, which.max(summary(allreg)$adjr2))
## (Intercept) LIMIT_BAL BILL_AMT2
## -499.38363102 0.01436234 0.88453542
## BILL_AMT3 PAY_AMT2 PAY_AMT3
## 0.10572357 -0.07988742 0.06967691
## EDUCATIONUniversity EDUCATIONHigh School EDUCATIONOther
## 1651.13782180 1510.39547423 6721.23012387
This model dropped AGE and MARRIAGE like our reduced model did. It also dropped SEX.
Let’s see which model is has the best (lowest) Mallow’s \(C_p\).
coef(allreg, which.min(summary(allreg)$cp))
## (Intercept) LIMIT_BAL BILL_AMT2
## -499.38363102 0.01436234 0.88453542
## BILL_AMT3 PAY_AMT2 PAY_AMT3
## 0.10572357 -0.07988742 0.06967691
## EDUCATIONUniversity EDUCATIONHigh School EDUCATIONOther
## 1651.13782180 1510.39547423 6721.23012387
This has the same predictors as the model which maximizes adjusted \(R^2\). It drops AGE, MARRIAGE, and SEX.
Let’s see which model has the best (lowest) BIC.
coef(allreg, which.min(summary(allreg)$bic))
## (Intercept) LIMIT_BAL BILL_AMT2 BILL_AMT3
## 171.58094718 0.01344336 0.88485507 0.10581910
## PAY_AMT2 PAY_AMT3 EDUCATIONUniversity
## -0.07990731 0.06988625 1092.26986352
This model removes some of the education factors. It’s only a predictor for clients who attended university.
Lastly, let’s try a stepwise regression to see which model will be the best. We will start with an intercept-only model.
regnull<- lm(BILL_AMT1 ~ 1, data = ccd)
step(regnull, scope = list(lower=regnull, upper=linear), direction = "both")
## Start: AIC=521270.2
## BILL_AMT1 ~ 1
##
## Df Sum of Sq RSS AIC
## + BILL_AMT2 1 1.1693e+14 1.2063e+13 466222
## + BILL_AMT3 1 1.0194e+14 2.7054e+13 484985
## + LIMIT_BAL 1 1.0629e+13 1.1837e+14 519274
## + PAY_AMT3 1 2.8306e+12 1.2616e+14 520757
## + PAY_AMT2 1 1.3590e+12 1.2764e+14 521026
## + AGE 1 4.5774e+11 1.2854e+14 521190
## + SEX 1 1.4178e+11 1.2885e+14 521247
## + EDUCATION 3 1.2562e+11 1.2887e+14 521254
## + MARRIAGE 1 6.5190e+10 1.2893e+14 521260
## <none> 1.2899e+14 521270
##
## Step: AIC=466221.5
## BILL_AMT1 ~ BILL_AMT2
##
## Df Sum of Sq RSS AIC
## + BILL_AMT3 1 7.1095e+10 1.1992e+13 466086
## + LIMIT_BAL 1 6.8674e+10 1.1995e+13 466091
## + PAY_AMT3 1 2.9881e+10 1.2033e+13 466166
## + PAY_AMT2 1 4.4609e+09 1.2059e+13 466215
## + EDUCATION 3 6.0505e+09 1.2057e+13 466216
## + AGE 1 3.8989e+09 1.2059e+13 466216
## + MARRIAGE 1 2.8508e+09 1.2060e+13 466218
## + SEX 1 1.3719e+09 1.2062e+13 466221
## <none> 1.2063e+13 466222
## - BILL_AMT2 1 1.1693e+14 1.2899e+14 521270
##
## Step: AIC=466086.2
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3
##
## Df Sum of Sq RSS AIC
## + LIMIT_BAL 1 5.9404e+10 1.1933e+13 465973
## + PAY_AMT3 1 2.9098e+10 1.1963e+13 466032
## + PAY_AMT2 1 2.0111e+10 1.1972e+13 466049
## + EDUCATION 3 6.4398e+09 1.1986e+13 466080
## + AGE 1 3.5813e+09 1.1989e+13 466081
## + MARRIAGE 1 2.3350e+09 1.1990e+13 466084
## + SEX 1 1.5964e+09 1.1991e+13 466085
## <none> 1.1992e+13 466086
## - BILL_AMT3 1 7.1095e+10 1.2063e+13 466222
## - BILL_AMT2 1 1.5062e+13 2.7054e+13 484985
##
## Step: AIC=465972.8
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL
##
## Df Sum of Sq RSS AIC
## + PAY_AMT2 1 3.2457e+10 1.1900e+13 465912
## + PAY_AMT3 1 1.6290e+10 1.1917e+13 465943
## + EDUCATION 3 1.6371e+10 1.1916e+13 465947
## + SEX 1 2.3673e+09 1.1930e+13 465970
## <none> 1.1933e+13 465973
## + AGE 1 6.6674e+08 1.1932e+13 465974
## + MARRIAGE 1 4.8953e+08 1.1932e+13 465974
## - LIMIT_BAL 1 5.9404e+10 1.1992e+13 466086
## - BILL_AMT3 1 6.1825e+10 1.1995e+13 466091
## - BILL_AMT2 1 1.4947e+13 2.6879e+13 484837
##
## Step: AIC=465911.5
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL + PAY_AMT2
##
## Df Sum of Sq RSS AIC
## + PAY_AMT3 1 2.7423e+10 1.1873e+13 465860
## + EDUCATION 3 1.6135e+10 1.1884e+13 465886
## + SEX 1 2.5566e+09 1.1898e+13 465909
## <none> 1.1900e+13 465912
## + AGE 1 5.8576e+08 1.1900e+13 465912
## + MARRIAGE 1 3.8415e+08 1.1900e+13 465913
## - PAY_AMT2 1 3.2457e+10 1.1933e+13 465973
## - LIMIT_BAL 1 7.1750e+10 1.1972e+13 466049
## - BILL_AMT3 1 9.3635e+10 1.1994e+13 466092
## - BILL_AMT2 1 8.8123e+12 2.0713e+13 478784
##
## Step: AIC=465859.9
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL + PAY_AMT2 + PAY_AMT3
##
## Df Sum of Sq RSS AIC
## + EDUCATION 3 1.6126e+10 1.1857e+13 465834
## + SEX 1 2.2972e+09 1.1871e+13 465857
## <none> 1.1873e+13 465860
## + AGE 1 6.1621e+08 1.1872e+13 465861
## + MARRIAGE 1 5.0231e+08 1.1872e+13 465861
## - PAY_AMT3 1 2.7423e+10 1.1900e+13 465912
## - PAY_AMT2 1 4.3590e+10 1.1917e+13 465943
## - LIMIT_BAL 1 5.6741e+10 1.1930e+13 465969
## - BILL_AMT3 1 1.0566e+11 1.1979e+13 466064
## - BILL_AMT2 1 8.4736e+12 2.0347e+13 478372
##
## Step: AIC=465834.4
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL + PAY_AMT2 + PAY_AMT3 +
## EDUCATION
##
## Df Sum of Sq RSS AIC
## + SEX 1 2.7455e+09 1.1854e+13 465831
## <none> 1.1857e+13 465834
## + AGE 1 1.5878e+08 1.1857e+13 465836
## + MARRIAGE 1 4.4906e+06 1.1857e+13 465836
## - EDUCATION 3 1.6126e+10 1.1873e+13 465860
## - PAY_AMT3 1 2.7413e+10 1.1884e+13 465886
## - PAY_AMT2 1 4.3318e+10 1.1900e+13 465917
## - LIMIT_BAL 1 6.6721e+10 1.1924e+13 465963
## - BILL_AMT3 1 1.0534e+11 1.1962e+13 466038
## - BILL_AMT2 1 8.4517e+12 2.0309e+13 478334
##
## Step: AIC=465831
## BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL + PAY_AMT2 + PAY_AMT3 +
## EDUCATION + SEX
##
## Df Sum of Sq RSS AIC
## <none> 1.1854e+13 465831
## + AGE 1 5.0380e+07 1.1854e+13 465833
## + MARRIAGE 1 1.1165e+07 1.1854e+13 465833
## - SEX 1 2.7455e+09 1.1857e+13 465834
## - EDUCATION 3 1.6574e+10 1.1871e+13 465857
## - PAY_AMT3 1 2.7132e+10 1.1881e+13 465882
## - PAY_AMT2 1 4.3469e+10 1.1898e+13 465914
## - LIMIT_BAL 1 6.7817e+10 1.1922e+13 465962
## - BILL_AMT3 1 1.0570e+11 1.1960e+13 466035
## - BILL_AMT2 1 8.4399e+12 2.0294e+13 478320
##
## Call:
## lm(formula = BILL_AMT1 ~ BILL_AMT2 + BILL_AMT3 + LIMIT_BAL +
## PAY_AMT2 + PAY_AMT3 + EDUCATION + SEX, data = ccd)
##
## Coefficients:
## (Intercept) BILL_AMT2 BILL_AMT3
## -180.60583 0.88416 0.10591
## LIMIT_BAL PAY_AMT2 PAY_AMT3
## 0.01449 -0.08003 0.06933
## EDUCATIONUniversity EDUCATIONHigh School EDUCATIONOther
## 1680.03227 1522.86782 6786.68206
## SEXFemale
## -688.99111
The stepwise regression ends with the same predictors as our reduced model above. We remove only AGE and MARRIAGE.
#Chose to standardize because of application scenario to Logistic Regression. If there are outliers in the data, it is a reasonable conclusion that they would contain important information. Standardization retains the impact of outliers.
#endstate is numeric features only
noscale <- function(x){x}
standardscale <- function(x){(x-mean(x))/(sd(x))}
minmaxscale <- function(x){(x-min(x))/(max(x)-min(x))}
robustscale <- function(x){
Q <- quantile(x, probs=c(.25, .75), na.rm = FALSE)
return (x-median(x))/(Q[2]-Q[1])}
standardscale<-noscale
ccd.scaled <- data.frame(ccd$LIMIT_BAL)
ccd.scaled$LIMIT_BAL<- standardscale(ccd$LIMIT_BAL)
ccd.scaled <- ccd.scaled %>% dplyr::select(-ccd.LIMIT_BAL)
ccd.scaled$AGE<- standardscale(ccd$AGE)
ccd.scaled$PAY_2<- standardscale(ccd$PAY_2)
ccd.scaled$PAY_3<- standardscale(ccd$PAY_3)
ccd.scaled$PAY_4<- standardscale(ccd$PAY_4)
ccd.scaled$PAY_5<- standardscale(ccd$PAY_5)
ccd.scaled$PAY_6<- standardscale(ccd$PAY_6)
ccd.scaled$BILL_AMT1<- standardscale(ccd$BILL_AMT1)
ccd.scaled$BILL_AMT2<- standardscale(ccd$BILL_AMT2)
ccd.scaled$BILL_AMT3<- standardscale(ccd$BILL_AMT3)
ccd.scaled$BILL_AMT4<- standardscale(ccd$BILL_AMT4)
ccd.scaled$BILL_AMT5<- standardscale(ccd$BILL_AMT5)
ccd.scaled$BILL_AMT6<- standardscale(ccd$BILL_AMT6)
ccd.scaled$PAY_AMT1<- standardscale(ccd$PAY_AMT1)
ccd.scaled$PAY_AMT2<- standardscale(ccd$PAY_AMT2)
ccd.scaled$PAY_AMT3<- standardscale(ccd$PAY_AMT3)
ccd.scaled$PAY_AMT4<- standardscale(ccd$PAY_AMT4)
ccd.scaled$PAY_AMT5<- standardscale(ccd$PAY_AMT5)
ccd.scaled$PAY_AMT6<- standardscale(ccd$PAY_AMT6)
ccd.scaled$DEFAULT <- as.numeric(ifelse(ccd$default.payment.next.month == 1,1,0))
ccd.scaled$AVG_UTILIZATION_RATE <- standardscale(ccd$AVG_UTILIZATION_RATE)
ccd.scaled$OVERSPENDER <- ifelse(ccd$OVERSPENDER == 1, 1,0)
ccd.scaled$TOTAL_EXCESS_BALANCE <- standardscale(ccd$TOTAL_EXCESS_BALANCE)
ccd.scaled$MARRIAGE <- ccd$MARRIAGE
ccd.scaled$EDUCATION <- ccd$EDUCATION
ccd.scaled$SEX <- ccd$SEX
#May not be necessary...yet
#ccd.scaled$SEX_MALE <- ifelse(ccd$SEX == 'Male', 1, 0)
#ccd.scaled$SEX_FEMALE <- ifelse(ccd$SEX == 'Female', 1, 0)
#ccd.scaled$EDU_GRAD <- ifelse(ccd$EDUCATION == 'Graduate School', 1, 0)
#ccd.scaled$EDU_UNIVERSITY <- ifelse(ccd$EDUCATION == 'University', 1, 0)
#ccd.scaled$EDU_HIGH_SCHOOL <- ifelse(ccd$EDUCATION == 'High School', 1, 0)
#ccd.scaled$EDU_OTHER <- ifelse(ccd$EDUCATION == 'Other', 1, 0)
#ccd.scaled$MARRIAGE_M <- ifelse(ccd$MARRIAGE == 'Married', 1, 0)
#ccd.scaled$MARRIAGE_S <- ifelse(ccd$MARRIAGE == 'Single', 1, 0)
#Feature Scaling References
#REF: https://www.oreilly.com/library/view/hands-on-machine-learning/9781788393485/fd5b8a44-e9d3-4c19-bebb-c2fa5a5ebfee.xhtml
#REF: https://vitalflux.com/minmaxscaler-standardscaler-python-examples/
#REF: https://becominghuman.ai/what-does-feature-scaling-mean-when-to-normalize-data-and-when-to-standardize-data-c3de654405ed
#REF: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
#REF: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
#Handling an imbalanced dataset, may need to resample the unbalanced dataset to account for fewer defaults
#keep all of the defaults, randomly sample an equal number of non defaults
ccd_defaults<- subset(ccd, (ccd$default.payment.next.month == 1))
ccd_no_defaults <- subset(ccd, (ccd$default.payment.next.month == 0))
#this way everyone gets the same output
set.seed((1234))
#unbalanced dataset of defaults and no defaults, resample for maximum number of defaults and equivalent no defaults.
#down sampling:
ccd_no_defaults_downsample <- sample_n(ccd_no_defaults, nrow(ccd_defaults), replace = F)
ccd_down_balanced <- data.frame(rbind(ccd_defaults,ccd_no_defaults_downsample))
#up sampling:
ccd_defaults_upsample <- sample_n(ccd_defaults, nrow(ccd_no_defaults), replace = T)
ccd_up_balanced <- data.frame(rbind(ccd_no_defaults,ccd_defaults_upsample))
set.seed((1234))
#Split Data into train and test, ccd will remain train, ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_down_balanced)), rep(1, 0.5 *nrow(ccd_down_balanced))))
ccd.down.train<- ccd_down_balanced[Split_Data==0,]
ccd.down.test<- ccd_down_balanced[Split_Data==1,]
set.seed((1234))
#Split Data into train and test, ccd will remain train, ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_up_balanced)), rep(1, 0.5 *nrow(ccd_up_balanced))))
ccd.up.train<- ccd_up_balanced[Split_Data==0,]
ccd.up.test<- ccd_up_balanced[Split_Data==1,]
#Handling an imbalanced dataset, may need to resample the unbalanced dataset to account for fewer defaults
#keep all of the defaults, randomly sample an equal number of non defaults
ccd_defaults<- subset(ccd.scaled, (ccd.scaled$DEFAULT == 1))
ccd_no_defaults <- subset(ccd.scaled, (ccd.scaled$DEFAULT == 0))
#this way everyone gets the same output
set.seed((1234))
#unbalanced dataset of defaults and no defaults, resample for maximum number of defaults and equivalent no defaults.
#down sampling:
ccd_no_defaults_downsample <- sample_n(ccd_no_defaults, nrow(ccd_defaults), replace = F)
ccd_down_balanced <- data.frame(rbind(ccd_defaults,ccd_no_defaults_downsample))
#up sampling:
ccd_defaults_upsample <- sample_n(ccd_defaults, nrow(ccd_no_defaults), replace = T)
ccd_up_balanced <- data.frame(rbind(ccd_no_defaults,ccd_defaults_upsample))
set.seed((1234))
#Split Data into train and test, ccd will remain train, ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_up_balanced)), rep(1, 0.5 *nrow(ccd_up_balanced))))
ccd.scaled.up.train<- ccd_up_balanced[Split_Data==0,]
ccd.scaled.up.test<- ccd_up_balanced[Split_Data==1,]
#clean workspace
rm(ccd_defaults)
rm(ccd_no_defaults)
rm(ccd_no_defaults_downsample)
rm(ccd_down_balanced)
rm(ccd_defaults_upsample)
rm(ccd_up_balanced)
#scaled up
ccd.test = ccd.scaled.up.test
ccd.train = ccd.scaled.up.train
As a starting point, our first logistic regression model simply includes every feature available in the dataset.
mod1 = glm(formula = DEFAULT~LIMIT_BAL+BILL_AMT1+BILL_AMT2+BILL_AMT3+BILL_AMT4+BILL_AMT5+BILL_AMT6+PAY_AMT1+PAY_AMT2+PAY_AMT3+PAY_AMT4+PAY_AMT5+PAY_AMT6+AVG_UTILIZATION_RATE+PAY_2+PAY_3+PAY_4+PAY_5+PAY_6+SEX+MARRIAGE+EDUCATION+AGE, family = binomial(link = "logit"), data=ccd.train)
summary(mod1)
##
## Call:
## glm(formula = DEFAULT ~ LIMIT_BAL + BILL_AMT1 + BILL_AMT2 + BILL_AMT3 +
## BILL_AMT4 + BILL_AMT5 + BILL_AMT6 + PAY_AMT1 + PAY_AMT2 +
## PAY_AMT3 + PAY_AMT4 + PAY_AMT5 + PAY_AMT6 + AVG_UTILIZATION_RATE +
## PAY_2 + PAY_3 + PAY_4 + PAY_5 + PAY_6 + SEX + MARRIAGE +
## EDUCATION + AGE, family = binomial(link = "logit"), data = ccd.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5278 -1.1127 0.2719 1.0918 3.7838
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.740e-01 9.859e-02 4.807 1.53e-06 ***
## LIMIT_BAL -1.411e-06 1.949e-07 -7.238 4.55e-13 ***
## BILL_AMT1 -6.595e-06 1.142e-06 -5.777 7.62e-09 ***
## BILL_AMT2 6.355e-06 1.481e-06 4.291 1.78e-05 ***
## BILL_AMT3 1.135e-06 1.274e-06 0.891 0.372863
## BILL_AMT4 -2.011e-06 1.392e-06 -1.445 0.148524
## BILL_AMT5 -1.677e-06 1.596e-06 -1.051 0.293159
## BILL_AMT6 4.482e-06 1.256e-06 3.568 0.000360 ***
## PAY_AMT1 -1.536e-05 2.208e-06 -6.957 3.48e-12 ***
## PAY_AMT2 -9.856e-06 2.003e-06 -4.919 8.69e-07 ***
## PAY_AMT3 -2.565e-06 1.548e-06 -1.657 0.097424 .
## PAY_AMT4 -5.754e-06 1.736e-06 -3.314 0.000919 ***
## PAY_AMT5 -8.599e-06 1.868e-06 -4.603 4.17e-06 ***
## PAY_AMT6 -3.001e-06 1.381e-06 -2.173 0.029788 *
## AVG_UTILIZATION_RATE -2.490e-01 7.590e-02 -3.280 0.001036 **
## PAY_2 3.099e-01 2.000e-02 15.495 < 2e-16 ***
## PAY_3 6.277e-02 2.379e-02 2.639 0.008317 **
## PAY_4 3.118e-02 2.608e-02 1.195 0.231946
## PAY_5 5.516e-02 2.806e-02 1.966 0.049348 *
## PAY_6 2.102e-02 2.308e-02 0.911 0.362388
## SEXFemale -9.410e-02 3.217e-02 -2.925 0.003441 **
## MARRIAGESingle -1.634e-01 3.687e-02 -4.430 9.43e-06 ***
## EDUCATIONUniversity -1.945e-02 3.721e-02 -0.523 0.601195
## EDUCATIONHigh School -3.927e-02 5.071e-02 -0.774 0.438728
## EDUCATIONOther -9.365e-01 3.457e-01 -2.709 0.006752 **
## AGE 3.566e-03 2.029e-03 1.757 0.078877 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24878 on 17945 degrees of freedom
## Residual deviance: 22749 on 17920 degrees of freedom
## AIC: 22801
##
## Number of Fisher Scoring iterations: 5
anova(mod1, test="Chisq")
equatiomatic::extract_eq(mod1)
\[ \log\left[ \frac { P( \operatorname{DEFAULT} = \operatorname{1} ) }{ 1 - P( \operatorname{DEFAULT} = \operatorname{1} ) } \right] = \alpha + \beta_{1}(\operatorname{LIMIT\_BAL}) + \beta_{2}(\operatorname{BILL\_AMT1}) + \beta_{3}(\operatorname{BILL\_AMT2}) + \beta_{4}(\operatorname{BILL\_AMT3}) + \beta_{5}(\operatorname{BILL\_AMT4}) + \beta_{6}(\operatorname{BILL\_AMT5}) + \beta_{7}(\operatorname{BILL\_AMT6}) + \beta_{8}(\operatorname{PAY\_AMT1}) + \beta_{9}(\operatorname{PAY\_AMT2}) + \beta_{10}(\operatorname{PAY\_AMT3}) + \beta_{11}(\operatorname{PAY\_AMT4}) + \beta_{12}(\operatorname{PAY\_AMT5}) + \beta_{13}(\operatorname{PAY\_AMT6}) + \beta_{14}(\operatorname{AVG\_UTILIZATION\_RATE}) + \beta_{15}(\operatorname{PAY\_2}) + \beta_{16}(\operatorname{PAY\_3}) + \beta_{17}(\operatorname{PAY\_4}) + \beta_{18}(\operatorname{PAY\_5}) + \beta_{19}(\operatorname{PAY\_6}) + \beta_{20}(\operatorname{SEX}_{\operatorname{Female}}) + \beta_{21}(\operatorname{MARRIAGE}_{\operatorname{Single}}) + \beta_{22}(\operatorname{EDUCATION}_{\operatorname{University}}) + \beta_{23}(\operatorname{EDUCATION}_{\operatorname{High\ School}}) + \beta_{24}(\operatorname{EDUCATION}_{\operatorname{Other}}) + \beta_{25}(\operatorname{AGE}) \]
mod2 = glm(formula = DEFAULT~
BILL_AMT1+BILL_AMT2+BILL_AMT3+BILL_AMT4+BILL_AMT5+BILL_AMT6+
PAY_AMT1+PAY_AMT2+PAY_AMT3+PAY_AMT4+PAY_AMT5+PAY_AMT6+
PAY_2+PAY_3+PAY_4+PAY_5+PAY_6+
SEX*MARRIAGE+EDUCATION*AGE+LIMIT_BAL*AGE+AVG_UTILIZATION_RATE*AGE, family = binomial(link = "logit"), data=ccd.train)
summary(mod2)
##
## Call:
## glm(formula = DEFAULT ~ BILL_AMT1 + BILL_AMT2 + BILL_AMT3 + BILL_AMT4 +
## BILL_AMT5 + BILL_AMT6 + PAY_AMT1 + PAY_AMT2 + PAY_AMT3 +
## PAY_AMT4 + PAY_AMT5 + PAY_AMT6 + PAY_2 + PAY_3 + PAY_4 +
## PAY_5 + PAY_6 + SEX * MARRIAGE + EDUCATION * AGE + LIMIT_BAL *
## AGE + AVG_UTILIZATION_RATE * AGE, family = binomial(link = "logit"),
## data = ccd.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5517 -1.1129 0.2706 1.0918 3.8062
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.253e-01 1.933e-01 3.235 0.001218 **
## BILL_AMT1 -6.484e-06 1.139e-06 -5.690 1.27e-08 ***
## BILL_AMT2 6.479e-06 1.480e-06 4.379 1.19e-05 ***
## BILL_AMT3 1.013e-06 1.267e-06 0.799 0.424122
## BILL_AMT4 -1.843e-06 1.383e-06 -1.333 0.182591
## BILL_AMT5 -1.682e-06 1.596e-06 -1.054 0.292027
## BILL_AMT6 4.422e-06 1.258e-06 3.517 0.000437 ***
## PAY_AMT1 -1.547e-05 2.207e-06 -7.010 2.38e-12 ***
## PAY_AMT2 -9.607e-06 2.001e-06 -4.801 1.58e-06 ***
## PAY_AMT3 -2.688e-06 1.526e-06 -1.762 0.078057 .
## PAY_AMT4 -5.579e-06 1.731e-06 -3.223 0.001267 **
## PAY_AMT5 -8.512e-06 1.865e-06 -4.563 5.04e-06 ***
## PAY_AMT6 -3.033e-06 1.380e-06 -2.197 0.027990 *
## PAY_2 3.125e-01 2.005e-02 15.591 < 2e-16 ***
## PAY_3 6.009e-02 2.383e-02 2.521 0.011690 *
## PAY_4 3.501e-02 2.612e-02 1.341 0.180012
## PAY_5 5.464e-02 2.808e-02 1.946 0.051687 .
## PAY_6 2.139e-02 2.310e-02 0.926 0.354548
## SEXFemale -1.255e-01 4.739e-02 -2.648 0.008085 **
## MARRIAGESingle -1.877e-01 4.945e-02 -3.796 0.000147 ***
## EDUCATIONUniversity 1.521e-01 1.512e-01 1.006 0.314388
## EDUCATIONHigh School 3.042e-01 1.983e-01 1.534 0.125041
## EDUCATIONOther -2.008e+00 1.477e+00 -1.359 0.173998
## AGE 3.373e-04 5.182e-03 0.065 0.948105
## LIMIT_BAL -4.474e-06 6.130e-07 -7.298 2.92e-13 ***
## AVG_UTILIZATION_RATE -8.223e-02 1.924e-01 -0.427 0.669058
## SEXFemale:MARRIAGESingle 8.801e-02 6.412e-02 1.373 0.169875
## EDUCATIONUniversity:AGE -4.897e-03 4.269e-03 -1.147 0.251383
## EDUCATIONHigh School:AGE -8.808e-03 5.137e-03 -1.715 0.086435 .
## EDUCATIONOther:AGE 2.990e-02 3.960e-02 0.755 0.450344
## AGE:LIMIT_BAL 7.888e-08 1.569e-08 5.027 4.98e-07 ***
## AGE:AVG_UTILIZATION_RATE -5.961e-03 5.230e-03 -1.140 0.254323
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24878 on 17945 degrees of freedom
## Residual deviance: 22700 on 17914 degrees of freedom
## AIC: 22764
##
## Number of Fisher Scoring iterations: 5
anova(mod2, test="Chisq")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
equatiomatic::extract_eq(mod2)
\[ \log\left[ \frac { P( \operatorname{DEFAULT} = \operatorname{1} ) }{ 1 - P( \operatorname{DEFAULT} = \operatorname{1} ) } \right] = \alpha + \beta_{1}(\operatorname{BILL\_AMT1}) + \beta_{2}(\operatorname{BILL\_AMT2}) + \beta_{3}(\operatorname{BILL\_AMT3}) + \beta_{4}(\operatorname{BILL\_AMT4}) + \beta_{5}(\operatorname{BILL\_AMT5}) + \beta_{6}(\operatorname{BILL\_AMT6}) + \beta_{7}(\operatorname{PAY\_AMT1}) + \beta_{8}(\operatorname{PAY\_AMT2}) + \beta_{9}(\operatorname{PAY\_AMT3}) + \beta_{10}(\operatorname{PAY\_AMT4}) + \beta_{11}(\operatorname{PAY\_AMT5}) + \beta_{12}(\operatorname{PAY\_AMT6}) + \beta_{13}(\operatorname{PAY\_2}) + \beta_{14}(\operatorname{PAY\_3}) + \beta_{15}(\operatorname{PAY\_4}) + \beta_{16}(\operatorname{PAY\_5}) + \beta_{17}(\operatorname{PAY\_6}) + \beta_{18}(\operatorname{SEX}_{\operatorname{Female}}) + \beta_{19}(\operatorname{MARRIAGE}_{\operatorname{Single}}) + \beta_{20}(\operatorname{EDUCATION}_{\operatorname{University}}) + \beta_{21}(\operatorname{EDUCATION}_{\operatorname{High\ School}}) + \beta_{22}(\operatorname{EDUCATION}_{\operatorname{Other}}) + \beta_{23}(\operatorname{AGE}) + \beta_{24}(\operatorname{LIMIT\_BAL}) + \beta_{25}(\operatorname{AVG\_UTILIZATION\_RATE}) + \beta_{26}(\operatorname{SEX}_{\operatorname{Female}} \times \operatorname{MARRIAGE}_{\operatorname{Single}}) + \beta_{27}(\operatorname{EDUCATION}_{\operatorname{University}} \times \operatorname{AGE}) + \beta_{28}(\operatorname{EDUCATION}_{\operatorname{High\ School}} \times \operatorname{AGE}) + \beta_{29}(\operatorname{EDUCATION}_{\operatorname{Other}} \times \operatorname{AGE}) + \beta_{30}(\operatorname{AGE} \times \operatorname{LIMIT\_BAL}) + \beta_{31}(\operatorname{AGE} \times \operatorname{AVG\_UTILIZATION\_RATE}) \]
Predict Limit Balance, then incorporate those predictions into the logistic regression. Possible opportunity to demonstrate causality via two stage regression techniques that BILL_AMT1 and PAY_AMT1 directly impact the next month’s default.
Bill_Amt_Lin_Reg = lm(formula = BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE, data=ccd.train)
summary(Bill_Amt_Lin_Reg)
##
## Call:
## lm(formula = BILL_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 +
## PAY_AMT2 + PAY_AMT3 + SEX + EDUCATION + AVG_UTILIZATION_RATE,
## data = ccd.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -303245 -4161 -2004 1140 416262
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.007e+03 4.734e+02 4.240 2.24e-05 ***
## LIMIT_BAL 4.172e-03 1.632e-03 2.557 0.0106 *
## BILL_AMT2 9.035e-01 6.971e-03 129.596 < 2e-16 ***
## BILL_AMT3 1.142e-01 7.737e-03 14.754 < 2e-16 ***
## PAY_AMT2 -1.095e-01 8.755e-03 -12.503 < 2e-16 ***
## PAY_AMT3 5.679e-02 9.847e-03 5.767 8.18e-09 ***
## SEXFemale -3.534e+02 2.933e+02 -1.205 0.2283
## EDUCATIONUniversity 1.732e+03 3.388e+02 5.112 3.22e-07 ***
## EDUCATIONHigh School 1.991e+03 4.515e+02 4.411 1.04e-05 ***
## EDUCATIONOther 2.070e+03 2.677e+03 0.773 0.4394
## AVG_UTILIZATION_RATE -5.753e+03 6.370e+02 -9.031 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19580 on 17935 degrees of freedom
## Multiple R-squared: 0.9292, Adjusted R-squared: 0.9292
## F-statistic: 2.356e+04 on 10 and 17935 DF, p-value: < 2.2e-16
anova(Bill_Amt_Lin_Reg)
Pay_Amt_Lin_Reg = lm( formula = PAY_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE, data=ccd.train)
summary(Pay_Amt_Lin_Reg)
##
## Call:
## lm(formula = PAY_AMT1 ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 + PAY_AMT2 +
## PAY_AMT3 + SEX + EDUCATION + AVG_UTILIZATION_RATE, data = ccd.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -138373 -2543 -967 276 296088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.787e+01 3.049e+02 0.255 0.798
## LIMIT_BAL 1.046e-02 1.051e-03 9.951 <2e-16 ***
## BILL_AMT2 1.572e-01 4.490e-03 35.020 <2e-16 ***
## BILL_AMT3 -1.281e-01 4.983e-03 -25.707 <2e-16 ***
## PAY_AMT2 2.049e-01 5.639e-03 36.344 <2e-16 ***
## PAY_AMT3 6.336e-02 6.342e-03 9.990 <2e-16 ***
## SEXFemale 2.211e+02 1.889e+02 1.171 0.242
## EDUCATIONUniversity -3.577e+02 2.182e+02 -1.639 0.101
## EDUCATIONHigh School -3.237e+02 2.908e+02 -1.113 0.266
## EDUCATIONOther 1.042e+03 1.724e+03 0.605 0.545
## AVG_UTILIZATION_RATE 3.939e+02 4.103e+02 0.960 0.337
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12610 on 17935 degrees of freedom
## Multiple R-squared: 0.1829, Adjusted R-squared: 0.1824
## F-statistic: 401.4 on 10 and 17935 DF, p-value: < 2.2e-16
anova(Pay_Amt_Lin_Reg)
ccd.train$BILL_AMT1_PRED<- predict(Bill_Amt_Lin_Reg, ccd.test)
ccd.test$BILL_AMT1_PRED<- predict(Bill_Amt_Lin_Reg, ccd.test)
ccd.train$PAY_AMT1_PRED<- predict(Pay_Amt_Lin_Reg, ccd.test)
ccd.test$PAY_AMT1_PRED<- predict(Pay_Amt_Lin_Reg, ccd.test)
mod3 = glm(formula = DEFAULT~LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE+BILL_AMT1_PRED+PAY_AMT1_PRED, family = binomial(link = "logit"), data=ccd.train)
summary(mod3)
##
## Call:
## glm(formula = DEFAULT ~ LIMIT_BAL + BILL_AMT2 + BILL_AMT3 + PAY_AMT2 +
## PAY_AMT3 + SEX + EDUCATION + AVG_UTILIZATION_RATE + BILL_AMT1_PRED +
## PAY_AMT1_PRED, family = binomial(link = "logit"), data = ccd.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4479 -1.1364 0.6582 1.0616 4.0625
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.516e-01 5.307e-02 10.394 < 2e-16 ***
## LIMIT_BAL -2.220e-06 1.814e-07 -12.241 < 2e-16 ***
## BILL_AMT2 -2.292e-06 8.817e-07 -2.599 0.009336 **
## BILL_AMT3 3.403e-06 9.739e-07 3.494 0.000476 ***
## PAY_AMT2 -1.883e-05 2.165e-06 -8.701 < 2e-16 ***
## PAY_AMT3 -6.547e-06 1.451e-06 -4.512 6.43e-06 ***
## SEXFemale -1.300e-01 3.116e-02 -4.173 3.01e-05 ***
## EDUCATIONUniversity 4.005e-02 3.594e-02 1.114 0.265098
## EDUCATIONHigh School 4.175e-02 4.790e-02 0.872 0.383389
## EDUCATIONOther -1.173e+00 3.481e-01 -3.369 0.000756 ***
## AVG_UTILIZATION_RATE 4.255e-01 6.886e-02 6.180 6.42e-10 ***
## BILL_AMT1_PRED 5.046e-06 3.706e-07 13.614 < 2e-16 ***
## PAY_AMT1_PRED -1.134e-04 5.981e-06 -18.962 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24878 on 17945 degrees of freedom
## Residual deviance: 23408 on 17933 degrees of freedom
## AIC: 23434
##
## Number of Fisher Scoring iterations: 5
anova(mod3, test="Chisq")
equatiomatic::extract_eq(mod3)
\[ \log\left[ \frac { P( \operatorname{DEFAULT} = \operatorname{1} ) }{ 1 - P( \operatorname{DEFAULT} = \operatorname{1} ) } \right] = \alpha + \beta_{1}(\operatorname{LIMIT\_BAL}) + \beta_{2}(\operatorname{BILL\_AMT2}) + \beta_{3}(\operatorname{BILL\_AMT3}) + \beta_{4}(\operatorname{PAY\_AMT2}) + \beta_{5}(\operatorname{PAY\_AMT3}) + \beta_{6}(\operatorname{SEX}_{\operatorname{Female}}) + \beta_{7}(\operatorname{EDUCATION}_{\operatorname{University}}) + \beta_{8}(\operatorname{EDUCATION}_{\operatorname{High\ School}}) + \beta_{9}(\operatorname{EDUCATION}_{\operatorname{Other}}) + \beta_{10}(\operatorname{AVG\_UTILIZATION\_RATE}) + \beta_{11}(\operatorname{BILL\_AMT1\_PRED}) + \beta_{12}(\operatorname{PAY\_AMT1\_PRED}) \]
mod4 = glm(formula = DEFAULT~log(LIMIT_BAL+.01)+SEX+AGE+log(BILL_AMT1_PRED+.01)+log(BILL_AMT2+.01)+log(BILL_AMT3+.01)+log(PAY_AMT1_PRED+.01)+log(PAY_AMT2+.01)+log(PAY_AMT3+.01)+log(AVG_UTILIZATION_RATE+10)+PAY_2+PAY_3, family = binomial(link = "logit"), data=ccd.train)
summary(mod4)
##
## Call:
## glm(formula = DEFAULT ~ log(LIMIT_BAL + 0.01) + SEX + AGE + log(BILL_AMT1_PRED +
## 0.01) + log(BILL_AMT2 + 0.01) + log(BILL_AMT3 + 0.01) + log(PAY_AMT1_PRED +
## 0.01) + log(PAY_AMT2 + 0.01) + log(PAY_AMT3 + 0.01) + log(AVG_UTILIZATION_RATE +
## 10) + PAY_2 + PAY_3, family = binomial(link = "logit"), data = ccd.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0864 -0.9870 0.1906 1.0204 2.4550
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.784575 1.711273 -5.133 2.85e-07 ***
## log(LIMIT_BAL + 0.01) -0.149109 0.020383 -7.315 2.57e-13 ***
## SEXFemale -0.039514 0.034037 -1.161 0.245678
## AGE 0.005638 0.001856 3.038 0.002381 **
## log(BILL_AMT1_PRED + 0.01) 0.222171 0.016368 13.573 < 2e-16 ***
## log(BILL_AMT2 + 0.01) -0.074134 0.006665 -11.122 < 2e-16 ***
## log(BILL_AMT3 + 0.01) -0.002808 0.009235 -0.304 0.761105
## log(PAY_AMT1_PRED + 0.01) -0.610441 0.024556 -24.859 < 2e-16 ***
## log(PAY_AMT2 + 0.01) -0.025402 0.006731 -3.774 0.000161 ***
## log(PAY_AMT3 + 0.01) -0.045630 0.004185 -10.904 < 2e-16 ***
## log(AVG_UTILIZATION_RATE + 10) 5.990430 0.694560 8.625 < 2e-16 ***
## PAY_2 0.395521 0.021204 18.654 < 2e-16 ***
## PAY_3 0.185561 0.028576 6.494 8.38e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23904 on 17242 degrees of freedom
## Residual deviance: 20519 on 17230 degrees of freedom
## (703 observations deleted due to missingness)
## AIC: 20545
##
## Number of Fisher Scoring iterations: 4
anova(mod4, test="Chisq")
#Make the Predictions
ccd.test$DEFAULT_PROB_PREDICT = predict(mod4, ccd.test, type="response")
## Warning in log(BILL_AMT1_PRED + 0.01): NaNs produced
## Warning in log(BILL_AMT2 + 0.01): NaNs produced
## Warning in log(BILL_AMT3 + 0.01): NaNs produced
## Warning in log(PAY_AMT1_PRED + 0.01): NaNs produced
#Define the prediction threshold
threshold = .275
#Generate Prediction
ccd.test$DEFAULT_PREDICTION <- ifelse(ccd.test$DEFAULT_PROB_PREDICT >= threshold, 1, 0 )
table(factor(ccd.test$DEFAULT_PREDICTION),factor(ccd.test$DEFAULT))
##
## 0 1
## 0 2437 919
## 1 6198 7727
#This was for the single use case.
model_names = c("mod1", "mod2", "mod3", "mod4")
description = c("Everything", "Interactions", "Two-Stage", "Log Transformation")
Model_Performance<-as.data.frame(matrix(nrow=length(model_names),ncol=12))
for(i in 1:length(model_names)){
#Make the Predictions
ccd.test$DEFAULT_PROB_PREDICT = predict(eval(parse(text = paste('mod',i,sep=""))), ccd.test, type="response")
#Define the prediction threshold
threshold = .25
#Generate Prediction
ccd.test$DEFAULT_PREDICTION <- ifelse(ccd.test$DEFAULT_PROB_PREDICT >= threshold, 1, 0 )
#Confusion Matrix Build
table(factor(ccd.test$DEFAULT_PREDICTION),factor(ccd.test$DEFAULT))
confusion_table = table(factor(ccd.test$DEFAULT_PREDICTION),factor(ccd.test$DEFAULT))
confusion_values = c(confusion_table[1,1], confusion_table[2,1], confusion_table[1,2], confusion_table[2,2])
Label = c('TN','FP','FN','TP')
Prediction = c(0,1,0,1)
Actual = c(0,0,1,1)
confusion_frame = data.frame(Prediction, Actual, confusion_values, Label)
colnames(confusion_frame)<- c("Prediction", "Actual","Frequency", "Label")
confusion_frame
colnames(Model_Performance)<-c("Name","Accuracy","Precision","Recall","Specificity","F1_Score","FPR", "TP", "TN", "FP", "FN", "Prediction_Threshold")
#True Positives
TP = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="TP"])
Model_Performance$TP[i] = TP
#True Negatives
TN = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="TN"])
Model_Performance$TN[i] = TN
#False Positive
FP = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="FP"])
Model_Performance$FP[i] = FP
#False Negatives
FN = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="FN"])
Model_Performance$FN[i] = FN
#Model Name
Model_Performance$Name[i] = paste("Model ",i ,": ",description[i], sep = "")
#Accuracy
Model_Performance$Accuracy[i] = round((TP+TN)/(TP+TN+FP+FN),3)
#Precision
Model_Performance$Precision[i] = round(TP / (TP + FP),3)
#Recall, Sensitivity, True Positive Rate, We likely want to optimize for recall
Model_Performance$Recall[i] = round(TP / (TP + FN),3)
#Specificity
Model_Performance$Specificity[i] = round(TN / (TN + FP),3)
# F1 Score
Model_Performance$F1_Score[i] = round(2*TP / (2*TP + FP + FN),3)
#False Positive Rate
Model_Performance$FPR[i] = round(FP / (FP+TN),3)
#Model Prediction Threshold
Model_Performance$Prediction_Threshold[i] = threshold
}
Model_Performance
ccd <- read.csv("~/Documents/Academic/UVA_MSDS/STAT6021/Project_2/ccd.csv")
model_names = c("mod1", "mod2", "mod3", "mod4")
description = c("Everything", "Interactions", "Two-Stage", "Log Transformation")
scaling_type = c('No Scaling', 'MinMaxScaler', 'StandardScaler', 'RobustScaler')
balancing_type = c('No Adjustment', 'Over-Sample', 'Under-Sample')
Model_Performance<-as.data.frame(matrix(nrow=length(model_names)*length(seq(0,1,.025))*length(scaling_type)*length(balancing_type),ncol=15))
#i represent a row entry
i=1
s=1
b=1
#loop through scaling, 'No Scaling', 'MinMaxScaler', 'StandardScaler', 'RobustScaler'
for(s in scaling_type){
if(s == 'No Scaling'){
noscale <- function(x){x}
scale <- noscale
}else if(s == 'MinMaxScaler'){
minmaxscale<-function(x){(x-min(x))/(max(x)-min(x))}
scale <- minmaxscale
}else if(s == 'StandardScaler'){
standardscale<-function(x){(x-mean(x))/(sd(x))}
scale <- standardscale
}else if(s == 'RobustScaler'){
robustscale<-function(x){
Q <- quantile(x, probs=c(.25, .75), na.rm = FALSE)
return (x-median(x))/(Q[2]-Q[1])}
scale <- robustscale
}
ccd$DEFAULT <- as.numeric(ccd$default.payment.next.month)
ccd.scaled <- data.frame(ccd$LIMIT_BAL)
ccd.scaled$LIMIT_BAL<- scale(ccd$LIMIT_BAL)
ccd.scaled <- ccd.scaled %>% dplyr::select(-ccd.LIMIT_BAL)
ccd.scaled$AGE<- scale(ccd$AGE)
ccd.scaled$PAY_2<- scale(ccd$PAY_2)
ccd.scaled$PAY_3<- scale(ccd$PAY_3)
ccd.scaled$PAY_4<- scale(ccd$PAY_4)
ccd.scaled$PAY_5<- scale(ccd$PAY_5)
ccd.scaled$PAY_6<- scale(ccd$PAY_6)
ccd.scaled$BILL_AMT1<- scale(ccd$BILL_AMT1)
ccd.scaled$BILL_AMT2<- scale(ccd$BILL_AMT2)
ccd.scaled$BILL_AMT3<- scale(ccd$BILL_AMT3)
ccd.scaled$BILL_AMT4<- scale(ccd$BILL_AMT4)
ccd.scaled$BILL_AMT5<- scale(ccd$BILL_AMT5)
ccd.scaled$BILL_AMT6<- scale(ccd$BILL_AMT6)
ccd.scaled$PAY_AMT1<- scale(ccd$PAY_AMT1)
ccd.scaled$PAY_AMT2<- scale(ccd$PAY_AMT2)
ccd.scaled$PAY_AMT3<- scale(ccd$PAY_AMT3)
ccd.scaled$PAY_AMT4<- scale(ccd$PAY_AMT4)
ccd.scaled$PAY_AMT5<- scale(ccd$PAY_AMT5)
ccd.scaled$PAY_AMT6<- scale(ccd$PAY_AMT6)
ccd.scaled$DEFAULT <- as.numeric(ifelse(ccd$default.payment.next.month == 1,1,0))
ccd.scaled$AVG_UTILIZATION_RATE <- scale(ccd$AVG_UTILIZATION_RATE)
ccd.scaled$OVERSPENDER <- ifelse(ccd$OVERSPENDER == 1, 1,0)
ccd.scaled$TOTAL_EXCESS_BALANCE <- scale(ccd$TOTAL_EXCESS_BALANCE)
ccd.scaled$MARRIAGE <- ccd$MARRIAGE
ccd.scaled$EDUCATION <- ccd$EDUCATION
ccd.scaled$SEX <- ccd$SEX
#Handling an imbalanced dataset, may need to resample the unbalanced dataset to account for fewer defaults
#keep all of the defaults, randomly sample an equal number of non defaults
ccd_defaults<- subset(ccd.scaled, (ccd.scaled$DEFAULT == 1))
ccd_no_defaults <- subset(ccd.scaled, (ccd.scaled$DEFAULT == 0))
#loop through sampling, 'No Adjustment', 'Upsample', 'Downsample'
for(c in balancing_type){
if(c == 'No Adjustment'){
#Split Data into train and test, ccd will remain train, ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd.scaled)), rep(1, 0.5 *nrow(ccd.scaled))))
ccd.train<- ccd.scaled[Split_Data==0,]
ccd.test<- ccd.scaled[Split_Data==1,]
}else if(c == 'Over-Sample'){
#up sampling:
ccd_defaults_upsample <- sample_n(ccd_defaults, nrow(ccd_no_defaults), replace = T)
ccd_up_balanced <- data.frame(rbind(ccd_no_defaults,ccd_defaults_upsample))
#Split Data into train and test, ccd will remain train, ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_up_balanced)), rep(1, 0.5 *nrow(ccd_up_balanced))))
ccd.up.train<- ccd_up_balanced[Split_Data==0,]
ccd.up.test<- ccd_up_balanced[Split_Data==1,]
#synchronize
ccd.train <- ccd.up.train
ccd.test <- ccd.up.test
}else if(c == 'Under-Sample'){
#unbalanced dataset of defaults and no defaults, resample for maximum number of defaults and equivalent no defaults.
#down sampling:
ccd_no_defaults_downsample <- sample_n(ccd_no_defaults, nrow(ccd_defaults), replace = F)
ccd_down_balanced <- data.frame(rbind(ccd_defaults,ccd_no_defaults_downsample))
#Split Data into train and test, ccd will remain train, ccd.test will be test data
Split_Data = sample(c(rep(0, 0.5 * nrow(ccd_down_balanced)), rep(1, 0.5 *nrow(ccd_down_balanced))))
ccd.down.train<- ccd_down_balanced[Split_Data==0,]
ccd.down.test<- ccd_down_balanced[Split_Data==1,]
#synchronize
ccd.train <- ccd.down.train
ccd.test <- ccd.down.test
}
#Mod 1
mod1 = glm(formula = DEFAULT~LIMIT_BAL+BILL_AMT1+BILL_AMT2+BILL_AMT3+BILL_AMT4+BILL_AMT5+BILL_AMT6+PAY_AMT1+PAY_AMT2+PAY_AMT3+PAY_AMT4+PAY_AMT5+PAY_AMT6+AVG_UTILIZATION_RATE+PAY_2+PAY_3+PAY_4+PAY_5+PAY_6+SEX+MARRIAGE+EDUCATION+AGE, family = binomial(link = "logit"), data=ccd.train)
#Mod2
mod2 = glm(formula = DEFAULT~
BILL_AMT1+BILL_AMT2+BILL_AMT3+BILL_AMT4+BILL_AMT5+BILL_AMT6+
PAY_AMT1+PAY_AMT2+PAY_AMT3+PAY_AMT4+PAY_AMT5+PAY_AMT6+
PAY_2+PAY_3+PAY_4+PAY_5+PAY_6+
MARRIAGE+LIMIT_BAL*AGE+AVG_UTILIZATION_RATE, family = binomial(link = "logit"), data=ccd.train)
#Mod 3
Bill_Amt_Lin_Reg = lm(formula = BILL_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE, data=ccd.train)
summary(Bill_Amt_Lin_Reg)
anova(Bill_Amt_Lin_Reg)
Pay_Amt_Lin_Reg = lm( formula = PAY_AMT1 ~ LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE, data=ccd.train)
summary(Pay_Amt_Lin_Reg)
anova(Pay_Amt_Lin_Reg)
ccd.train$BILL_AMT1_PRED<- predict(Bill_Amt_Lin_Reg, ccd.test)
ccd.test$BILL_AMT1_PRED<- predict(Bill_Amt_Lin_Reg, ccd.test)
ccd.train$PAY_AMT1_PRED<- predict(Pay_Amt_Lin_Reg, ccd.test)
ccd.test$PAY_AMT1_PRED<- predict(Pay_Amt_Lin_Reg, ccd.test)
mod3 = glm(formula = DEFAULT~LIMIT_BAL+BILL_AMT2+BILL_AMT3+PAY_AMT2+PAY_AMT3+SEX+EDUCATION+AVG_UTILIZATION_RATE+BILL_AMT1_PRED+PAY_AMT1_PRED, family = binomial(link = "logit"), data=ccd.train)
#Mod 4
mod4 = glm(formula = DEFAULT~log(LIMIT_BAL+.01)+SEX+AGE+log(BILL_AMT1_PRED+.01)+log(BILL_AMT2+.01)+log(BILL_AMT3+.01)+log(PAY_AMT1_PRED+.01)+log(PAY_AMT2+.01)+log(PAY_AMT3+.01)+log(AVG_UTILIZATION_RATE+.01)+PAY_2+PAY_3, family = binomial(link = "logit"), data=ccd.train)
#loop through prediction interval
for(j in seq(0,1,.025)){
#loop through models
for(k in 1:length(model_names)){
#Make the Predictions
ccd.test$DEFAULT_PROB_PREDICT = predict(eval(parse(text = paste('mod',k,sep=""))), ccd.test, type="response")
#Define the prediction threshold
threshold = j
#Generate Prediction
ccd.test$DEFAULT_PREDICTION <- ifelse(ccd.test$DEFAULT_PROB_PREDICT >= threshold, 1, 0 )
#Confusion Matrix Build
confusion_table = table(factor(ccd.test$DEFAULT_PREDICTION),factor(ccd.test$DEFAULT))
if(length(confusion_table) == 4){
confusion_values = c(confusion_table[1,1], confusion_table[2,1], confusion_table[1,2], confusion_table[2,2])
Label = c('TN','FP','FN','TP')
Prediction = c(0,1,0,1)
Actual = c(0,0,1,1)
confusion_frame = data.frame(Prediction, Actual, confusion_values, Label)
colnames(confusion_frame)<- c("Prediction", "Actual","Frequency", "Label")
confusion_frame
}else if(length(confusion_table) == 2){
confusion_values = c(confusion_table[1,1], 0, confusion_table[1,2],0)
Label = c('TN','FP','FN','TP')
Prediction = c(0,1,0,1)
Actual = c(0,0,1,1)
confusion_frame = data.frame(Prediction, Actual, confusion_values, Label)
colnames(confusion_frame)<- c("Prediction", "Actual","Frequency", "Label")
confusion_frame
}
colnames(Model_Performance)<-c("Name","Scaling", "Sampling", "Accuracy","Precision","Recall","Specificity","F1_Score","FPR", "TP", "TN", "FP", "FN", "Prediction_Threshold", "AUC")
#True Positives
TP = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="TP"])
Model_Performance$TP[i] = TP
#True Negatives
TN = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="TN"])
Model_Performance$TN[i] = TN
#False Positive
FP = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="FP"])
Model_Performance$FP[i] = FP
#False Negatives
FN = as.numeric(confusion_frame$Frequency[confusion_frame$Label=="FN"])
Model_Performance$FN[i] = FN
#Model Name
Model_Performance$Name[i] = paste("Model ",k ,": ",description[k], sep = "")
#Accuracy
Model_Performance$Accuracy[i] = round((TP+TN)/(TP+TN+FP+FN),3)
#Precision
Model_Performance$Precision[i] = round(TP / (TP + FP),3)
#Recall, Sensitivity, True Positive Rate, We likely want to optimize for recall
Model_Performance$Recall[i] = round(TP / (TP + FN),3)
#Specificity
Model_Performance$Specificity[i] = round(TN / (TN + FP),3)
# F1 Score
Model_Performance$F1_Score[i] = round(2*TP / (2*TP + FP + FN),3)
#False Positive Rate
Model_Performance$FPR[i] = round(FP / (FP+TN),3)
#Model Prediction Threshold
Model_Performance$Prediction_Threshold[i] = threshold
#Area Under the Curve
Model_Performance$AUC[i] = auc(ccd.test$DEFAULT, ccd.test$DEFAULT_PREDICTION)[1]
#Record the scaling method
Model_Performance$Scaling[i] = s
#Record the sampling method for balancing an imbalanced dataset
Model_Performance$Sampling[i] = c
i=i+1
}
}
}
}
Model_Performance %>% arrange(-Accuracy)
write.csv(Model_Performance, "model_performance.csv", row.names = FALSE)
Model_Performance <- read.csv("~/Documents/Academic/UVA_MSDS/STAT6021/Project_2/model_performance.csv")
first = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 1: Everything"& Scaling == "No Scaling" & Sampling == "No Adjustment")
second = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 1: Everything"& Scaling == "RobustScaler" & Sampling == "Over-Sample")
third = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 2: Interactions"& Scaling == "MinMaxScaler" & Sampling == "Over-Sample")
fourth = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 2: Interactions"& Scaling == "No Scaling" & Sampling == "No Adjustment")
fifth = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 3: Two-Stage"& Scaling == "MinMaxScaler" & Sampling == "Over-Sample")
sixth = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 3: Two-Stage"& Scaling == "No Scaling" & Sampling == "Over-Sample")
seventh = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 4: Log Transformation"& Scaling == "No Scaling" & Sampling == "No Adjustment")
eighth = Model_Performance %>% mutate(combined = paste(Name, Scaling, Sampling)) %>% filter(Name == "Model 4: Log Transformation"& Scaling == "RobustScaler" & Sampling == "No Adjustment")
modper = rbind(first, second, third, fourth, fifth, sixth, seventh, eighth)
modper %>% group_by(combined) %>% summarise(max_AUC = max(AUC), Average_Accuracy = mean(Accuracy)) %>% arrange(-max_AUC)
seventh %>% arrange(-AUC) %>% filter(Prediction_Threshold == .275)