# Clear environment of variables and functions
rm(list = ls(all = TRUE)) 

# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)

1 Load Package and Data

1.1 Load Package

#load package 
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

library(janitor) # for tyble

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(lmPerm)  # for ANOVA
library(formattable)# For table formatting and table formatting functions
library(htmltools)
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(MultinomialCI) # To calculate multinomial confidence intervals for factor variables 

library(flexdashboard)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:Hmisc':
## 
##     subplot

## The following object is masked from 'package:formattable':
## 
##     style

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(dygraphs)
library(xts)  # to convert date data to xts data, xts is time series class

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

library(gganimate)
library(inspectdf)# Load auto EDA packages

1.2 Data Loading and Cleaning

#Import Data
df <- read.csv("MLD_Data_File.csv", header=TRUE)

#Look at the first 10 rows of data 
df$LOANPRC = df$LOANPRC*100
head(df,10)

##    MARRIED GDLIN OBRAT BLACK HISPAN MALE APPROVE   LOANPRC
## 1        0     1  34.5     0      0    .       1  75.42373
## 2        1     1  34.1     0      0    1       0  80.00000
## 3        0     1  26.0     0      0    1       1  89.51049
## 4        1     1  37.0     0      0    1       1  60.00000
## 5        1     1  32.1     0      0    1       1  89.55224
## 6        0     1  33.0     0      0    1       1  80.43478
## 7        0     1  36.0     0      0    1       1  89.80892
## 8        0     1  37.0     0      0    1       1  79.76879
## 9        1     1  30.7     0      0    1       1 100.30090
## 10       1     1  49.0     0      0    1       1  63.27014

df <- subset(df,MARRIED != "." & LOANPRC <= 100 & (GDLIN == 1 | GDLIN == 0))
head(df,10)

##    MARRIED GDLIN OBRAT BLACK HISPAN MALE APPROVE  LOANPRC
## 1        0     1  34.5     0      0    .       1 75.42373
## 2        1     1  34.1     0      0    1       0 80.00000
## 3        0     1  26.0     0      0    1       1 89.51049
## 4        1     1  37.0     0      0    1       1 60.00000
## 5        1     1  32.1     0      0    1       1 89.55224
## 6        0     1  33.0     0      0    1       1 80.43478
## 7        0     1  36.0     0      0    1       1 89.80892
## 8        0     1  37.0     0      0    1       1 79.76879
## 10       1     1  49.0     0      0    1       1 63.27014
## 11       1     1  37.0     0      0    1       1 50.47591

summary(df)

##  MARRIED      GDLIN            OBRAT           BLACK        
##  .:   0   Min.   :0.0000   Min.   : 0.00   Min.   :0.00000  
##  0: 669   1st Qu.:1.0000   1st Qu.:28.00   1st Qu.:0.00000  
##  1:1283   Median :1.0000   Median :33.00   Median :0.00000  
##           Mean   :0.9144   Mean   :32.37   Mean   :0.09939  
##           3rd Qu.:1.0000   3rd Qu.:37.00   3rd Qu.:0.00000  
##           Max.   :1.0000   Max.   :95.00   Max.   :1.00000  
##      HISPAN       MALE        APPROVE          LOANPRC       
##  Min.   :0.0000   .:  15   Min.   :0.0000   Min.   :  2.105  
##  1st Qu.:0.0000   0: 361   1st Qu.:1.0000   1st Qu.: 69.750  
##  Median :0.0000   1:1576   Median :1.0000   Median : 80.000  
##  Mean   :0.0543            Mean   :0.8796   Mean   : 76.072  
##  3rd Qu.:0.0000            3rd Qu.:1.0000   3rd Qu.: 89.820  
##  Max.   :1.0000            Max.   :1.0000   Max.   :100.000

#df$MARRIED <- as.double(df$MARRIED)
#df$MALE <- as.double(df$MALE)
#head(df,10)

#change column names to the easy understood ones
colnames(df) = c('MARRIED','Meet_Guidelines','Obligation','BLACK','HISPAN','MALE','APPROVE','Loan_Purchase')
head(df,10)

##    MARRIED Meet_Guidelines Obligation BLACK HISPAN MALE APPROVE
## 1        0               1       34.5     0      0    .       1
## 2        1               1       34.1     0      0    1       0
## 3        0               1       26.0     0      0    1       1
## 4        1               1       37.0     0      0    1       1
## 5        1               1       32.1     0      0    1       1
## 6        0               1       33.0     0      0    1       1
## 7        0               1       36.0     0      0    1       1
## 8        0               1       37.0     0      0    1       1
## 10       1               1       49.0     0      0    1       1
## 11       1               1       37.0     0      0    1       1
##    Loan_Purchase
## 1       75.42373
## 2       80.00000
## 3       89.51049
## 4       60.00000
## 5       89.55224
## 6       80.43478
## 7       89.80892
## 8       79.76879
## 10      63.27014
## 11      50.47591

2 Basic EDA

summary(df)

##  MARRIED  Meet_Guidelines    Obligation        BLACK        
##  .:   0   Min.   :0.0000   Min.   : 0.00   Min.   :0.00000  
##  0: 669   1st Qu.:1.0000   1st Qu.:28.00   1st Qu.:0.00000  
##  1:1283   Median :1.0000   Median :33.00   Median :0.00000  
##           Mean   :0.9144   Mean   :32.37   Mean   :0.09939  
##           3rd Qu.:1.0000   3rd Qu.:37.00   3rd Qu.:0.00000  
##           Max.   :1.0000   Max.   :95.00   Max.   :1.00000  
##      HISPAN       MALE        APPROVE       Loan_Purchase    
##  Min.   :0.0000   .:  15   Min.   :0.0000   Min.   :  2.105  
##  1st Qu.:0.0000   0: 361   1st Qu.:1.0000   1st Qu.: 69.750  
##  Median :0.0000   1:1576   Median :1.0000   Median : 80.000  
##  Mean   :0.0543            Mean   :0.8796   Mean   : 76.072  
##  3rd Qu.:0.0000            3rd Qu.:1.0000   3rd Qu.: 89.820  
##  Max.   :1.0000            Max.   :1.0000   Max.   :100.000

#correlation table
 df %>% 
  select_if(is.numeric) %>% 
  as.matrix() %>% 
  rcorr()

##                 Meet_Guidelines Obligation BLACK HISPAN APPROVE
## Meet_Guidelines            1.00      -0.15 -0.22  -0.04    0.62
## Obligation                -0.15       1.00  0.11   0.03   -0.17
## BLACK                     -0.22       0.11  1.00  -0.08   -0.21
## HISPAN                    -0.04       0.03 -0.08   1.00   -0.07
## APPROVE                    0.62      -0.17 -0.21  -0.07    1.00
## Loan_Purchase             -0.13       0.21  0.13   0.11   -0.14
##                 Loan_Purchase
## Meet_Guidelines         -0.13
## Obligation               0.21
## BLACK                    0.13
## HISPAN                   0.11
## APPROVE                 -0.14
## Loan_Purchase            1.00
## 
## n= 1952 
## 
## 
## P
##                 Meet_Guidelines Obligation BLACK  HISPAN APPROVE
## Meet_Guidelines                 0.0000     0.0000 0.0783 0.0000 
## Obligation      0.0000                     0.0000 0.2364 0.0000 
## BLACK           0.0000          0.0000            0.0004 0.0000 
## HISPAN          0.0783          0.2364     0.0004        0.0017 
## APPROVE         0.0000          0.0000     0.0000 0.0017        
## Loan_Purchase   0.0000          0.0000     0.0000 0.0000 0.0000 
##                 Loan_Purchase
## Meet_Guidelines 0.0000       
## Obligation      0.0000       
## BLACK           0.0000       
## HISPAN          0.0000       
## APPROVE         0.0000       
## Loan_Purchase

## correlation graph
df %>% 
  select(MARRIED,Obligation,MALE,APPROVE,Loan_Purchase, HISPAN, BLACK) %>% 
  inspect_cor() %>% 
  show_plot()

library(DataExplorer)

df %>% 
  plot_bar()

Comments:

“.” appears in Married and Male

df %>% 
  plot_histogram()

comments: 1. Loan purchase value shouldn’t over 1

Meet_Guidelines: outlier 666

3 Sub Sample for different Races

df_white <- subset(df, MALE != "." & HISPAN == 0 & BLACK == 0)
head(df_white,10)

##    MARRIED Meet_Guidelines Obligation BLACK HISPAN MALE APPROVE
## 2        1               1       34.1     0      0    1       0
## 3        0               1       26.0     0      0    1       1
## 4        1               1       37.0     0      0    1       1
## 5        1               1       32.1     0      0    1       1
## 6        0               1       33.0     0      0    1       1
## 7        0               1       36.0     0      0    1       1
## 8        0               1       37.0     0      0    1       1
## 10       1               1       49.0     0      0    1       1
## 11       1               1       37.0     0      0    1       1
## 12       1               1       37.1     0      0    1       1
##    Loan_Purchase
## 2       80.00000
## 3       89.51049
## 4       60.00000
## 5       89.55224
## 6       80.43478
## 7       89.80892
## 8       79.76879
## 10      63.27014
## 11      50.47591
## 12      70.37037

df_hisp <- subset(df, MALE != "." & HISPAN == 1)
head(df_hisp,10)

##     MARRIED Meet_Guidelines Obligation BLACK HISPAN MALE APPROVE
## 30        1               1       34.8     0      1    1       1
## 49        1               0       30.0     0      1    1       0
## 54        0               1       38.0     0      1    0       1
## 56        0               1       25.0     0      1    1       1
## 69        1               0       22.0     0      1    1       1
## 74        1               1       39.7     0      1    1       1
## 100       1               1       22.0     0      1    0       1
## 111       1               1       36.0     0      1    0       1
## 116       0               1       32.2     0      1    0       1
## 120       1               1       19.0     0      1    1       0
##     Loan_Purchase
## 30       80.00000
## 49       94.91525
## 54       89.39394
## 56       79.20792
## 69       80.00000
## 74       84.06780
## 100      85.33334
## 111      90.00000
## 116      90.30303
## 120      76.31161

df_black <- subset(df,MALE != "." & BLACK == 1)
head(df_black,10)

##    MARRIED Meet_Guidelines Obligation BLACK HISPAN MALE APPROVE
## 16       1               1       37.1     1      0    1       1
## 27       1               1       34.0     1      0    1       1
## 35       0               0       38.2     1      0    0       0
## 36       0               1       27.0     1      0    0       1
## 46       1               0       34.0     1      0    1       0
## 50       0               1       38.2     1      0    1       1
## 51       1               1       35.0     1      0    1       1
## 80       0               1       34.0     1      0    0       1
## 81       1               1       38.9     1      0    1       1
## 86       0               1       25.0     1      0    0       1
##    Loan_Purchase
## 16      89.74359
## 27      80.00000
## 35      95.00000
## 36      75.15528
## 46      80.00000
## 50      67.82609
## 51      95.56962
## 80      88.59060
## 81      95.29412
## 86      80.00000

3.1 White

summary (df_white)

##  MARRIED  Meet_Guidelines    Obligation        BLACK       HISPAN 
##  .:   0   Min.   :0.0000   Min.   : 0.00   Min.   :0   Min.   :0  
##  0: 557   1st Qu.:1.0000   1st Qu.:27.60   1st Qu.:0   1st Qu.:0  
##  1:1084   Median :1.0000   Median :32.50   Median :0   Median :0  
##           Mean   :0.9391   Mean   :31.99   Mean   :0   Mean   :0  
##           3rd Qu.:1.0000   3rd Qu.:36.50   3rd Qu.:0   3rd Qu.:0  
##           Max.   :1.0000   Max.   :95.00   Max.   :0   Max.   :0  
##  MALE        APPROVE       Loan_Purchase    
##  .:   0   Min.   :0.0000   Min.   :  2.105  
##  0: 291   1st Qu.:1.0000   1st Qu.: 67.708  
##  1:1350   Median :1.0000   Median : 79.861  
##           Mean   :0.9098   Mean   : 74.782  
##           3rd Qu.:1.0000   3rd Qu.: 89.147  
##           Max.   :1.0000   Max.   :100.000

grid.arrange(
df_white %>%
ggplot(df_white, mapping = aes(APPROVE))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 1.5, size = 3.0, color = "white") +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " "),

df_white %>%
  filter(df_white$Meet_Guidelines != 666)%>%
  ggplot(df_white, mapping = aes(Meet_Guidelines, fill=Meet_Guidelines))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 1, size = 2.9,color = "white" ) +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " "),

df_white %>%
  ggplot(df_white, mapping = aes(MALE))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 3, size = 3.0) +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " "),

df_white %>%
  ggplot(df_white, mapping = aes(MARRIED))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 5, size = 3.0) +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " ")
  

, ncol = 2)

3.2 Hispanic

summary (df_hisp)

##  MARRIED Meet_Guidelines    Obligation        BLACK       HISPAN  MALE  
##  .: 0    Min.   :0.0000   Min.   :14.60   Min.   :0   Min.   :1   .: 0  
##  0:30    1st Qu.:1.0000   1st Qu.:29.00   1st Qu.:0   1st Qu.:1   0:20  
##  1:74    Median :1.0000   Median :33.00   Median :0   Median :1   1:84  
##          Mean   :0.8654   Mean   :33.32   Mean   :0   Mean   :1         
##          3rd Qu.:1.0000   3rd Qu.:38.05   3rd Qu.:0   3rd Qu.:1         
##          Max.   :1.0000   Max.   :62.00   Max.   :0   Max.   :1         
##     APPROVE       Loan_Purchase   
##  Min.   :0.0000   Min.   : 40.09  
##  1st Qu.:1.0000   1st Qu.: 80.00  
##  Median :1.0000   Median : 89.31  
##  Mean   :0.7788   Mean   : 83.91  
##  3rd Qu.:1.0000   3rd Qu.: 90.27  
##  Max.   :1.0000   Max.   :100.00

grid.arrange(
  
df_hisp %>%
ggplot(df_hisp, mapping = aes(APPROVE))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 1.5, size = 3.0, color = "white") +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " "),

df_hisp %>%
ggplot(df_hisp, mapping = aes(Meet_Guidelines, fill=Meet_Guidelines))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 1.5, size = 3.0,color = "white" ) +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " "),

df_hisp %>%
  ggplot(df_hisp, mapping = aes(MALE))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 3, size = 3.0) +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " "),

df_hisp %>%
  ggplot(df_hisp, mapping = aes(MARRIED))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 5, size = 3.0) +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " ")
  

, ncol = 2)

3.3 Black

summary (df_black)

##  MARRIED Meet_Guidelines   Obligation        BLACK       HISPAN  MALE   
##  .:  0   Min.   :0.000   Min.   : 5.60   Min.   :1   Min.   :0   .:  0  
##  0: 75   1st Qu.:0.000   1st Qu.:31.00   1st Qu.:1   1st Qu.:0   0: 50  
##  1:117   Median :1.000   Median :35.00   Median :1   Median :0   1:142  
##          Mean   :0.724   Mean   :35.03   Mean   :1   Mean   :0          
##          3rd Qu.:1.000   3rd Qu.:38.90   3rd Qu.:1   3rd Qu.:0          
##          Max.   :1.000   Max.   :63.00   Max.   :1   Max.   :0          
##     APPROVE       Loan_Purchase   
##  Min.   :0.0000   Min.   : 28.99  
##  1st Qu.:0.0000   1st Qu.: 80.00  
##  Median :1.0000   Median : 86.06  
##  Mean   :0.6667   Mean   : 82.89  
##  3rd Qu.:1.0000   3rd Qu.: 90.23  
##  Max.   :1.0000   Max.   :100.00

grid.arrange(
  
df_black %>%
ggplot(df_black, mapping = aes(APPROVE))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 1.5, size = 3.0, color = "white") +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " "),

df_black %>%
ggplot(df_black, mapping = aes(Meet_Guidelines, fill=Meet_Guidelines))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 1.5, size = 3.0,color = "white" ) +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " "),

df_black %>%
  ggplot(df_black, mapping = aes(MALE))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 3, size = 3.0) +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " "),

df_black %>%
  ggplot(df_black, mapping = aes(MARRIED))+
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  geom_text(aes(label = ((..count..)/sum(..count..))*100,
                   y= ..prop.. ), stat='count',vjust = 5, size = 3.0) +
  scale_y_continuous(labels = scales::percent) +
  labs(y= " ")
  

, ncol = 2)

4 Logit Model

#Estimate Logit Model
LogitModel = glm(APPROVE ~ MARRIED + Meet_Guidelines + Obligation + BLACK + HISPAN + Loan_Purchase, data = df, 
                 family = "binomial")
summary(LogitModel)

## 
## Call:
## glm(formula = APPROVE ~ MARRIED + Meet_Guidelines + Obligation + 
##     BLACK + HISPAN + Loan_Purchase, family = "binomial", data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8975   0.2423   0.3071   0.3683   2.3487  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.253682   0.686366   1.827 0.067768 .  
## MARRIED1         0.477739   0.184576   2.588 0.009645 ** 
## Meet_Guidelines  3.776275   0.220505  17.126  < 2e-16 ***
## Obligation      -0.033849   0.010561  -3.205 0.001351 ** 
## BLACK           -0.861500   0.242228  -3.557 0.000376 ***
## HISPAN          -0.853023   0.322550  -2.645 0.008178 ** 
## Loan_Purchase   -0.016167   0.007016  -2.304 0.021198 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1435.50  on 1951  degrees of freedom
## Residual deviance:  927.23  on 1945  degrees of freedom
## AIC: 941.23
## 
## Number of Fisher Scoring iterations: 6

exp(coef(LogitModel))

##     (Intercept)        MARRIED1 Meet_Guidelines      Obligation 
##       3.5032185       1.6124238      43.6531243       0.9667177 
##           BLACK          HISPAN   Loan_Purchase 
##       0.4225276       0.4261247       0.9839626

#Generate Log-Likelihood
logLik(LogitModel)

## 'log Lik.' -463.6162 (df=7)

#Define prototypical loan applicants (you will need more than 3)
prototype1 <- data.frame(Obligation=median(df$Obligation), MARRIED = "1", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 1, BLACK = 1, HISPAN = 0)
prototype2 <- data.frame(Obligation=median(df$Obligation), MARRIED = "1", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 1, BLACK = 0, HISPAN = 1)
prototype3 <- data.frame(Obligation=median(df$Obligation), MARRIED = "1", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 1, BLACK = 0, HISPAN = 0)

prototype4 <- data.frame(Obligation=median(df$Obligation), MARRIED = "1", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 0, BLACK = 1, HISPAN = 0)
prototype5 <- data.frame(Obligation=median(df$Obligation), MARRIED = "1", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 0, BLACK = 0, HISPAN = 1)
prototype6 <- data.frame(Obligation=median(df$Obligation), MARRIED = "1", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 0, BLACK = 0, HISPAN = 0)

prototype7 <- data.frame(Obligation=median(df$Obligation), MARRIED = "0", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 1, BLACK = 1, HISPAN = 0)
prototype8 <- data.frame(Obligation=median(df$Obligation), MARRIED = "0", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 1, BLACK = 0, HISPAN = 1)
prototype9 <- data.frame(Obligation=median(df$Obligation), MARRIED = "0", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 1, BLACK = 0, HISPAN = 0)

prototype10 <- data.frame(Obligation=median(df$Obligation), MARRIED = "0", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 0, BLACK = 1, HISPAN = 0)
prototype11 <- data.frame(Obligation=median(df$Obligation), MARRIED = "0", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 0, BLACK = 0, HISPAN = 1)
prototype12 <- data.frame(Obligation=median(df$Obligation), MARRIED = "0", Loan_Purchase = median(df$Loan_Purchase), 
    Meet_Guidelines = 0, BLACK = 0, HISPAN = 0)

#Predict probabilities for prototypical individuals
prototype1$predictedprob <- predict (LogitModel, newdata = prototype1, type ="response")
prototype2$predictedprob <- predict (LogitModel, newdata = prototype2, type ="response")
prototype3$predictedprob <- predict (LogitModel, newdata = prototype3, type ="response")


prototype4$predictedprob <- predict (LogitModel, newdata = prototype4, type ="response")
prototype5$predictedprob <- predict (LogitModel, newdata = prototype5, type ="response")
prototype6$predictedprob <- predict (LogitModel, newdata = prototype6, type ="response")

prototype7$predictedprob <- predict (LogitModel, newdata = prototype7, type ="response")
prototype8$predictedprob <- predict (LogitModel, newdata = prototype8, type ="response")
prototype9$predictedprob <- predict (LogitModel, newdata = prototype9, type ="response")

prototype10$predictedprob <- predict (LogitModel, newdata = prototype10, type ="response")
prototype11$predictedprob <- predict (LogitModel, newdata = prototype11, type ="response")
prototype12$predictedprob <- predict (LogitModel, newdata = prototype12, type ="response")


rbind.data.frame(prototype1,prototype2,prototype3,prototype4,prototype5,prototype6,prototype7,prototype8,prototype9,prototype10,prototype11,prototype12)

##    Obligation MARRIED Loan_Purchase Meet_Guidelines BLACK HISPAN
## 1          33       1            80               1     1      0
## 2          33       1            80               1     0      1
## 3          33       1            80               1     0      0
## 4          33       1            80               0     1      0
## 5          33       1            80               0     0      1
## 6          33       1            80               0     0      0
## 7          33       0            80               1     1      0
## 8          33       0            80               1     0      1
## 9          33       0            80               1     0      0
## 10         33       0            80               0     1      0
## 11         33       0            80               0     0      1
## 12         33       0            80               0     0      0
##    predictedprob
## 1      0.9034182
## 2      0.9041553
## 3      0.9567810
## 4      0.1764655
## 5      0.1777009
## 6      0.3364891
## 7      0.8529661
## 8      0.8540261
## 9      0.9321097
## 10     0.1173033
## 11     0.1181839
## 12     0.2392642

5 Probit Model

#Estimate Probit Model
ProbitModel = glm(APPROVE ~ MARRIED + Meet_Guidelines + Obligation + BLACK + HISPAN + Loan_Purchase, data = df, 
                  family = "binomial" (link = "probit"))
summary(ProbitModel)

## 
## Call:
## glm(formula = APPROVE ~ MARRIED + Meet_Guidelines + Obligation + 
##     BLACK + HISPAN + Loan_Purchase, family = binomial(link = "probit"), 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9489   0.2372   0.3058   0.3710   2.2906  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.442016   0.337821   1.308 0.190727    
## MARRIED1         0.236340   0.091676   2.578 0.009937 ** 
## Meet_Guidelines  2.174365   0.122768  17.711  < 2e-16 ***
## Obligation      -0.016137   0.005449  -2.962 0.003060 ** 
## BLACK           -0.445506   0.127706  -3.489 0.000486 ***
## HISPAN          -0.439135   0.167857  -2.616 0.008893 ** 
## Loan_Purchase   -0.007493   0.003283  -2.282 0.022471 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1435.50  on 1951  degrees of freedom
## Residual deviance:  927.16  on 1945  degrees of freedom
## AIC: 941.16
## 
## Number of Fisher Scoring iterations: 6

exp(coef(ProbitModel))

##     (Intercept)        MARRIED1 Meet_Guidelines      Obligation 
##       1.5558400       1.2666049       8.7965935       0.9839925 
##           BLACK          HISPAN   Loan_Purchase 
##       0.6405003       0.6445936       0.9925346

#Generate Log-Likelihood
logLik(ProbitModel)

## 'log Lik.' -463.58 (df=7)

#Predict probabilities for prototypical individuals
prototype1$predictedprob <- predict (ProbitModel, newdata = prototype1, type ="response")
prototype2$predictedprob <- predict (ProbitModel, newdata = prototype2, type ="response")
prototype3$predictedprob <- predict (ProbitModel, newdata = prototype3, type ="response")


prototype4$predictedprob <- predict (ProbitModel, newdata = prototype4, type ="response")
prototype5$predictedprob <- predict (ProbitModel, newdata = prototype5, type ="response")
prototype6$predictedprob <- predict (ProbitModel, newdata = prototype6, type ="response")

prototype7$predictedprob <- predict (ProbitModel, newdata = prototype7, type ="response")
prototype8$predictedprob <- predict (ProbitModel, newdata = prototype8, type ="response")
prototype9$predictedprob <- predict (ProbitModel, newdata = prototype9, type ="response")

prototype10$predictedprob <- predict (ProbitModel, newdata = prototype10, type ="response")
prototype11$predictedprob <- predict (ProbitModel, newdata = prototype11, type ="response")
prototype12$predictedprob <- predict (ProbitModel, newdata = prototype12, type ="response")


rbind.data.frame(prototype1,prototype2,prototype3,prototype4,prototype5,prototype6,prototype7,prototype8,prototype9,prototype10,prototype11,prototype12)

##    Obligation MARRIED Loan_Purchase Meet_Guidelines BLACK HISPAN
## 1          33       1            80               1     1      0
## 2          33       1            80               1     0      1
## 3          33       1            80               1     0      0
## 4          33       1            80               0     1      0
## 5          33       1            80               0     0      1
## 6          33       1            80               0     0      0
## 7          33       0            80               1     1      0
## 8          33       0            80               1     0      1
## 9          33       0            80               1     0      0
## 10         33       0            80               0     1      0
## 11         33       0            80               0     0      1
## 12         33       0            80               0     0      0
##    predictedprob
## 1      0.8988841
## 2      0.9000065
## 3      0.9573496
## 4      0.1842875
## 5      0.1859887
## 6      0.3250439
## 7      0.8505694
## 8      0.8520460
## 9      0.9311465
## 10     0.1280859
## 11     0.1294245
## 12     0.2451033

MLD Assignment

02/23/2020

1 Load Package and Data

1.1 Load Package

1.2 Data Loading and Cleaning

2 Basic EDA

3 Sub Sample for different Races

3.1 White

3.2 Hispanic

3.3 Black

4 Logit Model

5 Probit Model