If any issues, questions or suggestions feel free to reach me out via e-mail or Linkedin. You can also visit my Github.

This is R replication of the code and exercises from the Udemy course “Credit Risk Modeling in Python 2022”.

if(!require('pacman')) install.packages('pacman')
pacman::p_load(dplyr, tidyr, ggplot2)
options(scipen = 20)

We load data which was prepared in that blog post. Our dataset contains 43236 rows with defaulted loans and 45 attributes:
- credit conversion factor (CCF) as target variable. CCF is the proportion from the original amount of the loan that is still outstanding when the borrower defaulted.
- 29 dummy variables derived 6 categorial variables in that blog post
- 15 continuous variables.

data <- read.csv('ead_lgd_preprocessed_data.csv') %>%
  select(-good_bad, -recovery_rate)
data %>% dim()
[1] 43236    45
data %>% colnames()
 [1] "grade_A"                                             "grade_B"                                            
 [3] "grade_C"                                             "grade_D"                                            
 [5] "grade_E"                                             "grade_F"                                            
 [7] "home_ownership_OWN"                                  "home_ownership_MORTGAGE"                            
 [9] "addr_state_NM_VA"                                    "addr_state_NY"                                      
[11] "addr_state_OK_TN_MO_LA_MD_NC"                        "addr_state_CA"                                      
[13] "addr_state_UT_KY_AZ_NJ"                              "addr_state_AR_MI_PA_OH_MN"                          
[15] "addr_state_RI_MA_DE_SD_IN"                           "addr_state_GA_WA_OR"                                
[17] "addr_state_WI_MT"                                    "addr_state_IL_CT"                                   
[19] "addr_state_KS_SC_CO_VT_AK_MS"                        "addr_state_TX"                                      
[21] "addr_state_WV_NH_WY_DC_ME_ID"                        "verification_status_Source.Verified"                
[23] "verification_status_Not.Verified"                    "purpose_renewable_energy_moving_other_house_medical"
[25] "purpose_wedding_vacation_debt_consolidation"         "purpose_major_purchase_home_improvement"            
[27] "purpose_car_credit_card"                             "initial_list_status_w"                              
[29] "term_36"                                             "CCF"                                                
[31] "emp_length"                                          "months_since_issue_d"                               
[33] "int_rate"                                            "months_since_earliest_cr_line"                      
[35] "delinq_2yrs"                                         "inq_last_6mths"                                     
[37] "open_acc"                                            "pub_rec"                                            
[39] "total_acc"                                           "acc_now_delinq"                                     
[41] "total_rev_hi_lim"                                    "annual_inc"                                         
[43] "mths_since_last_delinq"                              "dti"                                                
[45] "mths_since_last_record"                             

We split our data into training set and test set in proportions 80:20.

set.seed(2137)
n_obs <- nrow(data)
train_index <- sample(1:n_obs, 0.8 * n_obs)

We fit linear regression model.

lin_model_1 <- lm(
  CCF ~.
  ,data = data
  ,subset = train_index
)

Calculate adjusted multivariate p-values using p.adjust() function. Based on these we exclude from the linear regression model the following variables:
- categorial: home_onwership, addr_state
- continuous: months_since_earliest_cr_line, delinq_2yrs, pub_rec, acc_now_delinq, total_rev_hi_lim, annual_inc, dti, mths_since_last_record.

lin_p_values_multi <- lin_model_1 %>%
  summary() %>%
  coef() %>%
  .[,4] %>% 
  p.adjust()

lin_p_values_multi %>% round(5) %>% as.data.frame()

For those we fit linear regression model. We obtain pretty simple model, with \(R^2=0.27\).

lin_model_2 <- lm(
  CCF ~.
  ,data = data %>% select(
    -starts_with('home_ownership'), -starts_with('addr_state')
    ,-months_since_earliest_cr_line, -delinq_2yrs, -pub_rec, -acc_now_delinq
    ,-total_rev_hi_lim, -annual_inc, -dti, -mths_since_last_record
  )
  ,subset = train_index
)

lin_model_2 %>% summary()

Call:
lm(formula = CCF ~ ., data = data %>% select(-starts_with("home_ownership"), 
    -starts_with("addr_state"), -months_since_earliest_cr_line, 
    -delinq_2yrs, -pub_rec, -acc_now_delinq, -total_rev_hi_lim, 
    -annual_inc, -dti, -mths_since_last_record), subset = train_index)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.91278 -0.08263  0.01563  0.10818  0.53265 

Coefficients:
                                                       Estimate  Std. Error t value             Pr(>|t|)    
(Intercept)                                          1.44297826  0.02130515  67.729 < 0.0000000000000002 ***
grade_A                                             -0.29603396  0.01391984 -21.267 < 0.0000000000000002 ***
grade_B                                             -0.22541762  0.01131738 -19.918 < 0.0000000000000002 ***
grade_C                                             -0.16243104  0.00972739 -16.698 < 0.0000000000000002 ***
grade_D                                             -0.11297473  0.00852547 -13.251 < 0.0000000000000002 ***
grade_E                                             -0.06225462  0.00776177  -8.021  0.00000000000000108 ***
grade_F                                             -0.01870368  0.00774027  -2.416             0.015679 *  
verification_status_Source.Verified                  0.00928306  0.00225456   4.117  0.00003839751701151 ***
verification_status_Not.Verified                     0.00257510  0.00241944   1.064             0.287185    
purpose_renewable_energy_moving_other_house_medical -0.02099876  0.00583318  -3.600             0.000319 ***
purpose_wedding_vacation_debt_consolidation         -0.04709230  0.00521639  -9.028 < 0.0000000000000002 ***
purpose_major_purchase_home_improvement             -0.03680992  0.00615485  -5.981  0.00000000224443542 ***
purpose_car_credit_card                             -0.06127883  0.00555439 -11.033 < 0.0000000000000002 ***
initial_list_status_w                                0.01338708  0.00229566   5.831  0.00000000554321420 ***
term_36                                             -0.11946865  0.00232570 -51.369 < 0.0000000000000002 ***
emp_length                                          -0.00170901  0.00024970  -6.844  0.00000000000781982 ***
months_since_issue_d                                -0.00442044  0.00007222 -61.207 < 0.0000000000000002 ***
int_rate                                            -0.01124318  0.00070344 -15.983 < 0.0000000000000002 ***
inq_last_6mths                                       0.00992159  0.00074944  13.239 < 0.0000000000000002 ***
open_acc                                            -0.00197173  0.00026046  -7.570  0.00000000000003823 ***
total_acc                                            0.00041327  0.00011343   3.643             0.000270 ***
mths_since_last_delinq                              -0.00024421  0.00004165  -5.863  0.00000000458036046 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1714 on 34566 degrees of freedom
Multiple R-squared:  0.2704,    Adjusted R-squared:   0.27 
F-statistic:   610 on 21 and 34566 DF,  p-value: < 0.00000000000000022

As we see below residuals of the linear regression model ain’t behave well. These have heavy left tail and are heteroscedastic. In another blog post we will estimate beta regression instead of lienar regression.

par(mfrow = c(2,2))
lin_model_2 %>% plot(which = 1:3)
acf(lin_model_2$residuals, main = '')

lin_predict <- predict(lin_model_2
                       ,data[-train_index,])

CCF_summary = data.frame(
  Actual = data$CCF[-train_index]
  ,Predicted = lin_predict
) %>%
  pivot_longer(cols = 1:2, names_to = 'CCF', values_to = 'Value')

ggplot(CCF_summary, aes(Value, fill = CCF)) +
  geom_density(alpha= 0.5) +
  theme_bw() +
  labs(x = '', y ='', title = 'Densities of Actual and Predicted CCF')

LS0tDQp0aXRsZTogIkNyZWRpdCBSaXNrIE1vZGVsaW5nIC0gRUFEIE1vZGVsIEVzdGltYXRpb24gd2l0aCBMaW5lYXIgUmVncmVzc2lvbiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCklmIGFueSBpc3N1ZXMsIHF1ZXN0aW9ucyBvciBzdWdnZXN0aW9ucyBmZWVsIGZyZWUgdG8gcmVhY2ggbWUgb3V0IHZpYSBlLW1haWwgPHdpZWN6eW5za2lwYXdlbEBnbWFpbC5jb20+IG9yIFtMaW5rZWRpbl0oaHR0cHM6Ly93d3cubGlua2VkaW4uY29tL2luL3Bhd2VsLXdpZWN6eW5za2kvKS4gWW91IGNhbiBhbHNvIHZpc2l0IG15IFtHaXRodWJdKGh0dHBzOi8vZ2l0aHViLmNvbS9wYXdlbC13aWVjenluc2tpKS4NCg0KVGhpcyBpcyBSIHJlcGxpY2F0aW9uIG9mIHRoZSBjb2RlIGFuZCBleGVyY2lzZXMgZnJvbSB0aGUgVWRlbXkgY291cnNlIFsiQ3JlZGl0IFJpc2sgTW9kZWxpbmcgaW4gUHl0aG9uIDIwMjIiXShodHRwczovL3d3dy51ZGVteS5jb20vY291cnNlL2NyZWRpdC1yaXNrLW1vZGVsaW5nLWluLXB5dGhvbi8pLg0KDQpgYGB7ciBsb2FkX2xpYnJhcmllc30NCmlmKCFyZXF1aXJlKCdwYWNtYW4nKSkgaW5zdGFsbC5wYWNrYWdlcygncGFjbWFuJykNCnBhY21hbjo6cF9sb2FkKGRwbHlyLCB0aWR5ciwgZ2dwbG90MikNCm9wdGlvbnMoc2NpcGVuID0gMjApDQpgYGANCg0KV2UgbG9hZCBkYXRhIHdoaWNoIHdhcyBwcmVwYXJlZCBpbiBbdGhhdCBibG9nIHBvc3RdKGh0dHBzOi8vcnB1YnMuY29tL3Bhd2VsLXdpZWN6eW5za2kvODk4MDA2KS4gT3VyIGRhdGFzZXQgY29udGFpbnMgNDMyMzYgcm93cyB3aXRoIGRlZmF1bHRlZCBsb2FucyBhbmQgNDUgYXR0cmlidXRlczogXA0KIC0gY3JlZGl0IGNvbnZlcnNpb24gZmFjdG9yIChDQ0YpIGFzIHRhcmdldCB2YXJpYWJsZS4gKipDQ0YgaXMgdGhlIHByb3BvcnRpb24gZnJvbSB0aGUgb3JpZ2luYWwgYW1vdW50IG9mIHRoZSBsb2FuIHRoYXQgaXMgc3RpbGwgb3V0c3RhbmRpbmcgd2hlbiB0aGUgYm9ycm93ZXIgZGVmYXVsdGVkKiouIFwNCiAtIDI5IGR1bW15IHZhcmlhYmxlcyBkZXJpdmVkIDYgY2F0ZWdvcmlhbCB2YXJpYWJsZXMgaW4gW3RoYXQgYmxvZyBwb3N0XShodHRwczovL3JwdWJzLmNvbS9wYXdlbC13aWVjenluc2tpLzg5MTgxNykgXA0KIC0gMTUgY29udGludW91cyB2YXJpYWJsZXMuDQogDQpgYGB7ciBsb2FkX2RhdGF9DQpkYXRhIDwtIHJlYWQuY3N2KCdlYWRfbGdkX3ByZXByb2Nlc3NlZF9kYXRhLmNzdicpICU+JQ0KICBzZWxlY3QoLWdvb2RfYmFkLCAtcmVjb3ZlcnlfcmF0ZSkNCmRhdGEgJT4lIGRpbSgpDQpkYXRhICU+JSBjb2xuYW1lcygpDQpgYGANCg0KV2Ugc3BsaXQgb3VyIGRhdGEgaW50byB0cmFpbmluZyBzZXQgYW5kIHRlc3Qgc2V0IGluIHByb3BvcnRpb25zIDgwOjIwLg0KYGBge3IgZGF0YV9zcGxpdH0NCnNldC5zZWVkKDIxMzcpDQpuX29icyA8LSBucm93KGRhdGEpDQp0cmFpbl9pbmRleCA8LSBzYW1wbGUoMTpuX29icywgMC44ICogbl9vYnMpDQpgYGANCg0KV2UgZml0IGxpbmVhciByZWdyZXNzaW9uIG1vZGVsLg0KYGBge3IgbGluX21vZGVsXzF9DQpsaW5fbW9kZWxfMSA8LSBsbSgNCiAgQ0NGIH4uDQogICxkYXRhID0gZGF0YQ0KICAsc3Vic2V0ID0gdHJhaW5faW5kZXgNCikNCmBgYA0KDQpDYWxjdWxhdGUgKiphZGp1c3RlZCBtdWx0aXZhcmlhdGUgcC12YWx1ZXMqKiB1c2luZyAqcC5hZGp1c3QoKSogZnVuY3Rpb24uIEJhc2VkIG9uIHRoZXNlIHdlIGV4Y2x1ZGUgZnJvbSB0aGUgbGluZWFyIHJlZ3Jlc3Npb24gbW9kZWwgdGhlIGZvbGxvd2luZyB2YXJpYWJsZXM6IFwNCiAtIGNhdGVnb3JpYWw6IGhvbWVfb253ZXJzaGlwLCBhZGRyX3N0YXRlIFwNCiAtIGNvbnRpbnVvdXM6IG1vbnRoc19zaW5jZV9lYXJsaWVzdF9jcl9saW5lLCBkZWxpbnFfMnlycywgcHViX3JlYywgYWNjX25vd19kZWxpbnEsIHRvdGFsX3Jldl9oaV9saW0sIGFubnVhbF9pbmMsIGR0aSwgbXRoc19zaW5jZV9sYXN0X3JlY29yZC4NCmBgYHtyIGxpbl9tb2RlbF9wX2FkanVzdH0NCmxpbl9wX3ZhbHVlc19tdWx0aSA8LSBsaW5fbW9kZWxfMSAlPiUNCiAgc3VtbWFyeSgpICU+JQ0KICBjb2VmKCkgJT4lDQogIC5bLDRdICU+JSANCiAgcC5hZGp1c3QoKQ0KDQpsaW5fcF92YWx1ZXNfbXVsdGkgJT4lIHJvdW5kKDUpICU+JSBhcy5kYXRhLmZyYW1lKCkNCmBgYA0KDQpGb3IgdGhvc2Ugd2UgZml0IGxpbmVhciByZWdyZXNzaW9uIG1vZGVsLiBXZSBvYnRhaW4gcHJldHR5IHNpbXBsZSBtb2RlbCwgd2l0aCAkUl4yPTAuMjckLg0KYGBge3IgbGluX21vZGVsXzJ9DQpsaW5fbW9kZWxfMiA8LSBsbSgNCiAgQ0NGIH4uDQogICxkYXRhID0gZGF0YSAlPiUgc2VsZWN0KA0KICAgIC1zdGFydHNfd2l0aCgnaG9tZV9vd25lcnNoaXAnKSwgLXN0YXJ0c193aXRoKCdhZGRyX3N0YXRlJykNCiAgICAsLW1vbnRoc19zaW5jZV9lYXJsaWVzdF9jcl9saW5lLCAtZGVsaW5xXzJ5cnMsIC1wdWJfcmVjLCAtYWNjX25vd19kZWxpbnENCiAgICAsLXRvdGFsX3Jldl9oaV9saW0sIC1hbm51YWxfaW5jLCAtZHRpLCAtbXRoc19zaW5jZV9sYXN0X3JlY29yZA0KICApDQogICxzdWJzZXQgPSB0cmFpbl9pbmRleA0KKQ0KDQpsaW5fbW9kZWxfMiAlPiUgc3VtbWFyeSgpDQpgYGANCg0KQXMgd2Ugc2VlIGJlbG93IHJlc2lkdWFscyBvZiB0aGUgbGluZWFyIHJlZ3Jlc3Npb24gbW9kZWwgYWluJ3QgYmVoYXZlIHdlbGwuIFRoZXNlIGhhdmUgaGVhdnkgbGVmdCB0YWlsIGFuZCBhcmUgaGV0ZXJvc2NlZGFzdGljLiBJbiBhbm90aGVyIGJsb2cgcG9zdCB3ZSB3aWxsIGVzdGltYXRlIGJldGEgcmVncmVzc2lvbiBpbnN0ZWFkIG9mIGxpZW5hciByZWdyZXNzaW9uLg0KYGBge3IgbGluX3Jlc2lkdWFsc30NCnBhcihtZnJvdyA9IGMoMiwyKSkNCmxpbl9tb2RlbF8yICU+JSBwbG90KHdoaWNoID0gMTozKQ0KYWNmKGxpbl9tb2RlbF8yJHJlc2lkdWFscywgbWFpbiA9ICcnKQ0KYGBgDQoNCmBgYHtyIGNvbWJpbmV9DQpsaW5fcHJlZGljdCA8LSBwcmVkaWN0KGxpbl9tb2RlbF8yDQogICAgICAgICAgICAgICAgICAgICAgICxkYXRhWy10cmFpbl9pbmRleCxdKQ0KDQpDQ0Zfc3VtbWFyeSA9IGRhdGEuZnJhbWUoDQogIEFjdHVhbCA9IGRhdGEkQ0NGWy10cmFpbl9pbmRleF0NCiAgLFByZWRpY3RlZCA9IGxpbl9wcmVkaWN0DQopICU+JQ0KICBwaXZvdF9sb25nZXIoY29scyA9IDE6MiwgbmFtZXNfdG8gPSAnQ0NGJywgdmFsdWVzX3RvID0gJ1ZhbHVlJykNCg0KZ2dwbG90KENDRl9zdW1tYXJ5LCBhZXMoVmFsdWUsIGZpbGwgPSBDQ0YpKSArDQogIGdlb21fZGVuc2l0eShhbHBoYT0gMC41KSArDQogIHRoZW1lX2J3KCkgKw0KICBsYWJzKHggPSAnJywgeSA9JycsIHRpdGxlID0gJ0RlbnNpdGllcyBvZiBBY3R1YWwgYW5kIFByZWRpY3RlZCBDQ0YnKQ0KYGBg