If any issues, questions or suggestions feel free to reach me out via e-mail or Linkedin. You can also visit my Github.

This is R replication of the code and exercises from the Udemy course “Credit Risk Modeling in Python 2022”.

if(!require('pacman')) install.packages('pacman')
pacman::p_load(dplyr, tidyr, ggplot2)
options(scipen = 20)

We load data which was preprocessed in that blog post and we filter only loans that was charged off. Our dataset containt 43236 rows.

data <- read.csv('pd_preprocessed_loan_data_2007_2014.csv') %>%
  filter(loan_status == 'Charged Off' | loan_status == 'Does not meet the credit policy. Status:Charged Off')
data %>% dim()
[1] 43236   323

For EAD and LGD model we will use the same independent variables as we did use in PD model estimation. However, for PD model we transformed continuous variables into dummy variables thru fine classign and coarse classing. For some attributes there were dummy variables variable_Missing. But for EAD and LGD we will use continuous variables as they are in the original data. So here we have to fill NA’s with some numeric values. Let’s replace them with 0 using replace_na function from tidyr package.

data <- data %>%
  replace_na(list(
    mths_since_last_delinq = 0
    ,mths_since_last_record = 0
    ,total_rev_hi_lim = 0
  ))

Calculate recovery rate and credit conversion factor.

data <- data %>%
  mutate(
    recovery_rate = recoveries / funded_amnt
    ,CCF = (funded_amnt - total_rec_prncp) / funded_amnt
  )

data %>%
  select(recovery_rate, CCF) %>%
  summary()
 recovery_rate          CCF          
 Min.   :0.00000   Min.   :0.000438  
 1st Qu.:0.00000   1st Qu.:0.632088  
 Median :0.02947   Median :0.789908  
 Mean   :0.06082   Mean   :0.735952  
 3rd Qu.:0.11404   3rd Qu.:0.888543  
 Max.   :1.22077   Max.   :1.000000  

As we see above, some recovery rates are outside the range [0,1]. Let’s trunctate them into that interval.

data <- data %>%
  mutate(recovery_rate = if_else(recovery_rate > 1, 1, recovery_rate))

Below we can see distributions of recovery rate and CCF.

ggplot(data, aes(recovery_rate)) +
  geom_histogram(aes(y = ..density..), bins = 50, color = 'black', fill = 'grey') +
  geom_density(size = 1, col = 'black') +
  theme_bw() +
  labs(x = '', y = '', title = 'Distribution of Recovery Rate')


ggplot(data, aes(CCF)) +
  geom_histogram(aes(y = ..density..), bins = 50, color = 'black', fill = 'grey') +
  geom_density(size = 1, col = 'black') +
  theme_bw() +
  labs(x = '', y = '', title = 'Distribution of Credit Conversion Factor')

Our final dataset for EAD na LGD models has 47 attributes:
- dummy variable good_bad indicating whether loan was defaulted or not
- 29 dummy variables derived from 7 categorial attributes
- 15 continuous attributes
- CCF and recovery rate

list_of_dummy_variables <- read.csv('pd_dummies.csv') %>% unlist() %>% .[1:36]
list_of_reference_categories <- read.csv('pd_dummies_reference.csv') %>% unlist() %>% .[1:7]
dummies <- setdiff(list_of_dummy_variables
                   ,list_of_reference_categories)

data_ead_lgd <- data %>%
  select(
    all_of(dummies)
    ,good_bad, CCF, recovery_rate
    ,emp_length, months_since_issue_d, int_rate, months_since_earliest_cr_line
    ,delinq_2yrs, inq_last_6mths, open_acc, pub_rec, total_acc, acc_now_delinq
    ,total_rev_hi_lim, annual_inc, mths_since_last_delinq, dti, mths_since_last_record
  )

data_ead_lgd %>% dim()
[1] 43236    47
write.csv(data_ead_lgd, 'ead_lgd_preprocessed_data.csv', row.names = FALSE)
LS0tDQp0aXRsZTogIkNyZWRpdCBSaXNrIE1vZGVsaW5nIC0gRUFEIGFuZCBMR0QgTW9kZWxzIERhdGEgUHJlcGFyYXRpb24iDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpJZiBhbnkgaXNzdWVzLCBxdWVzdGlvbnMgb3Igc3VnZ2VzdGlvbnMgZmVlbCBmcmVlIHRvIHJlYWNoIG1lIG91dCB2aWEgZS1tYWlsIDx3aWVjenluc2tpcGF3ZWxAZ21haWwuY29tPiBvciBbTGlua2VkaW5dKGh0dHBzOi8vd3d3LmxpbmtlZGluLmNvbS9pbi9wYXdlbC13aWVjenluc2tpLykuIFlvdSBjYW4gYWxzbyB2aXNpdCBteSBbR2l0aHViXShodHRwczovL2dpdGh1Yi5jb20vcGF3ZWwtd2llY3p5bnNraSkuDQoNClRoaXMgaXMgUiByZXBsaWNhdGlvbiBvZiB0aGUgY29kZSBhbmQgZXhlcmNpc2VzIGZyb20gdGhlIFVkZW15IGNvdXJzZSBbIkNyZWRpdCBSaXNrIE1vZGVsaW5nIGluIFB5dGhvbiAyMDIyIl0oaHR0cHM6Ly93d3cudWRlbXkuY29tL2NvdXJzZS9jcmVkaXQtcmlzay1tb2RlbGluZy1pbi1weXRob24vKS4NCg0KYGBge3IgbG9hZF9saWJyYXJpZXN9DQppZighcmVxdWlyZSgncGFjbWFuJykpIGluc3RhbGwucGFja2FnZXMoJ3BhY21hbicpDQpwYWNtYW46OnBfbG9hZChkcGx5ciwgdGlkeXIsIGdncGxvdDIpDQpvcHRpb25zKHNjaXBlbiA9IDIwKQ0KYGBgDQoNCldlIGxvYWQgZGF0YSB3aGljaCB3YXMgcHJlcHJvY2Vzc2VkIGluIFt0aGF0IGJsb2cgcG9zdF0oaHR0cHM6Ly9ycHVicy5jb20vcGF3ZWwtd2llY3p5bnNraS84OTE4MTcpIGFuZCB3ZSBmaWx0ZXIgb25seSBsb2FucyB0aGF0IHdhcyBjaGFyZ2VkIG9mZi4gT3VyIGRhdGFzZXQgY29udGFpbnQgNDMyMzYgcm93cy4NCmBgYHtyIGxvYWRfZGF0YX0NCmRhdGEgPC0gcmVhZC5jc3YoJ3BkX3ByZXByb2Nlc3NlZF9sb2FuX2RhdGFfMjAwN18yMDE0LmNzdicpICU+JQ0KICBmaWx0ZXIobG9hbl9zdGF0dXMgPT0gJ0NoYXJnZWQgT2ZmJyB8IGxvYW5fc3RhdHVzID09ICdEb2VzIG5vdCBtZWV0IHRoZSBjcmVkaXQgcG9saWN5LiBTdGF0dXM6Q2hhcmdlZCBPZmYnKQ0KZGF0YSAlPiUgZGltKCkNCmBgYA0KDQpGb3IgRUFEIGFuZCBMR0QgbW9kZWwgd2Ugd2lsbCB1c2UgdGhlIHNhbWUgaW5kZXBlbmRlbnQgdmFyaWFibGVzIGFzIHdlIGRpZCB1c2UgaW4gW1BEIG1vZGVsIGVzdGltYXRpb25dKGh0dHBzOi8vcnB1YnMuY29tL3Bhd2VsLXdpZWN6eW5za2kvODk1MzA3KS4gSG93ZXZlciwgZm9yIFBEIG1vZGVsIHdlIHRyYW5zZm9ybWVkIGNvbnRpbnVvdXMgdmFyaWFibGVzIGludG8gZHVtbXkgdmFyaWFibGVzIHRocnUgZmluZSBjbGFzc2lnbiBhbmQgY29hcnNlIGNsYXNzaW5nLiBGb3Igc29tZSBhdHRyaWJ1dGVzIHRoZXJlIHdlcmUgZHVtbXkgdmFyaWFibGVzICp2YXJpYWJsZV9NaXNzaW5nKi4gQnV0IGZvciBFQUQgYW5kIExHRCB3ZSB3aWxsIHVzZSBjb250aW51b3VzIHZhcmlhYmxlcyBhcyB0aGV5IGFyZSBpbiB0aGUgb3JpZ2luYWwgZGF0YS4gU28gaGVyZSB3ZSBoYXZlIHRvIGZpbGwgTkEncyB3aXRoIHNvbWUgbnVtZXJpYyB2YWx1ZXMuIExldCdzIHJlcGxhY2UgdGhlbSB3aXRoIDAgdXNpbmcgKnJlcGxhY2VfbmEqIGZ1bmN0aW9uIGZyb20gKnRpZHlyKiBwYWNrYWdlLg0KYGBge3IgZmlsbF9uYXN9DQpkYXRhIDwtIGRhdGEgJT4lDQogIHJlcGxhY2VfbmEobGlzdCgNCiAgICBtdGhzX3NpbmNlX2xhc3RfZGVsaW5xID0gMA0KICAgICxtdGhzX3NpbmNlX2xhc3RfcmVjb3JkID0gMA0KICAgICx0b3RhbF9yZXZfaGlfbGltID0gMA0KICApKQ0KYGBgDQoNCkNhbGN1bGF0ZSByZWNvdmVyeSByYXRlIGFuZCBjcmVkaXQgY29udmVyc2lvbiBmYWN0b3IuDQpgYGB7ciBjYWxjdWxhdGV9DQpkYXRhIDwtIGRhdGEgJT4lDQogIG11dGF0ZSgNCiAgICByZWNvdmVyeV9yYXRlID0gcmVjb3ZlcmllcyAvIGZ1bmRlZF9hbW50DQogICAgLENDRiA9IChmdW5kZWRfYW1udCAtIHRvdGFsX3JlY19wcm5jcCkgLyBmdW5kZWRfYW1udA0KICApDQoNCmRhdGEgJT4lDQogIHNlbGVjdChyZWNvdmVyeV9yYXRlLCBDQ0YpICU+JQ0KICBzdW1tYXJ5KCkNCmBgYA0KDQpBcyB3ZSBzZWUgYWJvdmUsIHNvbWUgcmVjb3ZlcnkgcmF0ZXMgYXJlIG91dHNpZGUgdGhlIHJhbmdlIFswLDFdLiBMZXQncyB0cnVuY3RhdGUgdGhlbSBpbnRvIHRoYXQgaW50ZXJ2YWwuDQpgYGB7ciB0cnVuY2F0ZX0NCmRhdGEgPC0gZGF0YSAlPiUNCiAgbXV0YXRlKHJlY292ZXJ5X3JhdGUgPSBpZl9lbHNlKHJlY292ZXJ5X3JhdGUgPiAxLCAxLCByZWNvdmVyeV9yYXRlKSkNCmBgYA0KDQpCZWxvdyB3ZSBjYW4gc2VlIGRpc3RyaWJ1dGlvbnMgb2YgcmVjb3ZlcnkgcmF0ZSBhbmQgQ0NGLg0KYGBge3Igdml6dWFsaXplfQ0KZ2dwbG90KGRhdGEsIGFlcyhyZWNvdmVyeV9yYXRlKSkgKw0KICBnZW9tX2hpc3RvZ3JhbShhZXMoeSA9IC4uZGVuc2l0eS4uKSwgYmlucyA9IDUwLCBjb2xvciA9ICdibGFjaycsIGZpbGwgPSAnZ3JleScpICsNCiAgZ2VvbV9kZW5zaXR5KHNpemUgPSAxLCBjb2wgPSAnYmxhY2snKSArDQogIHRoZW1lX2J3KCkgKw0KICBsYWJzKHggPSAnJywgeSA9ICcnLCB0aXRsZSA9ICdEaXN0cmlidXRpb24gb2YgUmVjb3ZlcnkgUmF0ZScpDQoNCmdncGxvdChkYXRhLCBhZXMoQ0NGKSkgKw0KICBnZW9tX2hpc3RvZ3JhbShhZXMoeSA9IC4uZGVuc2l0eS4uKSwgYmlucyA9IDUwLCBjb2xvciA9ICdibGFjaycsIGZpbGwgPSAnZ3JleScpICsNCiAgZ2VvbV9kZW5zaXR5KHNpemUgPSAxLCBjb2wgPSAnYmxhY2snKSArDQogIHRoZW1lX2J3KCkgKw0KICBsYWJzKHggPSAnJywgeSA9ICcnLCB0aXRsZSA9ICdEaXN0cmlidXRpb24gb2YgQ3JlZGl0IENvbnZlcnNpb24gRmFjdG9yJykNCmBgYA0KDQpPdXIgZmluYWwgZGF0YXNldCBmb3IgRUFEIG5hIExHRCBtb2RlbHMgaGFzIDQ3IGF0dHJpYnV0ZXM6IFwNCiAtIGR1bW15IHZhcmlhYmxlICpnb29kX2JhZCogaW5kaWNhdGluZyB3aGV0aGVyIGxvYW4gd2FzIGRlZmF1bHRlZCBvciBub3QgXA0KIC0gMjkgZHVtbXkgdmFyaWFibGVzIGRlcml2ZWQgZnJvbSA3IGNhdGVnb3JpYWwgYXR0cmlidXRlcyBcDQogLSAxNSBjb250aW51b3VzIGF0dHJpYnV0ZXMgXA0KIC0gKkNDRiogYW5kICpyZWNvdmVyeSByYXRlKg0KYGBge3IgZHVtbWllc30NCmxpc3Rfb2ZfZHVtbXlfdmFyaWFibGVzIDwtIHJlYWQuY3N2KCdwZF9kdW1taWVzLmNzdicpICU+JSB1bmxpc3QoKSAlPiUgLlsxOjM2XQ0KbGlzdF9vZl9yZWZlcmVuY2VfY2F0ZWdvcmllcyA8LSByZWFkLmNzdigncGRfZHVtbWllc19yZWZlcmVuY2UuY3N2JykgJT4lIHVubGlzdCgpICU+JSAuWzE6N10NCmR1bW1pZXMgPC0gc2V0ZGlmZihsaXN0X29mX2R1bW15X3ZhcmlhYmxlcw0KICAgICAgICAgICAgICAgICAgICxsaXN0X29mX3JlZmVyZW5jZV9jYXRlZ29yaWVzKQ0KDQpkYXRhX2VhZF9sZ2QgPC0gZGF0YSAlPiUNCiAgc2VsZWN0KA0KICAgIGFsbF9vZihkdW1taWVzKQ0KICAgICxnb29kX2JhZCwgQ0NGLCByZWNvdmVyeV9yYXRlDQogICAgLGVtcF9sZW5ndGgsIG1vbnRoc19zaW5jZV9pc3N1ZV9kLCBpbnRfcmF0ZSwgbW9udGhzX3NpbmNlX2VhcmxpZXN0X2NyX2xpbmUNCiAgICAsZGVsaW5xXzJ5cnMsIGlucV9sYXN0XzZtdGhzLCBvcGVuX2FjYywgcHViX3JlYywgdG90YWxfYWNjLCBhY2Nfbm93X2RlbGlucQ0KICAgICx0b3RhbF9yZXZfaGlfbGltLCBhbm51YWxfaW5jLCBtdGhzX3NpbmNlX2xhc3RfZGVsaW5xLCBkdGksIG10aHNfc2luY2VfbGFzdF9yZWNvcmQNCiAgKQ0KDQpkYXRhX2VhZF9sZ2QgJT4lIGRpbSgpDQpgYGANCg0KYGBge3Igd3JpdGV9DQp3cml0ZS5jc3YoZGF0YV9lYWRfbGdkLCAnZWFkX2xnZF9wcmVwcm9jZXNzZWRfZGF0YS5jc3YnLCByb3cubmFtZXMgPSBGQUxTRSkNCmBgYA0KDQo=