There are several loan statuses, from which we derive a binary column
indicating whether a given credit was good or bad. The categories
Current, Fully Paid and Charged Off account for
\(96.8\%\) of all the credits.
Here we create the good_bad column. Our data set is imbalanced:
\(89\%\) of the credits are good and \(11\%\) are bad.
Split data into training set and test set in proportions 80:20.
Fine classing and coarse classing
Function woe() groups data by a chosen attribute and
calculates the Weight of Evidence (plus a number of other proportions and
ratios) and the Information Value for that attribute. If
discrete=TRUE, data is arranged by WoE (suitable for
qualitative attributes, whose categories have no natural order). If
discrete=FALSE, data is arranged by the attribute itself in ascending
order (suitable for quantitative attributes). Function plot_by_woe()
plots attribute values against their WoE. In this function we’ve added
an additional feature: the bubble size depends on n_obs. We use the
tidyverse environment extensively to define the functions
below.
# Weight of Evidence (WoE) and Information Value (IV) of a single attribute.
#
# Arguments:
#   df        data frame holding `column` and a `good_bad` column
#             (0 is counted as bad, every other value as good).
#   train     row selector applied as df[train, ]; only those rows are used.
#   column    name of the attribute to analyse, given as a string.
#   discrete  TRUE: rows are sorted by WoE; FALSE: rows are sorted by the
#             attribute itself in ascending order.
#
# Returns one row per attribute value with the columns n_bad, n_good, n_obs,
# prop_n_obs, prop_bad, prop_good, prop_n_bad, prop_n_good, WoE,
# diff_prop_good, diff_WoE and IV (the attribute total, repeated on every row).
#
# Stops with an error when `df` has no good_bad column.
woe <- function(df, train, column, discrete = TRUE) {
  if (!("good_bad" %in% colnames(df))) {
    stop('Lack of column good_bad in the data frame.')
  }

  # The sort key has to be chosen with a scalar if/else: ifelse() with a
  # length-one test returns a single value, which arrange() recycles into a
  # constant key, so the rows were never actually reordered.
  sort_col <- if (discrete) "WoE" else column

  df[train, ] %>%
    group_by(across(all_of(column)), good_bad) %>%
    summarise(n_obs = n(), .groups = "drop") %>%
    mutate(good_bad = ifelse(good_bad == 0, "bad", "good")) %>%
    pivot_wider(
      names_from = good_bad, names_prefix = "n_",
      values_from = n_obs, values_fill = 0
    ) %>%
    mutate(n_obs = n_bad + n_good) %>%
    mutate(
      prop_n_obs = n_obs / sum(n_obs),
      prop_bad = n_bad / n_obs,
      prop_good = n_good / n_obs,
      prop_n_bad = n_bad / sum(n_bad),
      prop_n_good = n_good / sum(n_good)
    ) %>%
    # NOTE: a value with zero good or zero bad observations gets an infinite
    # WoE, which also makes the summed IV non-finite.
    mutate(WoE = log(prop_n_good / prop_n_bad)) %>%
    arrange(.data[[sort_col]]) %>%
    # Differences between neighbouring rows in the sorted order; the first
    # row has no predecessor, hence NaN.
    mutate(
      diff_prop_good = c(NaN, abs(diff(prop_good))),
      diff_WoE = c(NaN, abs(diff(WoE))),
      IV = (prop_n_good - prop_n_bad) * WoE
    ) %>%
    mutate(IV = sum(IV))
}
# Bubble plot of WoE for every value of an attribute.
#
# Arguments:
#   df_woe    table whose first column is the attribute and which also has
#             the columns WoE and n_obs (as returned by woe()).
#   discrete  TRUE: x axis ordered by WoE; FALSE: ordered by the attribute.
#   rotation  angle of the x axis labels.
#
# Returns the ggplot object; point size reflects n_obs.
plot_by_woe <- function(df_woe, discrete = TRUE, rotation = 0) {
  col_name <- colnames(df_woe)[1]
  sort_col <- if (discrete) "WoE" else col_name

  x_axis_text <- element_text(angle = rotation, vjust = 1, hjust = 1)
  plot_title <- paste0('Weight of evidence of the attribute ', col_name)

  ggplot(df_woe, aes(reorder(.data[[col_name]], .data[[sort_col]]), WoE)) +
    geom_point(aes(size = n_obs)) +
    theme_bw() +
    theme(axis.text.x = x_axis_text) +
    labs(x = '', y = '', title = plot_title, size = 'Size')
}
Preprocessing attribute grade was explained in chapter 29 of
the Udemy course. This attribute has medium Information Value
(0.29).
# WoE table and plot for grade.
woe_grade <- woe(df = data, train = train_index, column = "grade")
select(woe_grade, grade, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_grade)

# Start the two running lists: all dummy columns kept for the model and the
# reference category of each attribute. Grades A..G are kept as they are.
list_of_dummy_variables <- paste0("grade_", LETTERS[1:7])
list_of_reference_categories <- "grade_G"
Preprocessing attribute home_ownership was explained in
chapter 29 of the Udemy course. This attribute has weak Information
Value (0.02).
# WoE table and plot for home_ownership.
woe_home_ownership <- woe(df = data, train = train_index, column = "home_ownership")
select(woe_home_ownership, home_ownership, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_home_ownership)

# Coarse classing: RENT, OTHER, NONE and ANY are merged into one dummy.
data <- mutate(
  data,
  home_ownership_RENT_OTHER_NONE_ANY =
    home_ownership_RENT + home_ownership_OTHER +
    home_ownership_NONE + home_ownership_ANY
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c("home_ownership_RENT_OTHER_NONE_ANY", "home_ownership_OWN", "home_ownership_MORTGAGE")
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "home_ownership_RENT_OTHER_NONE_ANY"
)
Preprocessing attribute addr_state was explained in chapter
30 of the Udemy course.
# WoE table and plot for addr_state.
woe_addr_state <- woe(df = data, train = train_index, column = "addr_state")
select(woe_addr_state, addr_state, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_addr_state, rotation = 90)

# Coarse classing: single-state dummies are summed into grouped dummies.
# NY, CA and TX stay as individual categories.
data <- mutate(
  data,
  addr_state_NE_IA_NV_FL_HI_AL =
    addr_state_NE + addr_state_IA + addr_state_NV +
    addr_state_FL + addr_state_HI + addr_state_AL,
  addr_state_NM_VA = addr_state_NM + addr_state_VA,
  addr_state_OK_TN_MO_LA_MD_NC =
    addr_state_OK + addr_state_TN + addr_state_MO +
    addr_state_LA + addr_state_MD + addr_state_NC,
  addr_state_UT_KY_AZ_NJ =
    addr_state_UT + addr_state_KY + addr_state_AZ + addr_state_NJ,
  addr_state_AR_MI_PA_OH_MN =
    addr_state_AR + addr_state_MI + addr_state_PA +
    addr_state_OH + addr_state_MN,
  addr_state_RI_MA_DE_SD_IN =
    addr_state_RI + addr_state_MA + addr_state_DE +
    addr_state_SD + addr_state_IN,
  addr_state_GA_WA_OR = addr_state_GA + addr_state_WA + addr_state_OR,
  addr_state_WI_MT = addr_state_WI + addr_state_MT,
  addr_state_IL_CT = addr_state_IL + addr_state_CT,
  addr_state_KS_SC_CO_VT_AK_MS =
    addr_state_KS + addr_state_SC + addr_state_CO +
    addr_state_VT + addr_state_AK + addr_state_MS,
  addr_state_WV_NH_WY_DC_ME_ID =
    addr_state_WV + addr_state_NH + addr_state_WY +
    addr_state_DC + addr_state_ME + addr_state_ID
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c(
    "addr_state_NE_IA_NV_FL_HI_AL", "addr_state_NM_VA", "addr_state_NY",
    "addr_state_OK_TN_MO_LA_MD_NC", "addr_state_CA", "addr_state_UT_KY_AZ_NJ",
    "addr_state_AR_MI_PA_OH_MN", "addr_state_RI_MA_DE_SD_IN", "addr_state_GA_WA_OR",
    "addr_state_WI_MT", "addr_state_IL_CT", "addr_state_KS_SC_CO_VT_AK_MS",
    "addr_state_TX", "addr_state_WV_NH_WY_DC_ME_ID"
  )
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "addr_state_NE_IA_NV_FL_HI_AL"
)
Preprocessing attribute verification_status left as homework
of the Udemy course. This attribute has weak Information Value
(0.02).
# WoE table and plot for verification_status.
woe_verification_status <- woe(df = data, train = train_index, column = "verification_status")
select(woe_verification_status, verification_status, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_verification_status)

# All three categories are kept as separate dummies.
list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c(
    "verification_status_Verified",
    "verification_status_Source.Verified",
    "verification_status_Not.Verified"
  )
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "verification_status_Verified"
)
Preprocessing attribute purpose left as homework of the
Udemy course. This attribute has weak Information Value (0.04).
# WoE table and plot for purpose.
woe_purpose <- woe(df = data, train = train_index, column = "purpose")
select(woe_purpose, purpose, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_purpose, rotation = 45)

# Coarse classing: purpose dummies are summed into five grouped dummies.
data <- mutate(
  data,
  purpose_small_business_educational = purpose_small_business + purpose_educational,
  purpose_renewable_energy_moving_other_house_medical =
    purpose_renewable_energy + purpose_moving + purpose_other +
    purpose_house + purpose_medical,
  purpose_wedding_vacation_debt_consolidation =
    purpose_wedding + purpose_vacation + purpose_debt_consolidation,
  purpose_major_purchase_home_improvement =
    purpose_major_purchase + purpose_home_improvement,
  purpose_car_credit_card = purpose_car + purpose_credit_card
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c(
    "purpose_small_business_educational",
    "purpose_renewable_energy_moving_other_house_medical",
    "purpose_wedding_vacation_debt_consolidation",
    "purpose_major_purchase_home_improvement",
    "purpose_car_credit_card"
  )
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "purpose_small_business_educational"
)
Preprocessing attribute initial_list_status left as homework
of the Udemy course. This attribute has weak Information Value
(0.03).
# WoE table and plot for initial_list_status.
woe_initial_list_status <- woe(df = data, train = train_index, column = "initial_list_status")
select(woe_initial_list_status, initial_list_status, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_initial_list_status)

# Both categories are kept; "f" is the reference.
list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c("initial_list_status_f", "initial_list_status_w")
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "initial_list_status_f"
)
Preprocessing attribute term was explained in chapter 33 of
the Udemy course. This attribute has weak Information Value (0.04).
# WoE table and plot for term. The table is ordered by the attribute itself
# (discrete = FALSE), so the plot is given the same flag; previously the plot
# fell back to its default and ordered the x axis by WoE instead.
woe_term <- woe(data, train_index, 'term', discrete = FALSE)
woe_term %>% select(term, n_obs, WoE, IV)
plot_by_woe(woe_term, discrete = FALSE)

# One dummy per term value (36 and 60).
data <- data %>%
  mutate(
    term_36 = if_else(term == 36, 1, 0),
    term_60 = if_else(term == 60, 1, 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'term_36', 'term_60'
)
list_of_reference_categories <- c(
  list_of_reference_categories,
  'term_60'
)
Preprocessing attribute emp_length was explained in chapter
33 of the Udemy course. This attribute has no predictive power according
to the Information Value (<0.02).
# WoE table and plot for emp_length, ordered by the attribute itself.
woe_emp_length <- woe(df = data, train = train_index, column = "emp_length", discrete = FALSE)
select(woe_emp_length, emp_length, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_emp_length, discrete = FALSE)

# Coarse classing into 0, 1-4, 5-6, 7-9 and 10.
data <- mutate(
  data,
  emp_length_0 = as.numeric(emp_length == 0),
  emp_length_1_4 = as.numeric(emp_length >= 1 & emp_length <= 4),
  emp_length_5_6 = as.numeric(emp_length >= 5 & emp_length <= 6),
  emp_length_7_9 = as.numeric(emp_length >= 7 & emp_length <= 9),
  emp_length_10 = as.numeric(emp_length == 10)
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c("emp_length_0", "emp_length_1_4", "emp_length_5_6", "emp_length_7_9", "emp_length_10")
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "emp_length_0"
)
Preprocessing attribute months_since_issue_d was explained
in chapter 34 of the Udemy course. This attribute has medium Information
Value (0.11).
# Fine classing: split months_since_issue_d into 50 intervals and compute WoE.
data <- mutate(data, months_since_issue_d_cut = cut(months_since_issue_d, breaks = 50))
woe_months_since_issue_d_cut <- woe(
  df = data, train = train_index,
  column = "months_since_issue_d_cut", discrete = FALSE
)
select(woe_months_since_issue_d_cut, months_since_issue_d_cut, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_months_since_issue_d_cut, discrete = FALSE, rotation = 90)

# Coarse classing into eight dummies.
data <- mutate(
  data,
  months_since_issue_d_less_38 = as.numeric(months_since_issue_d < 38),
  months_since_issue_d_38_39 =
    as.numeric(months_since_issue_d >= 38 & months_since_issue_d <= 39),
  months_since_issue_d_40_41 =
    as.numeric(months_since_issue_d >= 40 & months_since_issue_d <= 41),
  months_since_issue_d_42_48 =
    as.numeric(months_since_issue_d >= 42 & months_since_issue_d <= 48),
  months_since_issue_d_49_52 =
    as.numeric(months_since_issue_d >= 49 & months_since_issue_d <= 52),
  months_since_issue_d_53_64 =
    as.numeric(months_since_issue_d >= 53 & months_since_issue_d <= 64),
  months_since_issue_d_65_84 =
    as.numeric(months_since_issue_d >= 65 & months_since_issue_d <= 84),
  months_since_issue_d_84_more = as.numeric(months_since_issue_d > 84)
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c(
    "months_since_issue_d_less_38",
    "months_since_issue_d_38_39",
    "months_since_issue_d_40_41",
    "months_since_issue_d_42_48",
    "months_since_issue_d_49_52",
    "months_since_issue_d_53_64",
    "months_since_issue_d_65_84",
    "months_since_issue_d_84_more"
  )
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "months_since_issue_d_less_38"
)
Preprocessing attribute int_rate was explained in chapter 34
of the Udemy course. This attribute has strong Information Value
(0.35).
# Fine classing: split int_rate into 50 intervals and compute WoE.
data <- data %>%
  mutate(int_rate_cut = cut(int_rate, 50))
woe_int_rate_cut <- woe(data, train_index, 'int_rate_cut', discrete = FALSE)
woe_int_rate_cut %>% select(int_rate_cut, n_obs, WoE, IV)
plot_by_woe(woe_int_rate_cut, discrete = FALSE, rotation = 90)

# Coarse classing into five dummies. Bins must be mutually exclusive: from the
# third bin on the lower bound is strict, so a rate equal to a shared boundary
# (12.025, 15.74) is assigned to the lower bin only instead of to two bins.
data <- data %>%
  mutate(
    int_rate_less_9.548 = if_else(int_rate < 9.548, 1, 0),
    int_rate_9.548_12.025 = if_else(int_rate >= 9.548 & int_rate <= 12.025, 1, 0),
    int_rate_12.025_15.74 = if_else(int_rate > 12.025 & int_rate <= 15.74, 1, 0),
    int_rate_15.74_20.281 = if_else(int_rate > 15.74 & int_rate <= 20.281, 1, 0),
    int_rate_20.281_more = if_else(int_rate > 20.281, 1, 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'int_rate_less_9.548',
  'int_rate_9.548_12.025',
  'int_rate_12.025_15.74',
  'int_rate_15.74_20.281',
  'int_rate_20.281_more'
)
list_of_reference_categories <- c(
  list_of_reference_categories,
  'int_rate_less_9.548'
)
Preprocessing attribute months_since_earliest_cr_line was left
as homework of the Udemy course. This attribute has no predictive
power according to the Information Value (<0.02).
# Fine classing: split months_since_earliest_cr_line into 50 intervals and
# compute WoE.
data <- data %>%
  mutate(months_since_earliest_cr_line_cut = cut(months_since_earliest_cr_line, 50))
woe_months_since_earliest_cr_line_cut <- woe(
  data, train_index, 'months_since_earliest_cr_line_cut', discrete = FALSE
)
woe_months_since_earliest_cr_line_cut %>%
  select(months_since_earliest_cr_line_cut, n_obs, WoE, IV)
plot_by_woe(woe_months_since_earliest_cr_line_cut, discrete = FALSE, rotation = 90)

# Coarse classing into six dummies. The first bin now includes 140: with
# `< 140` followed by `>= 141`, a value of exactly 140 belonged to no bin and
# had all six dummies equal to 0.
# NOTE(review): bins are written for whole months; confirm the column holds
# integer values only.
data <- data %>%
  mutate(
    months_since_earliest_cr_line_less_140 =
      if_else(months_since_earliest_cr_line <= 140, 1, 0),
    months_since_earliest_cr_line_141_164 =
      if_else(months_since_earliest_cr_line >= 141 & months_since_earliest_cr_line <= 164, 1, 0),
    months_since_earliest_cr_line_165_247 =
      if_else(months_since_earliest_cr_line >= 165 & months_since_earliest_cr_line <= 247, 1, 0),
    months_since_earliest_cr_line_248_270 =
      if_else(months_since_earliest_cr_line >= 248 & months_since_earliest_cr_line <= 270, 1, 0),
    months_since_earliest_cr_line_271_352 =
      if_else(months_since_earliest_cr_line >= 271 & months_since_earliest_cr_line <= 352, 1, 0),
    months_since_earliest_cr_line_352_more =
      if_else(months_since_earliest_cr_line > 352, 1, 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'months_since_earliest_cr_line_less_140',
  'months_since_earliest_cr_line_141_164',
  'months_since_earliest_cr_line_165_247',
  'months_since_earliest_cr_line_248_270',
  'months_since_earliest_cr_line_271_352',
  'months_since_earliest_cr_line_352_more'
)
list_of_reference_categories <- c(
  list_of_reference_categories,
  'months_since_earliest_cr_line_less_140'
)
Preprocessing attribute installment was left as homework of the
Udemy course. This attribute has no predictive power according to
the Information Value (<0.02).
# Fine classing: split installment into 50 intervals and compute WoE.
# No dummies are created for this attribute.
data <- mutate(data, installment_cut = cut(installment, breaks = 50))
woe_installment_cut <- woe(
  df = data, train = train_index,
  column = "installment_cut", discrete = FALSE
)
select(woe_installment_cut, installment_cut, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_installment_cut, discrete = FALSE, rotation = 90)

Preprocessing attribute delinq_2yrs left as homework of the
Udemy course.
# WoE table and plot for delinq_2yrs, ordered by the attribute itself.
woe_delinq_2yrs <- woe(data, train_index, 'delinq_2yrs', discrete = FALSE)
woe_delinq_2yrs %>% select(delinq_2yrs, n_obs, WoE, IV)
plot_by_woe(woe_delinq_2yrs, discrete = FALSE)

# Coarse classing into 0, 1-3 and more than 3.
data <- data %>%
  mutate(
    delinq_2yrs_0 = if_else(delinq_2yrs == 0, 1, 0),
    delinq_2yrs_1_3 = if_else(delinq_2yrs >= 1 & delinq_2yrs <= 3, 1, 0),
    delinq_2yrs_3_more = if_else(delinq_2yrs > 3, 1, 0)
  )

# The listed names must match the created columns exactly; the entry for
# delinq_2yrs_0 used to carry a trailing space and so named no column.
list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'delinq_2yrs_0',
  'delinq_2yrs_1_3',
  'delinq_2yrs_3_more'
)
list_of_reference_categories <- c(
  list_of_reference_categories,
  'delinq_2yrs_0'
)
Preprocessing attribute inq_last_6mths left as homework of
the Udemy course.
# WoE table and plot for inq_last_6mths, ordered by the attribute itself.
woe_inq_last_6mths <- woe(df = data, train = train_index, column = "inq_last_6mths", discrete = FALSE)
select(woe_inq_last_6mths, inq_last_6mths, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_inq_last_6mths, discrete = FALSE)

# Coarse classing into 0, 1-2, 3-6 and more than 6.
data <- mutate(
  data,
  inq_last_6mths_0 = as.numeric(inq_last_6mths == 0),
  inq_last_6mths_1_2 = as.numeric(inq_last_6mths >= 1 & inq_last_6mths <= 2),
  inq_last_6mths_3_6 = as.numeric(inq_last_6mths >= 3 & inq_last_6mths <= 6),
  inq_last_6mths_6_more = as.numeric(inq_last_6mths > 6)
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c(
    "inq_last_6mths_0",
    "inq_last_6mths_1_2",
    "inq_last_6mths_3_6",
    "inq_last_6mths_6_more"
  )
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "inq_last_6mths_0"
)
Preprocessing attribute open_acc left as homework of the
Udemy course.
# WoE table and plot for open_acc, ordered by the attribute itself.
woe_open_acc <- woe(df = data, train = train_index, column = "open_acc", discrete = FALSE)
select(woe_open_acc, open_acc, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_open_acc, discrete = FALSE, rotation = 90)

# Coarse classing into eight dummies.
data <- mutate(
  data,
  open_acc_0 = as.numeric(open_acc == 0),
  open_acc_1_3 = as.numeric(open_acc >= 1 & open_acc <= 3),
  open_acc_4_12 = as.numeric(open_acc >= 4 & open_acc <= 12),
  open_acc_13_17 = as.numeric(open_acc >= 13 & open_acc <= 17),
  open_acc_18_22 = as.numeric(open_acc >= 18 & open_acc <= 22),
  open_acc_23_25 = as.numeric(open_acc >= 23 & open_acc <= 25),
  open_acc_26_30 = as.numeric(open_acc >= 26 & open_acc <= 30),
  open_acc_30_more = as.numeric(open_acc > 30)
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c(
    "open_acc_0",
    "open_acc_1_3",
    "open_acc_4_12",
    "open_acc_13_17",
    "open_acc_18_22",
    "open_acc_23_25",
    "open_acc_26_30",
    "open_acc_30_more"
  )
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "open_acc_0"
)
Preprocessing attribute pub_rec left as homework of the
Udemy course.
# WoE table and plot for pub_rec, ordered by the attribute itself.
woe_pub_rec <- woe(df = data, train = train_index, column = "pub_rec", discrete = FALSE)
select(woe_pub_rec, pub_rec, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_pub_rec, discrete = FALSE)

# Coarse classing into 0-2, 3-4 and more than 4.
data <- mutate(
  data,
  pub_rec_0_2 = as.numeric(pub_rec >= 0 & pub_rec <= 2),
  pub_rec_3_4 = as.numeric(pub_rec >= 3 & pub_rec <= 4),
  pub_rec_4_more = as.numeric(pub_rec > 4)
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c("pub_rec_0_2", "pub_rec_3_4", "pub_rec_4_more")
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "pub_rec_0_2"
)
Preprocessing attribute total_acc left as homework of the
Udemy course.
# Fine classing: split total_acc into 50 intervals and compute WoE.
data <- mutate(data, total_acc_cut = cut(total_acc, breaks = 50))
woe_total_acc_cut <- woe(
  df = data, train = train_index,
  column = "total_acc_cut", discrete = FALSE
)
select(woe_total_acc_cut, total_acc_cut, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_total_acc_cut, discrete = FALSE, rotation = 90)

# Coarse classing into less than 28, 28-51 and more than 51.
data <- mutate(
  data,
  total_acc_less_28 = as.numeric(total_acc < 28),
  total_acc_28_51 = as.numeric(total_acc >= 28 & total_acc <= 51),
  total_acc_51_more = as.numeric(total_acc > 51)
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c("total_acc_less_28", "total_acc_28_51", "total_acc_51_more")
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "total_acc_less_28"
)
Preprocessing attribute acc_now_delinq left as homework of
the Udemy course.
# WoE table and plot for acc_now_delinq, ordered by the attribute itself.
woe_acc_now_delinq <- woe(df = data, train = train_index, column = "acc_now_delinq", discrete = FALSE)
select(woe_acc_now_delinq, acc_now_delinq, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_acc_now_delinq, discrete = FALSE)

# Coarse classing: zero versus more than zero.
data <- mutate(
  data,
  acc_now_delinq_0 = as.numeric(acc_now_delinq == 0),
  acc_now_delinq_0_more = as.numeric(acc_now_delinq > 0)
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c("acc_now_delinq_0", "acc_now_delinq_0_more")
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "acc_now_delinq_0"
)
Preprocessing attribute total_rev_hi_lim left as homework of
the Udemy course.
# Fine classing: split total_rev_hi_lim into 50 intervals and compute WoE.
data <- data %>%
  mutate(total_rev_hi_lim_cut = cut(total_rev_hi_lim, 50))
woe_total_rev_hi_lim_cut <- woe(data, train_index, 'total_rev_hi_lim_cut', discrete = FALSE)
woe_total_rev_hi_lim_cut %>% select(total_rev_hi_lim_cut, n_obs, WoE, IV)
plot_by_woe(woe_total_rev_hi_lim_cut, discrete = FALSE, rotation = 90)

# Coarse classing: one dummy for missing values plus eight value bins
# (missing values get 0 in every value bin). Bins must be mutually exclusive:
# from the third value bin on the lower bound is strict, so a limit equal to a
# shared boundary (10000, 20000, ...) falls into the lower bin only instead of
# into two bins.
data <- data %>%
  mutate(
    total_rev_hi_lim_Missing = if_else(is.na(total_rev_hi_lim), 1, 0),
    total_rev_hi_lim_less_5 =
      if_else(total_rev_hi_lim < 5000, 1, 0, missing = 0),
    total_rev_hi_lim_5_10 =
      if_else(total_rev_hi_lim >= 5000 & total_rev_hi_lim <= 10000, 1, 0, missing = 0),
    total_rev_hi_lim_10_20 =
      if_else(total_rev_hi_lim > 10000 & total_rev_hi_lim <= 20000, 1, 0, missing = 0),
    total_rev_hi_lim_20_30 =
      if_else(total_rev_hi_lim > 20000 & total_rev_hi_lim <= 30000, 1, 0, missing = 0),
    total_rev_hi_lim_30_40 =
      if_else(total_rev_hi_lim > 30000 & total_rev_hi_lim <= 40000, 1, 0, missing = 0),
    total_rev_hi_lim_40_55 =
      if_else(total_rev_hi_lim > 40000 & total_rev_hi_lim <= 55000, 1, 0, missing = 0),
    total_rev_hi_lim_55_95 =
      if_else(total_rev_hi_lim > 55000 & total_rev_hi_lim <= 95000, 1, 0, missing = 0),
    total_rev_hi_lim_95_more =
      if_else(total_rev_hi_lim > 95000, 1, 0, missing = 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'total_rev_hi_lim_Missing',
  'total_rev_hi_lim_less_5',
  'total_rev_hi_lim_5_10',
  'total_rev_hi_lim_10_20',
  'total_rev_hi_lim_20_30',
  'total_rev_hi_lim_30_40',
  'total_rev_hi_lim_40_55',
  'total_rev_hi_lim_55_95',
  'total_rev_hi_lim_95_more'
)
list_of_reference_categories <- c(
  list_of_reference_categories,
  'total_rev_hi_lim_Missing'
)
Preprocessing attribute annual_inc was explained in chapter
36 of the Udemy course.
# Fine classing: split annual_inc into 50 intervals and compute WoE.
data <- data %>%
  mutate(annual_inc_cut = cut(annual_inc, 50))
woe_annual_inc_cut <- woe(data, train_index, 'annual_inc_cut', discrete = FALSE)
woe_annual_inc_cut %>% select(annual_inc_cut, n_obs, WoE, IV)
plot_by_woe(woe_annual_inc_cut, discrete = FALSE, rotation = 90)

# Coarse classing into twelve income bins. Bins must be mutually exclusive:
# from the third bin on the lower bound is strict, so an income equal to a
# shared boundary (30000, 40000, ...) falls into the lower bin only instead of
# into two bins.
data <- data %>%
  mutate(
    annual_inc_less_20 = if_else(annual_inc < 20000, 1, 0),
    annual_inc_20_30 = if_else(annual_inc >= 20000 & annual_inc <= 30000, 1, 0),
    annual_inc_30_40 = if_else(annual_inc > 30000 & annual_inc <= 40000, 1, 0),
    annual_inc_40_50 = if_else(annual_inc > 40000 & annual_inc <= 50000, 1, 0),
    annual_inc_50_60 = if_else(annual_inc > 50000 & annual_inc <= 60000, 1, 0),
    annual_inc_60_70 = if_else(annual_inc > 60000 & annual_inc <= 70000, 1, 0),
    annual_inc_70_80 = if_else(annual_inc > 70000 & annual_inc <= 80000, 1, 0),
    annual_inc_80_90 = if_else(annual_inc > 80000 & annual_inc <= 90000, 1, 0),
    annual_inc_90_100 = if_else(annual_inc > 90000 & annual_inc <= 100000, 1, 0),
    annual_inc_100_120 = if_else(annual_inc > 100000 & annual_inc <= 120000, 1, 0),
    annual_inc_120_140 = if_else(annual_inc > 120000 & annual_inc <= 140000, 1, 0),
    annual_inc_140_more = if_else(annual_inc > 140000, 1, 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'annual_inc_less_20',
  'annual_inc_20_30',
  'annual_inc_30_40',
  'annual_inc_40_50',
  'annual_inc_50_60',
  'annual_inc_60_70',
  'annual_inc_70_80',
  'annual_inc_80_90',
  'annual_inc_90_100',
  'annual_inc_100_120',
  'annual_inc_120_140',
  'annual_inc_140_more'
)
list_of_reference_categories <- c(
  list_of_reference_categories,
  'annual_inc_less_20'
)
Preprocessing attribute mths_since_last_delinq was explained
in chapter 36 of the Udemy course.
# Fine classing: split mths_since_last_delinq into 50 intervals and compute WoE.
data <- mutate(data, mths_since_last_delinq_cut = cut(mths_since_last_delinq, breaks = 50))
woe_mths_since_last_delinq_cut <- woe(
  df = data, train = train_index,
  column = "mths_since_last_delinq_cut", discrete = FALSE
)
select(woe_mths_since_last_delinq_cut, mths_since_last_delinq_cut, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_mths_since_last_delinq_cut, discrete = FALSE, rotation = 90)

# Coarse classing: one dummy for missing values plus four value bins.
# `%in% TRUE` turns an NA comparison into FALSE, so missing values get 0 in
# every value bin.
data <- mutate(
  data,
  mths_since_last_delinq_Missing = as.numeric(is.na(mths_since_last_delinq)),
  mths_since_last_delinq_0_3 =
    as.numeric((mths_since_last_delinq >= 0 & mths_since_last_delinq <= 3) %in% TRUE),
  mths_since_last_delinq_4_30 =
    as.numeric((mths_since_last_delinq >= 4 & mths_since_last_delinq <= 30) %in% TRUE),
  mths_since_last_delinq_31_56 =
    as.numeric((mths_since_last_delinq >= 31 & mths_since_last_delinq <= 56) %in% TRUE),
  mths_since_last_delinq_56_more =
    as.numeric((mths_since_last_delinq > 56) %in% TRUE)
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c(
    "mths_since_last_delinq_Missing",
    "mths_since_last_delinq_0_3",
    "mths_since_last_delinq_4_30",
    "mths_since_last_delinq_31_56",
    "mths_since_last_delinq_56_more"
  )
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "mths_since_last_delinq_Missing"
)
Preprocessing attribute dti left as homework of the Udemy
course.
# Fine classing: split dti into 50 intervals and compute WoE.
data <- data %>%
  mutate(dti_cut = cut(dti, 50))
woe_dti_cut <- woe(data, train_index, 'dti_cut', discrete = FALSE)
woe_dti_cut %>% select(dti_cut, n_obs, WoE, IV)
plot_by_woe(woe_dti_cut, discrete = FALSE, rotation = 90)

# Coarse classing into ten bins. Bins must be mutually exclusive: from the
# third bin on the lower bound is strict, so a dti equal to a shared boundary
# (3.5, 7.7, ...) falls into the lower bin only instead of into two bins.
data <- data %>%
  mutate(
    dti_less_1.4 = if_else(dti < 1.4, 1, 0),
    dti_1.4_3.5 = if_else(dti >= 1.4 & dti <= 3.5, 1, 0),
    dti_3.5_7.7 = if_else(dti > 3.5 & dti <= 7.7, 1, 0),
    dti_7.7_10.5 = if_else(dti > 7.7 & dti <= 10.5, 1, 0),
    dti_10.5_16.1 = if_else(dti > 10.5 & dti <= 16.1, 1, 0),
    dti_16.1_20.3 = if_else(dti > 16.1 & dti <= 20.3, 1, 0),
    dti_20.3_21.7 = if_else(dti > 20.3 & dti <= 21.7, 1, 0),
    dti_21.7_22.4 = if_else(dti > 21.7 & dti <= 22.4, 1, 0),
    dti_22.4_35 = if_else(dti > 22.4 & dti <= 35, 1, 0),
    dti_35_more = if_else(dti > 35, 1, 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'dti_less_1.4',
  'dti_1.4_3.5',
  'dti_3.5_7.7',
  'dti_7.7_10.5',
  'dti_10.5_16.1',
  'dti_16.1_20.3',
  'dti_20.3_21.7',
  'dti_21.7_22.4',
  'dti_22.4_35',
  'dti_35_more'
)
list_of_reference_categories <- c(
  list_of_reference_categories,
  'dti_less_1.4'
)
Preprocessing attribute mths_since_last_record was left as
homework of the Udemy course.
# Fine classing: split mths_since_last_record into 50 intervals and compute WoE.
data <- mutate(data, mths_since_last_record_cut = cut(mths_since_last_record, breaks = 50))
woe_mths_since_last_record_cut <- woe(
  df = data, train = train_index,
  column = "mths_since_last_record_cut", discrete = FALSE
)
select(woe_mths_since_last_record_cut, mths_since_last_record_cut, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_mths_since_last_record_cut, discrete = FALSE, rotation = 90)

# Coarse classing: one dummy for missing values plus six value bins.
# `%in% TRUE` turns an NA comparison into FALSE, so missing values get 0 in
# every value bin.
data <- mutate(
  data,
  mths_since_last_record_Missing = as.numeric(is.na(mths_since_last_record)),
  mths_since_last_record_0_2 =
    as.numeric((mths_since_last_record >= 0 & mths_since_last_record <= 2) %in% TRUE),
  mths_since_last_record_3_20 =
    as.numeric((mths_since_last_record >= 3 & mths_since_last_record <= 20) %in% TRUE),
  mths_since_last_record_21_31 =
    as.numeric((mths_since_last_record >= 21 & mths_since_last_record <= 31) %in% TRUE),
  mths_since_last_record_32_80 =
    as.numeric((mths_since_last_record >= 32 & mths_since_last_record <= 80) %in% TRUE),
  mths_since_last_record_81_86 =
    as.numeric((mths_since_last_record >= 81 & mths_since_last_record <= 86) %in% TRUE),
  mths_since_last_record_86_more =
    as.numeric((mths_since_last_record > 86) %in% TRUE)
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c(
    "mths_since_last_record_Missing",
    "mths_since_last_record_0_2",
    "mths_since_last_record_3_20",
    "mths_since_last_record_21_31",
    "mths_since_last_record_32_80",
    "mths_since_last_record_81_86",
    "mths_since_last_record_86_more"
  )
)
list_of_reference_categories <- append(
  list_of_reference_categories,
  "mths_since_last_record_Missing"
)
# Export the dummy-variable names, the reference categories and the
# preprocessed data set, each without row names.
write.csv(x = list_of_dummy_variables, file = "pd_dummies.csv", row.names = FALSE)
write.csv(x = list_of_reference_categories, file = "pd_dummies_reference.csv", row.names = FALSE)
write.csv(x = data, file = "pd_preprocessed_loan_data_2007_2014.csv", row.names = FALSE)
---
title: "Credit Risk Modeling - PD Model Data Preparation"
output: html_notebook
---

If you have any issues, questions or suggestions, feel free to reach out to me via e-mail <wieczynskipawel@gmail.com> or [Linkedin](https://www.linkedin.com/in/pawel-wieczynski/). You can also visit my [Github](https://github.com/pawel-wieczynski).

This is R replication of the code and exercises from the Udemy course ["Credit Risk Modeling in Python 2022"](https://www.udemy.com/course/credit-risk-modeling-in-python/).

```{r load_libraries}
# Install pacman on first use. requireNamespace() only checks availability;
# require() is meant for attaching a package, and pacman is always called
# through `pacman::` below, so it does not need to be attached.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
# Load (installing if necessary) the packages used throughout the notebook.
pacman::p_load(dplyr, tidyr, ggplot2)
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 20)
```

We load the data that was preprocessed in the previous script, which you can find [here](https://rpubs.com/pawel-wieczynski/891815).
```{r load_data}
# Read the loan data written by the previous preprocessing script.
data <- read.csv(file = "preprocessed_loan_data_2007_2014.csv")
```

There are several loan statuses, from which we derive a binary column indicating whether a given credit was good or bad. The categories *Current*, *Fully Paid* and *Charged Off* account for $96.8\%$ of all the credits.
```{r loan_status}
# Count and share of every loan_status, most frequent first.
loan_status_summary <- data %>%
  group_by(loan_status) %>%
  summarise(count = n()) %>%
  mutate(proportion = count / sum(count)) %>%
  arrange(desc(count))

loan_status_summary

# Bar chart of the five most frequent statuses.
ggplot(
  top_n(loan_status_summary, 5, count),
  aes(x = reorder(loan_status, -count), y = count)
) +
  geom_col(aes(fill = loan_status)) +
  theme_bw() +
  theme(legend.position = 'none') +
  labs(x = '', y = '')
```

Here we create the good_bad column. Our data set is imbalanced: $89\%$ of the credits are good and $11\%$ are bad.
```{r good_bad}
# good_bad is 1 for the statuses listed below and 0 for every other status.
data <- mutate(
  data,
  good_bad = as.numeric(
    loan_status %in% c(
      'Fully Paid',
      'Current',
      'In Grace Period',
      'Late (16-30 days)',
      'Does not meet the credit policy. Status:Fully Paid'
    )
  )
)

# Count and share of good and bad credits, most frequent first.
data %>%
  group_by(good_bad) %>%
  summarise(count = n()) %>%
  mutate(proportion = count / sum(count)) %>%
  arrange(desc(count))
```

Split data into training set and test set in proportions 80:20.
```{r data_split}
# Fixed seed so the split is reproducible.
set.seed(2137)
n_obs <- nrow(data)
# Row numbers of the training set: 80% of the rows, drawn without replacement.
train_index <- sample.int(n_obs, size = floor(0.8 * n_obs))
```

### Fine classing and coarse classing

Function *woe()* groups data by a chosen attribute and calculates the Weight of Evidence (plus a number of other proportions and ratios) and the Information Value for that attribute. If *discrete=TRUE*, data is arranged by WoE (suitable for qualitative attributes, whose categories have no natural order). If *discrete=FALSE*, data is arranged by the attribute itself in ascending order (suitable for quantitative attributes). Function *plot_by_woe()* plots attribute values against their WoE. In this function we've added an additional feature: the bubble size depends on *n_obs*. We use the *tidyverse* environment extensively to define the functions below.
```{r woe}
# Weight of Evidence (WoE) and Information Value (IV) of a single attribute.
#
# Arguments:
#   df        data frame holding `column` and a `good_bad` column
#             (0 is counted as bad, every other value as good).
#   train     row selector applied as df[train, ]; only those rows are used.
#   column    name of the attribute to analyse, given as a string.
#   discrete  TRUE: rows are sorted by WoE; FALSE: rows are sorted by the
#             attribute itself in ascending order.
#
# Returns one row per attribute value with the columns n_bad, n_good, n_obs,
# prop_n_obs, prop_bad, prop_good, prop_n_bad, prop_n_good, WoE,
# diff_prop_good, diff_WoE and IV (the attribute total, repeated on every row).
#
# Stops with an error when `df` has no good_bad column.
woe <- function(df, train, column, discrete = TRUE) {
  if (!("good_bad" %in% colnames(df))) {
    stop('Lack of column good_bad in the data frame.')
  }

  # The sort key has to be chosen with a scalar if/else: ifelse() with a
  # length-one test returns a single value, which arrange() recycles into a
  # constant key, so the rows were never actually reordered.
  sort_col <- if (discrete) "WoE" else column

  df[train, ] %>%
    group_by(across(all_of(column)), good_bad) %>%
    summarise(n_obs = n(), .groups = "drop") %>%
    mutate(good_bad = ifelse(good_bad == 0, "bad", "good")) %>%
    pivot_wider(
      names_from = good_bad, names_prefix = "n_",
      values_from = n_obs, values_fill = 0
    ) %>%
    mutate(n_obs = n_bad + n_good) %>%
    mutate(
      prop_n_obs = n_obs / sum(n_obs),
      prop_bad = n_bad / n_obs,
      prop_good = n_good / n_obs,
      prop_n_bad = n_bad / sum(n_bad),
      prop_n_good = n_good / sum(n_good)
    ) %>%
    # NOTE: a value with zero good or zero bad observations gets an infinite
    # WoE, which also makes the summed IV non-finite.
    mutate(WoE = log(prop_n_good / prop_n_bad)) %>%
    arrange(.data[[sort_col]]) %>%
    # Differences between neighbouring rows in the sorted order; the first
    # row has no predecessor, hence NaN.
    mutate(
      diff_prop_good = c(NaN, abs(diff(prop_good))),
      diff_WoE = c(NaN, abs(diff(WoE))),
      IV = (prop_n_good - prop_n_bad) * WoE
    ) %>%
    mutate(IV = sum(IV))
}

# Bubble plot of WoE for every value of an attribute.
#
# Arguments:
#   df_woe    table whose first column is the attribute and which also has
#             the columns WoE and n_obs (as returned by woe()).
#   discrete  TRUE: x axis ordered by WoE; FALSE: ordered by the attribute.
#   rotation  angle of the x axis labels.
#
# Returns the ggplot object; point size reflects n_obs.
plot_by_woe <- function(df_woe, discrete = TRUE, rotation = 0) {
  col_name <- colnames(df_woe)[1]
  sort_col <- if (discrete) "WoE" else col_name

  x_axis_text <- element_text(angle = rotation, vjust = 1, hjust = 1)
  plot_title <- paste0('Weight of evidence of the attribute ', col_name)

  ggplot(df_woe, aes(reorder(.data[[col_name]], .data[[sort_col]]), WoE)) +
    geom_point(aes(size = n_obs)) +
    theme_bw() +
    theme(axis.text.x = x_axis_text) +
    labs(x = '', y = '', title = plot_title, size = 'Size')
}
```

Preprocessing attribute *grade* was explained in chapter 29 of the Udemy course. This attribute has medium Information Value (0.29).
```{r woe_grade}
# WoE table and plot for grade.
woe_grade <- woe(df = data, train = train_index, column = "grade")
select(woe_grade, grade, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_grade)

# Start the two running lists: all dummy columns kept for the model and the
# reference category of each attribute. Grades A..G are kept as they are.
list_of_dummy_variables <- paste0("grade_", LETTERS[1:7])

list_of_reference_categories <- "grade_G"
```

Preprocessing attribute *home_ownership* was explained in chapter 29 of the Udemy course. This attribute has weak Information Value (0.02).
```{r home_ownership}
# WoE table and plot for home_ownership.
woe_home_ownership <- woe(df = data, train = train_index, column = "home_ownership")
select(woe_home_ownership, home_ownership, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_home_ownership)

# Coarse classing: RENT, OTHER, NONE and ANY are merged into one dummy.
data <- mutate(
  data,
  home_ownership_RENT_OTHER_NONE_ANY =
    home_ownership_RENT + home_ownership_OTHER +
    home_ownership_NONE + home_ownership_ANY
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c("home_ownership_RENT_OTHER_NONE_ANY", "home_ownership_OWN", "home_ownership_MORTGAGE")
)

list_of_reference_categories <- append(
  list_of_reference_categories,
  "home_ownership_RENT_OTHER_NONE_ANY"
)
```

Preprocessing attribute *addr_state* was explained in chapter 30 of the Udemy course.
```{r addr_state}
# WoE table and plot for addr_state.
woe_addr_state <- woe(df = data, train = train_index, column = "addr_state")
select(woe_addr_state, addr_state, n_obs, WoE, IV)
plot_by_woe(df_woe = woe_addr_state, rotation = 90)

# Coarse classing: single-state dummies are summed into grouped dummies.
# NY, CA and TX stay as individual categories.
data <- mutate(
  data,
  addr_state_NE_IA_NV_FL_HI_AL =
    addr_state_NE + addr_state_IA + addr_state_NV +
    addr_state_FL + addr_state_HI + addr_state_AL,
  addr_state_NM_VA = addr_state_NM + addr_state_VA,
  addr_state_OK_TN_MO_LA_MD_NC =
    addr_state_OK + addr_state_TN + addr_state_MO +
    addr_state_LA + addr_state_MD + addr_state_NC,
  addr_state_UT_KY_AZ_NJ =
    addr_state_UT + addr_state_KY + addr_state_AZ + addr_state_NJ,
  addr_state_AR_MI_PA_OH_MN =
    addr_state_AR + addr_state_MI + addr_state_PA +
    addr_state_OH + addr_state_MN,
  addr_state_RI_MA_DE_SD_IN =
    addr_state_RI + addr_state_MA + addr_state_DE +
    addr_state_SD + addr_state_IN,
  addr_state_GA_WA_OR = addr_state_GA + addr_state_WA + addr_state_OR,
  addr_state_WI_MT = addr_state_WI + addr_state_MT,
  addr_state_IL_CT = addr_state_IL + addr_state_CT,
  addr_state_KS_SC_CO_VT_AK_MS =
    addr_state_KS + addr_state_SC + addr_state_CO +
    addr_state_VT + addr_state_AK + addr_state_MS,
  addr_state_WV_NH_WY_DC_ME_ID =
    addr_state_WV + addr_state_NH + addr_state_WY +
    addr_state_DC + addr_state_ME + addr_state_ID
)

list_of_dummy_variables <- append(
  list_of_dummy_variables,
  c(
    "addr_state_NE_IA_NV_FL_HI_AL", "addr_state_NM_VA", "addr_state_NY",
    "addr_state_OK_TN_MO_LA_MD_NC", "addr_state_CA", "addr_state_UT_KY_AZ_NJ",
    "addr_state_AR_MI_PA_OH_MN", "addr_state_RI_MA_DE_SD_IN", "addr_state_GA_WA_OR",
    "addr_state_WI_MT", "addr_state_IL_CT", "addr_state_KS_SC_CO_VT_AK_MS",
    "addr_state_TX", "addr_state_WV_NH_WY_DC_ME_ID"
  )
)

list_of_reference_categories <- append(
  list_of_reference_categories,
  "addr_state_NE_IA_NV_FL_HI_AL"
)

```

Preprocessing attribute *verification_status* was left as homework of the Udemy course. This attribute has weak Information Value (0.02).
```{r verification_status}
# WoE / IV of `verification_status`, computed on the rows selected by
# `train_index`.
woe_verification_status <- woe(data, train_index, 'verification_status')
select(woe_verification_status, verification_status, n_obs, WoE, IV)
plot_by_woe(woe_verification_status)

# No coarse classing here: the three existing dummies are registered as-is.
list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'verification_status_Verified',
  'verification_status_Source.Verified',
  'verification_status_Not.Verified'
)

# 'Verified' is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'verification_status_Verified'
)
```

Preprocessing attribute *purpose* was left as homework of the Udemy course. This attribute has weak Information Value (0.04).
```{r purpose}
# WoE / IV of `purpose`, computed on the rows selected by `train_index`.
woe_purpose <- woe(data, train_index, 'purpose')
select(woe_purpose, purpose, n_obs, WoE, IV)
plot_by_woe(woe_purpose, rotation = 45)

# Coarse classing: each grouped dummy is the sum of the per-purpose indicator
# columns named in it.
data <- mutate(
  data,
  purpose_small_business_educational =
    purpose_small_business + purpose_educational,
  purpose_renewable_energy_moving_other_house_medical =
    purpose_renewable_energy + purpose_moving + purpose_other +
    purpose_house + purpose_medical,
  purpose_wedding_vacation_debt_consolidation =
    purpose_wedding + purpose_vacation + purpose_debt_consolidation,
  purpose_major_purchase_home_improvement =
    purpose_major_purchase + purpose_home_improvement,
  purpose_car_credit_card =
    purpose_car + purpose_credit_card
)

# Register the dummies for this attribute.
list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'purpose_small_business_educational',
  'purpose_renewable_energy_moving_other_house_medical',
  'purpose_wedding_vacation_debt_consolidation',
  'purpose_major_purchase_home_improvement',
  'purpose_car_credit_card'
)

# The first group is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'purpose_small_business_educational'
)
```

Preprocessing attribute *initial_list_status* was left as homework of the Udemy course. This attribute has weak Information Value (0.03).
```{r initial_list_status}
# WoE / IV of `initial_list_status`, computed on the rows selected by
# `train_index`.
woe_initial_list_status <- woe(data, train_index, 'initial_list_status')
select(woe_initial_list_status, initial_list_status, n_obs, WoE, IV)
plot_by_woe(woe_initial_list_status)

# Both existing dummies are registered as-is; status 'f' is the reference.
list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'initial_list_status_f',
  'initial_list_status_w'
)

list_of_reference_categories <- c(
  list_of_reference_categories,
  'initial_list_status_f'
)
```

Preprocessing attribute *term* was explained in chapter 33 of the Udemy course. This attribute has weak Information Value (0.04).
```{r term}
# WoE / IV of `term`, computed on the rows selected by `train_index`.
woe_term <- woe(data, train_index, 'term', discrete = FALSE)
select(woe_term, term, n_obs, WoE, IV)
plot_by_woe(woe_term)

# One 0/1 indicator per term value handled here (36 and 60).
data <- mutate(
  data,
  term_36 = as.numeric(term == 36),
  term_60 = as.numeric(term == 60)
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'term_36',
  'term_60'
)

# The 60 term is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'term_60'
)
```

Preprocessing attribute *emp_length* was explained in chapter 33 of the Udemy course. This attribute has no predictive power according to the Information Value (<0.02).
```{r emp_length}
# WoE / IV of `emp_length`, computed on the rows selected by `train_index`.
woe_emp_length <- woe(data, train_index, 'emp_length', discrete = FALSE)
select(woe_emp_length, emp_length, n_obs, WoE, IV)
plot_by_woe(woe_emp_length, discrete = FALSE)

# Coarse classing into 0/1 indicators: 0, 1-4, 5-6, 7-9 and 10.
data <- mutate(
  data,
  emp_length_0 = as.numeric(emp_length == 0),
  emp_length_1_4 = as.numeric(emp_length >= 1 & emp_length <= 4),
  emp_length_5_6 = as.numeric(emp_length >= 5 & emp_length <= 6),
  emp_length_7_9 = as.numeric(emp_length >= 7 & emp_length <= 9),
  emp_length_10 = as.numeric(emp_length == 10)
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'emp_length_0',
  'emp_length_1_4',
  'emp_length_5_6',
  'emp_length_7_9',
  'emp_length_10'
)

# The zero class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'emp_length_0'
)
```

Preprocessing attribute *months_since_issue_d* was explained in chapter 34 of the Udemy course. This attribute has medium Information Value (0.11).
```{r months_since_issue_d, warning=FALSE}
# Fine classing: bin `months_since_issue_d` into 50 intervals with cut().
data <- mutate(data, months_since_issue_d_cut = cut(months_since_issue_d, 50))

# WoE / IV per interval, computed on the rows selected by `train_index`.
woe_months_since_issue_d_cut <- woe(
  data, train_index, 'months_since_issue_d_cut', discrete = FALSE
)
select(woe_months_since_issue_d_cut, months_since_issue_d_cut, n_obs, WoE, IV)
plot_by_woe(woe_months_since_issue_d_cut, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators over the month ranges in the names.
data <- mutate(
  data,
  months_since_issue_d_less_38 =
    as.numeric(months_since_issue_d < 38),
  months_since_issue_d_38_39 =
    as.numeric(months_since_issue_d >= 38 & months_since_issue_d <= 39),
  months_since_issue_d_40_41 =
    as.numeric(months_since_issue_d >= 40 & months_since_issue_d <= 41),
  months_since_issue_d_42_48 =
    as.numeric(months_since_issue_d >= 42 & months_since_issue_d <= 48),
  months_since_issue_d_49_52 =
    as.numeric(months_since_issue_d >= 49 & months_since_issue_d <= 52),
  months_since_issue_d_53_64 =
    as.numeric(months_since_issue_d >= 53 & months_since_issue_d <= 64),
  months_since_issue_d_65_84 =
    as.numeric(months_since_issue_d >= 65 & months_since_issue_d <= 84),
  months_since_issue_d_84_more =
    as.numeric(months_since_issue_d > 84)
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'months_since_issue_d_less_38',
  'months_since_issue_d_38_39',
  'months_since_issue_d_40_41',
  'months_since_issue_d_42_48',
  'months_since_issue_d_49_52',
  'months_since_issue_d_53_64',
  'months_since_issue_d_65_84',
  'months_since_issue_d_84_more'
)

# The lowest class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'months_since_issue_d_less_38'
)
```

Preprocessing attribute *int_rate* was explained in chapter 34 of the Udemy course. This attribute has strong Information Value (0.35).
```{r int_rate, warning=FALSE}
# Fine classing: bin `int_rate` into 50 intervals with cut().
data <- data %>% 
  mutate(int_rate_cut = cut(int_rate, 50))

# WoE / IV per interval, computed on the rows selected by `train_index`.
woe_int_rate_cut <- woe(data, train_index, 'int_rate_cut', discrete = FALSE)
woe_int_rate_cut %>% select(int_rate_cut, n_obs, WoE, IV)
plot_by_woe(woe_int_rate_cut, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators.
# The classes must be mutually exclusive: every class after the second uses a
# strict lower bound, so a rate lying exactly on a cut point (12.025, 15.74,
# 20.281) is flagged in one dummy only instead of two adjacent ones.
data <- data %>%
  mutate(int_rate_less_9.548 = if_else(int_rate < 9.548, 1, 0)
         ,int_rate_9.548_12.025 = if_else(int_rate >= 9.548 & int_rate <= 12.025, 1, 0)
         ,int_rate_12.025_15.74 = if_else(int_rate > 12.025 & int_rate <= 15.74, 1, 0)
         ,int_rate_15.74_20.281 = if_else(int_rate > 15.74 & int_rate <= 20.281, 1, 0)
         ,int_rate_20.281_more = if_else(int_rate > 20.281, 1, 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables
  ,'int_rate_less_9.548'
  ,'int_rate_9.548_12.025'
  ,'int_rate_12.025_15.74'
  ,'int_rate_15.74_20.281'
  ,'int_rate_20.281_more'
)

# The lowest class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories
  ,'int_rate_less_9.548'
)
```

Preprocessing attribute *months_since_earliest_cr_line* was left as homework of the Udemy course. This attribute has no predictive power according to the Information Value (<0.02).
```{r months_since_earliest_cr_line, warning=FALSE}
# Fine classing: bin `months_since_earliest_cr_line` into 50 intervals.
data <- data %>% 
  mutate(months_since_earliest_cr_line_cut = cut(months_since_earliest_cr_line, 50))

# WoE / IV per interval, computed on the rows selected by `train_index`.
woe_months_since_earliest_cr_line_cut <- woe(data, train_index, 'months_since_earliest_cr_line_cut', discrete = FALSE)
woe_months_since_earliest_cr_line_cut %>% select(months_since_earliest_cr_line_cut, n_obs, WoE, IV)
plot_by_woe(woe_months_since_earliest_cr_line_cut, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators.
# The first class uses `<= 140` (not `< 140`): the next class starts at 141, so
# with a strict bound the value 140 would belong to no class at all. The column
# name is kept unchanged so downstream references still resolve.
# NOTE(review): the bounds assume whole-month values - confirm upstream.
data <- data %>%
  mutate(months_since_earliest_cr_line_less_140 = if_else(months_since_earliest_cr_line <= 140, 1, 0)
         ,months_since_earliest_cr_line_141_164 = if_else(months_since_earliest_cr_line >= 141 
                                                         & months_since_earliest_cr_line <= 164, 1, 0)
         ,months_since_earliest_cr_line_165_247 = if_else(months_since_earliest_cr_line >= 165
                                                         & months_since_earliest_cr_line <= 247, 1, 0)
         ,months_since_earliest_cr_line_248_270 = if_else(months_since_earliest_cr_line >= 248 
                                                         & months_since_earliest_cr_line <= 270, 1, 0)
         ,months_since_earliest_cr_line_271_352 = if_else(months_since_earliest_cr_line >= 271 
                                                         & months_since_earliest_cr_line <= 352, 1, 0)
         ,months_since_earliest_cr_line_352_more = if_else(months_since_earliest_cr_line > 352, 1, 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables
  ,'months_since_earliest_cr_line_less_140'
  ,'months_since_earliest_cr_line_141_164'
  ,'months_since_earliest_cr_line_165_247'
  ,'months_since_earliest_cr_line_248_270'
  ,'months_since_earliest_cr_line_271_352'
  ,'months_since_earliest_cr_line_352_more'
)

# The lowest class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories
  ,'months_since_earliest_cr_line_less_140'
)
```

Preprocessing attribute *installment* was left as homework of the Udemy course. This attribute has no predictive power according to the Information Value (<0.02).
```{r installment, warning=FALSE}
# Fine classing only: bin `installment` into 50 intervals and inspect its WoE.
# No dummy variables are derived from this attribute in this chunk.
data <- mutate(data, installment_cut = cut(installment, 50))

woe_installment_cut <- woe(
  data, train_index, 'installment_cut', discrete = FALSE
)
select(woe_installment_cut, installment_cut, n_obs, WoE, IV)
plot_by_woe(woe_installment_cut, discrete = FALSE, rotation = 90)
```

Preprocessing attribute *delinq_2yrs* left as homework of the Udemy course. 
```{r delinq_2yrs}
# WoE / IV of `delinq_2yrs`, computed on the rows selected by `train_index`.
woe_delinq_2yrs <- woe(data, train_index, 'delinq_2yrs', discrete = FALSE)
woe_delinq_2yrs %>% select(delinq_2yrs, n_obs, WoE, IV)
plot_by_woe(woe_delinq_2yrs, discrete = FALSE)

# Coarse classing into 0/1 indicators: 0, 1-3 and more than 3.
data <- data %>%
  mutate(delinq_2yrs_0 = if_else(delinq_2yrs == 0, 1, 0)
         ,delinq_2yrs_1_3 = if_else(delinq_2yrs >= 1 & delinq_2yrs <= 3, 1, 0)
         ,delinq_2yrs_3_more = if_else(delinq_2yrs > 3, 1, 0)
  )

# The registered names must match the column names created above exactly:
# 'delinq_2yrs_0' carries no trailing space, otherwise a lookup by name would
# not find the column.
list_of_dummy_variables <- c(
  list_of_dummy_variables
  ,'delinq_2yrs_0'
  ,'delinq_2yrs_1_3'
  ,'delinq_2yrs_3_more'
)

# The zero class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories
  ,'delinq_2yrs_0'
)
```

Preprocessing attribute *inq_last_6mths* left as homework of the Udemy course. 
```{r inq_last_6mths}
# WoE / IV of `inq_last_6mths`, computed on the rows selected by `train_index`.
woe_inq_last_6mths <- woe(data, train_index, 'inq_last_6mths', discrete = FALSE)
select(woe_inq_last_6mths, inq_last_6mths, n_obs, WoE, IV)
plot_by_woe(woe_inq_last_6mths, discrete = FALSE)

# Coarse classing into 0/1 indicators: 0, 1-2, 3-6 and more than 6.
data <- mutate(
  data,
  inq_last_6mths_0 = as.numeric(inq_last_6mths == 0),
  inq_last_6mths_1_2 = as.numeric(inq_last_6mths >= 1 & inq_last_6mths <= 2),
  inq_last_6mths_3_6 = as.numeric(inq_last_6mths >= 3 & inq_last_6mths <= 6),
  inq_last_6mths_6_more = as.numeric(inq_last_6mths > 6)
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'inq_last_6mths_0',
  'inq_last_6mths_1_2',
  'inq_last_6mths_3_6',
  'inq_last_6mths_6_more'
)

# The zero class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'inq_last_6mths_0'
)
```

Preprocessing attribute *open_acc* left as homework of the Udemy course. 
```{r open_acc}
# WoE / IV of `open_acc`, computed on the rows selected by `train_index`.
woe_open_acc <- woe(data, train_index, 'open_acc', discrete = FALSE)
select(woe_open_acc, open_acc, n_obs, WoE, IV)
plot_by_woe(woe_open_acc, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators over the count ranges in the names.
data <- mutate(
  data,
  open_acc_0 = as.numeric(open_acc == 0),
  open_acc_1_3 = as.numeric(open_acc >= 1 & open_acc <= 3),
  open_acc_4_12 = as.numeric(open_acc >= 4 & open_acc <= 12),
  open_acc_13_17 = as.numeric(open_acc >= 13 & open_acc <= 17),
  open_acc_18_22 = as.numeric(open_acc >= 18 & open_acc <= 22),
  open_acc_23_25 = as.numeric(open_acc >= 23 & open_acc <= 25),
  open_acc_26_30 = as.numeric(open_acc >= 26 & open_acc <= 30),
  open_acc_30_more = as.numeric(open_acc > 30)
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'open_acc_0',
  'open_acc_1_3',
  'open_acc_4_12',
  'open_acc_13_17',
  'open_acc_18_22',
  'open_acc_23_25',
  'open_acc_26_30',
  'open_acc_30_more'
)

# The zero class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'open_acc_0'
)
```

Preprocessing attribute *pub_rec* left as homework of the Udemy course. 
```{r pub_rec}
# WoE / IV of `pub_rec`, computed on the rows selected by `train_index`.
woe_pub_rec <- woe(data, train_index, 'pub_rec', discrete = FALSE)
select(woe_pub_rec, pub_rec, n_obs, WoE, IV)
plot_by_woe(woe_pub_rec, discrete = FALSE)

# Coarse classing into 0/1 indicators: 0-2, 3-4 and more than 4.
data <- mutate(
  data,
  pub_rec_0_2 = as.numeric(pub_rec >= 0 & pub_rec <= 2),
  pub_rec_3_4 = as.numeric(pub_rec >= 3 & pub_rec <= 4),
  pub_rec_4_more = as.numeric(pub_rec > 4)
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'pub_rec_0_2',
  'pub_rec_3_4',
  'pub_rec_4_more'
)

# The lowest class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'pub_rec_0_2'
)
```

Preprocessing attribute *total_acc* left as homework of the Udemy course. 
```{r total_acc, warning=FALSE}
# Fine classing: bin `total_acc` into 50 intervals with cut().
data <- mutate(data, total_acc_cut = cut(total_acc, 50))

# WoE / IV per interval, computed on the rows selected by `train_index`.
woe_total_acc_cut <- woe(data, train_index, 'total_acc_cut', discrete = FALSE)
select(woe_total_acc_cut, total_acc_cut, n_obs, WoE, IV)
plot_by_woe(woe_total_acc_cut, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators: below 28, 28-51 and above 51.
data <- mutate(
  data,
  total_acc_less_28 = as.numeric(total_acc < 28),
  total_acc_28_51 = as.numeric(total_acc >= 28 & total_acc <= 51),
  total_acc_51_more = as.numeric(total_acc > 51)
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'total_acc_less_28',
  'total_acc_28_51',
  'total_acc_51_more'
)

# The lowest class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'total_acc_less_28'
)
```

Preprocessing attribute *acc_now_delinq* left as homework of the Udemy course. 
```{r acc_now_delinq}
# WoE / IV of `acc_now_delinq`, computed on the rows selected by `train_index`.
woe_acc_now_delinq <- woe(data, train_index, 'acc_now_delinq', discrete = FALSE)
select(woe_acc_now_delinq, acc_now_delinq, n_obs, WoE, IV)
plot_by_woe(woe_acc_now_delinq, discrete = FALSE)

# Two 0/1 indicators: exactly zero versus anything above zero.
data <- mutate(
  data,
  acc_now_delinq_0 = as.numeric(acc_now_delinq == 0),
  acc_now_delinq_0_more = as.numeric(acc_now_delinq > 0)
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'acc_now_delinq_0',
  'acc_now_delinq_0_more'
)

# The zero class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'acc_now_delinq_0'
)
```

Preprocessing attribute *total_rev_hi_lim* left as homework of the Udemy course. 
```{r total_rev_hi_lim, warning=FALSE}
# Fine classing: bin `total_rev_hi_lim` into 50 intervals with cut().
data <- data %>% 
  mutate(total_rev_hi_lim_cut = cut(total_rev_hi_lim, 50))

# WoE / IV per interval, computed on the rows selected by `train_index`.
woe_total_rev_hi_lim_cut <- woe(data, train_index, 'total_rev_hi_lim_cut', discrete = FALSE)
woe_total_rev_hi_lim_cut %>% select(total_rev_hi_lim_cut, n_obs, WoE, IV)
plot_by_woe(woe_total_rev_hi_lim_cut, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators, with a dedicated class for missing
# values (`missing = 0` keeps NA rows out of every numeric class).
# The classes must be mutually exclusive: every class after the second uses a
# strict lower bound, so a limit lying exactly on a cut point (10000, 20000,
# 30000, 40000, 55000) is flagged in one dummy only instead of two.
data <- data %>%
  mutate(total_rev_hi_lim_Missing = if_else(is.na(total_rev_hi_lim), 1, 0)
         ,total_rev_hi_lim_less_5 = if_else(total_rev_hi_lim < 5000, 1, 0, missing = 0)
         ,total_rev_hi_lim_5_10 = if_else(total_rev_hi_lim >= 5000 & total_rev_hi_lim <= 10000, 1, 0, missing = 0)
         ,total_rev_hi_lim_10_20 = if_else(total_rev_hi_lim > 10000 & total_rev_hi_lim <= 20000, 1, 0, missing = 0)
         ,total_rev_hi_lim_20_30 = if_else(total_rev_hi_lim > 20000 & total_rev_hi_lim <= 30000, 1, 0, missing = 0)
         ,total_rev_hi_lim_30_40 = if_else(total_rev_hi_lim > 30000 & total_rev_hi_lim <= 40000, 1, 0, missing = 0)
         ,total_rev_hi_lim_40_55 = if_else(total_rev_hi_lim > 40000 & total_rev_hi_lim <= 55000, 1, 0, missing = 0)
         ,total_rev_hi_lim_55_95 = if_else(total_rev_hi_lim > 55000 & total_rev_hi_lim <= 95000, 1, 0, missing = 0)
         ,total_rev_hi_lim_95_more = if_else(total_rev_hi_lim > 95000, 1, 0, missing = 0)
  )



list_of_dummy_variables <- c(
  list_of_dummy_variables
  ,'total_rev_hi_lim_Missing'
  ,'total_rev_hi_lim_less_5'
  ,'total_rev_hi_lim_5_10'
  ,'total_rev_hi_lim_10_20'
  ,'total_rev_hi_lim_20_30'
  ,'total_rev_hi_lim_30_40'
  ,'total_rev_hi_lim_40_55'
  ,'total_rev_hi_lim_55_95'
  ,'total_rev_hi_lim_95_more'
)

# The missing-value class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories
  ,'total_rev_hi_lim_Missing'
)
```

Preprocessing attribute *annual_inc* was explained in chapter 36 of the Udemy course.
```{r annual_inc, warning=FALSE}
# Fine classing: bin `annual_inc` into 50 intervals with cut().
data <- data %>% 
  mutate(annual_inc_cut = cut(annual_inc, 50))

# WoE / IV per interval, computed on the rows selected by `train_index`.
woe_annual_inc_cut <- woe(data, train_index, 'annual_inc_cut', discrete = FALSE)
woe_annual_inc_cut %>% select(annual_inc_cut, n_obs, WoE, IV)
plot_by_woe(woe_annual_inc_cut, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators.
# The classes must be mutually exclusive: every class after the second uses a
# strict lower bound, so an income lying exactly on a cut point (30000, 40000,
# ..., 120000) is flagged in one dummy only instead of two adjacent ones.
data <- data %>%
  mutate(annual_inc_less_20 = if_else(annual_inc < 20000, 1, 0)
         ,annual_inc_20_30 = if_else(annual_inc >= 20000 & annual_inc <= 30000, 1, 0)
         ,annual_inc_30_40 = if_else(annual_inc > 30000 & annual_inc <= 40000, 1, 0)
         ,annual_inc_40_50 = if_else(annual_inc > 40000 & annual_inc <= 50000, 1, 0)
         ,annual_inc_50_60 = if_else(annual_inc > 50000 & annual_inc <= 60000, 1, 0)
         ,annual_inc_60_70 = if_else(annual_inc > 60000 & annual_inc <= 70000, 1, 0)
         ,annual_inc_70_80 = if_else(annual_inc > 70000 & annual_inc <= 80000, 1, 0)
         ,annual_inc_80_90 = if_else(annual_inc > 80000 & annual_inc <= 90000, 1, 0)
         ,annual_inc_90_100 = if_else(annual_inc > 90000 & annual_inc <= 100000, 1, 0)
         ,annual_inc_100_120 = if_else(annual_inc > 100000 & annual_inc <= 120000, 1, 0)
         ,annual_inc_120_140 = if_else(annual_inc > 120000 & annual_inc <= 140000, 1, 0)
         ,annual_inc_140_more = if_else(annual_inc > 140000, 1, 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables
  ,'annual_inc_less_20'
  ,'annual_inc_20_30'
  ,'annual_inc_30_40'
  ,'annual_inc_40_50'
  ,'annual_inc_50_60'
  ,'annual_inc_60_70'
  ,'annual_inc_70_80'
  ,'annual_inc_80_90'
  ,'annual_inc_90_100'
  ,'annual_inc_100_120'
  ,'annual_inc_120_140'
  ,'annual_inc_140_more'
)

# The lowest class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories
  ,'annual_inc_less_20'
)
```

Preprocessing attribute *mths_since_last_delinq* was explained in chapter 36 of the Udemy course.
```{r mths_since_last_delinq, warning=FALSE}
# Fine classing: bin `mths_since_last_delinq` into 50 intervals with cut().
data <- mutate(
  data,
  mths_since_last_delinq_cut = cut(mths_since_last_delinq, 50)
)

# WoE / IV per interval, computed on the rows selected by `train_index`.
woe_mths_since_last_delinq_cut <- woe(
  data, train_index, 'mths_since_last_delinq_cut', discrete = FALSE
)
select(woe_mths_since_last_delinq_cut, mths_since_last_delinq_cut, n_obs, WoE, IV)
plot_by_woe(woe_mths_since_last_delinq_cut, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators. Missing values get their own class;
# coalesce(..., 0) makes NA rows score 0 in every numeric class.
data <- mutate(
  data,
  mths_since_last_delinq_Missing =
    as.numeric(is.na(mths_since_last_delinq)),
  mths_since_last_delinq_0_3 = coalesce(
    as.numeric(mths_since_last_delinq >= 0 & mths_since_last_delinq <= 3), 0
  ),
  mths_since_last_delinq_4_30 = coalesce(
    as.numeric(mths_since_last_delinq >= 4 & mths_since_last_delinq <= 30), 0
  ),
  mths_since_last_delinq_31_56 = coalesce(
    as.numeric(mths_since_last_delinq >= 31 & mths_since_last_delinq <= 56), 0
  ),
  mths_since_last_delinq_56_more = coalesce(
    as.numeric(mths_since_last_delinq > 56), 0
  )
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'mths_since_last_delinq_Missing',
  'mths_since_last_delinq_0_3',
  'mths_since_last_delinq_4_30',
  'mths_since_last_delinq_31_56',
  'mths_since_last_delinq_56_more'
)

# The missing-value class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'mths_since_last_delinq_Missing'
)
```

Preprocessing attribute *dti* left as homework of the Udemy course. 
```{r dti, warning=FALSE}
# Fine classing: bin `dti` into 50 intervals with cut().
data <- data %>% 
  mutate(dti_cut = cut(dti, 50))

# WoE / IV per interval, computed on the rows selected by `train_index`.
woe_dti_cut <- woe(data, train_index, 'dti_cut', discrete = FALSE)
woe_dti_cut %>% select(dti_cut, n_obs, WoE, IV)
plot_by_woe(woe_dti_cut, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators.
# The classes must be mutually exclusive: every class after the second uses a
# strict lower bound, so a value lying exactly on a cut point (3.5, 7.7, 10.5,
# ...) is flagged in one dummy only instead of two adjacent ones.
data <- data %>%
  mutate(dti_less_1.4 = if_else(dti < 1.4, 1, 0)
         ,dti_1.4_3.5 = if_else(dti >= 1.4 & dti <= 3.5, 1, 0)
         ,dti_3.5_7.7 = if_else(dti > 3.5 & dti <= 7.7, 1, 0)
         ,dti_7.7_10.5 = if_else(dti > 7.7 & dti <= 10.5, 1, 0)
         ,dti_10.5_16.1 = if_else(dti > 10.5 & dti <= 16.1, 1, 0)
         ,dti_16.1_20.3 = if_else(dti > 16.1 & dti <= 20.3, 1, 0)
         ,dti_20.3_21.7 = if_else(dti > 20.3 & dti <= 21.7, 1, 0)
         ,dti_21.7_22.4 = if_else(dti > 21.7 & dti <= 22.4, 1, 0)
         ,dti_22.4_35 = if_else(dti > 22.4 & dti <= 35, 1, 0)
         ,dti_35_more = if_else(dti > 35, 1, 0)
  )

list_of_dummy_variables <- c(
  list_of_dummy_variables
  ,'dti_less_1.4'
  ,'dti_1.4_3.5'
  ,'dti_3.5_7.7'
  ,'dti_7.7_10.5'
  ,'dti_10.5_16.1'
  ,'dti_16.1_20.3'
  ,'dti_20.3_21.7'
  ,'dti_21.7_22.4'
  ,'dti_22.4_35'
  ,'dti_35_more'
)

# The lowest class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories
  ,'dti_less_1.4'
)
```

Preprocessing attribute *mths_since_last_record* was left as homework of the Udemy course. 
```{r mths_since_last_record, warning=FALSE}
# Fine classing: bin `mths_since_last_record` into 50 intervals with cut().
data <- mutate(
  data,
  mths_since_last_record_cut = cut(mths_since_last_record, 50)
)

# WoE / IV per interval, computed on the rows selected by `train_index`.
woe_mths_since_last_record_cut <- woe(
  data, train_index, 'mths_since_last_record_cut', discrete = FALSE
)
select(woe_mths_since_last_record_cut, mths_since_last_record_cut, n_obs, WoE, IV)
plot_by_woe(woe_mths_since_last_record_cut, discrete = FALSE, rotation = 90)

# Coarse classing into 0/1 indicators. Missing values get their own class;
# coalesce(..., 0) makes NA rows score 0 in every numeric class.
data <- mutate(
  data,
  mths_since_last_record_Missing =
    as.numeric(is.na(mths_since_last_record)),
  mths_since_last_record_0_2 = coalesce(
    as.numeric(mths_since_last_record >= 0 & mths_since_last_record <= 2), 0
  ),
  mths_since_last_record_3_20 = coalesce(
    as.numeric(mths_since_last_record >= 3 & mths_since_last_record <= 20), 0
  ),
  mths_since_last_record_21_31 = coalesce(
    as.numeric(mths_since_last_record >= 21 & mths_since_last_record <= 31), 0
  ),
  mths_since_last_record_32_80 = coalesce(
    as.numeric(mths_since_last_record >= 32 & mths_since_last_record <= 80), 0
  ),
  mths_since_last_record_81_86 = coalesce(
    as.numeric(mths_since_last_record >= 81 & mths_since_last_record <= 86), 0
  ),
  mths_since_last_record_86_more = coalesce(
    as.numeric(mths_since_last_record > 86), 0
  )
)

list_of_dummy_variables <- c(
  list_of_dummy_variables,
  'mths_since_last_record_Missing',
  'mths_since_last_record_0_2',
  'mths_since_last_record_3_20',
  'mths_since_last_record_21_31',
  'mths_since_last_record_32_80',
  'mths_since_last_record_81_86',
  'mths_since_last_record_86_more'
)

# The missing-value class is the reference category.
list_of_reference_categories <- c(
  list_of_reference_categories,
  'mths_since_last_record_Missing'
)
```

```{r write}
# Persist the two name registries built up across the chunks above.
write.csv(
  list_of_dummy_variables,
  file = 'pd_dummies.csv',
  row.names = FALSE
)
write.csv(
  list_of_reference_categories,
  file = 'pd_dummies_reference.csv',
  row.names = FALSE
)

# Persist the preprocessed data frame, including every derived column.
write.csv(
  data,
  file = 'pd_preprocessed_loan_data_2007_2014.csv',
  row.names = FALSE
)
```

