# Clear Working Space
rm(list = ls())
# Import the dataset (you can also import it manually with the "Import Dataset" button in the Environment pane, upper right)
data <- read.table("~/Downloads/german.data")
# Alternatively, save a CSV copy and import it via the "Import Dataset" button:
# write.csv(data, file = "german.csv")
# data <- german[, 2:22]   # drop the index column that write.csv adds
# Data Description
# See: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
# Define column names
colnames(data) <- c("CHK", "DUR", "CRH", "PUR", "CRA",
"SAB", "EMY", "INT", "STA", "GUA",
"RES", "PRP", "AGE", "OIP", "HOU",
"CRD", "JOB", "PPL", "TEL", "FOR", "TYP")
# Data Structure
str(data)
# Attribute Information:
#
# Attribute 1: (qualitative)
# Status of existing checking account - CHK
# A11 : ... < 0 DM
# A12 : 0 <= ... < 200 DM
# A13 : ... >= 200 DM / salary assignments for at least 1 year
# A14 : no checking account
#
# Attribute 2: (numerical)
# Duration in month - DUR
#
# Attribute 3: (qualitative)
# Credit history - CRH
# A30 : no credits taken/ all credits paid back duly
# A31 : all credits at this bank paid back duly
# A32 : existing credits paid back duly till now
# A33 : delay in paying off in the past
# A34 : critical account/ other credits existing (not at this bank)
#
# Attribute 4: (qualitative)
# Purpose - PUR
# A40 : car (new)
# A41 : car (used)
# A42 : furniture/equipment
# A43 : radio/television
# A44 : domestic appliances
# A45 : repairs
# A46 : education
# A47 : (vacation - does not exist?)
# A48 : retraining
# A49 : business
# A410 : others
#
# Attribute 5: (numerical)
# Credit amount - CRA
#
# Attribute 6: (qualitative)
# Savings account/bonds - SAB
# A61 : ... < 100 DM
# A62 : 100 <= ... < 500 DM
# A63 : 500 <= ... < 1000 DM
# A64 : ... >= 1000 DM
# A65 : unknown/ no savings account
#
# Attribute 7: (qualitative)
# Present employment since - EMY
# A71 : unemployed
# A72 : ... < 1 year
# A73 : 1 <= ... < 4 years
# A74 : 4 <= ... < 7 years
# A75 : ... >= 7 years
#
# Attribute 8: (numerical)
# Installment rate in percentage of disposable income - INT
#
# Attribute 9: (qualitative)
# Personal status and sex - STA
# A91 : male : divorced/separated
# A92 : female : divorced/separated/married
# A93 : male : single
# A94 : male : married/widowed
# A95 : female : single
#
# Attribute 10: (qualitative)
# Other debtors / guarantors - GUA
# A101 : none
# A102 : co-applicant
# A103 : guarantor
#
# Attribute 11: (numerical)
# Present residence since - RES
#
# Attribute 12: (qualitative)
# Property - PRP
# A121 : real estate
# A122 : if not A121 : building society savings agreement/ life insurance
# A123 : if not A121/A122 : car or other, not in attribute 6
# A124 : unknown / no property
#
# Attribute 13: (numerical)
# Age in years - AGE
#
# Attribute 14: (qualitative)
# Other installment plans - OIP
# A141 : bank
# A142 : stores
# A143 : none
#
# Attribute 15: (qualitative)
# Housing - HOU
# A151 : rent
# A152 : own
# A153 : for free
#
# Attribute 16: (numerical)
# Number of existing credits at this bank - CRD
#
# Attribute 17: (qualitative)
# Job - JOB
# A171 : unemployed/ unskilled - non-resident
# A172 : unskilled - resident
# A173 : skilled employee / official
# A174 : management/ self-employed/
# highly qualified employee/ officer
#
# Attribute 18: (numerical)
# Number of people being liable to provide maintenance for - PPL
#
# Attribute 19: (qualitative)
# Telephone - TEL
# A191 : none
# A192 : yes, registered under the customer's name
#
# Attribute 20: (qualitative)
# foreign worker - FOR
# A201 : yes
# A202 : no
# Data Cleaning
# Define: Good Type = 0 and Bad Type = 1
data$TYP <- data$TYP -1
summary(data$TYP)
# Expand the dataset (include dummy variables)
x.con <- data[,c(2,5,8,11,13,16,18)]
x.dum.1 <- model.matrix(~ CHK - 1, data)
x.dum.2 <- model.matrix(~ CRH - 1, data)
x.dum.3 <- model.matrix(~ PUR - 1, data)
x.dum.4 <- model.matrix(~ SAB - 1, data)
x.dum.5 <- model.matrix(~ EMY - 1, data)
x.dum.6 <- model.matrix(~ STA - 1, data)
x.dum.7 <- model.matrix(~ GUA - 1, data)
x.dum.8 <- model.matrix(~ PRP - 1, data)
x.dum.9 <- model.matrix(~ OIP - 1, data)
x.dum.10 <- model.matrix(~ HOU - 1, data)
x.dum.11 <- model.matrix(~ JOB - 1, data)
x.dum.12 <- model.matrix(~ TEL - 1, data)
x.dum.13 <- model.matrix(~ FOR - 1, data)
x.dum <- cbind(x.dum.1, x.dum.2, x.dum.3, x.dum.4, x.dum.5,
x.dum.6, x.dum.7, x.dum.8, x.dum.9, x.dum.10,
x.dum.11, x.dum.12, x.dum.13)
x <- cbind(x.con, x.dum)
rm(x.con, x.dum, x.dum.1, x.dum.2, x.dum.3, x.dum.4, x.dum.5,
x.dum.6, x.dum.7, x.dum.8, x.dum.9, x.dum.10, x.dum.11,
x.dum.12, x.dum.13)
y <- data[,21]
# Define a function: min-max normalization of each column to [0, 1]
factor_normalize <- function(x) {
  apply(x, 2, function(col) (col - min(col)) / (max(col) - min(col)))
}
# Apply the normalization to the expanded design matrix
x <- factor_normalize(x)
data2 <- cbind(x, y)
### Now we have three datasets:
### "data" is the original data: 20 factors and 1 label ("TYP")
### "x" is expanded from the original 20-factor dataset to 61-factor dataset including dummy variables; and "y" is the label
### "data2" is the combination of x and y in one table
# Compare the Prediction with the Actual Data
# We will compute the following metrics to evaluate the model:
# 1. Confusion matrix
# 2. Accuracy
# 3. F1 Score
# 4. Receiver Operating Characteristic (ROC) curve
# 5. Area Under ROC (AUROC or AUC)
# I will create a function called "accuracyFun" that takes prob.pre, type.pre and type.act
# as inputs and returns all of the above metrics:
accuracyFun <- function(prob.pre = NULL, type.pre, type.act) {
  # Confusion matrix: predicted type in rows, actual type in columns
  mod.tab <- table(type.pre, type.act)
  tp <- mod.tab[2, 2]   # true positives
  tn <- mod.tab[1, 1]   # true negatives
  fp <- mod.tab[2, 1]   # false positives
  fn <- mod.tab[1, 2]   # false negatives
  tot <- tp + tn + fp + fn
  mod.acc <- (tp + tn) / tot
  mod.f1 <- (2 * tp) / (2 * tp + fp + fn)
  if (!is.null(prob.pre)) {
    # Draw the ROC curve and calculate the area under it (needs predicted probabilities)
    # install.packages("pROC")
    library(pROC)
    mod.roc <- roc(response = type.act, predictor = prob.pre)
    plot(mod.roc)
    mod.auc <- auc(mod.roc)
    return(list(confusion = mod.tab, accuracy = mod.acc, f1 = mod.f1,
                roc = mod.roc, auc = mod.auc))
  } else {
    return(list(confusion = mod.tab, accuracy = mod.acc, f1 = mod.f1))
  }
}
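# A minimal usage sketch (assumed, not part of the original notes): fit a plain logistic
# regression on the original data and pass its predictions to accuracyFun()
glm.fit <- glm(TYP ~ ., data = data, family = binomial)
prob.hat <- predict(glm.fit, type = "response")   # in-sample probability of bad type
type.hat <- ifelse(prob.hat >= 0.5, 1, 0)         # classify at a 0.5 cutoff
accuracyFun(prob.pre = prob.hat, type.pre = type.hat, type.act = data$TYP)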
The caret package in R is a powerful tool for training and evaluating machine learning models.
It provides a unified interface for various modeling techniques, making it easy to switch between algorithms.
Caret handles pre-processing tasks like data scaling and feature selection, simplifying the model-building process.
It offers robust resampling methods, such as cross-validation and bootstrapping, for model evaluation and tuning.
# Install and load required packages
# For more information about caret package, visit:
# Max Kuhn's caret webinar: https://www.youtube.com/watch?v=7Jbb2ItbTC4
# Max Kuhn's book: https://topepo.github.io/caret/index.html
# Check this out!: https://topepo.github.io/caret/models-clustered-by-tag-similarity.html
install.packages("caret")
library(caret)
# Step 1: Data import
data <- read.csv("your_data_file.csv") # Replace "your_data_file.csv" with the actual file name
# Step 2: Data pre-processing (preProcess() builds a recipe; predict() applies it)
data$Class <- as.factor(data$Class)  # ensure the outcome is a factor for classification
preproc <- preProcess(data, method = c("center", "scale"))  # example: centering and scaling
preprocessed_data <- predict(preproc, data)
# Step 3: Data partitioning
set.seed(123) # Set a seed for reproducibility
partition <- createDataPartition(preprocessed_data$Class, p = 0.8, list = FALSE)
train_data <- preprocessed_data[partition, ]
test_data <- preprocessed_data[-partition, ]
# Step 4: Model estimation and evaluation with 5-fold cross-validation
# Logistic regression
model_lr <- train(Class ~ ., data = train_data, method = "glm", family = "binomial",
                  trControl = trainControl(method = "cv", number = 5))
predictions_lr <- predict(model_lr, newdata = test_data)  # caret returns class labels directly
accuracy_lr <- confusionMatrix(predictions_lr, test_data$Class)$overall["Accuracy"]
# Elastic Net
model_en <- train(Class ~ ., data = train_data, method = "glmnet",
                  trControl = trainControl(method = "cv", number = 5))
predictions_en <- predict(model_en, newdata = test_data)
accuracy_en <- confusionMatrix(predictions_en, test_data$Class)$overall["Accuracy"]
# Neural network
model_nn <- train(Class ~ ., data = train_data, method = "nnet", trControl = trainControl(method = "cv", number = 5))
predictions_nn <- predict(model_nn, newdata = test_data)
accuracy_nn <- confusionMatrix(predictions_nn, test_data$Class)$overall["Accuracy"]
# Decision tree
model_dt <- train(Class ~ ., data = train_data, method = "rpart", trControl = trainControl(method = "cv", number = 5))
predictions_dt <- predict(model_dt, newdata = test_data)
accuracy_dt <- confusionMatrix(predictions_dt, test_data$Class)$overall["Accuracy"]
# Random forest
model_rf <- train(Class ~ ., data = train_data, method = "rf", trControl = trainControl(method = "cv", number = 5))
predictions_rf <- predict(model_rf, newdata = test_data)
accuracy_rf <- confusionMatrix(predictions_rf, test_data$Class)$overall["Accuracy"]
# Naive Bayes
model_nb <- train(Class ~ ., data = train_data, method = "naive_bayes", trControl = trainControl(method = "cv", number = 5))
predictions_nb <- predict(model_nb, newdata = test_data)
accuracy_nb <- confusionMatrix(predictions_nb, test_data$Class)$overall["Accuracy"]
# SVM
model_svm <- train(Class ~ ., data = train_data, method = "svmRadial", trControl = trainControl(method = "cv", number = 5))
predictions_svm <- predict(model_svm, newdata = test_data)
accuracy_svm <- confusionMatrix(predictions_svm, test_data$Class)$overall["Accuracy"]
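# A possible wrap-up (not in the original script): collect the hold-out accuracies
# computed above and rank the models
results <- sort(c(logistic = accuracy_lr, elastic_net = accuracy_en, neural_net = accuracy_nn,
                  decision_tree = accuracy_dt, random_forest = accuracy_rf,
                  naive_bayes = accuracy_nb, svm = accuracy_svm), decreasing = TRUE)
results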
One way to model LGD is to assume that LGD follows a Beta Distribution with parameters α and β.
The beta distribution is a continuous probability distribution defined on the interval [0, 1].
It is characterized by two shape parameters, α and β, that control the distribution’s shape and skewness.
The beta distribution is commonly used to model proportions, rates, and probabilities in statistical analysis.
PDF: the beta distribution with shape parameters α and β has density
\[ f(x) = \frac{1}{B(\alpha, \beta)}\, x^{\alpha-1} (1-x)^{\beta-1}, \qquad 0 < x < 1, \]
where B(α, β) is the beta function. Its mean and variance are
\[ E[X] = \frac{\alpha}{\alpha + \beta} \]
\[ V[X] = \frac{\alpha \beta}{(\alpha + \beta)^2 (\alpha + \beta + 1)} \]
# Set the seed for reproducibility
set.seed(123)
# Generate random samples from a beta distribution
n <- 100000 # Number of samples
alpha <- 0.1 # Shape parameter
beta <- 10 # Shape parameter
samples <- rbeta(n, alpha, beta) # Simulate Beta(alpha,beta) n times
summary(samples)
hist(samples, breaks = 50)
# Estimating alpha and beta parameters in Beta Distribution
# Step 1: Import or generate LGD data
# (this assumes a data frame "train_v2" with a "loss" column is already loaded)
lgd <- train_v2$loss
hist(lgd, breaks = 50)
# Step 2: Compute sample moments
mean_data <- mean(lgd)
var_data <- var(lgd)
# Step 3: Solve for alpha and beta using the method of moments
alpha_hat <- ((1 - mean_data) / var_data - 1 / mean_data) * mean_data^2
beta_hat <- alpha_hat * (1 / mean_data - 1)
# Print the estimated alpha and beta
cat("Estimated Alpha:", alpha_hat, "\n")
cat("Estimated Beta:", beta_hat, "\n")
# Generate random samples from a beta distribution
samples <- rbeta(n, alpha_hat, beta_hat) # Simulate Beta(alpha,beta) n times
summary(samples)
hist(samples, breaks = 50)
Since we assume that LGD follows a beta distribution, we can predict an individual's LGD with beta regression, a GLM-style model for responses in (0, 1).
Ferrari, S. L. P., & Cribari-Neto, F. (2004). Beta regression for modelling rates and proportions. Journal of Applied Statistics, 31(7), 799-815.
https://www.tandfonline.com/doi/abs/10.1080/0266476042000214501
Smithson, M., & Verkuilen, J. (2006). A better lemon squeezer? Maximum-likelihood regression with beta-distributed dependent variables. Psychological Methods, 11(1), 54-71.
# Install required packages
install.packages("betareg")
install.packages("caret")
# Load required libraries
library(betareg)
library(caret)
# Split the data into training and testing sets (80% training, 20% testing)
# (this assumes a data frame "data" with an LGD column strictly inside (0, 1) plus covariates)
set.seed(123)
trainIndex <- createDataPartition(data$LGD, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]
# Fit the beta regression model on the training data
# (betareg requires the response to lie strictly in the open interval (0, 1))
model <- betareg(LGD ~ ., data = trainData)
# Make predictions on the testing data
predictions <- predict(model, newdata = testData, type = "response")
# Evaluate model fit on the test set (mean squared error)
mse <- mean((predictions - testData$LGD)^2)
So far we have estimated an individual's credit risk.
We can use similar techniques to estimate corporate credit risk as well.
Expected Loss = EAD x PD x LGD
EAD: Outstanding lending amount
PD: Use Logistic Regression / Machine Learning …
LGD: Use Beta Regression / Machine Learning …
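As a quick illustration with made-up numbers (none of these values come from the notes):
EAD <- 1e6     # outstanding lending amount (hypothetical)
PD  <- 0.04    # probability of default, e.g. from a logistic regression
LGD <- 0.45    # loss given default, e.g. from a beta regression
EL  <- EAD * PD * LGD   # expected loss = 18,000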
However, market prices may provide more timely, forward-looking information about credit risk.
… and we need a risk-neutral probability for credit risk pricing and valuation.
Risk-Neutral Probability:
Implied by market prices, assuming investors are risk-neutral.
Used in financial models like Black-Scholes for option pricing.
Represents the probability measure under which the expected payoff of a risky investment, discounted at the risk-free rate, equals its market price.
Actual/Physical Probability:
The true likelihood of an event based on underlying factors and characteristics.
Considers creditworthiness, financial health, industry conditions, and other variables.
Takes into account the real-world risks and uncertainties.
Unlike the risk-neutral probability, it is not inflated by investor risk aversion or risk premia.
It is therefore typically lower than the risk-neutral probability, which embeds the compensation risk-averse investors demand for bearing default risk.
Similarities between Risk-Neutral Probability and Actual/Physical Probability:
Both probabilities are measures of the likelihood of an event occurring, specifically default in the context of financial instruments.
Both probabilities are used in the field of finance and play a role in assessing and managing risk.
They both consider the underlying factors and characteristics related to the event in question.
Differences between Risk-Neutral Probability and Actual/Physical Probability:
Risk-Neutral Probability is derived from market prices and assumes investors are risk-neutral, while Actual/Physical Probability is based on real-world factors and considers the true likelihood of the event.
Risk-Neutral Probability is used in financial models for pricing and valuation purposes, whereas Actual/Physical Probability is used to assess real-world risks and make informed decisions.
Risk-Neutral Probability can be seen as an equilibrium concept that reflects market expectations, while Actual/Physical Probability considers individual risk preferences, creditworthiness, and other real-world factors.
Risk-Neutral Probability is typically higher than Actual/Physical Probability when investors are risk averse, because market prices embed the compensation investors demand for bearing default risk.
Risk-Neutral Probability is used to calculate option prices and hedge strategies, while Actual/Physical Probability is employed in credit risk analysis and risk management.
\[ B = \frac{(1-p^*)c + p^* c R}{1+r_f} \]
\[ p^* = \frac{1-(1+r_f)B/c}{1-R} \]
where c is the promised payment at maturity, R is the recovery rate, r_f is the risk-free rate and p^* is the risk-neutral probability of default.
You can see that everything on the right-hand side is observable at time 0, so p^* can be backed out directly.
This setup can easily be extended to a coupon bond, for example by assuming that p^* is constant over time.
Pros and cons? This is a risk-neutral probability and is not equal to the actual probability of default. However, it may give a signal about the future prospects of the bond issuer from the investors' point of view.
The bond price itself may also reflect more than default risk, e.g. liquidity risk, counterparty risk, etc.
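A small numerical sketch of these formulas in R; the face value, price, recovery rate and risk-free rate below are all assumptions chosen for illustration:
face <- 100      # promised payment c at maturity (hypothetical)
B    <- 92       # observed bond price today (hypothetical)
r_f  <- 0.03     # one-period risk-free rate (hypothetical)
R    <- 0.40     # assumed recovery rate
p_star <- (1 - (1 + r_f) * B / face) / (1 - R)   # implied risk-neutral PD, about 8.7%
p_star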
Probability of default can be estimated from a stock price as well, with certain assumptions and structures. This is called a structural model.
Set up:
Total Asset (A) = Total Liability (B) + Total Shareholder’s Equity (E)
Assume that the firm has issued one unit of equity and one unit of a zero-coupon bond with face value D and maturity T. At expiration, the values of debt B and equity E are given by:
\[ B_T = \min(A_T, D) = D - \max(D-A_T, 0) \]
\[ E_T = \max(A_T-D, 0) \]
Default occurs when A_T < D: bondholders are paid in full only if the firm's assets exceed the face value of debt, while equity holders are residual claimants in the firm.
Assume that A follows a Geometric Brownian Motion:
\[ dA_t =\mu A_t dt +\sigma A_t dZ_t \]
We can apply the Black-Scholes technique to value the debt and the equity: the debt equals the present value of D minus a European put option on the firm's assets with strike D, and the equity is a European call option on the assets with strike D.
The probability of default (PD) is defined as P(A_T < D); since Z_t is a standard Brownian motion, A_T follows a lognormal distribution.
The usual Black-Scholes machinery then gives the (risk-neutral) probability of default:
\[ P(A_T<D)=\Phi(-DD) \]
\[ DD = \frac{\log(\frac{A_t}{D}) + (r_f-\frac{1}{2}\sigma^2)(T-t)}{\sigma\sqrt{T-t}} \]
DD is called Distance to Default
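A sketch of this calculation in R, with purely illustrative inputs (asset value, debt level, volatility and horizon are all assumed):
A_t   <- 120     # current asset value (hypothetical)
D     <- 100     # face value of debt (hypothetical)
r_f   <- 0.03    # risk-free rate (hypothetical)
sigma <- 0.25    # asset volatility (hypothetical)
tau   <- 1       # time to maturity T - t
DD <- (log(A_t / D) + (r_f - 0.5 * sigma^2) * tau) / (sigma * sqrt(tau))
PD <- pnorm(-DD)   # P(A_T < D) under the lognormal assumption
PD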
This approach assumes a lot away and is rather simplistic, but it is a good starting point and works quite well according to the research below.
Bharath, S. T., & Shumway, T. (2008). Forecasting default with the Merton distance to default model. The Review of Financial Studies, 21(3), 1339-1369.
Avellaneda, M., & Zhu, J. (2001). Distance to default. Risk, 14(12), 125-129.
Jessen, C., & Lando, D. (2015). Robustness of distance-to-default. Journal of Banking & Finance, 50, 493-505.
Credit Default Swap (CDS):
Financial derivative contract between two parties where one party makes periodic payments in exchange for protection against a credit event.
Provides insurance-like coverage against the default or credit deterioration of a specific borrower or debt instrument.
Commonly used by investors and institutions to hedge credit risk or speculate on the creditworthiness of a particular entity.
A CDS, if liquid, would be the cleanest measure of default risk, since its value and cash-flow structure depend only on the default event.
For a single period 1-year CDS, the CDS (default) premium S for a unit payout is:
\[ S=\frac{p^*(1-R)}{1+r_f} \]
\[ p^*=\frac{S(1+r_f)}{1-R} \]
where p^* is the risk-neutral probability of default, R is the recovery rate and S is the default premium.
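For example, backing out p^* from an observed one-year premium (all inputs below are assumptions):
S   <- 0.015    # one-year CDS premium per unit of notional (hypothetical)
r_f <- 0.03     # risk-free rate (hypothetical)
R   <- 0.40     # assumed recovery rate
p_star <- S * (1 + r_f) / (1 - R)   # implied risk-neutral PD, about 2.6%
p_star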
Avesani, R., & Garcia Pascual, A. I. (2006). A new risk indicator and stress testing tool: A multifactor Nth-to-default CDS basket. Available at SSRN 910670.
Galil, K., Shapir, O. M., Amiram, D., & Ben-Zion, U. (2014). The determinants of CDS spreads. Journal of Banking & Finance, 41, 271-282.
Apergis, N., Danuletiu, D., & Xu, B. (2022). CDS spreads and COVID-19 pandemic. Journal of International Financial Markets, Institutions and Money, 76, 101433.