The panel dataset contains commercial customers' financial information and a days-past-due indicator from 2000 to 2020. The goal is to build a binary classifier that predicts the probability that a customer is 90+ days past due (90+DPD).
train <- read.csv(file="FITB_train.csv",header=TRUE)
test <- read.csv(file="FITB_test.csv",header=TRUE)
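Before cleaning, a quick structural check is worthwhile; a minimal sketch (output not shown; the steps below assume feature_2 and feature_3 contain missing values):
str(train)             # column types and dimensions
colSums(is.na(train))  # missing values per column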
Checking the distribution of the data. Feature 3 (blue) has a long right tail with many extreme values. Feature 2 (red) does not, as evidenced by its comparatively flat density. Feature 1 (green) is barely visible in the upper tail because it sits under the red curve, so it is not problematic.
library(ggplot2)
ggplot() +
  geom_density(data = train, aes(x = feature_3), color = "blue") +
  geom_density(data = train, aes(x = feature_2), color = "red") +
  geom_density(data = train, aes(x = feature_1), color = "green") +
  geom_density(data = train, aes(x = feature_4), color = "purple") +
  theme_minimal()
Removing the top and bottom 1% of feature 3. Note that this trims the tails (values are dropped and later imputed) rather than capping them at the 1st and 99th percentiles, which is what "winsorizing" usually means.
library(dplyr)
train$key <- row.names(train)  # row key used to match trimmed observations back to train
feature_3_winsor <- data.frame(feature_3 = train$feature_3, key = row.names(train))
feature_3_winsor_clean <- na.omit(feature_3_winsor)
# Percentile of each observation (the z-score is computed for reference only)
feature_3_winsor_clean <- feature_3_winsor_clean %>%
  mutate(z_score = (feature_3 - mean(feature_3)) / sd(feature_3),
         percentile = ecdf(feature_3)(feature_3) * 100)
# Keep only observations between the 1st and 99th percentiles
feature_3_winsor_df <- feature_3_winsor_clean %>% filter(percentile >= 1, percentile <= 99)
# Flag the trimmed observations as NA in the training set
non_matching_keys <- anti_join(train, feature_3_winsor_df, by = "key")$key
train <- train %>% mutate(feature_3 = ifelse(key %in% non_matching_keys, NA, feature_3))
colnames(train)[3] <- "feature_3_winsor"  # positional rename: assumes feature_3 is column 3
ggplot(data=train,aes(feature_3_winsor)) + geom_density() + theme_minimal()
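For comparison, conventional winsorization caps extreme values at the percentile cutoffs instead of removing them; a hypothetical sketch using the pre-trim values kept in feature_3_winsor_clean:
# Clamp feature_3 to [P1, P99] rather than dropping the tail observations
caps <- quantile(feature_3_winsor_clean$feature_3, probs = c(0.01, 0.99))
feature_3_capped <- pmin(pmax(feature_3_winsor_clean$feature_3, caps[1]), caps[2])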
Replace the values removed by the trimming (now NA) with the median of the non-missing training values of feature 3. The same training median is reused for the test set to avoid leaking test-set information.
# Impute with the training median; the test set reuses the same value
train[is.na(train[, 3]), 3] <- median(feature_3_winsor_clean$feature_3)
colnames(train)[3] <- "feature_3_impute"
test[is.na(test[, 3]), 3] <- median(feature_3_winsor_clean$feature_3)
colnames(test)[3] <- "feature_3_impute"
Impute missing values of feature 2 with the value from the next or previous year for the same ID.
train$date <- format(as.Date(train$date, format = "%Y-%m-%d"), "%Y")  # keep the year only
train <- train %>%
  arrange(id, date) %>%  # sort by id and year
  group_by(id) %>%
  mutate(feature_2 = ifelse(is.na(feature_2),
                            lead(feature_2, order_by = date),  # try next year
                            feature_2)) %>%
  mutate(feature_2 = ifelse(is.na(feature_2),
                            lag(feature_2, order_by = date),   # then try previous year
                            feature_2)) %>%
  ungroup()  # drop the grouping so later statistics are computed over the full set
colnames(train)[2] <- "feature_2_impute"  # positional rename: assumes feature_2 is column 2
test <- test %>%
  arrange(id, date) %>%
  group_by(id) %>%
  mutate(feature_2 = ifelse(is.na(feature_2),
                            lead(feature_2, order_by = date),  # try next year
                            feature_2)) %>%
  mutate(feature_2 = ifelse(is.na(feature_2),
                            lag(feature_2, order_by = date),   # then try previous year
                            feature_2)) %>%
  ungroup()
colnames(test)[2] <- "feature_2_impute"
train <- na.omit(train)
test <- na.omit(test)
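An alternative sketch using tidyr's fill() for the same imputation; note that fill() carries values across gaps of any length within an id, whereas the one-step lead()/lag() above only looks at the adjacent year:
library(tidyr)
train_alt <- train %>%  # hypothetical variant; not used downstream
  arrange(id, date) %>%
  group_by(id) %>%
  fill(feature_2_impute, .direction = "downup") %>%  # forward, then backward, within id
  ungroup()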
Standardize the variables (z-score: subtract the mean, divide by the standard deviation).
train <- train %>%
  mutate(across(c(feature_1, feature_2_impute, feature_3_impute, feature_4),
                ~ (.x - mean(.x, na.rm = TRUE)) / sd(.x, na.rm = TRUE)))
colnames(train) <- c("feature_1_standard", "feature_2_standard", "feature_3_standard",
                     "feature_4_standard", "id", "date", "y", "key")  # positional rename
# Note: the test set is standardized with its own means and SDs; a stricter
# variant that reuses the training statistics is sketched below.
test <- test %>%
  mutate(across(c(feature_1, feature_2_impute, feature_3_impute, feature_4),
                ~ (.x - mean(.x, na.rm = TRUE)) / sd(.x, na.rm = TRUE)))
colnames(test) <- c("feature_1_standard", "feature_2_standard", "feature_3_standard",
                    "feature_4_standard", "id", "date", "y")
ggplot() +
  geom_density(data = train, aes(x = feature_3_standard), color = "blue") +
  geom_density(data = train, aes(x = feature_2_standard), color = "red") +
  geom_density(data = train, aes(x = feature_1_standard), color = "green") +
  geom_density(data = train, aes(x = feature_4_standard), color = "purple") +
  theme_minimal()
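As noted above, the test set is standardized with its own means and standard deviations. A stricter variant applies the training statistics to the test set so both sets share one scale; a sketch assuming pre-standardization copies train_pre and test_pre (hypothetical names):
feats <- c("feature_1", "feature_2_impute", "feature_3_impute", "feature_4")
mu   <- sapply(train_pre[feats], mean, na.rm = TRUE)  # training means
sdev <- sapply(train_pre[feats], sd, na.rm = TRUE)    # training SDs
test_std <- test_pre
test_std[feats] <- scale(test_pre[feats], center = mu, scale = sdev)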
Building a logistic regression model with features 1 to 4 as independent variables and column y of the training set as the categorical dependent variable. Convert y from "90+DPD" to 1 and "active" to 0, i.e., 1 for delinquent and 0 for non-delinquent, so that the model produces probabilities for the delinquent outcome (y = 1).
library(nnet)
# Recode y so that delinquency ("90+DPD") is the positive outcome (1)
train$y <- as.numeric(as.character(factor(train$y, levels = c("90+DPD", "active"), labels = c(1, 0))))
# Note: multinom() silently ignores the `family` argument; with a binary
# response this fits the same logistic model as glm(..., family = binomial()).
delinquency_model <- multinom(y ~ feature_1_standard + feature_2_standard +
                                feature_3_standard + feature_4_standard,
                              data = train, family = binomial())
## # weights: 6 (5 variable)
## initial value 2730.999891
## iter 10 value 1604.602929
## final value 1604.602903
## converged
summary(delinquency_model)
## Call:
## multinom(formula = y ~ feature_1_standard + feature_2_standard +
## feature_3_standard + feature_4_standard, data = train, family = binomial())
##
## Coefficients:
## Values Std. Err.
## (Intercept) -1.7978070 0.05452505
## feature_1_standard -0.5889688 0.07635082
## feature_2_standard -0.1989696 0.05108342
## feature_3_standard -0.8885600 0.06997357
## feature_4_standard 0.1973470 0.05150389
##
## Residual Deviance: 3209.206
## AIC: 3219.206
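Since y is binary, the same model can be fit with glm(), which the analysis-of-deviance section below also uses; a quick cross-check sketch, with coefficients expected to match the multinom() fit up to optimizer tolerance:
glm_check <- glm(y ~ feature_1_standard + feature_2_standard +
                   feature_3_standard + feature_4_standard,
                 data = train, family = binomial())
coef(summary(glm_check))  # estimates and standard errors, comparable to the table above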
Evaluating the model's performance on the test data via the ROC curve and AUC.
library(pROC)
test$predicted_y <- predict(delinquency_model, newdata = test, type = "class")  # classes at the default cutoff (not used below)
test$y_numeric <- as.numeric(as.character(factor(test$y, levels = c("90+DPD", "active"), labels = c(1, 0))))
test$Probability <- predict(delinquency_model, newdata = test, type = "probs")  # P(delinquent)
options(digits = 4)
roc_curve <- roc(response = test$y_numeric, predictor = test$Probability)
roc_metrics <- coords(roc_curve, x = "all", ret = c("threshold", "sensitivity", "specificity"))
(head(roc_metrics,5))
## threshold sensitivity specificity
## 1 -Inf 1 0.000000
## 2 0.0003493 1 0.001183
## 3 0.0003954 1 0.002367
## 4 0.0004220 1 0.003550
## 5 0.0004640 1 0.004734
auc_value <- auc(roc_curve)
# Optimal threshold: the point where sensitivity and specificity are (nearly) equal
optimal_threshold <- roc_metrics$threshold[which.min(abs(roc_metrics$sensitivity - roc_metrics$specificity))]
roc_data <- data.frame(
  TPR = rev(roc_curve$sensitivities),     # true positive rate
  FPR = rev(1 - roc_curve$specificities)  # false positive rate
)
ggplot(roc_data, aes(x = FPR, y = TPR)) +
  geom_line(color = "blue", linewidth = 1) +  # draw the empirical curve; geom_smooth would distort it
  geom_abline(linetype = "dashed", color = "gray") +  # random-guess diagonal
  labs(
    title = "ROC Curve for the Logistic Regression Model",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    caption = paste("AUC:", round(auc_value, 4))
  ) +
  coord_fixed() +  # maintain proportional scales
  theme_minimal() +
  theme(plot.caption = element_text(hjust = 0.5, size = 12))
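pROC also provides its own ggplot2 helper, ggroc(), which draws the empirical curve directly; a short sketch of the same plot:
ggroc(roc_curve, legacy.axes = TRUE) +  # legacy.axes puts 1 - specificity on the x-axis
  geom_abline(linetype = "dashed", color = "gray") +
  theme_minimal()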
The AUC of the model on the test data is 0.82 (0.5 would be a random guess).
The optimal threshold found was approximately 0.224. The model produces a probability of delinquency for each observation; a cutoff must be chosen such that observations with a probability at or above it are classified as delinquent. The effectiveness of this cutoff can be evaluated by comparing the predicted delinquency outcomes against the actual ones.
| threshold | sensitivity | specificity |
|---|---|---|
| 0.2241302 | 0.7710 | 0.770414 |
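As a cross-check, pROC can also pick a "best" threshold directly; a sketch using Youden's J statistic (sensitivity + specificity - 1), which generally lands near the equal-error point used above:
coords(roc_curve, x = "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))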
Sensitivity: the proportion of positive outcomes (delinquent) correctly identified by the model, i.e., the proportion of delinquencies caught by the model.
Specificity: the proportion of negative outcomes (non-delinquent) correctly identified by the model, i.e., the proportion of customers who stayed current and were not mis-classified as delinquent.
Thus the complement of specificity (1 - specificity) is the false positive rate.
The chosen threshold balances catching as many delinquencies as possible against raising as few false alarms as possible; it is the point where sensitivity and specificity are closest to equal, visible in the plot below as the crossing of the two curves.
ggplot(subset(roc_metrics, is.finite(threshold)), aes(x = threshold)) +  # drop the -Inf/Inf endpoint thresholds
  geom_line(aes(y = sensitivity, color = "Sensitivity")) +
  geom_line(aes(y = specificity, color = "Specificity")) +
  labs(title = "Sensitivity and Specificity vs. Threshold",
       x = "Threshold", y = "Metric Value") +
  scale_color_manual(name = "Metrics", values = c("Sensitivity" = "red", "Specificity" = "blue")) +
  theme_minimal()
Confusion matrix of the predictions at the optimal decision threshold.
test$predicted_class <- ifelse(test$Probability >= optimal_threshold, 1, 0)  # reuse the threshold found above
library(caret)
conf_matrix <- confusionMatrix(
  factor(test$predicted_class, levels = c(0, 1)),
  factor(test$y_numeric, levels = c(0, 1)))
# caret's confusionMatrix puts predictions in the rows and the actual
# (reference) classes in the columns; label the table accordingly.
confusion_table <- as.data.frame.matrix(conf_matrix$table)
rownames(confusion_table) <- c("Predicted: Non-delinquent", "Predicted: Delinquent")
colnames(confusion_table) <- c("Actual: Non-delinquent", "Actual: Delinquent")
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_table)
##                           Actual: Non-delinquent Actual: Delinquent
## Predicted: Non-delinquent                    652                 49
## Predicted: Delinquent                        193                165
true_negatives  <- confusion_table[1, 1]  # predicted 0, actual 0
false_negatives <- confusion_table[1, 2]  # predicted 0, actual 1 (missed delinquencies)
false_positives <- confusion_table[2, 1]  # predicted 1, actual 0 (false alarms)
true_positives  <- confusion_table[2, 2]  # predicted 1, actual 1
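A quick sanity check (not in the original output): recomputing sensitivity and specificity from these counts should approximately reproduce the values in the threshold table above.
sensitivity_check <- true_positives / (true_positives + false_negatives)  # 165 / 214 ≈ 0.771
specificity_check <- true_negatives / (true_negatives + false_positives)  # 652 / 845 ≈ 0.772
accuracy_check    <- (true_positives + true_negatives) / sum(confusion_table)
c(sensitivity = sensitivity_check, specificity = specificity_check, accuracy = accuracy_check)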
Checking for Multicollinearity
# Variance inflation factors, computed manually as the diagonal of the
# inverse correlation matrix of the predictors (VIF_j = 1 / (1 - R_j^2))
X <- model.matrix(~ feature_1_standard + feature_2_standard + feature_3_standard + feature_4_standard, data = train)
vif_values <- diag(solve(cor(X[, -1])))  # drop the intercept column
names(vif_values) <- colnames(X)[-1]
print(vif_values)
## feature_1_standard feature_2_standard feature_3_standard feature_4_standard
## 2.109 1.343 1.901 1.215
library(corrplot)
cor_matrix <- cor(train[, c("feature_1_standard", "feature_2_standard", "feature_3_standard", "feature_4_standard")])
corrplot(cor_matrix,
method = "color",
col = colorRampPalette(c("white", "red"))(200),
type = "upper",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 0.8)
The correlation matrix shows a noticeable correlation between feature 1 and feature 3, suggesting some multicollinearity, although VIF values around 2 are well below the usual rule-of-thumb cutoffs of 5 to 10.
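As a cross-check (a hypothetical addition, assuming the car package), vif() on a fitted binomial glm should give values close to the manual calculation above; for a glm they are derived from the coefficient covariance matrix, so small differences are expected.
library(car)
vif_check_model <- glm(y ~ feature_1_standard + feature_2_standard +
                         feature_3_standard + feature_4_standard,
                       data = train, family = binomial())
vif(vif_check_model)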
Analysis of deviance tests for a difference in goodness of fit between the full model and the models without feature 1 or without feature 3.
full_model <- glm(y ~ feature_1_standard + feature_2_standard + feature_3_standard + feature_4_standard, data = train, family = binomial())
model_without_feature_1 <- glm(y ~ feature_2_standard + feature_3_standard + feature_4_standard, data = train, family = binomial())
model_without_feature_3 <- glm(y ~ feature_1_standard + feature_2_standard + feature_4_standard, data = train, family = binomial())
anova(model_without_feature_1, full_model, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: y ~ feature_2_standard + feature_3_standard + feature_4_standard
## Model 2: y ~ feature_1_standard + feature_2_standard + feature_3_standard +
## feature_4_standard
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 3936 3264
## 2 3935 3209 1 54.6 1.5e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model_without_feature_3, full_model, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: y ~ feature_1_standard + feature_2_standard + feature_4_standard
## Model 2: y ~ feature_1_standard + feature_2_standard + feature_3_standard +
## feature_4_standard
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 3936 3416
## 2 3935 3209 1 207 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In both tests we reject the null hypothesis that the reduced model fits as well as the full model: features 1 and 3 each contribute significantly.
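For reference, the reported p-values are chi-squared upper-tail probabilities of the deviance drop, with degrees of freedom equal to the number of parameters removed:
# Deviance drops read off the tables above (df = 1 in both tests)
pchisq(54.6, df = 1, lower.tail = FALSE)  # ~1.5e-13, matching the feature 1 test
pchisq(207, df = 1, lower.tail = FALSE)   # effectively zero (< 2e-16 in the table)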
library(pROC)
library(ggplot2)
model_without_feature_1 <- glm(y ~ feature_2_standard + feature_3_standard + feature_4_standard,
data = train, family = binomial())
test_no_feature_1 <- test %>%
select(-feature_1_standard)
test_no_feature_1$Probability <- predict(model_without_feature_1, newdata = test_no_feature_1, type = "response")
test_no_feature_1$y_numeric <- as.numeric(as.character(factor(test_no_feature_1$y,
levels = c("90+DPD", "active"),
labels = c(1, 0))))
roc_curve <- roc(response = test_no_feature_1$y_numeric,
predictor = test_no_feature_1$Probability)
roc_metrics <- coords(roc_curve, x = "all", ret = c("threshold", "sensitivity", "specificity"))
# Extract data for ROC curve
roc_data <- data.frame(
TPR = rev(roc_curve$sensitivities), # True Positive Rate (Sensitivity)
FPR = rev(1 - roc_curve$specificities) # False Positive Rate (1 - Specificity)
)
# Calculate AUC value
auc_value <- auc(roc_curve)
# Create the ROC curve plot using ggplot2
ggplot(roc_data, aes(x = FPR, y = TPR)) +
  geom_line(color = "blue", linewidth = 1) +  # draw the empirical curve; geom_smooth would distort it
  geom_abline(linetype = "dashed", color = "gray") +  # diagonal line for random chance
  labs(
    title = "ROC Curve Without Feature 1",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    caption = paste("AUC:", round(auc_value, 4))
  ) +
  coord_fixed() +  # maintain proportional scales
  theme_minimal() +
  theme(plot.caption = element_text(hjust = 0.5, size = 12))
roc_metrics_df <- as.data.frame(roc_metrics)
ggplot(subset(roc_metrics_df, is.finite(threshold)), aes(x = threshold)) +
  geom_line(aes(y = sensitivity, color = "Sensitivity")) +
  geom_line(aes(y = specificity, color = "Specificity")) +
  labs(title = "Sensitivity and Specificity vs. Threshold Without Feature 1",
       x = "Threshold", y = "Metric Value") +
  scale_color_manual(name = "Metrics", values = c("Sensitivity" = "red", "Specificity" = "blue")) +
  theme_minimal()
The AUC is slightly lower than the full model's 0.82.
library(pROC)
library(ggplot2)
model_without_feature_3 <- glm(y ~ feature_1_standard + feature_2_standard + feature_4_standard,
data = train, family = binomial())
test_no_feature_3 <- test %>%
select(-feature_3_standard)
test_no_feature_3$Probability <- predict(model_without_feature_3, newdata = test_no_feature_3, type = "response")
test_no_feature_3$y_numeric <- as.numeric(as.character(factor(test_no_feature_3$y,
levels = c("90+DPD", "active"),
labels = c(1, 0))))
roc_curve <- roc(response = test_no_feature_3$y_numeric,
predictor = test_no_feature_3$Probability)
roc_metrics <- coords(roc_curve, x = "all", ret = c("threshold", "sensitivity", "specificity"))
# Extract data for ROC curve
roc_data <- data.frame(
TPR = rev(roc_curve$sensitivities), # True Positive Rate (Sensitivity)
FPR = rev(1 - roc_curve$specificities) # False Positive Rate (1 - Specificity)
)
# Calculate AUC value
auc_value <- auc(roc_curve)
# Create the ROC curve plot using ggplot2
ggplot(roc_data, aes(x = FPR, y = TPR)) +
  geom_line(color = "blue", linewidth = 1) +  # draw the empirical curve; geom_smooth would distort it
  geom_abline(linetype = "dashed", color = "gray") +  # diagonal line for random chance
  labs(
    title = "ROC Curve Without Feature 3",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    caption = paste("AUC:", round(auc_value, 4))
  ) +
  coord_fixed() +  # maintain proportional scales
  theme_minimal() +
  theme(plot.caption = element_text(hjust = 0.5, size = 12))
roc_metrics_df <- as.data.frame(roc_metrics)
ggplot(subset(roc_metrics_df, is.finite(threshold)), aes(x = threshold)) +
  geom_line(aes(y = sensitivity, color = "Sensitivity")) +
  geom_line(aes(y = specificity, color = "Specificity")) +
  labs(title = "Sensitivity and Specificity vs. Threshold Without Feature 3",
       x = "Threshold", y = "Metric Value") +
  scale_color_manual(name = "Metrics", values = c("Sensitivity" = "red", "Specificity" = "blue")) +
  theme_minimal()
This confirms the analysis of deviance: both reduced models perform worse on the test set. The full model should be retained, with awareness of the moderate collinearity between features 1 and 3.