# Diagnostics / transforms — load BEFORE tidyverse to avoid masking conflicts
library(car) # vif(), Anova()
library(lmtest) # bptest() — Breusch-Pagan heteroscedasticity test
library(MASS) # boxcox(), stepAIC()
library(nortest) # ad.test() — Anderson-Darling normality test
# Model training & evaluation
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(e1071) # SVM / naiveBayes
library(pROC) # roc(), auc()
library(scales)
# Visualisation helpers
library(ggcorrplot)
library(knitr)
library(kableExtra)
# Core — load AFTER MASS so dplyr::select() wins over MASS::select()
library(tidyverse)
library(patchwork)# ── Custom theme & palette ────────────────────────────────────────────────────
theme_loan <- function() {
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 13, colour = "#2c3e50"),
plot.subtitle = element_text(colour = "#7f8c8d", size = 10),
axis.title = element_text(colour = "#555555"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "#eaecef"),
legend.position = "bottom"
)
}
palette_main <- c("#2980b9", "#e74c3c", "#27ae60", "#f39c12", "#8e44ad", "#16a085")df_raw <- read.csv("loan_data.csv", stringsAsFactors = FALSE)
cat("Raw dimensions:", nrow(df_raw), "rows ×", ncol(df_raw), "columns\n\n")## Raw dimensions: 45000 rows × 14 columns
## 'data.frame': 45000 obs. of 14 variables:
## $ person_age : num 22 21 25 23 24 21 26 24 24 21 ...
## $ person_gender : chr "female" "female" "female" "female" ...
## $ person_education : chr "Master" "High School" "High School" "Bachelor" ...
## $ person_income : num 71948 12282 12438 79753 66135 ...
## $ person_emp_exp : int 0 0 3 0 1 0 1 5 3 0 ...
## $ person_home_ownership : chr "RENT" "OWN" "MORTGAGE" "RENT" ...
## $ loan_amnt : num 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
## $ loan_intent : chr "PERSONAL" "EDUCATION" "MEDICAL" "MEDICAL" ...
## $ loan_int_rate : num 16 11.1 12.9 15.2 14.3 ...
## $ loan_percent_income : num 0.49 0.08 0.44 0.44 0.53 0.19 0.37 0.37 0.35 0.13 ...
## $ cb_person_cred_hist_length : num 3 2 3 2 4 2 3 4 2 3 ...
## $ credit_score : int 561 504 635 675 586 532 701 585 544 640 ...
## $ previous_loan_defaults_on_file: chr "No" "Yes" "No" "No" ...
## $ loan_status : int 1 0 1 1 1 1 1 1 1 1 ...
## person_age person_gender person_education person_income person_emp_exp
## 1 22 female Master 71948 0
## 2 21 female High School 12282 0
## 3 25 female High School 12438 3
## 4 23 female Bachelor 79753 0
## 5 24 male Master 66135 1
## 6 21 female High School 12951 0
## person_home_ownership loan_amnt loan_intent loan_int_rate loan_percent_income
## 1 RENT 35000 PERSONAL 16.02 0.49
## 2 OWN 1000 EDUCATION 11.14 0.08
## 3 MORTGAGE 5500 MEDICAL 12.87 0.44
## 4 RENT 35000 MEDICAL 15.23 0.44
## 5 RENT 35000 MEDICAL 14.27 0.53
## 6 OWN 2500 VENTURE 7.14 0.19
## cb_person_cred_hist_length credit_score previous_loan_defaults_on_file
## 1 3 561 No
## 2 2 504 Yes
## 3 3 635 No
## 4 2 675 No
## 5 4 586 No
## 6 2 532 No
## loan_status
## 1 1
## 2 0
## 3 1
## 4 1
## 5 1
## 6 1
na_counts <- colSums(is.na(df_raw))
blank_counts <- sapply(df_raw, function(x) sum(x == "" | x == " ", na.rm = TRUE))
miss_df <- data.frame(
Column = names(na_counts),
NA_Count = na_counts,
Blank_Count = blank_counts,
NA_Pct = round(100 * na_counts / nrow(df_raw), 2)
)
miss_df <- miss_df[miss_df$NA_Count > 0 | miss_df$Blank_Count > 0, ]
if (nrow(miss_df) == 0) {
cat("✔ No missing values detected.\n")
} else {
print(miss_df)
}## ✔ No missing values detected.
df_clean <- df_raw %>%
filter(person_age <= 100) %>%
drop_na() %>%
mutate(
person_gender = factor(person_gender),
person_education = factor(
person_education,
levels = c("High School", "Bachelor", "Master", "PhD")
),
person_home_ownership = factor(person_home_ownership),
loan_intent = factor(loan_intent),
previous_loan_defaults_on_file = factor(previous_loan_defaults_on_file),
loan_status_f = factor(
loan_status,
levels = c(0, 1),
labels = c("Repaid", "Defaulted")
),
# ── Derived / engineered features ──────────────────────────────────────
log_income = log(person_income),
loan_to_income = loan_amnt / person_income,
age_group = cut(
person_age,
breaks = c(17, 25, 35, 50, Inf),
labels = c("18-25", "26-35", "36-50", "50+")
),
# High-risk flag: high loan burden AND poor credit score
high_risk = factor(
ifelse(loan_percent_income > 0.4 & credit_score < 600, "Yes", "No")
)
)
cat("Cleaned dataset:", nrow(df_clean), "rows ×", ncol(df_clean), "columns\n")## Cleaned dataset: 44993 rows × 19 columns
## Checking for NAs in key columns after cleaning:
na_check_after <- df_clean %>%
select(person_income, log_income, person_age, person_emp_exp,
loan_amnt, loan_int_rate, credit_score, loan_percent_income) %>%
summarise(across(everything(), ~sum(is.na(.))))
print(na_check_after)## person_income log_income person_age person_emp_exp loan_amnt loan_int_rate
## 1 0 0 0 0 0 0
## credit_score loan_percent_income
## 1 0 0
## person_age person_income person_emp_exp loan_amnt
## Min. :20.00 Min. : 8000 Min. : 0.000 Min. : 500
## 1st Qu.:24.00 1st Qu.: 47195 1st Qu.: 1.000 1st Qu.: 5000
## Median :26.00 Median : 67046 Median : 4.000 Median : 8000
## Mean :27.75 Mean : 79908 Mean : 5.395 Mean : 9583
## 3rd Qu.:30.00 3rd Qu.: 95778 3rd Qu.: 8.000 3rd Qu.:12237
## Max. :94.00 Max. :2448661 Max. :76.000 Max. :35000
## loan_int_rate loan_percent_income cb_person_cred_hist_length credit_score
## Min. : 5.42 Min. :0.0000 Min. : 2.000 Min. :390.0
## 1st Qu.: 8.59 1st Qu.:0.0700 1st Qu.: 3.000 1st Qu.:601.0
## Median :11.01 Median :0.1200 Median : 4.000 Median :640.0
## Mean :11.01 Mean :0.1397 Mean : 5.867 Mean :632.6
## 3rd Qu.:12.99 3rd Qu.:0.1900 3rd Qu.: 8.000 3rd Qu.:670.0
## Max. :20.00 Max. :0.6600 Max. :30.000 Max. :784.0
## loan_status log_income loan_to_income
## Min. :0.0000 Min. : 8.987 Min. :0.0006576
## 1st Qu.:0.0000 1st Qu.:10.762 1st Qu.:0.0735662
## Median :0.0000 Median :11.113 Median :0.1216656
## Mean :0.2223 Mean :11.122 Mean :0.1397593
## 3rd Qu.:0.0000 3rd Qu.:11.470 3rd Qu.:0.1881413
## Max. :1.0000 Max. :14.711 Max. :0.6641860
status_tbl <- df_clean %>%
count(loan_status_f) %>%
mutate(pct = round(100 * n / sum(n), 1))
print(status_tbl)## loan_status_f n pct
## 1 Repaid 34993 77.8
## 2 Defaulted 10000 22.2
ggplot(status_tbl, aes(x = loan_status_f, y = n, fill = loan_status_f)) +
geom_col(width = 0.5, alpha = 0.85) +
geom_text(aes(label = paste0(pct, "%")), vjust = -0.4, fontface = "bold") +
scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
labs(
title = "Loan Status Distribution",
subtitle = "Target variable class balance",
x = NULL, y = "Count", fill = NULL
) +
theme_loan()p1 <- ggplot(df_clean, aes(x = person_income)) +
geom_histogram(bins = 60, fill = "#2980b9", colour = "white", alpha = 0.85) +
scale_x_continuous(labels = comma) +
labs(title = "Raw Income — Right-Skewed", x = "Annual Income (USD)", y = "Count") +
theme_loan()
p2 <- ggplot(df_clean, aes(x = log_income)) +
geom_histogram(bins = 60, fill = "#27ae60", colour = "white", alpha = 0.85) +
labs(title = "Log(Income) — More Symmetric", x = "ln(Income)", y = "Count") +
theme_loan()
p1 + p2num_vars <- c(
"person_income", "person_age", "loan_amnt",
"loan_int_rate", "credit_score", "loan_percent_income"
)
df_long <- df_clean %>%
select(all_of(num_vars), loan_status_f) %>%
pivot_longer(-loan_status_f, names_to = "variable", values_to = "value")
ggplot(df_long, aes(x = loan_status_f, y = value, fill = loan_status_f)) +
geom_boxplot(outlier.alpha = 0.2, alpha = 0.75) +
scale_fill_manual(values = c("#2980b9", "#e74c3c")) +
facet_wrap(~variable, scales = "free_y", ncol = 3) +
labs(
title = "Numeric Variables by Loan Status",
subtitle = "Detecting separation between Repaid vs Defaulted groups",
x = NULL, y = NULL, fill = NULL
) +
theme_loan() +
theme(axis.text.x = element_text(angle = 20, hjust = 1))cat_vars <- c(
"person_education", "person_home_ownership",
"loan_intent", "previous_loan_defaults_on_file"
)
plots <- lapply(cat_vars, function(v) {
ggplot(df_clean, aes(x = .data[[v]], fill = loan_status_f)) +
geom_bar(position = "fill", alpha = 0.85) +
scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
scale_y_continuous(labels = percent_format()) +
labs(title = v, x = NULL, y = "Proportion", fill = NULL) +
theme_loan() +
theme(
axis.text.x = element_text(angle = 25, hjust = 1),
legend.position = "none"
)
})
wrap_plots(plots, ncol = 2)cor_mat <- df_clean %>%
select(
person_age, person_income, person_emp_exp,
loan_amnt, loan_int_rate, loan_percent_income,
cb_person_cred_hist_length, credit_score, loan_status
) %>%
cor(use = "complete.obs")
ggcorrplot(
cor_mat,
method = "circle",
type = "lower",
lab = TRUE,
lab_size = 3,
colors = c("#e74c3c", "white", "#2980b9"),
title = "Pearson Correlation Matrix — Numeric Predictors",
ggtheme = theme_loan()
)fe_summary <- df_clean %>%
group_by(age_group, high_risk) %>%
summarise(
n = n(),
default_pct = round(100 * mean(loan_status), 1),
avg_income = round(mean(person_income)),
avg_credit = round(mean(credit_score)),
.groups = "drop"
)
print(fe_summary)## # A tibble: 7 × 6
## age_group high_risk n default_pct avg_income avg_credit
## <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 18-25 No 20393 23.2 71250 626
## 2 18-25 Yes 48 85.4 40612 551
## 3 26-35 No 20029 21.2 85104 636
## 4 26-35 Yes 44 68.2 43052 557
## 5 36-50 No 4151 20.5 95655 649
## 6 36-50 Yes 7 85.7 41245 554
## 7 50+ No 321 25.9 113933 678
df_clean %>%
count(age_group, loan_status_f) %>%
ggplot(aes(x = age_group, y = n, fill = loan_status_f)) +
geom_col(position = "fill", alpha = 0.85) +
scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Default Rate by Age Group (Engineered Feature)",
subtitle = "Younger applicants show higher default proportions",
x = "Age Group", y = "Proportion", fill = "Loan Status"
) +
theme_loan()set.seed(123)
train_idx <- createDataPartition(df_clean$loan_status_f, p = 0.80, list = FALSE)
train_df <- df_clean[ train_idx, ]
test_df <- df_clean[-train_idx, ]
cat("Training rows:", nrow(train_df), "\n")## Training rows: 35995
## Test rows : 8998
##
## Final NA check on training data:
na_final <- train_df %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "column", values_to = "na_count") %>%
filter(na_count > 0)
if(nrow(na_final) > 0) {
print(na_final)
cat("\nRemoving rows with NAs from train and test...\n")
train_df <- na.omit(train_df)
test_df <- test_df %>% filter(complete.cases(select(., all_of(names(train_df)))))
cat("New training rows:", nrow(train_df), "\n")
cat("New test rows:", nrow(test_df), "\n")
} else {
cat("✓ No NAs found in training data\n")
}## # A tibble: 1 × 2
## column na_count
## <chr> <int>
## 1 person_education 10110
##
## Removing rows with NAs from train and test...
## New training rows: 25885
## New test rows: 6461
lm_raw <- lm(
person_income ~ person_age + person_education + person_emp_exp +
person_home_ownership + loan_amnt + loan_int_rate +
credit_score + loan_percent_income,
data = train_df
)
summary(lm_raw)##
## Call:
## lm(formula = person_income ~ person_age + person_education +
## person_emp_exp + person_home_ownership + loan_amnt + loan_int_rate +
## credit_score + loan_percent_income, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89722 -15830 -7071 5419 2295870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.053e+04 5.549e+03 12.709 < 2e-16 ***
## person_age 7.978e+02 1.680e+02 4.750 2.04e-06 ***
## person_educationBachelor 1.653e+01 6.930e+02 0.024 0.98097
## person_educationMaster -6.995e+02 8.338e+02 -0.839 0.40151
## person_emp_exp 1.696e+02 1.674e+02 1.013 0.31100
## person_home_ownershipOTHER 3.702e+03 5.976e+03 0.619 0.53560
## person_home_ownershipOWN -3.356e+03 1.295e+03 -2.592 0.00955 **
## person_home_ownershipRENT -7.364e+03 6.699e+02 -10.993 < 2e-16 ***
## loan_amnt 7.199e+00 6.325e-02 113.816 < 2e-16 ***
## loan_int_rate -2.097e+02 1.030e+02 -2.037 0.04167 *
## credit_score -7.644e+00 6.226e+00 -1.228 0.21956
## loan_percent_income -5.094e+05 4.537e+03 -112.266 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48350 on 25873 degrees of freedom
## Multiple R-squared: 0.4348, Adjusted R-squared: 0.4346
## F-statistic: 1810 on 11 and 25873 DF, p-value: < 2.2e-16
par(mfrow = c(1, 2))
plot(lm_raw, which = 1, main = "Residuals vs Fitted (Raw LM)")
plot(lm_raw, which = 3, main = "Scale-Location (Raw LM)")par(mfrow = c(1, 1))
par(mfrow = c(1, 2))
hist(
residuals(lm_raw), breaks = 50, col = "#3498db",
main = "Residual Histogram (Raw LM)", xlab = "Residuals"
)
plot(lm_raw, which = 2, main = "Q-Q Plot (Raw LM)")par(mfrow = c(1, 1))
ad_raw <- ad.test(residuals(lm_raw))
cat("\nAnderson-Darling Test (Raw LM):\n")##
## Anderson-Darling Test (Raw LM):
cat(" A =", round(ad_raw$statistic, 4),
" p-value =", formatC(ad_raw$p.value, format = "e", digits = 3), "\n")## A = 3085.106 p-value = 3.700e-24
vif_vals <- vif(lm_raw)
vif_df <- if (is.matrix(vif_vals)) {
data.frame(
Predictor = rownames(vif_vals),
GVIF = round(vif_vals[, "GVIF"], 3),
Df = vif_vals[, "Df"],
GVIF_adj = round(vif_vals[, "GVIF^(1/(2*Df))"], 3)
)
} else {
data.frame(Predictor = names(vif_vals), VIF = round(vif_vals, 3))
}
cat("\nVIF — Baseline Model:\n")##
## VIF — Baseline Model:
## Predictor GVIF Df GVIF_adj
## person_age person_age 10.269 1 3.205
## person_education person_education 1.064 2 1.016
## person_emp_exp person_emp_exp 10.291 1 3.208
## person_home_ownership person_home_ownership 1.151 3 1.024
## loan_amnt loan_amnt 1.759 1 1.326
## loan_int_rate loan_int_rate 1.050 1 1.025
## credit_score credit_score 1.092 1 1.045
## loan_percent_income loan_percent_income 1.732 1 1.316
##
## Breusch-Pagan Test (Raw LM):
cat(" BP =", round(bp_raw$statistic, 3),
" df =", bp_raw$parameter,
" p-value =", formatC(bp_raw$p.value, format = "e", digits = 3), "\n")## BP = 232.479 df = 11 p-value = 1.289e-43
lm_log <- lm(
log_income ~ person_age + person_education + person_emp_exp +
person_home_ownership + loan_amnt + loan_int_rate +
credit_score + loan_percent_income,
data = train_df
)
summary(lm_log)##
## Call:
## lm(formula = log_income ~ person_age + person_education + person_emp_exp +
## person_home_ownership + loan_amnt + loan_int_rate + credit_score +
## loan_percent_income, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7147 -0.1192 0.0056 0.1186 3.2153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.112e+01 3.069e-02 362.232 < 2e-16 ***
## person_age 4.935e-03 9.288e-04 5.313 1.09e-07 ***
## person_educationBachelor 8.349e-03 3.832e-03 2.179 0.0294 *
## person_educationMaster 5.127e-03 4.611e-03 1.112 0.2662
## person_emp_exp -1.346e-03 9.255e-04 -1.454 0.1459
## person_home_ownershipOTHER -9.952e-03 3.305e-02 -0.301 0.7633
## person_home_ownershipOWN -1.051e-01 7.160e-03 -14.676 < 2e-16 ***
## person_home_ownershipRENT -1.024e-01 3.705e-03 -27.654 < 2e-16 ***
## loan_amnt 8.389e-05 3.498e-07 239.813 < 2e-16 ***
## loan_int_rate -6.118e-03 5.694e-04 -10.744 < 2e-16 ***
## credit_score 4.727e-06 3.443e-05 0.137 0.8908
## loan_percent_income -5.775e+00 2.509e-02 -230.157 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2674 on 25873 degrees of freedom
## Multiple R-squared: 0.7667, Adjusted R-squared: 0.7666
## F-statistic: 7732 on 11 and 25873 DF, p-value: < 2.2e-16
par(mfrow = c(1, 1))
bp_log <- bptest(lm_log)
ad_log <- ad.test(residuals(lm_log))
cat("\nPost-transformation diagnostics:\n")##
## Post-transformation diagnostics:
## Breusch-Pagan p = 8.173e-86
## Anderson-Darling p = 3.700e-24
## Mean of residuals = 0
vif_log <- vif(lm_log)
vif_log_df <- if (is.matrix(vif_log)) {
data.frame(
Predictor = rownames(vif_log),
GVIF = round(vif_log[, "GVIF"], 3),
Df = vif_log[, "Df"],
GVIF_adj = round(vif_log[, "GVIF^(1/(2*Df))"], 3)
)
} else {
data.frame(Predictor = names(vif_log), VIF = round(vif_log, 3))
}
cat("\nVIF — Log-Linear Model:\n")##
## VIF — Log-Linear Model:
## Predictor GVIF Df GVIF_adj
## person_age person_age 10.269 1 3.205
## person_education person_education 1.064 2 1.016
## person_emp_exp person_emp_exp 10.291 1 3.208
## person_home_ownership person_home_ownership 1.151 3 1.024
## loan_amnt loan_amnt 1.759 1 1.326
## loan_int_rate loan_int_rate 1.050 1 1.025
## credit_score credit_score 1.092 1 1.045
## loan_percent_income loan_percent_income 1.732 1 1.316
##
## Call:
## lm(formula = log_income ~ person_age + person_education + person_emp_exp +
## person_home_ownership + loan_amnt + loan_int_rate + loan_percent_income,
## data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7151 -0.1193 0.0056 0.1186 3.2154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.112e+01 2.230e-02 498.734 < 2e-16 ***
## person_age 4.934e-03 9.288e-04 5.312 1.09e-07 ***
## person_educationBachelor 8.446e-03 3.767e-03 2.242 0.025 *
## person_educationMaster 5.266e-03 4.499e-03 1.170 0.242
## person_emp_exp -1.338e-03 9.236e-04 -1.448 0.148
## person_home_ownershipOTHER -9.974e-03 3.304e-02 -0.302 0.763
## person_home_ownershipOWN -1.051e-01 7.159e-03 -14.676 < 2e-16 ***
## person_home_ownershipRENT -1.025e-01 3.705e-03 -27.655 < 2e-16 ***
## loan_amnt 8.389e-05 3.498e-07 239.820 < 2e-16 ***
## loan_int_rate -6.117e-03 5.694e-04 -10.744 < 2e-16 ***
## loan_percent_income -5.775e+00 2.509e-02 -230.167 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2674 on 25874 degrees of freedom
## Multiple R-squared: 0.7667, Adjusted R-squared: 0.7667
## F-statistic: 8505 on 10 and 25874 DF, p-value: < 2.2e-16
set.seed(42)
rf_income <- randomForest(
log_income ~ person_age + person_education + person_emp_exp +
person_home_ownership + loan_amnt + loan_int_rate +
credit_score + loan_percent_income,
data = train_df,
ntree = 300,
importance = TRUE,
na.action = na.omit
)
cat("\nOOB R² (Random Forest on log-income):",
round(1 - rf_income$mse[300] / var(train_df$log_income, na.rm = TRUE), 4), "\n")##
## OOB R² (Random Forest on log-income): 0.9435
eval_reg <- function(model, test, log_response = TRUE) {
preds_log <- predict(model, test)
preds <- if (log_response) exp(preds_log) else preds_log
actual <- test$person_income
rmse <- sqrt(mean((actual - preds)^2, na.rm = TRUE))
r2 <- cor(actual, preds, use = "complete.obs")^2
c(RMSE = rmse, R2 = r2)
}
raw_metrics <- eval_reg(lm_raw, test_df, log_response = FALSE)
log_metrics <- eval_reg(lm_log, test_df)
step_metrics <- eval_reg(lm_step, test_df)
rf_metrics <- eval_reg(rf_income, test_df)
results_reg <- data.frame(
Model = c(
"Linear Reg (Raw Income)",
"Log-Linear Reg (Required)",
"Log-Linear Stepwise AIC",
"Random Forest (Benchmark)"
),
RMSE = round(c(
raw_metrics["RMSE"], log_metrics["RMSE"],
step_metrics["RMSE"], rf_metrics["RMSE"]
)),
R_Squared = round(c(
raw_metrics["R2"], log_metrics["R2"],
step_metrics["R2"], rf_metrics["R2"]
), 4)
)
cat("\n========== Regression Model Comparison — Test Set ==========\n")##
## ========== Regression Model Comparison — Test Set ==========
## Model RMSE R_Squared
## 1 Linear Reg (Raw Income) 40408 0.5147
## 2 Log-Linear Reg (Required) 40321 0.5290
## 3 Log-Linear Stepwise AIC 40322 0.5290
## 4 Random Forest (Benchmark) 27216 0.8427
##
## FINAL MODEL: lowest RMSE / highest R² selected.
# Actual vs Predicted plot
best_preds <- exp(predict(lm_log, test_df))
ggplot(
data.frame(Actual = test_df$person_income, Predicted = best_preds),
aes(x = Predicted, y = Actual)
) +
geom_point(alpha = 0.25, colour = "#2980b9", size = 1.5) +
geom_abline(intercept = 0, slope = 1,
colour = "#e74c3c", linewidth = 1, linetype = "dashed") +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(
title = "Actual vs Predicted Income — Log-Linear Model",
subtitle = "Dashed line = perfect prediction",
x = "Predicted Income (USD)", y = "Actual Income (USD)"
) +
theme_loan()cls_formula <- loan_status_f ~
person_income + person_age + person_education +
person_home_ownership + loan_amnt + loan_int_rate +
loan_percent_income + credit_score +
previous_loan_defaults_on_file + loan_intent +
age_group + high_risk + loan_to_incomelogit_model <- glm(
loan_status ~
log_income + person_age + person_education +
person_home_ownership + loan_amnt + loan_int_rate +
loan_percent_income + credit_score +
previous_loan_defaults_on_file + loan_intent,
data = train_df,
family = binomial(link = "logit")
)
summary(logit_model)##
## Call:
## glm(formula = loan_status ~ log_income + person_age + person_education +
## person_home_ownership + loan_amnt + loan_int_rate + loan_percent_income +
## credit_score + previous_loan_defaults_on_file + loan_intent,
## family = binomial(link = "logit"), data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.205e+01 1.115e+00 10.806 < 2e-16 ***
## log_income -1.035e+00 9.529e-02 -10.857 < 2e-16 ***
## person_age 2.800e-03 4.081e-03 0.686 0.492625
## person_educationBachelor 2.165e-02 5.441e-02 0.398 0.690746
## person_educationMaster 5.731e-02 6.505e-02 0.881 0.378292
## person_home_ownershipOTHER 1.429e-01 4.324e-01 0.330 0.741060
## person_home_ownershipOWN -1.579e+00 1.370e-01 -11.528 < 2e-16 ***
## person_home_ownershipRENT 6.644e-01 5.360e-02 12.395 < 2e-16 ***
## loan_amnt -1.053e-05 9.365e-06 -1.125 0.260602
## loan_int_rate 3.294e-01 8.708e-03 37.829 < 2e-16 ***
## loan_percent_income 9.909e+00 6.296e-01 15.739 < 2e-16 ***
## credit_score -9.510e-03 5.469e-04 -17.389 < 2e-16 ***
## previous_loan_defaults_on_fileYes -2.033e+01 1.348e+02 -0.151 0.880105
## loan_intentEDUCATION -9.196e-01 7.752e-02 -11.864 < 2e-16 ***
## loan_intentHOMEIMPROVEMENT 9.452e-04 8.904e-02 0.011 0.991531
## loan_intentMEDICAL -2.607e-01 7.497e-02 -3.477 0.000506 ***
## loan_intentPERSONAL -7.472e-01 7.995e-02 -9.346 < 2e-16 ***
## loan_intentVENTURE -1.196e+00 8.369e-02 -14.288 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27452 on 25884 degrees of freedom
## Residual deviance: 11297 on 25867 degrees of freedom
## AIC: 11333
##
## Number of Fisher Scoring iterations: 19
logit_probs <- predict(logit_model, test_df, type = "response")
logit_preds <- factor(
ifelse(logit_probs > 0.5, "Defaulted", "Repaid"),
levels = c("Repaid", "Defaulted")
)
cm_logit <- confusionMatrix(logit_preds, test_df$loan_status_f,
positive = "Defaulted")
cat("\n--- Logistic Regression Confusion Matrix ---\n")##
## --- Logistic Regression Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction Repaid Defaulted
## Repaid 4705 354
## Defaulted 312 1090
##
## Accuracy : 0.8969
## 95% CI : (0.8892, 0.9042)
## No Information Rate : 0.7765
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6999
##
## Mcnemar's Test P-Value : 0.1121
##
## Sensitivity : 0.7548
## Specificity : 0.9378
## Pos Pred Value : 0.7775
## Neg Pred Value : 0.9300
## Prevalence : 0.2235
## Detection Rate : 0.1687
## Detection Prevalence : 0.2170
## Balanced Accuracy : 0.8463
##
## 'Positive' Class : Defaulted
##
set.seed(123)
tree_model <- rpart(
cls_formula,
data = train_df,
method = "class",
control = rpart.control(cp = 0.001, minsplit = 30)
)
best_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
tree_pruned <- prune(tree_model, cp = best_cp)
rpart.plot(
tree_pruned, type = 4, extra = 104,
main = "Pruned Decision Tree — Loan Default", cex = 0.7
)tree_preds <- predict(tree_pruned, test_df, type = "class")
cm_tree <- confusionMatrix(tree_preds, test_df$loan_status_f,
positive = "Defaulted")
cat("\n--- Decision Tree Confusion Matrix ---\n")##
## --- Decision Tree Confusion Matrix ---
set.seed(42)
rf_class <- randomForest(
cls_formula,
data = train_df,
ntree = 300,
mtry = 4,
importance = TRUE,
na.action = na.omit
)
rf_preds <- predict(rf_class, test_df)
cm_rf <- confusionMatrix(rf_preds, test_df$loan_status_f,
positive = "Defaulted")
cat("\n--- Random Forest Confusion Matrix ---\n")##
## --- Random Forest Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction Repaid Defaulted
## Repaid 4890 332
## Defaulted 127 1112
##
## Accuracy : 0.929
## 95% CI : (0.9224, 0.9351)
## No Information Rate : 0.7765
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7844
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7701
## Specificity : 0.9747
## Pos Pred Value : 0.8975
## Neg Pred Value : 0.9364
## Prevalence : 0.2235
## Detection Rate : 0.1721
## Detection Prevalence : 0.1918
## Balanced Accuracy : 0.8724
##
## 'Positive' Class : Defaulted
##
imp_df <- as.data.frame(importance(rf_class)) %>%
rownames_to_column("Feature") %>%
arrange(desc(MeanDecreaseGini))
ggplot(imp_df,
aes(x = reorder(Feature, MeanDecreaseGini),
y = MeanDecreaseGini, fill = MeanDecreaseGini)) +
geom_col(alpha = 0.88) +
coord_flip() +
scale_fill_viridis_c(option = "C", guide = "none") +
labs(
title = "Random Forest — Variable Importance",
subtitle = "Mean Decrease Gini: higher = more predictive power",
x = NULL, y = "Mean Decrease Gini"
) +
theme_loan()nb_model <- naiveBayes(
loan_status_f ~ person_education + person_home_ownership +
loan_intent + previous_loan_defaults_on_file +
age_group + high_risk,
data = train_df
)
nb_preds <- predict(nb_model, test_df)
cm_nb <- confusionMatrix(nb_preds, test_df$loan_status_f,
positive = "Defaulted")
cat("\n--- Naïve Bayes Confusion Matrix ---\n")##
## --- Naïve Bayes Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction Repaid Defaulted
## Repaid 4394 562
## Defaulted 623 882
##
## Accuracy : 0.8166
## 95% CI : (0.8069, 0.826)
## No Information Rate : 0.7765
## P-Value [Acc > NIR] : 1.427e-15
##
## Kappa : 0.4794
##
## Mcnemar's Test P-Value : 0.08134
##
## Sensitivity : 0.6108
## Specificity : 0.8758
## Pos Pred Value : 0.5860
## Neg Pred Value : 0.8866
## Prevalence : 0.2235
## Detection Rate : 0.1365
## Detection Prevalence : 0.2329
## Balanced Accuracy : 0.7433
##
## 'Positive' Class : Defaulted
##
roc_logit <- roc(test_df$loan_status, logit_probs, quiet = TRUE)
roc_tree <- roc(
test_df$loan_status,
predict(tree_pruned, test_df, type = "prob")[, "Defaulted"],
quiet = TRUE
)
roc_rf <- roc(
test_df$loan_status,
predict(rf_class, test_df, type = "prob")[, "Defaulted"],
quiet = TRUE
)
plot(roc_logit, col = palette_main[1], lwd = 2,
main = "ROC Curves — Loan Default Classifiers")
lines(roc_tree, col = palette_main[2], lwd = 2)
lines(roc_rf, col = palette_main[3], lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "grey60")
legend(
"bottomright",
legend = c(
paste0("Logistic Reg AUC = ", round(auc(roc_logit), 3)),
paste0("Decision Tree AUC = ", round(auc(roc_tree), 3)),
paste0("Random Forest AUC = ", round(auc(roc_rf), 3))
),
col = palette_main[1:3], lwd = 2
)extract_metrics <- function(cm, model_name, auc_val = NA) {
data.frame(
Model = model_name,
Accuracy = round(cm$overall["Accuracy"] * 100, 2),
Sensitivity = round(cm$byClass["Sensitivity"] * 100, 2),
Specificity = round(cm$byClass["Specificity"] * 100, 2),
AUC = round(auc_val, 4)
)
}
class_results <- rbind(
extract_metrics(cm_logit, "Logistic Regression", auc(roc_logit)),
extract_metrics(cm_tree, "Decision Tree (Pruned)", auc(roc_tree)),
extract_metrics(cm_rf, "Random Forest", auc(roc_rf)),
extract_metrics(cm_nb, "Naïve Bayes", NA)
)
rownames(class_results) <- NULL
cat("\n========== Classification Model Comparison — Test Set ==========\n")##
## ========== Classification Model Comparison — Test Set ==========
## Model Accuracy Sensitivity Specificity AUC
## 1 Logistic Regression 89.69 75.48 93.78 0.9530
## 2 Decision Tree (Pruned) 92.09 74.52 97.15 0.9612
## 3 Random Forest 92.90 77.01 97.47 0.9762
## 4 Naïve Bayes 81.66 61.08 87.58 NA
best_cls <- class_results$Model[which.max(class_results$AUC)]
cat("\nFINAL CLASSIFIER:", best_cls, "(highest AUC)\n")##
## FINAL CLASSIFIER: Random Forest (highest AUC)
credit_risk_tbl <- df_clean %>%
mutate(
credit_band = cut(
credit_score,
breaks = c(300, 500, 600, 700, 800, 850),
labels = c("300-500", "501-600", "601-700", "701-800", "801-850")
)
) %>%
group_by(credit_band) %>%
summarise(
n = n(),
default_pct = round(100 * mean(loan_status), 1),
avg_income = round(mean(person_income)),
.groups = "drop"
)
cat("\n========== Default Rate & Income by Credit Score Band ==========\n")##
## ========== Default Rate & Income by Credit Score Band ==========
## # A tibble: 4 × 4
## credit_band n default_pct avg_income
## <fct> <int> <dbl> <dbl>
## 1 300-500 529 21.2 74458
## 2 501-600 10520 22.6 78787
## 3 601-700 31252 22.2 79867
## 4 701-800 2692 21.2 85838
df_clean %>%
mutate(
credit_band = cut(
credit_score,
breaks = c(300, 500, 600, 700, 800, 850),
labels = c("300-500", "501-600", "601-700", "701-800", "801-850")
)
) %>%
ggplot(aes(x = credit_score, y = log_income, colour = loan_status_f)) +
geom_point(alpha = 0.15, size = 1.2) +
geom_smooth(method = "lm", se = TRUE) +
scale_colour_manual(values = c("#27ae60", "#e74c3c")) +
labs(
title = "Credit Score vs Log(Income) by Loan Outcome",
subtitle = "Higher credit score + higher income → lower default risk",
x = "Credit Score", y = "Log(Income)", colour = "Status"
) +
theme_loan()# Additional packages needed for Part 2 not already loaded in Part 1
library(ggridges) # geom_density_ridges()
library(viridis) # scale_fill_viridis_c()
library(RColorBrewer) # brewer palettes# ── Custom palette & theme ────────────────────────────────────────────────────
palette_sal <- c(
"#2980b9", "#27ae60", "#e74c3c", "#f39c12",
"#8e44ad", "#16a085", "#2c3e50"
)
theme_report <- function() {
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 13, colour = "#2c3e50",
margin = ggplot2::margin(b = 6)),
plot.subtitle = element_text(colour = "#7f8c8d", size = 10,
margin = ggplot2::margin(b = 10)),
axis.title = element_text(colour = "#555555", size = 10),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "#eaecef"),
legend.position = "bottom",
legend.title = element_text(face = "bold")
)
}df_raw_sal <- read.csv("salaries.csv", stringsAsFactors = FALSE)
cat("Raw dimensions:", nrow(df_raw_sal), "rows ×", ncol(df_raw_sal), "columns\n\n")## Raw dimensions: 88584 rows × 11 columns
## 'data.frame': 88584 obs. of 11 variables:
## $ work_year : int 2025 2025 2025 2025 2025 2025 2025 2025 2025 2025 ...
## $ experience_level : chr "MI" "SE" "SE" "SE" ...
## $ employment_type : chr "FT" "FT" "FT" "FT" ...
## $ job_title : chr "Customer Success Manager" "Engineer" "Engineer" "Applied Scientist" ...
## $ salary : int 57000 165000 109000 294000 137600 82000 44000 149800 89700 200000 ...
## $ salary_currency : chr "EUR" "USD" "USD" "USD" ...
## $ salary_in_usd : int 60000 165000 109000 294000 137600 82000 44000 149800 89700 200000 ...
## $ employee_residence: chr "NL" "US" "US" "US" ...
## $ remote_ratio : int 50 0 0 0 0 0 0 0 0 0 ...
## $ company_location : chr "NL" "US" "US" "US" ...
## $ company_size : chr "L" "M" "M" "M" ...
## work_year experience_level employment_type job_title salary
## 1 2025 MI FT Customer Success Manager 57000
## 2 2025 SE FT Engineer 165000
## 3 2025 SE FT Engineer 109000
## 4 2025 SE FT Applied Scientist 294000
## 5 2025 SE FT Applied Scientist 137600
## 6 2025 EN FT Data Analyst 82000
## salary_currency salary_in_usd employee_residence remote_ratio
## 1 EUR 60000 NL 50
## 2 USD 165000 US 0
## 3 USD 109000 US 0
## 4 USD 294000 US 0
## 5 USD 137600 US 0
## 6 USD 82000 US 0
## company_location company_size
## 1 NL L
## 2 US M
## 3 US M
## 4 US M
## 5 US M
## 6 US M
na_report <- colSums(is.na(df_raw_sal))
if (any(na_report > 0)) {
cat("Columns with NAs:\n")
print(na_report[na_report > 0])
} else {
cat("✔ No missing values detected.\n")
}## ✔ No missing values detected.
df <- df_raw_sal %>%
drop_na(salary_in_usd) %>%
mutate(
# ── Ordered factors ──────────────────────────────────────────────────────
experience_level = factor(
experience_level,
levels = c("EN", "MI", "SE", "EX"),
labels = c("Entry", "Mid", "Senior", "Executive"),
ordered = TRUE
),
company_size = factor(
company_size,
levels = c("S", "M", "L"),
labels = c("Small", "Medium", "Large"),
ordered = TRUE
),
# ── Unordered factors ────────────────────────────────────────────────────
employment_type = factor(
employment_type,
levels = c("FT", "PT", "CT", "FL"),
labels = c("Full-Time", "Part-Time", "Contract", "Freelance")
),
remote_label = factor(
remote_ratio,
levels = c(0, 50, 100),
labels = c("On-Site", "Hybrid", "Fully Remote")
),
work_year = as.integer(work_year),
# ── Derived / engineered features ────────────────────────────────────────
log_salary = log(salary_in_usd),
# Era grouping: pandemic (2020-21), recovery (2022-23), AI boom (2024-25)
year_group = cut(
work_year,
breaks = c(2019, 2021, 2023, 2025),
labels = c("2020-21", "2022-23", "2024-25")
),
# Consolidate 80+ job titles into 4 meaningful buckets
job_category = case_when(
grepl("Analyst", job_title, ignore.case = TRUE) ~ "Data Analyst",
grepl("Scientist", job_title, ignore.case = TRUE) ~ "Data Scientist",
grepl("Engineer", job_title, ignore.case = TRUE) ~ "Data / ML Engineer",
grepl("Manager|Director|Lead", job_title, ignore.case = TRUE) ~ "Management",
TRUE ~ "Other"
),
job_category = factor(job_category)
)
# Salary quartile tiers — classification target
sal_q <- quantile(df$salary_in_usd, probs = c(0, .25, .50, .75, 1), na.rm = TRUE)
df$salary_category <- cut(
df$salary_in_usd,
breaks = sal_q,
labels = c("Low", "Medium", "High", "Very High"),
include.lowest = TRUE
)
cat("\nCleaned dataset:", nrow(df), "rows ×", ncol(df), "columns\n")##
## Cleaned dataset: 88584 rows × 16 columns
cat("Salary range: $", format(min(df$salary_in_usd), big.mark = ","),
"→ $", format(max(df$salary_in_usd), big.mark = ","), "\n")## Salary range: $ 15,000 → $ 800,000
## work_year salary_in_usd log_salary remote_ratio
## Min. :2020 Min. : 15000 Min. : 9.616 Min. : 0.00
## 1st Qu.:2024 1st Qu.:106097 1st Qu.:11.572 1st Qu.: 0.00
## Median :2024 Median :146307 Median :11.893 Median : 0.00
## Mean :2024 Mean :157568 Mean :11.858 Mean : 21.29
## 3rd Qu.:2024 3rd Qu.:198600 3rd Qu.:12.199 3rd Qu.: 0.00
## Max. :2025 Max. :800000 Max. :13.592 Max. :100.00
kpi <- data.frame(
Metric = c("Records", "Median Salary", "Mean Salary",
"Max Salary", "Unique Job Titles", "Years Covered"),
Value = c(
nrow(df),
dollar(median(df$salary_in_usd)),
dollar(round(mean(df$salary_in_usd))),
dollar(max(df$salary_in_usd)),
n_distinct(df$job_title),
paste(min(df$work_year), "–", max(df$work_year))
)
)
print(kpi)## Metric Value
## 1 Records 88584
## 2 Median Salary $146,307
## 3 Mean Salary $157,568
## 4 Max Salary $800,000
## 5 Unique Job Titles 312
## 6 Years Covered 2020 – 2025
p1 <- ggplot(df, aes(x = salary_in_usd)) +
geom_histogram(bins = 60, fill = "#2980b9", colour = "white", alpha = 0.85) +
scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(title = "Raw Salary — Right-Skewed", x = "Salary (USD)", y = "Count") +
theme_report()
p2 <- ggplot(df, aes(x = log_salary)) +
geom_histogram(bins = 60, fill = "#27ae60", colour = "white", alpha = 0.85) +
labs(title = "Log(Salary) — Approx. Normal", x = "ln(Salary)", y = "Count") +
theme_report()
p1 + p2ggplot(df, aes(x = experience_level, y = salary_in_usd, fill = experience_level)) +
geom_violin(alpha = 0.6, trim = FALSE) +
geom_boxplot(width = 0.18, fill = "white", outlier.alpha = 0.25) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_fill_manual(values = palette_sal[1:4], guide = "none") +
labs(
title = "Salary Distribution by Experience Level",
subtitle = "Violin plot shows full density; boxplot shows quartiles",
x = "Experience Level", y = "Salary (USD)"
) +
theme_report()df %>%
group_by(work_year, experience_level) %>%
summarise(median_salary = median(salary_in_usd), .groups = "drop") %>%
ggplot(aes(x = work_year, y = median_salary,
colour = experience_level, group = experience_level)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2.5) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_colour_manual(values = palette_sal[1:4]) +
labs(
title = "Median Salary Trends 2020-2025 by Experience Level",
subtitle = "Tracking compensation growth across seniority tiers",
x = "Year", y = "Median Salary (USD)", colour = "Experience"
) +
theme_report()df %>%
group_by(remote_label) %>%
summarise(
median = median(salary_in_usd),
q25 = quantile(salary_in_usd, 0.25),
q75 = quantile(salary_in_usd, 0.75),
n = n(),
.groups = "drop"
) %>%
ggplot(aes(x = remote_label, y = median, fill = remote_label)) +
geom_col(alpha = 0.85) +
geom_errorbar(aes(ymin = q25, ymax = q75), width = 0.25, colour = "#2c3e50") +
geom_text(aes(label = dollar(round(median))), vjust = -0.5, fontface = "bold") +
scale_fill_manual(values = palette_sal[1:3], guide = "none") +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(
title = "Median Salary by Remote Work Arrangement",
subtitle = "Error bars = IQR (25th–75th percentile)",
x = NULL, y = "Salary (USD)"
) +
theme_report()ggplot(df, aes(x = company_size, y = salary_in_usd, fill = company_size)) +
geom_boxplot(outlier.alpha = 0.3, alpha = 0.75) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_fill_manual(values = palette_sal[1:3], guide = "none") +
facet_wrap(~employment_type) +
labs(
title = "Salary by Company Size & Employment Type",
subtitle = "Large firms pay premiums, especially for full-time roles",
x = "Company Size", y = "Salary (USD)"
) +
theme_report()ggplot(df, aes(x = log_salary, y = job_category, fill = job_category)) +
geom_density_ridges(alpha = 0.7, scale = 1.5) +
scale_fill_manual(values = palette_sal, guide = "none") +
labs(
title = "Log-Salary Density by Job Category",
subtitle = "Ridge plot reveals distributional shape across roles",
x = "ln(Salary)", y = NULL
) +
theme_report()cor_mat_sal <- df %>%
select(work_year, salary_in_usd, log_salary, remote_ratio) %>%
cor(use = "complete.obs")
ggcorrplot(
cor_mat_sal,
method = "circle",
type = "lower",
lab = TRUE,
lab_size = 4,
colors = c("#e74c3c", "white", "#2980b9"),
title = "Pearson Correlation Matrix — Numeric Variables",
ggtheme = theme_report()
)df <- df %>%
mutate(
exp_size_combo = paste(
as.character(experience_level),
as.character(company_size), sep = " × "
)
)
fe_tbl <- df %>%
group_by(job_category, year_group) %>%
summarise(
n = n(),
median_salary = dollar(round(median(salary_in_usd))),
pct_remote = round(100 * mean(remote_ratio == 100), 1),
.groups = "drop"
) %>%
arrange(job_category, year_group)
cat("\n========== Feature-Engineered View: Job Category × Year Group ==========\n")##
## ========== Feature-Engineered View: Job Category × Year Group ==========
## # A tibble: 15 × 5
## job_category year_group n median_salary pct_remote
## <fct> <fct> <int> <chr> <dbl>
## 1 Data / ML Engineer 2020-21 109 $79,833 53.2
## 2 Data / ML Engineer 2022-23 4398 $153,000 32.7
## 3 Data / ML Engineer 2024-25 35990 $165,600 18.2
## 4 Data Analyst 2020-21 46 $71,893 60.9
## 5 Data Analyst 2022-23 1802 $104,000 41
## 6 Data Analyst 2024-25 11198 $99,000 21
## 7 Data Scientist 2020-21 121 $90,000 46.3
## 8 Data Scientist 2022-23 3144 $160,000 34.6
## 9 Data Scientist 2024-25 14930 $159,404 19
## 10 Management 2020-21 4 $120,000 50
## 11 Management 2022-23 237 $120,000 16.9
## 12 Management 2024-25 6561 $150,029 17.5
## 13 Other 2020-21 13 $150,000 61.5
## 14 Other 2022-23 601 $130,000 42.8
## 15 Other 2024-25 9430 $126,600 22.3
df %>%
count(job_category, salary_category) %>%
ggplot(aes(x = job_category, y = n, fill = salary_category)) +
geom_col(position = "fill", alpha = 0.87) +
scale_fill_manual(
values = c("#3498db", "#2ecc71", "#f39c12", "#e74c3c"),
name = "Salary Tier"
) +
scale_y_continuous(labels = percent_format()) +
coord_flip() +
labs(
title = "Salary Tier Composition by Job Category",
subtitle = "100% stacked — proportion in each pay bracket",
x = NULL, y = "Proportion"
) +
theme_report()set.seed(123)
# Regression split (continuous log-salary target)
df_reg <- df %>%
select(
salary_in_usd, log_salary, work_year, experience_level,
employment_type, company_size, remote_label, job_category
) %>%
na.omit()
idx_r <- createDataPartition(df_reg$log_salary, p = 0.80, list = FALSE)
train_reg <- df_reg[ idx_r, ]
test_reg <- df_reg[-idx_r, ]
# Classification split (salary tier target)
df_cls <- df %>%
select(
salary_category, work_year, experience_level,
employment_type, company_size, remote_label, job_category
) %>%
na.omit()
idx_c <- createDataPartition(df_cls$salary_category, p = 0.80, list = FALSE)
train_cls <- df_cls[ idx_c, ]
test_cls <- df_cls[-idx_c, ]
cat("Regression — Train:", nrow(train_reg), "| Test:", nrow(test_reg), "\n")## Regression — Train: 70869 | Test: 17715
## Classification — Train: 70869 | Test: 17715
lm_raw_sal <- lm(
salary_in_usd ~ work_year + experience_level +
employment_type + company_size + remote_label + job_category,
data = train_reg
)
summary(lm_raw_sal)##
## Call:
## lm(formula = salary_in_usd ~ work_year + experience_level + employment_type +
## company_size + remote_label + job_category, data = train_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -182789 -44633 -9633 33594 688249
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8551620.8 853101.9 -10.024 < 2e-16 ***
## work_year 4305.9 421.6 10.213 < 2e-16 ***
## experience_level.L 59222.5 1318.1 44.930 < 2e-16 ***
## experience_level.Q 4570.9 1015.8 4.500 6.82e-06 ***
## experience_level.C -3534.1 580.6 -6.086 1.16e-09 ***
## employment_typePart-Time -43483.7 5030.2 -8.645 < 2e-16 ***
## employment_typeContract -32419.6 4909.6 -6.603 4.05e-11 ***
## employment_typeFreelance -50590.3 18074.5 -2.799 0.00513 **
## company_size.L 23265.8 3898.0 5.969 2.40e-09 ***
## company_size.Q -12526.3 2314.4 -5.412 6.24e-08 ***
## remote_labelHybrid -52401.9 4494.3 -11.660 < 2e-16 ***
## remote_labelFully Remote -11850.9 621.1 -19.080 < 2e-16 ***
## job_categoryData Analyst -52230.8 812.9 -64.251 < 2e-16 ***
## job_categoryData Scientist -5599.5 670.0 -8.357 < 2e-16 ***
## job_categoryManagement -8738.5 982.3 -8.896 < 2e-16 ***
## job_categoryOther -32255.5 836.8 -38.544 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66660 on 70853 degrees of freedom
## Multiple R-squared: 0.1743, Adjusted R-squared: 0.1741
## F-statistic: 996.8 on 15 and 70853 DF, p-value: < 2.2e-16
par(mfrow = c(1, 2))
plot(lm_raw_sal, which = 1, main = "Residuals vs Fitted (Raw Salary)")
plot(lm_raw_sal, which = 3, main = "Scale-Location (Raw Salary)")par(mfrow = c(1, 1))
par(mfrow = c(1, 2))
hist(
residuals(lm_raw_sal), breaks = 50, col = "#3498db", border = "white",
main = "Residuals Histogram (Raw)", xlab = "Residuals"
)
plot(lm_raw_sal, which = 2, main = "Q-Q Plot (Raw Salary)")par(mfrow = c(1, 1))
ad_raw_sal <- ad.test(residuals(lm_raw_sal))
cat("\nAnderson-Darling Test (Raw Salary Model):\n")##
## Anderson-Darling Test (Raw Salary Model):
cat(" A =", round(ad_raw_sal$statistic, 4),
" p-value =", formatC(ad_raw_sal$p.value, format = "e", digits = 3), "\n")## A = 774.6828 p-value = 3.700e-24
##
## Breusch-Pagan Test (Raw Salary Model):
cat(" BP =", round(bp_raw_sal$statistic, 3),
" p-value =", formatC(bp_raw_sal$p.value, format = "e", digits = 3), "\n")## BP = 647.331 p-value = 3.063e-128
vif_raw_sal <- vif(lm_raw_sal)
vif_raw_sal_df <- if (is.matrix(vif_raw_sal)) {
data.frame(
Predictor = rownames(vif_raw_sal),
GVIF = round(vif_raw_sal[, "GVIF"], 3),
Df = vif_raw_sal[, "Df"],
GVIF_adj = round(vif_raw_sal[, "GVIF^(1/(2*Df))"], 3)
)
} else {
data.frame(Predictor = names(vif_raw_sal), VIF = round(vif_raw_sal, 3))
}
cat("\nVIF — Baseline Model:\n")##
## VIF — Baseline Model:
## Predictor GVIF Df GVIF_adj
## work_year work_year 1.088 1 1.043
## experience_level experience_level 1.250 3 1.038
## employment_type employment_type 1.047 3 1.008
## company_size company_size 1.147 2 1.035
## remote_label remote_label 1.121 2 1.029
## job_category job_category 1.261 4 1.029
lm_log_sal <- lm(
log_salary ~ work_year + experience_level +
employment_type + company_size + remote_label + job_category,
data = train_reg
)
summary(lm_log_sal)##
## Call:
## lm(formula = log_salary ~ work_year + experience_level + employment_type +
## company_size + remote_label + job_category, data = train_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.20916 -0.25629 0.01486 0.27929 2.13325
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -40.887361 5.415632 -7.550 4.41e-14 ***
## work_year 0.026050 0.002676 9.733 < 2e-16 ***
## experience_level.L 0.453070 0.008368 54.146 < 2e-16 ***
## experience_level.Q -0.024044 0.006449 -3.729 0.000193 ***
## experience_level.C -0.019814 0.003686 -5.375 7.67e-08 ***
## employment_typePart-Time -0.498455 0.031933 -15.610 < 2e-16 ***
## employment_typeContract -0.430857 0.031167 -13.824 < 2e-16 ***
## employment_typeFreelance -0.662959 0.114740 -5.778 7.59e-09 ***
## company_size.L 0.246188 0.024745 9.949 < 2e-16 ***
## company_size.Q -0.132263 0.014692 -9.002 < 2e-16 ***
## remote_labelHybrid -0.547849 0.028530 -19.202 < 2e-16 ***
## remote_labelFully Remote -0.066311 0.003943 -16.818 < 2e-16 ***
## job_categoryData Analyst -0.356829 0.005161 -69.146 < 2e-16 ***
## job_categoryData Scientist -0.024616 0.004253 -5.787 7.18e-09 ***
## job_categoryManagement -0.055554 0.006236 -8.909 < 2e-16 ***
## job_categoryOther -0.230543 0.005312 -43.397 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4231 on 70853 degrees of freedom
## Multiple R-squared: 0.2358, Adjusted R-squared: 0.2356
## F-statistic: 1457 on 15 and 70853 DF, p-value: < 2.2e-16
par(mfrow = c(1, 1))
bp_log_sal <- bptest(lm_log_sal)
ad_log_sal <- ad.test(residuals(lm_log_sal))
cat("\nPost-transformation diagnostics:\n")##
## Post-transformation diagnostics:
## Breusch-Pagan p = 0.000e+00
## Anderson-Darling p = 3.700e-24
## Mean of residuals = 0
vif_log_sal <- vif(lm_log_sal)
vif_log_sal_df <- if (is.matrix(vif_log_sal)) {
data.frame(
Predictor = rownames(vif_log_sal),
GVIF = round(vif_log_sal[, "GVIF"], 3),
Df = vif_log_sal[, "Df"],
GVIF_adj = round(vif_log_sal[, "GVIF^(1/(2*Df))"], 3)
)
} else {
data.frame(Predictor = names(vif_log_sal), VIF = round(vif_log_sal, 3))
}
cat("\nVIF — Log-Linear Model:\n")##
## VIF — Log-Linear Model:
## Predictor GVIF Df GVIF_adj
## work_year work_year 1.088 1 1.043
## experience_level experience_level 1.250 3 1.038
## employment_type employment_type 1.047 3 1.008
## company_size company_size 1.147 2 1.035
## remote_label remote_label 1.121 2 1.029
## job_category job_category 1.261 4 1.029
# Coefficient interpretation (% salary effect)
coef_df <- as.data.frame(summary(lm_log_sal)$coefficients) %>%
rownames_to_column("Term") %>%
mutate(
`Salary % effect` = round((exp(Estimate) - 1) * 100, 2),
Significant = ifelse(`Pr(>|t|)` < 0.05, "Yes", "No")
) %>%
select(Term, Estimate, `Salary % effect`, `Pr(>|t|)`, Significant)
cat("\n========== Log-Linear Model — Coefficient Salary % Effects ==========\n")##
## ========== Log-Linear Model — Coefficient Salary % Effects ==========
## Term Estimate Salary % effect Pr(>|t|)
## 1 (Intercept) -40.88736106 -100.00 4.408608e-14
## 2 work_year 0.02604969 2.64 2.246734e-22
## 3 experience_level.L 0.45306954 57.31 0.000000e+00
## 4 experience_level.Q -0.02404406 -2.38 1.927001e-04
## 5 experience_level.C -0.01981397 -1.96 7.665968e-08
## 6 employment_typePart-Time -0.49845483 -39.25 7.731562e-55
## 7 employment_typeContract -0.43085693 -35.00 2.075993e-43
## 8 employment_typeFreelance -0.66295869 -48.47 7.594185e-09
## 9 company_size.L 0.24618830 27.91 2.637457e-23
## 10 company_size.Q -0.13226257 -12.39 2.262214e-19
## 11 remote_labelHybrid -0.54784861 -42.18 5.727239e-82
## 12 remote_labelFully Remote -0.06631114 -6.42 2.395357e-63
## 13 job_categoryData Analyst -0.35682851 -30.01 0.000000e+00
## 14 job_categoryData Scientist -0.02461600 -2.43 7.177895e-09
## 15 job_categoryManagement -0.05555433 -5.40 5.269606e-19
## 16 job_categoryOther -0.23054302 -20.59 0.000000e+00
## Significant
## 1 Yes
## 2 Yes
## 3 Yes
## 4 Yes
## 5 Yes
## 6 Yes
## 7 Yes
## 8 Yes
## 9 Yes
## 10 Yes
## 11 Yes
## 12 Yes
## 13 Yes
## 14 Yes
## 15 Yes
## 16 Yes
##
## Call:
## lm(formula = log_salary ~ work_year + experience_level + employment_type +
## company_size + remote_label + job_category, data = train_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.20916 -0.25629 0.01486 0.27929 2.13325
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -40.887361 5.415632 -7.550 4.41e-14 ***
## work_year 0.026050 0.002676 9.733 < 2e-16 ***
## experience_level.L 0.453070 0.008368 54.146 < 2e-16 ***
## experience_level.Q -0.024044 0.006449 -3.729 0.000193 ***
## experience_level.C -0.019814 0.003686 -5.375 7.67e-08 ***
## employment_typePart-Time -0.498455 0.031933 -15.610 < 2e-16 ***
## employment_typeContract -0.430857 0.031167 -13.824 < 2e-16 ***
## employment_typeFreelance -0.662959 0.114740 -5.778 7.59e-09 ***
## company_size.L 0.246188 0.024745 9.949 < 2e-16 ***
## company_size.Q -0.132263 0.014692 -9.002 < 2e-16 ***
## remote_labelHybrid -0.547849 0.028530 -19.202 < 2e-16 ***
## remote_labelFully Remote -0.066311 0.003943 -16.818 < 2e-16 ***
## job_categoryData Analyst -0.356829 0.005161 -69.146 < 2e-16 ***
## job_categoryData Scientist -0.024616 0.004253 -5.787 7.18e-09 ***
## job_categoryManagement -0.055554 0.006236 -8.909 < 2e-16 ***
## job_categoryOther -0.230543 0.005312 -43.397 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4231 on 70853 degrees of freedom
## Multiple R-squared: 0.2358, Adjusted R-squared: 0.2356
## F-statistic: 1457 on 15 and 70853 DF, p-value: < 2.2e-16
set.seed(42)
rf_reg_sal <- randomForest(
log_salary ~ work_year + experience_level + employment_type +
company_size + remote_label + job_category,
data = train_reg,
ntree = 300,
importance = TRUE,
na.action = na.omit
)
cat("\nOOB R² (Random Forest on log-salary):",
round(1 - rf_reg_sal$mse[300] / var(train_reg$log_salary), 4), "\n")##
## OOB R² (Random Forest on log-salary): 0.2518
eval_reg_sal <- function(model, test, log_resp = TRUE) {
preds_log <- predict(model, test)
preds <- if (log_resp) exp(preds_log) else preds_log
actual <- test$salary_in_usd
rmse <- sqrt(mean((actual - preds)^2, na.rm = TRUE))
r2 <- 1 - sum((actual - preds)^2, na.rm = TRUE) / sum((actual - mean(actual, na.rm = TRUE))^2, na.rm = TRUE)
c(RMSE = rmse, R2 = r2)
}
m_raw <- eval_reg_sal(lm_raw_sal, test_reg, log_resp = FALSE)
m_log <- eval_reg_sal(lm_log_sal, test_reg)
m_step <- eval_reg_sal(lm_step_sal, test_reg)
m_rf <- eval_reg_sal(rf_reg_sal, test_reg)
reg_tbl <- data.frame(
Model = c(
"Baseline LM (Raw Salary)",
"Log-Linear LM (Required)",
"Log-Linear Stepwise AIC",
"Random Forest (Benchmark)"
),
RMSE = round(c(m_raw["RMSE"], m_log["RMSE"], m_step["RMSE"], m_rf["RMSE"])),
R_Squared = round(c(m_raw["R2"], m_log["R2"], m_step["R2"], m_rf["R2"]), 4)
)
cat("\n========== Regression Model Comparison — Test Set ==========\n")##
## ========== Regression Model Comparison — Test Set ==========
## Model RMSE R_Squared
## 1 Baseline LM (Raw Salary) 67791 0.1668
## 2 Log-Linear LM (Required) 69021 0.1363
## 3 Log-Linear Stepwise AIC 69021 0.1363
## 4 Random Forest (Benchmark) 68735 0.1435
# Actual vs Predicted
best_preds_sal <- exp(predict(lm_step_sal, test_reg))
pred_df <- data.frame(
Actual = test_reg$salary_in_usd,
Predicted = best_preds_sal,
Residual = test_reg$salary_in_usd - best_preds_sal
)
ggplot(pred_df, aes(x = Predicted, y = Actual)) +
geom_point(aes(colour = abs(Residual)), alpha = 0.6, size = 2) +
geom_abline(intercept = 0, slope = 1,
colour = "#e74c3c", linewidth = 1.1, linetype = "dashed") +
scale_colour_viridis_c(name = "Abs. Residual", labels = dollar_format()) +
scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(
title = "Actual vs Predicted Salary — Best Linear Model",
subtitle = "Dashed diagonal = perfect prediction; colour = residual size",
x = "Predicted Salary", y = "Actual Salary"
) +
theme_report()# Residual plot
ggplot(pred_df, aes(x = Predicted, y = Residual)) +
geom_hline(yintercept = 0, colour = "#e74c3c", linewidth = 1, linetype = "dashed") +
geom_point(colour = "#2980b9", alpha = 0.55, size = 1.8) +
geom_smooth(method = "loess", se = FALSE, colour = "#f39c12", linewidth = 1.1) +
scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(
title = "Residual Plot — Best Linear Model",
subtitle = "Flat LOESS band and random scatter indicate assumption is met",
x = "Predicted Salary", y = "Residual"
) +
theme_report()set.seed(123)
dt_cls <- rpart(
salary_category ~ .,
data = train_cls,
method = "class",
control = rpart.control(cp = 0.001, minsplit = 20)
)
best_cp <- dt_cls$cptable[which.min(dt_cls$cptable[, "xerror"]), "CP"]
dt_pruned <- prune(dt_cls, cp = best_cp)
rpart.plot(
dt_pruned, type = 4, extra = 104,
main = "Pruned Decision Tree — Salary Tier", cex = 0.7
)dt_preds <- predict(dt_pruned, test_cls, type = "class")
cm_dt <- confusionMatrix(dt_preds, test_cls$salary_category)
cat("\n--- Decision Tree Accuracy:",
round(cm_dt$overall["Accuracy"] * 100, 2), "% ---\n")##
## --- Decision Tree Accuracy: 38.75 % ---
## Reference
## Prediction Low Medium High Very High
## Low 2450 1191 722 317
## Medium 851 918 816 828
## High 245 573 780 564
## Very High 883 1747 2113 2717
set.seed(42)
rf_cls_sal <- randomForest(
salary_category ~ .,
data = train_cls,
ntree = 300,
importance = TRUE,
na.action = na.omit
)
rf_cls_preds <- predict(rf_cls_sal, test_cls)
cm_rf_cls <- confusionMatrix(rf_cls_preds, test_cls$salary_category)
cat("\n--- Random Forest Accuracy:",
round(cm_rf_cls$overall["Accuracy"] * 100, 2), "% ---\n")##
## --- Random Forest Accuracy: 39.37 % ---
# Confusion matrix heatmap
cm_df <- as.data.frame(cm_rf_cls$table) %>%
rename(Predicted = Prediction, Actual = Reference)
ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(colour = "white", linewidth = 1.2) +
geom_text(aes(label = Freq), size = 5, fontface = "bold") +
scale_fill_gradient(low = "#dce8f5", high = "#1a6fa8", name = "Count") +
labs(
title = "Confusion Matrix — Random Forest Salary Tier Classifier",
subtitle = paste0("Overall accuracy: ",
round(cm_rf_cls$overall["Accuracy"] * 100, 1), "%"),
x = "Actual Tier", y = "Predicted Tier"
) +
theme_report() +
theme(panel.grid = element_blank(), legend.position = "right")# Variable importance
imp_df_sal <- as.data.frame(importance(rf_cls_sal)) %>%
rownames_to_column("Feature") %>%
arrange(desc(MeanDecreaseGini))
ggplot(imp_df_sal,
aes(x = reorder(Feature, MeanDecreaseGini),
y = MeanDecreaseGini, fill = MeanDecreaseGini)) +
geom_col(alpha = 0.88) +
geom_text(aes(label = round(MeanDecreaseGini, 1)),
hjust = -0.15, size = 3.5, fontface = "bold") +
coord_flip() +
scale_fill_viridis_c(option = "C", guide = "none") +
scale_y_continuous(expand = expansion(mult = c(0, .15))) +
labs(
title = "Random Forest — Variable Importance (Salary Tier)",
subtitle = "Mean Decrease Gini: higher = more predictive",
x = NULL, y = "Mean Decrease Gini"
) +
theme_report()cls_tbl <- data.frame(
Model = c("Decision Tree (Pruned)", "Random Forest"),
Accuracy = c(
round(cm_dt$overall["Accuracy"] * 100, 2),
round(cm_rf_cls$overall["Accuracy"] * 100, 2)
),
Kappa = c(
round(cm_dt$overall["Kappa"], 3),
round(cm_rf_cls$overall["Kappa"], 3)
)
)
rownames(cls_tbl) <- NULL
cat("\n========== Classification Model Comparison ==========\n")##
## ========== Classification Model Comparison ==========
## Model Accuracy Kappa
## 1 Decision Tree (Pruned) 38.75 0.183
## 2 Random Forest 39.37 0.192
##
## FINAL CLASSIFIER: highest Accuracy + Kappa selected.
## Cohen's Kappa interpretation: >0.60 substantial; >0.80 almost-perfect.
ggplot(df,
aes(x = experience_level, y = salary_in_usd,
colour = company_size, group = company_size)) +
stat_summary(fun = median, geom = "line", linewidth = 1.2) +
stat_summary(fun = median, geom = "point", size = 3) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_colour_manual(values = palette_sal[1:3]) +
labs(
title = "Median Salary by Experience × Company Size",
subtitle = "Large companies show the steepest experience premium",
x = "Experience Level", y = "Median Salary (USD)",
colour = "Company Size"
) +
theme_report()df %>%
group_by(work_year, remote_label) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(work_year) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(x = work_year, y = pct, fill = remote_label)) +
geom_area(alpha = 0.8) +
scale_fill_manual(values = palette_sal[1:3]) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Remote Work Mix Over Time",
subtitle = "Fully remote share grew sharply post-2020",
x = "Year", y = "Proportion", fill = "Work Arrangement"
) +
theme_report()df %>%
group_by(work_year, employment_type) %>%
summarise(median_salary = median(salary_in_usd), .groups = "drop") %>%
ggplot(aes(x = work_year, y = median_salary,
colour = employment_type, group = employment_type)) +
geom_line(linewidth = 1.1) +
geom_point(size = 2.5) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_colour_manual(values = palette_sal) +
labs(
title = "Median Salary by Employment Type (2020–2025)",
subtitle = "Full-time contracts dominate; freelance variance is wide",
x = "Year", y = "Median Salary (USD)", colour = "Employment Type"
) +
theme_report()df %>%
group_by(year_group, job_category) %>%
summarise(median_salary = median(salary_in_usd), .groups = "drop") %>%
ggplot(aes(x = year_group, y = median_salary, fill = job_category)) +
geom_col(position = "dodge", alpha = 0.85) +
scale_fill_manual(values = palette_sal[1:5]) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(
title = "Median Salary by Job Category and Era",
subtitle = "Management and ML Engineering command premiums across all periods",
x = "Era", y = "Median Salary (USD)", fill = "Job Category"
) +
theme_report()## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-3 viridis_0.6.5 viridisLite_0.4.3
## [4] ggridges_0.5.7 patchwork_1.3.2 lubridate_1.9.5
## [7] forcats_1.0.1 stringr_1.6.0 dplyr_1.2.1
## [10] purrr_1.2.2 readr_2.2.0 tidyr_1.3.2
## [13] tibble_3.3.1 tidyverse_2.0.0 kableExtra_1.4.0
## [16] knitr_1.51 ggcorrplot_0.1.4.1 scales_1.4.0
## [19] pROC_1.19.0.1 e1071_1.7-17 randomForest_4.7-1.2
## [22] rpart.plot_3.1.4 rpart_4.1.24 caret_7.0-1
## [25] lattice_0.22-9 ggplot2_4.0.2 nortest_1.0-4
## [28] MASS_7.3-65 lmtest_0.9-40 zoo_1.8-15
## [31] car_3.1-5 carData_3.0-6
##
## loaded via a namespace (and not attached):
## [1] gridExtra_2.3 rlang_1.2.0 magrittr_2.0.5
## [4] otel_0.2.0 compiler_4.5.3 mgcv_1.9-4
## [7] systemfonts_1.3.2 vctrs_0.7.3 reshape2_1.4.5
## [10] pkgconfig_2.0.3 fastmap_1.2.0 labeling_0.4.3
## [13] utf8_1.2.6 rmarkdown_2.31 prodlim_2026.03.11
## [16] tzdb_0.5.0 xfun_0.57 cachem_1.1.0
## [19] jsonlite_2.0.0 recipes_1.3.2 parallel_4.5.3
## [22] R6_2.6.1 bslib_0.10.0 stringi_1.8.7
## [25] parallelly_1.46.1 jquerylib_0.1.4 Rcpp_1.1.1
## [28] iterators_1.0.14 future.apply_1.20.2 Matrix_1.7-4
## [31] splines_4.5.3 nnet_7.3-20 timechange_0.4.0
## [34] tidyselect_1.2.1 rstudioapi_0.18.0 abind_1.4-8
## [37] yaml_2.3.12 timeDate_4052.112 codetools_0.2-20
## [40] listenv_0.10.1 plyr_1.8.9 withr_3.0.2
## [43] S7_0.2.1 evaluate_1.0.5 future_1.70.0
## [46] survival_3.8-6 proxy_0.4-29 xml2_1.5.2
## [49] pillar_1.11.1 foreach_1.5.2 stats4_4.5.3
## [52] generics_0.1.4 hms_1.1.4 globals_0.19.1
## [55] class_7.3-23 glue_1.8.0 tools_4.5.3
## [58] data.table_1.18.2.1 ModelMetrics_1.2.2.2 gower_1.0.2
## [61] grid_4.5.3 ipred_0.9-15 nlme_3.1-168
## [64] Formula_1.2-5 cli_3.6.6 textshaping_1.0.5
## [67] svglite_2.2.2 lava_1.9.0 gtable_0.3.6
## [70] sass_0.4.10 digest_0.6.39 farver_2.1.2
## [73] htmltools_0.5.9 lifecycle_1.0.5 hardhat_1.4.3
```