PART 1
Loan Data Analysis
📊 Dataset: 45,000 loan applicants  |  🎯 Target: loan_status (default prediction)

Libraries

# Diagnostics / transforms — load BEFORE tidyverse to avoid masking conflicts
library(car)      # vif(), Anova()
library(lmtest)   # bptest() — Breusch-Pagan heteroscedasticity test
library(MASS)     # boxcox(), stepAIC()
library(nortest)  # ad.test() — Anderson-Darling normality test

# Model training & evaluation
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(e1071)    # SVM / naiveBayes
library(pROC)     # roc(), auc()
library(scales)

# Visualisation helpers
library(ggcorrplot)
library(knitr)
library(kableExtra)

# Core — load AFTER MASS so dplyr::select() wins over MASS::select()
library(tidyverse)
library(patchwork)
# ── Custom theme & palette ────────────────────────────────────────────────────

theme_loan <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title       = element_text(face = "bold", size = 13, colour = "#2c3e50"),
      plot.subtitle    = element_text(colour = "#7f8c8d", size = 10),
      axis.title       = element_text(colour = "#555555"),
      panel.grid.minor = element_blank(),
      panel.grid.major = element_line(colour = "#eaecef"),
      legend.position  = "bottom"
    )
}

palette_main <- c("#2980b9", "#e74c3c", "#27ae60", "#f39c12", "#8e44ad", "#16a085")

1 Data Loading & Initial Inspection

df_raw <- read.csv("loan_data.csv", stringsAsFactors = FALSE)

cat("Raw dimensions:", nrow(df_raw), "rows ×", ncol(df_raw), "columns\n\n")
## Raw dimensions: 45000 rows × 14 columns
str(df_raw)
## 'data.frame':    45000 obs. of  14 variables:
##  $ person_age                    : num  22 21 25 23 24 21 26 24 24 21 ...
##  $ person_gender                 : chr  "female" "female" "female" "female" ...
##  $ person_education              : chr  "Master" "High School" "High School" "Bachelor" ...
##  $ person_income                 : num  71948 12282 12438 79753 66135 ...
##  $ person_emp_exp                : int  0 0 3 0 1 0 1 5 3 0 ...
##  $ person_home_ownership         : chr  "RENT" "OWN" "MORTGAGE" "RENT" ...
##  $ loan_amnt                     : num  35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
##  $ loan_intent                   : chr  "PERSONAL" "EDUCATION" "MEDICAL" "MEDICAL" ...
##  $ loan_int_rate                 : num  16 11.1 12.9 15.2 14.3 ...
##  $ loan_percent_income           : num  0.49 0.08 0.44 0.44 0.53 0.19 0.37 0.37 0.35 0.13 ...
##  $ cb_person_cred_hist_length    : num  3 2 3 2 4 2 3 4 2 3 ...
##  $ credit_score                  : int  561 504 635 675 586 532 701 585 544 640 ...
##  $ previous_loan_defaults_on_file: chr  "No" "Yes" "No" "No" ...
##  $ loan_status                   : int  1 0 1 1 1 1 1 1 1 1 ...
head(df_raw, 6)
##   person_age person_gender person_education person_income person_emp_exp
## 1         22        female           Master         71948              0
## 2         21        female      High School         12282              0
## 3         25        female      High School         12438              3
## 4         23        female         Bachelor         79753              0
## 5         24          male           Master         66135              1
## 6         21        female      High School         12951              0
##   person_home_ownership loan_amnt loan_intent loan_int_rate loan_percent_income
## 1                  RENT     35000    PERSONAL         16.02                0.49
## 2                   OWN      1000   EDUCATION         11.14                0.08
## 3              MORTGAGE      5500     MEDICAL         12.87                0.44
## 4                  RENT     35000     MEDICAL         15.23                0.44
## 5                  RENT     35000     MEDICAL         14.27                0.53
## 6                   OWN      2500     VENTURE          7.14                0.19
##   cb_person_cred_hist_length credit_score previous_loan_defaults_on_file
## 1                          3          561                             No
## 2                          2          504                            Yes
## 3                          3          635                             No
## 4                          2          675                             No
## 5                          4          586                             No
## 6                          2          532                             No
##   loan_status
## 1           1
## 2           0
## 3           1
## 4           1
## 5           1
## 6           1

2 Data Cleaning

2.1 Missing-Value Audit

na_counts    <- colSums(is.na(df_raw))
blank_counts <- sapply(df_raw, function(x) sum(x == "" | x == " ", na.rm = TRUE))

miss_df <- data.frame(
  Column      = names(na_counts),
  NA_Count    = na_counts,
  Blank_Count = blank_counts,
  NA_Pct      = round(100 * na_counts / nrow(df_raw), 2)
)
miss_df <- miss_df[miss_df$NA_Count > 0 | miss_df$Blank_Count > 0, ]

if (nrow(miss_df) == 0) {
  cat("✔  No missing values detected.\n")
} else {
  print(miss_df)
}
## ✔  No missing values detected.

2.2 Outlier Removal, Type Conversion & Feature Engineering

df_clean <- df_raw %>%
  filter(person_age <= 100) %>%
  drop_na() %>%
  mutate(
    person_gender = factor(person_gender),

    person_education = factor(
      person_education,
      levels = c("High School", "Bachelor", "Master", "PhD")
    ),

    person_home_ownership          = factor(person_home_ownership),
    loan_intent                    = factor(loan_intent),
    previous_loan_defaults_on_file = factor(previous_loan_defaults_on_file),

    loan_status_f = factor(
      loan_status,
      levels = c(0, 1),
      labels = c("Repaid", "Defaulted")
    ),

    # ── Derived / engineered features ──────────────────────────────────────
    log_income    = log(person_income),
    loan_to_income = loan_amnt / person_income,

    age_group = cut(
      person_age,
      breaks = c(17, 25, 35, 50, Inf),
      labels = c("18-25", "26-35", "36-50", "50+")
    ),

    # High-risk flag: high loan burden AND poor credit score
    high_risk = factor(
      ifelse(loan_percent_income > 0.4 & credit_score < 600, "Yes", "No")
    )
  )

cat("Cleaned dataset:", nrow(df_clean), "rows ×", ncol(df_clean), "columns\n")
## Cleaned dataset: 44993 rows × 19 columns

2.3 Check for NAs After Cleaning

cat("Checking for NAs in key columns after cleaning:\n")
## Checking for NAs in key columns after cleaning:
na_check_after <- df_clean %>%
  select(person_income, log_income, person_age, person_emp_exp, 
         loan_amnt, loan_int_rate, credit_score, loan_percent_income) %>%
  summarise(across(everything(), ~sum(is.na(.))))

print(na_check_after)
##   person_income log_income person_age person_emp_exp loan_amnt loan_int_rate
## 1             0          0          0              0         0             0
##   credit_score loan_percent_income
## 1            0                   0
if(any(is.na(df_clean$log_income))) {
  cat("\nWarning: NAs found in log_income. Removing rows with log_income NA.\n")
  df_clean <- df_clean %>% filter(!is.na(log_income))
}

3 Exploratory Data Analysis (EDA)

3.1 Summary Statistics (Numeric Columns)

df_clean %>%
  select(where(is.numeric)) %>%
  summary()
##    person_age    person_income     person_emp_exp     loan_amnt    
##  Min.   :20.00   Min.   :   8000   Min.   : 0.000   Min.   :  500  
##  1st Qu.:24.00   1st Qu.:  47195   1st Qu.: 1.000   1st Qu.: 5000  
##  Median :26.00   Median :  67046   Median : 4.000   Median : 8000  
##  Mean   :27.75   Mean   :  79908   Mean   : 5.395   Mean   : 9583  
##  3rd Qu.:30.00   3rd Qu.:  95778   3rd Qu.: 8.000   3rd Qu.:12237  
##  Max.   :94.00   Max.   :2448661   Max.   :76.000   Max.   :35000  
##  loan_int_rate   loan_percent_income cb_person_cred_hist_length  credit_score  
##  Min.   : 5.42   Min.   :0.0000      Min.   : 2.000             Min.   :390.0  
##  1st Qu.: 8.59   1st Qu.:0.0700      1st Qu.: 3.000             1st Qu.:601.0  
##  Median :11.01   Median :0.1200      Median : 4.000             Median :640.0  
##  Mean   :11.01   Mean   :0.1397      Mean   : 5.867             Mean   :632.6  
##  3rd Qu.:12.99   3rd Qu.:0.1900      3rd Qu.: 8.000             3rd Qu.:670.0  
##  Max.   :20.00   Max.   :0.6600      Max.   :30.000             Max.   :784.0  
##   loan_status       log_income     loan_to_income     
##  Min.   :0.0000   Min.   : 8.987   Min.   :0.0006576  
##  1st Qu.:0.0000   1st Qu.:10.762   1st Qu.:0.0735662  
##  Median :0.0000   Median :11.113   Median :0.1216656  
##  Mean   :0.2223   Mean   :11.122   Mean   :0.1397593  
##  3rd Qu.:0.0000   3rd Qu.:11.470   3rd Qu.:0.1881413  
##  Max.   :1.0000   Max.   :14.711   Max.   :0.6641860

3.2 Distribution of Loan Status

status_tbl <- df_clean %>%
  count(loan_status_f) %>%
  mutate(pct = round(100 * n / sum(n), 1))

print(status_tbl)
##   loan_status_f     n  pct
## 1        Repaid 34993 77.8
## 2     Defaulted 10000 22.2
ggplot(status_tbl, aes(x = loan_status_f, y = n, fill = loan_status_f)) +
  geom_col(width = 0.5, alpha = 0.85) +
  geom_text(aes(label = paste0(pct, "%")), vjust = -0.4, fontface = "bold") +
  scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
  labs(
    title    = "Loan Status Distribution",
    subtitle = "Target variable class balance",
    x = NULL, y = "Count", fill = NULL
  ) +
  theme_loan()

3.3 Income Distribution: Raw vs Log-Transformed

p1 <- ggplot(df_clean, aes(x = person_income)) +
  geom_histogram(bins = 60, fill = "#2980b9", colour = "white", alpha = 0.85) +
  scale_x_continuous(labels = comma) +
  labs(title = "Raw Income — Right-Skewed", x = "Annual Income (USD)", y = "Count") +
  theme_loan()

p2 <- ggplot(df_clean, aes(x = log_income)) +
  geom_histogram(bins = 60, fill = "#27ae60", colour = "white", alpha = 0.85) +
  labs(title = "Log(Income) — More Symmetric", x = "ln(Income)", y = "Count") +
  theme_loan()

p1 + p2

3.4 Continuous Variables — Boxplots by Loan Status

num_vars <- c(
  "person_income", "person_age", "loan_amnt",
  "loan_int_rate", "credit_score", "loan_percent_income"
)

df_long <- df_clean %>%
  select(all_of(num_vars), loan_status_f) %>%
  pivot_longer(-loan_status_f, names_to = "variable", values_to = "value")

ggplot(df_long, aes(x = loan_status_f, y = value, fill = loan_status_f)) +
  geom_boxplot(outlier.alpha = 0.2, alpha = 0.75) +
  scale_fill_manual(values = c("#2980b9", "#e74c3c")) +
  facet_wrap(~variable, scales = "free_y", ncol = 3) +
  labs(
    title    = "Numeric Variables by Loan Status",
    subtitle = "Detecting separation between Repaid vs Defaulted groups",
    x = NULL, y = NULL, fill = NULL
  ) +
  theme_loan() +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

3.5 Categorical Predictors vs Loan Default

cat_vars <- c(
  "person_education", "person_home_ownership",
  "loan_intent", "previous_loan_defaults_on_file"
)

plots <- lapply(cat_vars, function(v) {
  ggplot(df_clean, aes(x = .data[[v]], fill = loan_status_f)) +
    geom_bar(position = "fill", alpha = 0.85) +
    scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
    scale_y_continuous(labels = percent_format()) +
    labs(title = v, x = NULL, y = "Proportion", fill = NULL) +
    theme_loan() +
    theme(
      axis.text.x    = element_text(angle = 25, hjust = 1),
      legend.position = "none"
    )
})

wrap_plots(plots, ncol = 2)

3.6 Correlation Matrix

cor_mat <- df_clean %>%
  select(
    person_age, person_income, person_emp_exp,
    loan_amnt, loan_int_rate, loan_percent_income,
    cb_person_cred_hist_length, credit_score, loan_status
  ) %>%
  cor(use = "complete.obs")

ggcorrplot(
  cor_mat,
  method   = "circle",
  type     = "lower",
  lab      = TRUE,
  lab_size = 3,
  colors   = c("#e74c3c", "white", "#2980b9"),
  title    = "Pearson Correlation Matrix — Numeric Predictors",
  ggtheme  = theme_loan()
)

4 Feature Engineering

fe_summary <- df_clean %>%
  group_by(age_group, high_risk) %>%
  summarise(
    n           = n(),
    default_pct = round(100 * mean(loan_status), 1),
    avg_income  = round(mean(person_income)),
    avg_credit  = round(mean(credit_score)),
    .groups     = "drop"
  )

print(fe_summary)
## # A tibble: 7 × 6
##   age_group high_risk     n default_pct avg_income avg_credit
##   <fct>     <fct>     <int>       <dbl>      <dbl>      <dbl>
## 1 18-25     No        20393        23.2      71250        626
## 2 18-25     Yes          48        85.4      40612        551
## 3 26-35     No        20029        21.2      85104        636
## 4 26-35     Yes          44        68.2      43052        557
## 5 36-50     No         4151        20.5      95655        649
## 6 36-50     Yes           7        85.7      41245        554
## 7 50+       No          321        25.9     113933        678
df_clean %>%
  count(age_group, loan_status_f) %>%
  ggplot(aes(x = age_group, y = n, fill = loan_status_f)) +
  geom_col(position = "fill", alpha = 0.85) +
  scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title    = "Default Rate by Age Group (Engineered Feature)",
    subtitle = "Younger applicants show higher default proportions",
    x = "Age Group", y = "Proportion", fill = "Loan Status"
  ) +
  theme_loan()

5 Train / Test Split (80/20)

set.seed(123)
train_idx <- createDataPartition(df_clean$loan_status_f, p = 0.80, list = FALSE)
train_df  <- df_clean[ train_idx, ]
test_df   <- df_clean[-train_idx, ]

cat("Training rows:", nrow(train_df), "\n")
## Training rows: 35995
cat("Test rows    :", nrow(test_df),  "\n")
## Test rows    : 8998
# Final NA check before modeling
cat("\nFinal NA check on training data:\n")
## 
## Final NA check on training data:
na_final <- train_df %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "na_count") %>%
  filter(na_count > 0)

if(nrow(na_final) > 0) {
  print(na_final)
  cat("\nRemoving rows with NAs from train and test...\n")
  train_df <- na.omit(train_df)
  test_df <- test_df %>% filter(complete.cases(select(., all_of(names(train_df)))))
  cat("New training rows:", nrow(train_df), "\n")
  cat("New test rows:", nrow(test_df), "\n")
} else {
  cat("✓ No NAs found in training data\n")
}
## # A tibble: 1 × 2
##   column           na_count
##   <chr>               <int>
## 1 person_education    10110
## 
## Removing rows with NAs from train and test...
## New training rows: 25885 
## New test rows: 6461

6 Regression — Predicting Income

6.1 Baseline Linear Regression (Raw Income)

lm_raw <- lm(
  person_income ~ person_age + person_education + person_emp_exp +
    person_home_ownership + loan_amnt + loan_int_rate +
    credit_score + loan_percent_income,
  data = train_df
)
summary(lm_raw)
## 
## Call:
## lm(formula = person_income ~ person_age + person_education + 
##     person_emp_exp + person_home_ownership + loan_amnt + loan_int_rate + 
##     credit_score + loan_percent_income, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -89722  -15830   -7071    5419 2295870 
## 
## Coefficients:
##                              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                 7.053e+04  5.549e+03   12.709  < 2e-16 ***
## person_age                  7.978e+02  1.680e+02    4.750 2.04e-06 ***
## person_educationBachelor    1.653e+01  6.930e+02    0.024  0.98097    
## person_educationMaster     -6.995e+02  8.338e+02   -0.839  0.40151    
## person_emp_exp              1.696e+02  1.674e+02    1.013  0.31100    
## person_home_ownershipOTHER  3.702e+03  5.976e+03    0.619  0.53560    
## person_home_ownershipOWN   -3.356e+03  1.295e+03   -2.592  0.00955 ** 
## person_home_ownershipRENT  -7.364e+03  6.699e+02  -10.993  < 2e-16 ***
## loan_amnt                   7.199e+00  6.325e-02  113.816  < 2e-16 ***
## loan_int_rate              -2.097e+02  1.030e+02   -2.037  0.04167 *  
## credit_score               -7.644e+00  6.226e+00   -1.228  0.21956    
## loan_percent_income        -5.094e+05  4.537e+03 -112.266  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48350 on 25873 degrees of freedom
## Multiple R-squared:  0.4348, Adjusted R-squared:  0.4346 
## F-statistic:  1810 on 11 and 25873 DF,  p-value: < 2.2e-16

6.2 Assumption Checking — Baseline Model

par(mfrow = c(1, 2))
plot(lm_raw, which = 1, main = "Residuals vs Fitted (Raw LM)")
plot(lm_raw, which = 3, main = "Scale-Location (Raw LM)")

par(mfrow = c(1, 1))

par(mfrow = c(1, 2))
hist(
  residuals(lm_raw), breaks = 50, col = "#3498db",
  main = "Residual Histogram (Raw LM)", xlab = "Residuals"
)
plot(lm_raw, which = 2, main = "Q-Q Plot (Raw LM)")

par(mfrow = c(1, 1))

ad_raw <- ad.test(residuals(lm_raw))
cat("\nAnderson-Darling Test (Raw LM):\n")
## 
## Anderson-Darling Test (Raw LM):
cat("  A =", round(ad_raw$statistic, 4),
    " p-value =", formatC(ad_raw$p.value, format = "e", digits = 3), "\n")
##   A = 3085.106  p-value = 3.700e-24
vif_vals <- vif(lm_raw)
vif_df <- if (is.matrix(vif_vals)) {
  data.frame(
    Predictor = rownames(vif_vals),
    GVIF      = round(vif_vals[, "GVIF"], 3),
    Df        = vif_vals[, "Df"],
    GVIF_adj  = round(vif_vals[, "GVIF^(1/(2*Df))"], 3)
  )
} else {
  data.frame(Predictor = names(vif_vals), VIF = round(vif_vals, 3))
}
cat("\nVIF — Baseline Model:\n")
## 
## VIF — Baseline Model:
print(vif_df)
##                                   Predictor   GVIF Df GVIF_adj
## person_age                       person_age 10.269  1    3.205
## person_education           person_education  1.064  2    1.016
## person_emp_exp               person_emp_exp 10.291  1    3.208
## person_home_ownership person_home_ownership  1.151  3    1.024
## loan_amnt                         loan_amnt  1.759  1    1.326
## loan_int_rate                 loan_int_rate  1.050  1    1.025
## credit_score                   credit_score  1.092  1    1.045
## loan_percent_income     loan_percent_income  1.732  1    1.316
bp_raw <- bptest(lm_raw)
cat("\nBreusch-Pagan Test (Raw LM):\n")
## 
## Breusch-Pagan Test (Raw LM):
cat("  BP =", round(bp_raw$statistic, 3),
    " df =",  bp_raw$parameter,
    " p-value =", formatC(bp_raw$p.value, format = "e", digits = 3), "\n")
##   BP = 232.479  df = 11  p-value = 1.289e-43

6.3 Log-Transformed Linear Regression (Required Model)

lm_log <- lm(
  log_income ~ person_age + person_education + person_emp_exp +
    person_home_ownership + loan_amnt + loan_int_rate +
    credit_score + loan_percent_income,
  data = train_df
)
summary(lm_log)
## 
## Call:
## lm(formula = log_income ~ person_age + person_education + person_emp_exp + 
##     person_home_ownership + loan_amnt + loan_int_rate + credit_score + 
##     loan_percent_income, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7147 -0.1192  0.0056  0.1186  3.2153 
## 
## Coefficients:
##                              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                 1.112e+01  3.069e-02  362.232  < 2e-16 ***
## person_age                  4.935e-03  9.288e-04    5.313 1.09e-07 ***
## person_educationBachelor    8.349e-03  3.832e-03    2.179   0.0294 *  
## person_educationMaster      5.127e-03  4.611e-03    1.112   0.2662    
## person_emp_exp             -1.346e-03  9.255e-04   -1.454   0.1459    
## person_home_ownershipOTHER -9.952e-03  3.305e-02   -0.301   0.7633    
## person_home_ownershipOWN   -1.051e-01  7.160e-03  -14.676  < 2e-16 ***
## person_home_ownershipRENT  -1.024e-01  3.705e-03  -27.654  < 2e-16 ***
## loan_amnt                   8.389e-05  3.498e-07  239.813  < 2e-16 ***
## loan_int_rate              -6.118e-03  5.694e-04  -10.744  < 2e-16 ***
## credit_score                4.727e-06  3.443e-05    0.137   0.8908    
## loan_percent_income        -5.775e+00  2.509e-02 -230.157  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2674 on 25873 degrees of freedom
## Multiple R-squared:  0.7667, Adjusted R-squared:  0.7666 
## F-statistic:  7732 on 11 and 25873 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(lm_log, which = 1:4)

par(mfrow = c(1, 1))

bp_log <- bptest(lm_log)
ad_log <- ad.test(residuals(lm_log))

cat("\nPost-transformation diagnostics:\n")
## 
## Post-transformation diagnostics:
cat("  Breusch-Pagan p    =", formatC(bp_log$p.value, format = "e", digits = 3), "\n")
##   Breusch-Pagan p    = 8.173e-86
cat("  Anderson-Darling p =", formatC(ad_log$p.value, format = "e", digits = 3), "\n")
##   Anderson-Darling p = 3.700e-24
cat("  Mean of residuals  =", round(mean(residuals(lm_log)), 8), "\n")
##   Mean of residuals  = 0
vif_log <- vif(lm_log)
vif_log_df <- if (is.matrix(vif_log)) {
  data.frame(
    Predictor = rownames(vif_log),
    GVIF      = round(vif_log[, "GVIF"], 3),
    Df        = vif_log[, "Df"],
    GVIF_adj  = round(vif_log[, "GVIF^(1/(2*Df))"], 3)
  )
} else {
  data.frame(Predictor = names(vif_log), VIF = round(vif_log, 3))
}
cat("\nVIF — Log-Linear Model:\n")
## 
## VIF — Log-Linear Model:
print(vif_log_df)
##                                   Predictor   GVIF Df GVIF_adj
## person_age                       person_age 10.269  1    3.205
## person_education           person_education  1.064  2    1.016
## person_emp_exp               person_emp_exp 10.291  1    3.208
## person_home_ownership person_home_ownership  1.151  3    1.024
## loan_amnt                         loan_amnt  1.759  1    1.326
## loan_int_rate                 loan_int_rate  1.050  1    1.025
## credit_score                   credit_score  1.092  1    1.045
## loan_percent_income     loan_percent_income  1.732  1    1.316

6.4 Stepwise AIC Selection

lm_step <- stepAIC(lm_log, direction = "both", trace = FALSE)
summary(lm_step)
## 
## Call:
## lm(formula = log_income ~ person_age + person_education + person_emp_exp + 
##     person_home_ownership + loan_amnt + loan_int_rate + loan_percent_income, 
##     data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7151 -0.1193  0.0056  0.1186  3.2154 
## 
## Coefficients:
##                              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                 1.112e+01  2.230e-02  498.734  < 2e-16 ***
## person_age                  4.934e-03  9.288e-04    5.312 1.09e-07 ***
## person_educationBachelor    8.446e-03  3.767e-03    2.242    0.025 *  
## person_educationMaster      5.266e-03  4.499e-03    1.170    0.242    
## person_emp_exp             -1.338e-03  9.236e-04   -1.448    0.148    
## person_home_ownershipOTHER -9.974e-03  3.304e-02   -0.302    0.763    
## person_home_ownershipOWN   -1.051e-01  7.159e-03  -14.676  < 2e-16 ***
## person_home_ownershipRENT  -1.025e-01  3.705e-03  -27.655  < 2e-16 ***
## loan_amnt                   8.389e-05  3.498e-07  239.820  < 2e-16 ***
## loan_int_rate              -6.117e-03  5.694e-04  -10.744  < 2e-16 ***
## loan_percent_income        -5.775e+00  2.509e-02 -230.167  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2674 on 25874 degrees of freedom
## Multiple R-squared:  0.7667, Adjusted R-squared:  0.7667 
## F-statistic:  8505 on 10 and 25874 DF,  p-value: < 2.2e-16

6.5 Random Forest Regression Benchmark

set.seed(42)
rf_income <- randomForest(
  log_income ~ person_age + person_education + person_emp_exp +
    person_home_ownership + loan_amnt + loan_int_rate +
    credit_score + loan_percent_income,
  data       = train_df,
  ntree      = 300,
  importance = TRUE,
  na.action  = na.omit
)

cat("\nOOB R² (Random Forest on log-income):",
    round(1 - rf_income$mse[300] / var(train_df$log_income, na.rm = TRUE), 4), "\n")
## 
## OOB R² (Random Forest on log-income): 0.9435

6.6 Regression Model Comparison

eval_reg <- function(model, test, log_response = TRUE) {
  preds_log <- predict(model, test)
  preds     <- if (log_response) exp(preds_log) else preds_log
  actual    <- test$person_income
  rmse      <- sqrt(mean((actual - preds)^2, na.rm = TRUE))
  r2        <- cor(actual, preds, use = "complete.obs")^2
  c(RMSE = rmse, R2 = r2)
}

raw_metrics  <- eval_reg(lm_raw,    test_df, log_response = FALSE)
log_metrics  <- eval_reg(lm_log,    test_df)
step_metrics <- eval_reg(lm_step,   test_df)
rf_metrics   <- eval_reg(rf_income, test_df)

results_reg <- data.frame(
  Model = c(
    "Linear Reg (Raw Income)",
    "Log-Linear Reg (Required)",
    "Log-Linear Stepwise AIC",
    "Random Forest (Benchmark)"
  ),
  RMSE = round(c(
    raw_metrics["RMSE"],  log_metrics["RMSE"],
    step_metrics["RMSE"], rf_metrics["RMSE"]
  )),
  R_Squared = round(c(
    raw_metrics["R2"], log_metrics["R2"],
    step_metrics["R2"], rf_metrics["R2"]
  ), 4)
)

cat("\n========== Regression Model Comparison — Test Set ==========\n")
## 
## ========== Regression Model Comparison — Test Set ==========
print(results_reg)
##                       Model  RMSE R_Squared
## 1   Linear Reg (Raw Income) 40408    0.5147
## 2 Log-Linear Reg (Required) 40321    0.5290
## 3   Log-Linear Stepwise AIC 40322    0.5290
## 4 Random Forest (Benchmark) 27216    0.8427
cat("\nFINAL MODEL: lowest RMSE / highest R² selected.\n")
## 
## FINAL MODEL: lowest RMSE / highest R² selected.
# Actual vs Predicted plot
best_preds <- exp(predict(lm_log, test_df))

ggplot(
  data.frame(Actual = test_df$person_income, Predicted = best_preds),
  aes(x = Predicted, y = Actual)
) +
  geom_point(alpha = 0.25, colour = "#2980b9", size = 1.5) +
  geom_abline(intercept = 0, slope = 1,
              colour = "#e74c3c", linewidth = 1, linetype = "dashed") +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(
    title    = "Actual vs Predicted Income — Log-Linear Model",
    subtitle = "Dashed line = perfect prediction",
    x = "Predicted Income (USD)", y = "Actual Income (USD)"
  ) +
  theme_loan()

7 Classification — Loan Default Prediction

cls_formula <- loan_status_f ~
  person_income + person_age + person_education +
  person_home_ownership + loan_amnt + loan_int_rate +
  loan_percent_income + credit_score +
  previous_loan_defaults_on_file + loan_intent +
  age_group + high_risk + loan_to_income

7.1 Model 1: Logistic Regression

logit_model <- glm(
  loan_status ~
    log_income + person_age + person_education +
    person_home_ownership + loan_amnt + loan_int_rate +
    loan_percent_income + credit_score +
    previous_loan_defaults_on_file + loan_intent,
  data   = train_df,
  family = binomial(link = "logit")
)
summary(logit_model)
## 
## Call:
## glm(formula = loan_status ~ log_income + person_age + person_education + 
##     person_home_ownership + loan_amnt + loan_int_rate + loan_percent_income + 
##     credit_score + previous_loan_defaults_on_file + loan_intent, 
##     family = binomial(link = "logit"), data = train_df)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        1.205e+01  1.115e+00  10.806  < 2e-16 ***
## log_income                        -1.035e+00  9.529e-02 -10.857  < 2e-16 ***
## person_age                         2.800e-03  4.081e-03   0.686 0.492625    
## person_educationBachelor           2.165e-02  5.441e-02   0.398 0.690746    
## person_educationMaster             5.731e-02  6.505e-02   0.881 0.378292    
## person_home_ownershipOTHER         1.429e-01  4.324e-01   0.330 0.741060    
## person_home_ownershipOWN          -1.579e+00  1.370e-01 -11.528  < 2e-16 ***
## person_home_ownershipRENT          6.644e-01  5.360e-02  12.395  < 2e-16 ***
## loan_amnt                         -1.053e-05  9.365e-06  -1.125 0.260602    
## loan_int_rate                      3.294e-01  8.708e-03  37.829  < 2e-16 ***
## loan_percent_income                9.909e+00  6.296e-01  15.739  < 2e-16 ***
## credit_score                      -9.510e-03  5.469e-04 -17.389  < 2e-16 ***
## previous_loan_defaults_on_fileYes -2.033e+01  1.348e+02  -0.151 0.880105    
## loan_intentEDUCATION              -9.196e-01  7.752e-02 -11.864  < 2e-16 ***
## loan_intentHOMEIMPROVEMENT         9.452e-04  8.904e-02   0.011 0.991531    
## loan_intentMEDICAL                -2.607e-01  7.497e-02  -3.477 0.000506 ***
## loan_intentPERSONAL               -7.472e-01  7.995e-02  -9.346  < 2e-16 ***
## loan_intentVENTURE                -1.196e+00  8.369e-02 -14.288  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27452  on 25884  degrees of freedom
## Residual deviance: 11297  on 25867  degrees of freedom
## AIC: 11333
## 
## Number of Fisher Scoring iterations: 19
logit_probs <- predict(logit_model, test_df, type = "response")
logit_preds <- factor(
  ifelse(logit_probs > 0.5, "Defaulted", "Repaid"),
  levels = c("Repaid", "Defaulted")
)
cm_logit <- confusionMatrix(logit_preds, test_df$loan_status_f,
                             positive = "Defaulted")
cat("\n--- Logistic Regression Confusion Matrix ---\n")
## 
## --- Logistic Regression Confusion Matrix ---
print(cm_logit)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Repaid Defaulted
##   Repaid      4705       354
##   Defaulted    312      1090
##                                           
##                Accuracy : 0.8969          
##                  95% CI : (0.8892, 0.9042)
##     No Information Rate : 0.7765          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6999          
##                                           
##  Mcnemar's Test P-Value : 0.1121          
##                                           
##             Sensitivity : 0.7548          
##             Specificity : 0.9378          
##          Pos Pred Value : 0.7775          
##          Neg Pred Value : 0.9300          
##              Prevalence : 0.2235          
##          Detection Rate : 0.1687          
##    Detection Prevalence : 0.2170          
##       Balanced Accuracy : 0.8463          
##                                           
##        'Positive' Class : Defaulted       
## 

7.2 Model 2: Decision Tree

set.seed(123)
tree_model <- rpart(
  cls_formula,
  data    = train_df,
  method  = "class",
  control = rpart.control(cp = 0.001, minsplit = 30)
)

best_cp     <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
tree_pruned <- prune(tree_model, cp = best_cp)

rpart.plot(
  tree_pruned, type = 4, extra = 104,
  main = "Pruned Decision Tree — Loan Default", cex = 0.7
)

tree_preds <- predict(tree_pruned, test_df, type = "class")
cm_tree    <- confusionMatrix(tree_preds, test_df$loan_status_f,
                               positive = "Defaulted")
cat("\n--- Decision Tree Confusion Matrix ---\n")
## 
## --- Decision Tree Confusion Matrix ---

7.3 Model 3: Random Forest

set.seed(42)
rf_class <- randomForest(
  cls_formula,
  data       = train_df,
  ntree      = 300,
  mtry       = 4,
  importance = TRUE,
  na.action  = na.omit
)

rf_preds <- predict(rf_class, test_df)
cm_rf    <- confusionMatrix(rf_preds, test_df$loan_status_f,
                              positive = "Defaulted")
cat("\n--- Random Forest Confusion Matrix ---\n")
## 
## --- Random Forest Confusion Matrix ---
print(cm_rf)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Repaid Defaulted
##   Repaid      4890       332
##   Defaulted    127      1112
##                                           
##                Accuracy : 0.929           
##                  95% CI : (0.9224, 0.9351)
##     No Information Rate : 0.7765          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7844          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7701          
##             Specificity : 0.9747          
##          Pos Pred Value : 0.8975          
##          Neg Pred Value : 0.9364          
##              Prevalence : 0.2235          
##          Detection Rate : 0.1721          
##    Detection Prevalence : 0.1918          
##       Balanced Accuracy : 0.8724          
##                                           
##        'Positive' Class : Defaulted       
## 
imp_df <- as.data.frame(importance(rf_class)) %>%
  rownames_to_column("Feature") %>%
  arrange(desc(MeanDecreaseGini))

ggplot(imp_df,
       aes(x = reorder(Feature, MeanDecreaseGini),
           y = MeanDecreaseGini, fill = MeanDecreaseGini)) +
  geom_col(alpha = 0.88) +
  coord_flip() +
  scale_fill_viridis_c(option = "C", guide = "none") +
  labs(
    title    = "Random Forest — Variable Importance",
    subtitle = "Mean Decrease Gini: higher = more predictive power",
    x = NULL, y = "Mean Decrease Gini"
  ) +
  theme_loan()

7.4 Model 4: Naïve Bayes

nb_model <- naiveBayes(
  loan_status_f ~ person_education + person_home_ownership +
    loan_intent + previous_loan_defaults_on_file +
    age_group + high_risk,
  data = train_df
)
nb_preds <- predict(nb_model, test_df)
cm_nb    <- confusionMatrix(nb_preds, test_df$loan_status_f,
                              positive = "Defaulted")
cat("\n--- Naïve Bayes Confusion Matrix ---\n")
## 
## --- Naïve Bayes Confusion Matrix ---
print(cm_nb)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Repaid Defaulted
##   Repaid      4394       562
##   Defaulted    623       882
##                                          
##                Accuracy : 0.8166         
##                  95% CI : (0.8069, 0.826)
##     No Information Rate : 0.7765         
##     P-Value [Acc > NIR] : 1.427e-15      
##                                          
##                   Kappa : 0.4794         
##                                          
##  Mcnemar's Test P-Value : 0.08134        
##                                          
##             Sensitivity : 0.6108         
##             Specificity : 0.8758         
##          Pos Pred Value : 0.5860         
##          Neg Pred Value : 0.8866         
##              Prevalence : 0.2235         
##          Detection Rate : 0.1365         
##    Detection Prevalence : 0.2329         
##       Balanced Accuracy : 0.7433         
##                                          
##        'Positive' Class : Defaulted      
## 

7.5 ROC Curves & AUC

roc_logit <- roc(test_df$loan_status, logit_probs, quiet = TRUE)
roc_tree  <- roc(
  test_df$loan_status,
  predict(tree_pruned, test_df, type = "prob")[, "Defaulted"],
  quiet = TRUE
)
roc_rf <- roc(
  test_df$loan_status,
  predict(rf_class, test_df, type = "prob")[, "Defaulted"],
  quiet = TRUE
)

plot(roc_logit, col = palette_main[1], lwd = 2,
     main = "ROC Curves — Loan Default Classifiers")
lines(roc_tree, col = palette_main[2], lwd = 2)
lines(roc_rf,   col = palette_main[3], lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "grey60")
legend(
  "bottomright",
  legend = c(
    paste0("Logistic Reg  AUC = ", round(auc(roc_logit), 3)),
    paste0("Decision Tree AUC = ", round(auc(roc_tree),  3)),
    paste0("Random Forest AUC = ", round(auc(roc_rf),    3))
  ),
  col = palette_main[1:3], lwd = 2
)

7.6 Classification Model Comparison

extract_metrics <- function(cm, model_name, auc_val = NA) {
  data.frame(
    Model       = model_name,
    Accuracy    = round(cm$overall["Accuracy"]    * 100, 2),
    Sensitivity = round(cm$byClass["Sensitivity"] * 100, 2),
    Specificity = round(cm$byClass["Specificity"] * 100, 2),
    AUC         = round(auc_val, 4)
  )
}

class_results <- rbind(
  extract_metrics(cm_logit, "Logistic Regression",    auc(roc_logit)),
  extract_metrics(cm_tree,  "Decision Tree (Pruned)", auc(roc_tree)),
  extract_metrics(cm_rf,    "Random Forest",          auc(roc_rf)),
  extract_metrics(cm_nb,    "Naïve Bayes",            NA)
)
rownames(class_results) <- NULL

cat("\n========== Classification Model Comparison — Test Set ==========\n")
## 
## ========== Classification Model Comparison — Test Set ==========
print(class_results)
##                    Model Accuracy Sensitivity Specificity    AUC
## 1    Logistic Regression    89.69       75.48       93.78 0.9530
## 2 Decision Tree (Pruned)    92.09       74.52       97.15 0.9612
## 3          Random Forest    92.90       77.01       97.47 0.9762
## 4            Naïve Bayes    81.66       61.08       87.58     NA
best_cls <- class_results$Model[which.max(class_results$AUC)]
cat("\nFINAL CLASSIFIER:", best_cls, "(highest AUC)\n")
## 
## FINAL CLASSIFIER: Random Forest (highest AUC)

8 Credit Risk Analysis

credit_risk_tbl <- df_clean %>%
  mutate(
    credit_band = cut(
      credit_score,
      breaks = c(300, 500, 600, 700, 800, 850),
      labels = c("300-500", "501-600", "601-700", "701-800", "801-850")
    )
  ) %>%
  group_by(credit_band) %>%
  summarise(
    n           = n(),
    default_pct = round(100 * mean(loan_status), 1),
    avg_income  = round(mean(person_income)),
    .groups     = "drop"
  )

cat("\n========== Default Rate & Income by Credit Score Band ==========\n")
## 
## ========== Default Rate & Income by Credit Score Band ==========
print(credit_risk_tbl)
## # A tibble: 4 × 4
##   credit_band     n default_pct avg_income
##   <fct>       <int>       <dbl>      <dbl>
## 1 300-500       529        21.2      74458
## 2 501-600     10520        22.6      78787
## 3 601-700     31252        22.2      79867
## 4 701-800      2692        21.2      85838
df_clean %>%
  mutate(
    credit_band = cut(
      credit_score,
      breaks = c(300, 500, 600, 700, 800, 850),
      labels = c("300-500", "501-600", "601-700", "701-800", "801-850")
    )
  ) %>%
  ggplot(aes(x = credit_score, y = log_income, colour = loan_status_f)) +
  geom_point(alpha = 0.15, size = 1.2) +
  geom_smooth(method = "lm", se = TRUE) +
  scale_colour_manual(values = c("#27ae60", "#e74c3c")) +
  labs(
    title    = "Credit Score vs Log(Income) by Loan Outcome",
    subtitle = "Higher credit score + higher income → lower default risk",
    x = "Credit Score", y = "Log(Income)", colour = "Status"
  ) +
  theme_loan()

PART 2
Salary Analysis
📊 Dataset: 88,584 salary records  |  🎯 Targets: salary_in_usd (regression) & salary tier (classification)

Libraries

# Additional packages needed for Part 2 not already loaded in Part 1
library(ggridges)      # geom_density_ridges()
library(viridis)       # scale_fill_viridis_c()
library(RColorBrewer)  # brewer palettes
# ── Custom palette & theme ────────────────────────────────────────────────────

palette_sal <- c(
  "#2980b9", "#27ae60", "#e74c3c", "#f39c12",
  "#8e44ad", "#16a085", "#2c3e50"
)

theme_report <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title    = element_text(face = "bold", size = 13, colour = "#2c3e50",
                                   margin = ggplot2::margin(b = 6)),
      plot.subtitle = element_text(colour = "#7f8c8d", size = 10,
                                   margin = ggplot2::margin(b = 10)),
      axis.title    = element_text(colour = "#555555", size = 10),
      panel.grid.minor  = element_blank(),
      panel.grid.major  = element_line(colour = "#eaecef"),
      legend.position   = "bottom",
      legend.title      = element_text(face = "bold")
    )
}

9 Data Loading & Initial Inspection

df_raw_sal <- read.csv("salaries.csv", stringsAsFactors = FALSE)

cat("Raw dimensions:", nrow(df_raw_sal), "rows ×", ncol(df_raw_sal), "columns\n\n")
## Raw dimensions: 88584 rows × 11 columns
str(df_raw_sal)
## 'data.frame':    88584 obs. of  11 variables:
##  $ work_year         : int  2025 2025 2025 2025 2025 2025 2025 2025 2025 2025 ...
##  $ experience_level  : chr  "MI" "SE" "SE" "SE" ...
##  $ employment_type   : chr  "FT" "FT" "FT" "FT" ...
##  $ job_title         : chr  "Customer Success Manager" "Engineer" "Engineer" "Applied Scientist" ...
##  $ salary            : int  57000 165000 109000 294000 137600 82000 44000 149800 89700 200000 ...
##  $ salary_currency   : chr  "EUR" "USD" "USD" "USD" ...
##  $ salary_in_usd     : int  60000 165000 109000 294000 137600 82000 44000 149800 89700 200000 ...
##  $ employee_residence: chr  "NL" "US" "US" "US" ...
##  $ remote_ratio      : int  50 0 0 0 0 0 0 0 0 0 ...
##  $ company_location  : chr  "NL" "US" "US" "US" ...
##  $ company_size      : chr  "L" "M" "M" "M" ...
head(df_raw_sal, 6)
##   work_year experience_level employment_type                job_title salary
## 1      2025               MI              FT Customer Success Manager  57000
## 2      2025               SE              FT                 Engineer 165000
## 3      2025               SE              FT                 Engineer 109000
## 4      2025               SE              FT        Applied Scientist 294000
## 5      2025               SE              FT        Applied Scientist 137600
## 6      2025               EN              FT             Data Analyst  82000
##   salary_currency salary_in_usd employee_residence remote_ratio
## 1             EUR         60000                 NL           50
## 2             USD        165000                 US            0
## 3             USD        109000                 US            0
## 4             USD        294000                 US            0
## 5             USD        137600                 US            0
## 6             USD         82000                 US            0
##   company_location company_size
## 1               NL            L
## 2               US            M
## 3               US            M
## 4               US            M
## 5               US            M
## 6               US            M

10 Data Cleaning, Type Conversion & Feature Engineering

10.1 Missing-Value Audit

na_report <- colSums(is.na(df_raw_sal))
if (any(na_report > 0)) {
  cat("Columns with NAs:\n")
  print(na_report[na_report > 0])
} else {
  cat("✔  No missing values detected.\n")
}
## ✔  No missing values detected.

10.2 Clean & Engineer Features

df <- df_raw_sal %>%
  drop_na(salary_in_usd) %>%
  mutate(
    # ── Ordered factors ──────────────────────────────────────────────────────
    experience_level = factor(
      experience_level,
      levels  = c("EN", "MI", "SE", "EX"),
      labels  = c("Entry", "Mid", "Senior", "Executive"),
      ordered = TRUE
    ),
    company_size = factor(
      company_size,
      levels  = c("S", "M", "L"),
      labels  = c("Small", "Medium", "Large"),
      ordered = TRUE
    ),

    # ── Unordered factors ────────────────────────────────────────────────────
    employment_type = factor(
      employment_type,
      levels = c("FT", "PT", "CT", "FL"),
      labels = c("Full-Time", "Part-Time", "Contract", "Freelance")
    ),
    remote_label = factor(
      remote_ratio,
      levels = c(0, 50, 100),
      labels = c("On-Site", "Hybrid", "Fully Remote")
    ),
    work_year = as.integer(work_year),

    # ── Derived / engineered features ────────────────────────────────────────
    log_salary = log(salary_in_usd),

    # Era grouping: pandemic (2020-21), recovery (2022-23), AI boom (2024-25)
    year_group = cut(
      work_year,
      breaks = c(2019, 2021, 2023, 2025),
      labels = c("2020-21", "2022-23", "2024-25")
    ),

    # Consolidate 80+ job titles into 4 meaningful buckets
    job_category = case_when(
      grepl("Analyst",               job_title, ignore.case = TRUE) ~ "Data Analyst",
      grepl("Scientist",             job_title, ignore.case = TRUE) ~ "Data Scientist",
      grepl("Engineer",              job_title, ignore.case = TRUE) ~ "Data / ML Engineer",
      grepl("Manager|Director|Lead", job_title, ignore.case = TRUE) ~ "Management",
      TRUE                                                           ~ "Other"
    ),
    job_category = factor(job_category)
  )

# Salary quartile tiers — classification target
sal_q <- quantile(df$salary_in_usd, probs = c(0, .25, .50, .75, 1), na.rm = TRUE)
df$salary_category <- cut(
  df$salary_in_usd,
  breaks         = sal_q,
  labels         = c("Low", "Medium", "High", "Very High"),
  include.lowest = TRUE
)

cat("\nCleaned dataset:", nrow(df), "rows ×", ncol(df), "columns\n")
## 
## Cleaned dataset: 88584 rows × 16 columns
cat("Salary range: $", format(min(df$salary_in_usd), big.mark = ","),
    "→ $", format(max(df$salary_in_usd), big.mark = ","), "\n")
## Salary range: $ 15,000 → $ 800,000

11 Exploratory Data Analysis (EDA)

11.1 Summary Statistics

df %>%
  select(work_year, salary_in_usd, log_salary, remote_ratio) %>%
  summary()
##    work_year    salary_in_usd      log_salary      remote_ratio   
##  Min.   :2020   Min.   : 15000   Min.   : 9.616   Min.   :  0.00  
##  1st Qu.:2024   1st Qu.:106097   1st Qu.:11.572   1st Qu.:  0.00  
##  Median :2024   Median :146307   Median :11.893   Median :  0.00  
##  Mean   :2024   Mean   :157568   Mean   :11.858   Mean   : 21.29  
##  3rd Qu.:2024   3rd Qu.:198600   3rd Qu.:12.199   3rd Qu.:  0.00  
##  Max.   :2025   Max.   :800000   Max.   :13.592   Max.   :100.00

11.2 Key Metrics

kpi <- data.frame(
  Metric = c("Records", "Median Salary", "Mean Salary",
             "Max Salary", "Unique Job Titles", "Years Covered"),
  Value  = c(
    nrow(df),
    dollar(median(df$salary_in_usd)),
    dollar(round(mean(df$salary_in_usd))),
    dollar(max(df$salary_in_usd)),
    n_distinct(df$job_title),
    paste(min(df$work_year), "–", max(df$work_year))
  )
)
print(kpi)
##              Metric       Value
## 1           Records       88584
## 2     Median Salary    $146,307
## 3       Mean Salary    $157,568
## 4        Max Salary    $800,000
## 5 Unique Job Titles         312
## 6     Years Covered 2020 – 2025

11.3 Salary Distribution: Raw vs Log

p1 <- ggplot(df, aes(x = salary_in_usd)) +
  geom_histogram(bins = 60, fill = "#2980b9", colour = "white", alpha = 0.85) +
  scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(title = "Raw Salary — Right-Skewed", x = "Salary (USD)", y = "Count") +
  theme_report()

p2 <- ggplot(df, aes(x = log_salary)) +
  geom_histogram(bins = 60, fill = "#27ae60", colour = "white", alpha = 0.85) +
  labs(title = "Log(Salary) — Approx. Normal", x = "ln(Salary)", y = "Count") +
  theme_report()

p1 + p2

11.4 Salary by Experience Level

ggplot(df, aes(x = experience_level, y = salary_in_usd, fill = experience_level)) +
  geom_violin(alpha = 0.6, trim = FALSE) +
  geom_boxplot(width = 0.18, fill = "white", outlier.alpha = 0.25) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_fill_manual(values = palette_sal[1:4], guide = "none") +
  labs(
    title    = "Salary Distribution by Experience Level",
    subtitle = "Violin plot shows full density; boxplot shows quartiles",
    x = "Experience Level", y = "Salary (USD)"
  ) +
  theme_report()

11.6 Salary by Remote Work Arrangement

df %>%
  group_by(remote_label) %>%
  summarise(
    median = median(salary_in_usd),
    q25    = quantile(salary_in_usd, 0.25),
    q75    = quantile(salary_in_usd, 0.75),
    n      = n(),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = remote_label, y = median, fill = remote_label)) +
  geom_col(alpha = 0.85) +
  geom_errorbar(aes(ymin = q25, ymax = q75), width = 0.25, colour = "#2c3e50") +
  geom_text(aes(label = dollar(round(median))), vjust = -0.5, fontface = "bold") +
  scale_fill_manual(values = palette_sal[1:3], guide = "none") +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(
    title    = "Median Salary by Remote Work Arrangement",
    subtitle = "Error bars = IQR (25th–75th percentile)",
    x = NULL, y = "Salary (USD)"
  ) +
  theme_report()

11.7 Salary by Company Size & Employment Type

ggplot(df, aes(x = company_size, y = salary_in_usd, fill = company_size)) +
  geom_boxplot(outlier.alpha = 0.3, alpha = 0.75) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_fill_manual(values = palette_sal[1:3], guide = "none") +
  facet_wrap(~employment_type) +
  labs(
    title    = "Salary by Company Size & Employment Type",
    subtitle = "Large firms pay premiums, especially for full-time roles",
    x = "Company Size", y = "Salary (USD)"
  ) +
  theme_report()

11.8 Log-Salary Density by Job Category (Ridge Plot)

ggplot(df, aes(x = log_salary, y = job_category, fill = job_category)) +
  geom_density_ridges(alpha = 0.7, scale = 1.5) +
  scale_fill_manual(values = palette_sal, guide = "none") +
  labs(
    title    = "Log-Salary Density by Job Category",
    subtitle = "Ridge plot reveals distributional shape across roles",
    x = "ln(Salary)", y = NULL
  ) +
  theme_report()

11.9 Correlation Matrix

cor_mat_sal <- df %>%
  select(work_year, salary_in_usd, log_salary, remote_ratio) %>%
  cor(use = "complete.obs")

ggcorrplot(
  cor_mat_sal,
  method   = "circle",
  type     = "lower",
  lab      = TRUE,
  lab_size = 4,
  colors   = c("#e74c3c", "white", "#2980b9"),
  title    = "Pearson Correlation Matrix — Numeric Variables",
  ggtheme  = theme_report()
)

12 Feature Engineering

df <- df %>%
  mutate(
    exp_size_combo = paste(
      as.character(experience_level),
      as.character(company_size), sep = " × "
    )
  )

fe_tbl <- df %>%
  group_by(job_category, year_group) %>%
  summarise(
    n              = n(),
    median_salary  = dollar(round(median(salary_in_usd))),
    pct_remote     = round(100 * mean(remote_ratio == 100), 1),
    .groups        = "drop"
  ) %>%
  arrange(job_category, year_group)

cat("\n========== Feature-Engineered View: Job Category × Year Group ==========\n")
## 
## ========== Feature-Engineered View: Job Category × Year Group ==========
print(fe_tbl)
## # A tibble: 15 × 5
##    job_category       year_group     n median_salary pct_remote
##    <fct>              <fct>      <int> <chr>              <dbl>
##  1 Data / ML Engineer 2020-21      109 $79,833             53.2
##  2 Data / ML Engineer 2022-23     4398 $153,000            32.7
##  3 Data / ML Engineer 2024-25    35990 $165,600            18.2
##  4 Data Analyst       2020-21       46 $71,893             60.9
##  5 Data Analyst       2022-23     1802 $104,000            41  
##  6 Data Analyst       2024-25    11198 $99,000             21  
##  7 Data Scientist     2020-21      121 $90,000             46.3
##  8 Data Scientist     2022-23     3144 $160,000            34.6
##  9 Data Scientist     2024-25    14930 $159,404            19  
## 10 Management         2020-21        4 $120,000            50  
## 11 Management         2022-23      237 $120,000            16.9
## 12 Management         2024-25     6561 $150,029            17.5
## 13 Other              2020-21       13 $150,000            61.5
## 14 Other              2022-23      601 $130,000            42.8
## 15 Other              2024-25     9430 $126,600            22.3
df %>%
  count(job_category, salary_category) %>%
  ggplot(aes(x = job_category, y = n, fill = salary_category)) +
  geom_col(position = "fill", alpha = 0.87) +
  scale_fill_manual(
    values = c("#3498db", "#2ecc71", "#f39c12", "#e74c3c"),
    name   = "Salary Tier"
  ) +
  scale_y_continuous(labels = percent_format()) +
  coord_flip() +
  labs(
    title    = "Salary Tier Composition by Job Category",
    subtitle = "100% stacked — proportion in each pay bracket",
    x = NULL, y = "Proportion"
  ) +
  theme_report()

13 Train / Test Split (80/20)

set.seed(123)

# Regression split (continuous log-salary target)
df_reg <- df %>%
  select(
    salary_in_usd, log_salary, work_year, experience_level,
    employment_type, company_size, remote_label, job_category
  ) %>%
  na.omit()

idx_r     <- createDataPartition(df_reg$log_salary, p = 0.80, list = FALSE)
train_reg <- df_reg[ idx_r, ]
test_reg  <- df_reg[-idx_r, ]

# Classification split (salary tier target)
df_cls <- df %>%
  select(
    salary_category, work_year, experience_level,
    employment_type, company_size, remote_label, job_category
  ) %>%
  na.omit()

idx_c     <- createDataPartition(df_cls$salary_category, p = 0.80, list = FALSE)
train_cls <- df_cls[ idx_c, ]
test_cls  <- df_cls[-idx_c, ]

cat("Regression    — Train:", nrow(train_reg), "| Test:", nrow(test_reg), "\n")
## Regression    — Train: 70869 | Test: 17715
cat("Classification — Train:", nrow(train_cls), "| Test:", nrow(test_cls), "\n")
## Classification — Train: 70869 | Test: 17715

14 Regression — Predicting Salary

14.1 Baseline Linear Regression (Raw Salary)

lm_raw_sal <- lm(
  salary_in_usd ~ work_year + experience_level +
    employment_type + company_size + remote_label + job_category,
  data = train_reg
)
summary(lm_raw_sal)
## 
## Call:
## lm(formula = salary_in_usd ~ work_year + experience_level + employment_type + 
##     company_size + remote_label + job_category, data = train_reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -182789  -44633   -9633   33594  688249 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -8551620.8   853101.9 -10.024  < 2e-16 ***
## work_year                      4305.9      421.6  10.213  < 2e-16 ***
## experience_level.L            59222.5     1318.1  44.930  < 2e-16 ***
## experience_level.Q             4570.9     1015.8   4.500 6.82e-06 ***
## experience_level.C            -3534.1      580.6  -6.086 1.16e-09 ***
## employment_typePart-Time     -43483.7     5030.2  -8.645  < 2e-16 ***
## employment_typeContract      -32419.6     4909.6  -6.603 4.05e-11 ***
## employment_typeFreelance     -50590.3    18074.5  -2.799  0.00513 ** 
## company_size.L                23265.8     3898.0   5.969 2.40e-09 ***
## company_size.Q               -12526.3     2314.4  -5.412 6.24e-08 ***
## remote_labelHybrid           -52401.9     4494.3 -11.660  < 2e-16 ***
## remote_labelFully Remote     -11850.9      621.1 -19.080  < 2e-16 ***
## job_categoryData Analyst     -52230.8      812.9 -64.251  < 2e-16 ***
## job_categoryData Scientist    -5599.5      670.0  -8.357  < 2e-16 ***
## job_categoryManagement        -8738.5      982.3  -8.896  < 2e-16 ***
## job_categoryOther            -32255.5      836.8 -38.544  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66660 on 70853 degrees of freedom
## Multiple R-squared:  0.1743, Adjusted R-squared:  0.1741 
## F-statistic: 996.8 on 15 and 70853 DF,  p-value: < 2.2e-16

14.2 Assumption Checking — Baseline Model

par(mfrow = c(1, 2))
plot(lm_raw_sal, which = 1, main = "Residuals vs Fitted (Raw Salary)")
plot(lm_raw_sal, which = 3, main = "Scale-Location (Raw Salary)")

par(mfrow = c(1, 1))

par(mfrow = c(1, 2))
hist(
  residuals(lm_raw_sal), breaks = 50, col = "#3498db", border = "white",
  main = "Residuals Histogram (Raw)", xlab = "Residuals"
)
plot(lm_raw_sal, which = 2, main = "Q-Q Plot (Raw Salary)")

par(mfrow = c(1, 1))

ad_raw_sal <- ad.test(residuals(lm_raw_sal))
cat("\nAnderson-Darling Test (Raw Salary Model):\n")
## 
## Anderson-Darling Test (Raw Salary Model):
cat("  A =", round(ad_raw_sal$statistic, 4),
    " p-value =", formatC(ad_raw_sal$p.value, format = "e", digits = 3), "\n")
##   A = 774.6828  p-value = 3.700e-24
bp_raw_sal <- bptest(lm_raw_sal)
cat("\nBreusch-Pagan Test (Raw Salary Model):\n")
## 
## Breusch-Pagan Test (Raw Salary Model):
cat("  BP =", round(bp_raw_sal$statistic, 3),
    " p-value =", formatC(bp_raw_sal$p.value, format = "e", digits = 3), "\n")
##   BP = 647.331  p-value = 3.063e-128
vif_raw_sal <- vif(lm_raw_sal)
vif_raw_sal_df <- if (is.matrix(vif_raw_sal)) {
  data.frame(
    Predictor = rownames(vif_raw_sal),
    GVIF      = round(vif_raw_sal[, "GVIF"], 3),
    Df        = vif_raw_sal[, "Df"],
    GVIF_adj  = round(vif_raw_sal[, "GVIF^(1/(2*Df))"], 3)
  )
} else {
  data.frame(Predictor = names(vif_raw_sal), VIF = round(vif_raw_sal, 3))
}
cat("\nVIF — Baseline Model:\n")
## 
## VIF — Baseline Model:
print(vif_raw_sal_df)
##                         Predictor  GVIF Df GVIF_adj
## work_year               work_year 1.088  1    1.043
## experience_level experience_level 1.250  3    1.038
## employment_type   employment_type 1.047  3    1.008
## company_size         company_size 1.147  2    1.035
## remote_label         remote_label 1.121  2    1.029
## job_category         job_category 1.261  4    1.029

14.3 Log-Transformed Linear Regression (Required Model)

lm_log_sal <- lm(
  log_salary ~ work_year + experience_level +
    employment_type + company_size + remote_label + job_category,
  data = train_reg
)
summary(lm_log_sal)
## 
## Call:
## lm(formula = log_salary ~ work_year + experience_level + employment_type + 
##     company_size + remote_label + job_category, data = train_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.20916 -0.25629  0.01486  0.27929  2.13325 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -40.887361   5.415632  -7.550 4.41e-14 ***
## work_year                    0.026050   0.002676   9.733  < 2e-16 ***
## experience_level.L           0.453070   0.008368  54.146  < 2e-16 ***
## experience_level.Q          -0.024044   0.006449  -3.729 0.000193 ***
## experience_level.C          -0.019814   0.003686  -5.375 7.67e-08 ***
## employment_typePart-Time    -0.498455   0.031933 -15.610  < 2e-16 ***
## employment_typeContract     -0.430857   0.031167 -13.824  < 2e-16 ***
## employment_typeFreelance    -0.662959   0.114740  -5.778 7.59e-09 ***
## company_size.L               0.246188   0.024745   9.949  < 2e-16 ***
## company_size.Q              -0.132263   0.014692  -9.002  < 2e-16 ***
## remote_labelHybrid          -0.547849   0.028530 -19.202  < 2e-16 ***
## remote_labelFully Remote    -0.066311   0.003943 -16.818  < 2e-16 ***
## job_categoryData Analyst    -0.356829   0.005161 -69.146  < 2e-16 ***
## job_categoryData Scientist  -0.024616   0.004253  -5.787 7.18e-09 ***
## job_categoryManagement      -0.055554   0.006236  -8.909  < 2e-16 ***
## job_categoryOther           -0.230543   0.005312 -43.397  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4231 on 70853 degrees of freedom
## Multiple R-squared:  0.2358, Adjusted R-squared:  0.2356 
## F-statistic:  1457 on 15 and 70853 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(lm_log_sal, which = 1:4)

par(mfrow = c(1, 1))

bp_log_sal <- bptest(lm_log_sal)
ad_log_sal <- ad.test(residuals(lm_log_sal))

cat("\nPost-transformation diagnostics:\n")
## 
## Post-transformation diagnostics:
cat("  Breusch-Pagan p    =", formatC(bp_log_sal$p.value, format = "e", digits = 3), "\n")
##   Breusch-Pagan p    = 0.000e+00
cat("  Anderson-Darling p =", formatC(ad_log_sal$p.value, format = "e", digits = 3), "\n")
##   Anderson-Darling p = 3.700e-24
cat("  Mean of residuals  =", round(mean(residuals(lm_log_sal)), 8), "\n")
##   Mean of residuals  = 0
vif_log_sal <- vif(lm_log_sal)
vif_log_sal_df <- if (is.matrix(vif_log_sal)) {
  data.frame(
    Predictor = rownames(vif_log_sal),
    GVIF      = round(vif_log_sal[, "GVIF"], 3),
    Df        = vif_log_sal[, "Df"],
    GVIF_adj  = round(vif_log_sal[, "GVIF^(1/(2*Df))"], 3)
  )
} else {
  data.frame(Predictor = names(vif_log_sal), VIF = round(vif_log_sal, 3))
}
cat("\nVIF — Log-Linear Model:\n")
## 
## VIF — Log-Linear Model:
print(vif_log_sal_df)
##                         Predictor  GVIF Df GVIF_adj
## work_year               work_year 1.088  1    1.043
## experience_level experience_level 1.250  3    1.038
## employment_type   employment_type 1.047  3    1.008
## company_size         company_size 1.147  2    1.035
## remote_label         remote_label 1.121  2    1.029
## job_category         job_category 1.261  4    1.029
# Coefficient interpretation (% salary effect)
coef_df <- as.data.frame(summary(lm_log_sal)$coefficients) %>%
  rownames_to_column("Term") %>%
  mutate(
    `Salary % effect` = round((exp(Estimate) - 1) * 100, 2),
    Significant       = ifelse(`Pr(>|t|)` < 0.05, "Yes", "No")
  ) %>%
  select(Term, Estimate, `Salary % effect`, `Pr(>|t|)`, Significant)

cat("\n========== Log-Linear Model — Coefficient Salary % Effects ==========\n")
## 
## ========== Log-Linear Model — Coefficient Salary % Effects ==========
print(coef_df)
##                          Term     Estimate Salary % effect     Pr(>|t|)
## 1                 (Intercept) -40.88736106         -100.00 4.408608e-14
## 2                   work_year   0.02604969            2.64 2.246734e-22
## 3          experience_level.L   0.45306954           57.31 0.000000e+00
## 4          experience_level.Q  -0.02404406           -2.38 1.927001e-04
## 5          experience_level.C  -0.01981397           -1.96 7.665968e-08
## 6    employment_typePart-Time  -0.49845483          -39.25 7.731562e-55
## 7     employment_typeContract  -0.43085693          -35.00 2.075993e-43
## 8    employment_typeFreelance  -0.66295869          -48.47 7.594185e-09
## 9              company_size.L   0.24618830           27.91 2.637457e-23
## 10             company_size.Q  -0.13226257          -12.39 2.262214e-19
## 11         remote_labelHybrid  -0.54784861          -42.18 5.727239e-82
## 12   remote_labelFully Remote  -0.06631114           -6.42 2.395357e-63
## 13   job_categoryData Analyst  -0.35682851          -30.01 0.000000e+00
## 14 job_categoryData Scientist  -0.02461600           -2.43 7.177895e-09
## 15     job_categoryManagement  -0.05555433           -5.40 5.269606e-19
## 16          job_categoryOther  -0.23054302          -20.59 0.000000e+00
##    Significant
## 1          Yes
## 2          Yes
## 3          Yes
## 4          Yes
## 5          Yes
## 6          Yes
## 7          Yes
## 8          Yes
## 9          Yes
## 10         Yes
## 11         Yes
## 12         Yes
## 13         Yes
## 14         Yes
## 15         Yes
## 16         Yes

14.4 Stepwise AIC Selection

lm_step_sal <- stepAIC(lm_log_sal, direction = "both", trace = FALSE)
summary(lm_step_sal)
## 
## Call:
## lm(formula = log_salary ~ work_year + experience_level + employment_type + 
##     company_size + remote_label + job_category, data = train_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.20916 -0.25629  0.01486  0.27929  2.13325 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -40.887361   5.415632  -7.550 4.41e-14 ***
## work_year                    0.026050   0.002676   9.733  < 2e-16 ***
## experience_level.L           0.453070   0.008368  54.146  < 2e-16 ***
## experience_level.Q          -0.024044   0.006449  -3.729 0.000193 ***
## experience_level.C          -0.019814   0.003686  -5.375 7.67e-08 ***
## employment_typePart-Time    -0.498455   0.031933 -15.610  < 2e-16 ***
## employment_typeContract     -0.430857   0.031167 -13.824  < 2e-16 ***
## employment_typeFreelance    -0.662959   0.114740  -5.778 7.59e-09 ***
## company_size.L               0.246188   0.024745   9.949  < 2e-16 ***
## company_size.Q              -0.132263   0.014692  -9.002  < 2e-16 ***
## remote_labelHybrid          -0.547849   0.028530 -19.202  < 2e-16 ***
## remote_labelFully Remote    -0.066311   0.003943 -16.818  < 2e-16 ***
## job_categoryData Analyst    -0.356829   0.005161 -69.146  < 2e-16 ***
## job_categoryData Scientist  -0.024616   0.004253  -5.787 7.18e-09 ***
## job_categoryManagement      -0.055554   0.006236  -8.909  < 2e-16 ***
## job_categoryOther           -0.230543   0.005312 -43.397  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4231 on 70853 degrees of freedom
## Multiple R-squared:  0.2358, Adjusted R-squared:  0.2356 
## F-statistic:  1457 on 15 and 70853 DF,  p-value: < 2.2e-16

14.5 Random Forest Regression Benchmark

set.seed(42)
rf_reg_sal <- randomForest(
  log_salary ~ work_year + experience_level + employment_type +
    company_size + remote_label + job_category,
  data       = train_reg,
  ntree      = 300,
  importance = TRUE,
  na.action  = na.omit
)
cat("\nOOB R² (Random Forest on log-salary):",
    round(1 - rf_reg_sal$mse[300] / var(train_reg$log_salary), 4), "\n")
## 
## OOB R² (Random Forest on log-salary): 0.2518

14.6 Regression Model Comparison

eval_reg_sal <- function(model, test, log_resp = TRUE) {
  preds_log <- predict(model, test)
  preds     <- if (log_resp) exp(preds_log) else preds_log
  actual    <- test$salary_in_usd
  rmse      <- sqrt(mean((actual - preds)^2, na.rm = TRUE))
  r2        <- 1 - sum((actual - preds)^2, na.rm = TRUE) / sum((actual - mean(actual, na.rm = TRUE))^2, na.rm = TRUE)
  c(RMSE = rmse, R2 = r2)
}

m_raw  <- eval_reg_sal(lm_raw_sal,  test_reg, log_resp = FALSE)
m_log  <- eval_reg_sal(lm_log_sal,  test_reg)
m_step <- eval_reg_sal(lm_step_sal, test_reg)
m_rf   <- eval_reg_sal(rf_reg_sal,  test_reg)

reg_tbl <- data.frame(
  Model = c(
    "Baseline LM (Raw Salary)",
    "Log-Linear LM (Required)",
    "Log-Linear Stepwise AIC",
    "Random Forest (Benchmark)"
  ),
  RMSE = round(c(m_raw["RMSE"], m_log["RMSE"], m_step["RMSE"], m_rf["RMSE"])),
  R_Squared = round(c(m_raw["R2"], m_log["R2"], m_step["R2"], m_rf["R2"]), 4)
)

cat("\n========== Regression Model Comparison — Test Set ==========\n")
## 
## ========== Regression Model Comparison — Test Set ==========
print(reg_tbl)
##                       Model  RMSE R_Squared
## 1  Baseline LM (Raw Salary) 67791    0.1668
## 2  Log-Linear LM (Required) 69021    0.1363
## 3   Log-Linear Stepwise AIC 69021    0.1363
## 4 Random Forest (Benchmark) 68735    0.1435
# Actual vs Predicted
best_preds_sal <- exp(predict(lm_step_sal, test_reg))
pred_df <- data.frame(
  Actual    = test_reg$salary_in_usd,
  Predicted = best_preds_sal,
  Residual  = test_reg$salary_in_usd - best_preds_sal
)

ggplot(pred_df, aes(x = Predicted, y = Actual)) +
  geom_point(aes(colour = abs(Residual)), alpha = 0.6, size = 2) +
  geom_abline(intercept = 0, slope = 1,
              colour = "#e74c3c", linewidth = 1.1, linetype = "dashed") +
  scale_colour_viridis_c(name = "Abs. Residual", labels = dollar_format()) +
  scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(
    title    = "Actual vs Predicted Salary — Best Linear Model",
    subtitle = "Dashed diagonal = perfect prediction; colour = residual size",
    x = "Predicted Salary", y = "Actual Salary"
  ) +
  theme_report()

# Residual plot
ggplot(pred_df, aes(x = Predicted, y = Residual)) +
  geom_hline(yintercept = 0, colour = "#e74c3c", linewidth = 1, linetype = "dashed") +
  geom_point(colour = "#2980b9", alpha = 0.55, size = 1.8) +
  geom_smooth(method = "loess", se = FALSE, colour = "#f39c12", linewidth = 1.1) +
  scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(
    title    = "Residual Plot — Best Linear Model",
    subtitle = "Flat LOESS band and random scatter indicate assumption is met",
    x = "Predicted Salary", y = "Residual"
  ) +
  theme_report()

15 Classification — Salary Tier Prediction

15.1 Model 1: Decision Tree

set.seed(123)
dt_cls <- rpart(
  salary_category ~ .,
  data    = train_cls,
  method  = "class",
  control = rpart.control(cp = 0.001, minsplit = 20)
)

best_cp   <- dt_cls$cptable[which.min(dt_cls$cptable[, "xerror"]), "CP"]
dt_pruned <- prune(dt_cls, cp = best_cp)

rpart.plot(
  dt_pruned, type = 4, extra = 104,
  main = "Pruned Decision Tree — Salary Tier", cex = 0.7
)

dt_preds <- predict(dt_pruned, test_cls, type = "class")
cm_dt    <- confusionMatrix(dt_preds, test_cls$salary_category)
cat("\n--- Decision Tree Accuracy:",
    round(cm_dt$overall["Accuracy"] * 100, 2), "% ---\n")
## 
## --- Decision Tree Accuracy: 38.75 % ---
print(cm_dt$table)
##            Reference
## Prediction   Low Medium High Very High
##   Low       2450   1191  722       317
##   Medium     851    918  816       828
##   High       245    573  780       564
##   Very High  883   1747 2113      2717

15.2 Model 2: Random Forest Classification

set.seed(42)
rf_cls_sal <- randomForest(
  salary_category ~ .,
  data       = train_cls,
  ntree      = 300,
  importance = TRUE,
  na.action  = na.omit
)

rf_cls_preds <- predict(rf_cls_sal, test_cls)
cm_rf_cls    <- confusionMatrix(rf_cls_preds, test_cls$salary_category)
cat("\n--- Random Forest Accuracy:",
    round(cm_rf_cls$overall["Accuracy"] * 100, 2), "% ---\n")
## 
## --- Random Forest Accuracy: 39.37 % ---
# Confusion matrix heatmap
cm_df <- as.data.frame(cm_rf_cls$table) %>%
  rename(Predicted = Prediction, Actual = Reference)

ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 1.2) +
  geom_text(aes(label = Freq), size = 5, fontface = "bold") +
  scale_fill_gradient(low = "#dce8f5", high = "#1a6fa8", name = "Count") +
  labs(
    title    = "Confusion Matrix — Random Forest Salary Tier Classifier",
    subtitle = paste0("Overall accuracy: ",
                      round(cm_rf_cls$overall["Accuracy"] * 100, 1), "%"),
    x = "Actual Tier", y = "Predicted Tier"
  ) +
  theme_report() +
  theme(panel.grid = element_blank(), legend.position = "right")

# Variable importance
imp_df_sal <- as.data.frame(importance(rf_cls_sal)) %>%
  rownames_to_column("Feature") %>%
  arrange(desc(MeanDecreaseGini))

ggplot(imp_df_sal,
       aes(x = reorder(Feature, MeanDecreaseGini),
           y = MeanDecreaseGini, fill = MeanDecreaseGini)) +
  geom_col(alpha = 0.88) +
  geom_text(aes(label = round(MeanDecreaseGini, 1)),
            hjust = -0.15, size = 3.5, fontface = "bold") +
  coord_flip() +
  scale_fill_viridis_c(option = "C", guide = "none") +
  scale_y_continuous(expand = expansion(mult = c(0, .15))) +
  labs(
    title    = "Random Forest — Variable Importance (Salary Tier)",
    subtitle = "Mean Decrease Gini: higher = more predictive",
    x = NULL, y = "Mean Decrease Gini"
  ) +
  theme_report()

15.3 Classification Model Comparison

cls_tbl <- data.frame(
  Model    = c("Decision Tree (Pruned)", "Random Forest"),
  Accuracy = c(
    round(cm_dt$overall["Accuracy"]     * 100, 2),
    round(cm_rf_cls$overall["Accuracy"] * 100, 2)
  ),
  Kappa = c(
    round(cm_dt$overall["Kappa"],     3),
    round(cm_rf_cls$overall["Kappa"], 3)
  )
)
rownames(cls_tbl) <- NULL

cat("\n========== Classification Model Comparison ==========\n")
## 
## ========== Classification Model Comparison ==========
print(cls_tbl)
##                    Model Accuracy Kappa
## 1 Decision Tree (Pruned)    38.75 0.183
## 2          Random Forest    39.37 0.192
cat("\nFINAL CLASSIFIER: highest Accuracy + Kappa selected.\n")
## 
## FINAL CLASSIFIER: highest Accuracy + Kappa selected.
cat("Cohen's Kappa interpretation: >0.60 substantial; >0.80 almost-perfect.\n")
## Cohen's Kappa interpretation: >0.60 substantial; >0.80 almost-perfect.

16 Relationship Analysis

16.1 Experience × Company Size × Salary

ggplot(df,
       aes(x = experience_level, y = salary_in_usd,
           colour = company_size, group = company_size)) +
  stat_summary(fun = median, geom = "line",  linewidth = 1.2) +
  stat_summary(fun = median, geom = "point", size = 3) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_colour_manual(values = palette_sal[1:3]) +
  labs(
    title    = "Median Salary by Experience × Company Size",
    subtitle = "Large companies show the steepest experience premium",
    x = "Experience Level", y = "Median Salary (USD)",
    colour = "Company Size"
  ) +
  theme_report()

16.2 Remote Work Penetration Over Time

df %>%
  group_by(work_year, remote_label) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(work_year) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = work_year, y = pct, fill = remote_label)) +
  geom_area(alpha = 0.8) +
  scale_fill_manual(values = palette_sal[1:3]) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title    = "Remote Work Mix Over Time",
    subtitle = "Fully remote share grew sharply post-2020",
    x = "Year", y = "Proportion", fill = "Work Arrangement"
  ) +
  theme_report()

16.3 Salary by Employment Type Over Time

df %>%
  group_by(work_year, employment_type) %>%
  summarise(median_salary = median(salary_in_usd), .groups = "drop") %>%
  ggplot(aes(x = work_year, y = median_salary,
             colour = employment_type, group = employment_type)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 2.5) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_colour_manual(values = palette_sal) +
  labs(
    title    = "Median Salary by Employment Type (2020–2025)",
    subtitle = "Full-time contracts dominate; freelance variance is wide",
    x = "Year", y = "Median Salary (USD)", colour = "Employment Type"
  ) +
  theme_report()

16.4 Top Paying Job Categories by Year Group

df %>%
  group_by(year_group, job_category) %>%
  summarise(median_salary = median(salary_in_usd), .groups = "drop") %>%
  ggplot(aes(x = year_group, y = median_salary, fill = job_category)) +
  geom_col(position = "dodge", alpha = 0.85) +
  scale_fill_manual(values = palette_sal[1:5]) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(
    title    = "Median Salary by Job Category and Era",
    subtitle = "Management and ML Engineering command premiums across all periods",
    x = "Era", y = "Median Salary (USD)", fill = "Job Category"
  ) +
  theme_report()


Session Info

sessionInfo()
## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3   viridis_0.6.5        viridisLite_0.4.3   
##  [4] ggridges_0.5.7       patchwork_1.3.2      lubridate_1.9.5     
##  [7] forcats_1.0.1        stringr_1.6.0        dplyr_1.2.1         
## [10] purrr_1.2.2          readr_2.2.0          tidyr_1.3.2         
## [13] tibble_3.3.1         tidyverse_2.0.0      kableExtra_1.4.0    
## [16] knitr_1.51           ggcorrplot_0.1.4.1   scales_1.4.0        
## [19] pROC_1.19.0.1        e1071_1.7-17         randomForest_4.7-1.2
## [22] rpart.plot_3.1.4     rpart_4.1.24         caret_7.0-1         
## [25] lattice_0.22-9       ggplot2_4.0.2        nortest_1.0-4       
## [28] MASS_7.3-65          lmtest_0.9-40        zoo_1.8-15          
## [31] car_3.1-5            carData_3.0-6       
## 
## loaded via a namespace (and not attached):
##  [1] gridExtra_2.3        rlang_1.2.0          magrittr_2.0.5      
##  [4] otel_0.2.0           compiler_4.5.3       mgcv_1.9-4          
##  [7] systemfonts_1.3.2    vctrs_0.7.3          reshape2_1.4.5      
## [10] pkgconfig_2.0.3      fastmap_1.2.0        labeling_0.4.3      
## [13] utf8_1.2.6           rmarkdown_2.31       prodlim_2026.03.11  
## [16] tzdb_0.5.0           xfun_0.57            cachem_1.1.0        
## [19] jsonlite_2.0.0       recipes_1.3.2        parallel_4.5.3      
## [22] R6_2.6.1             bslib_0.10.0         stringi_1.8.7       
## [25] parallelly_1.46.1    jquerylib_0.1.4      Rcpp_1.1.1          
## [28] iterators_1.0.14     future.apply_1.20.2  Matrix_1.7-4        
## [31] splines_4.5.3        nnet_7.3-20          timechange_0.4.0    
## [34] tidyselect_1.2.1     rstudioapi_0.18.0    abind_1.4-8         
## [37] yaml_2.3.12          timeDate_4052.112    codetools_0.2-20    
## [40] listenv_0.10.1       plyr_1.8.9           withr_3.0.2         
## [43] S7_0.2.1             evaluate_1.0.5       future_1.70.0       
## [46] survival_3.8-6       proxy_0.4-29         xml2_1.5.2          
## [49] pillar_1.11.1        foreach_1.5.2        stats4_4.5.3        
## [52] generics_0.1.4       hms_1.1.4            globals_0.19.1      
## [55] class_7.3-23         glue_1.8.0           tools_4.5.3         
## [58] data.table_1.18.2.1  ModelMetrics_1.2.2.2 gower_1.0.2         
## [61] grid_4.5.3           ipred_0.9-15         nlme_3.1-168        
## [64] Formula_1.2-5        cli_3.6.6            textshaping_1.0.5   
## [67] svglite_2.2.2        lava_1.9.0           gtable_0.3.6        
## [70] sass_0.4.10          digest_0.6.39        farver_2.1.2        
## [73] htmltools_0.5.9      lifecycle_1.0.5      hardhat_1.4.3

```