1. Setup & Libraries

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(rpart)
library(scales)
## Warning: package 'scales' was built under R version 4.5.3

2. Data Loading & Cleaning

# Load the raw loan records.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where that exact file exists.
df <- read.csv("C:/Users/Esther N/OneDrive/Documents/RFiles/loan_data.csv")

# Remove unrealistic age outliers (age > 100).
# which() keeps only the rows where the comparison is TRUE. Indexing with the
# bare logical vector would turn every row whose age is NA into a phantom
# all-NA row instead of dropping it.
df_clean <- df[which(df$person_age <= 100), ]

# Convert character columns to factors
categorical_cols <- c(
  "person_gender",
  "person_education",
  "person_home_ownership",
  "loan_intent",
  "previous_loan_defaults_on_file"
)
df_clean[categorical_cols] <- lapply(df_clean[categorical_cols], as.factor)

# Create labelled loan status factor: code 0 -> "Repaid", code 1 -> "Defaulted"
df_clean$loan_status_f <- factor(
  df_clean$loan_status,
  levels = c(0, 1),
  labels = c("Repaid", "Defaulted")
)

# Log-transform income (financial data is typically log-normal).
# NOTE(review): assumes person_income is strictly positive; log() returns
# -Inf for 0 and NaN for negative values -- confirm against the data.
df_clean$log_income <- log(df_clean$person_income)

cat("Cleaned dataset size:", nrow(df_clean), "rows\n")
## Cleaned dataset size: 44993 rows

Train-Test Split (80/20)

# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)

# Draw 80% of the row positions (rounded down) for training; the test set is
# obtained by negative indexing with those same positions.
n_rows    <- nrow(df_clean)
n_train   <- floor(0.8 * n_rows)
train_idx <- sample(seq_len(n_rows), size = n_train)
train_df  <- df_clean[train_idx, ]
test_df   <- df_clean[-train_idx, ]

# Report the size of each partition.
cat("Training rows:", nrow(train_df), "\n")
## Training rows: 35994
cat("Test rows:    ", nrow(test_df), "\n")
## Test rows:     8999

3. Linear Regression — Income Prediction

A log-linear model is used to handle the right-skewed distribution of income.

# Fit a linear model of log_income on age, education, employment experience,
# home ownership, loan amount, interest rate and credit score, using only the
# training rows. The factor predictors (education, home ownership) appear in
# the coefficient table as one term per non-reference level.
# The call is kept literal because lm() records it and summary() echoes it
# in its "Call:" section.
model_log_lm <- lm(
  log_income ~ person_age + person_education + person_emp_exp +
               person_home_ownership + loan_amnt + loan_int_rate + credit_score,
  data = train_df
)

# Print the coefficient table, residual quantiles and fit statistics.
summary(model_log_lm)
## 
## Call:
## lm(formula = log_income ~ person_age + person_education + person_emp_exp + 
##     person_home_ownership + loan_amnt + loan_int_rate + credit_score, 
##     data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9101 -0.3075 -0.0137  0.2899  3.6446 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  1.071e+01  4.567e-02 234.480  < 2e-16 ***
## person_age                   1.279e-02  1.371e-03   9.335  < 2e-16 ***
## person_educationBachelor     1.850e-02  6.618e-03   2.796 0.005179 ** 
## person_educationDoctorate   -1.615e-02  2.162e-02  -0.747 0.455117    
## person_educationHigh School  7.485e-03  6.773e-03   1.105 0.269144    
## person_educationMaster       2.733e-02  7.940e-03   3.442 0.000578 ***
## person_emp_exp              -4.280e-03  1.367e-03  -3.131 0.001746 ** 
## person_home_ownershipOTHER  -1.342e-01  4.875e-02  -2.752 0.005925 ** 
## person_home_ownershipOWN    -3.756e-01  1.039e-02 -36.162  < 2e-16 ***
## person_home_ownershipRENT   -3.512e-01  5.279e-03 -66.534  < 2e-16 ***
## loan_amnt                    3.309e-05  4.002e-07  82.679  < 2e-16 ***
## loan_int_rate               -6.093e-03  8.489e-04  -7.178  7.2e-13 ***
## credit_score                 4.318e-05  5.103e-05   0.846 0.397437    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4683 on 35981 degrees of freedom
## Multiple R-squared:  0.2901, Adjusted R-squared:  0.2899 
## F-statistic:  1225 on 12 and 35981 DF,  p-value: < 2.2e-16

Residual Diagnostics

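For a least-squares model that includes an intercept, the residuals average to zero by construction, so a value near 0 here only confirms the fit ran correctly — it does not indicate how well the model fits.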

# Average the training residuals of the log-income model and print the value.
mean_resid <- mean(resid(model_log_lm))
cat("Mean of residuals:", mean_resid, "\n")
## Mean of residuals: -4.804171e-18

Test Set Performance

# Predict on the held-out rows, then exponentiate so the predictions are on
# the original income scale rather than the log scale the model was fit on.
log_scale_preds <- predict(model_log_lm, newdata = test_df)
preds_log_lm    <- exp(log_scale_preds)
actual_income   <- test_df[["person_income"]]

# Root-mean-squared error, in income units.
squared_errors <- (actual_income - preds_log_lm)^2
rmse_lm        <- sqrt(mean(squared_errors))

# Reported "R-squared" is the squared Pearson correlation between actual and
# predicted income (not 1 - SSE/SST).
rsq_lm <- cor(actual_income, preds_log_lm)^2

cat("RMSE:      ", round(rmse_lm, 2), "\n")
## RMSE:       55522.88
cat("R-squared: ", round(rsq_lm, 4), "\n")
## R-squared:  0.1794

4. Classification — Loan Default Prediction

A decision tree classifier is trained to predict whether a borrower will default.

# Fit a classification tree (method = "class") for the labelled outcome
# loan_status_f on the training rows. Income enters on its raw scale
# (person_income, not log_income). No control argument is passed, so rpart's
# default settings apply.
model_tree <- rpart(
  loan_status_f ~ person_income + person_age + person_education +
                  person_home_ownership + loan_amnt + loan_int_rate +
                  loan_percent_income + credit_score + previous_loan_defaults_on_file,
  data   = train_df,
  method = "class"
)

Confusion Matrix & Accuracy

# Predicted class label for every held-out row.
tree_preds <- predict(model_tree, newdata = test_df, type = "class")

# Cross-tabulate predictions (rows) against observed outcomes (columns).
conf_matrix <- table(
  Predicted = tree_preds,
  Actual    = test_df[["loan_status_f"]]
)

# Accuracy is the share of tabulated cases on the diagonal, i.e. where the
# predicted label equals the observed one.
n_correct <- sum(diag(conf_matrix))
n_scored  <- sum(conf_matrix)
accuracy  <- n_correct / n_scored

print(conf_matrix)
##            Actual
## Predicted   Repaid Defaulted
##   Repaid      6699       536
##   Defaulted    271      1493
cat("\nClassification Accuracy:", round(accuracy * 100, 2), "%\n")
## 
## Classification Accuracy: 91.03 %

5. Visualizations

Plot 1: Income Distribution (Log Scale)

# Histogram of annual income over the cleaned data, drawn on a log10 x-axis
# with comma-formatted tick labels.
ggplot(data = df_clean, mapping = aes(x = person_income)) +
  geom_histogram(bins = 50, fill = "#3498db", color = "white") +
  scale_x_log10(labels = scales::comma) +
  ggtitle("Income Distribution (Log Scale)") +
  xlab("Annual Income (USD)") +
  ylab("Count") +
  theme_minimal()

Plot 2: Default Rate by Home Ownership

# One bar per home-ownership category, each stretched to full height so the
# fill colours show the proportion of each loan status within the category.
ggplot(
  data    = df_clean,
  mapping = aes(x = person_home_ownership, fill = loan_status_f)
) +
  geom_bar(position = position_fill()) +
  scale_fill_manual(values = c("#2ecc71", "#e74c3c")) +
  ggtitle("Default Rate by Home Ownership") +
  xlab("Home Ownership") +
  ylab("Proportion") +
  labs(fill = "Status") +
  theme_minimal()

Plot 3: Loan Amount vs Income

# Scatter of loan amount against income (log10 x-axis), coloured by loan
# status; points are semi-transparent (alpha = 0.2).
ggplot(
  data    = df_clean,
  mapping = aes(x = person_income, y = loan_amnt, color = loan_status_f)
) +
  geom_point(alpha = 0.2) +
  scale_x_log10(labels = scales::comma) +
  scale_color_manual(values = c("#2ecc71", "#e74c3c")) +
  ggtitle("Loan Amount vs Income") +
  xlab("Income (Log Scale)") +
  ylab("Loan Amount") +
  labs(color = "Status") +
  theme_minimal()


Session Info

# Record the R version, platform and package versions used for this run.
print(sessionInfo())
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Africa/Nairobi
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] scales_1.4.0  rpart_4.1.24  ggplot2_4.0.2
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.7.2        cli_3.6.5          knitr_1.51         rlang_1.1.7       
##  [5] xfun_0.57          generics_0.1.4     S7_0.2.1           jsonlite_2.0.0    
##  [9] labeling_0.4.3     glue_1.8.0         htmltools_0.5.9    sass_0.4.10       
## [13] rmarkdown_2.31     grid_4.5.2         tibble_3.3.1       evaluate_1.0.5    
## [17] jquerylib_0.1.4    fastmap_1.2.0      yaml_2.3.12        lifecycle_1.0.5   
## [21] compiler_4.5.2     dplyr_1.2.0        RColorBrewer_1.1-3 pkgconfig_2.0.3   
## [25] rstudioapi_0.18.0  farver_2.1.2       digest_0.6.39      R6_2.6.1          
## [29] tidyselect_1.2.1   pillar_1.11.1      magrittr_2.0.4     bslib_0.10.0      
## [33] withr_3.0.2        tools_4.5.2        gtable_0.3.6       cachem_1.1.0

📌 Executive Summary

This report analyses data science and technology compensation data spanning 2020 to 2025, covering 500 professionals across experience levels, employment types, company sizes, and remote-work arrangements. It combines exploratory data analysis, relationship modelling, and machine learning to surface actionable insights about how salaries vary — and how they can be predicted.

Key Metrics at a Glance

📊 Metric 📈 Value
Median Salary $151,000
Mean Salary $148,718
Peak Salary $268,508
Unique Job Titles 6

1 · Data Overview

Year Experience Employment Salary (USD) Remote Company Size
2022 Entry Full-Time 134,687.97 Hybrid Large
2020 Entry Full-Time 72,028.51 Fully Remote Large
2021 Entry Full-Time 111,594.95 On-Site Medium
2025 Entry Full-Time 177,527.66 Hybrid Small
2020 Mid Full-Time 151,192.94 On-Site Small
2022 Executive Full-Time 183,090.30 Fully Remote Small
2025 Mid Full-Time 64,866.09 Hybrid Medium
2024 Mid Full-Time 107,575.33 Fully Remote Large
2025 Entry Full-Time 88,126.52 Fully Remote Large
2021 Executive Freelance 219,999.82 On-Site Large
Records Years Min Salary Max Salary Median
500 2020 – 2025 $20,501.47 $268,508 $151,000

2 · Exploratory Data Analysis

2.1 Salary Distribution

2.2 Salary by Experience Level

2.4 Remote Work & Compensation

2.5 Salary by Company Size & Employment Type

2.6 Top Job Roles by Median Salary


3 · Relationship Analysis

3.1 Correlation Heatmap

3.2 Experience × Salary Scatter


4 · Feature Engineering

Job Category × Salary Tier — Record Counts
job_category Low Medium High Very High
Data / ML Engineer 64 57 53 60
Data Analyst 16 29 31 22
Data Scientist 45 39 41 43


5 · Predictive Modelling

5.1 Linear Regression — Salary Prediction

Model Performance on Test Set (20% holdout)
Model RMSE R²
Base Linear Regression $38,001 0.330
Log-Transformed Regression $38,173 0.324

5.2 Random Forest — Salary Tier Classification

Overall Classification Accuracy: 38%

5.3 Feature Importance


6 · Conclusions

Finding Insight
Experience drives pay Executive-level professionals earn ~2–3× more than entry-level counterparts on average.
Remote work pays Fully remote roles show comparable or higher median salaries than on-site equivalents.
Company size matters Large companies tend to offer higher salaries, especially for full-time roles.
Regression performance Base LM achieved R² = 0.33 on the test set — reasonable for salary data with categorical predictors.
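Classification accuracy Random Forest classified salary tiers at 38% accuracy — only modestly above the ~25% expected by chance for four equally sized tiers, so the features carry limited signal for tier prediction.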

Report generated with R Markdown · Data Science Salary Analysis 2020–2025