## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'scales' was built under R version 4.5.3
# Load the raw loan records.
# NOTE(review): the path below is machine-specific and is kept verbatim; a
# project-relative path would make the script portable -- confirm with the author.
df <- read.csv("C:/Users/Esther N/OneDrive/Documents/RFiles/loan_data.csv")

# Remove unrealistic age outliers (age > 100).
# A bare `person_age <= 100` mask evaluates to NA for a missing age, and
# indexing a data frame with NA inserts an all-NA row. The is.na() guard makes
# the filter drop such rows instead.
keep_row <- !is.na(df$person_age) & df$person_age <= 100
df_clean <- df[keep_row, ]

# Convert character columns to factors
factor_cols <- c(
  "person_gender",
  "person_education",
  "person_home_ownership",
  "loan_intent",
  "previous_loan_defaults_on_file"
)
df_clean[factor_cols] <- lapply(df_clean[factor_cols], as.factor)

# Create labelled loan status factor (0 -> "Repaid", 1 -> "Defaulted")
df_clean$loan_status_f <- factor(
  df_clean$loan_status,
  levels = c(0, 1),
  labels = c("Repaid", "Defaulted")
)

# Log-transform income (financial data is typically log-normal)
df_clean$log_income <- log(df_clean$person_income)

# Report how many rows survived the cleaning step.
cat("Cleaned dataset size:", nrow(df_clean), "rows\n")
## Cleaned dataset size: 44993 rows
A log-linear model is used to handle the right-skewed distribution of income.
# Fit a linear model (lm) with log_income as the response and borrower age,
# education, employment experience, home ownership, loan amount, interest
# rate, and credit score as predictors, using the rows of train_df.
# NOTE(review): train_df is not created in the visible code -- presumably a
# training split of df_clean; confirm where it is defined.
model_log_lm <- lm(
log_income ~ person_age + person_education + person_emp_exp +
person_home_ownership + loan_amnt + loan_int_rate + credit_score,
data = train_df
)
# Print the fitted call, residual quantiles, coefficient table, and fit statistics.
summary(model_log_lm)##
## Call:
## lm(formula = log_income ~ person_age + person_education + person_emp_exp +
## person_home_ownership + loan_amnt + loan_int_rate + credit_score,
## data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9101 -0.3075 -0.0137 0.2899 3.6446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.071e+01 4.567e-02 234.480 < 2e-16 ***
## person_age 1.279e-02 1.371e-03 9.335 < 2e-16 ***
## person_educationBachelor 1.850e-02 6.618e-03 2.796 0.005179 **
## person_educationDoctorate -1.615e-02 2.162e-02 -0.747 0.455117
## person_educationHigh School 7.485e-03 6.773e-03 1.105 0.269144
## person_educationMaster 2.733e-02 7.940e-03 3.442 0.000578 ***
## person_emp_exp -4.280e-03 1.367e-03 -3.131 0.001746 **
## person_home_ownershipOTHER -1.342e-01 4.875e-02 -2.752 0.005925 **
## person_home_ownershipOWN -3.756e-01 1.039e-02 -36.162 < 2e-16 ***
## person_home_ownershipRENT -3.512e-01 5.279e-03 -66.534 < 2e-16 ***
## loan_amnt 3.309e-05 4.002e-07 82.679 < 2e-16 ***
## loan_int_rate -6.093e-03 8.489e-04 -7.178 7.2e-13 ***
## credit_score 4.318e-05 5.103e-05 0.846 0.397437
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4683 on 35981 degrees of freedom
## Multiple R-squared: 0.2901, Adjusted R-squared: 0.2899
## F-statistic: 1225 on 12 and 35981 DF, p-value: < 2.2e-16
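The mean of the residuals is checked as a sanity test: for a least-squares model that includes an intercept it is zero by construction (up to floating-point error), so a value near 0 confirms the fit ran correctly rather than showing that the model fits well.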
## Mean of residuals: -4.804171e-18
A decision tree classifier is trained to predict whether a borrower will default.
# Fit a classification tree (rpart with method = "class") that predicts the
# labelled loan outcome loan_status_f from borrower and loan attributes,
# using the rows of train_df.
# NOTE(review): train_df and test_df are not created in the visible code --
# presumably a train/test split of df_clean; confirm where they are defined.
model_tree <- rpart(
  loan_status_f ~ person_income + person_age + person_education +
    person_home_ownership + loan_amnt + loan_int_rate +
    loan_percent_income + credit_score + previous_loan_defaults_on_file,
  data = train_df,
  method = "class"
)

# Predict a class label for every row of test_df.
tree_preds <- predict(model_tree, test_df, type = "class")

# Cross-tabulate predictions against the recorded labels. The diagonal holds
# the correctly classified counts, so its share of the total is the accuracy.
conf_matrix <- table(Predicted = tree_preds, Actual = test_df$loan_status_f)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(conf_matrix)
## Actual
## Predicted Repaid Defaulted
## Repaid 6699 536
## Defaulted 271 1493
##
## Classification Accuracy: 91.03 %
# Colours for the loan outcome, keyed by factor level so the mapping does not
# depend on the order of the levels of loan_status_f.
status_colours <- c(Repaid = "#2ecc71", Defaulted = "#e74c3c")

# Figure 1: histogram of annual income on a log10 x-axis with comma-formatted
# axis labels.
ggplot(df_clean, aes(x = person_income)) +
  geom_histogram(bins = 50, fill = "#3498db", color = "white") +
  scale_x_log10(labels = comma) +
  labs(
    title = "Income Distribution (Log Scale)",
    x = "Annual Income (USD)",
    y = "Count"
  ) +
  theme_minimal()

# Figure 2: share of repaid vs defaulted loans within each home-ownership
# category (position = "fill" rescales every bar to a proportion of 1).
ggplot(df_clean, aes(x = person_home_ownership, fill = loan_status_f)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = status_colours) +
  labs(
    title = "Default Rate by Home Ownership",
    x = "Home Ownership",
    y = "Proportion",
    fill = "Status"
  ) +
  theme_minimal()

# Figure 3: loan amount against income (log10 x-axis), coloured by outcome.
# A low alpha keeps overlapping points readable.
ggplot(df_clean, aes(x = person_income, y = loan_amnt, color = loan_status_f)) +
  geom_point(alpha = 0.2) +
  scale_x_log10(labels = comma) +
  scale_color_manual(values = status_colours) +
  labs(
    title = "Loan Amount vs Income",
    x = "Income (Log Scale)",
    y = "Loan Amount",
    color = "Status"
  ) +
  theme_minimal()
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Africa/Nairobi
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.4.0 rpart_4.1.24 ggplot2_4.0.2
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.7.2 cli_3.6.5 knitr_1.51 rlang_1.1.7
## [5] xfun_0.57 generics_0.1.4 S7_0.2.1 jsonlite_2.0.0
## [9] labeling_0.4.3 glue_1.8.0 htmltools_0.5.9 sass_0.4.10
## [13] rmarkdown_2.31 grid_4.5.2 tibble_3.3.1 evaluate_1.0.5
## [17] jquerylib_0.1.4 fastmap_1.2.0 yaml_2.3.12 lifecycle_1.0.5
## [21] compiler_4.5.2 dplyr_1.2.0 RColorBrewer_1.1-3 pkgconfig_2.0.3
## [25] rstudioapi_0.18.0 farver_2.1.2 digest_0.6.39 R6_2.6.1
## [29] tidyselect_1.2.1 pillar_1.11.1 magrittr_2.0.4 bslib_0.10.0
## [33] withr_3.0.2 tools_4.5.2 gtable_0.3.6 cachem_1.1.0
This report analyses data science and technology compensation data spanning 2020 to 2025, covering 500 professionals across experience levels, employment types, company sizes, and remote-work arrangements. It combines exploratory data analysis, relationship modelling, and machine learning to surface actionable insights about how salaries vary — and how they can be predicted.
| 📊 Metric | 📈 Value |
|---|---|
| Median Salary | $151,000 |
| Mean Salary | $148,718 |
| Peak Salary | $268,508 |
| Unique Job Titles | 6 |
| Year | Experience | Employment | Salary (USD) | Remote | Company Size |
|---|---|---|---|---|---|
| 2022 | Entry | Full-Time | 134,687.97 | Hybrid | Large |
| 2020 | Entry | Full-Time | 72,028.51 | Fully Remote | Large |
| 2021 | Entry | Full-Time | 111,594.95 | On-Site | Medium |
| 2025 | Entry | Full-Time | 177,527.66 | Hybrid | Small |
| 2020 | Mid | Full-Time | 151,192.94 | On-Site | Small |
| 2022 | Executive | Full-Time | 183,090.30 | Fully Remote | Small |
| 2025 | Mid | Full-Time | 64,866.09 | Hybrid | Medium |
| 2024 | Mid | Full-Time | 107,575.33 | Fully Remote | Large |
| 2025 | Entry | Full-Time | 88,126.52 | Fully Remote | Large |
| 2021 | Executive | Freelance | 219,999.82 | On-Site | Large |
| Records | Years | Min Salary | Max Salary | Median |
|---|---|---|---|---|
| 500 | 2020 – 2025 | $20,501.47 | $268,508 | $151,000 |
| Job Category | Low | Medium | High | Very High |
|---|---|---|---|---|
| Data / ML Engineer | 64 | 57 | 53 | 60 |
| Data Analyst | 16 | 29 | 31 | 22 |
| Data Scientist | 45 | 39 | 41 | 43 |
| Model | RMSE | R² |
|---|---|---|
| Base Linear Regression | $38,001 | 0.330 |
| Log-Transformed Regression | $38,173 | 0.324 |
Overall Classification Accuracy: 38%
| Finding | Insight |
|---|---|
| Experience drives pay | Executive-level professionals earn ~2–3× more than entry-level counterparts on average. |
| Remote work pays | Fully remote roles show comparable or higher median salaries than on-site equivalents. |
| Company size matters | Large companies tend to offer higher salaries, especially for full-time roles. |
| Regression performance | Base LM achieved R² = 0.33 on the test set — reasonable for salary data with categorical predictors. |
| Classification accuracy | Random Forest classified salary tiers at 38% accuracy — above the ~25% chance baseline for four balanced tiers, but only a modest signal in the features. |
Report generated with R Markdown · Data Science Salary Analysis 2020–2025