mydata <- read.table("./credit_risk_dataset_updated.csv", header=TRUE, sep=";", dec=".")
head(mydata)
## person_age person_income loan_intent loan_grade loan_amnt loan_status
## 1 22 59000 PERSONAL D 35000 1
## 2 21 9600 EDUCATION B 1000 0
## 3 25 9600 MEDICAL C 5500 1
## 4 23 65500 MEDICAL C 35000 1
## 5 24 54400 MEDICAL C 35000 1
## 6 21 9900 VENTURE A 2500 1
## loan_percent_income
## 1 0.59
## 2 0.10
## 3 0.57
## 4 0.53
## 5 0.55
## 6 0.25
I limit the dataset to 150 randomly selected observations for the purpose of hypothesis testing
set.seed(123)
mydata_sample <- mydata[sample(nrow(mydata), 150), ]
head(mydata_sample)
## person_age person_income loan_intent loan_grade loan_amnt
## 18847 30 27996 MEDICAL C 1500
## 18895 29 18000 MEDICAL B 1700
## 26803 28 114000 VENTURE E 8000
## 25102 29 34999 MEDICAL B 12000
## 28867 28 80000 EDUCATION B 21000
## 2986 24 25000 DEBTCONSOLIDATION C 3200
## loan_status loan_percent_income
## 18847 0 0.05
## 18895 0 0.09
## 26803 0 0.07
## 25102 1 0.34
## 28867 0 0.26
## 2986 1 0.13
General:
Variables:
person_age
person_income
loan_intent
loan_grade
loan_amnt
loan_status
loan_percent_income
head(mydata_sample)
## person_age person_income loan_intent loan_grade loan_amnt
## 18847 30 27996 MEDICAL C 1500
## 18895 29 18000 MEDICAL B 1700
## 26803 28 114000 VENTURE E 8000
## 25102 29 34999 MEDICAL B 12000
## 28867 28 80000 EDUCATION B 21000
## 2986 24 25000 DEBTCONSOLIDATION C 3200
## loan_status loan_percent_income
## 18847 0 0.05
## 18895 0 0.09
## 26803 0 0.07
## 25102 1 0.34
## 28867 0 0.26
## 2986 1 0.13
mydata_sample$loan_status <- factor(mydata_sample$loan_status,
levels = c(0, 1),
labels = c("non-default", "default"))
names(mydata_sample)[names(mydata_sample) == "person_age"] <- "Age"
names(mydata_sample)[names(mydata_sample) == "person_income"] <- "Income"
names(mydata_sample)[names(mydata_sample) == "loan_grade"] <- "Grade"
names(mydata_sample)[names(mydata_sample) == "loan_percent_income"] <- "Income/Loan"
head(mydata_sample)
## Age Income loan_intent Grade loan_amnt loan_status Income/Loan
## 18847 30 27996 MEDICAL C 1500 non-default 0.05
## 18895 29 18000 MEDICAL B 1700 non-default 0.09
## 26803 28 114000 VENTURE E 8000 non-default 0.07
## 25102 29 34999 MEDICAL B 12000 default 0.34
## 28867 28 80000 EDUCATION B 21000 non-default 0.26
## 2986 24 25000 DEBTCONSOLIDATION C 3200 default 0.13
Descriptive statistics
library(pastecs)
round(stat.desc(mydata_sample[ , -3])[ , -5][ , -3], 1)
## Age Income loan_amnt Income/Loan
## nbr.val 150.0 150.0 150.0 150.0
## nbr.null 0.0 0.0 0.0 0.0
## nbr.na 0.0 0.0 0.0 0.0
## min 21.0 18000.0 1000.0 0.0
## max 58.0 207182.0 30000.0 0.5
## range 37.0 189182.0 29000.0 0.5
## sum 4202.0 8597444.0 1377325.0 27.1
## median 26.0 50500.0 8000.0 0.2
## mean 28.0 57316.3 9182.2 0.2
## SE.mean 0.5 2617.3 471.2 0.0
## CI.mean.0.95 1.0 5171.7 931.1 0.0
## var 39.0 1027504881.3 33307620.3 0.0
## std.dev 6.2 32054.7 5771.3 0.1
## coef.var 0.2 0.6 0.6 0.6
Calculated descriptive statistics for the numeric variables in the dataset, excluding categorical variables.
The mean income is 57.316,3 while the mean loan amount is 9.182,2. These averages might be influenced by outliers, such as extremely high incomes.
The median income is 50.500, which is lower than the mean – this suggests a right-skewed distribution, where a few applicants earn more.
The median loan amount is 8.000, indicating that half of the applicants requested loans below this value and the other half requested more. (The median is considered to be part of the lower half)
The Income-to-Loan ratio has a mean of 0,2 and a median of 0,2, suggesting that on average.
Test whether the mean personal income differs between people who defaulted on their loan and those who did not
Variables
Null Hypothesis - The mean personal income is equal between the two loan status groups.
H1 - The mean personal income is different between the two groups.
library(psych)
describeBy(mydata_sample$Income, mydata_sample$loan_status)
##
## Descriptive statistics by group
## group: non-default
## vars n mean sd median trimmed mad min max range skew
## X1 1 118 62238.35 32655.28 58500 58356.22 29533.39 18000 207182 189182 1.33
## kurtosis se
## X1 2.59 3006.16
## ------------------------------------------------------------
## group: default
## vars n mean sd median trimmed mad min max range skew
## X1 1 32 39166.22 21947.83 30500 35306.5 12901.59 18000 95000 77000 1.37
## kurtosis se
## X1 0.73 3879.87
#install.packages("ggpubr")
library(ggplot2)
library(ggpubr)
non_default <- ggplot(mydata_sample[mydata_sample$loan_status == "non-default", ],
aes(x = Income)) +
theme_linedraw() +
geom_histogram(binwidth = 5000, col = "black", fill = "lightblue") +
ylab("Frequency") +
ggtitle("Non-Default")
default <- ggplot(mydata_sample[mydata_sample$loan_status == "default", ],
aes(x = Income)) +
theme_linedraw() +
geom_histogram(binwidth = 5000, col = "black", fill = "lightcoral") +
ylab("Frequency") +
ggtitle("Default")
ggarrange(non_default, default,
ncol = 2, nrow = 1)
Since normality seems to be violated, the Shapiro test will be used to be sure
library(rstatix)
library(dplyr)
mydata_sample %>%
group_by(loan_status) %>%
shapiro_test(Income)
## # A tibble: 2 × 4
## loan_status variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 non-default Income 0.905 0.000000439
## 2 default Income 0.791 0.0000272
The null hypothesis of the Shapiro test is rejected, indicating that the data is not normally distributed.
wilcox.test(Income ~ loan_status,
data = mydata_sample,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon rank sum test
##
## data: Income by loan_status
## W = 2814, p-value = 2.151e-05
## alternative hypothesis: true location shift is not equal to 0
library(effectsize)
rank_biserial(Income ~ loan_status,
data = mydata_sample,
alternative = "two.sided")
## r (rank biserial) | 95% CI
## --------------------------------
## 0.49 | [0.30, 0.64]
Explanation
Based on the sample data, we find that individuals who defaulted on their loan differ significantly in personal income compared to those who did not (𝑝 < 0.001). Those who defaulted tend to have lower incomes. The difference in distribution locations is large (𝑟 = 0.49)
Parametrical Test - Trying to figure out which variables are best suited for a parametric test.⬇️
head(mydata_sample)
## Age Income loan_intent Grade loan_amnt loan_status Income/Loan
## 18847 30 27996 MEDICAL C 1500 non-default 0.05
## 18895 29 18000 MEDICAL B 1700 non-default 0.09
## 26803 28 114000 VENTURE E 8000 non-default 0.07
## 25102 29 34999 MEDICAL B 12000 default 0.34
## 28867 28 80000 EDUCATION B 21000 non-default 0.26
## 2986 24 25000 DEBTCONSOLIDATION C 3200 default 0.13
library(psych)
describeBy(x = mydata_sample$Income, group = mydata_sample$Grade)
##
## Descriptive statistics by group
## group: A
## vars n mean sd median trimmed mad min max range skew
## X1 1 48 61093.92 30738.13 58500 57820.82 26870.64 20000 150075 130075 0.97
## kurtosis se
## X1 0.53 4436.67
## ------------------------------------------------------------
## group: B
## vars n mean sd median trimmed mad min max range skew
## X1 1 49 59520.31 38557.19 47928 54241.78 32718.02 18000 207182 189182 1.56
## kurtosis se
## X1 2.91 5508.17
## ------------------------------------------------------------
## group: C
## vars n mean sd median trimmed mad min max range skew
## X1 1 32 47607.06 23619.99 38995 44493.46 19347.93 21600 105000 83400 0.98
## kurtosis se
## X1 -0.15 4175.46
## ------------------------------------------------------------
## group: D
## vars n mean sd median trimmed mad min max range skew
## X1 1 13 44505 22715.37 43680 42081 27694.97 21324 94350 73026 0.78
## kurtosis se
## X1 -0.59 6300.11
## ------------------------------------------------------------
## group: E
## vars n mean sd median trimmed mad min max range skew
## X1 1 5 78090 27414.06 63000 78090 17791.2 51000 114000 63000 0.29
## kurtosis se
## X1 -2.09 12259.94
## ------------------------------------------------------------
## group: F
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2 84000 15556.35 84000 84000 16308.6 73000 95000 22000 0 -2.75
## se
## X1 11000
## ------------------------------------------------------------
## group: G
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1 88000 NA 88000 88000 0 88000 88000 0 NA NA NA
-Filtering out data subsets with fewer than three observations⬇️
library(dplyr)
filtered_data <- mydata_sample %>%
group_by(Grade) %>%
filter(n() >= 3) %>%
ungroup()
-Checking the normality of Income across different grade categories(sadly not suitable for Parametrical Test)⬇️
library(rstatix)
filtered_data %>%
group_by(Grade) %>%
shapiro_test(Income)
## # A tibble: 5 × 4
## Grade variable statistic p
## <chr> <chr> <dbl> <dbl>
## 1 A Income 0.919 0.00280
## 2 B Income 0.855 0.0000258
## 3 C Income 0.874 0.00144
## 4 D Income 0.893 0.106
## 5 E Income 0.870 0.265
NEW Hypothesis
Test whether the mean Income-to-Loan ratio differs between selected “loan_intent” : Medical, Debt Consolidation, and Home Improvement⬇️
Subset Information
Variables
Income/Loan
- Later renamed because it seems R does not
like “/”)Null Hypothesis (H₀)
- There is no difference in the mean Income-to-Loan ratio between the
selected loan intent categories
Alternative Hypothesis (H₁)
- There is a difference in the mean Income-to-Loan ratio between at
least one of the selected loan intent categories.
library(rstatix)
library(dplyr)
mydata_sample %>%
filter(loan_intent %in% c("MEDICAL", "DEBTCONSOLIDATION", "HOMEIMPROVEMENT")) %>%
group_by(loan_intent) %>%
get_summary_stats(`Income/Loan`, type = "mean_sd")
## # A tibble: 3 × 5
## loan_intent variable n mean sd
## <chr> <fct> <dbl> <dbl> <dbl>
## 1 DEBTCONSOLIDATION Income/Loan 25 0.24 0.112
## 2 HOMEIMPROVEMENT Income/Loan 13 0.122 0.057
## 3 MEDICAL Income/Loan 32 0.186 0.106
-Rename⬇️
mydata_sample <- mydata_sample %>%
rename(IncomeLoan = `Income/Loan`)
-Shapiro test to check for normality ⬇️
mydata_sample %>%
filter(loan_intent %in% c("MEDICAL", "DEBTCONSOLIDATION", "HOMEIMPROVEMENT")) %>%
group_by(loan_intent) %>%
shapiro_test(IncomeLoan)
## # A tibble: 3 × 4
## loan_intent variable statistic p
## <chr> <chr> <dbl> <dbl>
## 1 DEBTCONSOLIDATION IncomeLoan 0.980 0.879
## 2 HOMEIMPROVEMENT IncomeLoan 0.963 0.804
## 3 MEDICAL IncomeLoan 0.940 0.0726
-Levene’s Test for Homogeneity of Variance ⬇️
#install.packages(car)
library(car)
leveneTest(IncomeLoan ~ loan_intent,
data = mydata_sample,
subset = loan_intent %in% c("MEDICAL", "DEBTCONSOLIDATION", "HOMEIMPROVEMENT"))
## Levene's Test for Homogeneity of Variance (center = median: loan_intent %in% c("MEDICAL", "DEBTCONSOLIDATION", "HOMEIMPROVEMENT"))
## Df F value Pr(>F)
## group 5 1.1648 0.3294
## 144
-The conditions of normality and homogeneity of variance are fulfilled - none of the null hypotheses were rejected.⬆️
-Anova Test to see whether the mean Income to Loan ratio differs across three selected loan intent categories⬇️
ANOVA_results <- aov(IncomeLoan ~ loan_intent,
data = mydata_sample,
subset = loan_intent %in% c("MEDICAL", "DEBTCONSOLIDATION", "HOMEIMPROVEMENT"))
summary(ANOVA_results)
## Df Sum Sq Mean Sq F value Pr(>F)
## loan_intent 2 0.1243 0.06215 6.037 0.00388 **
## Residuals 67 0.6897 0.01029
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-(p = 0.004) Reject the null hypothesis, there is a significant difference in the Income to Loan ratio between at least one of the selected loan intent groups⬆️
library(effectsize)
eta_squared(ANOVA_results)
## For one-way between subjects designs, partial eta squared is equivalent
## to eta squared. Returning eta squared.
## # Effect Size for ANOVA
##
## Parameter | Eta2 | 95% CI
## ---------------------------------
## loan_intent | 0.15 | [0.03, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
interpret_eta_squared(0.15, rules = "cohen1992")
## [1] "medium"
## (Rules: cohen1992)
-Medium strength between loan intent and Income/loan ratio⬆️
-Doing pairwise test with Bonferroni correction in order to identify which specific groups differ form another⬇️
pairwise.t.test(x = mydata_sample$IncomeLoan,
g = mydata_sample$loan_intent,
p.adj = "bonf",
subset = mydata_sample$loan_intent %in% c("MEDICAL", "DEBTCONSOLIDATION", "HOMEIMPROVEMENT"))
##
## Pairwise comparisons using t tests with pooled SD
##
## data: mydata_sample$IncomeLoan and mydata_sample$loan_intent
##
## DEBTCONSOLIDATION EDUCATION HOMEIMPROVEMENT MEDICAL PERSONAL
## EDUCATION 1.0000 - - - -
## HOMEIMPROVEMENT 0.0091 0.5229 - - -
## MEDICAL 0.6026 1.0000 0.7679 - -
## PERSONAL 0.1343 1.0000 1.0000 1.0000 -
## VENTURE 0.0200 1.0000 1.0000 1.0000 1.0000
##
## P value adjustment method: bonferroni
**Results:️*↕**
library(dplyr)
library(rstatix)
library(ggpubr)
filtered_data <- mydata_sample %>%
filter(loan_intent %in% c("MEDICAL", "DEBTCONSOLIDATION", "HOMEIMPROVEMENT"))
pwc <- filtered_data %>%
pairwise_t_test(IncomeLoan ~ loan_intent,
paired = FALSE,
p.adjust.method = "bonferroni")
ANOVA_results <- filtered_data %>%
anova_test(dv = IncomeLoan, between = loan_intent)
pwc <- pwc %>%
add_y_position(fun = "median", step.increase = 0.2)
ggboxplot(filtered_data,
x = "loan_intent", y = "IncomeLoan",
add = "point", ylim = c(0, 1)) + # Anpassung je nach Skala
stat_pvalue_manual(pwc, hide.ns = FALSE) +
stat_summary(fun = mean, geom = "point", shape = 16, size = 4,
aes(group = loan_intent), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = mean, colour = "darkred",
position = position_dodge(width = 0.8),
geom = "text", vjust = -0.5, hjust = -1,
aes(label = round(after_stat(y), 2), group = loan_intent)) +
labs(subtitle = get_test_label(ANOVA_results, detailed = TRUE),
caption = get_pwc_label(pwc))
Explanation
We found that the Income-to-Loan ratio differs significantly for at least one of the selected loan intent groups – Medical, Debt Consolidation, and Home Improvement (F(2, 67) = 6.04, p = 0.004). The effect size was medium.
Post-hoc tests using Bonferroni correction revealed a significant difference between Debt Consolidation and Home Improvement (p = 0.01), while no significant differences were observed between Medical and the other groups.