Default Rate — Bar Chart → About the overall count of customers who defaulted vs did not default
# Bar chart of how many customers fall in each default-status class,
# with the bar's count printed just above it.
ggplot(
  df,
  aes(x = default.payment.next.month, fill = default.payment.next.month)
) +
  geom_bar(width = 0.5) +
  # after_stat(count) reads the value computed by stat = "count"; it
  # replaces the `..count..` notation deprecated in ggplot2 3.4.0.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  scale_fill_manual(values = c("#2ecc71", "#e74c3c")) +
  labs(
    title = "Default vs Non-Default Customers",
    x = "Default Status",
    y = "Count"
  ) +
  theme_minimal() +
  # Fill repeats the x variable, so a legend would be redundant.
  theme(legend.position = "none")
Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(count)` instead.
#Summary: The majority of customers did not default, with roughly 78% non-default vs 22% default.
Credit Limit Distribution — Histogram → About how credit limits are distributed across all customers.
# Histogram of LIMIT_BAL across all rows of df (50 bins), with the
# x-axis tick labels formatted by `comma`.
ggplot(data = df, mapping = aes(x = LIMIT_BAL)) +
  geom_histogram(
    bins = 50,
    fill = "#3498db",
    color = "white"
  ) +
  scale_x_continuous(labels = comma) +
  labs(
    title = "Distribution of Credit Limit",
    x = "Credit Limit (NT$)",
    y = "Count"
  ) +
  theme_minimal()
#Summary: Most customers have lower credit limits, with the distribution heavily right-skewed.
Age Distribution — Histogram with Density → About the age spread of the entire customer base.
# Histogram of AGE drawn on the density scale, overlaid with a kernel
# density curve so both layers share the same y axis.
ggplot(df, aes(x = AGE)) +
  geom_histogram(
    # after_stat(density) replaces the deprecated `..density..` notation
    # (deprecated in ggplot2 3.4.0) and rescales bars to density.
    aes(y = after_stat(density)),
    bins = 40,
    fill = "#9b59b6",
    color = "white",
    alpha = 0.7
  ) +
  geom_density(color = "#2c3e50", linewidth = 1.2) +
  labs(
    title = "Age Distribution of Customers",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal()
#Summary: The customer base is predominantly young adults between ages 25–40.
Credit Limit by Default Status — Density Plot → About how credit limit distribution differs between defaulters and non-defaulters
# Overlapping, semi-transparent density curves of LIMIT_BAL, one per
# default-status class.
ggplot(
  data = df,
  mapping = aes(x = LIMIT_BAL, fill = default.payment.next.month)
) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(labels = comma) +
  scale_fill_manual(values = c("#27ae60", "#c0392b")) +
  labs(
    title = "Credit Limit Density by Default Status",
    x = "Credit Limit",
    fill = "Status"
  ) +
  theme_minimal()
#Summary: Non-defaulters tend to have higher credit limits compared to defaulters.
Age by Default Status — Ridgeline Plot → About how age distribution compares between defaulters and non-defaulters.
# Ridgeline plot: one AGE density ridge per default-status class.
ggplot(
  data = df,
  mapping = aes(
    x = AGE,
    y = default.payment.next.month,
    fill = default.payment.next.month
  )
) +
  geom_density_ridges(alpha = 0.7, scale = 1.2) +
  scale_fill_manual(values = c("#1abc9c", "#e74c3c")) +
  labs(
    title = "Age Distribution by Default Status",
    x = "Age",
    y = ""
  ) +
  theme_ridges() +
  # The y axis already names each class, so the fill legend is hidden.
  theme(legend.position = "none")
Picking joint bandwidth of 1.3
#Summary: Defaulters and non-defaulters have similar age distributions, with slight differences in the younger age bands.
Education Level — Bar Chart → About how many customers fall under each education category
# Customer count per EDUCATION level, bars ordered from largest to
# smallest count, each labelled with its count.
df %>%
  count(EDUCATION) %>%
  ggplot(
    aes(
      # reorder() on -n sorts the categories by descending count.
      x = reorder(EDUCATION, -n),
      y = n,
      fill = EDUCATION
    )
  ) +
  # geom_col() draws bars from precomputed y values.
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4) +
  scale_fill_viridis_d() +
  labs(
    title = "Customer Count by Education Level",
    x = "Education",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
#Summary: University-educated customers form the largest group in the dataset.
Default Rate by Education — Stacked Bar → About the proportion of defaulters within each education level.
# Stacked bars: within each EDUCATION level, the fraction of customers in
# each default-status class (fractions sum to 1 per level).
df %>%
  count(EDUCATION, default.payment.next.month) %>%
  group_by(EDUCATION) %>%
  # sum(n) is evaluated per EDUCATION group, so pct is a within-level share.
  mutate(pct = n / sum(n)) %>%
  ggplot(
    aes(x = EDUCATION, y = pct, fill = default.payment.next.month)
  ) +
  geom_col() +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(values = c("#27ae60", "#c0392b")) +
  labs(
    title = "Default Rate by Education Level",
    x = "Education",
    y = "Percentage",
    fill = "Status"
  ) +
  theme_minimal()
#Summary: Customers with "Others" and lower education levels show relatively higher default rates.
Default Rate by Marriage Status — Grouped Bar → About how default rates vary across different marital statuses.
# Side-by-side bars: within each MARRIAGE category, the percentage of
# customers in each default-status class.
df %>%
  count(MARRIAGE, default.payment.next.month) %>%
  group_by(MARRIAGE) %>%
  # Within-category share, expressed on a 0-100 scale.
  mutate(pct = n / sum(n) * 100) %>%
  ggplot(
    aes(x = MARRIAGE, y = pct, fill = default.payment.next.month)
  ) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#2980b9", "#e74c3c")) +
  labs(
    title = "Default Rate by Marital Status",
    x = "Marital Status",
    y = "%",
    fill = "Status"
  ) +
  theme_minimal()
#Summary: Single customers have a slightly higher default rate compared to married customers.
Default Share by Gender — Pie Chart → About the gender-wise share of customers who defaulted.
# Pie chart of how the defaulting customers split by gender.
#
# The shares are computed among defaulters only, so the slices are parts of
# one whole and sum to 1. Computing the default rate inside each gender and
# plotting those rates as slices would not represent a share of anything.
df %>%
  filter(default.payment.next.month == "Default") %>%
  count(SEX) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = "", y = pct, fill = SEX)) +
  # A single full-width stacked bar wrapped onto polar coordinates forms
  # the pie.
  geom_col(width = 1) +
  coord_polar("y") +
  scale_fill_manual(values = c("#3498db", "#e91e63")) +
  labs(title = "Default Share by Gender", fill = "Gender") +
  theme_void()
#Summary: Female customers make up a slightly larger share of the customer base but default rates are comparable across genders.
Gender Distribution by Default — Faceted Bar → About how male and female customers are distributed across default and non-default groups.
# Count of customers per SEX, drawn in one panel per default-status class.
ggplot(data = df, mapping = aes(x = SEX, fill = SEX)) +
  geom_bar() +
  facet_wrap(~default.payment.next.month) +
  scale_fill_manual(values = c("#3498db", "#e91e63")) +
  labs(
    title = "Gender Distribution by Default Status",
    x = "Gender",
    y = "Count"
  ) +
  theme_minimal() +
  # Fill repeats the x variable, so the legend is hidden.
  theme(legend.position = "none")
#Summary: Both gender groups are well represented across default and non-default categories with no extreme imbalance.
Credit Limit vs Age — Scatter with Regression → About the relationship between a customer’s age and their assigned credit limit
# Scatter of LIMIT_BAL against AGE coloured by default status, with one
# straight-line (lm) fit per status and no confidence band.
ggplot(
  data = df,
  mapping = aes(
    x = AGE,
    y = LIMIT_BAL,
    color = default.payment.next.month
  )
) +
  # Low alpha and small points keep overplotting readable.
  geom_point(alpha = 0.2, size = 1) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.2) +
  scale_y_continuous(labels = comma) +
  scale_color_manual(values = c("#27ae60", "#e74c3c")) +
  labs(
    title = "Credit Limit vs Age by Default Status",
    x = "Age",
    y = "Credit Limit",
    color = "Status"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
#Summary: Credit limit tends to increase with age, and non-defaulters generally receive higher limits at every age.
Credit Limit by Education — Boxplot → About how credit limits vary across different education levels.
# Boxplot of LIMIT_BAL for each EDUCATION level; outlier points are drawn
# faintly (alpha 0.2).
ggplot(
  data = df,
  mapping = aes(x = EDUCATION, y = LIMIT_BAL, fill = EDUCATION)
) +
  geom_boxplot(outlier.alpha = 0.2) +
  scale_y_continuous(labels = comma) +
  scale_fill_viridis_d() +
  labs(
    title = "Credit Limit by Education Level",
    x = "Education",
    y = "Credit Limit"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
#Summary: Graduate-level customers have noticeably higher credit limits than those with lower education levels.
Credit Limit by Default — Violin Plot → About the full distribution shape of credit limits for defaulters vs non-defaulters.
# Violin plot of LIMIT_BAL per default-status class, with a narrow white
# boxplot drawn inside each violin.
ggplot(
  data = df,
  mapping = aes(
    x = default.payment.next.month,
    y = LIMIT_BAL,
    fill = default.payment.next.month
  )
) +
  # trim = FALSE extends the violin tails beyond the data range.
  geom_violin(trim = FALSE, alpha = 0.7) +
  geom_boxplot(width = 0.08, fill = "white") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = c("#1abc9c", "#e74c3c")) +
  labs(
    title = "Credit Limit Distribution by Default Status",
    x = "",
    y = "Credit Limit"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
#Summary: Non-defaulters have a wider and higher-shifted credit limit distribution than defaulters.
Age vs Credit Limit — Hexbin Density → About where the densest concentration of customers falls when plotting age against credit limit.
# Hexagonal-bin heatmap of AGE against LIMIT_BAL; fill encodes the number
# of rows falling in each hexagon.
ggplot(data = df, mapping = aes(x = AGE, y = LIMIT_BAL)) +
  geom_hex(bins = 40) +
  scale_fill_viridis_c() +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Age vs Credit Limit — Density",
    x = "Age",
    y = "Credit Limit",
    fill = "Count"
  ) +
  theme_minimal()
#Summary: The densest customer cluster sits between ages 25–40 with credit limits under 200,000 NT$.
Bill Amount vs Payment Amount — Scatter → About the relationship between how much customers are billed and how much they actually pay.
# Scatter of PAY_AMT1 against BILL_AMT1 on log10 axes, coloured by default
# status.
ggplot(
  # Keep only strictly positive amounts: both axes are log-scaled.
  data = df %>% filter(BILL_AMT1 > 0, PAY_AMT1 > 0),
  mapping = aes(
    x = BILL_AMT1,
    y = PAY_AMT1,
    color = default.payment.next.month
  )
) +
  geom_point(alpha = 0.15, size = 0.8) +
  scale_x_log10(labels = comma) +
  scale_y_log10(labels = comma) +
  scale_color_manual(values = c("#27ae60", "#e74c3c")) +
  labs(
    title = "Bill Amount vs Payment Amount (log scale)",
    x = "Bill Amt (log)",
    y = "Pay Amt (log)",
    color = "Status"
  ) +
  theme_minimal()
#Summary: Defaulters consistently pay far less relative to their bill amounts compared to non-defaulters.
Correlation Matrix — Corrplot → About the pairwise correlations between all numeric variables in the dataset.
# Numeric columns used for the correlation matrix: credit limit, age, and
# the six monthly bill and payment amounts.
num_df <- df %>%
  select(LIMIT_BAL, AGE, BILL_AMT1:BILL_AMT6, PAY_AMT1:PAY_AMT6)

# Upper-triangle colour-coded correlation plot with the coefficient printed
# in each cell. This must be a separate statement from the assignment
# above; running the two together on one line is a parse error.
corrplot(
  cor(num_df),
  method = "color",
  type = "upper",
  tl.cex = 0.7,
  tl.col = "black",
  addCoef.col = "black",
  number.cex = 0.5,
  title = "Correlation Matrix",
  # Top margin of 1 leaves room for the title.
  mar = c(0, 0, 1, 0)
)
#Summary: Bill amounts across months are very highly correlated with each other, indicating stable spending behavior over time.
Correlation Heatmap — ggcorrplot → About which groups of numeric variables are most strongly correlated with each other.
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the ggcorrplot package.
Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
#Summary: Payment amounts form one cluster and bill amounts form another, confirming their internal consistency over months.
Average Bill Amount per Month — Line Plot → About how the average bill amount trends across the 6 recorded months.
# Line plot of the mean of each BILL_AMT1..BILL_AMT6 column over all rows.
df %>%
  summarise(across(BILL_AMT1:BILL_AMT6, mean)) %>%
  # One row per bill column: Month holds the column name, Avg_Bill its mean.
  pivot_longer(
    everything(),
    names_to = "Month",
    values_to = "Avg_Bill"
  ) %>%
  # Turn "BILL_AMT3" into the axis label "Month 3".
  mutate(Month = gsub("BILL_AMT", "Month ", Month)) %>%
  # group = 1 joins all points with a single line across the discrete axis.
  ggplot(aes(x = Month, y = Avg_Bill, group = 1)) +
  geom_line(color = "#e67e22", linewidth = 1.3) +
  geom_point(size = 3, color = "#d35400") +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Average Bill Amount Over 6 Months",
    x = "",
    y = "Avg Bill Amount"
  ) +
  theme_minimal()
#Summary: Average bill amounts remain relatively stable across the 6-month period with minor fluctuation.
Avg Bill by Default Status — Line Comparison → About how average monthly bill amounts differ between defaulters and non-defaulters over 6 months.
# Mean of each BILL_AMT1..BILL_AMT6 column per default-status class, in
# long form: one row per (status, month) with the month labelled "M1".."M6".
bill_long <- df %>%
  group_by(default.payment.next.month) %>%
  summarise(across(BILL_AMT1:BILL_AMT6, mean)) %>%
  pivot_longer(
    -default.payment.next.month,
    names_to = "Month",
    values_to = "Avg_Bill"
  ) %>%
  mutate(Month = gsub("BILL_AMT", "M", Month))

# One line per default-status class. This must be a separate statement
# from the assignment above; fusing them on one line is a parse error.
ggplot(
  bill_long,
  aes(
    x = Month,
    y = Avg_Bill,
    color = default.payment.next.month,
    # Explicit group so each class gets its own line on the discrete axis.
    group = default.payment.next.month
  )
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  scale_y_continuous(labels = comma) +
  scale_color_manual(values = c("#27ae60", "#c0392b")) +
  labs(
    title = "Avg Bill Amount Over Time by Default Status",
    x = "Month",
    y = "Avg Bill",
    color = "Status"
  ) +
  theme_minimal()
#Summary: Defaulters consistently carry slightly lower average bill amounts than non-defaulters across all months.
Avg Payment Amount per Month — Line Plot → About how average monthly payment amounts compare between defaulters and non-defaulters over time.
# Mean of each PAY_AMT1..PAY_AMT6 column per default-status class, drawn
# as one line per class.
df %>%
  group_by(default.payment.next.month) %>%
  summarise(across(PAY_AMT1:PAY_AMT6, mean)) %>%
  pivot_longer(
    -default.payment.next.month,
    names_to = "Month",
    values_to = "Avg_Pay"
  ) %>%
  # Turn "PAY_AMT3" into the axis label "M3".
  mutate(Month = gsub("PAY_AMT", "M", Month)) %>%
  ggplot(
    aes(
      x = Month,
      y = Avg_Pay,
      color = default.payment.next.month,
      group = default.payment.next.month
    )
  ) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  scale_y_continuous(labels = comma) +
  scale_color_manual(values = c("#1abc9c", "#e74c3c")) +
  labs(
    title = "Avg Payment Amount Over Time by Default Status",
    x = "Month",
    y = "Avg Payment",
    color = "Status"
  ) +
  theme_minimal()
#Summary: Non-defaulters pay significantly more on average each month, widening the gap with defaulters over time.
Default Rate by Pay Status (PAY_0) — Filled Bar → About how the likelihood of default changes with the severity of payment delay.
# For each distinct PAY_0 value, a full-height bar split by default status,
# so bar segments show within-value proportions.
ggplot(
  data = df,
  # PAY_0 is treated as a discrete category rather than a number.
  mapping = aes(x = factor(PAY_0), fill = default.payment.next.month)
) +
  # position = "fill" normalises every bar to a total height of 1.
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
  labs(
    title = "Default Rate by Repayment Status (Most Recent Month)",
    x = "Pay Status (-2=No Consumption, -1=Paid Duly, 1-8=Months Delayed)",
    y = "Proportion",
    fill = "Status"
  ) +
  theme_minimal()
#Summary: Customers with longer payment delays (higher PAY_0 values) have dramatically higher default rates.
Payment Status Heatmap — Tile → About how frequently each payment status occurs across all 6 months.
# Long-form payment status: one row per (ID, month column), keeping the
# default-status column alongside.
pay_long <- df %>%
  select(ID, PAY_0:PAY_6, default.payment.next.month) %>%
  pivot_longer(
    PAY_0:PAY_6,
    names_to = "Month",
    values_to = "PayStatus"
  )

# Tile heatmap of how often each status value occurs in each month column.
# This must be a separate statement from the assignment above; fusing them
# on one line is a parse error.
pay_long %>%
  count(Month, PayStatus) %>%
  ggplot(aes(x = Month, y = factor(PayStatus), fill = n)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(labels = comma) +
  labs(
    title = "Heatmap of Payment Status Across Months",
    x = "Month",
    y = "Payment Status",
    fill = "Count"
  ) +
  theme_minimal()
#Summary: The most common payment behavior is "paid duly" (−1), with very few customers in extreme delay categories.
Pay Status Distribution — Faceted Histogram → About the distribution of payment statuses individually for each of the 6 months.
# Bar chart of payment-status counts, one panel per PAY_* column.
df %>%
  select(PAY_0:PAY_6) %>%
  pivot_longer(
    everything(),
    names_to = "Month",
    values_to = "Status"
  ) %>%
  ggplot(aes(x = factor(Status), fill = Month)) +
  geom_bar() +
  # Each panel gets its own y range so small months are not flattened.
  facet_wrap(~Month, scales = "free_y") +
  scale_fill_viridis_d() +
  labs(
    title = "Payment Status Distribution Across Months",
    x = "Status",
    y = "Count"
  ) +
  theme_minimal() +
  # Panels are already titled by Month, so the fill legend is hidden.
  theme(legend.position = "none")
#Summary: Payment behavior is fairly consistent across months, with on-time payments dominating in all periods.
Default Rate by Age Group — Bar Chart → About which age group has the highest and lowest default rates.
# Percentage of customers labelled "Default" within each age band, shown
# as labelled bars.
df %>%
  mutate(
    AgeGroup = cut(
      AGE,
      # Bins are right-closed: [20,30], (30,40], (40,50], (50,60], (60,Inf).
      # The open upper bound matches the "60+" label, and include.lowest
      # keeps AGE == 20; a finite top break of 80 without include.lowest
      # would turn both edge cases into NA.
      breaks = c(20, 30, 40, 50, 60, Inf),
      labels = c("20-30", "30-40", "40-50", "50-60", "60+"),
      include.lowest = TRUE
    )
  ) %>%
  group_by(AgeGroup) %>%
  # Mean of a logical vector is the fraction TRUE; * 100 gives a percent.
  summarise(
    DefaultRate = mean(default.payment.next.month == "Default") * 100
  ) %>%
  ggplot(aes(x = AgeGroup, y = DefaultRate, fill = AgeGroup)) +
  geom_col() +
  geom_text(aes(label = sprintf("%.1f%%", DefaultRate)), vjust = -0.4) +
  scale_fill_viridis_d() +
  labs(
    title = "Default Rate by Age Group",
    x = "Age Group",
    y = "Default Rate (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
#Summary: The 20–30 age group has the highest default rate, which gradually decreases with older age groups.
Credit Limit by Age Group — Boxplot → About how credit limits are distributed across different age groups
# Boxplot of LIMIT_BAL within each age band.
df %>%
  mutate(
    AgeGroup = cut(
      AGE,
      # Bins are right-closed: [20,30], (30,40], (40,50], (50,60], (60,Inf).
      # The open upper bound matches the "60+" label, and include.lowest
      # keeps AGE == 20; a finite top break of 80 without include.lowest
      # would turn both edge cases into NA.
      breaks = c(20, 30, 40, 50, 60, Inf),
      labels = c("20-30", "30-40", "40-50", "50-60", "60+"),
      include.lowest = TRUE
    )
  ) %>%
  ggplot(aes(x = AgeGroup, y = LIMIT_BAL, fill = AgeGroup)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  scale_fill_viridis_d() +
  labs(
    title = "Credit Limit by Age Group",
    x = "Age Group",
    y = "Credit Limit"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
#Summary: Credit limits increase steadily with age, with the 60+ group having the highest median limits.
Total Bill vs Total Payment — Scatter → About the relationship between a customer’s total 6-month bill and their total 6-month payment.
# Scatter of each customer's summed six-month payments against their summed
# six-month bills, on log10 axes, coloured by default status.
df %>%
  mutate(
    TotalBill = BILL_AMT1 + BILL_AMT2 + BILL_AMT3 +
      BILL_AMT4 + BILL_AMT5 + BILL_AMT6,
    TotalPay = PAY_AMT1 + PAY_AMT2 + PAY_AMT3 +
      PAY_AMT4 + PAY_AMT5 + PAY_AMT6
  ) %>%
  # Both axes are log-scaled, so both totals must be strictly positive.
  # Filtering only TotalBill lets TotalPay == 0 through, which log10 maps
  # to -Inf and triggers the "introduced infinite values" warning.
  filter(TotalBill > 0, TotalPay > 0) %>%
  ggplot(
    aes(x = TotalBill, y = TotalPay, color = default.payment.next.month)
  ) +
  geom_point(alpha = 0.15, size = 0.8) +
  scale_x_log10(labels = comma) +
  scale_y_log10(labels = comma) +
  scale_color_manual(values = c("#1abc9c", "#e74c3c")) +
  labs(
    title = "Total Bill vs Total Payment (6 months, log scale)",
    x = "Total Bill",
    y = "Total Payment",
    color = "Status"
  ) +
  theme_minimal()
Warning in scale_y_log10(labels = comma): log-10 transformation introduced
infinite values.
#Summary: Non-defaulters show a much stronger alignment between total bills and total payments than defaulters.
Payment Utilization Ratio — Histogram → About how much of their bill customers actually pay, compared across defaulters and non-defaulters.
# Overlaid histograms of the PAY_AMT2 / BILL_AMT2 ratio, one per
# default-status class, for ratios up to 2.
df %>%
  # Restrict to positive bills so the ratio is well defined. A zero bill
  # has no meaningful ratio, and a negative bill yields a negative ratio
  # that would pass the `<= 2` cut-off and stretch the x axis.
  filter(BILL_AMT2 > 0) %>%
  # With positive bills guaranteed, no "+ 1" guard is needed in the
  # denominator, so the value is the plain Pay / Bill ratio.
  mutate(UtilRatio = PAY_AMT2 / BILL_AMT2) %>%
  # Drop ratios above 2 to keep the bulk of the distribution readable.
  filter(UtilRatio <= 2) %>%
  ggplot(aes(x = UtilRatio, fill = default.payment.next.month)) +
  # position = "identity" overlays the two histograms instead of stacking.
  geom_histogram(bins = 60, position = "identity", alpha = 0.6) +
  scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
  labs(
    title = "Payment Utilization Ratio (Pay/Bill) by Default Status",
    x = "Pay / Bill Ratio",
    y = "Count",
    fill = "Status"
  ) +
  theme_minimal()
#Summary: Non-defaulters tend to pay a much higher fraction of their bill, while defaulters cluster near zero repayment.
GGpairs — Pairplot → About the pairwise relationships and correlations among the key numeric variables split by default status.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
#Summary: LIMIT_BAL and PAY_AMT1 show the clearest separation between defaulters and non-defaulters among all pairs.
Default Rate Heatmap — Education × Gender → About how default rates vary across every combination of education level and gender.
# Tile heatmap of the percentage of customers labelled "Default" for every
# EDUCATION x SEX combination, with the percentage printed in each tile.
df %>%
  group_by(EDUCATION, SEX) %>%
  summarise(
    # Mean of a logical vector is the fraction TRUE; * 100 gives a percent.
    DefaultRate = mean(default.payment.next.month == "Default") * 100,
    # Return an ungrouped result.
    .groups = "drop"
  ) %>%
  ggplot(aes(x = SEX, y = EDUCATION, fill = DefaultRate)) +
  geom_tile(color = "white", linewidth = 0.8) +
  geom_text(aes(label = sprintf("%.1f%%", DefaultRate)), size = 4) +
  scale_fill_gradient(low = "#f9f9f9", high = "#c0392b") +
  labs(
    title = "Default Rate by Education & Gender",
    x = "Gender",
    y = "Education",
    fill = "Default %"
  ) +
  theme_minimal()
#Summary: Certain education-gender combinations (e.g., male + high school) show notably elevated default rates compared to others.
ECDF of Credit Limit → About the cumulative distribution of credit limits and how it differs between defaulters and non-defaulters.
# Empirical cumulative distribution of LIMIT_BAL, one curve per
# default-status class.
ggplot(
  data = df,
  mapping = aes(x = LIMIT_BAL, color = default.payment.next.month)
) +
  stat_ecdf(linewidth = 1.2) +
  scale_x_continuous(labels = comma) +
  scale_color_manual(values = c("#2980b9", "#e74c3c")) +
  labs(
    title = "ECDF of Credit Limit by Default Status",
    x = "Credit Limit",
    y = "Cumulative Proportion",
    color = "Status"
  ) +
  theme_minimal()
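#Summary: Each curve shows, within one group, the cumulative proportion of customers at or below a given credit limit. If the non-defaulter curve lies below the defaulter curve, a smaller share of non-defaulters sit at low limits, i.e. non-defaulters tend to hold higher limits. This is an association only; the plot does not by itself show that higher limits protect against default.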