visualizations-LA2

Author

ANKITHA-SANJANA

Course Name: Exploratory Data Analysis

Course Code: 22ISE644

Academic Year: 2025–26, 6th Semester

Team Name: DEADLINESURVIVORS

Team Number: 32

Team member1: ANKITHA KUMARI 1NT23IS027 SECTION A

Team member2: SANJANA S SONDUR 1NT23IS193 SECTION D

library(ggplot2)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyr)
library(corrplot)
corrplot 0.95 loaded
library(ggcorrplot)
library(scales)
library(viridis)
Loading required package: viridisLite

Attaching package: 'viridis'
The following object is masked from 'package:scales':

    viridis_pal
library(ggridges)
library(reshape2)

Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':

    smiths
library(GGally)

# Load the credit-card data set from the working directory.
df <- read.csv("UCI_Credit_Card.csv")

# Convert the coded categorical columns to labelled factors.
# `levels` is given explicitly for every column so each numeric code is tied
# to its label by value. With `labels` alone, R assigns labels positionally to
# the sorted unique values, which errors or mislabels if a code is absent.
# NOTE(review): 0/1 for the target and 1/2 for SEX are assumed from the
# original label order -- confirm against the data dictionary.
df$default.payment.next.month <- factor(
  df$default.payment.next.month,
  levels = 0:1,
  labels = c("No Default", "Default")
)
df$SEX <- factor(df$SEX, levels = 1:2, labels = c("Male", "Female"))
df$EDUCATION <- factor(
  df$EDUCATION,
  levels = 0:6,
  labels = c(
    "Unknown", "Graduate", "University", "High School",
    "Others", "Unknown2", "Unknown3"
  )
)
df$MARRIAGE <- factor(
  df$MARRIAGE,
  levels = 0:3,
  labels = c("Unknown", "Married", "Single", "Others")
)

Default Rate — Bar Chart → About the overall count of customers who defaulted vs did not default

# Bar chart of the number of customers in each default status, with the
# count printed just above each bar.
ggplot(
  df,
  aes(x = default.payment.next.month, fill = default.payment.next.month)
) +
  geom_bar(width = 0.5) +
  # after_stat(count) replaces the `..count..` notation, which has been
  # deprecated since ggplot2 3.4.0 and emits a warning.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  scale_fill_manual(values = c("#2ecc71", "#e74c3c")) +
  labs(
    title = "Default vs Non-Default Customers",
    x = "Default Status",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(count)` instead.

#Summary: The majority of customers did not default, with roughly 78% non-default vs 22% default.

Credit Limit Distribution — Histogram → About how credit limits are distributed across all customers.

# Histogram (50 bins) of the credit limit across all customers; x-axis
# tick labels use thousands separators.
df %>%
  ggplot(aes(LIMIT_BAL)) +
  geom_histogram(fill = "#3498db", colour = "white", bins = 50) +
  scale_x_continuous(labels = comma) +
  ggtitle("Distribution of Credit Limit") +
  xlab("Credit Limit (NT$)") +
  ylab("Count") +
  theme_minimal()

#Summary: Most customers have lower credit limits, with the distribution heavily right-skewed.

Age Distribution — Histogram with Density → About the age spread of the entire customer base.

# Age histogram scaled to density (40 bins) with a kernel density curve
# drawn on top.
ggplot(df, aes(x = AGE)) +
  # after_stat(density) replaces the `..density..` notation, which has been
  # deprecated since ggplot2 3.4.0.
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 40, fill = "#9b59b6", color = "white", alpha = 0.7
  ) +
  geom_density(color = "#2c3e50", linewidth = 1.2) +
  labs(title = "Age Distribution of Customers", x = "Age", y = "Density") +
  theme_minimal()

#Summary: The customer base is predominantly young adults between ages 25–40.

Credit Limit by Default Status — Density Plot → About how credit limit distribution differs between defaulters and non-defaulters

# Overlaid, semi-transparent density curves of the credit limit, one per
# default status.
df %>%
  ggplot(aes(LIMIT_BAL, fill = default.payment.next.month)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("#27ae60", "#c0392b")) +
  scale_x_continuous(labels = comma) +
  theme_minimal() +
  labs(
    fill = "Status",
    x = "Credit Limit",
    title = "Credit Limit Density by Default Status"
  )

#Summary: Non-defaulters tend to have higher credit limits compared to defaulters.

Age by Default Status — Ridgeline Plot → About how age distribution compares between defaulters and non-defaulters.

# Ridgeline plot: one age density ridge per default status.
df %>%
  ggplot(
    aes(AGE, default.payment.next.month, fill = default.payment.next.month)
  ) +
  geom_density_ridges(scale = 1.2, alpha = 0.7) +
  scale_fill_manual(values = c("#1abc9c", "#e74c3c")) +
  ggtitle("Age Distribution by Default Status") +
  xlab("Age") +
  ylab("") +
  theme_ridges() +
  theme(legend.position = "none")
Picking joint bandwidth of 1.3

#Summary: Defaulters and non-defaulters have similar age distributions, with slight differences in the younger age bands.

Education Level — Bar Chart → About how many customers fall under each education category

# Customer count per education level, bars ordered from most to least
# frequent, with the count printed above each bar.
df %>%
  count(EDUCATION) %>%
  ggplot(aes(reorder(EDUCATION, -n), n, fill = EDUCATION)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4) +
  scale_fill_viridis_d() +
  ggtitle("Customer Count by Education Level") +
  xlab("Education") +
  ylab("Count") +
  theme_minimal() +
  theme(legend.position = "none")

#Summary: University-educated customers form the largest group in the dataset.

Default Rate by Education — Stacked Bar → About the proportion of defaulters within each education level.

# Stacked bars showing, within each education level, the proportion of
# customers in each default status (each bar sums to 100%).
df %>%
  count(EDUCATION, default.payment.next.month) %>%
  group_by(EDUCATION) %>%
  mutate(pct = prop.table(n)) %>%
  ggplot(aes(EDUCATION, pct, fill = default.payment.next.month)) +
  geom_col() +
  scale_fill_manual(values = c("#27ae60", "#c0392b")) +
  scale_y_continuous(labels = percent) +
  theme_minimal() +
  labs(
    fill = "Status",
    x = "Education",
    y = "Percentage",
    title = "Default Rate by Education Level"
  )

#Summary: Customers with "Others" and lower education levels show relatively higher default rates.

Default Rate by Marriage Status — Grouped Bar → About how default rates vary across different marital statuses.

# Side-by-side bars of the percentage of customers in each default status,
# computed within each marital-status group.
df %>%
  count(MARRIAGE, default.payment.next.month) %>%
  group_by(MARRIAGE) %>%
  mutate(pct = prop.table(n) * 100) %>%
  ggplot(aes(MARRIAGE, pct, fill = default.payment.next.month)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#2980b9", "#e74c3c")) +
  theme_minimal() +
  labs(
    fill = "Status",
    x = "Marital Status",
    y = "%",
    title = "Default Rate by Marital Status"
  )

#Summary: Single customers have a slightly higher default rate compared to married customers.

Default Share by Gender — Pie Chart → About the gender-wise share of customers who defaulted.

# Pie chart of how the customers who defaulted split between the genders.
# The data are restricted to defaulters *before* the proportion is computed,
# so the slices are parts of one whole and sum to 100%. Within-gender default
# rates do not sum to 100% and cannot be read correctly from a pie.
df %>%
  filter(default.payment.next.month == "Default") %>%
  count(SEX) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = "", y = pct, fill = SEX)) +
  geom_bar(stat = "identity", width = 1) +
  # Polar transform of the stacked bar turns it into a pie.
  coord_polar("y") +
  scale_fill_manual(values = c("#3498db", "#e91e63")) +
  labs(title = "Default Share by Gender", fill = "Gender") +
  theme_void()

#Summary: Female customers make up a slightly larger share of the customer base but default rates are comparable across genders.

Gender Distribution by Default — Faceted Bar → About how male and female customers are distributed across default and non-default groups.

# Customer counts per gender, drawn in one panel per default status.
df %>%
  ggplot(aes(SEX, fill = SEX)) +
  geom_bar() +
  facet_wrap(vars(default.payment.next.month)) +
  scale_fill_manual(values = c("#3498db", "#e91e63")) +
  ggtitle("Gender Distribution by Default Status") +
  xlab("Gender") +
  ylab("Count") +
  theme_minimal() +
  theme(legend.position = "none")

#Summary: Both gender groups are well represented across default and non-default categories with no extreme imbalance.

Credit Limit vs Age — Scatter with Regression → About the relationship between a customer’s age and their assigned credit limit

# Credit limit against age, coloured by default status, with one linear
# regression line (no confidence band) per status.
df %>%
  ggplot(aes(AGE, LIMIT_BAL, colour = default.payment.next.month)) +
  geom_point(size = 1, alpha = 0.2) +
  geom_smooth(method = "lm", linewidth = 1.2, se = FALSE) +
  scale_colour_manual(values = c("#27ae60", "#e74c3c")) +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  labs(
    colour = "Status",
    x = "Age",
    y = "Credit Limit",
    title = "Credit Limit vs Age by Default Status"
  )
`geom_smooth()` using formula = 'y ~ x'

#Summary: Credit limit tends to increase with age, and non-defaulters generally receive higher limits at every age.

Credit Limit by Education — Boxplot → About how credit limits vary across different education levels.

# Box plots of the credit limit for each education level; outlier points
# are drawn faintly (alpha 0.2).
df %>%
  ggplot(aes(EDUCATION, LIMIT_BAL, fill = EDUCATION)) +
  geom_boxplot(outlier.alpha = 0.2) +
  scale_fill_viridis_d() +
  scale_y_continuous(labels = comma) +
  ggtitle("Credit Limit by Education Level") +
  xlab("Education") +
  ylab("Credit Limit") +
  theme_minimal() +
  theme(legend.position = "none")

#Summary: Graduate-level customers have noticeably higher credit limits than those with lower education levels.

Credit Limit by Default — Violin Plot → About the full distribution shape of credit limits for defaulters vs non-defaulters.

# Untrimmed violins of the credit limit per default status, each with a
# narrow white box plot overlaid.
df %>%
  ggplot(
    aes(
      default.payment.next.month,
      LIMIT_BAL,
      fill = default.payment.next.month
    )
  ) +
  geom_violin(alpha = 0.7, trim = FALSE) +
  geom_boxplot(fill = "white", width = 0.08) +
  scale_fill_manual(values = c("#1abc9c", "#e74c3c")) +
  scale_y_continuous(labels = comma) +
  ggtitle("Credit Limit Distribution by Default Status") +
  xlab("") +
  ylab("Credit Limit") +
  theme_minimal() +
  theme(legend.position = "none")

#Summary: Non-defaulters have a wider and higher-shifted credit limit distribution than defaulters.

Age vs Credit Limit — Hexbin Density → About where the densest concentration of customers falls when plotting age against credit limit.

# Hexagonal 2-D binning (40 bins) of age against credit limit; the fill
# colour encodes how many customers fall in each hexagon.
df %>%
  ggplot(aes(AGE, LIMIT_BAL)) +
  geom_hex(bins = 40) +
  scale_y_continuous(labels = comma) +
  scale_fill_viridis_c() +
  theme_minimal() +
  labs(
    fill = "Count",
    x = "Age",
    y = "Credit Limit",
    title = "Age vs Credit Limit — Density"
  )

#Summary: The densest customer cluster sits between ages 25–40 with credit limits under 200,000 NT$.

Bill Amount vs Payment Amount — Scatter → About the relationship between how much customers are billed and how much they actually pay.

# First-month bill against first-month payment on log10 axes, coloured by
# default status. Rows with a non-positive bill or payment are dropped first
# because they cannot be placed on a log scale.
df %>%
  filter(BILL_AMT1 > 0 & PAY_AMT1 > 0) %>%
  ggplot(aes(BILL_AMT1, PAY_AMT1, colour = default.payment.next.month)) +
  geom_point(size = 0.8, alpha = 0.15) +
  scale_x_log10(labels = comma) +
  scale_y_log10(labels = comma) +
  scale_colour_manual(values = c("#27ae60", "#e74c3c")) +
  theme_minimal() +
  labs(
    colour = "Status",
    x = "Bill Amt (log)",
    y = "Pay Amt (log)",
    title = "Bill Amount vs Payment Amount (log scale)"
  )

#Summary: Defaulters consistently pay far less relative to their bill amounts compared to non-defaulters.

Correlation Matrix — Corrplot → About the pairwise correlations between all numeric variables in the dataset.

# Numeric columns used for the correlation plots: credit limit, age, and
# the six bill and six payment amounts.
num_df <- select(df, LIMIT_BAL, AGE, BILL_AMT1:BILL_AMT6, PAY_AMT1:PAY_AMT6)

# Colour-coded upper-triangle correlation matrix with the coefficients
# printed in each cell.
corrplot(
  cor(num_df),
  type = "upper",
  method = "color",
  addCoef.col = "black",
  number.cex = 0.5,
  tl.col = "black",
  tl.cex = 0.7,
  title = "Correlation Matrix",
  mar = c(0, 0, 1, 0)
)

#Summary: Bill amounts across months are very highly correlated with each other, indicating stable spending behavior over time.

Correlation Heatmap — ggcorrplot → About which groups of numeric variables are most strongly correlated with each other.

# Lower-triangle correlation heatmap of `num_df`, with variables reordered
# by hierarchical clustering and the coefficients shown as labels.
ggcorrplot(
  cor(num_df),
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 2.5,
  colors = c("#e74c3c", "white", "#2980b9"),
  title = "Correlation Heatmap (Clustered)"
) +
  theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_text(size = 7))
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the ggcorrplot package.
  Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.

#Summary: Payment amounts form one cluster and bill amounts form another, confirming their internal consistency over months.

Average Bill Amount per Month — Line Plot → About how the average bill amount trends across the 6 recorded months.

# Mean of each of the six bill columns, reshaped to one row per month and
# drawn as a single line with point markers.
df %>%
  summarise(across(BILL_AMT1:BILL_AMT6, mean)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Month",
    values_to = "Avg_Bill"
  ) %>%
  # Turn column names such as "BILL_AMT1" into axis labels such as "Month 1".
  mutate(Month = sub("BILL_AMT", "Month ", Month, fixed = TRUE)) %>%
  ggplot(aes(Month, Avg_Bill, group = 1)) +
  geom_line(colour = "#e67e22", linewidth = 1.3) +
  geom_point(colour = "#d35400", size = 3) +
  scale_y_continuous(labels = comma) +
  ggtitle("Average Bill Amount Over 6 Months") +
  xlab("") +
  ylab("Avg Bill Amount") +
  theme_minimal()

#Summary: Average bill amounts remain relatively stable across the 6-month period with minor fluctuation.

Avg Bill by Default Status — Line Comparison → About how average monthly bill amounts differ between defaulters and non-defaulters over 6 months.

# Mean bill per month for each default status, reshaped to one row per
# (status, month) pair; month labels become "M1" to "M6".
bill_long <- df %>%
  group_by(default.payment.next.month) %>%
  summarise(across(BILL_AMT1:BILL_AMT6, mean)) %>%
  pivot_longer(
    cols = -default.payment.next.month,
    names_to = "Month",
    values_to = "Avg_Bill"
  ) %>%
  mutate(Month = sub("BILL_AMT", "M", Month, fixed = TRUE))

# One line per default status showing the average bill across the months.
bill_long %>%
  ggplot(
    aes(
      Month,
      Avg_Bill,
      colour = default.payment.next.month,
      group = default.payment.next.month
    )
  ) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  scale_colour_manual(values = c("#27ae60", "#c0392b")) +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  labs(
    colour = "Status",
    x = "Month",
    y = "Avg Bill",
    title = "Avg Bill Amount Over Time by Default Status"
  )

#Summary: Defaulters consistently carry slightly lower average bill amounts than non-defaulters across all months.

Avg Payment Amount per Month — Line Plot → About how average monthly payment amounts compare between defaulters and non-defaulters over time.

# Mean payment per month for each default status, drawn as one line per
# status; month labels become "M1" to "M6".
df %>%
  group_by(default.payment.next.month) %>%
  summarise(across(PAY_AMT1:PAY_AMT6, mean)) %>%
  pivot_longer(
    cols = -default.payment.next.month,
    names_to = "Month",
    values_to = "Avg_Pay"
  ) %>%
  mutate(Month = sub("PAY_AMT", "M", Month, fixed = TRUE)) %>%
  ggplot(
    aes(
      Month,
      Avg_Pay,
      colour = default.payment.next.month,
      group = default.payment.next.month
    )
  ) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  scale_colour_manual(values = c("#1abc9c", "#e74c3c")) +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  labs(
    colour = "Status",
    x = "Month",
    y = "Avg Payment",
    title = "Avg Payment Amount Over Time by Default Status"
  )

#Summary: Non-defaulters pay significantly more on average each month, widening the gap with defaulters over time.

Default Rate by Pay Status (PAY_0) — Filled Bar → About how the likelihood of default changes with the severity of payment delay.

# 100%-stacked bars: for each PAY_0 value, the proportion of customers in
# each default status.
df %>%
  ggplot(aes(factor(PAY_0), fill = default.payment.next.month)) +
  geom_bar(position = position_fill()) +
  scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
  scale_y_continuous(labels = percent) +
  theme_minimal() +
  labs(
    fill = "Status",
    y = "Proportion",
    x = "Pay Status (-2=No Consumption, -1=Paid Duly, 1-8=Months Delayed)",
    title = "Default Rate by Repayment Status (Most Recent Month)"
  )

#Summary: Customers with longer payment delays (higher PAY_0 values) have dramatically higher default rates.

Payment Status Heatmap — Tile → About how frequently each payment status occurs across all 6 months.

# Long-format payment status: one row per customer and PAY_* column.
pay_long <- pivot_longer(
  select(df, ID, PAY_0:PAY_6, default.payment.next.month),
  cols = PAY_0:PAY_6,
  names_to = "Month",
  values_to = "PayStatus"
)

# Tile heatmap of how often each payment status occurs in each month column.
pay_long %>%
  count(Month, PayStatus) %>%
  ggplot(aes(Month, factor(PayStatus), fill = n)) +
  geom_tile(colour = "white") +
  scale_fill_viridis_c(labels = comma) +
  theme_minimal() +
  labs(
    fill = "Count",
    x = "Month",
    y = "Payment Status",
    title = "Heatmap of Payment Status Across Months"
  )

#Summary: The most common payment behavior is "paid duly" (−1), with very few customers in extreme delay categories.

Pay Status Distribution — Faceted Histogram → About the distribution of payment statuses individually for each of the 6 months.

# Bar chart of payment-status counts, one panel per PAY_* column, each
# panel with its own y-axis range.
df %>%
  select(PAY_0:PAY_6) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Month",
    values_to = "Status"
  ) %>%
  ggplot(aes(factor(Status), fill = Month)) +
  geom_bar() +
  facet_wrap(vars(Month), scales = "free_y") +
  scale_fill_viridis_d() +
  ggtitle("Payment Status Distribution Across Months") +
  xlab("Status") +
  ylab("Count") +
  theme_minimal() +
  theme(legend.position = "none")

#Summary: Payment behavior is fairly consistent across months, with on-time payments dominating in all periods.

Default Rate by Age Group — Bar Chart → About which age group has the highest and lowest default rates.

# Default rate (in %) per age band, labelled above each bar.
# cut() uses right-closed intervals by default, so e.g. age 30 falls in the
# "20-30" band and age 20 itself would fall outside every band.
df %>%
  mutate(
    AgeGroup = cut(
      AGE,
      breaks = c(20, 30, 40, 50, 60, 80),
      labels = c("20-30", "30-40", "40-50", "50-60", "60+")
    )
  ) %>%
  group_by(AgeGroup) %>%
  summarise(
    DefaultRate = mean(default.payment.next.month == "Default") * 100
  ) %>%
  ggplot(aes(AgeGroup, DefaultRate, fill = AgeGroup)) +
  geom_col() +
  geom_text(aes(label = sprintf("%.1f%%", DefaultRate)), vjust = -0.4) +
  scale_fill_viridis_d() +
  ggtitle("Default Rate by Age Group") +
  xlab("Age Group") +
  ylab("Default Rate (%)") +
  theme_minimal() +
  theme(legend.position = "none")

#Summary: The 20–30 age group has the highest default rate, which gradually decreases with older age groups.

Credit Limit by Age Group — Boxplot → About how credit limits are distributed across different age groups

# Box plots of the credit limit for each age band (right-closed bands
# produced by cut()).
df %>%
  mutate(
    AgeGroup = cut(
      AGE,
      breaks = c(20, 30, 40, 50, 60, 80),
      labels = c("20-30", "30-40", "40-50", "50-60", "60+")
    )
  ) %>%
  ggplot(aes(AgeGroup, LIMIT_BAL, fill = AgeGroup)) +
  geom_boxplot() +
  scale_fill_viridis_d() +
  scale_y_continuous(labels = comma) +
  ggtitle("Credit Limit by Age Group") +
  xlab("Age Group") +
  ylab("Credit Limit") +
  theme_minimal() +
  theme(legend.position = "none")

#Summary: Credit limits increase steadily with age, with the 60+ group having the highest median limits.

Total Bill vs Total Payment — Scatter → About the relationship between a customer’s total 6-month bill and their total 6-month payment.

# Six-month total bill against six-month total payment on log10 axes,
# coloured by default status.
df %>%
  mutate(
    TotalBill = BILL_AMT1 + BILL_AMT2 + BILL_AMT3 +
      BILL_AMT4 + BILL_AMT5 + BILL_AMT6,
    TotalPay = PAY_AMT1 + PAY_AMT2 + PAY_AMT3 +
      PAY_AMT4 + PAY_AMT5 + PAY_AMT6
  ) %>%
  # Both axes are log10-scaled, so keep only strictly positive totals on both
  # of them. A zero total payment would otherwise be transformed to -Inf and
  # trigger "log-10 transformation introduced infinite values".
  filter(TotalBill > 0, TotalPay > 0) %>%
  ggplot(aes(x = TotalBill, y = TotalPay, color = default.payment.next.month)) +
  geom_point(alpha = 0.15, size = 0.8) +
  scale_x_log10(labels = comma) +
  scale_y_log10(labels = comma) +
  scale_color_manual(values = c("#1abc9c", "#e74c3c")) +
  labs(
    title = "Total Bill vs Total Payment (6 months, log scale)",
    x = "Total Bill", y = "Total Payment", color = "Status"
  ) +
  theme_minimal()
Warning in scale_y_log10(labels = comma): log-10 transformation introduced
infinite values.

#Summary: Non-defaulters show a much stronger alignment between total bills and total payments than defaulters.

Payment Utilization Ratio — Histogram → About how much of their bill customers actually pay, compared across defaulters and non-defaulters.

# Overlaid histograms (60 bins) of the payment-to-bill ratio for month 2,
# one per default status.
df %>%
  # Keep only positive bills: if BILL_AMT2 contains zero or negative values
  # (e.g. credit balances), the ratio becomes infinite or negative, and
  # negative ratios would pass the `<= 2` cap below and stretch the x-axis.
  filter(BILL_AMT2 > 0) %>%
  mutate(UtilRatio = PAY_AMT2 / BILL_AMT2) %>%
  # Cap at 2 so a few extreme over-payments do not dominate the binning.
  filter(UtilRatio <= 2) %>%
  ggplot(aes(x = UtilRatio, fill = default.payment.next.month)) +
  geom_histogram(bins = 60, position = "identity", alpha = 0.6) +
  scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
  labs(
    title = "Payment Utilization Ratio (Pay/Bill) by Default Status",
    x = "Pay / Bill Ratio", y = "Count", fill = "Status"
  ) +
  theme_minimal()

#Summary: Non-defaulters tend to pay a much higher fraction of their bill, while defaulters cluster near zero repayment.

GGpairs — Pairplot → About the pairwise relationships and correlations among the key numeric variables split by default status.

# Scatter-plot matrix of four numeric variables coloured by default status:
# correlation coefficients in the upper triangle, small translucent points
# in the lower triangle.
ggpairs(
  select(df, LIMIT_BAL, AGE, BILL_AMT1, PAY_AMT1, default.payment.next.month),
  mapping = aes(colour = default.payment.next.month, alpha = 0.4),
  lower = list(continuous = wrap("points", alpha = 0.2, size = 0.5)),
  upper = list(continuous = wrap("cor", size = 3))
) +
  scale_color_manual(values = c("#27ae60", "#e74c3c")) +
  scale_fill_manual(values = c("#27ae60", "#e74c3c")) +
  labs(title = "Pairplot of Key Variables by Default Status") +
  theme_minimal()
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.

#Summary: LIMIT_BAL and PAY_AMT1 show the clearest separation between defaulters and non-defaulters among all pairs.

Default Rate Heatmap — Education × Gender → About how default rates vary across every combination of education level and gender.

# Tile heatmap of the default rate (in %) for every education x gender
# combination, with the rate printed in each tile.
df %>%
  group_by(EDUCATION, SEX) %>%
  summarise(
    DefaultRate = mean(default.payment.next.month == "Default") * 100,
    .groups = "drop"
  ) %>%
  ggplot(aes(SEX, EDUCATION, fill = DefaultRate)) +
  geom_tile(colour = "white", linewidth = 0.8) +
  geom_text(aes(label = sprintf("%.1f%%", DefaultRate)), size = 4) +
  scale_fill_gradient(high = "#c0392b", low = "#f9f9f9") +
  theme_minimal() +
  labs(
    fill = "Default %",
    x = "Gender",
    y = "Education",
    title = "Default Rate by Education & Gender"
  )

#Summary: Certain education-gender combinations (e.g., male + high school) show notably elevated default rates compared to others.

ECDF of Credit Limit → About the cumulative distribution of credit limits and how it differs between defaulters and non-defaulters.

# Empirical cumulative distribution of the credit limit, one step curve
# per default status.
df %>%
  ggplot(aes(LIMIT_BAL, colour = default.payment.next.month)) +
  stat_ecdf(linewidth = 1.2) +
  scale_colour_manual(values = c("#2980b9", "#e74c3c")) +
  scale_x_continuous(labels = comma) +
  theme_minimal() +
  labs(
    colour = "Status",
    x = "Credit Limit",
    y = "Cumulative Proportion",
    title = "ECDF of Credit Limit by Default Status"
  )

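#Summary: The defaulters' ECDF rises faster than the non-defaulters', meaning a larger proportion of defaulters sit at or below any given credit limit; higher limits are therefore associated with (though not shown to cause) a lower likelihood of default.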