import library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(ggplot2)
import data
df_raw <- read_csv("04_BANKING_CUSTOMERS.xlsx - Customers.csv",
locale = locale(decimal_mark = ","))
## Rows: 1000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): customer_id
## dbl (6): age, income, balance, num_transactions, credit_score, account_age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df_raw)
## Rows: 1,000
## Columns: 7
## $ customer_id <chr> "C000001", "C000002", "C000003", "C000004", "C000005"…
## $ age <dbl> 57, 30, 55, 20, 33, 59, 24, 71, 70, 20, 31, 33, 21, 4…
## $ income <dbl> 51808666, 33241740, 121506340, 17855323, 51440557, 18…
## $ balance <dbl> 71898776, 36624170, 222845105, 57806634, 34534238, 16…
## $ num_transactions <dbl> 23, 24, 15, 30, 13, 26, 19, 26, 21, 17, 8, 19, 15, 14…
## $ credit_score <dbl> 627.8335, 668.0386, 545.5959, 639.2831, 590.2093, 642…
## $ account_age <dbl> 18, 21, 8, 1, 14, 7, 9, 6, 18, 0, 21, 1, 2, 27, 13, 1…
Data Cleaning & Transformation
library(tidyverse)
library(scales)
df_raw <- read_csv("04_BANKING_CUSTOMERS.xlsx - Customers.csv",
locale = locale(decimal_mark = ","))
## Rows: 1000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): customer_id
## dbl (6): age, income, balance, num_transactions, credit_score, account_age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_clean <- df_raw %>%
mutate(age = if_else(age < 0 | age > 100, NA_real_, age)) %>%
mutate(credit_score = if_else(credit_score < 300 | credit_score > 850, NA_real_, credit_score)) %>%
mutate(
age = if_else(is.na(age), median(age, na.rm = TRUE), age),
income = if_else(is.na(income), median(income, na.rm = TRUE), income),
balance = if_else(is.na(balance), median(balance, na.rm = TRUE), balance),
num_transactions = if_else(is.na(num_transactions), median(num_transactions, na.rm = TRUE), num_transactions),
credit_score = if_else(is.na(credit_score), median(credit_score, na.rm = TRUE), credit_score)
) %>%
mutate(
age_category = case_when(
age < 30 ~ "Dewasa Muda",
age <= 50 ~ "Dewasa",
TRUE ~ "Lansia"
),
credit_category = case_when(
credit_score < 580 ~ "Buruk",
credit_score < 670 ~ "Cukup",
credit_score < 800 ~ "Baik",
TRUE ~ "Istimewa"
)
) %>%
mutate(
age_category = factor(age_category, levels = c("Dewasa Muda", "Dewasa", "Lansia")),
credit_category = factor(credit_category, levels = c("Buruk", "Cukup", "Baik", "Istimewa"))
)
summary(df_clean)
## customer_id age income balance
## Length:1000 Min. : 5.00 Min. : 10022456 Min. : 118439
## Class :character 1st Qu.:32.00 1st Qu.: 24525188 1st Qu.: 29972753
## Mode :character Median :45.00 Median : 43878928 Median : 70958269
## Mean :45.92 Mean : 59812600 Mean : 98894043
## 3rd Qu.:60.00 3rd Qu.: 78931340 3rd Qu.:136694830
## Max. :75.00 Max. :409316650 Max. :912473303
## num_transactions credit_score account_age age_category
## Min. : 7.00 Min. :326.1 Min. : 0.00 Dewasa Muda:207
## 1st Qu.:17.00 1st Qu.:587.5 1st Qu.: 7.00 Dewasa :393
## Median :20.00 Median :650.1 Median :15.00 Lansia :400
## Mean :20.25 Mean :651.3 Mean :14.72
## 3rd Qu.:23.00 3rd Qu.:719.5 3rd Qu.:23.00
## Max. :35.00 Max. :850.0 Max. :30.00
## credit_category
## Buruk :232
## Cukup :359
## Baik :335
## Istimewa: 74
##
##
Bar Chart Jumlah Nasabah Per Kategori Usia
ggplot(df_clean, aes(x = age_category, fill = age_category)) +
geom_bar(color = "black", alpha = 0.8) +
geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5, size = 5) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Distribusi Nasabah Berdasarkan Kategori Usia",
x = "Kategori Usia",
y = "Jumlah Nasabah"
) +
theme_minimal() +
theme(legend.position = "none")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Pie Chart Proporsi Kategori Skor Kredit
credit_data <- df_clean %>%
count(credit_category, name = "total") %>%
mutate(persen = total / sum(total) * 100)
ggplot(credit_data, aes(x = "", y = total, fill = credit_category)) +
geom_bar(stat = "identity", width = 1, color = "white") +
coord_polar("y", start = 0) +
geom_text(aes(label = paste0(round(persen, 1), "%")),
position = position_stack(vjust = 0.5), color = "white", size = 5, fontface = "bold") +
scale_fill_brewer(palette = "RdYlGn") +
labs(
title = "Proporsi Nasabah Berdasarkan Skor Kredit",
fill = "Kategori",
x = NULL,
y = NULL
) +
theme_void()

Grouped Bar Chart Rata-rata Income & Balance Per Usia
summary_age <- df_clean %>%
group_by(age_category) %>%
summarise(
avg_income = mean(income),
avg_balance = mean(balance)
) %>%
pivot_longer(cols = c(avg_income, avg_balance), names_to = "metric", values_to = "amount")
ggplot(summary_age, aes(x = age_category, y = amount, fill = metric)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
scale_y_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
scale_fill_manual(values = c("avg_income" = "darkgoldenrod1", "avg_balance" = "darkolivegreen3"),
labels = c("Rata-rata Saldo", "Rata-rata Pendapatan")) +
labs(
title = "Perbandingan Pendapatan & Saldo per Kategori Usia",
x = "Kategori Usia",
y = "Jumlah (dalam Juta Rp)",
fill = "Metrik"
) +
theme_minimal()

Scatter Plot Hubungan income dan balance
ggplot(df_clean, aes(x = income, y = balance, color = age_category)) +
geom_point(alpha = 0.6, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "dashed", linewidth = 1) + # Garis Tren
scale_x_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
scale_y_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
scale_color_brewer(palette = "Set1") +
labs(
title = "Korelasi antara Pendapatan dan Saldo Rekening",
subtitle = "Garis putus-putus menunjukkan tren linear keseluruhan",
x = "Pendapatan (Income)",
y = "Saldo (Balance)",
color = "Kategori Usia"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Boxplot
ggplot(df_clean, aes(x = credit_category, y = income, fill = credit_category)) +
geom_boxplot(alpha = 0.8, outlier.alpha = 0.5) +
scale_y_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
scale_fill_brewer(palette = "RdYlGn") +
labs(
title = "Distribusi Pendapatan Berdasarkan Kategori Skor Kredit",
x = "Kategori Skor Kredit",
y = "Pendapatan",
fill = "Skor Kredit"
) +
theme_minimal() +
theme(legend.position = "none")

Line Chart (Trend)
trend_account_age <- df_clean %>%
group_by(account_age) %>%
summarise(avg_balance = mean(balance)) %>%
arrange(account_age)
ggplot(trend_account_age, aes(x = account_age, y = avg_balance)) +
geom_line(color = "black", linewidth = 1.2) +
geom_point(color = "firebrick1", size = 2) +
scale_y_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
labs(
title = "Tren Rata-rata Saldo Berdasarkan Usia Rekening (Tahun)",
x = "Usia Rekening (Tahun)",
y = "Rata-rata Saldo"
) +
theme_minimal()

Histogram + Density Plot
ggplot(df_clean, aes(x = credit_score)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "dodgerblue", color = "white", alpha = 0.7) +
geom_density(color = "firebrick2", linewidth = 1.2) +
labs(
title = "Distribusi Skor Kredit Nasabah",
subtitle = "Garis merah menunjukkan estimasi kepadatan distribusi (Density)",
x = "Skor Kredit",
y = "Densitas"
) +
theme_minimal()

Correlation Heatmap
cor_data <- df_clean %>%
select(age, income, balance, num_transactions, credit_score, account_age) %>%
cor(use = "complete.obs") %>%
as.data.frame() %>%
rownames_to_column(var = "var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "correlation")
ggplot(cor_data, aes(x = var1, y = var2, fill = correlation)) +
geom_tile(color = "white") +
geom_text(aes(label = round(correlation, 2)), color = "black", size = 3.5) +
scale_fill_gradient2(low = "red3", mid = "white", high = "seagreen3", midpoint = 0, limits = c(-1, 1)) +
labs(
title = "Matriks Korelasi Antar Variabel Numerik",
x = NULL,
y = NULL
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
