import library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggplot2)

import data

df_raw <- read_csv("04_BANKING_CUSTOMERS.xlsx - Customers.csv", 
                   locale = locale(decimal_mark = ","))
## Rows: 1000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): customer_id
## dbl (6): age, income, balance, num_transactions, credit_score, account_age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df_raw)
## Rows: 1,000
## Columns: 7
## $ customer_id      <chr> "C000001", "C000002", "C000003", "C000004", "C000005"…
## $ age              <dbl> 57, 30, 55, 20, 33, 59, 24, 71, 70, 20, 31, 33, 21, 4…
## $ income           <dbl> 51808666, 33241740, 121506340, 17855323, 51440557, 18…
## $ balance          <dbl> 71898776, 36624170, 222845105, 57806634, 34534238, 16…
## $ num_transactions <dbl> 23, 24, 15, 30, 13, 26, 19, 26, 21, 17, 8, 19, 15, 14…
## $ credit_score     <dbl> 627.8335, 668.0386, 545.5959, 639.2831, 590.2093, 642…
## $ account_age      <dbl> 18, 21, 8, 1, 14, 7, 9, 6, 18, 0, 21, 1, 2, 27, 13, 1…

Data Cleaning & Transformation

library(tidyverse)
library(scales)

df_raw <- read_csv("04_BANKING_CUSTOMERS.xlsx - Customers.csv", 
                   locale = locale(decimal_mark = ","))
## Rows: 1000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): customer_id
## dbl (6): age, income, balance, num_transactions, credit_score, account_age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_clean <- df_raw %>%
  mutate(age = if_else(age < 0 | age > 100, NA_real_, age)) %>%
  mutate(credit_score = if_else(credit_score < 300 | credit_score > 850, NA_real_, credit_score)) %>%
  mutate(
    age = if_else(is.na(age), median(age, na.rm = TRUE), age),
    income = if_else(is.na(income), median(income, na.rm = TRUE), income),
    balance = if_else(is.na(balance), median(balance, na.rm = TRUE), balance),
    num_transactions = if_else(is.na(num_transactions), median(num_transactions, na.rm = TRUE), num_transactions),
    credit_score = if_else(is.na(credit_score), median(credit_score, na.rm = TRUE), credit_score)
  ) %>%
  mutate(
    age_category = case_when(
      age < 30 ~ "Dewasa Muda",
      age <= 50 ~ "Dewasa",
      TRUE ~ "Lansia"
    ),
    credit_category = case_when(
      credit_score < 580 ~ "Buruk",
      credit_score < 670 ~ "Cukup",
      credit_score < 800 ~ "Baik",
      TRUE ~ "Istimewa"
    )
  ) %>%
  mutate(
    age_category = factor(age_category, levels = c("Dewasa Muda", "Dewasa", "Lansia")),
    credit_category = factor(credit_category, levels = c("Buruk", "Cukup", "Baik", "Istimewa"))
  )
summary(df_clean)
##  customer_id             age            income             balance         
##  Length:1000        Min.   : 5.00   Min.   : 10022456   Min.   :   118439  
##  Class :character   1st Qu.:32.00   1st Qu.: 24525188   1st Qu.: 29972753  
##  Mode  :character   Median :45.00   Median : 43878928   Median : 70958269  
##                     Mean   :45.92   Mean   : 59812600   Mean   : 98894043  
##                     3rd Qu.:60.00   3rd Qu.: 78931340   3rd Qu.:136694830  
##                     Max.   :75.00   Max.   :409316650   Max.   :912473303  
##  num_transactions  credit_score    account_age         age_category
##  Min.   : 7.00    Min.   :326.1   Min.   : 0.00   Dewasa Muda:207  
##  1st Qu.:17.00    1st Qu.:587.5   1st Qu.: 7.00   Dewasa     :393  
##  Median :20.00    Median :650.1   Median :15.00   Lansia     :400  
##  Mean   :20.25    Mean   :651.3   Mean   :14.72                    
##  3rd Qu.:23.00    3rd Qu.:719.5   3rd Qu.:23.00                    
##  Max.   :35.00    Max.   :850.0   Max.   :30.00                    
##  credit_category
##  Buruk   :232   
##  Cukup   :359   
##  Baik    :335   
##  Istimewa: 74   
##                 
## 

Bar Chart Jumlah Nasabah Per Kategori Usia

ggplot(df_clean, aes(x = age_category, fill = age_category)) +
  geom_bar(color = "black", alpha = 0.8) +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5, size = 5) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Distribusi Nasabah Berdasarkan Kategori Usia",
    x = "Kategori Usia",
    y = "Jumlah Nasabah"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Pie Chart Proporsi Kategori Skor Kredit

credit_data <- df_clean %>%
  count(credit_category, name = "total") %>%
  mutate(persen = total / sum(total) * 100)

ggplot(credit_data, aes(x = "", y = total, fill = credit_category)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(round(persen, 1), "%")), 
            position = position_stack(vjust = 0.5), color = "white", size = 5, fontface = "bold") +
  scale_fill_brewer(palette = "RdYlGn") +
  labs(
    title = "Proporsi Nasabah Berdasarkan Skor Kredit",
    fill = "Kategori",
    x = NULL,
    y = NULL
  ) +
  theme_void()

Grouped Bar Chart Rata-rata Income & Balance Per Usia

summary_age <- df_clean %>%
  group_by(age_category) %>%
  summarise(
    avg_income = mean(income),
    avg_balance = mean(balance)
  ) %>%
  pivot_longer(cols = c(avg_income, avg_balance), names_to = "metric", values_to = "amount")

ggplot(summary_age, aes(x = age_category, y = amount, fill = metric)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  scale_y_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
  scale_fill_manual(values = c("avg_income" = "darkgoldenrod1", "avg_balance" = "darkolivegreen3"),
                    labels = c("Rata-rata Saldo", "Rata-rata Pendapatan")) +
  labs(
    title = "Perbandingan Pendapatan & Saldo per Kategori Usia",
    x = "Kategori Usia",
    y = "Jumlah (dalam Juta Rp)",
    fill = "Metrik"
  ) +
  theme_minimal()

Scatter Plot Hubungan income dan balance

ggplot(df_clean, aes(x = income, y = balance, color = age_category)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "dashed", linewidth = 1) + # Garis Tren
  scale_x_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
  scale_y_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Korelasi antara Pendapatan dan Saldo Rekening",
    subtitle = "Garis putus-putus menunjukkan tren linear keseluruhan",
    x = "Pendapatan (Income)",
    y = "Saldo (Balance)",
    color = "Kategori Usia"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Boxplot

ggplot(df_clean, aes(x = credit_category, y = income, fill = credit_category)) +
  geom_boxplot(alpha = 0.8, outlier.alpha = 0.5) +
  scale_y_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
  scale_fill_brewer(palette = "RdYlGn") +
  labs(
    title = "Distribusi Pendapatan Berdasarkan Kategori Skor Kredit",
    x = "Kategori Skor Kredit",
    y = "Pendapatan",
    fill = "Skor Kredit"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Line Chart (Trend)

trend_account_age <- df_clean %>%
  group_by(account_age) %>%
  summarise(avg_balance = mean(balance)) %>%
  arrange(account_age)

ggplot(trend_account_age, aes(x = account_age, y = avg_balance)) +
  geom_line(color = "black", linewidth = 1.2) +
  geom_point(color = "firebrick1", size = 2) +
  scale_y_continuous(labels = dollar_format(scale = 1e-6, prefix = "Rp ", suffix = " M")) +
  labs(
    title = "Tren Rata-rata Saldo Berdasarkan Usia Rekening (Tahun)",
    x = "Usia Rekening (Tahun)",
    y = "Rata-rata Saldo"
  ) +
  theme_minimal()

Histogram + Density Plot

ggplot(df_clean, aes(x = credit_score)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "dodgerblue", color = "white", alpha = 0.7) +
  geom_density(color = "firebrick2", linewidth = 1.2) +
  labs(
    title = "Distribusi Skor Kredit Nasabah",
    subtitle = "Garis merah menunjukkan estimasi kepadatan distribusi (Density)",
    x = "Skor Kredit",
    y = "Densitas"
  ) +
  theme_minimal()

Correlation Heatmap

cor_data <- df_clean %>%
  select(age, income, balance, num_transactions, credit_score, account_age) %>%
  cor(use = "complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column(var = "var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "correlation")

ggplot(cor_data, aes(x = var1, y = var2, fill = correlation)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(correlation, 2)), color = "black", size = 3.5) +
  scale_fill_gradient2(low = "red3", mid = "white", high = "seagreen3", midpoint = 0, limits = c(-1, 1)) +
  labs(
    title = "Matriks Korelasi Antar Variabel Numerik",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))