# Install necessary packages, but only those that are not already installed.
# An unconditional install.packages() call re-runs the installer (and possibly
# a download) for every package each time the script is executed.
# Versions recorded when this analysis was last run (R 4.4):
#   embed 1.1.5, tidyverse 2.0.0, tidymodels 1.3.0
required_packages <- c("embed", "tidyverse", "tidymodels")
# requireNamespace() returns FALSE instead of erroring when a package is absent
is_installed <- vapply(
  required_packages,
  \(pkg) requireNamespace(pkg, quietly = TRUE),
  logical(1)
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Load the required libraries
library(tidyverse) # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels) # For modeling and machine learning
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.7 ✔ rsample 1.2.1
## ✔ dials 1.4.0 ✔ tune 1.3.0
## ✔ infer 1.0.7 ✔ workflows 1.2.0
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.3.0 ✔ yardstick 1.3.2
## ✔ recipes 1.1.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(embed) # For embedding and encoding
# Remote location of the credit score CSV file
data_url <- "https://assets.datacamp.com/production/repositories/6081/datasets/e02471e553bc28edddc1fe862666d36e04daed80/credit_score.csv"
# Download and parse the CSV into a tibble; readr infers the column types
credit_df <- read_csv(file = data_url)
## Rows: 18965 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): month, occupation, credit_mix, payment_of_min_amount, payment_beha...
## dbl (17): age, annual_income, monthly_inhand_salary, num_bank_accounts, num_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert credit_score to a factor with its levels in the order
# Poor, Standard, Good so that plots list the classes in that order
credit_df <- mutate(
  credit_df,
  credit_score = factor(credit_score, levels = c("Poor", "Standard", "Good"))
)
# Print a compact, column-by-column overview of the dataset
glimpse(credit_df)
## Rows: 18,965
## Columns: 23
## $ month <chr> "January", "July", "April", "January", "Febru…
## $ age <dbl> 44, 19, 39, 43, 22, 52, 32, 45, 37, 22, 26, 2…
## $ occupation <chr> "Doctor", "Doctor", "Manager", "Developer", "…
## $ annual_income <dbl> 32625.590, 31041.460, 25876.180, 15928.750, 1…
## $ monthly_inhand_salary <dbl> 2922.7992, 2501.7883, 2293.3483, 1301.3958, 1…
## $ num_bank_accounts <dbl> 0, 5, 4, 9, 7, 5, 7, 9, 1, 7, 5, 7, 9, 4, 4, …
## $ num_credit_card <dbl> 5, 2, 6, 9, 9, 5, 8, 8, 5, 7, 5, 5, 5, 5, 7, …
## $ interest_rate <dbl> 8, 5, 7, 24, 18, 5, 26, 21, 10, 25, 3, 23, 32…
## $ num_of_loan <dbl> 4, 1, 3, 4, 2, 4, 9, 2, 2, 7, 1, 3, 6, 1, 2, …
## $ delay_from_due_date <dbl> 5, 5, 5, 26, 40, 23, 62, 18, 13, 51, 10, 15, …
## $ num_of_delayed_payment <dbl> 10, 11, 15, 22, 17, 1, 18, 18, 3, 16, 7, 21, …
## $ changed_credit_limit <dbl> 2.55, 5.13, 10.29, 1.26, 6.09, 0.75, 22.13, 8…
## $ num_credit_inquiries <dbl> 3, 0, 4, 12, 12, 1, 12, 13, 3, 8, 0, 9, 12, 1…
## $ credit_mix <chr> "_", "Good", "_", "Bad", "Bad", "Good", "Bad"…
## $ outstanding_debt <dbl> 177.90, 291.77, 71.54, 2240.56, 2063.45, 636.…
## $ credit_utilization_ratio <dbl> 39.41268, 25.26184, 25.24044, 31.18439, 23.13…
## $ payment_of_min_amount <chr> "No", "NM", "NM", "Yes", "NM", "No", "NM", "Y…
## $ total_emi_per_month <dbl> 73.125008, 21.021180, 37.374350, 31.846679, 1…
## $ amount_invested_monthly <dbl> 139.43374, 185.31083, 117.93599, 161.38822, 1…
## $ payment_behaviour <chr> "Low_spent_Small_value_payments", "Low_spent_…
## $ monthly_balance <dbl> 369.7212, 333.8468, 354.0245, 226.9047, 343.9…
## $ credit_history_months <dbl> 344, 398, 257, 145, 161, 199, 31, 121, 244, 1…
## $ credit_score <fct> Standard, Good, Standard, Poor, Poor, Poor, S…
# Density curves of annual income, one per credit score level
ggplot(credit_df, aes(x = annual_income, color = credit_score)) +
  geom_density() +
  # Keep the x-axis to 0-200,000; rows outside this range are dropped
  xlim(0, 200000) +
  labs(
    title = "Annual Income Distribution by Credit Score",
    x = "Annual Income",
    y = "Density",
    color = "Credit Score"
  )
## Warning: Removed 199 rows containing non-finite outside the scale range
## (`stat_density()`).

# Density curves of age, one per credit score level
ggplot(credit_df, aes(x = age, color = credit_score)) +
  geom_density() +
  labs(
    title = "Age Distribution by Credit Score",
    x = "Age",
    y = "Density",
    color = "Credit Score"
  )

# Jittered scatter of payment delay against length of credit history,
# coloured by credit score level
ggplot(
  credit_df,
  aes(x = delay_from_due_date, y = credit_history_months, color = credit_score)
) +
  # Semi-transparent, jittered points reduce overplotting
  geom_jitter(alpha = 0.4) +
  labs(
    title = "Delay from Due Date vs. Credit History Months",
    x = "Delay from Due Date",
    y = "Credit History Months",
    color = "Credit Score"
  )

# Jittered scatter of credit utilization ratio against number of credit cards,
# coloured by credit score level
ggplot(
  credit_df,
  aes(x = credit_utilization_ratio, y = num_credit_card, color = credit_score)
) +
  # Semi-transparent, jittered points reduce overplotting
  geom_jitter(alpha = 0.4) +
  # Keep the y-axis to 0-10 cards; rows outside this range are dropped
  ylim(0, 10) +
  labs(
    title = "Credit Utilization Ratio vs. Number of Credit Cards",
    x = "Credit Utilization Ratio",
    y = "Number of Credit Cards",
    color = "Credit Score"
  )
## Warning: Removed 981 rows containing missing values or values outside the scale range
## (`geom_point()`).
