# Install necessary packages (only when they are not already installed).
# requireNamespace() checks for the package without attaching it, so
# re-running the script does not reinstall embed every time.
if (!requireNamespace("embed", quietly = TRUE)) {
  install.packages("embed")
}
## The following package(s) will be installed:
## - embed [1.1.5]
## These packages will be installed into "D:/Credit_Score/Cerdit_Score/renv/library/windows/R-4.4/x86_64-w64-mingw32".
## 
## # Installing packages --------------------------------------------------------
## - Installing embed ...                          OK [copied from cache in 0.34s]
## Successfully installed 1 package in 0.38 seconds.
# Install tidyverse only when it is missing, so re-running the script does
# not reinstall it every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## The following package(s) will be installed:
## - tidyverse [2.0.0]
## These packages will be installed into "D:/Credit_Score/Cerdit_Score/renv/library/windows/R-4.4/x86_64-w64-mingw32".
## 
## # Installing packages --------------------------------------------------------
## - Installing tidyverse ...                      OK [copied from cache in 0.35s]
## Successfully installed 1 package in 0.4 seconds.
# Install tidymodels only when it is missing, so re-running the script does
# not download and reinstall it (and its dependencies) every time.
if (!requireNamespace("tidymodels", quietly = TRUE)) {
  install.packages("tidymodels")
}
## # Downloading packages -------------------------------------------------------
## - Downloading tidymodels from CRAN ...          OK [87.2 Kb in 3.5s]
## - Downloading dials from CRAN ...               OK [436.6 Kb in 3.3s]
## - Downloading DiceDesign from CRAN ...          OK [333.5 Kb in 4.8s]
## - Downloading sfd from CRAN ...                 OK [2.3 Mb in 4.3s]
## - Downloading infer from CRAN ...               OK [2 Mb in 4.2s]
## - Downloading modeldata from CRAN ...           OK [4.9 Mb in 8.0s]
## - Downloading parsnip from CRAN ...             OK [1.4 Mb in 5.0s]
## - Downloading tune from CRAN ...                OK [1.9 Mb in 3.4s]
## - Downloading doFuture from CRAN ...            OK [135.8 Kb in 2.7s]
## - Downloading foreach from CRAN ...             OK [146.5 Kb in 2.7s]
## - Downloading iterators from CRAN ...           OK [347.3 Kb in 13s]
## - Downloading GPfit from CRAN ...               OK [77.8 Kb in 3.3s]
## - Downloading lhs from CRAN ...                 OK [697.6 Kb in 25s]
## - Downloading workflows from CRAN ...           OK [256.7 Kb in 7.5s]
## - Downloading modelenv from CRAN ...            OK [101.5 Kb in 3.0s]
## - Downloading yardstick from CRAN ...           OK [1.1 Mb in 3.9s]
## - Downloading workflowsets from CRAN ...        OK [2.7 Mb in 7.4s]
## Successfully downloaded 17 packages in 110 seconds.
## 
## The following package(s) will be installed:
## - dials        [1.4.0]
## - DiceDesign   [1.10]
## - doFuture     [1.0.1]
## - foreach      [1.5.2]
## - GPfit        [1.0-8]
## - infer        [1.0.7]
## - iterators    [1.0.14]
## - lhs          [1.2.0]
## - modeldata    [1.4.0]
## - modelenv     [0.2.0]
## - parsnip      [1.3.0]
## - patchwork    [1.3.0]
## - sfd          [0.1.0]
## - tidymodels   [1.3.0]
## - tune         [1.3.0]
## - workflows    [1.2.0]
## - workflowsets [1.1.0]
## - yardstick    [1.3.2]
## These packages will be installed into "D:/Credit_Score/Cerdit_Score/renv/library/windows/R-4.4/x86_64-w64-mingw32".
## 
## # Installing packages --------------------------------------------------------
## - Installing DiceDesign ...                     OK [installed binary and cached in 0.45s]
## - Installing sfd ...                            OK [installed binary and cached in 0.39s]
## - Installing dials ...                          OK [installed binary and cached in 0.49s]
## - Installing patchwork ...                      OK [copied from cache in 0.38s]
## - Installing infer ...                          OK [installed binary and cached in 0.57s]
## - Installing modeldata ...                      OK [installed binary and cached in 0.52s]
## - Installing parsnip ...                        OK [installed binary and cached in 0.52s]
## - Installing iterators ...                      OK [installed binary and cached in 0.49s]
## - Installing foreach ...                        OK [installed binary and cached in 0.5s]
## - Installing doFuture ...                       OK [installed binary and cached in 0.48s]
## - Installing lhs ...                            OK [installed binary and cached in 0.51s]
## - Installing GPfit ...                          OK [installed binary and cached in 0.35s]
## - Installing modelenv ...                       OK [installed binary and cached in 0.37s]
## - Installing workflows ...                      OK [installed binary and cached in 0.48s]
## - Installing yardstick ...                      OK [installed binary and cached in 0.55s]
## - Installing tune ...                           OK [installed binary and cached in 0.47s]
## - Installing workflowsets ...                   OK [installed binary and cached in 0.58s]
## - Installing tidymodels ...                     OK [installed binary and cached in 0.5s]
## Successfully installed 18 packages in 14 seconds.
# Load the required libraries
library(tidyverse)    # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)   # For modeling and machine learning
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.7     ✔ rsample      1.2.1
## ✔ dials        1.4.0     ✔ tune         1.3.0
## ✔ infer        1.0.7     ✔ workflows    1.2.0
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.3.0     ✔ yardstick    1.3.2
## ✔ recipes      1.1.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(embed)        # For embedding and encoding
# Remote CSV holding the credit score dataset
data_url <- "https://assets.datacamp.com/production/repositories/6081/datasets/e02471e553bc28edddc1fe862666d36e04daed80/credit_score.csv"

# Fetch and parse the CSV into a tibble; column types are guessed by readr
credit_df <- data_url %>%
  read_csv()
## Rows: 18965 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): month, occupation, credit_mix, payment_of_min_amount, payment_beha...
## dbl (17): age, annual_income, monthly_inhand_salary, num_bank_accounts, num_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Turn credit_score into a factor whose levels run Poor, Standard, Good,
# so that it is handled in that order rather than alphabetically
credit_df <- mutate(
  credit_df,
  credit_score = factor(
    credit_score,
    levels = c("Poor", "Standard", "Good")
  )
)

# Print a compact column-by-column overview of the dataset
credit_df %>%
  glimpse()
## Rows: 18,965
## Columns: 23
## $ month                    <chr> "January", "July", "April", "January", "Febru…
## $ age                      <dbl> 44, 19, 39, 43, 22, 52, 32, 45, 37, 22, 26, 2…
## $ occupation               <chr> "Doctor", "Doctor", "Manager", "Developer", "…
## $ annual_income            <dbl> 32625.590, 31041.460, 25876.180, 15928.750, 1…
## $ monthly_inhand_salary    <dbl> 2922.7992, 2501.7883, 2293.3483, 1301.3958, 1…
## $ num_bank_accounts        <dbl> 0, 5, 4, 9, 7, 5, 7, 9, 1, 7, 5, 7, 9, 4, 4, …
## $ num_credit_card          <dbl> 5, 2, 6, 9, 9, 5, 8, 8, 5, 7, 5, 5, 5, 5, 7, …
## $ interest_rate            <dbl> 8, 5, 7, 24, 18, 5, 26, 21, 10, 25, 3, 23, 32…
## $ num_of_loan              <dbl> 4, 1, 3, 4, 2, 4, 9, 2, 2, 7, 1, 3, 6, 1, 2, …
## $ delay_from_due_date      <dbl> 5, 5, 5, 26, 40, 23, 62, 18, 13, 51, 10, 15, …
## $ num_of_delayed_payment   <dbl> 10, 11, 15, 22, 17, 1, 18, 18, 3, 16, 7, 21, …
## $ changed_credit_limit     <dbl> 2.55, 5.13, 10.29, 1.26, 6.09, 0.75, 22.13, 8…
## $ num_credit_inquiries     <dbl> 3, 0, 4, 12, 12, 1, 12, 13, 3, 8, 0, 9, 12, 1…
## $ credit_mix               <chr> "_", "Good", "_", "Bad", "Bad", "Good", "Bad"…
## $ outstanding_debt         <dbl> 177.90, 291.77, 71.54, 2240.56, 2063.45, 636.…
## $ credit_utilization_ratio <dbl> 39.41268, 25.26184, 25.24044, 31.18439, 23.13…
## $ payment_of_min_amount    <chr> "No", "NM", "NM", "Yes", "NM", "No", "NM", "Y…
## $ total_emi_per_month      <dbl> 73.125008, 21.021180, 37.374350, 31.846679, 1…
## $ amount_invested_monthly  <dbl> 139.43374, 185.31083, 117.93599, 161.38822, 1…
## $ payment_behaviour        <chr> "Low_spent_Small_value_payments", "Low_spent_…
## $ monthly_balance          <dbl> 369.7212, 333.8468, 354.0245, 226.9047, 343.9…
## $ credit_history_months    <dbl> 344, 398, 257, 145, 161, 199, 31, 121, 244, 1…
## $ credit_score             <fct> Standard, Good, Standard, Poor, Poor, Poor, S…
# Density of annual income, one curve per credit score level.
# xlim() restricts the x scale to 0-200000: rows outside that range are
# dropped before the density is estimated (ggplot2 reports them in a warning).
ggplot(
  data = credit_df,
  mapping = aes(x = annual_income, color = credit_score)
) +
  geom_density() +
  xlim(0, 200000) +
  labs(
    title = "Annual Income Distribution by Credit Score",
    x = "Annual Income",
    y = "Density",
    color = "Credit Score"
  )
## Warning: Removed 199 rows containing non-finite outside the scale range
## (`stat_density()`).

# Density of age, one curve per credit score level
ggplot(
  data = credit_df,
  mapping = aes(x = age, color = credit_score)
) +
  geom_density() +
  labs(
    title = "Age Distribution by Credit Score",
    x = "Age",
    y = "Density",
    color = "Credit Score"
  )

# Jittered scatter of delay_from_due_date (x) against credit_history_months (y),
# coloured by credit score level
ggplot(
  data = credit_df,
  mapping = aes(
    x = delay_from_due_date,
    y = credit_history_months,
    color = credit_score
  )
) +
  # Points drawn with the default jitter position to reduce overplotting;
  # alpha makes them semi-transparent
  geom_point(alpha = 0.4, position = "jitter") +
  labs(
    title = "Delay from Due Date vs. Credit History Months",
    x = "Delay from Due Date",
    y = "Credit History Months",
    color = "Credit Score"
  )

# Jittered scatter of credit_utilization_ratio (x) against num_credit_card (y),
# coloured by credit score level
credit_df %>%
  ggplot(
    aes(
      x = credit_utilization_ratio,
      y = num_credit_card,
      color = credit_score
    )
  ) +
  geom_jitter(alpha = 0.4) +  # Add jitter to avoid overplotting
  # Zoom the y-axis to 0-10 with coord_cartesian() rather than ylim():
  # ylim() sets scale limits, which turns out-of-range observations into NA
  # and drops them with a "Removed rows" warning, whereas coord_cartesian()
  # only changes the visible region and leaves the data untouched.
  coord_cartesian(ylim = c(0, 10)) +
  labs(
    title = "Credit Utilization Ratio vs. Number of Credit Cards",
    x = "Credit Utilization Ratio",
    y = "Number of Credit Cards",
    color = "Credit Score"
  )