Import Package

library(data.table)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(corrplot)

## corrplot 0.95 loaded

library(leaflet)
library(leaflet.extras)

1 Read Clean Dataset

# Load the cleaned credit-limit dataset and preview its first rows.
full_df <- fread(input = "CleanedDataSet/credit_limit_data.csv")
head(full_df, n = 6L)

##       id client_id card_brand card_type      card_number    expires   cvv
##    <int>     <int>     <char>    <char>            <i64>     <IDat> <int>
## 1:     0      1362       Amex    Credit  393314135668401 2024-04-01   866
## 2:     1       550 Mastercard    Credit 5278231764792292 2024-06-01   396
## 3:     2       556 Mastercard     Debit 5889825928297675 2021-09-01   422
## 4:     3      1937       Visa    Credit 4289888672554714 2020-04-01   736
## 5:     4      1981 Mastercard     Debit 5433366978583845 2024-03-01   530
## 6:     5       619       Visa     Debit 4657824650820465 2024-04-01   245
##    has_chip num_cards_issued acct_open_date year_pin_last_changed current_age
##       <int>            <int>         <IDat>                 <int>       <int>
## 1:        1                2     1991-01-01                  2014          58
## 2:        1                1     1994-01-01                  2013          76
## 3:        1                1     1995-01-01                  2011          46
## 4:        1                2     1995-01-01                  2015          65
## 5:        1                2     1997-01-01                  2007          48
## 6:        1                2     1997-01-01                  2012          54
##    retirement_age birth_year birth_month gender                   address
##             <int>      <int>       <int> <char>                    <char>
## 1:             67       1962           1   Male            3385 Hill Lane
## 2:             70       1944           2   Male           4937 Maple Lane
## 3:             66       1973          12   Male          5659 Park Avenue
## 4:             62       1955           2 Female           230 Plum Avenue
## 5:             65       1972           1   Male 8975 Littlewood Boulevard
## 6:             65       1965          12   Male     498 Littlewood Avenue
##    latitude longitude per_capita_income yearly_income total_debt credit_score
##       <num>     <num>             <int>         <int>      <int>        <int>
## 1:    38.78    -77.27             35563         72510      44317          727
## 2:    47.54   -122.58             21219         30248      35766          763
## 3:    41.01    -81.60             17856         36405      31815          715
## 4:    38.78    -90.70             25350         17056      29112          667
## 5:    26.63    -81.99             19274         39303      23650          702
## 6:    44.01    -92.47             26478         53986      58381          748
##    num_credit_cards debt_to_income_ratio total_transactions
##               <int>                <num>              <int>
## 1:                4            0.6111847               3402
## 2:                4            1.1824253               2841
## 3:                2            0.8739184               8028
## 4:                5            1.7068480               2870
## 5:                7            0.6017352                890
## 6:                4            1.0814100               2058
##    avg_transaction_amount max_transaction_amount min_transaction_amount
##                     <num>                  <num>                  <num>
## 1:               70.81277                1340.97                   -497
## 2:               54.81875                1099.49                   -500
## 3:               28.21148                1430.74                   -499
## 4:               86.89563                1165.37                   -469
## 5:               36.38093                 903.24                   -374
## 6:               43.81727                 626.73                   -491
##    total_spent total_refunded num_refunds transaction_frequency avg_errors
##          <num>          <num>       <int>                 <int>      <num>
## 1:   278052.05          37147         332                  3402 0.01704879
## 2:   163668.06           7928          58                  2841 0.01548750
## 3:   258937.80          32456         329                  8028 0.01258097
## 4:   266504.47          17114         162                  2870 0.02055749
## 5:    41365.03           8986         104                   890 0.03146067
## 6:   104039.94          13864         139                  2058 0.01311953
##    total_errors credit_limit
##           <int>        <int>
## 1:           58        33900
## 2:           44        11600
## 3:          101        19948
## 4:           59        16400
## 5:           28        19439
## 6:           27        21883

# Load the cleaned fraud-detection dataset and preview its first rows.
fraud_df <- readRDS(file = "CleanedDataSet/fraud_detection_data.rds")
head(fraud_df, n = 6L)

## Key: <client_id>
##          id       date client_id card_id amount   mcc          use_chip
##       <int>     <Date>     <int>   <int>  <num> <int>            <char>
## 1: 11873816 2012-10-31         0    1271  62.99  1711 Swipe Transaction
## 2: 14690298 2014-07-15         0    1271  64.96  1711 Swipe Transaction
## 3: 16182985 2015-06-03         0    1271 309.57  3000  Chip Transaction
## 4: 22742665 2019-03-30         0    1271 535.73  3000  Chip Transaction
## 5: 18200515 2016-08-08         0    1271 568.10  3001  Chip Transaction
## 6: 12614098 2013-04-15         0    1271 564.05  3132 Swipe Transaction
##    merchant_id merchant_city merchant_state   zip is_refund has_error
##          <int>        <char>         <char> <num>     <num>     <num>
## 1:       11582   Scarborough             ME  4074         0         0
## 2:       11582   Scarborough             ME  4074         0         0
## 3:       60152     Stratford             CT  6615         0         0
## 4:       60152     Stratford             CT  6615         0         0
## 5:        5594       Norwich             CT  6360         0         0
## 6:       57386   Glastonbury             CT  6033         0         0
##    error_bad_expiration error_bad_card_number error_insufficient_balance
##                   <num>                 <num>                      <num>
## 1:                    0                     0                          0
## 2:                    0                     0                          0
## 3:                    0                     0                          0
## 4:                    0                     0                          0
## 5:                    0                     0                          0
## 6:                    0                     0                          0
##    error_bad_pin error_bad_cvv error_bad_zipcode error_technical_glitch
##            <num>         <num>             <num>                  <num>
## 1:             0             0                 0                      0
## 2:             0             0                 0                      0
## 3:             0             0                 0                      0
## 4:             0             0                 0                      0
## 5:             0             0                 0                      0
## 6:             0             0                 0                      0
##    error_count                                     description card_brand
##          <num>                                          <char>     <char>
## 1:           0 Heating, Plumbing, Air Conditioning Contractors Mastercard
## 2:           0 Heating, Plumbing, Air Conditioning Contractors Mastercard
## 3:           0                                      Steelworks Mastercard
## 4:           0                                      Steelworks Mastercard
## 5:           0                    Steel Products Manufacturing Mastercard
## 6:           0                                   Leather Goods Mastercard
##    card_type      card_number    expires   cvv has_chip num_cards_issued
##       <char>            <i64>     <Date> <int>    <num>            <int>
## 1:     Debit 5050211780967429 2021-04-01   316        1                2
## 2:     Debit 5050211780967429 2021-04-01   316        1                2
## 3:     Debit 5050211780967429 2021-04-01   316        1                2
## 4:     Debit 5050211780967429 2021-04-01   316        1                2
## 5:     Debit 5050211780967429 2021-04-01   316        1                2
## 6:     Debit 5050211780967429 2021-04-01   316        1                2
##    credit_limit acct_open_date year_pin_last_changed current_age retirement_age
##           <num>         <Date>                 <int>       <int>          <int>
## 1:        31490     2011-02-01                  2011          33             69
## 2:        31490     2011-02-01                  2011          33             69
## 3:        31490     2011-02-01                  2011          33             69
## 4:        31490     2011-02-01                  2011          33             69
## 5:        31490     2011-02-01                  2011          33             69
## 6:        31490     2011-02-01                  2011          33             69
##    birth_year birth_month gender         address latitude longitude
##         <int>       <int> <char>          <char>    <num>     <num>
## 1:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 2:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 3:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 4:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 5:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 6:       1986           3   Male 858 Plum Avenue    43.59    -70.33
##    per_capita_income yearly_income total_debt credit_score num_credit_cards
##                <num>         <num>      <num>        <int>            <int>
## 1:             29237         59613      36199          763                4
## 2:             29237         59613      36199          763                4
## 3:             29237         59613      36199          763                4
## 4:             29237         59613      36199          763                4
## 5:             29237         59613      36199          763                4
## 6:             29237         59613      36199          763                4
##    debt_to_income_ratio target zip_missing
##                   <num> <fctr>       <num>
## 1:            0.6072333     No           0
## 2:            0.6072333     No           0
## 3:            0.6072333     No           0
## 4:            0.6072333     No           0
## 5:            0.6072333     No           0
## 6:            0.6072333     No           0

Distribution of Credit Limit

Based on the graph below, the credit limit distribution is right-skewed (positively skewed), which suggests that the majority of clients have relatively low credit limits, while a few clients have exceptionally high ones. The red vertical line marks the mean credit limit, showing where the bulk of the credit limits fall relative to the average.

# Histogram of credit_limit in full_df (30 bins, bars shaded by bin count)
# with a red vertical reference line at the mean credit limit.
ggplot(data = full_df, mapping = aes(x = credit_limit)) +
  geom_histogram(aes(fill = after_stat(count)), bins = 30, color = "white") +
  geom_vline(
    aes(xintercept = mean(credit_limit, na.rm = TRUE)),
    color = "#E15759",
    linewidth = 1.2
  ) +
  scale_fill_gradient(low = "#AED6F1", high = "#1F618D") +
  labs(
    title = "Distribution of Credit Limit",
    x = "Credit Limit",
    y = "Count",
    fill = "Frequency"
  ) +
  theme_minimal(base_size = 13)

### Fraud Distribution

This pie chart shows the distribution of fraud cases in the dataset. Non-fraud cases take up 99.9% of the chart, indicating that the dataset is highly imbalanced, which is typical for fraud detection problems where fraud cases are far fewer than non-fraud cases. This imbalance will require special handling when training machine learning models, to ensure the model learns to identify the minority class (fraud) effectively.

# Share of fraud vs. non-fraud rows in fraud_df.
#
# Display labels and slice colours are built in a small summary table instead
# of rewriting fraud_df$target in place:
#   * the target values are "Yes"/"No", so the previous case-sensitive
#     gsub("yes", ...) / gsub("no", ...) calls never matched anything;
#   * later chunks filter on target == "Yes", so the column must keep its
#     original values.
# Colours are attached to each row so that fraud is always red and non-fraud
# always green, matching the card-brand bar chart.
fraud_share <- fraud_df %>%
  count(target, name = "count") %>%
  mutate(
    label = if_else(target == "Yes", "Fraud", "Not Fraud"),
    slice_color = if_else(target == "Yes", "red", "green")
  )

pie_chart <- plot_ly(
  fraud_share,
  labels = ~label,
  values = ~count,
  type = "pie",
  textinfo = "label+percent",
  title = "Number of Fraud Cases",
  marker = list(colors = ~slice_color)
) %>%
  layout(margin = list(t = 0, b = 0, l = 0, r = 0))

pie_chart

Count of Card Brands

Based on our current data, most clients' cards are Mastercard, followed by Visa, Amex, and Discover.

# Row counts per card brand, drawn as side-by-side bars for each target value
# ("Yes" in red, "No" in green).
ggplot(data = fraud_df) +
  geom_bar(mapping = aes(x = card_brand, fill = target), position = "dodge") +
  scale_fill_manual(values = c("Yes" = "red", "No" = "green")) +
  labs(
    title = "Distribution of Card Brand by Target (Fraud vs Non-Fraud)",
    x = "Card Brand",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

### Count of Credit Limit and Fraud Cases based on card type

This graph provides a side-by-side comparison of fraud cases and total credit limit by card type. The comparison suggests that Debit cards are more prone to fraud and carry a larger financial exposure due to higher credit limits.

# Keep only the rows labelled as fraud.
fraud_data <- filter(fraud_df, target == "Yes")

# Order the card types from most to least frequent among the fraud rows so
# the bars appear in descending order of count.
fraud_data$card_type <- factor(
  fraud_data$card_type,
  levels = names(sort(table(fraud_data$card_type), decreasing = TRUE))
)

# Bar chart: number of fraud rows per card type.
count_plot <- ggplot(data = fraud_data, mapping = aes(x = card_type, fill = card_type)) +
  geom_bar() +
  scale_fill_manual(values = c("Credit" = "green", "Debit" = "red")) +
  labs(
    title = "Distribution of Card Type for Fraud Cases",
    x = "Card Type",
    y = "Count of Fraud Cases"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bar chart: credit_limit summed over the fraud rows of each card type.
credit_limit_plot <- ggplot(
  data = fraud_data,
  mapping = aes(x = card_type, y = credit_limit, fill = card_type)
) +
  geom_bar(stat = "summary", fun = "sum") +
  scale_fill_manual(values = c("Credit" = "green", "Debit" = "red")) +
  labs(
    title = "Total Credit Limit by Card Type for Fraud Cases",
    x = "Card Type",
    y = "Total Credit Limit"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert both ggplot objects to interactive plotly widgets.
count_plotly <- ggplotly(count_plot)
credit_limit_plotly <- ggplotly(credit_limit_plot)

## Warning: Removed 58 rows containing non-finite outside the scale range
## (`stat_summary()`).

# Place the two card-type charts next to each other in a single row and
# label the shared figure, its x axis and both y axes.
card_type_panels <- subplot(count_plotly, credit_limit_plotly, nrows = 1, shareX = TRUE)

fig <- layout(
  card_type_panels,
  title = "Fraud Cases and Credit Limit by Card Type",
  showlegend = FALSE,
  xaxis = list(title = "Card Type"),
  yaxis = list(title = "Count of Fraud Cases"),
  yaxis2 = list(
    title = "Total Credit Limit",
    overlaying = "y",
    side = "right"
  )
)

fig

Fraud Cases and Credit Limit by Gender

This graph presents two visualizations side by side to compare fraud cases and total credit limits by gender. Both the number of fraud cases and the total credit limit appear to be higher for females than for males, suggesting that the overall financial exposure may be greater for females. This could point to gender-based differences in credit access or borrowing behavior.

# Credit limit summed per gender over the rows of full_df (NAs ignored).
total_credit_limit_by_gender <- summarise(
  group_by(full_df, gender),
  total_credit_limit = sum(credit_limit, na.rm = TRUE)
)

# Number of fraud-labelled rows per gender in fraud_df.
fraud_cases_by_gender <- summarise(
  group_by(filter(fraud_df, target == "Yes"), gender),
  fraud_cases = n()
)

# Bar chart of the summed credit limit per gender. The fill colours are
# unnamed, so they are assigned to the gender values in scale order.
credit_limit_plot <- ggplot(
  data = total_credit_limit_by_gender,
  mapping = aes(x = gender, y = total_credit_limit, fill = gender)
) +
  geom_col(color = "black", show.legend = FALSE) +
  scale_fill_manual(values = c("lightblue", "lightpink")) +
  labs(title = "Total Credit Limit by Gender", x = "Gender", y = "Total Credit Limit") +
  theme_minimal()

# Bar chart of the fraud row count per gender, using the same colours.
fraud_cases_plot <- ggplot(
  data = fraud_cases_by_gender,
  mapping = aes(x = gender, y = fraud_cases, fill = gender)
) +
  geom_col(color = "black", show.legend = FALSE) +
  scale_fill_manual(values = c("lightblue", "lightpink")) +
  labs(title = "Number of Fraud Cases by Gender", x = "Gender", y = "Number of Fraud Cases") +
  theme_minimal()

# Convert both charts to interactive plotly widgets.
credit_limit_plotly <- ggplotly(credit_limit_plot)
fraud_cases_plotly <- ggplotly(fraud_cases_plot)

# Show the fraud count (left) and the summed credit limit (right) in one row.
fig <- layout(
  subplot(fraud_cases_plotly, credit_limit_plotly, nrows = 1, shareX = TRUE),
  title = "Fraud Cases and Credit Limit by Gender",
  showlegend = FALSE,
  xaxis = list(title = "Gender"),
  yaxis = list(title = "Count of Fraud Cases"),
  yaxis2 = list(
    title = "Total Credit Limit",
    overlaying = "y",
    side = "right"
  )
)
fig

Fraud Cases and Non-Fraud Cases by Month

This graph displays the monthly distribution of fraud cases where the highest number of fraud cases occurs in August, where the count reaches more than 1300 cases. Notice that June has the lowest number of fraud cases followed by January and February.

# Derive the calendar-month name (January ... December) of every transaction.
fraud_df$date <- as.Date(fraud_df$date)
fraud_df$month <- month.name[month(fraud_df$date)]

# Count the fraud-labelled rows per month.
fraud_cases <- fraud_df %>%
  filter(target == "Yes") %>%
  group_by(month) %>%
  summarise(count = n(), .groups = "drop")

# Ensure the month column is ordered from January to December
fraud_cases$month <- factor(fraud_cases$month, levels = month.name)

# Line plot of fraud cases per month.
# `group = 1` is required because `month` is a factor: without it every month
# forms its own single-point group and geom_line() draws nothing.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot(fraud_cases, aes(x = month, y = count, group = 1)) +
  geom_line(linewidth = 1, color = "red") +  # Red line for fraud cases
  geom_point(size = 2, color = "red") +  # Red points for fraud cases
  labs(title = "Fraud Cases by Month",
       x = "Month", y = "Number of Fraud Cases") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

Credit Score vs Credit Limit

This scatter plot shows the relationship between Credit Score and Credit Limit and suggests only a weak correlation between the two. As Credit Score increases, the Credit Limit does not show a clear upward trend, and many of the data points are clustered at the lower end of the credit limit scale. A few data points stand out at the higher end of the Credit Limit; these could represent cases with unusually high credit limits.

# Scatter plot of credit limit against credit score, overlaid with a red
# least-squares line (no confidence band).
ggplot(data = full_df, mapping = aes(x = credit_score, y = credit_limit)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(
    title = "Credit Score vs Credit Limit",
    x = "Credit Score",
    y = "Credit Limit"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

### Scatter plot of credit limit against total_spent

This scatter plot shows the relationship between total spent and credit limit. There is a slight positive correlation, indicated by the upward-sloping blue line: as individuals spend more, their credit limits tend to increase, although the relationship is weak and non-linear, especially at higher values. A few outliers were spotted at the higher end.

# Scatter plot of credit limit against total amount spent, overlaid with a
# least-squares line and its default confidence band.
# A title and readable axis labels are set explicitly so this figure matches
# the other plots in the report instead of showing raw column names.
ggplot(full_df, aes(x = total_spent, y = credit_limit)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm") +
  labs(
    title = "Total Spent vs Credit Limit",
    x = "Total Spent",
    y = "Credit Limit"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Heat Map of Credit Limit Distribution

This heatmap visualizes the distribution of credit limits across the United States, with the highest concentrations in the eastern U.S. and on the West Coast. Meanwhile, areas in the central U.S. and northern regions show a blue tint, suggesting lower concentrations of credit limits in those regions. This heatmap reflects regional economic patterns, showing that credit limits are generally more concentrated in urban and economically active areas, especially along the East and West coasts.

# Coordinates and credit limit of every row in full_df.
heatmap_data <- full_df %>%
  select(latitude, longitude, credit_limit)

# Heatmap of credit limits on OpenStreetMap tiles.
# The initial view is centred on the mean coordinates of the plotted data
# (heatmap_data) rather than on fraud_df, which is a different table with a
# different number of rows per client. `na.rm = TRUE` keeps a single missing
# coordinate from turning the centre into NA. Zoom level 4 opens at a
# country-wide view, matching the nationwide spread of the points; the
# previous level 10 opened at city scale.
# NOTE(review): `max = 0.05` is far below the credit_limit values used as
# intensity, so points probably saturate and the map mostly reflects point
# density -- confirm this is intended.
map <- leaflet(heatmap_data) %>%
  # OpenStreetMap tiles
  addTiles() %>%
  addHeatmap(
    lng = ~longitude,
    lat = ~latitude,
    intensity = ~credit_limit,
    blur = 20,
    max = 0.05,
    radius = 15
  ) %>%
  setView(
    lng = mean(heatmap_data$longitude, na.rm = TRUE),
    lat = mean(heatmap_data$latitude, na.rm = TRUE),
    zoom = 4
  )

map

Credit Limit by Card Brand

This box plot shows the distribution of credit limits by card brand that reveals significant variability in credit limits across different card brands, with Visa and Mastercard showing higher credit limits, and Amex and Discover having more concentrated and lower values.

# Box plot of credit limit for each card brand in full_df.
ggplot(
  data = full_df,
  mapping = aes(x = card_brand, y = credit_limit, fill = card_brand)
) +
  geom_boxplot() +
  labs(
    title = "Credit Limit by Card Brand",
    x = "Card Brand",
    y = "Credit Limit"
  )

Credit limit by card brand for fraud cases only

This box plot shows the distribution of credit limits by card brand for fraud cases only with Visa having a more considerable spread and a greater number of outliers at the upper end. The outliers are more prevalent in Visa and Mastercard, which suggests that these card brands tend to have higher variability in the credit limits granted to fraud cases. The box plots indicate that fraud cases associated with Visa and Mastercard generally have higher credit limits, while Amex and Discover are associated with lower credit limits for fraud cases.

# Restrict the data to fraud-labelled rows.
fraud_cases <- filter(fraud_df, target == "Yes")

# Box plot of credit limit per card brand for the fraud rows only; the four
# fill colours are assigned to the card brands in scale order.
ggplot(
  data = fraud_cases,
  mapping = aes(x = card_brand, y = credit_limit, fill = card_brand)
) +
  geom_boxplot() +
  scale_fill_manual(values = c("red", "green", "blue", "yellow")) +
  labs(
    title = "Credit Limit by Card Brand (Fraud Cases Only)",
    x = "Card Brand",
    y = "Credit Limit"
  ) +
  theme_minimal()

## Warning: Removed 58 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Correlation Plot Credit Limit

This correlation plot visualizes the relationships between the numerical variables in the dataset and how they correlate with the credit limit. The plot reveals that per_capita_income and yearly_income are the strongest factors influencing credit limit. Higher income typically signals greater financial stability and the ability to repay debt, and it is often used as an indicator of repayment capability, so individuals with higher incomes are more likely to be granted higher credit limits because they are perceived as lower-risk borrowers.

# Keep every numeric column of full_df except the card number, then compute
# pairwise correlations on the rows that have no missing values.
cor_data <- select(full_df, where(is.numeric))
cor_data <- select(cor_data, -card_number)
cor_matrix <- cor(cor_data, use = "complete.obs")

# Upper-triangle circle plot with black, vertical, slightly shrunken labels.
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "black",
  tl.srt = 90,
  tl.cex = 0.7
)

### Correlation plot of target (Fraud Cases)

This correlation plot visualizes the relationships between the numerical variables in the dataset, with a particular focus on how they relate to fraud cases. The target variable appears to be negatively correlated with indicators such as credit limit, credit score, and debt-to-income ratio. This suggests that the outcome represented by the target variable could be influenced by a combination of financial and error-related factors, which may reflect behavioral or risk-based characteristics.

# Numeric copy of the label: 1 where target is "Yes", 0 otherwise.
fraud_df$new_target <- as.numeric(fraud_df$target == "Yes")

# Keep the numeric columns of fraud_df, then drop the card number.
cor_data <- select(fraud_df, where(is.numeric))
cor_data <- select(cor_data, -card_number)

# Pairwise correlations over the rows without missing values.
cor_matrix <- cor(cor_data, use = "complete.obs")

# Upper-triangle circle plot; axis labels are black, vertical and drawn at
# a reduced font size.
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "black",
  tl.srt = 90,
  tl.cex = 0.7
)

2. EDA

Adam