Project Title:

Data-Driven Financial Risk Analysis: Fraud Detection and Credit Limit Prediction Using Machine Learning

Dataset:

https://www.kaggle.com/datasets/computingvictor/transactions-fraud-datasets

Problem Statement

Goal:

We are developing a dual-model machine learning system to enhance decision-making in real-time:

Fraud Detection (Classification):

  • Goal: Instantly flag transactions as “Fraudulent” or “Legitimate.”
  • Method: Utilizing transaction metadata, user history, and card features to identify anomalies.

Credit Limit Optimization (Regression):

  • Goal: Predict the optimal credit limit for each user.
  • Method: Analyzing financial behavior and demographics to balance risk with customer purchasing power.

Research Questions:

  1. How can we classify whether a transaction is fraudulent based on user information, card attributes, merchant data, and transaction behavior?
  2. How can we accurately predict credit limits for cardholders based on their financial behavior, demographics, and transaction patterns?

Objectives:

  1. To develop a machine learning model to classify credit card transactions as fraudulent or legitimate using transaction, card, and user features.
  2. To build a regression model to predict appropriate credit limits for cardholders based on financial behavior, demographics, and transaction patterns.

Key Impact:

  1. Reduces financial loss from fraud.
  2. Enhances customer experience by automatically offering appropriate credit limits.

Import Package

library(data.table)   
library(jsonlite)     
library(dplyr)        
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)

1. LOAD DATA

# Load CSV files with fread (much faster than read.csv)
user_df <- fread('Financial/users_data.csv')

card_df <- fread('Financial/cards_data.csv')

transaction_df <- fread('Financial/transactions_data.csv')

train_fraud_label_df <- fread('Financial/train_fraud_labels.csv')

mcc_json <- fromJSON('Financial/mcc_codes.json')
mcc_codes_df <- data.table(
  mcc_code = names(mcc_json),
  description = as.character(mcc_json)
)

2. DATASET OVERVIEW

2.1 Display number or rows and columns

cat("User data:        ", format(nrow(user_df), big.mark=","), "rows,", ncol(user_df), "columns")
## User data:         2,000 rows, 14 columns
cat("Card data:        ", format(nrow(card_df), big.mark=","), "rows,", ncol(card_df), "columns")
## Card data:         6,146 rows, 13 columns
cat("Transaction data: ", format(nrow(transaction_df), big.mark=","), "rows,", ncol(transaction_df), "columns")
## Transaction data:  13,305,915 rows, 12 columns
cat("Fraud labels:     ", format(nrow(train_fraud_label_df), big.mark=","), "rows,", ncol(train_fraud_label_df), "columns")
## Fraud labels:      8,914,963 rows, 2 columns
cat("MCC codes:        ", format(nrow(mcc_codes_df), big.mark=","), "rows,", ncol(mcc_codes_df), "columns")
## MCC codes:         109 rows, 2 columns

2.2 Display column names

cat("User columns:", paste(names(user_df), collapse=", "))
## User columns: id, current_age, retirement_age, birth_year, birth_month, gender, address, latitude, longitude, per_capita_income, yearly_income, total_debt, credit_score, num_credit_cards
cat("Card columns:", paste(names(card_df), collapse=", "))
## Card columns: id, client_id, card_brand, card_type, card_number, expires, cvv, has_chip, num_cards_issued, credit_limit, acct_open_date, year_pin_last_changed, card_on_dark_web
cat("Transaction columns:", paste(names(transaction_df), collapse=", "))
## Transaction columns: id, date, client_id, card_id, amount, use_chip, merchant_id, merchant_city, merchant_state, zip, mcc, errors
cat("Fraud label columns:", paste(names(train_fraud_label_df), collapse=", "))
## Fraud label columns: transaction_id, target
cat("MCC code columns:", paste(names(mcc_codes_df), collapse=", "))
## MCC code columns: mcc_code, description

3. DATA QUALITY CHECK

3.1 Check for duplicates

cat("User duplicates (by id):", sum(duplicated(user_df$id)))
## User duplicates (by id): 0
cat("Card duplicates (by id):", sum(duplicated(card_df$id)))
## Card duplicates (by id): 0
cat("Transaction duplicates (by id):", sum(duplicated(transaction_df$id)))
## Transaction duplicates (by id): 0
cat("Fraud label duplicates (by id):", sum(duplicated(train_fraud_label_df$id)))
## Fraud label duplicates (by id): 0

3.3 Check for missing values

user_missing <- colSums(is.na(user_df))
if(sum(user_missing) == 0) {
  print("No missing value in User Data")
} else {
  print(user_missing[user_missing > 0])
  cat("Missing %:", round(user_missing[user_missing > 0] / nrow(user_df) * 100, 2))
}
## [1] "No missing value in User Data"
card_missing <- colSums(is.na(card_df))
if(sum(card_missing) == 0) {
  print("No missing value in Card Data")
} else {
  print(card_missing[card_missing > 0])
  cat("Missing %:", round(card_missing[card_missing > 0] / nrow(card_df) * 100, 2))
}
## [1] "No missing value in Card Data"
trans_missing <- colSums(is.na(transaction_df))
if(sum(trans_missing) == 0) {
  print("No missing value in Transaction Data")
} else {
  print(trans_missing[trans_missing > 0])
  cat("Missing % (in sample):", round(trans_missing[trans_missing > 0] / nrow(transaction_df) * 100, 2))
}
##     zip 
## 1652706 
## Missing % (in sample): 12.42
fraud_missing <- colSums(is.na(train_fraud_label_df))
if(sum(fraud_missing) == 0) {
  print("No missing value in Fraud Data")
} else {
  print(fraud_missing[fraud_missing > 0])
  cat("Missing % (in sample):", round(fraud_missing[fraud_missing > 0] / nrow(train_fraud_label_df) * 100, 2))
}
## [1] "No missing value in Fraud Data"

Explanation: For the missing values in zip column in transaction data, we will

  • keep the missing values, and fill it with 00000, and

  • create one more feature “zip_missing”.

This is because it might be useful for fraud detection (missing transaction locations details, higher chances to be fraud transaction)

4. DATA CLEANING

4.1 Handle missing ZIP codes

if("zip" %in% names(transaction_df)) {
  # Create a flag for missing ZIP (useful feature)
  transaction_df[, zip_missing := ifelse(is.na(zip), 1, 0)]
  
  # Fill with placeholder for missing 
  transaction_df[is.na(zip), zip := "00000"]
  
  cat("Missing ZIP codes handled (", sum(transaction_df$zip_missing), "flagged )")
}
## Warning in `[.data.table`(transaction_df, is.na(zip), `:=`(zip, "00000")):
## Coercing 'character' RHS to 'double' to match the type of column 10 named
## 'zip'.
## Missing ZIP codes handled ( 1652706 flagged )

4.2 Remove duplicates for all tables

user_df <- unique(user_df, by = "id")
card_df <- unique(card_df, by = "id")
transaction_df <- unique(transaction_df, by = "id")
train_fraud_label_df <- unique(train_fraud_label_df, by = "transaction_id")

4.3 Set keys for faster joins

setkey(user_df, id)
setkey(card_df, id, client_id)
setkey(transaction_df, id, client_id, card_id)
setkey(train_fraud_label_df, transaction_id)

4.4 Convert date columns

4.4.1. Transaction date

if("date" %in% names(transaction_df)) {
  transaction_df[, date := as.Date(date)]
  print("Transaction dates converted.")
}
## [1] "Transaction dates converted."

4.4.2. acct_open_date

if("acct_open_date" %in% names(card_df)) {
  # Handle MM/YYYY format - convert to first day of the month
  card_df[, acct_open_date := as.Date(paste0("01/", acct_open_date), format = "%d/%m/%Y")]
  print("Account open dates converted")
}
## [1] "Account open dates converted"

4.4.3. expire

if("expires" %in% names(card_df)) {
  # Handle MM/YYYY format - convert to first day of the month
  card_df[, expires := as.Date(paste0("01/", expires), format = "%d/%m/%Y")]
  print("Expired dates converted")
}
## [1] "Expired dates converted"

4.5 Handle Target column

4.5.1. credit_limit (regression target)

  1. remove dollar sign
  2. Convert to numeric
  3. remove invalid values (negative or zero credit limits)
cat("Number of rows before removing invalid values:", nrow(card_df))
## Number of rows before removing invalid values: 6146
if("credit_limit" %in% names(card_df)) {
  # Remove dollar sign and convert to numeric
  card_df[, credit_limit := as.numeric(gsub("[$,]", "", credit_limit))]
  # Remove negative or zero credit limits
  card_df <- card_df[is.na(credit_limit) | credit_limit > 0]
}
cat("Number of rows after removing invalid values (negative or zero credit limits):", nrow(card_df))
## Number of rows after removing invalid values (negative or zero credit limits): 6115

4.5.2. fraud target (binary: YES/NO)

if("target" %in% names(train_fraud_label_df)) {
  # Convert to binary factor
  train_fraud_label_df[, target := factor(target, levels = c("No", "Yes"))]
}
head(train_fraud_label_df)
## Key: <transaction_id>
##    transaction_id target
##             <int> <fctr>
## 1:        7475327     No
## 2:        7475328     No
## 3:        7475329     No
## 4:        7475332     No
## 5:        7475333     No
## 6:        7475335     No

4.6 Transaction table: feature columns data cleaning

4.6.1. Clean merchant city and state - remove leading/trailing spaces

if("merchant_city" %in% names(transaction_df)) {
  transaction_df[, merchant_city := trimws(merchant_city)]
}
if("merchant_state" %in% names(transaction_df)) {
  transaction_df[, merchant_state := trimws(merchant_state)]
}

4.6.2. Handle amount column - remove dollar sign, convert to numeric

if("amount" %in% names(transaction_df)) {
  # Remove dollar sign and convert to numeric
  transaction_df[, amount := as.numeric(gsub("[$,]", "", amount))]
}

4.6.3. Feature Engineering - Create is_refund column to flag negative transaction

if("amount" %in% names(transaction_df)) {
  # Create flag for negative amounts (refunds/returns)
  transaction_df[, is_refund := ifelse(amount < 0, 1, 0)]
  cat("Negative amounts (refunds):", sum(transaction_df$amount < 0, na.rm=TRUE))
}
## Negative amounts (refunds): 660049

4.6.4. Feature Engineering - Create binary flags for each error type

  1. Create binary flags for each error type (useful for fraud detection)
    1. has_error
    2. error_bad_expiration
    3. error_bad_card_number
    4. error_insufficient_balance
    5. error_bad_pin
    6. error_bad_cvv
    7. error_bad_zipcode
    8. error_technical_glitch
  1. Count number of errors per transaction and create error count columns
if("errors" %in% names(transaction_df)) {

  # Replace empty strings or whitespace with NA
  transaction_df[errors == "" | trimws(errors) == "", errors := NA]
  
  # Create binary flags for each error type 
  transaction_df[, has_error := ifelse(is.na(errors), 0, 1)]
  transaction_df[, error_bad_expiration := ifelse(grepl("Bad Expiration", errors, ignore.case = TRUE), 1, 0)]
  transaction_df[, error_bad_card_number := ifelse(grepl("Bad Card Number", errors, ignore.case = TRUE), 1, 0)]
  transaction_df[, error_insufficient_balance := ifelse(grepl("Insufficient Balance", errors, ignore.case = TRUE), 1, 0)]
  transaction_df[, error_bad_pin := ifelse(grepl("Bad PIN", errors, ignore.case = TRUE), 1, 0)]
  transaction_df[, error_bad_cvv := ifelse(grepl("Bad CVV", errors, ignore.case = TRUE), 1, 0)]
  transaction_df[, error_bad_zipcode := ifelse(grepl("Bad Zipcode", errors, ignore.case = TRUE), 1, 0)]
  transaction_df[, error_technical_glitch := ifelse(grepl("Technical Glitch", errors, ignore.case = TRUE), 1, 0)]
  
  # Count number of errors per transaction
  transaction_df[, error_count := has_error]
  transaction_df[!is.na(errors), error_count := str_count(errors, ",") + 1]
  
  # Print error distribution
  cat("Transactions with errors:", sum(transaction_df$has_error), 
      "(", round(sum(transaction_df$has_error)/nrow(transaction_df)*100, 2), "%)\n")
  
  # Drop errors column
  transaction_df[, errors := NULL]
}
## Transactions with errors: 211393 ( 1.59 %)

4.7 Card data: feature columns data cleaning

4.7.1. Convert card_on_dark_web and has_chip binary columns (Yes/No) to numeric (0/1)

if("card_on_dark_web" %in% names(card_df)) {
  card_df[, card_on_dark_web := ifelse(toupper(card_on_dark_web) == "Yes", 1, 0)]
  table(card_df$card_on_dark_web)
}
## 
##    0 
## 6115
if("has_chip" %in% names(card_df)) {
  card_df[, has_chip := ifelse(toupper(has_chip) == "YES", 1, 0)]
  table(card_df$has_chip)
}
## 
##    0    1 
##  644 5471

Observation: Card on dark web column only consists of single value (No), we will drop this column as it contains zero predictive power.

card_df[, card_on_dark_web := NULL]
names(card_df)
##  [1] "id"                    "client_id"             "card_brand"           
##  [4] "card_type"             "card_number"           "expires"              
##  [7] "cvv"                   "has_chip"              "num_cards_issued"     
## [10] "credit_limit"          "acct_open_date"        "year_pin_last_changed"

User data: feature columns data cleaning

4.7.1. Clean financial columns - remove dollar signs and convert to numeric

if("per_capita_income" %in% names(user_df)) {
  user_df[, per_capita_income := as.numeric(gsub("[$,]", "", per_capita_income))]
  print("per_capita_income converted to numeric.")
}
## [1] "per_capita_income converted to numeric."
if("yearly_income" %in% names(user_df)) {
  user_df[, yearly_income := as.numeric(gsub("[$,]", "", yearly_income))]
  print("yearly_income converted to numeric.")
}
## [1] "yearly_income converted to numeric."
if("total_debt" %in% names(user_df)) {
  user_df[, total_debt := as.numeric(gsub("[$,]", "", total_debt))]
  print("total_debt converted to numeric.")
}
## [1] "total_debt converted to numeric."

4.7.2. Feature Engineering - Create new features debt_to_income_ratio for credit limit prediction

if("yearly_income" %in% names(user_df) & "total_debt" %in% names(user_df)) {
  # Debt-to-Income ratio (important for credit decisions)
  user_df[, debt_to_income_ratio := ifelse(yearly_income > 0, 
                                           total_debt / yearly_income, 
                                           NA)]
  print("debt_to_income_ratio feature created.")
}
## [1] "debt_to_income_ratio feature created."

4.8 mcc data: feature columns data cleaning

4.8.1. convert mcc_code to integer, as mcc_code in transaction df is integer

if("mcc" %in% names(transaction_df) & "mcc_code" %in% names(mcc_codes_df)) {
  # Convert mcc_code to integer to match transaction MCC
  mcc_codes_df[, mcc_code := as.integer(mcc_code)]
  cat("MCC codes formatted as integer for matching.\n")
}
## MCC codes formatted as integer for matching.

4.9 Final data type validation

head(transaction_df)
## Key: <id, client_id, card_id>
##         id       date client_id card_id amount          use_chip merchant_id
##      <int>     <Date>     <int>   <int>  <num>            <char>       <int>
## 1: 7475327 2010-01-01      1556    2972 -77.00 Swipe Transaction       59935
## 2: 7475328 2010-01-01       561    4575  14.57 Swipe Transaction       67570
## 3: 7475329 2010-01-01      1129     102  80.00 Swipe Transaction       27092
## 4: 7475331 2010-01-01       430    2860 200.00 Swipe Transaction       27092
## 5: 7475332 2010-01-01       848    3915  46.41 Swipe Transaction       13051
## 6: 7475333 2010-01-01      1807     165   4.81 Swipe Transaction       20519
##    merchant_city merchant_state   zip   mcc zip_missing is_refund has_error
##           <char>         <char> <num> <int>       <num>     <num>     <num>
## 1:        Beulah             ND 58523  5499           0         1         0
## 2:    Bettendorf             IA 52722  5311           0         0         0
## 3:         Vista             CA 92084  4829           0         0         0
## 4:   Crown Point             IN 46307  4829           0         0         0
## 5:       Harwood             MD 20776  5813           0         0         0
## 6:         Bronx             NY 10464  5942           0         0         0
##    error_bad_expiration error_bad_card_number error_insufficient_balance
##                   <num>                 <num>                      <num>
## 1:                    0                     0                          0
## 2:                    0                     0                          0
## 3:                    0                     0                          0
## 4:                    0                     0                          0
## 5:                    0                     0                          0
## 6:                    0                     0                          0
##    error_bad_pin error_bad_cvv error_bad_zipcode error_technical_glitch
##            <num>         <num>             <num>                  <num>
## 1:             0             0                 0                      0
## 2:             0             0                 0                      0
## 3:             0             0                 0                      0
## 4:             0             0                 0                      0
## 5:             0             0                 0                      0
## 6:             0             0                 0                      0
##    error_count
##          <num>
## 1:           0
## 2:           0
## 3:           0
## 4:           0
## 5:           0
## 6:           0
head(user_df)
## Key: <id>
##       id current_age retirement_age birth_year birth_month gender
##    <int>       <int>          <int>      <int>       <int> <char>
## 1:     0          33             69       1986           3   Male
## 2:     1          43             74       1976           4 Female
## 3:     2          48             64       1971           8   Male
## 4:     3          49             65       1970          12   Male
## 5:     4          54             72       1965           3 Female
## 6:     5          65             65       1955           2   Male
##                        address latitude longitude per_capita_income
##                         <char>    <num>     <num>             <num>
## 1:             858 Plum Avenue    43.59    -70.33             29237
## 2:              113 Burns Lane    30.44    -87.18             22247
## 3:          6035 Forest Avenue    40.84    -73.87             13461
## 4:              840 Elm Avenue    33.89    -98.51             13705
## 5: 6016 Little Creek Boulevard    47.61   -122.30             37485
## 6:            1032 Second Lane    42.95    -77.13             19095
##    yearly_income total_debt credit_score num_credit_cards debt_to_income_ratio
##            <num>      <num>        <int>            <int>                <num>
## 1:         59613      36199          763                4            0.6072333
## 2:         45360      14587          704                3            0.3215829
## 3:         27447      80850          673                5            2.9456771
## 4:         27943      18693          681                4            0.6689690
## 5:         76431     115362          716                5            1.5093614
## 6:         20614      14042          711                2            0.6811875
head(card_df)
## Key: <id, client_id>
##       id client_id card_brand card_type      card_number    expires   cvv
##    <int>     <int>     <char>    <char>            <i64>     <Date> <int>
## 1:     0      1362       Amex    Credit  393314135668401 2024-04-01   866
## 2:     1       550 Mastercard    Credit 5278231764792292 2024-06-01   396
## 3:     2       556 Mastercard     Debit 5889825928297675 2021-09-01   422
## 4:     3      1937       Visa    Credit 4289888672554714 2020-04-01   736
## 5:     4      1981 Mastercard     Debit 5433366978583845 2024-03-01   530
## 6:     5       619       Visa     Debit 4657824650820465 2024-04-01   245
##    has_chip num_cards_issued credit_limit acct_open_date year_pin_last_changed
##       <num>            <int>        <num>         <Date>                 <int>
## 1:        1                2        33900     1991-01-01                  2014
## 2:        1                1        11600     1994-01-01                  2013
## 3:        1                1        19948     1995-01-01                  2011
## 4:        1                2        16400     1995-01-01                  2015
## 5:        1                2        19439     1997-01-01                  2007
## 6:        1                2        21883     1997-01-01                  2012
head(mcc_codes_df)
##    mcc_code                                description
##       <int>                                     <char>
## 1:     5812              Eating Places and Restaurants
## 2:     5541                           Service Stations
## 3:     7996       Amusement Parks, Carnivals, Circuses
## 4:     5411               Grocery Stores, Supermarkets
## 5:     4784                      Tolls and Bridge Fees
## 6:     4900 Utilities - Electric, Gas, Water, Sanitary
head(train_fraud_label_df)
## Key: <transaction_id>
##    transaction_id target
##             <int> <fctr>
## 1:        7475327     No
## 2:        7475328     No
## 3:        7475329     No
## 4:        7475332     No
## 5:        7475333     No
## 6:        7475335     No

5. CLEANED DATA SUMMARY

5.1 Display number of rows and columns after basic cleaning

cat("User data:        ", format(nrow(user_df), big.mark=","), "rows\n")
## User data:         2,000 rows
cat("Card data:        ", format(nrow(card_df), big.mark=","), "rows\n")
## Card data:         6,115 rows
cat("Transaction data: ", format(nrow(transaction_df), big.mark=","), "rows\n")
## Transaction data:  13,305,915 rows
cat("Fraud labels:     ", format(nrow(train_fraud_label_df), big.mark=","), "rows\n")
## Fraud labels:      8,914,963 rows

5.2 Basic statistics

if("credit_limit" %in% names(card_df)) {
  print(summary(card_df$credit_limit))
}
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    7100   12600   14420   19244  151223
if("target" %in% names(train_fraud_label_df)) {
  print(table(train_fraud_label_df$target))
  cat("Fraud Rate:", round(sum(train_fraud_label_df$target == "YES") / nrow(train_fraud_label_df) * 100, 2))
}
## 
##      No     Yes 
## 8901631   13332 
## Fraud Rate: 0

6. PREPARED FRAUD DETECTION DATASET (ALL DATA MERGED)

6.1 Merge transactions with fraud labels

fraud_data <- merge(transaction_df, train_fraud_label_df, 
                    by.x = "id", by.y = "transaction_id", 
                    all.x = TRUE)
cat("After merge fraud and transaction data:", nrow(fraud_data), "rows")
## After merge fraud and transaction data: 13305915 rows

6.2 Remove transactions without fraud labels (keep only labelled data)

fraud_data <- fraud_data[!is.na(target)]
cat("  After removing unlabeled data:", format(nrow(fraud_data), big.mark=","))
##   After removing unlabeled data: 8,914,963

6.3 Add MCC descriptions

fraud_data <- merge(fraud_data, mcc_codes_df, 
                    by.x = "mcc", by.y = "mcc_code", 
                    all.x = TRUE)

6.4 Add card information

fraud_data <- merge(fraud_data, card_df, 
                    by.x = "card_id", by.y = "id", 
                    all.x = TRUE, 
                    suffixes = c("", "_card"))

6.5 Add user information

fraud_data <- merge(fraud_data, user_df, 
                    by.x = "client_id", by.y = "id", 
                    all.x = TRUE, 
                    suffixes = c("", "_user"))

6.6 Reorganize columns

# Remove client_id_card as it is same as client_id column
fraud_data[, client_id_card := NULL]

# Define column order: Transaction -> MCC -> Card -> User -> Target
transaction_cols <- c("id", "date", "client_id", "card_id", "amount", "mcc", 
                      "use_chip", "merchant_id", "merchant_city", "merchant_state", "zip",
                      "is_refund", "has_error", "error_bad_expiration", "error_bad_card_number", 
                      "error_insufficient_balance", "error_bad_pin","error_bad_cvv", 
                      "error_bad_zipcode", "error_technical_glitch", "error_count")

mcc_cols <- c("description")

card_cols <- c("card_brand", "card_type", "card_number", "expires", "cvv", "has_chip",
               "num_cards_issued", "credit_limit", "acct_open_date", "year_pin_last_changed")

user_cols <- c("current_age", "retirement_age", "birth_year", "birth_month", "gender",
               "address", "latitude", "longitude", "per_capita_income", "yearly_income",
               "total_debt", "credit_score", "num_credit_cards", "debt_to_income_ratio")

target_col <- c("target")

# Combine in desired order, keeping only columns that exist
all_cols <- c(transaction_cols, mcc_cols, card_cols, user_cols, target_col)
existing_cols <- all_cols[all_cols %in% names(fraud_data)]

# Reorder columns
setcolorder(fraud_data, existing_cols)

6.7 Final fraud detection dataset

cat("Total rows:", format(nrow(fraud_data), big.mark=","))
## Total rows: 8,914,963
cat("Total columns:", ncol(fraud_data))
## Total columns: 48
cat("Columns:", paste(names(fraud_data)[1:10], collapse=", "))
## Columns: id, date, client_id, card_id, amount, mcc, use_chip, merchant_id, merchant_city, merchant_state
head(fraud_data)
## Key: <client_id>
##          id       date client_id card_id amount   mcc          use_chip
##       <int>     <Date>     <int>   <int>  <num> <int>            <char>
## 1: 11873816 2012-10-31         0    1271  62.99  1711 Swipe Transaction
## 2: 14690298 2014-07-15         0    1271  64.96  1711 Swipe Transaction
## 3: 16182985 2015-06-03         0    1271 309.57  3000  Chip Transaction
## 4: 22742665 2019-03-30         0    1271 535.73  3000  Chip Transaction
## 5: 18200515 2016-08-08         0    1271 568.10  3001  Chip Transaction
## 6: 12614098 2013-04-15         0    1271 564.05  3132 Swipe Transaction
##    merchant_id merchant_city merchant_state   zip is_refund has_error
##          <int>        <char>         <char> <num>     <num>     <num>
## 1:       11582   Scarborough             ME  4074         0         0
## 2:       11582   Scarborough             ME  4074         0         0
## 3:       60152     Stratford             CT  6615         0         0
## 4:       60152     Stratford             CT  6615         0         0
## 5:        5594       Norwich             CT  6360         0         0
## 6:       57386   Glastonbury             CT  6033         0         0
##    error_bad_expiration error_bad_card_number error_insufficient_balance
##                   <num>                 <num>                      <num>
## 1:                    0                     0                          0
## 2:                    0                     0                          0
## 3:                    0                     0                          0
## 4:                    0                     0                          0
## 5:                    0                     0                          0
## 6:                    0                     0                          0
##    error_bad_pin error_bad_cvv error_bad_zipcode error_technical_glitch
##            <num>         <num>             <num>                  <num>
## 1:             0             0                 0                      0
## 2:             0             0                 0                      0
## 3:             0             0                 0                      0
## 4:             0             0                 0                      0
## 5:             0             0                 0                      0
## 6:             0             0                 0                      0
##    error_count                                     description card_brand
##          <num>                                          <char>     <char>
## 1:           0 Heating, Plumbing, Air Conditioning Contractors Mastercard
## 2:           0 Heating, Plumbing, Air Conditioning Contractors Mastercard
## 3:           0                                      Steelworks Mastercard
## 4:           0                                      Steelworks Mastercard
## 5:           0                    Steel Products Manufacturing Mastercard
## 6:           0                                   Leather Goods Mastercard
##    card_type      card_number    expires   cvv has_chip num_cards_issued
##       <char>            <i64>     <Date> <int>    <num>            <int>
## 1:     Debit 5050211780967429 2021-04-01   316        1                2
## 2:     Debit 5050211780967429 2021-04-01   316        1                2
## 3:     Debit 5050211780967429 2021-04-01   316        1                2
## 4:     Debit 5050211780967429 2021-04-01   316        1                2
## 5:     Debit 5050211780967429 2021-04-01   316        1                2
## 6:     Debit 5050211780967429 2021-04-01   316        1                2
##    credit_limit acct_open_date year_pin_last_changed current_age retirement_age
##           <num>         <Date>                 <int>       <int>          <int>
## 1:        31490     2011-02-01                  2011          33             69
## 2:        31490     2011-02-01                  2011          33             69
## 3:        31490     2011-02-01                  2011          33             69
## 4:        31490     2011-02-01                  2011          33             69
## 5:        31490     2011-02-01                  2011          33             69
## 6:        31490     2011-02-01                  2011          33             69
##    birth_year birth_month gender         address latitude longitude
##         <int>       <int> <char>          <char>    <num>     <num>
## 1:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 2:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 3:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 4:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 5:       1986           3   Male 858 Plum Avenue    43.59    -70.33
## 6:       1986           3   Male 858 Plum Avenue    43.59    -70.33
##    per_capita_income yearly_income total_debt credit_score num_credit_cards
##                <num>         <num>      <num>        <int>            <int>
## 1:             29237         59613      36199          763                4
## 2:             29237         59613      36199          763                4
## 3:             29237         59613      36199          763                4
## 4:             29237         59613      36199          763                4
## 5:             29237         59613      36199          763                4
## 6:             29237         59613      36199          763                4
##    debt_to_income_ratio target zip_missing
##                   <num> <fctr>       <num>
## 1:            0.6072333     No           0
## 2:            0.6072333     No           0
## 3:            0.6072333     No           0
## 4:            0.6072333     No           0
## 5:            0.6072333     No           0
## 6:            0.6072333     No           0

6.8 Check fraud distribution

if("target" %in% names(fraud_data)) {
  print(table(fraud_data$target, useNA = "ifany"))
}
## 
##      No     Yes 
## 8901631   13332

6.9 Save fraud detection dataset

# save as RDS (faster to load for next stage)
saveRDS(fraud_data, "CleanedDataSet/fraud_detection_data.rds")

7. PREPARED CREDIT LIMIT PREDICTION DATASET

7.1 Merge cards with users

credit_data <- merge(card_df, user_df, 
                     by.x = "client_id", by.y = "id", 
                     all.x = TRUE, 
                     suffixes = c("_card", "_user"))

7.2 Add aggregated transaction features per card

transaction_features <- transaction_df[, .(
  total_transactions = .N,
  avg_transaction_amount = mean(amount, na.rm = TRUE),
  max_transaction_amount = max(amount, na.rm = TRUE),
  min_transaction_amount = min(amount, na.rm = TRUE),
  total_spent = sum(amount[amount > 0], na.rm = TRUE),
  total_refunded = sum(abs(amount[amount < 0]), na.rm = TRUE),
  num_refunds = sum(is_refund, na.rm = TRUE),
  transaction_frequency = .N,  
  avg_errors = mean(has_error, na.rm = TRUE),
  total_errors = sum(has_error, na.rm = TRUE)
), by = card_id]

7.3 Merge transaction features with credit data

credit_data <- merge(credit_data, transaction_features, 
                     by.x = "id", by.y = "card_id", 
                     all.x = TRUE)

7.4 Fill NA transaction features with 0 for cards with no transactions

transaction_feature_cols <- c("total_transactions", "avg_transaction_amount", 
                              "max_transaction_amount", "min_transaction_amount",
                              "total_spent", "total_refunded", "num_refunds",
                              "transaction_frequency", "avg_errors", "total_errors")

for(col in transaction_feature_cols) {
  if(col %in% names(credit_data)) {
    credit_data[is.na(get(col)), (col) := 0]
  }
}

7.5 Reorganize columns

card_info_cols <- c("id", "client_id", "card_brand", "card_type", "card_number", 
                    "expires", "cvv", "has_chip", "num_cards_issued", 
                    "acct_open_date", "year_pin_last_changed")

user_info_cols <- c("current_age", "retirement_age", "birth_year", "birth_month", 
                    "gender", "address", "latitude", "longitude", 
                    "per_capita_income", "yearly_income", "total_debt", 
                    "credit_score", "num_credit_cards", "debt_to_income_ratio")

target_col <- c("credit_limit")

# Combine in order, keeping only existing columns
all_cols_credit <- c(card_info_cols, user_info_cols, transaction_feature_cols, target_col)
existing_cols_credit <- all_cols_credit[all_cols_credit %in% names(credit_data)]

setcolorder(credit_data, existing_cols_credit)

7.6 Final credit limit dataset

cat("Total rows:", format(nrow(credit_data), big.mark=","))
## Total rows: 6,115
cat("Total columns:", ncol(credit_data))
## Total columns: 36
cat("Columns:", paste(names(credit_data)[1:10], collapse=", "))
## Columns: id, client_id, card_brand, card_type, card_number, expires, cvv, has_chip, num_cards_issued, acct_open_date

7.7 Check target variable

if("credit_limit" %in% names(credit_data)) {
  cat("\nCredit Limit Summary:\n")
  print(summary(credit_data$credit_limit))
}
## 
## Credit Limit Summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    7100   12600   14420   19244  151223

7.8 Remove rows with missing credit_limit

credit_data <- credit_data[!is.na(credit_limit)]

7.9 Save credit limit dataset

saveRDS(credit_data, "CleanedDataSet/credit_limit_data.rds")
fwrite(credit_data, "CleanedDataSet/credit_limit_data.csv")