Data-Driven Financial Risk Analysis: Fraud Detection and Credit Limit Prediction Using Machine Learning
We are developing a dual-model machine learning system to enhance decision-making in real-time:
library(data.table)
library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringr)
# Load CSV files with fread (much faster than read.csv)
user_df <- fread('Financial/users_data.csv')
card_df <- fread('Financial/cards_data.csv')
transaction_df <- fread('Financial/transactions_data.csv')
train_fraud_label_df <- fread('Financial/train_fraud_labels.csv')
mcc_json <- fromJSON('Financial/mcc_codes.json')
mcc_codes_df <- data.table(
mcc_code = names(mcc_json),
description = as.character(mcc_json)
)
cat("User data: ", format(nrow(user_df), big.mark=","), "rows,", ncol(user_df), "columns")
## User data: 2,000 rows, 14 columns
cat("Card data: ", format(nrow(card_df), big.mark=","), "rows,", ncol(card_df), "columns")
## Card data: 6,146 rows, 13 columns
cat("Transaction data: ", format(nrow(transaction_df), big.mark=","), "rows,", ncol(transaction_df), "columns")
## Transaction data: 13,305,915 rows, 12 columns
cat("Fraud labels: ", format(nrow(train_fraud_label_df), big.mark=","), "rows,", ncol(train_fraud_label_df), "columns")
## Fraud labels: 8,914,963 rows, 2 columns
cat("MCC codes: ", format(nrow(mcc_codes_df), big.mark=","), "rows,", ncol(mcc_codes_df), "columns")
## MCC codes: 109 rows, 2 columns
cat("User columns:", paste(names(user_df), collapse=", "))
## User columns: id, current_age, retirement_age, birth_year, birth_month, gender, address, latitude, longitude, per_capita_income, yearly_income, total_debt, credit_score, num_credit_cards
cat("Card columns:", paste(names(card_df), collapse=", "))
## Card columns: id, client_id, card_brand, card_type, card_number, expires, cvv, has_chip, num_cards_issued, credit_limit, acct_open_date, year_pin_last_changed, card_on_dark_web
cat("Transaction columns:", paste(names(transaction_df), collapse=", "))
## Transaction columns: id, date, client_id, card_id, amount, use_chip, merchant_id, merchant_city, merchant_state, zip, mcc, errors
cat("Fraud label columns:", paste(names(train_fraud_label_df), collapse=", "))
## Fraud label columns: transaction_id, target
cat("MCC code columns:", paste(names(mcc_codes_df), collapse=", "))
## MCC code columns: mcc_code, description
cat("User duplicates (by id):", sum(duplicated(user_df$id)))
## User duplicates (by id): 0
cat("Card duplicates (by id):", sum(duplicated(card_df$id)))
## Card duplicates (by id): 0
cat("Transaction duplicates (by id):", sum(duplicated(transaction_df$id)))
## Transaction duplicates (by id): 0
cat("Fraud label duplicates (by id):", sum(duplicated(train_fraud_label_df$id)))
## Fraud label duplicates (by id): 0
user_missing <- colSums(is.na(user_df))
if(sum(user_missing) == 0) {
print("No missing value in User Data")
} else {
print(user_missing[user_missing > 0])
cat("Missing %:", round(user_missing[user_missing > 0] / nrow(user_df) * 100, 2))
}
## [1] "No missing value in User Data"
card_missing <- colSums(is.na(card_df))
if(sum(card_missing) == 0) {
print("No missing value in Card Data")
} else {
print(card_missing[card_missing > 0])
cat("Missing %:", round(card_missing[card_missing > 0] / nrow(card_df) * 100, 2))
}
## [1] "No missing value in Card Data"
trans_missing <- colSums(is.na(transaction_df))
if(sum(trans_missing) == 0) {
print("No missing value in Transaction Data")
} else {
print(trans_missing[trans_missing > 0])
cat("Missing % (in sample):", round(trans_missing[trans_missing > 0] / nrow(transaction_df) * 100, 2))
}
## zip
## 1652706
## Missing % (in sample): 12.42
fraud_missing <- colSums(is.na(train_fraud_label_df))
if(sum(fraud_missing) == 0) {
print("No missing value in Fraud Data")
} else {
print(fraud_missing[fraud_missing > 0])
cat("Missing % (in sample):", round(fraud_missing[fraud_missing > 0] / nrow(train_fraud_label_df) * 100, 2))
}
## [1] "No missing value in Fraud Data"
Explanation: For the missing values in zip column in transaction data, we will
keep the missing values, and fill it with 00000, and
create one more feature “zip_missing”.
This is because it might be useful for fraud detection (missing transaction locations details, higher chances to be fraud transaction)
if("zip" %in% names(transaction_df)) {
# Create a flag for missing ZIP (useful feature)
transaction_df[, zip_missing := ifelse(is.na(zip), 1, 0)]
# Fill with placeholder for missing
transaction_df[is.na(zip), zip := "00000"]
cat("Missing ZIP codes handled (", sum(transaction_df$zip_missing), "flagged )")
}
## Warning in `[.data.table`(transaction_df, is.na(zip), `:=`(zip, "00000")):
## Coercing 'character' RHS to 'double' to match the type of column 10 named
## 'zip'.
## Missing ZIP codes handled ( 1652706 flagged )
user_df <- unique(user_df, by = "id")
card_df <- unique(card_df, by = "id")
transaction_df <- unique(transaction_df, by = "id")
train_fraud_label_df <- unique(train_fraud_label_df, by = "transaction_id")
setkey(user_df, id)
setkey(card_df, id, client_id)
setkey(transaction_df, id, client_id, card_id)
setkey(train_fraud_label_df, transaction_id)
4.4.1. Transaction date
if("date" %in% names(transaction_df)) {
transaction_df[, date := as.Date(date)]
print("Transaction dates converted.")
}
## [1] "Transaction dates converted."
4.4.2. acct_open_date
if("acct_open_date" %in% names(card_df)) {
# Handle MM/YYYY format - convert to first day of the month
card_df[, acct_open_date := as.Date(paste0("01/", acct_open_date), format = "%d/%m/%Y")]
print("Account open dates converted")
}
## [1] "Account open dates converted"
4.4.3. expire
if("expires" %in% names(card_df)) {
# Handle MM/YYYY format - convert to first day of the month
card_df[, expires := as.Date(paste0("01/", expires), format = "%d/%m/%Y")]
print("Expired dates converted")
}
## [1] "Expired dates converted"
cat("Number of rows before removing invalid values:", nrow(card_df))
## Number of rows before removing invalid values: 6146
if("credit_limit" %in% names(card_df)) {
# Remove dollar sign and convert to numeric
card_df[, credit_limit := as.numeric(gsub("[$,]", "", credit_limit))]
# Remove negative or zero credit limits
card_df <- card_df[is.na(credit_limit) | credit_limit > 0]
}
cat("Number of rows after removing invalid values (negative or zero credit limits):", nrow(card_df))
## Number of rows after removing invalid values (negative or zero credit limits): 6115
if("target" %in% names(train_fraud_label_df)) {
# Convert to binary factor
train_fraud_label_df[, target := factor(target, levels = c("No", "Yes"))]
}
head(train_fraud_label_df)
## Key: <transaction_id>
## transaction_id target
## <int> <fctr>
## 1: 7475327 No
## 2: 7475328 No
## 3: 7475329 No
## 4: 7475332 No
## 5: 7475333 No
## 6: 7475335 No
4.6.1. Clean merchant city and state - remove leading/trailing spaces
if("merchant_city" %in% names(transaction_df)) {
transaction_df[, merchant_city := trimws(merchant_city)]
}
if("merchant_state" %in% names(transaction_df)) {
transaction_df[, merchant_state := trimws(merchant_state)]
}
4.6.2. Handle amount column - remove dollar sign, convert to numeric
if("amount" %in% names(transaction_df)) {
# Remove dollar sign and convert to numeric
transaction_df[, amount := as.numeric(gsub("[$,]", "", amount))]
}
4.6.3. Feature Engineering - Create is_refund column to flag negative transaction
if("amount" %in% names(transaction_df)) {
# Create flag for negative amounts (refunds/returns)
transaction_df[, is_refund := ifelse(amount < 0, 1, 0)]
cat("Negative amounts (refunds):", sum(transaction_df$amount < 0, na.rm=TRUE))
}
## Negative amounts (refunds): 660049
4.6.4. Feature Engineering - Create binary flags for each error type
if("errors" %in% names(transaction_df)) {
# Replace empty strings or whitespace with NA
transaction_df[errors == "" | trimws(errors) == "", errors := NA]
# Create binary flags for each error type
transaction_df[, has_error := ifelse(is.na(errors), 0, 1)]
transaction_df[, error_bad_expiration := ifelse(grepl("Bad Expiration", errors, ignore.case = TRUE), 1, 0)]
transaction_df[, error_bad_card_number := ifelse(grepl("Bad Card Number", errors, ignore.case = TRUE), 1, 0)]
transaction_df[, error_insufficient_balance := ifelse(grepl("Insufficient Balance", errors, ignore.case = TRUE), 1, 0)]
transaction_df[, error_bad_pin := ifelse(grepl("Bad PIN", errors, ignore.case = TRUE), 1, 0)]
transaction_df[, error_bad_cvv := ifelse(grepl("Bad CVV", errors, ignore.case = TRUE), 1, 0)]
transaction_df[, error_bad_zipcode := ifelse(grepl("Bad Zipcode", errors, ignore.case = TRUE), 1, 0)]
transaction_df[, error_technical_glitch := ifelse(grepl("Technical Glitch", errors, ignore.case = TRUE), 1, 0)]
# Count number of errors per transaction
transaction_df[, error_count := has_error]
transaction_df[!is.na(errors), error_count := str_count(errors, ",") + 1]
# Print error distribution
cat("Transactions with errors:", sum(transaction_df$has_error),
"(", round(sum(transaction_df$has_error)/nrow(transaction_df)*100, 2), "%)\n")
# Drop errors column
transaction_df[, errors := NULL]
}
## Transactions with errors: 211393 ( 1.59 %)
4.7.1. Convert card_on_dark_web and has_chip binary columns (Yes/No) to numeric (0/1)
if("card_on_dark_web" %in% names(card_df)) {
card_df[, card_on_dark_web := ifelse(toupper(card_on_dark_web) == "Yes", 1, 0)]
table(card_df$card_on_dark_web)
}
##
## 0
## 6115
if("has_chip" %in% names(card_df)) {
card_df[, has_chip := ifelse(toupper(has_chip) == "YES", 1, 0)]
table(card_df$has_chip)
}
##
## 0 1
## 644 5471
Observation: Card on dark web column only consists of single value (No), we will drop this column as it contains zero predictive power.
card_df[, card_on_dark_web := NULL]
names(card_df)
## [1] "id" "client_id" "card_brand"
## [4] "card_type" "card_number" "expires"
## [7] "cvv" "has_chip" "num_cards_issued"
## [10] "credit_limit" "acct_open_date" "year_pin_last_changed"
4.7.1. Clean financial columns - remove dollar signs and convert to numeric
if("per_capita_income" %in% names(user_df)) {
user_df[, per_capita_income := as.numeric(gsub("[$,]", "", per_capita_income))]
print("per_capita_income converted to numeric.")
}
## [1] "per_capita_income converted to numeric."
if("yearly_income" %in% names(user_df)) {
user_df[, yearly_income := as.numeric(gsub("[$,]", "", yearly_income))]
print("yearly_income converted to numeric.")
}
## [1] "yearly_income converted to numeric."
if("total_debt" %in% names(user_df)) {
user_df[, total_debt := as.numeric(gsub("[$,]", "", total_debt))]
print("total_debt converted to numeric.")
}
## [1] "total_debt converted to numeric."
4.7.2. Feature Engineering - Create new features debt_to_income_ratio for credit limit prediction
if("yearly_income" %in% names(user_df) & "total_debt" %in% names(user_df)) {
# Debt-to-Income ratio (important for credit decisions)
user_df[, debt_to_income_ratio := ifelse(yearly_income > 0,
total_debt / yearly_income,
NA)]
print("debt_to_income_ratio feature created.")
}
## [1] "debt_to_income_ratio feature created."
4.8.1. convert mcc_code to integer, as mcc_code in transaction df is integer
if("mcc" %in% names(transaction_df) & "mcc_code" %in% names(mcc_codes_df)) {
# Convert mcc_code to integer to match transaction MCC
mcc_codes_df[, mcc_code := as.integer(mcc_code)]
cat("MCC codes formatted as integer for matching.\n")
}
## MCC codes formatted as integer for matching.
head(transaction_df)
## Key: <id, client_id, card_id>
## id date client_id card_id amount use_chip merchant_id
## <int> <Date> <int> <int> <num> <char> <int>
## 1: 7475327 2010-01-01 1556 2972 -77.00 Swipe Transaction 59935
## 2: 7475328 2010-01-01 561 4575 14.57 Swipe Transaction 67570
## 3: 7475329 2010-01-01 1129 102 80.00 Swipe Transaction 27092
## 4: 7475331 2010-01-01 430 2860 200.00 Swipe Transaction 27092
## 5: 7475332 2010-01-01 848 3915 46.41 Swipe Transaction 13051
## 6: 7475333 2010-01-01 1807 165 4.81 Swipe Transaction 20519
## merchant_city merchant_state zip mcc zip_missing is_refund has_error
## <char> <char> <num> <int> <num> <num> <num>
## 1: Beulah ND 58523 5499 0 1 0
## 2: Bettendorf IA 52722 5311 0 0 0
## 3: Vista CA 92084 4829 0 0 0
## 4: Crown Point IN 46307 4829 0 0 0
## 5: Harwood MD 20776 5813 0 0 0
## 6: Bronx NY 10464 5942 0 0 0
## error_bad_expiration error_bad_card_number error_insufficient_balance
## <num> <num> <num>
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 0 0 0
## 5: 0 0 0
## 6: 0 0 0
## error_bad_pin error_bad_cvv error_bad_zipcode error_technical_glitch
## <num> <num> <num> <num>
## 1: 0 0 0 0
## 2: 0 0 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 0 0 0 0
## 6: 0 0 0 0
## error_count
## <num>
## 1: 0
## 2: 0
## 3: 0
## 4: 0
## 5: 0
## 6: 0
head(user_df)
## Key: <id>
## id current_age retirement_age birth_year birth_month gender
## <int> <int> <int> <int> <int> <char>
## 1: 0 33 69 1986 3 Male
## 2: 1 43 74 1976 4 Female
## 3: 2 48 64 1971 8 Male
## 4: 3 49 65 1970 12 Male
## 5: 4 54 72 1965 3 Female
## 6: 5 65 65 1955 2 Male
## address latitude longitude per_capita_income
## <char> <num> <num> <num>
## 1: 858 Plum Avenue 43.59 -70.33 29237
## 2: 113 Burns Lane 30.44 -87.18 22247
## 3: 6035 Forest Avenue 40.84 -73.87 13461
## 4: 840 Elm Avenue 33.89 -98.51 13705
## 5: 6016 Little Creek Boulevard 47.61 -122.30 37485
## 6: 1032 Second Lane 42.95 -77.13 19095
## yearly_income total_debt credit_score num_credit_cards debt_to_income_ratio
## <num> <num> <int> <int> <num>
## 1: 59613 36199 763 4 0.6072333
## 2: 45360 14587 704 3 0.3215829
## 3: 27447 80850 673 5 2.9456771
## 4: 27943 18693 681 4 0.6689690
## 5: 76431 115362 716 5 1.5093614
## 6: 20614 14042 711 2 0.6811875
head(card_df)
## Key: <id, client_id>
## id client_id card_brand card_type card_number expires cvv
## <int> <int> <char> <char> <i64> <Date> <int>
## 1: 0 1362 Amex Credit 393314135668401 2024-04-01 866
## 2: 1 550 Mastercard Credit 5278231764792292 2024-06-01 396
## 3: 2 556 Mastercard Debit 5889825928297675 2021-09-01 422
## 4: 3 1937 Visa Credit 4289888672554714 2020-04-01 736
## 5: 4 1981 Mastercard Debit 5433366978583845 2024-03-01 530
## 6: 5 619 Visa Debit 4657824650820465 2024-04-01 245
## has_chip num_cards_issued credit_limit acct_open_date year_pin_last_changed
## <num> <int> <num> <Date> <int>
## 1: 1 2 33900 1991-01-01 2014
## 2: 1 1 11600 1994-01-01 2013
## 3: 1 1 19948 1995-01-01 2011
## 4: 1 2 16400 1995-01-01 2015
## 5: 1 2 19439 1997-01-01 2007
## 6: 1 2 21883 1997-01-01 2012
head(mcc_codes_df)
## mcc_code description
## <int> <char>
## 1: 5812 Eating Places and Restaurants
## 2: 5541 Service Stations
## 3: 7996 Amusement Parks, Carnivals, Circuses
## 4: 5411 Grocery Stores, Supermarkets
## 5: 4784 Tolls and Bridge Fees
## 6: 4900 Utilities - Electric, Gas, Water, Sanitary
head(train_fraud_label_df)
## Key: <transaction_id>
## transaction_id target
## <int> <fctr>
## 1: 7475327 No
## 2: 7475328 No
## 3: 7475329 No
## 4: 7475332 No
## 5: 7475333 No
## 6: 7475335 No
cat("User data: ", format(nrow(user_df), big.mark=","), "rows\n")
## User data: 2,000 rows
cat("Card data: ", format(nrow(card_df), big.mark=","), "rows\n")
## Card data: 6,115 rows
cat("Transaction data: ", format(nrow(transaction_df), big.mark=","), "rows\n")
## Transaction data: 13,305,915 rows
cat("Fraud labels: ", format(nrow(train_fraud_label_df), big.mark=","), "rows\n")
## Fraud labels: 8,914,963 rows
if("credit_limit" %in% names(card_df)) {
print(summary(card_df$credit_limit))
}
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 7100 12600 14420 19244 151223
if("target" %in% names(train_fraud_label_df)) {
print(table(train_fraud_label_df$target))
cat("Fraud Rate:", round(sum(train_fraud_label_df$target == "YES") / nrow(train_fraud_label_df) * 100, 2))
}
##
## No Yes
## 8901631 13332
## Fraud Rate: 0
fraud_data <- merge(transaction_df, train_fraud_label_df,
by.x = "id", by.y = "transaction_id",
all.x = TRUE)
cat("After merge fraud and transaction data:", nrow(fraud_data), "rows")
## After merge fraud and transaction data: 13305915 rows
fraud_data <- fraud_data[!is.na(target)]
cat(" After removing unlabeled data:", format(nrow(fraud_data), big.mark=","))
## After removing unlabeled data: 8,914,963
fraud_data <- merge(fraud_data, mcc_codes_df,
by.x = "mcc", by.y = "mcc_code",
all.x = TRUE)
fraud_data <- merge(fraud_data, card_df,
by.x = "card_id", by.y = "id",
all.x = TRUE,
suffixes = c("", "_card"))
fraud_data <- merge(fraud_data, user_df,
by.x = "client_id", by.y = "id",
all.x = TRUE,
suffixes = c("", "_user"))
# Remove client_id_card as it is same as client_id column
fraud_data[, client_id_card := NULL]
# Define column order: Transaction -> MCC -> Card -> User -> Target
transaction_cols <- c("id", "date", "client_id", "card_id", "amount", "mcc",
"use_chip", "merchant_id", "merchant_city", "merchant_state", "zip",
"is_refund", "has_error", "error_bad_expiration", "error_bad_card_number",
"error_insufficient_balance", "error_bad_pin","error_bad_cvv",
"error_bad_zipcode", "error_technical_glitch", "error_count")
mcc_cols <- c("description")
card_cols <- c("card_brand", "card_type", "card_number", "expires", "cvv", "has_chip",
"num_cards_issued", "credit_limit", "acct_open_date", "year_pin_last_changed")
user_cols <- c("current_age", "retirement_age", "birth_year", "birth_month", "gender",
"address", "latitude", "longitude", "per_capita_income", "yearly_income",
"total_debt", "credit_score", "num_credit_cards", "debt_to_income_ratio")
target_col <- c("target")
# Combine in desired order, keeping only columns that exist
all_cols <- c(transaction_cols, mcc_cols, card_cols, user_cols, target_col)
existing_cols <- all_cols[all_cols %in% names(fraud_data)]
# Reorder columns
setcolorder(fraud_data, existing_cols)
cat("Total rows:", format(nrow(fraud_data), big.mark=","))
## Total rows: 8,914,963
cat("Total columns:", ncol(fraud_data))
## Total columns: 48
cat("Columns:", paste(names(fraud_data)[1:10], collapse=", "))
## Columns: id, date, client_id, card_id, amount, mcc, use_chip, merchant_id, merchant_city, merchant_state
head(fraud_data)
## Key: <client_id>
## id date client_id card_id amount mcc use_chip
## <int> <Date> <int> <int> <num> <int> <char>
## 1: 11873816 2012-10-31 0 1271 62.99 1711 Swipe Transaction
## 2: 14690298 2014-07-15 0 1271 64.96 1711 Swipe Transaction
## 3: 16182985 2015-06-03 0 1271 309.57 3000 Chip Transaction
## 4: 22742665 2019-03-30 0 1271 535.73 3000 Chip Transaction
## 5: 18200515 2016-08-08 0 1271 568.10 3001 Chip Transaction
## 6: 12614098 2013-04-15 0 1271 564.05 3132 Swipe Transaction
## merchant_id merchant_city merchant_state zip is_refund has_error
## <int> <char> <char> <num> <num> <num>
## 1: 11582 Scarborough ME 4074 0 0
## 2: 11582 Scarborough ME 4074 0 0
## 3: 60152 Stratford CT 6615 0 0
## 4: 60152 Stratford CT 6615 0 0
## 5: 5594 Norwich CT 6360 0 0
## 6: 57386 Glastonbury CT 6033 0 0
## error_bad_expiration error_bad_card_number error_insufficient_balance
## <num> <num> <num>
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 0 0 0
## 5: 0 0 0
## 6: 0 0 0
## error_bad_pin error_bad_cvv error_bad_zipcode error_technical_glitch
## <num> <num> <num> <num>
## 1: 0 0 0 0
## 2: 0 0 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 0 0 0 0
## 6: 0 0 0 0
## error_count description card_brand
## <num> <char> <char>
## 1: 0 Heating, Plumbing, Air Conditioning Contractors Mastercard
## 2: 0 Heating, Plumbing, Air Conditioning Contractors Mastercard
## 3: 0 Steelworks Mastercard
## 4: 0 Steelworks Mastercard
## 5: 0 Steel Products Manufacturing Mastercard
## 6: 0 Leather Goods Mastercard
## card_type card_number expires cvv has_chip num_cards_issued
## <char> <i64> <Date> <int> <num> <int>
## 1: Debit 5050211780967429 2021-04-01 316 1 2
## 2: Debit 5050211780967429 2021-04-01 316 1 2
## 3: Debit 5050211780967429 2021-04-01 316 1 2
## 4: Debit 5050211780967429 2021-04-01 316 1 2
## 5: Debit 5050211780967429 2021-04-01 316 1 2
## 6: Debit 5050211780967429 2021-04-01 316 1 2
## credit_limit acct_open_date year_pin_last_changed current_age retirement_age
## <num> <Date> <int> <int> <int>
## 1: 31490 2011-02-01 2011 33 69
## 2: 31490 2011-02-01 2011 33 69
## 3: 31490 2011-02-01 2011 33 69
## 4: 31490 2011-02-01 2011 33 69
## 5: 31490 2011-02-01 2011 33 69
## 6: 31490 2011-02-01 2011 33 69
## birth_year birth_month gender address latitude longitude
## <int> <int> <char> <char> <num> <num>
## 1: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 2: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 3: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 4: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 5: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 6: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## per_capita_income yearly_income total_debt credit_score num_credit_cards
## <num> <num> <num> <int> <int>
## 1: 29237 59613 36199 763 4
## 2: 29237 59613 36199 763 4
## 3: 29237 59613 36199 763 4
## 4: 29237 59613 36199 763 4
## 5: 29237 59613 36199 763 4
## 6: 29237 59613 36199 763 4
## debt_to_income_ratio target zip_missing
## <num> <fctr> <num>
## 1: 0.6072333 No 0
## 2: 0.6072333 No 0
## 3: 0.6072333 No 0
## 4: 0.6072333 No 0
## 5: 0.6072333 No 0
## 6: 0.6072333 No 0
if("target" %in% names(fraud_data)) {
print(table(fraud_data$target, useNA = "ifany"))
}
##
## No Yes
## 8901631 13332
# save as RDS (faster to load for next stage)
saveRDS(fraud_data, "CleanedDataSet/fraud_detection_data.rds")
credit_data <- merge(card_df, user_df,
by.x = "client_id", by.y = "id",
all.x = TRUE,
suffixes = c("_card", "_user"))
transaction_features <- transaction_df[, .(
total_transactions = .N,
avg_transaction_amount = mean(amount, na.rm = TRUE),
max_transaction_amount = max(amount, na.rm = TRUE),
min_transaction_amount = min(amount, na.rm = TRUE),
total_spent = sum(amount[amount > 0], na.rm = TRUE),
total_refunded = sum(abs(amount[amount < 0]), na.rm = TRUE),
num_refunds = sum(is_refund, na.rm = TRUE),
transaction_frequency = .N,
avg_errors = mean(has_error, na.rm = TRUE),
total_errors = sum(has_error, na.rm = TRUE)
), by = card_id]
credit_data <- merge(credit_data, transaction_features,
by.x = "id", by.y = "card_id",
all.x = TRUE)
transaction_feature_cols <- c("total_transactions", "avg_transaction_amount",
"max_transaction_amount", "min_transaction_amount",
"total_spent", "total_refunded", "num_refunds",
"transaction_frequency", "avg_errors", "total_errors")
for(col in transaction_feature_cols) {
if(col %in% names(credit_data)) {
credit_data[is.na(get(col)), (col) := 0]
}
}
card_info_cols <- c("id", "client_id", "card_brand", "card_type", "card_number",
"expires", "cvv", "has_chip", "num_cards_issued",
"acct_open_date", "year_pin_last_changed")
user_info_cols <- c("current_age", "retirement_age", "birth_year", "birth_month",
"gender", "address", "latitude", "longitude",
"per_capita_income", "yearly_income", "total_debt",
"credit_score", "num_credit_cards", "debt_to_income_ratio")
target_col <- c("credit_limit")
# Combine in order, keeping only existing columns
all_cols_credit <- c(card_info_cols, user_info_cols, transaction_feature_cols, target_col)
existing_cols_credit <- all_cols_credit[all_cols_credit %in% names(credit_data)]
setcolorder(credit_data, existing_cols_credit)
cat("Total rows:", format(nrow(credit_data), big.mark=","))
## Total rows: 6,115
cat("Total columns:", ncol(credit_data))
## Total columns: 36
cat("Columns:", paste(names(credit_data)[1:10], collapse=", "))
## Columns: id, client_id, card_brand, card_type, card_number, expires, cvv, has_chip, num_cards_issued, acct_open_date
if("credit_limit" %in% names(credit_data)) {
cat("\nCredit Limit Summary:\n")
print(summary(credit_data$credit_limit))
}
##
## Credit Limit Summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 7100 12600 14420 19244 151223
credit_data <- credit_data[!is.na(credit_limit)]
saveRDS(credit_data, "CleanedDataSet/credit_limit_data.rds")
fwrite(credit_data, "CleanedDataSet/credit_limit_data.csv")