library(data.table)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(corrplot)
## corrplot 0.95 loaded
library(leaflet)
library(leaflet.extras)
# Read the credit-limit dataset from CSV with data.table::fread() and
# preview its first rows.
full_df <- fread("CleanedDataSet/credit_limit_data.csv")
head(full_df)
## id client_id card_brand card_type card_number expires cvv
## <int> <int> <char> <char> <i64> <IDat> <int>
## 1: 0 1362 Amex Credit 393314135668401 2024-04-01 866
## 2: 1 550 Mastercard Credit 5278231764792292 2024-06-01 396
## 3: 2 556 Mastercard Debit 5889825928297675 2021-09-01 422
## 4: 3 1937 Visa Credit 4289888672554714 2020-04-01 736
## 5: 4 1981 Mastercard Debit 5433366978583845 2024-03-01 530
## 6: 5 619 Visa Debit 4657824650820465 2024-04-01 245
## has_chip num_cards_issued acct_open_date year_pin_last_changed current_age
## <int> <int> <IDat> <int> <int>
## 1: 1 2 1991-01-01 2014 58
## 2: 1 1 1994-01-01 2013 76
## 3: 1 1 1995-01-01 2011 46
## 4: 1 2 1995-01-01 2015 65
## 5: 1 2 1997-01-01 2007 48
## 6: 1 2 1997-01-01 2012 54
## retirement_age birth_year birth_month gender address
## <int> <int> <int> <char> <char>
## 1: 67 1962 1 Male 3385 Hill Lane
## 2: 70 1944 2 Male 4937 Maple Lane
## 3: 66 1973 12 Male 5659 Park Avenue
## 4: 62 1955 2 Female 230 Plum Avenue
## 5: 65 1972 1 Male 8975 Littlewood Boulevard
## 6: 65 1965 12 Male 498 Littlewood Avenue
## latitude longitude per_capita_income yearly_income total_debt credit_score
## <num> <num> <int> <int> <int> <int>
## 1: 38.78 -77.27 35563 72510 44317 727
## 2: 47.54 -122.58 21219 30248 35766 763
## 3: 41.01 -81.60 17856 36405 31815 715
## 4: 38.78 -90.70 25350 17056 29112 667
## 5: 26.63 -81.99 19274 39303 23650 702
## 6: 44.01 -92.47 26478 53986 58381 748
## num_credit_cards debt_to_income_ratio total_transactions
## <int> <num> <int>
## 1: 4 0.6111847 3402
## 2: 4 1.1824253 2841
## 3: 2 0.8739184 8028
## 4: 5 1.7068480 2870
## 5: 7 0.6017352 890
## 6: 4 1.0814100 2058
## avg_transaction_amount max_transaction_amount min_transaction_amount
## <num> <num> <num>
## 1: 70.81277 1340.97 -497
## 2: 54.81875 1099.49 -500
## 3: 28.21148 1430.74 -499
## 4: 86.89563 1165.37 -469
## 5: 36.38093 903.24 -374
## 6: 43.81727 626.73 -491
## total_spent total_refunded num_refunds transaction_frequency avg_errors
## <num> <num> <int> <int> <num>
## 1: 278052.05 37147 332 3402 0.01704879
## 2: 163668.06 7928 58 2841 0.01548750
## 3: 258937.80 32456 329 8028 0.01258097
## 4: 266504.47 17114 162 2870 0.02055749
## 5: 41365.03 8986 104 890 0.03146067
## 6: 104039.94 13864 139 2058 0.01311953
## total_errors credit_limit
## <int> <int>
## 1: 58 33900
## 2: 44 11600
## 3: 101 19948
## 4: 59 16400
## 5: 28 19439
## 6: 27 21883
# Restore the fraud-detection dataset from its serialized .rds file and
# preview its first rows.
fraud_df <- readRDS("CleanedDataSet/fraud_detection_data.rds")
head(fraud_df)
## Key: <client_id>
## id date client_id card_id amount mcc use_chip
## <int> <Date> <int> <int> <num> <int> <char>
## 1: 11873816 2012-10-31 0 1271 62.99 1711 Swipe Transaction
## 2: 14690298 2014-07-15 0 1271 64.96 1711 Swipe Transaction
## 3: 16182985 2015-06-03 0 1271 309.57 3000 Chip Transaction
## 4: 22742665 2019-03-30 0 1271 535.73 3000 Chip Transaction
## 5: 18200515 2016-08-08 0 1271 568.10 3001 Chip Transaction
## 6: 12614098 2013-04-15 0 1271 564.05 3132 Swipe Transaction
## merchant_id merchant_city merchant_state zip is_refund has_error
## <int> <char> <char> <num> <num> <num>
## 1: 11582 Scarborough ME 4074 0 0
## 2: 11582 Scarborough ME 4074 0 0
## 3: 60152 Stratford CT 6615 0 0
## 4: 60152 Stratford CT 6615 0 0
## 5: 5594 Norwich CT 6360 0 0
## 6: 57386 Glastonbury CT 6033 0 0
## error_bad_expiration error_bad_card_number error_insufficient_balance
## <num> <num> <num>
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 0 0 0
## 5: 0 0 0
## 6: 0 0 0
## error_bad_pin error_bad_cvv error_bad_zipcode error_technical_glitch
## <num> <num> <num> <num>
## 1: 0 0 0 0
## 2: 0 0 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 0 0 0 0
## 6: 0 0 0 0
## error_count description card_brand
## <num> <char> <char>
## 1: 0 Heating, Plumbing, Air Conditioning Contractors Mastercard
## 2: 0 Heating, Plumbing, Air Conditioning Contractors Mastercard
## 3: 0 Steelworks Mastercard
## 4: 0 Steelworks Mastercard
## 5: 0 Steel Products Manufacturing Mastercard
## 6: 0 Leather Goods Mastercard
## card_type card_number expires cvv has_chip num_cards_issued
## <char> <i64> <Date> <int> <num> <int>
## 1: Debit 5050211780967429 2021-04-01 316 1 2
## 2: Debit 5050211780967429 2021-04-01 316 1 2
## 3: Debit 5050211780967429 2021-04-01 316 1 2
## 4: Debit 5050211780967429 2021-04-01 316 1 2
## 5: Debit 5050211780967429 2021-04-01 316 1 2
## 6: Debit 5050211780967429 2021-04-01 316 1 2
## credit_limit acct_open_date year_pin_last_changed current_age retirement_age
## <num> <Date> <int> <int> <int>
## 1: 31490 2011-02-01 2011 33 69
## 2: 31490 2011-02-01 2011 33 69
## 3: 31490 2011-02-01 2011 33 69
## 4: 31490 2011-02-01 2011 33 69
## 5: 31490 2011-02-01 2011 33 69
## 6: 31490 2011-02-01 2011 33 69
## birth_year birth_month gender address latitude longitude
## <int> <int> <char> <char> <num> <num>
## 1: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 2: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 3: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 4: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 5: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## 6: 1986 3 Male 858 Plum Avenue 43.59 -70.33
## per_capita_income yearly_income total_debt credit_score num_credit_cards
## <num> <num> <num> <int> <int>
## 1: 29237 59613 36199 763 4
## 2: 29237 59613 36199 763 4
## 3: 29237 59613 36199 763 4
## 4: 29237 59613 36199 763 4
## 5: 29237 59613 36199 763 4
## 6: 29237 59613 36199 763 4
## debt_to_income_ratio target zip_missing
## <num> <fctr> <num>
## 1: 0.6072333 No 0
## 2: 0.6072333 No 0
## 3: 0.6072333 No 0
## 4: 0.6072333 No 0
## 5: 0.6072333 No 0
## 6: 0.6072333 No 0
Based on the graph below, the credit limit distribution is right-skewed (positively skewed): the majority of clients have relatively low credit limits, while a few clients have exceptionally high ones. The red vertical line marks the mean credit limit, providing a reference point for where most credit limits fall relative to it.
# Histogram of credit limits with a vertical reference line at the mean.
# The mean is computed once and passed as a constant: mapping it inside
# aes() makes ggplot2 evaluate it per row and draw one identical line for
# every observation in full_df.
mean_credit_limit <- mean(full_df$credit_limit, na.rm = TRUE)

ggplot(full_df, aes(x = credit_limit)) +
  geom_histogram(
    bins = 30,
    aes(fill = after_stat(count)), # shade each bar by its own bin count
    color = "white"
  ) +
  scale_fill_gradient(low = "#AED6F1", high = "#1F618D") +
  geom_vline(
    xintercept = mean_credit_limit,
    color = "#E15759",
    linewidth = 1.2
  ) +
  labs(
    title = "Distribution of Credit Limit",
    x = "Credit Limit",
    y = "Count",
    fill = "Frequency"
  ) +
  theme_minimal(base_size = 13)
### Fraud Distribution

This pie chart represents the distribution of fraud cases in this dataset. Non-fraud cases take up 99.9% of the chart, indicating that the dataset is highly imbalanced, which is typical for fraud detection problems, where fraud cases are much rarer than non-fraud cases. This imbalance will require special handling when training machine learning models to ensure the model learns to identify the minority class (fraud) effectively.
# Pie chart of fraud vs. non-fraud transaction counts.
#
# `target` holds "Yes"/"No". Display labels are built in a separate column
# instead of rewriting fraud_df$target, because the chunks that follow filter
# on target == "Yes" and key their colour scales on "Yes"/"No".
# The column is still converted to plain character, which is the only effect
# the case-sensitive gsub("yes", ...) / gsub("no", ...) calls used to have.
fraud_df$target <- as.character(fraud_df$target)

fraud_counts <- fraud_df %>%
  group_by(target) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(label = ifelse(target == "Yes", "fraud", "not fraud"))

# Tie each slice colour to its label so fraud is always red, regardless of
# the row order produced by the grouping.
slice_colours <- ifelse(fraud_counts$label == "fraud", "red", "green")

pie_chart <- plot_ly(
  fraud_counts,
  labels = ~label,
  values = ~count,
  type = "pie",
  textinfo = "label+percent",
  title = "Number of Fraud Cases",
  marker = list(colors = slice_colours)
) %>%
  layout(margin = list(t = 0, b = 0, l = 0, r = 0))
pie_chart
Based on our current data, most clients' cards are Mastercard, followed by Visa, Amex, and Discover.
# Dodged bar chart: number of transactions per card brand, split by the
# fraud flag in `target` ("Yes" drawn in red, "No" in green).
ggplot(data = fraud_df, mapping = aes(x = card_brand, fill = target)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Yes" = "red", "No" = "green")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Distribution of Card Brand by Target (Fraud vs Non-Fraud)") +
  xlab("Card Brand") +
  ylab("Count")
### Count of Credit Limit and Fraud Cases Based on Card Type

This graph provides a side-by-side comparison of fraud cases and total credit limit by card type. This comparison suggests that Debit cards are more prone to fraud and hold a larger financial exposure due to higher credit limits.
# Keep only the rows flagged as fraud.
fraud_data <- filter(fraud_df, target == "Yes")

# Order the card types from most to least frequent among the fraud rows so
# the bars appear in descending order of count.
card_type_counts <- sort(table(fraud_data$card_type), decreasing = TRUE)
fraud_data$card_type <- factor(
  fraud_data$card_type,
  levels = names(card_type_counts)
)

# Pieces shared by both card-type charts.
card_type_colours <- c("Credit" = "green", "Debit" = "red")
tilted_x_labels <- theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Number of fraud rows per card type.
count_plot <- ggplot(
  data = fraud_data,
  mapping = aes(x = card_type, fill = card_type)
) +
  geom_bar() +
  scale_fill_manual(values = card_type_colours) +
  ggtitle("Distribution of Card Type for Fraud Cases") +
  xlab("Card Type") +
  ylab("Count of Fraud Cases") +
  tilted_x_labels

# Sum of credit_limit over the fraud rows of each card type.
credit_limit_plot <- ggplot(
  data = fraud_data,
  mapping = aes(x = card_type, y = credit_limit, fill = card_type)
) +
  geom_bar(stat = "summary", fun = "sum") +
  scale_fill_manual(values = card_type_colours) +
  ggtitle("Total Credit Limit by Card Type for Fraud Cases") +
  xlab("Card Type") +
  ylab("Total Credit Limit") +
  tilted_x_labels

# Convert both static charts to interactive plotly objects.
count_plotly <- ggplotly(count_plot)
credit_limit_plotly <- ggplotly(credit_limit_plot)
## Warning: Removed 58 rows containing non-finite outside the scale range
## (`stat_summary()`).
# Place the two card-type charts side by side in one plotly figure and
# label the axes; the second y-axis is drawn on the right-hand side.
fig <- layout(
  subplot(count_plotly, credit_limit_plotly, nrows = 1, shareX = TRUE),
  title = "Fraud Cases and Credit Limit by Card Type",
  showlegend = FALSE,
  xaxis = list(title = "Card Type"),
  yaxis = list(title = "Count of Fraud Cases"),
  yaxis2 = list(
    title = "Total Credit Limit",
    overlaying = "y",
    side = "right"
  )
)
fig
This graph presents two visualizations side by side to compare fraud cases and total credit limits by gender. Both the number of fraud cases and the total credit limit appear to be higher for females than for males, suggesting that the overall financial exposure may be greater for females. This could point to gender-based differences in credit access or borrowing behavior.
# Sum of credit_limit per gender, taken from the credit-limit dataset.
total_credit_limit_by_gender <- summarise(
  group_by(full_df, gender),
  total_credit_limit = sum(credit_limit, na.rm = TRUE)
)

# Number of fraud-flagged rows per gender, taken from the fraud dataset.
fraud_cases_by_gender <- summarise(
  group_by(filter(fraud_df, target == "Yes"), gender),
  fraud_cases = n()
)

# Fill colours shared by both gender charts (assigned in level order).
gender_fill_colours <- c("lightblue", "lightpink")

credit_limit_plot <- ggplot(
  data = total_credit_limit_by_gender,
  mapping = aes(x = gender, y = total_credit_limit, fill = gender)
) +
  geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
  scale_fill_manual(values = gender_fill_colours) +
  ggtitle("Total Credit Limit by Gender") +
  xlab("Gender") +
  ylab("Total Credit Limit") +
  theme_minimal()

fraud_cases_plot <- ggplot(
  data = fraud_cases_by_gender,
  mapping = aes(x = gender, y = fraud_cases, fill = gender)
) +
  geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
  scale_fill_manual(values = gender_fill_colours) +
  ggtitle("Number of Fraud Cases by Gender") +
  xlab("Gender") +
  ylab("Number of Fraud Cases") +
  theme_minimal()

# Convert both static charts to interactive plotly objects.
credit_limit_plotly <- ggplotly(credit_limit_plot)
fraud_cases_plotly <- ggplotly(fraud_cases_plot)
# Place the two gender charts side by side in one plotly figure and label
# the axes; the second y-axis is drawn on the right-hand side.
fig <- layout(
  subplot(fraud_cases_plotly, credit_limit_plotly, nrows = 1, shareX = TRUE),
  title = "Fraud Cases and Credit Limit by Gender",
  showlegend = FALSE,
  xaxis = list(title = "Gender"),
  yaxis = list(title = "Count of Fraud Cases"),
  yaxis2 = list(
    title = "Total Credit Limit",
    overlaying = "y",
    side = "right"
  )
)
fig
This graph displays the monthly distribution of fraud cases. The highest number of fraud cases occurs in August, with more than 1300 cases. June has the lowest number of fraud cases, followed by January and February.
# Monthly fraud counts, drawn as a line with points.
# Adds a `month` column (full English month name) to fraud_df.
fraud_df$date <- as.Date(fraud_df$date)
fraud_df$month <- month.name[month(fraud_df$date)]

# Count fraud-flagged rows per calendar month.
fraud_cases <- fraud_df %>%
  filter(target == "Yes") %>%
  group_by(month) %>%
  summarise(count = n(), .groups = "drop")

# Ensure the month column is ordered from January to December
fraud_cases$month <- factor(fraud_cases$month, levels = month.name)

# `group = 1` is required because x is a factor: without it every month is
# its own single-point group and geom_line() has nothing to connect.
# `linewidth` replaces the `size` argument, which is deprecated for lines.
ggplot(fraud_cases, aes(x = month, y = count, group = 1)) +
  geom_line(linewidth = 1, color = "red") + # Red line for fraud cases
  geom_point(size = 2, color = "red") + # Red points for fraud cases
  labs(
    title = "Fraud Cases by Month",
    x = "Month",
    y = "Number of Fraud Cases"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
This scatter plot shows the relationship between Credit Score and Credit Limit and suggests only a weak correlation between the two. As Credit Score increases, the Credit Limit does not show a clear upward trend, and many of the data points are clustered at the lower end of the credit limit scale. A few data points stand out at the higher end of the Credit Limit scale; these could represent cases with unusually high credit limits.
# Scatter plot of credit limit against credit score, with a straight
# least-squares trend line (no confidence band) drawn on top in red.
ggplot(data = full_df, mapping = aes(x = credit_score, y = credit_limit)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Credit Score vs Credit Limit") +
  xlab("Credit Score") +
  ylab("Credit Limit") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
### Scatter plot of credit limit against total_spent

This scatter plot shows the relationship between total spent and credit limit. There is a slight positive correlation, indicated by the upward-sloping blue line. As individuals spend more, their credit limits tend to increase, although the relationship is weak and non-linear, especially at higher values. A few outliers at the higher end were spotted.
# Scatter plot of credit limit against total spent, with a linear trend
# line fitted by geom_smooth().
ggplot(
  data = full_df,
  mapping = aes(x = total_spent, y = credit_limit)
) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
This heatmap visualizes the distribution of credit limits across the United States, where they are concentrated in the eastern U.S. and on the West Coast. Meanwhile, areas in the central U.S. and northern regions show a blue tint, suggesting lower concentrations of credit limits in those regions. This heatmap reflects regional economic patterns, showing that credit limits are generally more concentrated in urban and economically active areas, especially along the East and West coasts.
# Leaflet heatmap of client locations, weighted by credit limit.
heatmap_data <- full_df %>%
  select(latitude, longitude, credit_limit)

# Centre the view on the points actually drawn (full_df, not fraud_df) and
# ignore missing coordinates so the centre can never become NA.
map_centre_lng <- mean(heatmap_data$longitude, na.rm = TRUE)
map_centre_lat <- mean(heatmap_data$latitude, na.rm = TRUE)

map <- leaflet(heatmap_data) %>%
  # OpenStreetMap tiles
  addTiles() %>%
  addHeatmap(
    lng = ~longitude,
    lat = ~latitude,
    intensity = ~credit_limit,
    blur = 20,
    max = 0.05,
    radius = 15
  ) %>%
  # Country-level zoom: the data spans the whole United States, so a
  # city-level zoom around the mean coordinate would hide almost all of it.
  setView(lng = map_centre_lng, lat = map_centre_lat, zoom = 4)
map
This box plot shows the distribution of credit limits by card brand. It reveals significant variability in credit limits across the different card brands, with Visa and Mastercard showing higher credit limits, and Amex and Discover having more concentrated and lower values.
# Box plot of credit limit for each card brand, one coloured box per brand.
ggplot(
  data = full_df,
  mapping = aes(x = card_brand, y = credit_limit, fill = card_brand)
) +
  geom_boxplot() +
  ggtitle("Credit Limit by Card Brand") +
  xlab("Card Brand") +
  ylab("Credit Limit")
This box plot shows the distribution of credit limits by card brand for fraud cases only, with Visa having a more considerable spread and a greater number of outliers at the upper end. The outliers are more prevalent in Visa and Mastercard, which suggests that these card brands tend to have higher variability in the credit limits associated with fraud cases. The box plots indicate that fraud cases associated with Visa and Mastercard generally have higher credit limits, while Amex and Discover are associated with lower credit limits for fraud cases.
# Restrict the fraud dataset to the rows flagged as fraud.
fraud_cases <- filter(fraud_df, target == "Yes")

# Box plot of credit limit per card brand for those rows only; the four
# fill colours are assigned to the brands in level order.
ggplot(
  data = fraud_cases,
  mapping = aes(x = card_brand, y = credit_limit, fill = card_brand)
) +
  geom_boxplot() +
  scale_fill_manual(values = c("red", "green", "blue", "yellow")) +
  ggtitle("Credit Limit by Card Brand (Fraud Cases Only)") +
  xlab("Card Brand") +
  ylab("Credit Limit") +
  theme_minimal()
## Warning: Removed 58 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
This correlation plot visualizes the relationship between different numerical variables in the dataset and how they correlate with the credit limit. The plot reveals that per_capita_income and yearly_income are the strongest factors influencing credit limit. Higher income typically signals greater financial stability and the ability to repay debt, and it is often used as an indicator of repayment capability, so individuals with higher incomes are more likely to be granted higher credit limits, as they are perceived as lower-risk borrowers.
# Keep every numeric column of full_df except the card number, which is an
# identifier rather than a measurement.
cor_data <- select(full_df, where(is.numeric), -card_number)

# Pairwise correlations, computed on rows with no missing values.
cor_matrix <- cor(cor_data, use = "complete.obs")

# Upper-triangle circle plot with small, vertical axis labels.
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "black",
  tl.srt = 90,
  tl.cex = 0.7
)
### Correlation plot of target (Fraud Cases)

This correlation plot visualizes the relationship between various numerical variables in the dataset, with a particular focus on how they relate to fraud cases. The target variable appears to be negatively correlated with indicators such as credit limit, credit score, and debt-to-income ratio. This suggests that the outcome represented by the target variable could be influenced by a combination of financial and error-related factors, which may reflect behavioral or risk-based characteristics.
# Encode the fraud flag numerically (1 for "Yes", 0 otherwise) so it can
# take part in the correlation matrix.
fraud_df$new_target <- as.numeric(fraud_df$target == "Yes")

# Keep every numeric column except the card number.
cor_data <- select(fraud_df, where(is.numeric), -card_number)

# Pairwise correlations, computed on rows with no missing values.
cor_matrix <- cor(cor_data, use = "complete.obs")

# Upper-triangle circle plot with small, vertical axis labels.
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "black",
  tl.srt = 90,
  tl.cex = 0.7
)