Project Credit card fraud detection

library(scorecard)

## Warning: package 'scorecard' was built under R version 4.3.3

library(tidyverse) # metapackage of all tidyverse packages

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ tidyr::replace_na() masks scorecard::replace_na()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(GGally)

## Warning: package 'GGally' was built under R version 4.3.3

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ROCR)

## Warning: package 'ROCR' was built under R version 4.3.3

library(dplyr)

# Read the transaction file into a base data frame.
df <- read.csv("fraud test.csv")

# Monthly bucket for each transaction. ceiling_date() rounds the parsed
# timestamp UP to a month boundary, so a June 2020 transaction is labelled
# 2020-07-01 (the label is the first day of the following month).
df$date <- df$trans_date_trans_time %>%
  dmy_hm() %>%
  ceiling_date("month") %>%
  as.Date()

# Row count and fraud rate (in percent) per monthly bucket.
df %>%
  group_by(date) %>%
  summarize(n(), fraud_Rate = 100 * mean(is_fraud))

## # A tibble: 7 × 3
##   date        `n()` fraud_Rate
##   <date>      <int>      <dbl>
## 1 2020-07-01  30059      0.442
## 2 2020-08-01  85850      0.374
## 3 2020-09-01  88756      0.468
## 4 2020-10-01  69535      0.489
## 5 2020-11-01  69348      0.554
## 6 2020-12-01  72634      0.405
## 7 2021-01-01 139537      0.185

# Class balance: number of rows for each value of is_fraud.
df %>%
  group_by(is_fraud) %>%
  summarize(n())

## # A tibble: 2 × 2
##   is_fraud  `n()`
##      <int>  <int>
## 1        0 553574
## 2        1   2145

The class counts show 553,574 non-fraudulent transactions and 2,145 fraudulent transactions (about 0.39% fraud), so the data set is highly imbalanced.

# Preview the first rows of df, including the derived `date` column.
head(df)

##   X trans_date_trans_time      cc_num                             merchant
## 1 0      21/06/2020 12:14 2.29116e+15                fraud_Kirlin and Sons
## 2 1      21/06/2020 12:14 3.57303e+15                 fraud_Sporer-Keebler
## 3 2      21/06/2020 12:14 3.59822e+15 fraud_Swaniawski, Nitzsche and Welch
## 4 3      21/06/2020 12:15 3.59192e+15                    fraud_Haley Group
## 5 4      21/06/2020 12:15 3.52683e+15                fraud_Johnston-Casper
## 6 5      21/06/2020 12:15 3.04077e+13                  fraud_Daugherty LLC
##         category   amt    first     last gender                      street
## 1  personal_care  2.86     Jeff  Elliott      M           351 Darlene Green
## 2  personal_care 29.84   Joanne Williams      F            3638 Marsh Union
## 3 health_fitness 41.28   Ashley    Lopez      F        9333 Valentine Point
## 4       misc_pos 60.05    Brian Williams      M 32941 Krystal Mill Apt. 552
## 5         travel  3.19   Nathan   Massey      M    5783 Evan Roads Apt. 465
## 6      kids_pets 19.55 Danielle    Evans      F  76752 David Lodge Apt. 064
##         city state   zip     lat      long city_pop                    job
## 1   Columbia    SC 29209 33.9659  -80.9355   333497    Mechanical engineer
## 2    Altonah    UT 84002 40.3207 -110.4360      302 Sales professional, IT
## 3   Bellmore    NY 11710 40.6729  -73.5365    34496      Librarian, public
## 4 Titusville    FL 32780 28.5697  -80.8191    54767           Set designer
## 5   Falmouth    MI 49632 44.2529  -85.0170     1126     Furniture designer
## 6  Breesport    NY 14816 42.1939  -76.7361      520        Psychotherapist
##          dob                        trans_num  unix_time merch_lat merch_long
## 1 19/03/1968 2da90c7d74bd46a0caf3777415b3ebd3 1371816865  33.98639  -81.20071
## 2 17/01/1990 324cc204407e99f51b0d6ca0055005e7 1371816873  39.45050 -109.96043
## 3 21/10/1970 c81755dbbbea9d5c77f094348a7579be 1371816893  40.49581  -74.19611
## 4 25/07/1987 2159175b9efe66dc301f149d3d5abf8c 1371816915  28.81240  -80.88306
## 5 06/07/1955 57ff021bd3f328f8738bb535c302a31b 1371816917  44.95915  -85.88473
## 6 13/10/1991 798db04aaceb4febd084f1a7c404da93 1371816937  41.74716  -77.58420
##   is_fraud       date
## 1        0 2020-07-01
## 2        0 2020-07-01
## 3        0 2020-07-01
## 4        0 2020-07-01
## 5        0 2020-07-01
## 6        0 2020-07-01

# Inspect the structure of df: column names, storage types and sample values.
str(df)

## 'data.frame':    555719 obs. of  24 variables:
##  $ X                    : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ trans_date_trans_time: chr  "21/06/2020 12:14" "21/06/2020 12:14" "21/06/2020 12:14" "21/06/2020 12:15" ...
##  $ cc_num               : num  2.29e+15 3.57e+15 3.60e+15 3.59e+15 3.53e+15 ...
##  $ merchant             : chr  "fraud_Kirlin and Sons" "fraud_Sporer-Keebler" "fraud_Swaniawski, Nitzsche and Welch" "fraud_Haley Group" ...
##  $ category             : chr  "personal_care" "personal_care" "health_fitness" "misc_pos" ...
##  $ amt                  : num  2.86 29.84 41.28 60.05 3.19 ...
##  $ first                : chr  "Jeff" "Joanne" "Ashley" "Brian" ...
##  $ last                 : chr  "Elliott" "Williams" "Lopez" "Williams" ...
##  $ gender               : chr  "M" "F" "F" "M" ...
##  $ street               : chr  "351 Darlene Green" "3638 Marsh Union" "9333 Valentine Point" "32941 Krystal Mill Apt. 552" ...
##  $ city                 : chr  "Columbia" "Altonah" "Bellmore" "Titusville" ...
##  $ state                : chr  "SC" "UT" "NY" "FL" ...
##  $ zip                  : int  29209 84002 11710 32780 49632 14816 95528 57374 16858 76678 ...
##  $ lat                  : num  34 40.3 40.7 28.6 44.3 ...
##  $ long                 : num  -80.9 -110.4 -73.5 -80.8 -85 ...
##  $ city_pop             : int  333497 302 34496 54767 1126 520 1139 343 3688 263 ...
##  $ job                  : chr  "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
##  $ dob                  : chr  "19/03/1968" "17/01/1990" "21/10/1970" "25/07/1987" ...
##  $ trans_num            : chr  "2da90c7d74bd46a0caf3777415b3ebd3" "324cc204407e99f51b0d6ca0055005e7" "c81755dbbbea9d5c77f094348a7579be" "2159175b9efe66dc301f149d3d5abf8c" ...
##  $ unix_time            : int  1371816865 1371816873 1371816893 1371816915 1371816917 1371816937 1371816944 1371816950 1371816970 1371816971 ...
##  $ merch_lat            : num  34 39.5 40.5 28.8 45 ...
##  $ merch_long           : num  -81.2 -110 -74.2 -80.9 -85.9 ...
##  $ is_fraud             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ date                 : Date, format: "2020-07-01" "2020-07-01" ...

# Per spending category: row count, fraud rate in percent, and the largest
# and smallest transaction amount.
df %>%
  group_by(category) %>%
  summarize(
    n(),
    Fraud_Rate = 100 * mean(is_fraud),
    max(amt),
    min(amt)
  )

## # A tibble: 14 × 5
##    category       `n()` Fraud_Rate `max(amt)` `min(amt)`
##    <chr>          <int>      <dbl>      <dbl>      <dbl>
##  1 entertainment  40104      0.147       801.       1   
##  2 food_dining    39268      0.138       544.       1   
##  3 gas_transport  56370      0.273       147.       5.53
##  4 grocery_net    19426      0.211       178.       1.07
##  5 grocery_pos    52553      0.923       392       10.7 
##  6 health_fitness 36674      0.142       594.       1   
##  7 home           52345      0.128       507.       1   
##  8 kids_pets      48692      0.133       555.       1.01
##  9 misc_net       27367      0.976      2802.       1   
## 10 misc_pos       34574      0.208      4801.       1   
## 11 personal_care  39327      0.178       522.       1   
## 12 shopping_net   41779      1.21       6852.       1   
## 13 shopping_pos   49791      0.428      7321.       1   
## 14 travel         17449      0.229     22768.       1

# Distribution of transaction amounts for the non-fraud rows.
data <- filter(df, is_fraud == 0)

hist(
  data$amt,
  breaks = 100,
  col = "blue",
  main = "Histogram of CC charges for Non-Fraud Cases",
  xlab = "Values",
  ylab = "Frequency"
)

# Same plot restricted to the fraud rows (`data` is reused as scratch space).
data <- filter(df, is_fraud == 1)

hist(
  data$amt,
  breaks = 100,
  col = "blue",
  main = "Histogram of CC charges for Fraud Cases",
  xlab = "Values",
  ylab = "Frequency"
)

# Perform ANOVA: test whether the mean transaction amount differs between
# fraud and non-fraud rows. is_fraud is stored as an integer 0/1 column, so it
# enters the model as a single numeric term (one degree of freedom).
model <- aov(amt ~ is_fraud, data = df)

# Summary of ANOVA: prints the ANOVA table (Df, sums of squares, F value and
# p-value) for the is_fraud term and the residuals.
summary(model)

##                 Df    Sum Sq   Mean Sq F value Pr(>F)    
## is_fraud         1 4.536e+08 453590034   19096 <2e-16 ***
## Residuals   555717 1.320e+10     23753                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Feature engineering:

# Creation of previous fraud transaction flag
#
# previous_fraud is 1 when the transaction happened strictly after the
# customer's first recorded fraud, and 0 otherwise (including customers with
# no fraud at all). Customers are identified by "first last".
#
# trans_date_trans_time is text in "dd/mm/yyyy hh:mm" form. Taking min() of,
# or comparing, the raw strings orders them by day-of-month first (e.g.
# "01/07/2020" sorts before "21/06/2020"), which is not chronological. The
# timestamps are therefore parsed before any ordering is done.
df$full_name <- paste0(df$first, " ", df$last)
df$trans_time_parsed <- dmy_hm(df$trans_date_trans_time)

# Earliest fraud timestamp per customer (one row per customer with fraud).
min_date_of_fraud <- df %>%
  filter(is_fraud == 1) %>%
  group_by(full_name) %>%
  summarize(min_date_fraud = min(trans_time_parsed))

df <- left_join(df, min_date_of_fraud, by = "full_name")

# Customers without fraud have a missing min_date_fraud; treat them as 0.
df$previous_fraud <- as.numeric(
  !is.na(df$min_date_fraud) & df$trans_time_parsed > df$min_date_fraud
)

# Drop the temporary parsed-timestamp column.
df$trans_time_parsed <- NULL

# Creation of Number of transactions of customer
# Running count of each customer's rows in the order they appear in df
# (1 for the customer's first row, 2 for the second, ...).
# NOTE(review): this is only a "transactions so far" count if the rows are in
# chronological order -- confirm the file is time-sorted.
df <- ungroup(
  mutate(
    group_by(df, full_name),
    num_of_transactions = row_number()
  )
)

# Distribution of the running count among fraud rows.
data <- filter(df, is_fraud == 1)
hist(
  data$num_of_transactions,
  col = "blue",
  main = "Histogram of num of past transactions (only fraud cases)"
)

# Distribution of the running count among non-fraud rows.
data <- filter(df, is_fraud == 0)
hist(
  data$num_of_transactions,
  col = "blue",
  main = "Histogram of num of past transactions (only non-fraud cases)"
)

# Creation of age of customer
# Age is approximated as (transaction year - birth year); month and day are
# ignored. dob_year and year are kept as four-digit year strings.
df$dob_year <- format(as.Date(df$dob, format = "%d/%m/%Y"), "%Y")
df$year <- format(as.Date(dmy_hm(df$trans_date_trans_time)), "%Y")
df$age_of_user <- as.double(df$year) - as.double(df$dob_year)

# Age distribution among fraud rows.
data <- filter(df, is_fraud == 1)
hist(
  data$age_of_user,
  col = "red",
  main = "Histogram of age_of_user (only fraud cases)"
)

# Age distribution among non-fraud rows.
data <- filter(df, is_fraud == 0)
hist(
  data$age_of_user,
  col = "blue",
  main = "Histogram of age_of_user (only non-fraud cases)"
)

## Creation of time-based variables
#
# The "dd/mm/yyyy hh:mm" text is parsed once with lubridate::dmy_hm(), which
# returns UTC date-times, so the derived hour/day/month do not depend on the
# time zone of the R session.
#
# The weekend flag is computed from the numeric weekday (week_start = 1 makes
# Monday = 1 ... Saturday = 6, Sunday = 7). Matching the names returned by
# weekdays() against "Saturday"/"Sunday" only works in an English locale.
df <- df %>%
  mutate(
    trans_ts = dmy_hm(trans_date_trans_time),
    trans_hour = hour(trans_ts),
    trans_day = weekdays(trans_ts),  # display name; locale-dependent
    trans_month = month(trans_ts),
    trans_month_name = month.abb[trans_month],
    trans_weekend = ifelse(wday(trans_ts, week_start = 1) >= 6, 1, 0)
  ) %>%
  select(-trans_ts)  # temporary parsed timestamp is not kept

# Creation of numerical features
# Squared versions of the three numeric predictors.
df <- df %>%
  mutate(
    amt_sq = amt * amt,
    num_of_transactions_sq = num_of_transactions * num_of_transactions,
    age_of_user_sq = age_of_user * age_of_user
  )

## Train / test split
# NOTE: this is a random 80/20 hold-out split. It is NOT an out-of-time split:
# rows from every month can land in either set.

# Keep only the modelling columns.
df <- df %>%
  select(
    merchant, category, amt, gender, city, job, is_fraud, previous_fraud,
    num_of_transactions, age_of_user, trans_hour, amt_sq,
    num_of_transactions_sq, age_of_user_sq
  )

# Draw the training rows first so the scaling statistics below can be taken
# from the training rows only.
set.seed(42)  # Set seed for reproducibility
split_index <- sample(seq_len(nrow(df)), floor(0.8 * nrow(df)))

# Standardise the numeric predictors to z-scores. Mean and sd come from the
# training rows and are applied to all rows, so no information from the test
# rows leaks into the preprocessing.
numeric_features <- c(
  "amt", "num_of_transactions", "age_of_user",
  "amt_sq", "num_of_transactions_sq", "age_of_user_sq"
)
train_mean <- vapply(df[split_index, numeric_features], mean, numeric(1))
train_sd <- vapply(df[split_index, numeric_features], sd, numeric(1))
for (feature in numeric_features) {
  df[[feature]] <- (df[[feature]] - train_mean[[feature]]) / train_sd[[feature]]
}

train_data <- df[split_index, ]
test_data <- df[-split_index, ]

# Load the necessary library (e1071 provides naiveBayes()).
library(e1071)

# Train the Naive Bayes model: is_fraud is the response and every other
# column of train_data is used as a predictor.
nb_model <- naiveBayes(is_fraud ~ ., data = train_data)

# Make predictions on the test set; type = "class" asks for predicted class
# labels rather than probabilities.
nb_pred <- predict(nb_model, newdata = test_data, type = "class")

# Calculate accuracy for Naive Bayes model: the share of test rows whose
# predicted label equals the recorded is_fraud value.
# NOTE(review): fraud is rare (2,145 of 555,719 rows overall), so a model that
# never predicts fraud would already score roughly 99.6% accuracy. Read this
# number against that baseline; precision/recall would be more informative.
nb_accuracy <- mean(nb_pred == test_data$is_fraud)
nb_accuracy

## [1] 0.96267

# Decision Tree Model (rpart)
library(rpart)

# Train the decision tree model: is_fraud against all other columns of
# train_data. method = "class" requests a classification tree, which is needed
# here because is_fraud is stored as a number.
tree_model <- rpart(is_fraud ~ ., data = train_data, method = "class")

# Predict on test data; type = "class" returns the predicted class label.
tree_pred <- predict(tree_model, newdata = test_data, type = "class")

# Evaluate decision tree model: share of test rows predicted correctly.
# NOTE(review): with roughly 0.4% fraud overall, always predicting "not fraud"
# scores about 99.6% accuracy, so compare against that baseline.
tree_accuracy <- mean(tree_pred == test_data$is_fraud)
tree_accuracy

## [1] 0.9981286

# k-Nearest Neighbor (KNN) Model
library(class)

# knn() needs numeric matrices. data.matrix() converts each character column
# to integer codes derived from the values present in the object it is given,
# so encoding train and test separately can give the same merchant/city/job a
# different code in each set. Both sets are therefore encoded in one pass and
# split back afterwards, which guarantees matching codes.
# NOTE(review): the integer codes still impose an arbitrary ordering on the
# categorical columns and are on a much larger scale than the standardised
# numeric columns -- consider one-hot encoding or dropping them for KNN.
feature_cols <- setdiff(names(train_data), c("is_fraud"))
X_all <- data.matrix(
  bind_rows(train_data[, feature_cols], test_data[, feature_cols])
)
is_train_row <- seq_len(nrow(X_all)) <= nrow(train_data)

X_train <- X_all[is_train_row, , drop = FALSE]
y_train <- train_data$is_fraud
X_test <- X_all[!is_train_row, , drop = FALSE]
y_test <- test_data$is_fraud

# Train the KNN model. knn() has no separate fit step: it returns the
# predicted class for every row of X_test using the 5 nearest training rows.
knn_model <- knn(train = X_train, test = X_test, cl = y_train, k = 5)

# Evaluate KNN model: share of test rows predicted correctly.
knn_accuracy <- mean(knn_model == y_test)
knn_accuracy

## [1] 0.9964011

Project Credit card fraud detection

Vaishali Kondoju

2024-04-22