library(scorecard)
## Warning: package 'scorecard' was built under R version 4.3.3
library(tidyverse) # metapackage of all tidyverse packages
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::replace_na() masks scorecard::replace_na()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.3.3
library(dplyr)
# Load the raw transactions and attach a monthly bucket to every row.
# NOTE: ceiling_date() rounds *up*, so the bucket label is the first day of
# the month after the transaction (e.g. 21/06/2020 is labelled 2020-07-01).
df <- read.csv("fraud test.csv")
df$date <- df$trans_date_trans_time %>%
  dmy_hm() %>%
  ceiling_date("month") %>%
  as.Date()

# Number of transactions and fraud rate (in percent) per monthly bucket.
df %>%
  group_by(date) %>%
  summarize(n(), fraud_Rate = 100 * mean(is_fraud))
## # A tibble: 7 × 3
## date `n()` fraud_Rate
## <date> <int> <dbl>
## 1 2020-07-01 30059 0.442
## 2 2020-08-01 85850 0.374
## 3 2020-09-01 88756 0.468
## 4 2020-10-01 69535 0.489
## 5 2020-11-01 69348 0.554
## 6 2020-12-01 72634 0.405
## 7 2021-01-01 139537 0.185
# Class balance: number of rows for each value of the is_fraud label.
df %>%
  group_by(is_fraud) %>%
  summarize(n())
## # A tibble: 2 × 2
## is_fraud `n()`
## <int> <int>
## 1 0 553574
## 2 1 2145
The table shows that the data set contains 553,574 non-fraudulent transactions and 2,145 fraudulent transactions, i.e. roughly 0.39% of all transactions are fraudulent.
# Preview the first six rows to see the raw columns and their formats.
head(df, n = 6L)
## X trans_date_trans_time cc_num merchant
## 1 0 21/06/2020 12:14 2.29116e+15 fraud_Kirlin and Sons
## 2 1 21/06/2020 12:14 3.57303e+15 fraud_Sporer-Keebler
## 3 2 21/06/2020 12:14 3.59822e+15 fraud_Swaniawski, Nitzsche and Welch
## 4 3 21/06/2020 12:15 3.59192e+15 fraud_Haley Group
## 5 4 21/06/2020 12:15 3.52683e+15 fraud_Johnston-Casper
## 6 5 21/06/2020 12:15 3.04077e+13 fraud_Daugherty LLC
## category amt first last gender street
## 1 personal_care 2.86 Jeff Elliott M 351 Darlene Green
## 2 personal_care 29.84 Joanne Williams F 3638 Marsh Union
## 3 health_fitness 41.28 Ashley Lopez F 9333 Valentine Point
## 4 misc_pos 60.05 Brian Williams M 32941 Krystal Mill Apt. 552
## 5 travel 3.19 Nathan Massey M 5783 Evan Roads Apt. 465
## 6 kids_pets 19.55 Danielle Evans F 76752 David Lodge Apt. 064
## city state zip lat long city_pop job
## 1 Columbia SC 29209 33.9659 -80.9355 333497 Mechanical engineer
## 2 Altonah UT 84002 40.3207 -110.4360 302 Sales professional, IT
## 3 Bellmore NY 11710 40.6729 -73.5365 34496 Librarian, public
## 4 Titusville FL 32780 28.5697 -80.8191 54767 Set designer
## 5 Falmouth MI 49632 44.2529 -85.0170 1126 Furniture designer
## 6 Breesport NY 14816 42.1939 -76.7361 520 Psychotherapist
## dob trans_num unix_time merch_lat merch_long
## 1 19/03/1968 2da90c7d74bd46a0caf3777415b3ebd3 1371816865 33.98639 -81.20071
## 2 17/01/1990 324cc204407e99f51b0d6ca0055005e7 1371816873 39.45050 -109.96043
## 3 21/10/1970 c81755dbbbea9d5c77f094348a7579be 1371816893 40.49581 -74.19611
## 4 25/07/1987 2159175b9efe66dc301f149d3d5abf8c 1371816915 28.81240 -80.88306
## 5 06/07/1955 57ff021bd3f328f8738bb535c302a31b 1371816917 44.95915 -85.88473
## 6 13/10/1991 798db04aaceb4febd084f1a7c404da93 1371816937 41.74716 -77.58420
## is_fraud date
## 1 0 2020-07-01
## 2 0 2020-07-01
## 3 0 2020-07-01
## 4 0 2020-07-01
## 5 0 2020-07-01
## 6 0 2020-07-01
# Column types and sample values; note that the date/time columns are still
# stored as character strings at this point.
str(object = df)
## 'data.frame': 555719 obs. of 24 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ trans_date_trans_time: chr "21/06/2020 12:14" "21/06/2020 12:14" "21/06/2020 12:14" "21/06/2020 12:15" ...
## $ cc_num : num 2.29e+15 3.57e+15 3.60e+15 3.59e+15 3.53e+15 ...
## $ merchant : chr "fraud_Kirlin and Sons" "fraud_Sporer-Keebler" "fraud_Swaniawski, Nitzsche and Welch" "fraud_Haley Group" ...
## $ category : chr "personal_care" "personal_care" "health_fitness" "misc_pos" ...
## $ amt : num 2.86 29.84 41.28 60.05 3.19 ...
## $ first : chr "Jeff" "Joanne" "Ashley" "Brian" ...
## $ last : chr "Elliott" "Williams" "Lopez" "Williams" ...
## $ gender : chr "M" "F" "F" "M" ...
## $ street : chr "351 Darlene Green" "3638 Marsh Union" "9333 Valentine Point" "32941 Krystal Mill Apt. 552" ...
## $ city : chr "Columbia" "Altonah" "Bellmore" "Titusville" ...
## $ state : chr "SC" "UT" "NY" "FL" ...
## $ zip : int 29209 84002 11710 32780 49632 14816 95528 57374 16858 76678 ...
## $ lat : num 34 40.3 40.7 28.6 44.3 ...
## $ long : num -80.9 -110.4 -73.5 -80.8 -85 ...
## $ city_pop : int 333497 302 34496 54767 1126 520 1139 343 3688 263 ...
## $ job : chr "Mechanical engineer" "Sales professional, IT" "Librarian, public" "Set designer" ...
## $ dob : chr "19/03/1968" "17/01/1990" "21/10/1970" "25/07/1987" ...
## $ trans_num : chr "2da90c7d74bd46a0caf3777415b3ebd3" "324cc204407e99f51b0d6ca0055005e7" "c81755dbbbea9d5c77f094348a7579be" "2159175b9efe66dc301f149d3d5abf8c" ...
## $ unix_time : int 1371816865 1371816873 1371816893 1371816915 1371816917 1371816937 1371816944 1371816950 1371816970 1371816971 ...
## $ merch_lat : num 34 39.5 40.5 28.8 45 ...
## $ merch_long : num -81.2 -110 -74.2 -80.9 -85.9 ...
## $ is_fraud : int 0 0 0 0 0 0 0 0 0 0 ...
## $ date : Date, format: "2020-07-01" "2020-07-01" ...
# Per merchant category: transaction count, fraud rate (in percent) and the
# largest / smallest charged amount.
df %>%
  group_by(category) %>%
  summarize(
    n(),
    Fraud_Rate = 100 * mean(is_fraud),
    max(amt),
    min(amt)
  )
## # A tibble: 14 × 5
## category `n()` Fraud_Rate `max(amt)` `min(amt)`
## <chr> <int> <dbl> <dbl> <dbl>
## 1 entertainment 40104 0.147 801. 1
## 2 food_dining 39268 0.138 544. 1
## 3 gas_transport 56370 0.273 147. 5.53
## 4 grocery_net 19426 0.211 178. 1.07
## 5 grocery_pos 52553 0.923 392 10.7
## 6 health_fitness 36674 0.142 594. 1
## 7 home 52345 0.128 507. 1
## 8 kids_pets 48692 0.133 555. 1.01
## 9 misc_net 27367 0.976 2802. 1
## 10 misc_pos 34574 0.208 4801. 1
## 11 personal_care 39327 0.178 522. 1
## 12 shopping_net 41779 1.21 6852. 1
## 13 shopping_pos 49791 0.428 7321. 1
## 14 travel 17449 0.229 22768. 1
# Distribution of charged amounts, drawn separately for each class.
# `data` is a scratch subset that is overwritten for every plot.
data <- filter(df, is_fraud == 0)
hist(
  data$amt,
  breaks = 100,
  col = "blue",
  main = "Histogram of CC charges for Non-Fraud Cases",
  xlab = "Values",
  ylab = "Frequency"
)

data <- filter(df, is_fraud == 1)
hist(
  data$amt,
  breaks = 100,
  col = "blue",
  main = "Histogram of CC charges for Fraud Cases",
  xlab = "Values",
  ylab = "Frequency"
)
# One-way ANOVA of the charged amount against the fraud label.
# is_fraud is a 0/1 integer, so the term enters with a single degree of
# freedom.
model <- aov(formula = amt ~ is_fraud, data = df)

# Print the ANOVA table (sums of squares, F statistic, p-value).
summary(model)
## Df Sum Sq Mean Sq F value Pr(>F)
## is_fraud 1 4.536e+08 453590034 19096 <2e-16 ***
## Residuals 555717 1.320e+10 23753
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Feature engineering:
# Creation of previous fraud transaction flag
#
# previous_fraud is 1 for every transaction that happens strictly after the
# customer's first recorded fraudulent transaction, and 0 otherwise
# (including customers with no fraud at all).
#
# Timestamps are parsed with dmy_hm() before they are compared. The raw
# column holds "dd/mm/yyyy HH:MM" strings, and comparing those as text sorts
# by day-of-month first, which does not give chronological order.
#
# NOTE(review): customers are identified by first + last name; two different
# people sharing a name would be merged. Confirm this is acceptable (cc_num
# may be a safer key).
df$full_name <- paste(df$first, df$last)

# Earliest fraudulent transaction per customer, as a real date-time.
min_date_of_fraud <- df %>%
  filter(is_fraud == 1) %>%
  group_by(full_name) %>%
  summarize(min_date_fraud = min(dmy_hm(trans_date_trans_time)))

# left_join keeps every row of df; customers without fraud get NA.
df <- left_join(df, min_date_of_fraud, by = "full_name")

# NA (no fraud on record) must map to 0, hence the explicit is.na() guard.
df$previous_fraud <- as.numeric(
  !is.na(df$min_date_fraud) &
    dmy_hm(df$trans_date_trans_time) > df$min_date_fraud
)
# Creation of number of transactions of customer
#
# num_of_transactions is a running 1, 2, 3, ... counter within each customer,
# assigned in the current row order of df.
# NOTE(review): it only means "transactions so far" if the file is sorted
# chronologically - the preview suggests it is, but confirm.
df <- df %>%
  group_by(full_name) %>%
  mutate(num_of_transactions = seq_len(n())) %>%
  ungroup()

# Compare the counter's distribution between fraud and non-fraud rows.
data <- filter(df, is_fraud == 1)
hist(
  data$num_of_transactions,
  col = "blue",
  main = "Histogram of num of past transactions (only fraud cases)"
)

data <- filter(df, is_fraud == 0)
hist(
  data$num_of_transactions,
  col = "blue",
  main = "Histogram of num of past transactions (only non-fraud cases)"
)
# Creation of age of customer
#
# Age is approximated as (transaction year - birth year); it ignores whether
# the birthday has already passed in the transaction year.
df$dob_year <- format(as.Date(df$dob, format = "%d/%m/%Y"), "%Y")
df$year <- format(as.Date(dmy_hm(df$trans_date_trans_time)), "%Y")
df$age_of_user <- as.double(df$year) - as.double(df$dob_year)

# Age distribution for each class.
data <- filter(df, is_fraud == 1)
hist(
  data$age_of_user,
  col = "red",
  main = "Histogram of age_of_user (only fraud cases)"
)

data <- filter(df, is_fraud == 0)
hist(
  data$age_of_user,
  col = "blue",
  main = "Histogram of age_of_user (only non-fraud cases)"
)
## Creation of time based variables
#
# The timestamp is parsed once with dmy_hm() (always UTC), so the derived
# hour/day/month do not depend on the session time zone.
# The weekend flag is derived from the numeric weekday instead of comparing
# weekdays() output with English names, which silently yields all zeros under
# a non-English locale.
trans_ts <- dmy_hm(df$trans_date_trans_time)

df <- df %>%
  mutate(
    trans_hour = hour(trans_ts),
    # Day name is kept for display only; its language follows the locale.
    trans_day = weekdays(trans_ts),
    trans_month = month(trans_ts),
    trans_month_name = month.abb[trans_month],
    # week_start = 1 numbers Monday as 1, so Saturday/Sunday are 6 and 7.
    trans_weekend = as.numeric(wday(trans_ts, week_start = 1) >= 6)
  )
# Creation of numerical features: squared versions of the three continuous
# predictors, computed on the raw (unscaled) values.
df <- df %>%
  mutate(
    amt_sq = amt * amt,
    num_of_transactions_sq = num_of_transactions * num_of_transactions,
    age_of_user_sq = age_of_user * age_of_user
  )
## Random 80/20 train / test split
#
# NOTE: rows are sampled at random, so this is NOT an out-of-time split; the
# test set contains transactions from the same period as the training set.
#
# Keep only the label and the predictors used by the models below.
df <- df %>%
  select(
    merchant, category, amt, gender, city, job, is_fraud, previous_fraud,
    num_of_transactions, age_of_user, trans_hour, amt_sq,
    num_of_transactions_sq, age_of_user_sq
  )

# Draw the split first so the scaling below can be fitted on training rows.
# seq_len() is safe for an empty frame (1:0 would yield c(1, 0)).
set.seed(42) # Set seed for reproducibility
split_index <- sample(seq_len(nrow(df)), floor(0.8 * nrow(df)))

# Standardise the continuous predictors with the mean / sd of the TRAINING
# rows only, then apply the same transformation to every row. Using
# statistics of the full data set would leak information from the test rows
# into training. Metrics printed by an earlier run may shift marginally.
scale_cols <- c(
  "amt", "num_of_transactions", "age_of_user",
  "amt_sq", "num_of_transactions_sq", "age_of_user_sq"
)
df[scale_cols] <- lapply(df[scale_cols], function(values) {
  train_values <- values[split_index]
  (values - mean(train_values)) / sd(train_values)
})

train_data <- df[split_index, ]
test_data <- df[-split_index, ]
# Naive Bayes model (e1071)
library(e1071)

# Fit on the training rows using every remaining column as a predictor.
nb_model <- naiveBayes(formula = is_fraud ~ ., data = train_data)

# Predicted class labels for the held-out rows.
nb_pred <- predict(nb_model, test_data, type = "class")

# Share of held-out rows whose predicted label equals the true label.
# NOTE(review): only about 0.4% of rows are fraud, so plain accuracy says
# little about how well fraud is detected; consider recall/precision or AUC.
nb_accuracy <- mean(nb_pred == test_data$is_fraud)
nb_accuracy
## [1] 0.96267
# Decision tree model (rpart)
library(rpart)

# method = "class" requests a classification tree for the 0/1 label.
tree_model <- rpart(formula = is_fraud ~ ., data = train_data, method = "class")

# Predicted class labels for the held-out rows.
tree_pred <- predict(tree_model, test_data, type = "class")

# Share of held-out rows classified correctly.
tree_accuracy <- mean(tree_pred == test_data$is_fraud)
tree_accuracy
## [1] 0.9981286
# k-Nearest Neighbor (KNN) Model
library(class)

predictor_cols <- setdiff(names(train_data), "is_fraud")

# data.matrix() turns character columns into factors and then into integer
# codes. Encoding train and test separately would build the codes from two
# different level sets, so the same merchant/city/job could map to different
# numbers in the two matrices. Encode both parts together, then split the
# matrix back by row position.
# NOTE(review): integer codes still impose an arbitrary ordering on the
# categorical columns; one-hot encoding or dropping them may suit KNN better.
n_train <- nrow(train_data)
X_all <- data.matrix(
  rbind(train_data[, predictor_cols], test_data[, predictor_cols])
)
X_train <- X_all[seq_len(n_train), , drop = FALSE]
X_test <- X_all[-seq_len(n_train), , drop = FALSE]
y_train <- train_data$is_fraud
y_test <- test_data$is_fraud

# Classify every test row by majority vote among its 5 nearest training rows.
knn_model <- knn(train = X_train, test = X_test, cl = y_train, k = 5)

# Share of held-out rows classified correctly. Any accuracy printed by an
# earlier run predates the consistent encoding above; re-run to refresh it.
knn_accuracy <- mean(knn_model == y_test)
knn_accuracy
## [1] 0.9964011