Housekeeping
rm(list=ls()) # clear workspace
cat("\014") # clear console
graphics.off() # shuts down all open graphics devices
Import dataset
financialfraud <- read.csv("/Users/thuhuong/Desktop/R Working Directory/FinancialFraud.csv")
head(financialfraud)
## step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest
## 1 1 PAYMENT 9839.64 C1231006815 170136 160296.36 M1979787155
## 2 1 PAYMENT 1864.28 C1666544295 21249 19384.72 M2044282225
## 3 1 TRANSFER 181.00 C1305486145 181 0.00 C553264065
## 4 1 CASH_OUT 181.00 C840083671 181 0.00 C38997010
## 5 1 PAYMENT 11668.14 C2048537720 41554 29885.86 M1230701703
## 6 1 PAYMENT 7817.71 C90045638 53860 46042.29 M573487274
## oldbalanceDest newbalanceDest isFraud isFlaggedFraud
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 1 0
## 4 21182 0 1 0
## 5 0 0 0 0
## 6 0 0 0 0
EDA
dim(financialfraud) #dimension of the dataset
## [1] 6362620 11
summary(financialfraud) #summary of the dataset
## step type amount nameOrig
## Min. : 1.0 Length:6362620 Min. : 0 Length:6362620
## 1st Qu.:156.0 Class :character 1st Qu.: 13390 Class :character
## Median :239.0 Mode :character Median : 74872 Mode :character
## Mean :243.4 Mean : 179862
## 3rd Qu.:335.0 3rd Qu.: 208721
## Max. :743.0 Max. :92445517
## oldbalanceOrg newbalanceOrig nameDest oldbalanceDest
## Min. : 0 Min. : 0 Length:6362620 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 Class :character 1st Qu.: 0
## Median : 14208 Median : 0 Mode :character Median : 132706
## Mean : 833883 Mean : 855114 Mean : 1100702
## 3rd Qu.: 107315 3rd Qu.: 144258 3rd Qu.: 943037
## Max. :59585040 Max. :49585040 Max. :356015889
## newbalanceDest isFraud isFlaggedFraud
## Min. : 0 Min. :0.000000 Min. :0.0e+00
## 1st Qu.: 0 1st Qu.:0.000000 1st Qu.:0.0e+00
## Median : 214661 Median :0.000000 Median :0.0e+00
## Mean : 1224996 Mean :0.001291 Mean :2.5e-06
## 3rd Qu.: 1111909 3rd Qu.:0.000000 3rd Qu.:0.0e+00
## Max. :356179279 Max. :1.000000 Max. :1.0e+00
sum(is.na(financialfraud)) # check for missing (NA) values in the dataset
## [1] 0
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
financialfraud %>% count(isFraud) #number of fraud vs. non-fraud transactions
## isFraud n
## 1 0 6354407
## 2 1 8213
library(ggplot2)
g <- ggplot(financialfraud, aes(isFraud))
g + geom_bar(fill="lightblue") + geom_label(stat='count', aes(label = paste0(round(((..count..) /sum(..count..)), 4)*100, "%"))) + labs(x = "Fraud vs Not Fraud", y = "Frequency", title = "Frequency of Fraud", subtitle = "Labels as Percent of Total Observations") + scale_y_continuous(labels = scales::comma)

fraud_trans_type <- financialfraud %>% group_by(type) %>% summarize(fraud_count = sum(isFraud)) # number of fraud transactions by type
ggplot(fraud_trans_type, aes(x=type, y=fraud_count)) + geom_col(fill="lightblue") + labs(title = 'Fraud transactions per Type', x = 'Transaction Type', y = 'Number of Fraud Transactions') + geom_label(aes(label=fraud_count)) + theme_classic()

ggplot(data=financialfraud[financialfraud$isFraud==1,], aes(x=amount)) + geom_histogram(fill="lightblue") + labs(title = 'Fraud transactions Amount distribution', x = 'Amount', y = 'Number of Fraud Transactions') + theme_classic() + scale_x_continuous(labels = scales::comma)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

modulo <- function(y, x) { # computes y mod x
q <- y/x
q <- floor(q) # integer quotient
p <- q*x
mod <- y-p # remainder
return(mod)
}
financialfraud$hour <- modulo(y=financialfraud$step,x=24) # hour of day, treating each step as one hour
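Note that the helper simply reproduces R's built-in modulo operator, so the same hour-of-day feature could be derived in one line (equivalent, not re-run here):
# financialfraud$hour <- financialfraud$step %% 24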
fraud_hour <- financialfraud %>%
group_by(hour) %>%
summarise(cnt = n(),sum=sum(isFraud)) %>%
mutate(fraud_percentage = round((sum/cnt)*100,2))
ggplot(fraud_hour, aes(x=hour,y=fraud_percentage)) + geom_col(fill="lightblue") + labs(title = 'Fraud Percentage per Hour', x = 'Hour', y = 'Percentage of Fraud Transactions') + theme_classic()

Create train and test sets
financialfraud<- financialfraud %>%
select( -one_of('step','nameOrig', 'nameDest', 'isFlaggedFraud'))
library(caret)
## Loading required package: lattice
set.seed(1234)
splitindex <- createDataPartition(financialfraud$isFraud, p=0.7, list=FALSE, times=1)
train <- financialfraud[splitindex,]
table(train$isFraud)
##
## 0 1
## 4448109 5725
test <- financialfraud[-splitindex,]
table(test$isFraud)
##
## 0 1
## 1906298 2488
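createDataPartition stratifies on its outcome argument; because isFraud is stored as numeric here, caret falls back to quantile-based grouping. Passing the label as a factor would make the per-class stratification explicit; a sketch of that variant (not run here):
# splitindex <- createDataPartition(as.factor(financialfraud$isFraud), p=0.7, list=FALSE, times=1)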
# Use the under-sampling (majority class) method to address the imbalanced dataset
library(unbalanced)
## Loading required package: mlr
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
##
## Attaching package: 'mlr'
## The following object is masked from 'package:caret':
##
## train
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
inputs <- train[,-which(names(train) %in% "isFraud")] # predictor columns only
target <- as.factor(train[,which(names(train) %in% "isFraud")]) # response as a factor, as ubUnder expects
under_sam <- ubUnder(X = inputs, Y = target) # under-sample the majority (non-fraud) class
train <- cbind(under_sam$X, under_sam$Y)
train$isFraud <- train$`under_sam$Y` # restore the original column name
train$`under_sam$Y` <- NULL
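Since ubUnder returns the under-sampled predictors and labels separately, the reassembly above can also be written more directly; an equivalent sketch:
# train <- under_sam$X
# train$isFraud <- under_sam$Y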
table(train$isFraud)
##
## 0 1
## 5725 5725
A. Logistic Regression
logistic <- glm(isFraud~., data=train, family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic)
##
## Call:
## glm(formula = isFraud ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.2350 0.0000 0.0000 0.0313 8.4904
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.038e+01 2.571e+02 -0.274 0.784
## typeCASH_OUT 7.162e+01 2.571e+02 0.279 0.781
## typeDEBIT 5.021e+01 5.038e+03 0.010 0.992
## typePAYMENT 5.026e+01 6.888e+02 0.073 0.942
## typeTRANSFER 7.323e+01 2.571e+02 0.285 0.776
## amount -3.484e-06 6.900e-07 -5.049 4.43e-07 ***
## oldbalanceOrg 2.366e-05 7.402e-07 31.964 < 2e-16 ***
## newbalanceOrig -2.162e-05 7.859e-07 -27.514 < 2e-16 ***
## oldbalanceDest 8.245e-06 6.782e-07 12.158 < 2e-16 ***
## newbalanceDest -8.277e-06 6.781e-07 -12.207 < 2e-16 ***
## hour -1.245e-01 8.187e-03 -15.212 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15873.1 on 11449 degrees of freedom
## Residual deviance: 3236.9 on 11439 degrees of freedom
## AIC: 3258.9
##
## Number of Fisher Scoring iterations: 20
logisticprediction <- predict(logistic, test, type="response")
logisticprediction <- ifelse(logisticprediction>0.5, 1, 0)
table(logisticprediction)
## logisticprediction
## 0 1
## 1811902 96884
table(test$isFraud)
##
## 0 1
## 1906298 2488
confusionMatrix(as.factor(logisticprediction),as.factor(test$isFraud))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1811765 137
## 1 94533 2351
##
## Accuracy : 0.9504
## 95% CI : (0.9501, 0.9507)
## No Information Rate : 0.9987
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0449
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.95041
## Specificity : 0.94494
## Pos Pred Value : 0.99992
## Neg Pred Value : 0.02427
## Prevalence : 0.99870
## Detection Rate : 0.94917
## Detection Prevalence : 0.94924
## Balanced Accuracy : 0.94767
##
## 'Positive' Class : 0
##
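Note that caret takes the first factor level ("0") as the positive class by default, so the Sensitivity and Specificity above describe detection of non-fraud. To report the same metrics with fraud (class 1) as the positive class, the positive argument can be set explicitly; a sketch:
confusionMatrix(as.factor(logisticprediction), as.factor(test$isFraud), positive = "1")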
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
auc1 <- roc(test$isFraud, logisticprediction)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc1
##
## Call:
## roc.default(response = test$isFraud, predictor = logisticprediction)
##
## Data: logisticprediction in 1906298 controls (test$isFraud 0) < 2488 cases (test$isFraud 1).
## Area under the curve: 0.9477
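Because logisticprediction was already thresholded to 0/1, this ROC curve has only a single operating point; passing the raw predicted probabilities would trace the full curve. A sketch reusing the fitted model:
roc(test$isFraud, predict(logistic, test, type = "response"))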
B. Decision Tree
library(rpart)
library(rpart.plot)
tree <- rpart(isFraud ~ ., data = train)
prp(tree)
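The tree is grown with rpart's default controls; its complexity-parameter table can be inspected (and the tree pruned) if a simpler model is preferred. A sketch, with an illustrative cp value:
printcp(tree) # cross-validated error by complexity parameter
# pruned <- prune(tree, cp = 0.01) # cp value here is purely illustrative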

treeprediction <- predict(tree, test, type = "class")
table(treeprediction)
## treeprediction
## 0 1
## 1836964 71822
table(test$isFraud)
##
## 0 1
## 1906298 2488
confusionMatrix(as.factor(treeprediction),as.factor(test$isFraud))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1836946 18
## 1 69352 2470
##
## Accuracy : 0.9637
## 95% CI : (0.9634, 0.9639)
## No Information Rate : 0.9987
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0641
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.96362
## Specificity : 0.99277
## Pos Pred Value : 0.99999
## Neg Pred Value : 0.03439
## Prevalence : 0.99870
## Detection Rate : 0.96236
## Detection Prevalence : 0.96237
## Balanced Accuracy : 0.97819
##
## 'Positive' Class : 0
##
auc2 <- roc(test$isFraud, as.numeric(treeprediction))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc2
##
## Call:
## roc.default(response = test$isFraud, predictor = as.numeric(treeprediction))
##
## Data: as.numeric(treeprediction) in 1906298 controls (test$isFraud 0) < 2488 cases (test$isFraud 1).
## Area under the curve: 0.9782
C. Random Forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
rf <- randomForest(isFraud ~ ., data = train, importance = TRUE)
rfprediction <- predict(rf, test)
table(rfprediction)
## rfprediction
## 0 1
## 1878467 30319
confusionMatrix(as.factor(rfprediction),as.factor(test$isFraud))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1878454 13
## 1 27844 2475
##
## Accuracy : 0.9854
## 95% CI : (0.9852, 0.9856)
## No Information Rate : 0.9987
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1488
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.98539
## Specificity : 0.99477
## Pos Pred Value : 0.99999
## Neg Pred Value : 0.08163
## Prevalence : 0.99870
## Detection Rate : 0.98411
## Detection Prevalence : 0.98412
## Balanced Accuracy : 0.99008
##
## 'Positive' Class : 0
##
auc3 <- roc(test$isFraud, as.numeric(rfprediction))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc3
##
## Call:
## roc.default(response = test$isFraud, predictor = as.numeric(rfprediction))
##
## Data: as.numeric(rfprediction) in 1906298 controls (test$isFraud 0) < 2488 cases (test$isFraud 1).
## Area under the curve: 0.9901
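The forest was grown with importance = TRUE, so the permutation and Gini importance measures are available even though they are not examined here; a sketch:
importance(rf) # MeanDecreaseAccuracy and MeanDecreaseGini per predictor
varImpPlot(rf) # dot plots of both importance measures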
D. XGBoost
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
train_x = data.matrix(train[,-which(names(train) %in% "isFraud")]) # predictors as a numeric matrix
train_y = as.numeric(levels(train$isFraud))[train$isFraud] # factor labels back to numeric 0/1
test_x = data.matrix(test[,-which(names(test) %in% "isFraud")])
test$isFraud <- as.factor(test$isFraud)
test_y = as.numeric(levels(test$isFraud))[test$isFraud]
xgb_train = xgb.DMatrix(data=train_x, label=train_y) # DMatrix objects required by xgboost
xgb_test = xgb.DMatrix(data=test_x, label=test_y)
xgmodel <- xgboost(data=xgb_train, nrounds=10000, max_depth=3, early_stopping_rounds=50, objective="binary:logistic", verbose=0)
## [11:12:29] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
xgmodel$best_iteration # best iteration found by early stopping, used to tune nrounds
## [1] 5422
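Since no separate validation watchlist is supplied here, early stopping can only track the training log-loss; cross-validation is a more reliable way to choose nrounds. A sketch using xgb.cv (fold count and parameters are illustrative):
cv <- xgb.cv(params = list(objective = "binary:logistic", max_depth = 3), data = xgb_train, nrounds = 10000, nfold = 5, early_stopping_rounds = 50, verbose = 0)
cv$best_iteration # candidate value for nrounds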
xgmodel <- xgboost(data=xgb_train, nrounds=2290, max_depth=3, early_stopping_rounds=50, objective="binary:logistic", verbose=0)
## [11:12:54] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
xgboostprediction <- predict(xgmodel, xgb_test)
xgboostprediction <- ifelse(xgboostprediction>0.5, 1, 0)
table(xgboostprediction)
## xgboostprediction
## 0 1
## 1887669 21117
confusionMatrix(as.factor(xgboostprediction),as.factor(test$isFraud))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1887662 7
## 1 18636 2481
##
## Accuracy : 0.9902
## 95% CI : (0.9901, 0.9904)
## No Information Rate : 0.9987
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2084
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9902
## Specificity : 0.9972
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.1175
## Prevalence : 0.9987
## Detection Rate : 0.9889
## Detection Prevalence : 0.9889
## Balanced Accuracy : 0.9937
##
## 'Positive' Class : 0
##
auc4 <- roc(test$isFraud, as.numeric(xgboostprediction))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc4
##
## Call:
## roc.default(response = test$isFraud, predictor = as.numeric(xgboostprediction))
##
## Data: as.numeric(xgboostprediction) in 1906298 controls (test$isFraud 0) < 2488 cases (test$isFraud 1).
## Area under the curve: 0.9937
Feature Importance in XGBoost
importance <- xgb.importance(model=xgmodel)
importance
## Feature Gain Cover Frequency
## 1: oldbalanceOrg 0.39580645 0.31005022 0.24926448
## 2: newbalanceOrig 0.35113572 0.10976239 0.01298570
## 3: amount 0.11870235 0.24924811 0.27026479
## 4: type 0.05981641 0.06426301 0.02089885
## 5: hour 0.04205064 0.06551954 0.05610226
## 6: newbalanceDest 0.02682679 0.11797130 0.19793040
## 7: oldbalanceDest 0.00566164 0.08318542 0.19255352
xgb.plot.importance(importance_matrix = importance, xlab = "Relative importance based on Gain")
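For a side-by-side comparison of the four models, the AUC values computed above can be collected into a single table; a sketch:
data.frame(model = c("Logistic Regression", "Decision Tree", "Random Forest", "XGBoost"), auc = c(auc(auc1), auc(auc2), auc(auc3), auc(auc4)))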
