In this markdown, I’ve used German credit card dataset and used SMOTE to handle class imbalance and then I’ve used Logistic and Random Forest to predict if the probability of fraud.
The datasets contain transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions
Dataset link
x <- read.csv("E:\\Papers\\creditcard.csv", sep = ",")
set.seed(1029)
## Remove rows that do not have target variable values
final <- x[!(is.na(x$Class)),]
final$Class <- factor(final$Class)
library(caTools)
split <- sample.split(final$Class, SplitRatio = 0.75)
dresstrain <- subset(final, split == TRUE)
dresstest <- subset(final, split == FALSE)
## Let's check the count of unique value in the target variable
as.data.frame(table(dresstrain$Class))
## Var1 Freq
## 1 0 213236
## 2 1 369
## Loading DMwr to balance the unbalanced class
library(DMwR)
## Smote : Synthetic Minority Oversampling Technique To Handle Class Imbalancy In Binary Classification
balanced.data <- SMOTE(Class ~., dresstrain, perc.over = 4800, k = 5, perc.under = 1000)
as.data.frame(table(balanced.data$Class))
## Var1 Freq
## 1 0 177120
## 2 1 18081
Logistic Regression
library(caret)
model <- glm (Class~., data=balanced.data, family = binomial)
summary(model)
##
## Call:
## glm(formula = Class ~ ., family = binomial, data = balanced.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4904 -0.1345 -0.0819 -0.0478 3.3359
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.359e+00 7.607e-02 -70.447 < 2e-16 ***
## Time -3.417e-06 6.392e-07 -5.346 8.97e-08 ***
## V1 1.884e-01 1.809e-02 10.415 < 2e-16 ***
## V2 1.685e-01 3.935e-02 4.281 1.86e-05 ***
## V3 9.730e-02 2.023e-02 4.809 1.52e-06 ***
## V4 7.327e-01 2.062e-02 35.538 < 2e-16 ***
## V5 2.216e-01 2.890e-02 7.667 1.76e-14 ***
## V6 -2.952e-01 2.562e-02 -11.521 < 2e-16 ***
## V7 -1.377e-01 3.549e-02 -3.879 0.000105 ***
## V8 -2.812e-01 1.594e-02 -17.639 < 2e-16 ***
## V9 -1.487e-01 2.957e-02 -5.027 4.99e-07 ***
## V10 -5.672e-01 3.238e-02 -17.516 < 2e-16 ***
## V11 1.888e-01 2.234e-02 8.454 < 2e-16 ***
## V12 -5.884e-01 2.315e-02 -25.420 < 2e-16 ***
## V13 -4.877e-01 2.058e-02 -23.694 < 2e-16 ***
## V14 -7.797e-01 2.047e-02 -38.096 < 2e-16 ***
## V15 5.692e-03 2.401e-02 0.237 0.812620
## V16 -1.856e-01 3.199e-02 -5.803 6.53e-09 ***
## V17 -1.270e-01 2.262e-02 -5.613 1.99e-08 ***
## V18 6.157e-03 3.308e-02 0.186 0.852358
## V19 -1.155e-01 2.775e-02 -4.162 3.15e-05 ***
## V20 -3.460e-01 4.104e-02 -8.430 < 2e-16 ***
## V21 1.704e-01 2.141e-02 7.958 1.75e-15 ***
## V22 3.873e-01 3.648e-02 10.619 < 2e-16 ***
## V23 1.175e-02 3.474e-02 0.338 0.735187
## V24 2.134e-01 4.225e-02 5.050 4.42e-07 ***
## V25 2.036e-01 4.304e-02 4.730 2.25e-06 ***
## V26 -3.818e-01 5.476e-02 -6.972 3.12e-12 ***
## V27 -1.297e-01 6.074e-02 -2.134 0.032804 *
## V28 1.802e-02 5.698e-02 0.316 0.751872
## Amount 2.313e-03 3.561e-04 6.495 8.31e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 120468 on 195200 degrees of freedom
## Residual deviance: 20712 on 195170 degrees of freedom
## AIC: 20774
##
## Number of Fisher Scoring iterations: 25
## Predict the Values
predict <- predict(model, dresstest, type = 'response')
## Create Confusion Matrix
table(dresstest$Class, predict > 0.5)
##
## FALSE TRUE
## 0 70872 207
## 1 18 105
#ROCR Curve
library(ROCR)
ROCRpred <- prediction(predict, dresstest$Class)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2,1.7))

Random FOrest
library(randomForest)
library(e1071)
rf = randomForest(Class~.,
ntree = 100,
data = balanced.data)
plot(rf)

varImp(rf)
## Overall
## Time 212.8759
## V1 321.0845
## V2 463.3183
## V3 1916.0623
## V4 1264.0709
## V5 188.7063
## V6 340.9743
## V7 929.5873
## V8 270.2217
## V9 1140.0924
## V10 4865.3979
## V11 2678.2463
## V12 3266.2186
## V13 243.7294
## V14 6561.4483
## V15 180.2387
## V16 1908.9783
## V17 3115.7313
## V18 529.4505
## V19 222.6081
## V20 162.6423
## V21 493.5526
## V22 145.5791
## V23 156.9237
## V24 154.1533
## V25 145.8141
## V26 185.8320
## V27 271.7943
## V28 222.8617
## Amount 250.0227
## Important variables according to the model
varImpPlot(rf,
sort = T,
n.var=25,
main="Variable Importance")

predicted.response <- predict(rf, dresstest)
confusionMatrix(data=predicted.response,
reference=dresstest$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 71060 20
## 1 19 103
##
## Accuracy : 0.9995
## 95% CI : (0.9993, 0.9996)
## No Information Rate : 0.9983
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8405
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9997
## Specificity : 0.8374
## Pos Pred Value : 0.9997
## Neg Pred Value : 0.8443
## Prevalence : 0.9983
## Detection Rate : 0.9980
## Detection Prevalence : 0.9983
## Balanced Accuracy : 0.9186
##
## 'Positive' Class : 0
##