Packages and Data

library(ranger)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(data.table)
library(readr)
library(class)
# Load the transactions CSV into a tibble; show_col_types = FALSE suppresses
# readr's column-specification message.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs unchanged on the machine it was written on.
creditcard_data <- read_csv(
  file = "/Users/bhavyakalra/Desktop/Data_analytics/Credit Card fruad detection/creditcard.csv",
  show_col_types = FALSE
)

Data Exploration

# Preview the first rows (head() shows 6 by default) to check how the file
# was parsed: 31 columns, all read as doubles.
head(creditcard_data)
## # A tibble: 6 × 31
##    Time     V1      V2    V3     V4      V5      V6      V7      V8     V9
##   <dbl>  <dbl>   <dbl> <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
## 1     0 -1.36  -0.0728 2.54   1.38  -0.338   0.462   0.240   0.0987  0.364
## 2     0  1.19   0.266  0.166  0.448  0.0600 -0.0824 -0.0788  0.0851 -0.255
## 3     1 -1.36  -1.34   1.77   0.380 -0.503   1.80    0.791   0.248  -1.51 
## 4     1 -0.966 -0.185  1.79  -0.863 -0.0103  1.25    0.238   0.377  -1.39 
## 5     2 -1.16   0.878  1.55   0.403 -0.407   0.0959  0.593  -0.271   0.818
## 6     2 -0.426  0.961  1.14  -0.168  0.421  -0.0297  0.476   0.260  -0.569
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## #   V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## #   V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## #   V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
# Number of rows and columns in the loaded data.
dim(creditcard_data)
## [1] 284807     31
# Same preview as the bare head() call earlier; 6 is head()'s default row
# count, so this prints identical output.
head(creditcard_data,6)
## # A tibble: 6 × 31
##    Time     V1      V2    V3     V4      V5      V6      V7      V8     V9
##   <dbl>  <dbl>   <dbl> <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
## 1     0 -1.36  -0.0728 2.54   1.38  -0.338   0.462   0.240   0.0987  0.364
## 2     0  1.19   0.266  0.166  0.448  0.0600 -0.0824 -0.0788  0.0851 -0.255
## 3     1 -1.36  -1.34   1.77   0.380 -0.503   1.80    0.791   0.248  -1.51 
## 4     1 -0.966 -0.185  1.79  -0.863 -0.0103  1.25    0.238   0.377  -1.39 
## 5     2 -1.16   0.878  1.55   0.403 -0.407   0.0959  0.593  -0.271   0.818
## 6     2 -0.426  0.961  1.14  -0.168  0.421  -0.0297  0.476   0.260  -0.569
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## #   V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## #   V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## #   V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
# Last six rows, to see how the data ends (Time runs up to 172792).
tail(creditcard_data,6)
## # A tibble: 6 × 31
##     Time      V1      V2     V3     V4      V5     V6      V7     V8     V9
##    <dbl>   <dbl>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl>   <dbl>  <dbl>  <dbl>
## 1 172785   0.120  0.931  -0.546 -0.745  1.13   -0.236  0.813   0.115 -0.204
## 2 172786 -11.9   10.1    -9.83  -2.07  -5.36   -2.61  -4.92    7.31   1.91 
## 3 172787  -0.733 -0.0551  2.04  -0.739  0.868   1.06   0.0243  0.295  0.585
## 4 172788   1.92  -0.301  -3.25  -0.558  2.63    3.03  -0.297   0.708  0.432
## 5 172788  -0.240  0.530   0.703  0.690 -0.378   0.624 -0.686   0.679  0.392
## 6 172792  -0.533 -0.190   0.703 -0.506 -0.0125 -0.650  1.58   -0.415  0.486
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## #   V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## #   V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## #   V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
# Class distribution: only 492 of the 284,807 rows have Class = 1 (about
# 0.17%), so the target is heavily imbalanced.
table(creditcard_data$Class)
## 
##      0      1 
## 284315    492
# Five-number summary of Amount: the mean (88.35) is far above the median
# (22.00) and the maximum is 25,691.16, i.e. the distribution is right-skewed.
summary(creditcard_data$Amount)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     5.60    22.00    88.35    77.17 25691.16
# Column names: Time, V1..V28, Amount and the Class target.
names(creditcard_data)
##  [1] "Time"   "V1"     "V2"     "V3"     "V4"     "V5"     "V6"     "V7"    
##  [9] "V8"     "V9"     "V10"    "V11"    "V12"    "V13"    "V14"    "V15"   
## [17] "V16"    "V17"    "V18"    "V19"    "V20"    "V21"    "V22"    "V23"   
## [25] "V24"    "V25"    "V26"    "V27"    "V28"    "Amount" "Class"
# Spread of Amount on its original scale; sd is the square root of var.
var(creditcard_data$Amount)
## [1] 62560.07
sd(creditcard_data$Amount)
## [1] 250.1201

Data Manipulation

# Snapshot of the raw data before Amount is rescaled and Time is dropped.
head(creditcard_data)
## # A tibble: 6 × 31
##    Time     V1      V2    V3     V4      V5      V6      V7      V8     V9
##   <dbl>  <dbl>   <dbl> <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
## 1     0 -1.36  -0.0728 2.54   1.38  -0.338   0.462   0.240   0.0987  0.364
## 2     0  1.19   0.266  0.166  0.448  0.0600 -0.0824 -0.0788  0.0851 -0.255
## 3     1 -1.36  -1.34   1.77   0.380 -0.503   1.80    0.791   0.248  -1.51 
## 4     1 -0.966 -0.185  1.79  -0.863 -0.0103  1.25    0.238   0.377  -1.39 
## 5     2 -1.16   0.878  1.55   0.403 -0.407   0.0959  0.593  -0.271   0.818
## 6     2 -0.426  0.961  1.14  -0.168  0.421  -0.0297  0.476   0.260  -0.569
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## #   V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## #   V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## #   V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
# Standardise Amount to zero mean and unit variance. scale() returns a
# one-column matrix, which a tibble stores as a matrix column (<dbl[,1]>);
# as.numeric() keeps exactly the same values as a plain numeric vector so the
# column behaves like every other predictor.
# NOTE(review): the centring/scaling statistics are computed on the full data
# set, i.e. before the train/test split.
creditcard_data$Amount <- as.numeric(scale(creditcard_data$Amount))
# Drop the first column (Time); NewData keeps V1..V28, Amount and Class.
NewData <- creditcard_data[, -1]
head(NewData)
## # A tibble: 6 × 30
##       V1      V2    V3     V4      V5      V6      V7      V8     V9     V10
##    <dbl>   <dbl> <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>
## 1 -1.36  -0.0728 2.54   1.38  -0.338   0.462   0.240   0.0987  0.364  0.0908
## 2  1.19   0.266  0.166  0.448  0.0600 -0.0824 -0.0788  0.0851 -0.255 -0.167 
## 3 -1.36  -1.34   1.77   0.380 -0.503   1.80    0.791   0.248  -1.51   0.208 
## 4 -0.966 -0.185  1.79  -0.863 -0.0103  1.25    0.238   0.377  -1.39  -0.0550
## 5 -1.16   0.878  1.55   0.403 -0.407   0.0959  0.593  -0.271   0.818  0.753 
## 6 -0.426  0.961  1.14  -0.168  0.421  -0.0297  0.476   0.260  -0.569 -0.371 
## # ℹ 20 more variables: V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>, V15 <dbl>,
## #   V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
## #   V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
## #   V28 <dbl>, Amount <dbl>, Class <dbl>
# Data Modelling
library(caTools)
# Reproducible 80/20 train/test split. sample.split() is given the Class
# labels and returns a logical vector that is TRUE for the training rows.
set.seed(123)
data_sample <- sample.split(NewData$Class, SplitRatio = 0.80)
train_data <- subset(NewData, data_sample)
test_data <- subset(NewData, !data_sample)
dim(train_data)
## [1] 227846     30
dim(test_data)
## [1] 56961    30

Fitting Logistic Regression Model

# Logistic regression of Class on every other column of train_data
# (V1..V28 and the scaled Amount).
Logistic_Model <- glm(
  formula = Class ~ .,
  family = binomial(),
  data = train_data
)
summary(Logistic_Model)
## 
## Call:
## glm(formula = Class ~ ., family = binomial(), data = train_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.651305   0.160212 -53.999  < 2e-16 ***
## V1           0.072540   0.044144   1.643 0.100332    
## V2           0.014818   0.059777   0.248 0.804220    
## V3           0.026109   0.049776   0.525 0.599906    
## V4           0.681286   0.078071   8.726  < 2e-16 ***
## V5           0.087938   0.071553   1.229 0.219079    
## V6          -0.148083   0.085192  -1.738 0.082170 .  
## V7          -0.117344   0.068940  -1.702 0.088731 .  
## V8          -0.146045   0.035667  -4.095 4.23e-05 ***
## V9          -0.339828   0.117595  -2.890 0.003855 ** 
## V10         -0.785462   0.098486  -7.975 1.52e-15 ***
## V11          0.001492   0.085147   0.018 0.986018    
## V12          0.087106   0.094869   0.918 0.358532    
## V13         -0.343792   0.092381  -3.721 0.000198 ***
## V14         -0.526828   0.067084  -7.853 4.05e-15 ***
## V15         -0.095471   0.094037  -1.015 0.309991    
## V16         -0.130225   0.138629  -0.939 0.347537    
## V17          0.032463   0.074471   0.436 0.662900    
## V18         -0.100964   0.140985  -0.716 0.473909    
## V19          0.083711   0.105134   0.796 0.425897    
## V20         -0.463946   0.081871  -5.667 1.46e-08 ***
## V21          0.381206   0.065880   5.786 7.19e-09 ***
## V22          0.610874   0.142086   4.299 1.71e-05 ***
## V23         -0.071406   0.058799  -1.214 0.224589    
## V24          0.255791   0.170568   1.500 0.133706    
## V25         -0.073955   0.142634  -0.519 0.604109    
## V26          0.120841   0.202553   0.597 0.550783    
## V27         -0.852018   0.118391  -7.197 6.17e-13 ***
## V28         -0.323854   0.090075  -3.595 0.000324 ***
## Amount       0.292477   0.092075   3.177 0.001491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5799.1  on 227845  degrees of freedom
## Residual deviance: 1790.9  on 227816  degrees of freedom
## AIC: 1850.9
## 
## Number of Fisher Scoring iterations: 12

Visualizing the fitted model with diagnostic plots

# Draw the default diagnostic plots for the fitted glm object.
plot(Logistic_Model)

ROC Curve to assess the performance of the model

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Predicted probabilities of Class = 1 for the held-out rows. predict() on a
# glm has no `probability` argument (it is silently ignored and link-scale
# scores are returned); type = "response" is the way to request probabilities.
lr.predict <- predict(Logistic_Model, newdata = test_data, type = "response")
# ROC curve of the logistic model on the test set. The result is a pROC roc
# object; the name auc.gbm is historical and does not refer to a GBM model.
auc.gbm <- roc(test_data$Class, lr.predict, plot = TRUE, col = "blue")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Accuracy and Confusion Matrix

# Predicted probabilities of Class = 1 for the held-out rows.
predictions <- predict(Logistic_Model, newdata = test_data, type = "response")
# Label a row as 1 when its predicted probability exceeds 0.5, otherwise 0.
threshold <- 0.5
binary_predictions <- ifelse(predictions > threshold, 1, 0)
# Rows are the actual Class, columns the predicted label.
confusion_matrix <- table(test_data$Class, binary_predictions)
print(confusion_matrix)
##    binary_predictions
##         0     1
##   0 56856     7
##   1    41    57
# Accuracy = correctly classified rows (the diagonal) / all test rows.
# NOTE(review): only 98 of the 56,961 test rows are Class 1, so accuracy is
# dominated by the majority class; the matrix shows 41 of those 98 are missed.
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.99915731816506"

Fitting a Decision Tree Model

library(rpart)
library(rpart.plot)
# Feature table (every column except Class) and the 0/1 target.
# NOTE(review): this starts from creditcard_data rather than NewData, so the
# tree still sees the Time column that the other models do not use.
X <- creditcard_data[, setdiff(names(creditcard_data), "Class")]
y <- creditcard_data[["Class"]]

# Separate 80/20 split with its own seed, so the tree is trained and evaluated
# on different rows than train_data/test_data.
# NOTE(review): y is numeric here; createDataPartition() groups numeric
# outcomes by percentiles rather than by class label. Confirm the split is
# stratified as intended, or pass factor(y).
set.seed(42)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
y_train <- y[train_index]
X_test <- X[-train_index, ]
y_test <- y[-train_index]
# Classification tree on all columns of X_train; the response y_train is not a
# column of X_train and is resolved from the calling environment.
classifier <- rpart(formula = y_train ~ ., data = X_train, method = "class")
rpart.plot(classifier)

Accuracy and Confusion Matrix

# Predicted class labels for the tree's own hold-out rows.
y_pred <- predict(classifier, X_test, type = "class")
# Share of matching labels expressed as a percentage (note the * 100); the
# logistic-regression and KNN sections print a proportion instead.
accuracy <- sum(y_pred == y_test) / length(y_test) * 100
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 99.9490879724724"
# Rows are the predicted label, columns the actual Class: the transpose of
# the layout used for the logistic-regression and KNN confusion matrices.
confusion_mat <- table(y_pred, y_test)
print(confusion_mat)
##       y_test
## y_pred     0     1
##      0 56853    23
##      1     6    79

Fitting a KNN model

# 5-nearest-neighbour classification of the test rows, using every column
# except Class (the last column of train_data/test_data) as features.
# Despite its name, knn_model holds the predicted labels for test_data, not a
# reusable fitted model.
# NOTE(review): no seed is set directly before this call; confirm whether
# knn()'s tie handling makes the result depend on the RNG state.
knn_model <- knn(
  train = train_data[, names(train_data) != "Class"],
  test = test_data[, names(test_data) != "Class"],
  cl = train_data$Class,
  k = 5
)

Accuracy and Confusion Matrix

# Build the KNN confusion matrix first (rows: actual Class, columns: KNN
# prediction). The accuracy must be derived from this table: computing it
# beforehand would reuse the logistic-regression matrix that is still stored
# in confusion_matrix and report that model's accuracy instead.
confusion_matrix <- table(test_data$Class, knn_model)
print(confusion_matrix)
##    knn_model
##         0     1
##   0 56851    12
##   1    28    70
# Accuracy = correctly classified rows (the diagonal) / all test rows,
# i.e. (56851 + 70) / 56961 for the matrix above.
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.99929776513755"