1. Data Loading and pre-processing

# Loading Required Libraries

library(ranger)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(data.table)
library(readr)
library(class)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Loading the Dataset

creditcard_data <- read_csv("/Users/bhavyakalra/Desktop/Data_analytics/Credit Card fruad detection/creditcard.csv", show_col_types = FALSE)
head(creditcard_data)
## # A tibble: 6 × 31
##    Time     V1      V2    V3     V4      V5      V6      V7      V8     V9
##   <dbl>  <dbl>   <dbl> <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
## 1     0 -1.36  -0.0728 2.54   1.38  -0.338   0.462   0.240   0.0987  0.364
## 2     0  1.19   0.266  0.166  0.448  0.0600 -0.0824 -0.0788  0.0851 -0.255
## 3     1 -1.36  -1.34   1.77   0.380 -0.503   1.80    0.791   0.248  -1.51 
## 4     1 -0.966 -0.185  1.79  -0.863 -0.0103  1.25    0.238   0.377  -1.39 
## 5     2 -1.16   0.878  1.55   0.403 -0.407   0.0959  0.593  -0.271   0.818
## 6     2 -0.426  0.961  1.14  -0.168  0.421  -0.0297  0.476   0.260  -0.569
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## #   V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## #   V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## #   V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
#Checking for missing values
colSums(is.na(creditcard_data))
##   Time     V1     V2     V3     V4     V5     V6     V7     V8     V9    V10 
##      0      0      0      0      0      0      0      0      0      0      0 
##    V11    V12    V13    V14    V15    V16    V17    V18    V19    V20    V21 
##      0      0      0      0      0      0      0      0      0      0      0 
##    V22    V23    V24    V25    V26    V27    V28 Amount  Class 
##      0      0      0      0      0      0      0      0      0

Here we are loading the credit card dataset and displaying the first few rows of the dataset to provide an initial overview of the data. Then we are checking if there are any missing values in the data and found out that there are no missing values, hence we can proceed with data exploration.

2. Exploratory Data Analysis

# Describing the Dataset
str(creditcard_data)
## spc_tbl_ [284,807 × 31] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Time  : num [1:284807] 0 0 1 1 2 2 4 7 7 9 ...
##  $ V1    : num [1:284807] -1.36 1.192 -1.358 -0.966 -1.158 ...
##  $ V2    : num [1:284807] -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
##  $ V3    : num [1:284807] 2.536 0.166 1.773 1.793 1.549 ...
##  $ V4    : num [1:284807] 1.378 0.448 0.38 -0.863 0.403 ...
##  $ V5    : num [1:284807] -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
##  $ V6    : num [1:284807] 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
##  $ V7    : num [1:284807] 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
##  $ V8    : num [1:284807] 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
##  $ V9    : num [1:284807] 0.364 -0.255 -1.515 -1.387 0.818 ...
##  $ V10   : num [1:284807] 0.0908 -0.167 0.2076 -0.055 0.7531 ...
##  $ V11   : num [1:284807] -0.552 1.613 0.625 -0.226 -0.823 ...
##  $ V12   : num [1:284807] -0.6178 1.0652 0.0661 0.1782 0.5382 ...
##  $ V13   : num [1:284807] -0.991 0.489 0.717 0.508 1.346 ...
##  $ V14   : num [1:284807] -0.311 -0.144 -0.166 -0.288 -1.12 ...
##  $ V15   : num [1:284807] 1.468 0.636 2.346 -0.631 0.175 ...
##  $ V16   : num [1:284807] -0.47 0.464 -2.89 -1.06 -0.451 ...
##  $ V17   : num [1:284807] 0.208 -0.115 1.11 -0.684 -0.237 ...
##  $ V18   : num [1:284807] 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
##  $ V19   : num [1:284807] 0.404 -0.146 -2.262 -1.233 0.803 ...
##  $ V20   : num [1:284807] 0.2514 -0.0691 0.525 -0.208 0.4085 ...
##  $ V21   : num [1:284807] -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
##  $ V22   : num [1:284807] 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
##  $ V23   : num [1:284807] -0.11 0.101 0.909 -0.19 -0.137 ...
##  $ V24   : num [1:284807] 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
##  $ V25   : num [1:284807] 0.129 0.167 -0.328 0.647 -0.206 ...
##  $ V26   : num [1:284807] -0.189 0.126 -0.139 -0.222 0.502 ...
##  $ V27   : num [1:284807] 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
##  $ V28   : num [1:284807] -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
##  $ Amount: num [1:284807] 149.62 2.69 378.66 123.5 69.99 ...
##  $ Class : num [1:284807] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Time = col_double(),
##   ..   V1 = col_double(),
##   ..   V2 = col_double(),
##   ..   V3 = col_double(),
##   ..   V4 = col_double(),
##   ..   V5 = col_double(),
##   ..   V6 = col_double(),
##   ..   V7 = col_double(),
##   ..   V8 = col_double(),
##   ..   V9 = col_double(),
##   ..   V10 = col_double(),
##   ..   V11 = col_double(),
##   ..   V12 = col_double(),
##   ..   V13 = col_double(),
##   ..   V14 = col_double(),
##   ..   V15 = col_double(),
##   ..   V16 = col_double(),
##   ..   V17 = col_double(),
##   ..   V18 = col_double(),
##   ..   V19 = col_double(),
##   ..   V20 = col_double(),
##   ..   V21 = col_double(),
##   ..   V22 = col_double(),
##   ..   V23 = col_double(),
##   ..   V24 = col_double(),
##   ..   V25 = col_double(),
##   ..   V26 = col_double(),
##   ..   V27 = col_double(),
##   ..   V28 = col_double(),
##   ..   Amount = col_double(),
##   ..   Class = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Calculating Summary for 'Time' and 'Amount' only
summary_metrics <- data.frame(
  Variable = c("Time", "Amount"),
  Mean = c(mean(creditcard_data$Time, na.rm = TRUE), mean(creditcard_data$Amount, na.rm = TRUE)),
  SD = c(sd(creditcard_data$Time, na.rm = TRUE), sd(creditcard_data$Amount, na.rm = TRUE)),
  Min = c(min(creditcard_data$Time, na.rm = TRUE), min(creditcard_data$Amount, na.rm = TRUE)),
  Max = c(max(creditcard_data$Time, na.rm = TRUE), max(creditcard_data$Amount, na.rm = TRUE))
)

print(summary_metrics)
##   Variable        Mean         SD Min       Max
## 1     Time 94813.85957 47488.1460   0 172792.00
## 2   Amount    88.34962   250.1201   0  25691.16

From the output, we can observe the following summary statistics for the ‘Time’ and ‘Amount’ variables:

These statistics offer insights into the typical time and amount characteristics of transactions in the dataset.

2.1 Distribution of class

# Distribution of class

distribution_of_class <- table(creditcard_data$Class)
print(distribution_of_class)
## 
##      0      1 
## 284315    492
# Plotting Distribution of Fraudulent Transactions
ggplot(creditcard_data, aes(x = factor(Class), fill = factor(Class))) +
  geom_bar() +
  labs(x = "Class", y = "Count", fill = "Class", title = "Distribution of Fraudulent Transactions") +
  scale_fill_manual(values = c("0" = "cyan", "1" = "orange"), labels = c("Genuine", "Fraudulent")) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()

The distribution of the ‘Class’ variable reveals the following:

This shows a significant class imbalance, where the majority of transactions (284,315) are labeled as non-fraudulent (Class 0), while a minority (492) are classified as fraudulent (Class 1).

2.2 Analysis of Transaction Amount

# Log Scaled Transaction Amount plot 
ggplot(creditcard_data, aes(x = Amount, fill = factor(Class))) +
  geom_density(alpha = 0.5) +
  scale_x_log10(labels = scales::dollar) +
  scale_fill_manual(values = c("0" = "purple", "1" = "orange"), labels = c("Genuine", "Fraudulent")) +
  labs(x = "Transaction Amount (Log Scale)", y = "Density", fill = "Class", title = "Transaction Amount Distribution by Class (Log Scale)") +
  theme_minimal()

Here, we can notice that most transactions have smaller amounts. This makes sense since many everyday transactions involve smaller sums of money. The tops of the graphs show where most transactions happen, and they’re at different amounts for genuine and fraudulent ones. Fraudulent transactions have two peaks: one at a lower amount and another at a higher amount, unlike genuine ones

2.3 Analysis of Time

# Faceting Transaction Time plot
ggplot(creditcard_data, aes(x = Time, fill = factor(Class))) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 100) +
  facet_wrap(~ Class, scales = "free_y", ncol = 1) +  # Facet by Class, allowing different y scales
  scale_fill_manual(values = c("0" = "green", "1" = "red"), labels = c("Genuine", "Fraudulent")) +
  labs(x = "Time (in seconds)", y = "Count", fill = "Class", title = "Transaction Time Distribution by Class") +
  theme_minimal()

From the graph we can observe that genuine transactions seem to follow a repeating pattern, possibly reflecting busier times during certain parts of the day or days of the week. The regular drops could indicate quieter times, like nighttime or weekends. However, fraudulent transactions don’t show this pattern as clearly, suggesting they might not follow the same routine as genuine ones.

3. Data Preparation

# Normalizing 'Time' and 'Amount'

time_amount <- creditcard_data[, c("Time", "Amount")]
preProcessRange <- preProcess(time_amount, method = c("center", "scale"))
data_norm <- predict(preProcessRange, time_amount)

# Combining normalized data with original data, excluding 'Time' and 'Amount' columns
creditcard_data <- cbind(creditcard_data[, !names(creditcard_data) %in% c("Time", "Amount")], data_norm)
head(creditcard_data)
##           V1          V2        V3         V4          V5          V6
## 1 -1.3598071 -0.07278117 2.5363467  1.3781552 -0.33832077  0.46238778
## 2  1.1918571  0.26615071 0.1664801  0.4481541  0.06001765 -0.08236081
## 3 -1.3583541 -1.34016307 1.7732093  0.3797796 -0.50319813  1.80049938
## 4 -0.9662717 -0.18522601 1.7929933 -0.8632913 -0.01030888  1.24720317
## 5 -1.1582331  0.87773675 1.5487178  0.4030339 -0.40719338  0.09592146
## 6 -0.4259659  0.96052304 1.1411093 -0.1682521  0.42098688 -0.02972755
##            V7          V8         V9         V10        V11         V12
## 1  0.23959855  0.09869790  0.3637870  0.09079417 -0.5515995 -0.61780086
## 2 -0.07880298  0.08510165 -0.2554251 -0.16697441  1.6127267  1.06523531
## 3  0.79146096  0.24767579 -1.5146543  0.20764287  0.6245015  0.06608369
## 4  0.23760894  0.37743587 -1.3870241 -0.05495192 -0.2264873  0.17822823
## 5  0.59294075 -0.27053268  0.8177393  0.75307443 -0.8228429  0.53819555
## 6  0.47620095  0.26031433 -0.5686714 -0.37140720  1.3412620  0.35989384
##          V13        V14        V15        V16         V17         V18
## 1 -0.9913898 -0.3111694  1.4681770 -0.4704005  0.20797124  0.02579058
## 2  0.4890950 -0.1437723  0.6355581  0.4639170 -0.11480466 -0.18336127
## 3  0.7172927 -0.1659459  2.3458649 -2.8900832  1.10996938 -0.12135931
## 4  0.5077569 -0.2879237 -0.6314181 -1.0596472 -0.68409279  1.96577500
## 5  1.3458516 -1.1196698  0.1751211 -0.4514492 -0.23703324 -0.03819479
## 6 -0.3580907 -0.1371337  0.5176168  0.4017259 -0.05813282  0.06865315
##           V19         V20          V21          V22         V23         V24
## 1  0.40399296  0.25141210 -0.018306778  0.277837576 -0.11047391  0.06692807
## 2 -0.14578304 -0.06908314 -0.225775248 -0.638671953  0.10128802 -0.33984648
## 3 -2.26185710  0.52497973  0.247998153  0.771679402  0.90941226 -0.68928096
## 4 -1.23262197 -0.20803778 -0.108300452  0.005273597 -0.19032052 -1.17557533
## 5  0.80348692  0.40854236 -0.009430697  0.798278495 -0.13745808  0.14126698
## 6 -0.03319379  0.08496767 -0.208253515 -0.559824796 -0.02639767 -0.37142658
##          V25        V26          V27         V28 Class      Time      Amount
## 1  0.1285394 -0.1891148  0.133558377 -0.02105305     0 -1.996580  0.24496383
## 2  0.1671704  0.1258945 -0.008983099  0.01472417     0 -1.996580 -0.34247394
## 3 -0.3276418 -0.1390966 -0.055352794 -0.05975184     0 -1.996558  1.16068389
## 4  0.6473760 -0.2219288  0.062722849  0.06145763     0 -1.996558  0.14053401
## 5 -0.2060096  0.5022922  0.219422230  0.21515315     0 -1.996537 -0.07340321
## 6 -0.2327938  0.1059148  0.253844225  0.08108026     0 -1.996537 -0.33855582
# Splitting the data into training and test sets (e.g., 60% training, 40% test)
set.seed(147)
train_index <- createDataPartition(creditcard_data$Class, p = 0.6, list = FALSE)
train_data <- creditcard_data[train_index, ]
test_data <- creditcard_data[-train_index, ]

# Counting the number of rows in the training and test sets
nrow(train_data)
## [1] 170885
nrow(test_data)
## [1] 113922

Here we are normalizing the data and splitting the data set into training data and test data in the ratio of 60:40

library(ROSE)
## Loaded ROSE 0.0-4
# Resampling
balanced_training_set <- ovun.sample(Class ~ ., data = train_data, p=0.5, seed = 151, method="both")$data

# Checking the number of each class
table(balanced_training_set$Class)
## 
##     0     1 
## 85425 85460
# Plotting the class distribution
options(repr.plot.width=12, repr.plot.height=7)
ggplot(balanced_training_set, aes(x = factor(Class), fill = factor(Class))) +
  geom_bar() +
  labs(x = "Class", y = "Count", fill = "Class", title = "Distribution of Classes") +
  scale_fill_manual(values = c("0" = "aquamarine", "1" = "brown1"), labels = c("Genuine", "Fraudulent")) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()

Since we have the class imbalance we are doing re-sampling of data in order to avoid overfitting and now after resampling it is equally distributed so we can proceed with the predictive model analysis.

4. Predictive Model Analysis

4.1 Logistic Regression Model

# Fitting the logistic regression model
Logistic_Model <- glm(Class ~ ., family = binomial(), data = balanced_training_set)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Checking summary of the model
summary(Logistic_Model)
## 
## Call:
## glm(formula = Class ~ ., family = binomial(), data = balanced_training_set)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.64628    0.14017 -33.148  < 2e-16 ***
## V1           0.52999    0.03672  14.431  < 2e-16 ***
## V2           0.92761    0.09866   9.402  < 2e-16 ***
## V3           0.23498    0.03088   7.609 2.76e-14 ***
## V4           1.37229    0.08043  17.062  < 2e-16 ***
## V5           1.13758    0.08116  14.017  < 2e-16 ***
## V6          -0.60286    0.03236 -18.630  < 2e-16 ***
## V7          -0.22801    0.07396  -3.083 0.002050 ** 
## V8          -0.45950    0.02031 -22.629  < 2e-16 ***
## V9           0.63743    0.12386   5.146 2.65e-07 ***
## V10         -1.14584    0.09927 -11.542  < 2e-16 ***
## V11          0.72279    0.03417  21.150  < 2e-16 ***
## V12         -0.92461    0.03967 -23.305  < 2e-16 ***
## V13         -0.67582    0.03758 -17.983  < 2e-16 ***
## V14         -1.26038    0.03292 -38.289  < 2e-16 ***
## V15          0.07845    0.02261   3.470 0.000520 ***
## V16          0.74228    0.17410   4.264 2.01e-05 ***
## V17         -1.11485    0.05196 -21.456  < 2e-16 ***
## V18         -1.48833    0.16154  -9.213  < 2e-16 ***
## V19          1.07336    0.09362  11.465  < 2e-16 ***
## V20         -0.24028    0.06936  -3.464 0.000532 ***
## V21          0.52577    0.04624  11.370  < 2e-16 ***
## V22          1.59434    0.07513  21.222  < 2e-16 ***
## V23          0.26856    0.06254   4.294 1.75e-05 ***
## V24         -0.12207    0.02583  -4.725 2.30e-06 ***
## V25          0.39767    0.03750  10.604  < 2e-16 ***
## V26         -0.23380    0.02893  -8.081 6.41e-16 ***
## V27          0.26071    0.04083   6.385 1.72e-10 ***
## V28          0.58517    0.06287   9.307  < 2e-16 ***
## Time        -0.38040    0.01919 -19.821  < 2e-16 ***
## Amount       2.04824    0.17358  11.800  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 236897  on 170884  degrees of freedom
## Residual deviance:  43884  on 170854  degrees of freedom
## AIC: 43946
## 
## Number of Fisher Scoring iterations: 16

Here the logistic regression model has been fitted to the balanced training set. It shows the coefficients for each predictor variable, indicating their impact on the likelihood of a transaction being fraudulent (Class = 1). The “Null deviance” and “Residual deviance” represent the goodness of fit of the model. A lower residual deviance suggests a better fit to the data. The “AIC” value is the Akaike Information Criterion, which measures the quality of the model while penalizing complexity.

# ROC Curve to assess the performance of the model

library(pROC)
lr.predict <- predict(Logistic_Model,test_data, probability = TRUE)
roc_glm = roc(test_data$Class, lr.predict, plot = TRUE, col = "blue")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Accuracy and Confusion Matrix
auc_glm <- auc(roc_glm)
predictions <- predict(Logistic_Model, newdata = test_data, type = "response")
threshold <- 0.5
binary_predictions <- ifelse(predictions > threshold, 1, 0)
confusion_matrix_glm <- table(test_data$Class, binary_predictions)
print(confusion_matrix_glm)
##    binary_predictions
##          0      1
##   0 110901   2840
##   1     15    166

The confusion matrix displays the performance of the logistic regression model. It shows how many instances were correctly classified as either genuine (0) or fraudulent (1) transactions. In this case, the model correctly classified 110,901 genuine transactions as genuine and 166 fraudulent transactions as fraudulent. However, it misclassified 2,840 genuine transactions as fraudulent and 15 fraudulent transactions as genuine.

# Print AUC and accuracy
accuracy_glm <- sum(diag(confusion_matrix_glm)) / sum(confusion_matrix_glm)
cat("AUC:", auc(roc_glm), "\n")
## AUC: 0.9765142
cat("Accuracy:", accuracy_glm, "\n")
## Accuracy: 0.974939

The AUC (Area Under the Curve) value for the logistic regression model is 0.976, indicating strong performance in distinguishing between genuine and fraudulent transactions. The accuracy of the model, which measures the proportion of correctly classified instances, is 97.5%. This suggests that the model is highly accurate in identifying both genuine and fraudulent transactions.

4.2 Fitting a Decision Tree Model

library(rpart)
library(rpart.plot)
classifier <- rpart(Class ~ ., data = balanced_training_set, method = "class")
rpart.plot(classifier)

# Plotting ROC Curve
pred_dt <- predict(classifier, newdata = test_data, type = "prob")
dt_fg <- pred_dt[test_data$Class == 1, "1"] 
dt_bg <- pred_dt[test_data$Class == 0, "1"] 
true_positives <- rep(1, length(dt_fg))

true_negatives <- rep(0, length(dt_bg))

response <- c(true_positives, true_negatives)
predictor <- c(dt_fg, dt_bg)

roc_dt <- roc(response, predictor)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_dt, col = "blue", main = "ROC Curve")

# Confusion Matrix

threshold <- 0.5

binary_predictions <- c(rep(1, length(dt_fg)), rep(0, length(dt_bg)))
binary_predictions <- ifelse(predictor > threshold, 1, 0)

# Creating confusion matrix
confusion_matrix_dt <- table(True_Labels = response, Predicted_Labels = binary_predictions)

# Printing confusion matrix
print(confusion_matrix_dt)
##            Predicted_Labels
## True_Labels      0      1
##           0 110608   3133
##           1     21    160

In this specific scenario, the confusion matrix shows that our model correctly identified a large number of genuine transactions (110,608) and also captured some fraudulent transactions (160). However, it also misclassified some genuine transactions as fraudulent (3133 false positives) and failed to detect some fraudulent transactions (21 false negatives).

# Accuracy and AUC

# Calculating AUC
auc_dt <- auc(roc_dt)

# Calculating accuracy
accuracy_dt <- sum(diag(confusion_matrix_dt)) / sum(confusion_matrix_dt)

# Printing AUC and accuracy
cat("AUC:", auc_dt, "\n")
## AUC: 0.9443683
cat("Accuracy:", accuracy_dt, "\n")
## Accuracy: 0.9723144

The AUC (Area Under the Curve) value is approximately 0.944. Additionally, the accuracy of our model, which measures the proportion of correct predictions, is approximately 0.972. These metrics demonstrate that our decision tree model performs well in both identifying fraudulent transactions and making accurate predictions overall.

4.3 Fitting a KNN model

knn_model <- knn(train = train_data[, -ncol(train_data)], 
                 test = test_data[, -ncol(test_data)], 
                 cl = train_data$Class, 
                 k = 10)
# Creating ROC curve
roc_knn <- roc(test_data$Class, as.numeric(knn_model))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plotting ROC curve
plot(roc_knn, col = "blue", main = "ROC Curve for KNN Model")

# Confusion Matrix
confusion_matrix_knn <- table(test_data$Class, knn_model)
print(confusion_matrix_knn)
##    knn_model
##          0      1
##   0 113727     14
##   1     36    145

This confusion matrix presents the performance of our KNN (K-Nearest Neighbors) model in classifying transactions as either genuine or fraudulent. It indicates that out of 113741 genuine transactions (Class 0), 113727 were correctly classified, while 14 were misclassified as fraudulent. Similarly, out of 181 fraudulent transactions (Class 1), 145 were correctly classified, but 36 were misclassified as genuine.

# Accuracy and AUC

# Calculating AUC
auc_knn <- roc(test_data$Class, as.numeric(knn_model))$auc
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste("AUC:", auc_knn))
## [1] "AUC: 0.900490942856944"
# Calculating accuracy
accuracy_knn <- sum(diag(confusion_matrix_knn)) / sum(confusion_matrix_knn)
print(paste("Accuracy:", accuracy_knn))
## [1] "Accuracy: 0.999561103210969"

The calculated AUC (Area Under the Curve) for our KNN (K-Nearest Neighbors) model is approximately 0.900, indicating that the model has good discriminatory power in distinguishing between genuine and fraudulent transactions.

Additionally, the accuracy of the KNN model is approximately 99.96%, suggesting that it performs exceptionally well in correctly classifying transactions into their respective classes. This high accuracy further underscores the effectiveness of the model in fraud detection.

5. Comparison of Models and Evaluation

Here we are plotting ROC curves for all three models in single graph for comparison and bar plot for area under curves for all three models.

# Plot ROC curves
legend_labels <- c("KNN", "Decision Tree", "Logistic Regression (GLM)")
plot(roc_knn, col = "orange", lwd = 2, main = "ROC Curves Comparison", xlab = "False Positive Rate", ylab = "True Positive Rate")
lines(roc_dt, col = "blue", lwd = 2)
lines(roc_glm, col = "green", lwd = 2)
legend("bottomright", legend = legend_labels, col = c("orange", "blue", "green"), lwd = 2)

# Create a dataframe with model names and AUC values
auc_values <- c(auc_knn, auc_dt, auc_glm)
model_names <- c("KNN", "Decision Tree", "Logistic Regression (GLM)")
auc_df <- data.frame(Model = model_names, AUC_ROC = auc_values)

# Plot the bar plot
barplot(auc_df$AUC_ROC, names.arg = auc_df$Model, col = c("coral", "cyan3", "deeppink2"), 
        main = "AUC_ROC", ylab = "AUROC Value", las = 2, ylim = c(0, 1))

# Creating the bar plot for accuracy
model_names <- c("Logistic Regression", "Decision Tree", "KNN")
accuracy_values <- c(accuracy_glm, accuracy_dt, accuracy_knn)  # Replace with your actual accuracy values
bar_colors <- c("skyblue", "lightgreen", "lightcoral")
accuracy_df <- data.frame(Model = model_names, Accuracy = accuracy_values)

barplot(accuracy_values, names.arg = model_names, 
        col = bar_colors, main = "Model Accuracy Comparison",
        ylab = "Accuracy", ylim = c(0, 1))

6. Conclusion

In this analysis, we aimed to find the most effective models for handling this credit card fraud detection data set. Initially we did some exploratory data analysis and found that our data set is highly imbalanced which can lead to over-fitting so we did re-sampling in data preparation step. Then, We tested four machine learning models: Logistic Regression, Decision Trees and k-nearest neighbors. The results showed that KNN performed the best, achieving accuracy of 99.96%. Logistic Regression came next, closely trailing with 97.49% and Decision Tree had the lowest performance, with 97.23%.