# Loading Required Libraries
library(ranger)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(data.table)
library(readr)
library(class)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Loading the Dataset
creditcard_data <- read_csv("/Users/bhavyakalra/Desktop/Data_analytics/Credit Card fruad detection/creditcard.csv", show_col_types = FALSE)
head(creditcard_data)
## # A tibble: 6 × 31
## Time V1 V2 V3 V4 V5 V6 V7 V8 V9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -1.36 -0.0728 2.54 1.38 -0.338 0.462 0.240 0.0987 0.364
## 2 0 1.19 0.266 0.166 0.448 0.0600 -0.0824 -0.0788 0.0851 -0.255
## 3 1 -1.36 -1.34 1.77 0.380 -0.503 1.80 0.791 0.248 -1.51
## 4 1 -0.966 -0.185 1.79 -0.863 -0.0103 1.25 0.238 0.377 -1.39
## 5 2 -1.16 0.878 1.55 0.403 -0.407 0.0959 0.593 -0.271 0.818
## 6 2 -0.426 0.961 1.14 -0.168 0.421 -0.0297 0.476 0.260 -0.569
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## # V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## # V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## # V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
# Checking for missing values
colSums(is.na(creditcard_data))
## Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 0 0 0 0 0 0 0 0 0 0 0
## V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 0 0 0 0 0 0 0 0 0 0 0
## V22 V23 V24 V25 V26 V27 V28 Amount Class
## 0 0 0 0 0 0 0 0 0
Here we load the credit card dataset and display the first few rows to get an initial overview of the data. We then check for missing values and find none, so we can proceed directly to data exploration.
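The same conclusion can be reached with a one-line base R check; this is just an equivalent sketch of the colSums() call above.
# TRUE would indicate at least one missing value anywhere in the data
anyNA(creditcard_data)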
# Describing the Dataset
str(creditcard_data)
## spc_tbl_ [284,807 × 31] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Time : num [1:284807] 0 0 1 1 2 2 4 7 7 9 ...
## $ V1 : num [1:284807] -1.36 1.192 -1.358 -0.966 -1.158 ...
## $ V2 : num [1:284807] -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
## $ V3 : num [1:284807] 2.536 0.166 1.773 1.793 1.549 ...
## $ V4 : num [1:284807] 1.378 0.448 0.38 -0.863 0.403 ...
## $ V5 : num [1:284807] -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
## $ V6 : num [1:284807] 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
## $ V7 : num [1:284807] 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
## $ V8 : num [1:284807] 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
## $ V9 : num [1:284807] 0.364 -0.255 -1.515 -1.387 0.818 ...
## $ V10 : num [1:284807] 0.0908 -0.167 0.2076 -0.055 0.7531 ...
## $ V11 : num [1:284807] -0.552 1.613 0.625 -0.226 -0.823 ...
## $ V12 : num [1:284807] -0.6178 1.0652 0.0661 0.1782 0.5382 ...
## $ V13 : num [1:284807] -0.991 0.489 0.717 0.508 1.346 ...
## $ V14 : num [1:284807] -0.311 -0.144 -0.166 -0.288 -1.12 ...
## $ V15 : num [1:284807] 1.468 0.636 2.346 -0.631 0.175 ...
## $ V16 : num [1:284807] -0.47 0.464 -2.89 -1.06 -0.451 ...
## $ V17 : num [1:284807] 0.208 -0.115 1.11 -0.684 -0.237 ...
## $ V18 : num [1:284807] 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
## $ V19 : num [1:284807] 0.404 -0.146 -2.262 -1.233 0.803 ...
## $ V20 : num [1:284807] 0.2514 -0.0691 0.525 -0.208 0.4085 ...
## $ V21 : num [1:284807] -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
## $ V22 : num [1:284807] 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
## $ V23 : num [1:284807] -0.11 0.101 0.909 -0.19 -0.137 ...
## $ V24 : num [1:284807] 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
## $ V25 : num [1:284807] 0.129 0.167 -0.328 0.647 -0.206 ...
## $ V26 : num [1:284807] -0.189 0.126 -0.139 -0.222 0.502 ...
## $ V27 : num [1:284807] 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
## $ V28 : num [1:284807] -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
## $ Amount: num [1:284807] 149.62 2.69 378.66 123.5 69.99 ...
## $ Class : num [1:284807] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. Time = col_double(),
## .. V1 = col_double(),
## .. V2 = col_double(),
## .. V3 = col_double(),
## .. V4 = col_double(),
## .. V5 = col_double(),
## .. V6 = col_double(),
## .. V7 = col_double(),
## .. V8 = col_double(),
## .. V9 = col_double(),
## .. V10 = col_double(),
## .. V11 = col_double(),
## .. V12 = col_double(),
## .. V13 = col_double(),
## .. V14 = col_double(),
## .. V15 = col_double(),
## .. V16 = col_double(),
## .. V17 = col_double(),
## .. V18 = col_double(),
## .. V19 = col_double(),
## .. V20 = col_double(),
## .. V21 = col_double(),
## .. V22 = col_double(),
## .. V23 = col_double(),
## .. V24 = col_double(),
## .. V25 = col_double(),
## .. V26 = col_double(),
## .. V27 = col_double(),
## .. V28 = col_double(),
## .. Amount = col_double(),
## .. Class = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Calculating Summary for 'Time' and 'Amount' only
summary_metrics <- data.frame(
Variable = c("Time", "Amount"),
Mean = c(mean(creditcard_data$Time, na.rm = TRUE), mean(creditcard_data$Amount, na.rm = TRUE)),
SD = c(sd(creditcard_data$Time, na.rm = TRUE), sd(creditcard_data$Amount, na.rm = TRUE)),
Min = c(min(creditcard_data$Time, na.rm = TRUE), min(creditcard_data$Amount, na.rm = TRUE)),
Max = c(max(creditcard_data$Time, na.rm = TRUE), max(creditcard_data$Amount, na.rm = TRUE))
)
print(summary_metrics)
## Variable Mean SD Min Max
## 1 Time 94813.85957 47488.1460 0 172792.00
## 2 Amount 88.34962 250.1201 0 25691.16
From the output, we can observe the summary statistics for the ‘Time’ and ‘Amount’ variables: ‘Time’ ranges from 0 to 172,792 seconds with a mean of about 94,814 (SD ≈ 47,488), while ‘Amount’ ranges from $0 to $25,691.16 with a mean of about $88.35 (SD ≈ $250.12). These statistics offer insight into the typical timing and monetary characteristics of transactions in the dataset.
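The same table can be produced more compactly with dplyr; the following is an equivalent sketch, not part of the original pipeline.
# Reproducing the summary above with summarise() and across()
creditcard_data %>%
  summarise(across(c(Time, Amount),
                   list(Mean = ~ mean(.x, na.rm = TRUE),
                        SD = ~ sd(.x, na.rm = TRUE),
                        Min = ~ min(.x, na.rm = TRUE),
                        Max = ~ max(.x, na.rm = TRUE))))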
2.1 Distribution of Class
# Distribution of class
distribution_of_class <- table(creditcard_data$Class)
print(distribution_of_class)
##
## 0 1
## 284315 492
# Plotting Distribution of Fraudulent Transactions
ggplot(creditcard_data, aes(x = factor(Class), fill = factor(Class))) +
geom_bar() +
labs(x = "Class", y = "Count", fill = "Class", title = "Distribution of Fraudulent Transactions") +
scale_fill_manual(values = c("0" = "cyan", "1" = "orange"), labels = c("Genuine", "Fraudulent")) +
scale_y_continuous(labels = scales::comma) +
theme_minimal()
The distribution of the ‘Class’ variable reveals a severe class imbalance: the vast majority of transactions (284,315) are labeled non-fraudulent (Class 0), while only a small minority (492) are labeled fraudulent (Class 1).
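To put the imbalance in relative terms, a quick sketch of the class proportions using the counts printed above:
# Fraudulent transactions make up roughly 0.17% of the data: 492 / (284315 + 492)
round(prop.table(table(creditcard_data$Class)) * 100, 3)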
2.2 Analysis of Transaction Amount
# Log Scaled Transaction Amount plot
ggplot(creditcard_data, aes(x = Amount, fill = factor(Class))) +
geom_density(alpha = 0.5) +
scale_x_log10(labels = scales::dollar) +
scale_fill_manual(values = c("0" = "purple", "1" = "orange"), labels = c("Genuine", "Fraudulent")) +
labs(x = "Transaction Amount (Log Scale)", y = "Density", fill = "Class", title = "Transaction Amount Distribution by Class (Log Scale)") +
theme_minimal()
Here we can see that most transactions involve smaller amounts, which makes sense since everyday purchases tend to be small. The peaks of the density curves mark where most transactions fall, and they sit at different amounts for genuine and fraudulent transactions. Notably, the fraudulent distribution is bimodal, with one peak at a lower amount and another at a higher amount, unlike the genuine distribution.
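One way to back the visual impression with numbers is to summarise the raw amounts by class; a sketch (using the un-normalized ‘Amount’ values, so it belongs before the normalization step below):
# Quartiles of the transaction amount by class, quantifying the density plot
creditcard_data %>%
  group_by(Class) %>%
  summarise(Q1 = quantile(Amount, 0.25),
            Median = median(Amount),
            Q3 = quantile(Amount, 0.75),
            Max = max(Amount))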
2.3 Analysis of Time
# Faceting Transaction Time plot
ggplot(creditcard_data, aes(x = Time, fill = factor(Class))) +
geom_histogram(position = "identity", alpha = 0.5, bins = 100) +
facet_wrap(~ Class, scales = "free_y", ncol = 1) + # Facet by Class, allowing different y scales
scale_fill_manual(values = c("0" = "green", "1" = "red"), labels = c("Genuine", "Fraudulent")) +
labs(x = "Time (in seconds)", y = "Count", fill = "Class", title = "Transaction Time Distribution by Class") +
theme_minimal()
From the graph, we can observe that genuine transactions follow a repeating pattern, likely reflecting busier periods during certain parts of the day; the regular dips plausibly correspond to quieter times such as nighttime. Fraudulent transactions do not show this pattern nearly as clearly, suggesting they do not follow the same routine as genuine ones.
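Since ‘Time’ is recorded in seconds and the dataset spans roughly two days, the daily cycle can be made explicit by folding the timestamps into an hour-of-day value. The sketch below assumes that interpretation of the ‘Time’ column (seconds elapsed since the first transaction) and uses the raw values, so it belongs before the normalization step that follows.
# Folding Time (seconds since the first transaction) into hour of day
creditcard_data %>%
  mutate(Hour = (Time %% 86400) / 3600) %>%
  ggplot(aes(x = Hour, fill = factor(Class))) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 24) +
  facet_wrap(~ Class, scales = "free_y", ncol = 1) +
  scale_fill_manual(values = c("0" = "green", "1" = "red"), labels = c("Genuine", "Fraudulent")) +
  labs(x = "Hour of Day (folded)", y = "Count", fill = "Class") +
  theme_minimal()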
# Normalizing 'Time' and 'Amount'
time_amount <- creditcard_data[, c("Time", "Amount")]
preProcessRange <- preProcess(time_amount, method = c("center", "scale"))
data_norm <- predict(preProcessRange, time_amount)
# Combining normalized data with original data, excluding 'Time' and 'Amount' columns
creditcard_data <- cbind(creditcard_data[, !names(creditcard_data) %in% c("Time", "Amount")], data_norm)
head(creditcard_data)
## V1 V2 V3 V4 V5 V6
## 1 -1.3598071 -0.07278117 2.5363467 1.3781552 -0.33832077 0.46238778
## 2 1.1918571 0.26615071 0.1664801 0.4481541 0.06001765 -0.08236081
## 3 -1.3583541 -1.34016307 1.7732093 0.3797796 -0.50319813 1.80049938
## 4 -0.9662717 -0.18522601 1.7929933 -0.8632913 -0.01030888 1.24720317
## 5 -1.1582331 0.87773675 1.5487178 0.4030339 -0.40719338 0.09592146
## 6 -0.4259659 0.96052304 1.1411093 -0.1682521 0.42098688 -0.02972755
## V7 V8 V9 V10 V11 V12
## 1 0.23959855 0.09869790 0.3637870 0.09079417 -0.5515995 -0.61780086
## 2 -0.07880298 0.08510165 -0.2554251 -0.16697441 1.6127267 1.06523531
## 3 0.79146096 0.24767579 -1.5146543 0.20764287 0.6245015 0.06608369
## 4 0.23760894 0.37743587 -1.3870241 -0.05495192 -0.2264873 0.17822823
## 5 0.59294075 -0.27053268 0.8177393 0.75307443 -0.8228429 0.53819555
## 6 0.47620095 0.26031433 -0.5686714 -0.37140720 1.3412620 0.35989384
## V13 V14 V15 V16 V17 V18
## 1 -0.9913898 -0.3111694 1.4681770 -0.4704005 0.20797124 0.02579058
## 2 0.4890950 -0.1437723 0.6355581 0.4639170 -0.11480466 -0.18336127
## 3 0.7172927 -0.1659459 2.3458649 -2.8900832 1.10996938 -0.12135931
## 4 0.5077569 -0.2879237 -0.6314181 -1.0596472 -0.68409279 1.96577500
## 5 1.3458516 -1.1196698 0.1751211 -0.4514492 -0.23703324 -0.03819479
## 6 -0.3580907 -0.1371337 0.5176168 0.4017259 -0.05813282 0.06865315
## V19 V20 V21 V22 V23 V24
## 1 0.40399296 0.25141210 -0.018306778 0.277837576 -0.11047391 0.06692807
## 2 -0.14578304 -0.06908314 -0.225775248 -0.638671953 0.10128802 -0.33984648
## 3 -2.26185710 0.52497973 0.247998153 0.771679402 0.90941226 -0.68928096
## 4 -1.23262197 -0.20803778 -0.108300452 0.005273597 -0.19032052 -1.17557533
## 5 0.80348692 0.40854236 -0.009430697 0.798278495 -0.13745808 0.14126698
## 6 -0.03319379 0.08496767 -0.208253515 -0.559824796 -0.02639767 -0.37142658
## V25 V26 V27 V28 Class Time Amount
## 1 0.1285394 -0.1891148 0.133558377 -0.02105305 0 -1.996580 0.24496383
## 2 0.1671704 0.1258945 -0.008983099 0.01472417 0 -1.996580 -0.34247394
## 3 -0.3276418 -0.1390966 -0.055352794 -0.05975184 0 -1.996558 1.16068389
## 4 0.6473760 -0.2219288 0.062722849 0.06145763 0 -1.996558 0.14053401
## 5 -0.2060096 0.5022922 0.219422230 0.21515315 0 -1.996537 -0.07340321
## 6 -0.2327938 0.1059148 0.253844225 0.08108026 0 -1.996537 -0.33855582
# Splitting the data into training and test sets (e.g., 60% training, 40% test)
set.seed(147)
train_index <- createDataPartition(creditcard_data$Class, p = 0.6, list = FALSE)
train_data <- creditcard_data[train_index, ]
test_data <- creditcard_data[-train_index, ]
# Counting the number of rows in the training and test sets
nrow(train_data)
## [1] 170885
nrow(test_data)
## [1] 113922
Here we have normalized the ‘Time’ and ‘Amount’ features and split the dataset into training and test sets in a 60:40 ratio.
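As a sanity check on both steps (a sketch): after centering and scaling, ‘Time’ and ‘Amount’ should have mean ≈ 0 and SD ≈ 1, and the partition should roughly preserve the original class ratio.
# Verifying the normalization and the class balance of the split
sapply(creditcard_data[, c("Time", "Amount")], function(x) c(Mean = mean(x), SD = sd(x)))
round(prop.table(table(train_data$Class)), 5)
round(prop.table(table(test_data$Class)), 5)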
library(ROSE)
## Loaded ROSE 0.0-4
# Resampling
balanced_training_set <- ovun.sample(Class ~ ., data = train_data, p=0.5, seed = 151, method="both")$data
# Checking the number of each class
table(balanced_training_set$Class)
##
## 0 1
## 85425 85460
# Plotting the class distribution
options(repr.plot.width=12, repr.plot.height=7)
ggplot(balanced_training_set, aes(x = factor(Class), fill = factor(Class))) +
geom_bar() +
labs(x = "Class", y = "Count", fill = "Class", title = "Distribution of Classes") +
scale_fill_manual(values = c("0" = "aquamarine", "1" = "brown1"), labels = c("Genuine", "Fraudulent")) +
scale_y_continuous(labels = scales::comma) +
theme_minimal()
Since the classes are heavily imbalanced, we resample the training data so that models are not biased toward simply predicting the majority class. After resampling, the two classes are roughly equally represented, and we can proceed with the predictive modeling.
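For reference, ovun.sample() also supports pure over-sampling and pure under-sampling; the sketch below (same formula and seed conventions as above) shows the alternatives that could be compared.
# method = "over" duplicates minority cases; method = "under" discards majority cases
over_set <- ovun.sample(Class ~ ., data = train_data, method = "over", p = 0.5, seed = 151)$data
under_set <- ovun.sample(Class ~ ., data = train_data, method = "under", p = 0.5, seed = 151)$data
table(over_set$Class)
table(under_set$Class)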
4.1 Logistic Regression Model
# Fitting the logistic regression model
Logistic_Model <- glm(Class ~ ., family = binomial(), data = balanced_training_set)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Checking summary of the model
summary(Logistic_Model)
##
## Call:
## glm(formula = Class ~ ., family = binomial(), data = balanced_training_set)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.64628 0.14017 -33.148 < 2e-16 ***
## V1 0.52999 0.03672 14.431 < 2e-16 ***
## V2 0.92761 0.09866 9.402 < 2e-16 ***
## V3 0.23498 0.03088 7.609 2.76e-14 ***
## V4 1.37229 0.08043 17.062 < 2e-16 ***
## V5 1.13758 0.08116 14.017 < 2e-16 ***
## V6 -0.60286 0.03236 -18.630 < 2e-16 ***
## V7 -0.22801 0.07396 -3.083 0.002050 **
## V8 -0.45950 0.02031 -22.629 < 2e-16 ***
## V9 0.63743 0.12386 5.146 2.65e-07 ***
## V10 -1.14584 0.09927 -11.542 < 2e-16 ***
## V11 0.72279 0.03417 21.150 < 2e-16 ***
## V12 -0.92461 0.03967 -23.305 < 2e-16 ***
## V13 -0.67582 0.03758 -17.983 < 2e-16 ***
## V14 -1.26038 0.03292 -38.289 < 2e-16 ***
## V15 0.07845 0.02261 3.470 0.000520 ***
## V16 0.74228 0.17410 4.264 2.01e-05 ***
## V17 -1.11485 0.05196 -21.456 < 2e-16 ***
## V18 -1.48833 0.16154 -9.213 < 2e-16 ***
## V19 1.07336 0.09362 11.465 < 2e-16 ***
## V20 -0.24028 0.06936 -3.464 0.000532 ***
## V21 0.52577 0.04624 11.370 < 2e-16 ***
## V22 1.59434 0.07513 21.222 < 2e-16 ***
## V23 0.26856 0.06254 4.294 1.75e-05 ***
## V24 -0.12207 0.02583 -4.725 2.30e-06 ***
## V25 0.39767 0.03750 10.604 < 2e-16 ***
## V26 -0.23380 0.02893 -8.081 6.41e-16 ***
## V27 0.26071 0.04083 6.385 1.72e-10 ***
## V28 0.58517 0.06287 9.307 < 2e-16 ***
## Time -0.38040 0.01919 -19.821 < 2e-16 ***
## Amount 2.04824 0.17358 11.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 236897 on 170884 degrees of freedom
## Residual deviance: 43884 on 170854 degrees of freedom
## AIC: 43946
##
## Number of Fisher Scoring iterations: 16
Here the logistic regression model has been fitted to the balanced training set. It shows the coefficients for each predictor variable, indicating their impact on the likelihood of a transaction being fraudulent (Class = 1). The “Null deviance” and “Residual deviance” represent the goodness of fit of the model. A lower residual deviance suggests a better fit to the data. The “AIC” value is the Akaike Information Criterion, which measures the quality of the model while penalizing complexity.
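To condense the two deviances into a single goodness-of-fit number, one option is McFadden's pseudo-R²; a quick sketch using the quantities stored on the fitted model:
# McFadden's pseudo-R^2: 1 - residual deviance / null deviance
pseudo_r2 <- 1 - Logistic_Model$deviance / Logistic_Model$null.deviance
pseudo_r2  # 1 - 43884/236897, approximately 0.81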
# ROC Curve to assess the performance of the model
library(pROC)
lr.predict <- predict(Logistic_Model, test_data, type = "response")
roc_glm = roc(test_data$Class, lr.predict, plot = TRUE, col = "blue")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Accuracy and Confusion Matrix
auc_glm <- auc(roc_glm)
predictions <- predict(Logistic_Model, newdata = test_data, type = "response")
threshold <- 0.5
binary_predictions <- ifelse(predictions > threshold, 1, 0)
confusion_matrix_glm <- table(test_data$Class, binary_predictions)
print(confusion_matrix_glm)
## binary_predictions
## 0 1
## 0 110901 2840
## 1 15 166
The confusion matrix displays the performance of the logistic regression model, showing how many instances were correctly classified as genuine (0) or fraudulent (1). Here the model correctly classified 110,901 genuine transactions and 166 fraudulent transactions, but it misclassified 2,840 genuine transactions as fraudulent (false positives) and 15 fraudulent transactions as genuine (false negatives).
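Because fraud is the rare class, precision and recall on Class 1 are more informative than raw accuracy here. The prf() helper below is a small sketch, not part of the original analysis, that derives them from a 2x2 table laid out as rows = truth, columns = prediction; it is reused for the other models further down.
# Precision, recall and F1 for the positive ("1") class from a 2x2 table
prf <- function(cm) {
  tp <- cm["1", "1"]; fp <- cm["0", "1"]; fn <- cm["1", "0"]
  precision <- tp / (tp + fp)
  recall <- tp / (tp + fn)
  c(Precision = precision, Recall = recall,
    F1 = 2 * precision * recall / (precision + recall))
}
prf(confusion_matrix_glm)  # recall ≈ 166/181 ≈ 0.917; precision ≈ 166/3006 ≈ 0.055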
# Print AUC and accuracy
accuracy_glm <- sum(diag(confusion_matrix_glm)) / sum(confusion_matrix_glm)
cat("AUC:", auc(roc_glm), "\n")
## AUC: 0.9765142
cat("Accuracy:", accuracy_glm, "\n")
## Accuracy: 0.974939
The AUC (Area Under the Curve) for the logistic regression model is approximately 0.977, indicating a strong ability to distinguish between genuine and fraudulent transactions. The model's accuracy, the proportion of correctly classified instances, is 97.5%. Given the heavy class imbalance, however, accuracy should be read together with the confusion matrix above rather than on its own.
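For context on the 97.5% figure, compare it against the trivial baseline of labeling every transaction genuine; a one-line sketch:
# Baseline accuracy of an all-genuine classifier on the test set
sum(test_data$Class == 0) / nrow(test_data)  # 113741 / 113922, approximately 0.998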
4.2 Fitting a Decision Tree Model
library(rpart)
library(rpart.plot)
classifier <- rpart(Class ~ ., data = balanced_training_set, method = "class")
rpart.plot(classifier)
# Plotting ROC Curve
# Predicted probability of fraud for each test observation
pred_dt <- predict(classifier, newdata = test_data, type = "prob")
# Scores for the actual fraud (foreground) and genuine (background) cases
dt_fg <- pred_dt[test_data$Class == 1, "1"]
dt_bg <- pred_dt[test_data$Class == 0, "1"]
# Reassemble the true labels and scores in the same (fraud-first) order
response <- c(rep(1, length(dt_fg)), rep(0, length(dt_bg)))
predictor <- c(dt_fg, dt_bg)
roc_dt <- roc(response, predictor)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_dt, col = "blue", main = "ROC Curve")
# Confusion Matrix
threshold <- 0.5
binary_predictions <- ifelse(predictor > threshold, 1, 0)
# Creating confusion matrix
confusion_matrix_dt <- table(True_Labels = response, Predicted_Labels = binary_predictions)
# Printing confusion matrix
print(confusion_matrix_dt)
## Predicted_Labels
## True_Labels 0 1
## 0 110608 3133
## 1 21 160
In this specific scenario, the confusion matrix shows that the model correctly identified a large number of genuine transactions (110,608) and captured 160 of the 181 fraudulent transactions. However, it also misclassified 3,133 genuine transactions as fraudulent (false positives) and missed 21 fraudulent transactions (false negatives).
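Applying the prf() helper sketched earlier to the decision tree's confusion matrix:
prf(confusion_matrix_dt)  # recall ≈ 160/181 ≈ 0.884; precision ≈ 160/3293 ≈ 0.049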
# Accuracy and AUC
# Calculating AUC
auc_dt <- auc(roc_dt)
# Calculating accuracy
accuracy_dt <- sum(diag(confusion_matrix_dt)) / sum(confusion_matrix_dt)
# Printing AUC and accuracy
cat("AUC:", auc_dt, "\n")
## AUC: 0.9443683
cat("Accuracy:", accuracy_dt, "\n")
## Accuracy: 0.9723144
The AUC (Area Under the Curve) value is approximately 0.944, and the accuracy, the proportion of correct predictions, is approximately 0.972. These metrics indicate that the decision tree model performs well both at identifying fraudulent transactions and at making accurate predictions overall.
4.3 Fitting a KNN Model
# Selecting features by name so that the 'Class' label is excluded
# ('Class' is no longer the last column after the earlier cbind)
feature_cols <- setdiff(names(train_data), "Class")
knn_model <- knn(train = train_data[, feature_cols],
                 test = test_data[, feature_cols],
                 cl = train_data$Class,
                 k = 10)
# Creating ROC curve
roc_knn <- roc(test_data$Class, as.numeric(knn_model))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plotting ROC curve
plot(roc_knn, col = "blue", main = "ROC Curve for KNN Model")
# Confusion Matrix
confusion_matrix_knn <- table(test_data$Class, knn_model)
print(confusion_matrix_knn)
## knn_model
## 0 1
## 0 113727 14
## 1 36 145
This confusion matrix presents the performance of our KNN (K-Nearest Neighbors) model in classifying transactions as either genuine or fraudulent. It indicates that out of 113,741 genuine transactions (Class 0), 113,727 were correctly classified, while 14 were misclassified as fraudulent. Similarly, out of 181 fraudulent transactions (Class 1), 145 were correctly classified, but 36 were misclassified as genuine.
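Applying the prf() helper sketched earlier to the KNN confusion matrix:
prf(confusion_matrix_knn)  # recall ≈ 145/181 ≈ 0.801; precision ≈ 145/159 ≈ 0.912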
# Accuracy and AUC
# Calculating AUC
auc_knn <- roc(test_data$Class, as.numeric(knn_model))$auc
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste("AUC:", auc_knn))
## [1] "AUC: 0.900490942856944"
# Calculating accuracy
accuracy_knn <- sum(diag(confusion_matrix_knn)) / sum(confusion_matrix_knn)
print(paste("Accuracy:", accuracy_knn))
## [1] "Accuracy: 0.999561103210969"
The calculated AUC (Area Under the Curve) for our KNN (K-Nearest Neighbors) model is approximately 0.900, indicating good discriminatory power in distinguishing between genuine and fraudulent transactions.
The accuracy of the KNN model is approximately 99.96%. As with the other models, this figure should be interpreted in light of the class imbalance: most of those correct classifications come from the overwhelming majority of genuine transactions.
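The choice of k = 10 above was fixed up front. A simple sweep such as the sketch below, hypothetical and slow at this data size, is one way to check how sensitive the results are to k.
# Hypothetical k sweep; feature_cols is defined alongside the model fit above
for (k in c(3, 5, 10, 20)) {
  pred_k <- knn(train = train_data[, feature_cols],
                test = test_data[, feature_cols],
                cl = train_data$Class, k = k)
  cat("k =", k, "accuracy =", mean(pred_k == test_data$Class), "\n")
}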
Here we plot the ROC curves for all three models on a single graph for comparison, followed by bar plots of the area under the curve (AUC) and the accuracy for each model.
# Plot ROC curves
legend_labels <- c("KNN", "Decision Tree", "Logistic Regression (GLM)")
plot(roc_knn, col = "orange", lwd = 2, main = "ROC Curves Comparison", xlab = "False Positive Rate", ylab = "True Positive Rate")
lines(roc_dt, col = "blue", lwd = 2)
lines(roc_glm, col = "green", lwd = 2)
legend("bottomright", legend = legend_labels, col = c("orange", "blue", "green"), lwd = 2)
# Create a dataframe with model names and AUC values
auc_values <- c(auc_knn, auc_dt, auc_glm)
model_names <- c("KNN", "Decision Tree", "Logistic Regression (GLM)")
auc_df <- data.frame(Model = model_names, AUC_ROC = auc_values)
# Plot the bar plot
barplot(auc_df$AUC_ROC, names.arg = auc_df$Model, col = c("coral", "cyan3", "deeppink2"),
main = "AUC_ROC", ylab = "AUROC Value", las = 2, ylim = c(0, 1))
# Creating the bar plot for accuracy
model_names <- c("Logistic Regression", "Decision Tree", "KNN")
accuracy_values <- c(accuracy_glm, accuracy_dt, accuracy_knn)
bar_colors <- c("skyblue", "lightgreen", "lightcoral")
barplot(accuracy_values, names.arg = model_names,
col = bar_colors, main = "Model Accuracy Comparison",
ylab = "Accuracy", ylim = c(0, 1))
In this analysis, we set out to find the most effective models for this credit card fraud detection dataset. Initial exploratory analysis showed that the dataset is highly imbalanced, which can bias models toward the majority class, so we resampled the training data during the preparation step. We then tested three machine learning models: logistic regression, a decision tree, and k-nearest neighbors. By raw accuracy, KNN performed best at 99.96%, with logistic regression close behind at 97.49% and the decision tree at 97.23%. By AUC, however, the ranking reverses: logistic regression scored highest (0.977), ahead of the decision tree (0.944) and KNN (0.900), and with a positive class this rare, AUC together with per-class recall is arguably the fairer basis for comparison.