Package and Data
library(ranger)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(data.table)
library(readr)
library(class)
# Read the credit card transactions CSV into a tibble. show_col_types = FALSE
# suppresses readr's column-specification message.
creditcard_data <- read_csv(
  file = "/Users/bhavyakalra/Desktop/Data_analytics/Credit Card fruad detection/creditcard.csv",
  show_col_types = FALSE
)
Data Exploration
# Preview the first six rows (six is head()'s default, spelled out here).
head(creditcard_data, n = 6)
## # A tibble: 6 × 31
## Time V1 V2 V3 V4 V5 V6 V7 V8 V9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -1.36 -0.0728 2.54 1.38 -0.338 0.462 0.240 0.0987 0.364
## 2 0 1.19 0.266 0.166 0.448 0.0600 -0.0824 -0.0788 0.0851 -0.255
## 3 1 -1.36 -1.34 1.77 0.380 -0.503 1.80 0.791 0.248 -1.51
## 4 1 -0.966 -0.185 1.79 -0.863 -0.0103 1.25 0.238 0.377 -1.39
## 5 2 -1.16 0.878 1.55 0.403 -0.407 0.0959 0.593 -0.271 0.818
## 6 2 -0.426 0.961 1.14 -0.168 0.421 -0.0297 0.476 0.260 -0.569
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## # V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## # V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## # V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
# Number of rows and columns in the loaded data.
dim(x = creditcard_data)
## [1] 284807 31
# First six rows, with the row count passed by name.
head(creditcard_data, n = 6)
## # A tibble: 6 × 31
## Time V1 V2 V3 V4 V5 V6 V7 V8 V9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -1.36 -0.0728 2.54 1.38 -0.338 0.462 0.240 0.0987 0.364
## 2 0 1.19 0.266 0.166 0.448 0.0600 -0.0824 -0.0788 0.0851 -0.255
## 3 1 -1.36 -1.34 1.77 0.380 -0.503 1.80 0.791 0.248 -1.51
## 4 1 -0.966 -0.185 1.79 -0.863 -0.0103 1.25 0.238 0.377 -1.39
## 5 2 -1.16 0.878 1.55 0.403 -0.407 0.0959 0.593 -0.271 0.818
## 6 2 -0.426 0.961 1.14 -0.168 0.421 -0.0297 0.476 0.260 -0.569
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## # V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## # V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## # V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
# Last six rows of the data.
tail(creditcard_data, n = 6)
## # A tibble: 6 × 31
## Time V1 V2 V3 V4 V5 V6 V7 V8 V9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 172785 0.120 0.931 -0.546 -0.745 1.13 -0.236 0.813 0.115 -0.204
## 2 172786 -11.9 10.1 -9.83 -2.07 -5.36 -2.61 -4.92 7.31 1.91
## 3 172787 -0.733 -0.0551 2.04 -0.739 0.868 1.06 0.0243 0.295 0.585
## 4 172788 1.92 -0.301 -3.25 -0.558 2.63 3.03 -0.297 0.708 0.432
## 5 172788 -0.240 0.530 0.703 0.690 -0.378 0.624 -0.686 0.679 0.392
## 6 172792 -0.533 -0.190 0.703 -0.506 -0.0125 -0.650 1.58 -0.415 0.486
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## # V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## # V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## # V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
# Count of rows per Class value; the output shows how unbalanced the two
# classes are.
table(creditcard_data[["Class"]])
##
## 0 1
## 284315 492
# Quartiles, mean and range of the transaction Amount.
summary(creditcard_data[["Amount"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 5.60 22.00 88.35 77.17 25691.16
# Column names of the data set.
colnames(creditcard_data)
## [1] "Time" "V1" "V2" "V3" "V4" "V5" "V6" "V7"
## [9] "V8" "V9" "V10" "V11" "V12" "V13" "V14" "V15"
## [17] "V16" "V17" "V18" "V19" "V20" "V21" "V22" "V23"
## [25] "V24" "V25" "V26" "V27" "V28" "Amount" "Class"
# Spread of Amount: variance, then standard deviation.
var(x = creditcard_data[["Amount"]])
## [1] 62560.07
sd(x = creditcard_data[["Amount"]])
## [1] 250.1201
Data Manipulation
# Look at the first six rows once more before transforming the data.
head(creditcard_data, n = 6)
## # A tibble: 6 × 31
## Time V1 V2 V3 V4 V5 V6 V7 V8 V9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -1.36 -0.0728 2.54 1.38 -0.338 0.462 0.240 0.0987 0.364
## 2 0 1.19 0.266 0.166 0.448 0.0600 -0.0824 -0.0788 0.0851 -0.255
## 3 1 -1.36 -1.34 1.77 0.380 -0.503 1.80 0.791 0.248 -1.51
## 4 1 -0.966 -0.185 1.79 -0.863 -0.0103 1.25 0.238 0.377 -1.39
## 5 2 -1.16 0.878 1.55 0.403 -0.407 0.0959 0.593 -0.271 0.818
## 6 2 -0.426 0.961 1.14 -0.168 0.421 -0.0297 0.476 0.260 -0.569
## # ℹ 21 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
## # V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## # V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
## # V27 <dbl>, V28 <dbl>, Amount <dbl>, Class <dbl>
# Standardise Amount (centre and divide by its standard deviation).
# scale() returns a one-column matrix, so flatten it with as.numeric();
# otherwise the tibble stores Amount as a matrix column (<dbl[,1]>) instead
# of an ordinary numeric vector.
creditcard_data$Amount <- as.numeric(scale(creditcard_data$Amount))
# Drop Time by name rather than by position, so a change in column order
# cannot silently remove a different feature.
NewData <- creditcard_data[, setdiff(names(creditcard_data), "Time")]
head(NewData)
## # A tibble: 6 × 30
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -1.36 -0.0728 2.54 1.38 -0.338 0.462 0.240 0.0987 0.364 0.0908
## 2 1.19 0.266 0.166 0.448 0.0600 -0.0824 -0.0788 0.0851 -0.255 -0.167
## 3 -1.36 -1.34 1.77 0.380 -0.503 1.80 0.791 0.248 -1.51 0.208
## 4 -0.966 -0.185 1.79 -0.863 -0.0103 1.25 0.238 0.377 -1.39 -0.0550
## 5 -1.16 0.878 1.55 0.403 -0.407 0.0959 0.593 -0.271 0.818 0.753
## 6 -0.426 0.961 1.14 -0.168 0.421 -0.0297 0.476 0.260 -0.569 -0.371
## # ℹ 20 more variables: V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>, V15 <dbl>,
## # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
## # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
## # V28 <dbl>, Amount <dbl>, Class <dbl>
# Data Modelling
library(caTools)
# Reproducible 80/20 train/test split. sample.split() returns a logical
# vector (TRUE = training row) computed from the Class labels.
set.seed(123)
data_sample <- sample.split(NewData$Class, SplitRatio = 0.80)
# Rows flagged TRUE form the training set; the remainder is held out.
train_data <- NewData[data_sample, ]
test_data <- NewData[!data_sample, ]
dim(train_data)
## [1] 227846 30
dim(test_data)
## [1] 56961 30
Fitting Logistic Regression Model
# Logistic regression of Class on every other column of the training data.
Logistic_Model <- glm(
  formula = Class ~ .,
  data = train_data,
  family = binomial()
)
# Coefficient table, deviances and AIC for the fitted model.
summary(Logistic_Model)
##
## Call:
## glm(formula = Class ~ ., family = binomial(), data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.651305 0.160212 -53.999 < 2e-16 ***
## V1 0.072540 0.044144 1.643 0.100332
## V2 0.014818 0.059777 0.248 0.804220
## V3 0.026109 0.049776 0.525 0.599906
## V4 0.681286 0.078071 8.726 < 2e-16 ***
## V5 0.087938 0.071553 1.229 0.219079
## V6 -0.148083 0.085192 -1.738 0.082170 .
## V7 -0.117344 0.068940 -1.702 0.088731 .
## V8 -0.146045 0.035667 -4.095 4.23e-05 ***
## V9 -0.339828 0.117595 -2.890 0.003855 **
## V10 -0.785462 0.098486 -7.975 1.52e-15 ***
## V11 0.001492 0.085147 0.018 0.986018
## V12 0.087106 0.094869 0.918 0.358532
## V13 -0.343792 0.092381 -3.721 0.000198 ***
## V14 -0.526828 0.067084 -7.853 4.05e-15 ***
## V15 -0.095471 0.094037 -1.015 0.309991
## V16 -0.130225 0.138629 -0.939 0.347537
## V17 0.032463 0.074471 0.436 0.662900
## V18 -0.100964 0.140985 -0.716 0.473909
## V19 0.083711 0.105134 0.796 0.425897
## V20 -0.463946 0.081871 -5.667 1.46e-08 ***
## V21 0.381206 0.065880 5.786 7.19e-09 ***
## V22 0.610874 0.142086 4.299 1.71e-05 ***
## V23 -0.071406 0.058799 -1.214 0.224589
## V24 0.255791 0.170568 1.500 0.133706
## V25 -0.073955 0.142634 -0.519 0.604109
## V26 0.120841 0.202553 0.597 0.550783
## V27 -0.852018 0.118391 -7.197 6.17e-13 ***
## V28 -0.323854 0.090075 -3.595 0.000324 ***
## Amount 0.292477 0.092075 3.177 0.001491 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5799.1 on 227845 degrees of freedom
## Residual deviance: 1790.9 on 227816 degrees of freedom
## AIC: 1850.9
##
## Number of Fisher Scoring iterations: 12
Accuracy and Confusion Matrix
# Predicted probabilities (type = "response") for the held-out rows.
predictions <- predict(
  Logistic_Model,
  newdata = test_data,
  type = "response"
)
# Probabilities strictly above the threshold are labelled 1, all others 0.
threshold <- 0.5
binary_predictions <- ifelse(predictions > threshold, 1, 0)
# Rows are the actual Class, columns the predicted label.
confusion_matrix <- table(test_data$Class, binary_predictions)
print(confusion_matrix)
## binary_predictions
## 0 1
## 0 56856 7
## 1 41 57
# Accuracy = correctly classified (diagonal) over all test rows.
n_correct <- sum(diag(confusion_matrix))
accuracy <- n_correct / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.99915731816506"
Fitting a Decision Tree Model
library(rpart)
library(rpart.plot)
# Feature table: every column of creditcard_data except the Class label.
# NOTE(review): unlike the logistic and KNN models, this tree is built from
# creditcard_data (which still contains Time) and uses its own
# createDataPartition() split with seed 42, so its test rows differ from
# test_data. Confirm this is intended before comparing accuracies.
X <- creditcard_data[, setdiff(names(creditcard_data), "Class")]
y <- creditcard_data[["Class"]]
set.seed(42)
# 80% of the row indices go to training; list = FALSE returns a matrix.
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
y_train <- y[train_index]
X_test <- X[-train_index, ]
y_test <- y[-train_index]
# Classification tree; y_train is taken from the calling environment, the
# predictors from X_train.
classifier <- rpart(formula = y_train ~ ., data = X_train, method = "class")
rpart.plot(classifier)

Accuracy and Confusion Matrix
# Predicted class labels for the tree's held-out rows.
y_pred <- predict(classifier, newdata = X_test, type = "class")
# Share of matching labels, expressed as a percentage.
is_hit <- y_pred == y_test
accuracy <- sum(is_hit) / length(y_test) * 100
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 99.9490879724724"
# Rows are the predicted label, columns the actual Class (the transpose of
# the orientation used for the other models' matrices).
confusion_mat <- table(y_pred, y_test)
print(confusion_mat)
## y_test
## y_pred 0 1
## 0 56853 23
## 1 6 79
Fitting a KNN model
# 5-nearest-neighbour classification of the test rows. The last column
# (Class) is removed from both feature tables and passed separately as the
# training labels.
knn_model <- knn(
  train = train_data[, -ncol(train_data)],
  test = test_data[, -ncol(test_data)],
  cl = train_data[["Class"]],
  k = 5
)
Accuracy and Confusion Matrix
# Build the KNN confusion matrix BEFORE computing accuracy. Previously the
# accuracy was calculated while confusion_matrix still held the logistic
# regression results, so the logistic accuracy was reported for KNN.
# Rows are the actual Class, columns the KNN prediction.
confusion_matrix <- table(test_data$Class, knn_model)
# Accuracy = correctly classified (diagonal) over all test rows. For the
# matrix printed below this is (56851 + 70) / 56961, roughly 0.9993.
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
print(confusion_matrix)
## knn_model
## 0 1
## 0 56851 12
## 1 28 70