# Load required libraries
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the transaction records from CSV and preview the first six rows
dataset <- read.csv(file = "fraud_dataset.csv")
head(dataset, n = 6L)
## Transaction_ID Transaction_Amount Account_Balance Transaction_Type Location
## 1 1 2876.4876 13681.137 Deposit ATM
## 2 2 7883.2630 29693.347 Transfer ATM
## 3 3 4090.3602 8009.241 Deposit ATM
## 4 4 8830.2910 42671.512 Withdrawal In-Branch
## 5 5 9404.7324 42386.958 Withdrawal ATM
## 6 6 456.5194 23894.341 Deposit In-Branch
## Fraud_Label
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Inspect the column names, column types and leading values
str(object = dataset)
## 'data.frame': 1000 obs. of 6 variables:
## $ Transaction_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Transaction_Amount: num 2876 7883 4090 8830 9405 ...
## $ Account_Balance : num 13681 29693 8009 42672 42387 ...
## $ Transaction_Type : chr "Deposit" "Transfer" "Deposit" "Withdrawal" ...
## $ Location : chr "ATM" "ATM" "ATM" "In-Branch" ...
## $ Fraud_Label : int 0 0 0 0 0 0 1 0 0 0 ...
# Prepare the response and the predictors.
# The response column is 'Fraud_Label'. It is read as an integer, and
# randomForest() only fits a classification forest when the response is a
# factor, so convert it first.
dataset$Fraud_Label <- as.factor(dataset$Fraud_Label)

# Select predictors by name rather than by position, so that a reordered CSV
# cannot silently pull 'Transaction_ID' or the label itself into the features.
feature_columns <- c(
  "Transaction_Amount", "Account_Balance", "Transaction_Type", "Location"
)
features <- dataset[, feature_columns, drop = FALSE]

# Encode character predictors as factors. Left as character, they are
# integer-coded by data.matrix() independently at fit time and at predict
# time, so the codes can shift when new data lacks one of the categories.
is_text <- vapply(features, is.character, logical(1))
if (any(is_text)) {
  features[is_text] <- lapply(features[is_text], factor)
}

target <- dataset$Fraud_Label
# Fit a 500-tree random forest classifier.
# The RNG seed is fixed beforehand so that repeated runs give the same forest.
set.seed(123)
rf_model <- randomForest(
  x = features,
  y = target,
  ntree = 500,
  importance = TRUE
)
# Show the fitted model's summary
print(rf_model)
##
## Call:
## randomForest(x = features, y = target, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0.1%
## Confusion matrix:
## 0 1 class.error
## 0 945 0 0.00000000
## 1 1 54 0.01818182
# Evaluate the model.
# Out-of-bag estimate: predict() without new data returns the out-of-bag
# prediction for each training row, i.e. the vote of only those trees that did
# not use the row for fitting. This is the figure to quote as expected
# performance on unseen transactions.
oob_predictions <- predict(rf_model)
oob_confusion_matrix <- confusionMatrix(
  oob_predictions,
  target,
  positive = "1"
)
print(oob_confusion_matrix)

# Resubstitution estimate: scoring the very rows the forest was trained on.
# This is optimistically biased and must not be read as test-set performance;
# it is kept for comparison with the out-of-bag figures above.
predictions <- predict(rf_model, features)
confusion_matrix <- confusionMatrix(predictions, target, positive = "1")
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 945 0
## 1 0 55
##
## Accuracy : 1
## 95% CI : (0.9963, 1)
## No Information Rate : 0.945
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Prevalence : 0.055
## Detection Rate : 0.055
## Detection Prevalence : 0.055
## Balanced Accuracy : 1.000
##
## 'Positive' Class : 1
##
# Plot the feature importance.
# For a classification forest fitted with importance = TRUE, importance()
# returns one column per class, followed by "MeanDecreaseAccuracy" and
# "MeanDecreaseGini". Select the overall permutation importance by name;
# column 1 would be the importance for class "0" only.
importance_values <- importance(rf_model)
feature_importance <- data.frame(
  Feature = rownames(importance_values),
  Importance = importance_values[, "MeanDecreaseAccuracy"]
)

# Horizontal bar chart; reorder() plus coord_flip() puts the most important
# feature at the top.
g <- ggplot(
  feature_importance,
  aes(x = reorder(Feature, Importance), y = Importance)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Feature Importance in Random Forest Model",
    x = "Features",
    y = "Importance (mean decrease in accuracy)"
  ) +
  theme_minimal()

# Render the static ggplot as an interactive plotly widget
ggplotly(g)