RandomForestModel

# Load required libraries
library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## Loading required package: lattice

library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Read the transaction records from the CSV (path is relative to the
# current working directory)
dataset <- read.csv(file = "fraud_dataset.csv")
# Preview the first rows to confirm the columns were parsed as expected
head(x = dataset)

##   Transaction_ID Transaction_Amount Account_Balance Transaction_Type  Location
## 1              1          2876.4876       13681.137          Deposit       ATM
## 2              2          7883.2630       29693.347         Transfer       ATM
## 3              3          4090.3602        8009.241          Deposit       ATM
## 4              4          8830.2910       42671.512       Withdrawal In-Branch
## 5              5          9404.7324       42386.958       Withdrawal       ATM
## 6              6           456.5194       23894.341          Deposit In-Branch
##   Fraud_Label
## 1           0
## 2           0
## 3           0
## 4           0
## 5           0
## 6           0

# Inspect the dimensions and per-column types before modelling
str(object = dataset)

## 'data.frame':    1000 obs. of  6 variables:
##  $ Transaction_ID    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Transaction_Amount: num  2876 7883 4090 8830 9405 ...
##  $ Account_Balance   : num  13681 29693 8009 42672 42387 ...
##  $ Transaction_Type  : chr  "Deposit" "Transfer" "Deposit" "Withdrawal" ...
##  $ Location          : chr  "ATM" "ATM" "ATM" "In-Branch" ...
##  $ Fraud_Label       : int  0 0 0 0 0 0 1 0 0 0 ...

# randomForest() performs classification only when the response is a factor
# (a numeric response would be fitted as regression), so convert the 0/1
# `Fraud_Label` column before modelling.
dataset$Fraud_Label <- as.factor(dataset$Fraud_Label)

# Split the dataset into predictors and target.
# Predictors are selected by name rather than by position so the selection
# fails loudly (instead of silently picking the wrong columns) if the CSV
# layout changes. Transaction_ID is left out; it appears to be a running
# row identifier.
# NOTE(review): Transaction_Type and Location are character columns here;
# consider converting them to factors so they are treated as categorical
# predictors -- confirm how randomForest handles character input.
predictor_cols <- c(
  "Transaction_Amount",
  "Account_Balance",
  "Transaction_Type",
  "Location"
)
features <- dataset[, predictor_cols]
target <- dataset$Fraud_Label


# Fix the RNG seed so the bootstrap samples and candidate splits, and
# therefore the fitted forest, are the same on every run
set.seed(seed = 123)
# Grow a 500-tree classification forest; importance = TRUE additionally
# records the permutation-based importance measures
rf_model <- randomForest(
  x = features,
  y = target,
  ntree = 500,
  importance = TRUE
)

# Show the fitted model: call, OOB error estimate and OOB confusion matrix
print(rf_model)

## 
## Call:
##  randomForest(x = features, y = target, ntree = 500, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 0.1%
## Confusion matrix:
##     0  1 class.error
## 0 945  0  0.00000000
## 1   1 54  0.01818182

# Score the same rows the forest was trained on (the complete dataset is
# used as the test set). This is an in-sample evaluation, so the metrics
# below are optimistic; the OOB estimate printed with the model is the
# less biased figure.
predictions <- predict(object = rf_model, newdata = features)

# Compare predictions with the true labels, treating class "1" as the
# positive class
confusion_matrix <- confusionMatrix(
  data = predictions,
  reference = target,
  positive = "1"
)
print(confusion_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 945   0
##          1   0  55
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9963, 1)
##     No Information Rate : 0.945      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.000      
##             Specificity : 1.000      
##          Pos Pred Value : 1.000      
##          Neg Pred Value : 1.000      
##              Prevalence : 0.055      
##          Detection Rate : 0.055      
##    Detection Prevalence : 0.055      
##       Balanced Accuracy : 1.000      
##                                      
##        'Positive' Class : 1          
##

# Plot the feature importance
# For a classification forest fitted with importance = TRUE, importance()
# returns one column per class followed by "MeanDecreaseAccuracy" and
# "MeanDecreaseGini". Select the overall measure by name: column 1 holds
# only the class-"0" importance, not the importance across all classes.
importance_values <- importance(rf_model)
feature_importance <- data.frame(
  Feature = rownames(importance_values),
  Importance = importance_values[, "MeanDecreaseAccuracy"]
)

# Horizontal bar chart with features ordered by importance
g <- ggplot(
  feature_importance,
  aes(x = reorder(Feature, Importance), y = Importance)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Feature Importance in Random Forest Model",
    x = "Features",
    y = "Importance (Mean Decrease in Accuracy)"
  ) +
  theme_minimal()
# Convert the static ggplot into an interactive plotly widget
ggplotly(g)

RandomForestModel

2024-11-04