Fraud Detection

EDA

### per-column summary statistics of the raw data
Fraud_detect %>% summary()

##   Profession            Income      Credit_card_number           
##  Length:10000       Min.   :    1   Min.   :        60402960771  
##  Class :character   1st Qu.:24864   1st Qu.:    180013750000000  
##  Mode  :character   Median :49483   Median :   3512440000000000  
##                     Mean   :49761   Mean   : 385136347436000000  
##                     3rd Qu.:74483   3rd Qu.:   4594780000000000  
##                     Max.   :99986   Max.   :4999700000000000000  
##     Expiry          Security_code        Fraud       
##  Length:10000       Min.   :   0.0   Min.   :0.0000  
##  Class :character   1st Qu.: 275.0   1st Qu.:0.0000  
##  Mode  :character   Median : 539.5   Median :1.0000  
##                     Mean   : 863.6   Mean   :0.5016  
##                     3rd Qu.: 813.2   3rd Qu.:1.0000  
##                     Max.   :9990.0   Max.   :1.0000

### glimpse() to check the column names, column types and first few values
glimpse(Fraud_detect)

## Rows: 10,000
## Columns: 6
## $ Profession         <chr> "DOCTOR", "DOCTOR", "LAWYER", "LAWYER", "DOCTOR", "…
## $ Income             <dbl> 42509, 80334, 91552, 43623, 22962, 72106, 54992, 19…
## $ Credit_card_number <dbl> 3515420000000000, 213134000000000, 4869620000000000…
## $ Expiry             <chr> "Jul-25", "May-32", "Mar-30", "Jan-29", "Nov-30", "…
## $ Security_code      <dbl> 251, 858, 755, 160, 102, 834, 207, 433, 872, 295, 6…
## $ Fraud              <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …

#### total number of missing cells across the whole data set
Fraud_detect %>%
  is.na() %>%
  sum()

## [1] 0

### descriptive statistics (mean, sd, median, skew, kurtosis, ...) per column
Fraud_detect %>% describe()

##                    vars     n                  mean                     sd
## Profession*           1 10000                  2.00                   0.82
## Income                2 10000              49761.21               28837.73
## Credit_card_number    3 10000 385136347435963712.00 1257949726283964928.00
## Expiry*               4 10000                 59.87                  34.60
## Security_code         5 10000                863.59                1484.42
## Fraud                 6 10000                  0.50                   0.50
##                                median             trimmed                 mad
## Profession*                       2.0                2.00                1.48
## Income                        49483.0            49679.15            36787.75
## Credit_card_number 3512440000000000.0 2579188701213750.00 4644885724500000.00
## Expiry*                          59.0               59.60               44.48
## Security_code                   539.5              541.84              399.56
## Fraud                             1.0                0.50                0.00
##                            min                 max               range  skew
## Profession*                  1                   3                   2  0.00
## Income                       1               99986               99985  0.02
## Credit_card_number 60402960771 4999700000000000000 4999699939597039616  3.00
## Expiry*                      1                 121                 120  0.05
## Security_code                0                9990                9990  4.10
## Fraud                        0                   1                   1 -0.01
##                    kurtosis                   se
## Profession*           -1.52                 0.01
## Income                -1.20               288.38
## Credit_card_number     7.04 12579497262839650.00
## Expiry*               -1.17                 0.35
## Security_code         17.12                14.84
## Fraud                 -2.00                 0.01

Data cleaning

### a security code must have exactly 3 digits, so drop every row whose
### code is shorter or longer than that
Fraud_detect_security <- filter(Fraud_detect, nchar(Security_code) == 3)
dim(Fraud_detect_security)

## [1] 8383    6

### make sure the security code column is stored as numeric
Fraud_detect_security <- Fraud_detect_security %>%
  mutate(Security_code = as.numeric(Security_code))

### keep only 5-digit incomes: these professions are assumed not to earn
### less than a 5-digit amount
Fraud_detect_income <- filter(Fraud_detect_security, nchar(Income) == 5)
dim(Fraud_detect_income)

## [1] 7532    6

### keep only rows whose credit card number has exactly 16 digits
### Credit_card_number is a double, so nchar() would measure its character
### form (scientific notation such as "3.51542e+15") rather than its digit
### count; compare numerically instead: 16 digits <=> 1e15 <= x < 1e16.
Fraud_detect_credit <- Fraud_detect_income %>%
  filter(Credit_card_number >= 1e15, Credit_card_number < 1e16)
### report the size of the frame that was just filtered
dim(Fraud_detect_credit)

## [1] 7532    6

### assign back to original data
### NOTE(review): this carries forward the income-filtered frame, so the
### Fraud_detect_credit result computed just above is never used downstream.
### Confirm whether the credit-card-length filter was meant to apply; all
### results printed below are based on Fraud_detect_income.
Fraud_detect <- Fraud_detect_income

Summary after cleaning

### per-column summary statistics of the cleaned data
Fraud_detect %>% summary()

##   Profession            Income      Credit_card_number           
##  Length:7532        Min.   :10023   Min.   :        60402960771  
##  Class :character   1st Qu.:32176   1st Qu.:    180015000000000  
##  Mode  :character   Median :54224   Median :   3513075000000000  
##                     Mean   :54675   Mean   : 393265665072000000  
##                     3rd Qu.:77301   3rd Qu.:   4584480000000000  
##                     Max.   :99986   Max.   :4999700000000000000  
##     Expiry          Security_code     Fraud       
##  Length:7532        Min.   :100   Min.   :0.0000  
##  Class :character   1st Qu.:326   1st Qu.:0.0000  
##  Mode  :character   Median :548   Median :0.0000  
##                     Mean   :550   Mean   :0.4993  
##                     3rd Qu.:776   3rd Qu.:1.0000  
##                     Max.   :999   Max.   :1.0000

Total income by profession, in decreasing order, as a data frame

### total income per profession, largest total first, as a plain data frame
Fraud_detect_Income <- Fraud_detect %>%
  group_by(Profession) %>%
  summarise(Total_income = sum(Income), .groups = "drop") %>%
  arrange(desc(Total_income)) %>%
  as.data.frame()

Fraud_detect_Income

##   Profession Total_income
## 1     DOCTOR    140749145
## 2     LAWYER    138636835
## 3   ENGINEER    132423357

Check the proportion of profession

### number of rows per profession
table(Fraud_detect[["Profession"]])

## 
##   DOCTOR ENGINEER   LAWYER 
##     2550     2460     2522

Plot Profession barchart

### count rows per profession, expose the profession column as "Job_title",
### and keep the result as a plain data frame
job <- Fraud_detect %>%
  count(Profession) %>%
  rename(Job_title = Profession) %>%
  as.data.frame()

### bar chart of the counts, bars ordered from most to least frequent
ggplot(job, aes(reorder(Job_title, -n), n, fill = Job_title)) +
  geom_col() +
  theme_classic()

Histogram and Boxplot

### Histogram and density plot of Income

### after_stat(density) replaces the deprecated ..density.. notation
### (available since ggplot2 3.3.0); it puts the bars on the density scale so
### they are comparable with the overlaid density curves.
Fraud_detect %>%
  ggplot(aes(Income, fill = Profession)) +
  geom_histogram(aes(y = after_stat(density)), position = "identity") +
  geom_density(fill = "blue", alpha = .2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### one income boxplot per profession, outlined in a profession-specific colour
ggplot(Fraud_detect, aes(Profession, Income, color = Profession)) +
  geom_boxplot()

Arrange the highest salary by profession

#### ten highest incomes among doctors
Fraud_detect %>%
  filter(Profession == "DOCTOR") %>%
  select(Profession, Income) %>%
  as.data.frame() %>%
  arrange(desc(Income)) %>%
  slice(1:10)

##    Profession Income
## 1      DOCTOR  99986
## 2      DOCTOR  99956
## 3      DOCTOR  99904
## 4      DOCTOR  99904
## 5      DOCTOR  99893
## 6      DOCTOR  99877
## 7      DOCTOR  99866
## 8      DOCTOR  99860
## 9      DOCTOR  99858
## 10     DOCTOR  99837

#### ten highest incomes among lawyers
Fraud_detect %>%
  filter(Profession == "LAWYER") %>%
  select(Profession, Income) %>%
  as.data.frame() %>%
  arrange(desc(Income)) %>%
  slice(1:10)

##    Profession Income
## 1      LAWYER  99924
## 2      LAWYER  99923
## 3      LAWYER  99912
## 4      LAWYER  99899
## 5      LAWYER  99868
## 6      LAWYER  99842
## 7      LAWYER  99823
## 8      LAWYER  99815
## 9      LAWYER  99810
## 10     LAWYER  99736

#### ten highest incomes among engineers
Fraud_detect %>%
  filter(Profession == "ENGINEER") %>%
  select(Profession, Income) %>%
  as.data.frame() %>%
  arrange(desc(Income)) %>%
  slice(1:10)

##    Profession Income
## 1    ENGINEER  99936
## 2    ENGINEER  99893
## 3    ENGINEER  99860
## 4    ENGINEER  99757
## 5    ENGINEER  99729
## 6    ENGINEER  99678
## 7    ENGINEER  99644
## 8    ENGINEER  99626
## 9    ENGINEER  99612
## 10   ENGINEER  99571

Change response variable to factor

### the response must be a factor for classification models
Fraud_detect <- Fraud_detect %>%
  mutate(Fraud = as.factor(Fraud))

Select the required columns

### keep only the response and the three columns used as predictors
Fraud_detect <- select(
  Fraud_detect,
  Profession, Income, Fraud, Security_code
)

Splitting the data

### reproducible 80/20 split: draw 80% of the row indices without replacement
### for training and use the remaining rows for testing
set.seed(1234)
sample_fraud <- sample(
  nrow(Fraud_detect),
  size = round(nrow(Fraud_detect) * .80),
  replace = FALSE
)
fraud_train <- Fraud_detect[sample_fraud, ]
fraud_test <- Fraud_detect[-sample_fraud, ]

Checking class imbalance

### class proportions of Fraud in the full data set
round(prop.table(table(Fraud_detect["Fraud"])), 2)

## Fraud
##   0   1 
## 0.5 0.5

### class proportions of Fraud in the training set
round(prop.table(table(fraud_train["Fraud"])), 2)

## Fraud
##   0   1 
## 0.5 0.5

### class proportions of Fraud in the test set
round(prop.table(table(fraud_test["Fraud"])), 2)

## Fraud
##    0    1 
## 0.49 0.51

Training the model in decision tree

library(rpart)
### classification tree predicting Fraud from all other columns of the
### training set
Fraud_detect_mod <- rpart(
  formula = Fraud ~ .,
  data = fraud_train,
  method = "class"
)

Evaluate the model

library(rpart.plot)
### draw the fitted decision tree
rpart.plot(x = Fraud_detect_mod)

Confusion matrix for the model’s predictive accuracy

### predicted class for every row of the test set
Fraud_detect_pred <- predict(
  object = Fraud_detect_mod,
  newdata = fraud_test,
  type = "class"
)

### confusion matrix: rows are the actual classes, columns the predicted ones
Fraud_detect_pred_table <- table(fraud_test$Fraud, Fraud_detect_pred)
Fraud_detect_pred_table

##    Fraud_detect_pred
##       0   1
##   0 236 503
##   1 233 534

Prediction accuracy

### accuracy = correctly classified rows (the diagonal) / all test rows
correct_predictions <- sum(diag(Fraud_detect_pred_table))
correct_predictions / nrow(fraud_test)

## [1] 0.5112882

Using random Forest

### Random forest predicting Fraud from all other columns of the training set.
### mtry (variables tried at each split) cannot exceed the number of
### predictors. The formula Fraud ~ . uses every column except Fraud, so mtry
### is derived from the data instead of the hard-coded 4, which is out of
### range for the three predictors and was being reset to 3 by randomForest.
### NOTE(review): trying every predictor at each split amounts to bagging;
### consider a smaller mtry if de-correlated trees are wanted.
Fraud_random <- randomForest(
  Fraud ~ .,
  data = fraud_train,
  mtry = ncol(fraud_train) - 1,
  ntree = 2001,
  importance = TRUE
)

Fraud_random

## 
## Call:
##  randomForest(formula = Fraud ~ ., data = fraud_train, mtry = 4,      ntree = 2001, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 2001
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 48.29%
## Confusion matrix:
##      0    1 class.error
## 0 1589 1443   0.4759235
## 1 1467 1527   0.4899800

### plot the fitted forest (presumably error rate against number of trees
### for a randomForest object; confirm against the rendered figure)
plot(Fraud_random)

Predict Test_result

### Pair the actual test labels with the forest's predicted classes.
### Column 3 of fraud_test is dropped before predicting; after the column
### selection earlier in this report that position holds Fraud, the response.
### The two columns are unnamed here, so data.frame() generates their names
### from the expression text, and those names end up as the plot axis labels.
Test_result <- data.frame(fraud_test$Fraud, predict(Fraud_random, fraud_test[ , c(-3)], type = "response"))

#Test_result

### plot actual vs. predicted class
plot(Test_result)

Fraud Detection

Spencer Madamedon

2024-11-19

EDA

Data cleaning

Summary after cleaning

Total income by profession, in decreasing order, as a data frame

Check the proportion of profession

Plot Profession barchart

Histogram and Boxplot

Arrange the highest salary by profession

Change response variable to factor

Select the required columns

Splitting the data

Checking class imbalance

Training the model in decision tree

Evaluate the model

Confusion matrix for the model’s predictive accuracy

Prediction accuracy

Using random Forest

Predict Test_result