EDA

### Per-column overview of the raw data before any cleaning.
summary(object = Fraud_detect)
##   Profession            Income      Credit_card_number           
##  Length:10000       Min.   :    1   Min.   :        60402960771  
##  Class :character   1st Qu.:24864   1st Qu.:    180013750000000  
##  Mode  :character   Median :49483   Median :   3512440000000000  
##                     Mean   :49761   Mean   : 385136347436000000  
##                     3rd Qu.:74483   3rd Qu.:   4594780000000000  
##                     Max.   :99986   Max.   :4999700000000000000  
##     Expiry          Security_code        Fraud       
##  Length:10000       Min.   :   0.0   Min.   :0.0000  
##  Class :character   1st Qu.: 275.0   1st Qu.:0.0000  
##  Mode  :character   Median : 539.5   Median :1.0000  
##                     Mean   : 863.6   Mean   :0.5016  
##                     3rd Qu.: 813.2   3rd Qu.:1.0000  
##                     Max.   :9990.0   Max.   :1.0000
### Compact view of the data: one line per column with its type and first values.
Fraud_detect %>% glimpse()
## Rows: 10,000
## Columns: 6
## $ Profession         <chr> "DOCTOR", "DOCTOR", "LAWYER", "LAWYER", "DOCTOR", "…
## $ Income             <dbl> 42509, 80334, 91552, 43623, 22962, 72106, 54992, 19…
## $ Credit_card_number <dbl> 3515420000000000, 213134000000000, 4869620000000000…
## $ Expiry             <chr> "Jul-25", "May-32", "Mar-30", "Jan-29", "Nov-30", "…
## $ Security_code      <dbl> 251, 858, 755, 160, 102, 834, 207, 433, 872, 295, 6…
## $ Fraud              <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
### Count missing cells across the whole data frame.
Fraud_detect %>%
  is.na() %>%
  sum()
## [1] 0
### Descriptive statistics (mean, sd, median, skew, kurtosis, ...) per column.
describe(x = Fraud_detect)
##                    vars     n                  mean                     sd
## Profession*           1 10000                  2.00                   0.82
## Income                2 10000              49761.21               28837.73
## Credit_card_number    3 10000 385136347435963712.00 1257949726283964928.00
## Expiry*               4 10000                 59.87                  34.60
## Security_code         5 10000                863.59                1484.42
## Fraud                 6 10000                  0.50                   0.50
##                                median             trimmed                 mad
## Profession*                       2.0                2.00                1.48
## Income                        49483.0            49679.15            36787.75
## Credit_card_number 3512440000000000.0 2579188701213750.00 4644885724500000.00
## Expiry*                          59.0               59.60               44.48
## Security_code                   539.5              541.84              399.56
## Fraud                             1.0                0.50                0.00
##                            min                 max               range  skew
## Profession*                  1                   3                   2  0.00
## Income                       1               99986               99985  0.02
## Credit_card_number 60402960771 4999700000000000000 4999699939597039616  3.00
## Expiry*                      1                 121                 120  0.05
## Security_code                0                9990                9990  4.10
## Fraud                        0                   1                   1 -0.01
##                    kurtosis                   se
## Profession*           -1.52                 0.01
## Income                -1.20               288.38
## Credit_card_number     7.04 12579497262839650.00
## Expiry*               -1.17                 0.35
## Security_code         17.12                14.84
## Fraud                 -2.00                 0.01

Data cleaning

### A security code is supposed to have exactly 3 digits, so keep only whole
### numbers between 100 and 999.
### The test is done numerically rather than with nchar(): nchar() on a number
### counts the characters of its printed form, so values such as 1.5 or -12
### would also count as "3 characters" and slip through.
Fraud_detect_security <- Fraud_detect %>%
  filter(
    Security_code >= 100,
    Security_code <= 999,
    Security_code == floor(Security_code)
  )
dim(Fraud_detect_security)
## [1] 8383    6
### Make sure Security_code is stored as a numeric column.
Fraud_detect_security <- Fraud_detect_security %>%
  mutate(Security_code = as.numeric(Security_code))

### Keep only 5-digit incomes (10000 up to, but not including, 100000), since
### these occupations are assumed not to earn less than a 5-digit amount.
### The test is done numerically rather than with nchar(): nchar() on a number
### counts the characters of its printed form, so e.g. 123.5 or 1e+05 would
### also count as "5 characters".
Fraud_detect_income <- Fraud_detect_security %>%
  filter(Income >= 10000, Income < 100000)
dim(Fraud_detect_income)
## [1] 7532    6
### to remove Credit_card_number values that do not have exactly 16 digits
### NOTE(review): nchar() on a numeric column counts the characters of the
### number's character form; for values this large that may be scientific
### notation rather than 16 plain digits -- verify before relying on this filter.
Fraud_detect_credit <- Fraud_detect_income  %>%  filter(nchar(Credit_card_number) == 16)
### NOTE(review): the next line reports the dimensions of Fraud_detect_income,
### not Fraud_detect_credit, so it does not show the effect of the filter above.
dim(Fraud_detect_income)
## [1] 7532    6
### assign back to original data
### NOTE(review): Fraud_detect_income is what gets assigned here, so the
### credit-card filter result (Fraud_detect_credit) is not carried forward.
Fraud_detect <- Fraud_detect_income

Summary after cleaning

### Per-column overview again, now on the cleaned data.
summary(object = Fraud_detect)
##   Profession            Income      Credit_card_number           
##  Length:7532        Min.   :10023   Min.   :        60402960771  
##  Class :character   1st Qu.:32176   1st Qu.:    180015000000000  
##  Mode  :character   Median :54224   Median :   3513075000000000  
##                     Mean   :54675   Mean   : 393265665072000000  
##                     3rd Qu.:77301   3rd Qu.:   4584480000000000  
##                     Max.   :99986   Max.   :4999700000000000000  
##     Expiry          Security_code     Fraud       
##  Length:7532        Min.   :100   Min.   :0.0000  
##  Class :character   1st Qu.:326   1st Qu.:0.0000  
##  Mode  :character   Median :548   Median :0.0000  
##                     Mean   :550   Mean   :0.4993  
##                     3rd Qu.:776   3rd Qu.:1.0000  
##                     Max.   :999   Max.   :1.0000

Group by profession and sort by decreasing total income (as a data frame)

### Total income per profession, largest total first, as a plain data frame.
Fraud_detect_Income <- Fraud_detect %>%
  group_by(Profession) %>%
  summarise(Total_income = sum(Income)) %>%
  arrange(desc(Total_income)) %>%
  as.data.frame()

Fraud_detect_Income
##   Profession Total_income
## 1     DOCTOR    140749145
## 2     LAWYER    138636835
## 3   ENGINEER    132423357

Check the proportion of profession

### Number of rows per profession.
table(Fraud_detect[["Profession"]])
## 
##   DOCTOR ENGINEER   LAWYER 
##     2550     2460     2522

Plot Profession barchart

### Row count per profession, with the profession column renamed to Job_title,
### stored as a plain data frame.
job <- Fraud_detect %>%
  count(Profession) %>%
  rename(Job_title = Profession) %>%
  as.data.frame()

### Bar chart of the counts, bars ordered from most to least frequent.
ggplot(job, aes(reorder(Job_title, -n), n, fill = Job_title)) +
  geom_col() +
  theme_classic()

Histogram and Boxplot

### Income distribution: density-scaled histogram per profession (overlaid,
### not stacked) with a semi-transparent density curve on top.

ggplot(Fraud_detect, aes(Income, fill = Profession)) +
  geom_histogram(aes(y = ..density..), position = "identity") +
  geom_density(fill = "blue", alpha = .2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### Income spread within each profession, one coloured box per profession.
ggplot(Fraud_detect, aes(Profession, Income, color = Profession)) +
  geom_boxplot()

Arrange the highest salary by profession

#### Ten highest incomes among doctors, largest first.
Fraud_detect %>%
  filter(Profession == "DOCTOR") %>%
  select(Profession, Income) %>%
  as.data.frame() %>%
  arrange(desc(Income)) %>%
  slice(seq_len(10))
##    Profession Income
## 1      DOCTOR  99986
## 2      DOCTOR  99956
## 3      DOCTOR  99904
## 4      DOCTOR  99904
## 5      DOCTOR  99893
## 6      DOCTOR  99877
## 7      DOCTOR  99866
## 8      DOCTOR  99860
## 9      DOCTOR  99858
## 10     DOCTOR  99837
#### Ten highest incomes among lawyers, largest first.
Fraud_detect %>%
  filter(Profession == "LAWYER") %>%
  select(Profession, Income) %>%
  as.data.frame() %>%
  arrange(desc(Income)) %>%
  slice(seq_len(10))
##    Profession Income
## 1      LAWYER  99924
## 2      LAWYER  99923
## 3      LAWYER  99912
## 4      LAWYER  99899
## 5      LAWYER  99868
## 6      LAWYER  99842
## 7      LAWYER  99823
## 8      LAWYER  99815
## 9      LAWYER  99810
## 10     LAWYER  99736
#### Ten highest incomes among engineers, largest first.
Fraud_detect %>%
  filter(Profession == "ENGINEER") %>%
  select(Profession, Income) %>%
  as.data.frame() %>%
  arrange(desc(Income)) %>%
  slice(seq_len(10))
##    Profession Income
## 1    ENGINEER  99936
## 2    ENGINEER  99893
## 3    ENGINEER  99860
## 4    ENGINEER  99757
## 5    ENGINEER  99729
## 6    ENGINEER  99678
## 7    ENGINEER  99644
## 8    ENGINEER  99626
## 9    ENGINEER  99612
## 10   ENGINEER  99571

Change response variable to factor

### Treat the response as a categorical class label rather than a number.
Fraud_detect <- Fraud_detect %>%
  mutate(Fraud = as.factor(Fraud))

Select the required columns

### Keep only the modelling columns, in this order (Fraud ends up third).
Fraud_detect <- select(
  Fraud_detect,
  Profession, Income, Fraud, Security_code
)

Splitting the data

### Reproducible 80/20 split: draw 80% of the row indices without replacement
### for training; the remaining rows form the test set.
set.seed(1234)
sample_fraud <- sample(
  seq_len(nrow(Fraud_detect)),
  size = round(nrow(Fraud_detect) * .80),
  replace = FALSE
)
fraud_train <- Fraud_detect[sample_fraud, ]
fraud_test <- Fraud_detect[-sample_fraud, ]

Checking class imbalance

### Share of each Fraud class in the full cleaned data, rounded to 2 decimals.
Fraud_detect %>%
  select(Fraud) %>%
  table() %>%
  prop.table() %>%
  round(2)
## Fraud
##   0   1 
## 0.5 0.5
### Share of each Fraud class in the training set, rounded to 2 decimals.
fraud_train %>%
  select(Fraud) %>%
  table() %>%
  prop.table() %>%
  round(2)
## Fraud
##   0   1 
## 0.5 0.5
### Share of each Fraud class in the test set, rounded to 2 decimals.
fraud_test %>%
  select(Fraud) %>%
  table() %>%
  prop.table() %>%
  round(2)
## Fraud
##    0    1 
## 0.49 0.51

Training a decision tree model

library(rpart)
### Classification tree predicting Fraud from every other column of the
### training set.
Fraud_detect_mod <- rpart(
  formula = Fraud ~ .,
  data = fraud_train,
  method = "class"
)

Evaluate the model

library(rpart.plot)
### Draw the fitted decision tree.
rpart.plot(x = Fraud_detect_mod)

Confusion matrix for the model’s predictive accuracy

### Predicted class for every test row from the decision tree.
Fraud_detect_pred <- predict(
  object = Fraud_detect_mod,
  newdata = fraud_test,
  type = "class"
)

### Confusion matrix: actual class in rows, predicted class in columns.
Fraud_detect_pred_table <- table(fraud_test[["Fraud"]], Fraud_detect_pred)
Fraud_detect_pred_table
##    Fraud_detect_pred
##       0   1
##   0 236 503
##   1 233 534

Prediction accuracy

### Accuracy: correctly classified rows (the diagonal of the confusion matrix)
### divided by the number of test predictions.
sum(diag(Fraud_detect_pred_table)) / length(Fraud_detect_pred)
## [1] 0.5112882

Using Random Forest

### Fit a random forest classifier for Fraud on all other training columns.
### mtry is the number of predictors sampled as split candidates and cannot
### exceed the number of predictors. The previous hard-coded mtry = 4 was
### larger than the number of predictor columns, so it is now derived from the
### training data: every column except the response.
Fraud_random <- randomForest(
  Fraud ~ .,
  data = fraud_train,
  mtry = ncol(fraud_train) - 1L,
  ntree = 2001,
  importance = TRUE
)

Fraud_random
## 
## Call:
##  randomForest(formula = Fraud ~ ., data = fraud_train, mtry = 4,      ntree = 2001, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 2001
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 48.29%
## Confusion matrix:
##      0    1 class.error
## 0 1589 1443   0.4759235
## 1 1467 1527   0.4899800
### Plot the fitted random forest object.
plot(x = Fraud_random)

Predict Test_result

### Pair the true test labels with the random forest's predicted classes.
### fraud_test[ , c(-3)] drops the third column by position before predicting;
### presumably that column is Fraud -- NOTE(review): confirm against the column
### order, a by-name drop would be safer.
### The data.frame() columns are not named explicitly, so their names are
### derived from the expressions passed in.
Test_result <- data.frame(fraud_test$Fraud, predict(Fraud_random, fraud_test[ , c(-3)], type = "response"))

#Test_result

### Plot the two-column result (actual vs. predicted labels).
plot(Test_result)