EDA
summary(Fraud_detect)
## Profession Income Credit_card_number
## Length:10000 Min. : 1 Min. : 60402960771
## Class :character 1st Qu.:24864 1st Qu.: 180013750000000
## Mode :character Median :49483 Median : 3512440000000000
## Mean :49761 Mean : 385136347436000000
## 3rd Qu.:74483 3rd Qu.: 4594780000000000
## Max. :99986 Max. :4999700000000000000
## Expiry Security_code Fraud
## Length:10000 Min. : 0.0 Min. :0.0000
## Class :character 1st Qu.: 275.0 1st Qu.:0.0000
## Mode :character Median : 539.5 Median :1.0000
## Mean : 863.6 Mean :0.5016
## 3rd Qu.: 813.2 3rd Qu.:1.0000
## Max. :9990.0 Max. :1.0000
### glimpse function to check the ...
glimpse(Fraud_detect)
## Rows: 10,000
## Columns: 6
## $ Profession <chr> "DOCTOR", "DOCTOR", "LAWYER", "LAWYER", "DOCTOR", "…
## $ Income <dbl> 42509, 80334, 91552, 43623, 22962, 72106, 54992, 19…
## $ Credit_card_number <dbl> 3515420000000000, 213134000000000, 4869620000000000…
## $ Expiry <chr> "Jul-25", "May-32", "Mar-30", "Jan-29", "Nov-30", "…
## $ Security_code <dbl> 251, 858, 755, 160, 102, 834, 207, 433, 872, 295, 6…
## $ Fraud <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, …
#### check for any missing data
sum(is.na(Fraud_detect))
## [1] 0
### describing the dataset
describe(Fraud_detect)
## vars n mean sd
## Profession* 1 10000 2.00 0.82
## Income 2 10000 49761.21 28837.73
## Credit_card_number 3 10000 385136347435963712.00 1257949726283964928.00
## Expiry* 4 10000 59.87 34.60
## Security_code 5 10000 863.59 1484.42
## Fraud 6 10000 0.50 0.50
## median trimmed mad
## Profession* 2.0 2.00 1.48
## Income 49483.0 49679.15 36787.75
## Credit_card_number 3512440000000000.0 2579188701213750.00 4644885724500000.00
## Expiry* 59.0 59.60 44.48
## Security_code 539.5 541.84 399.56
## Fraud 1.0 0.50 0.00
## min max range skew
## Profession* 1 3 2 0.00
## Income 1 99986 99985 0.02
## Credit_card_number 60402960771 4999700000000000000 4999699939597039616 3.00
## Expiry* 1 121 120 0.05
## Security_code 0 9990 9990 4.10
## Fraud 0 1 1 -0.01
## kurtosis se
## Profession* -1.52 0.01
## Income -1.20 288.38
## Credit_card_number 7.04 12579497262839650.00
## Expiry* -1.17 0.35
## Security_code 17.12 14.84
## Fraud -2.00 0.01
Data cleaning
### security code is supposed to be 3 digits, so we have to remove security code with less than 3 digits or greater 3 digits
Fraud_detect_security <- Fraud_detect %>% filter(nchar(Security_code) == 3)
dim(Fraud_detect_security)
## [1] 8383 6
Fraud_detect_security$Security_code <- as.numeric(Fraud_detect_security$Security_code)
### to remove Income with less than 5 digits as these occupations cannot earn less than a 5 digit number
Fraud_detect_income <- Fraud_detect_security %>% filter(nchar(Income) == 5)
dim(Fraud_detect_income)
## [1] 7532 6
### to remove Income with less than 16 digits
Fraud_detect_credit <- Fraud_detect_income %>% filter(nchar(Credit_card_number) == 16)
dim(Fraud_detect_income)
## [1] 7532 6
### assign back to original data
Fraud_detect <- Fraud_detect_income
Summary after cleaning
summary(Fraud_detect)
## Profession Income Credit_card_number
## Length:7532 Min. :10023 Min. : 60402960771
## Class :character 1st Qu.:32176 1st Qu.: 180015000000000
## Mode :character Median :54224 Median : 3513075000000000
## Mean :54675 Mean : 393265665072000000
## 3rd Qu.:77301 3rd Qu.: 4584480000000000
## Max. :99986 Max. :4999700000000000000
## Expiry Security_code Fraud
## Length:7532 Min. :100 Min. :0.0000
## Class :character 1st Qu.:326 1st Qu.:0.0000
## Mode :character Median :548 Median :0.0000
## Mean :550 Mean :0.4993
## 3rd Qu.:776 3rd Qu.:1.0000
## Max. :999 Max. :1.0000
group_by profession and decreasing total income as a data frame
Fraud_detect_Income <- Fraud_detect %>% group_by(Profession) %>% summarise(Total_income = sum(Income)) %>%
arrange(desc(Total_income)) %>% as.data.frame()
Fraud_detect_Income
## Profession Total_income
## 1 DOCTOR 140749145
## 2 LAWYER 138636835
## 3 ENGINEER 132423357
Check the proportion of profession
table(Fraud_detect$Profession)
##
## DOCTOR ENGINEER LAWYER
## 2550 2460 2522
Plot Profession barchart
job <- Fraud_detect %>% count(Profession)
#count(Fraud_detect$Profession)
### rename profession types
colnames(job)[1] <- "Job_title"
job <-as.data.frame(job)
job %>% ggplot(aes(reorder(Job_title, -n), n, fill = Job_title)) + geom_bar(stat = "identity") + theme_classic()

Histogram and Boxplot
### Histogram and density plot of Income
Fraud_detect %>% ggplot(aes(Income, fill = Profession)) + geom_histogram(aes(y = ..density..), position="identity") + geom_density(fill="blue", alpha = .2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### boxplot showing income by profession
Fraud_detect %>% ggplot(aes(Profession, Income, color = Profession)) + geom_boxplot()

Arrange the highest salary by profession
#### arrange the highest salary by profession
Fraud_detect %>% select(Profession, Income) %>% filter(Profession == "DOCTOR") %>% as.data.frame() %>%
arrange(desc(Income)) %>% slice(1:10)
## Profession Income
## 1 DOCTOR 99986
## 2 DOCTOR 99956
## 3 DOCTOR 99904
## 4 DOCTOR 99904
## 5 DOCTOR 99893
## 6 DOCTOR 99877
## 7 DOCTOR 99866
## 8 DOCTOR 99860
## 9 DOCTOR 99858
## 10 DOCTOR 99837
Fraud_detect %>% select(Profession, Income) %>% filter(Profession == "LAWYER") %>% as.data.frame() %>%
arrange(desc(Income)) %>% slice(1:10)
## Profession Income
## 1 LAWYER 99924
## 2 LAWYER 99923
## 3 LAWYER 99912
## 4 LAWYER 99899
## 5 LAWYER 99868
## 6 LAWYER 99842
## 7 LAWYER 99823
## 8 LAWYER 99815
## 9 LAWYER 99810
## 10 LAWYER 99736
Fraud_detect %>% select(Profession, Income) %>% filter(Profession == "ENGINEER") %>% as.data.frame() %>%
arrange(desc(Income)) %>% slice(1:10)
## Profession Income
## 1 ENGINEER 99936
## 2 ENGINEER 99893
## 3 ENGINEER 99860
## 4 ENGINEER 99757
## 5 ENGINEER 99729
## 6 ENGINEER 99678
## 7 ENGINEER 99644
## 8 ENGINEER 99626
## 9 ENGINEER 99612
## 10 ENGINEER 99571
Change response variable to factor
Fraud_detect$Fraud <- as.factor(Fraud_detect$Fraud)
Select the required columns
Fraud_detect <- Fraud_detect %>% select(Profession,Income, Fraud, Security_code)
Splitting the data
set.seed(1234)
sample_fraud <- sample(nrow(Fraud_detect), round(nrow(Fraud_detect)* .80), replace = FALSE)
fraud_train <- Fraud_detect[sample_fraud, ]
fraud_test <- Fraud_detect[-sample_fraud, ]
Checking class imbalance
##check for class imbalance
round(prop.table(table(select(Fraud_detect, Fraud))), 2)
## Fraud
## 0 1
## 0.5 0.5
##check for class imbalance
round(prop.table(table(select(fraud_train, Fraud))), 2)
## Fraud
## 0 1
## 0.5 0.5
## Check for class imbalance
round(prop.table(table(select(fraud_test, Fraud))), 2)
## Fraud
## 0 1
## 0.49 0.51
Training the model in decision tree
library(rpart)
Fraud_detect_mod <- rpart(Fraud ~. ,method = "class", data = fraud_train)
Evaluate the model
library(rpart.plot)
rpart.plot(Fraud_detect_mod)

Confusion matrix for the model’s predictive accuracy
Fraud_detect_pred <- predict(Fraud_detect_mod, fraud_test, type = "class")
Fraud_detect_pred_table <- table(fraud_test$Fraud, Fraud_detect_pred)
Fraud_detect_pred_table
## Fraud_detect_pred
## 0 1
## 0 236 503
## 1 233 534
Prediction accuracy
sum(diag(Fraud_detect_pred_table)) / nrow(fraud_test)
## [1] 0.5112882
Using random Forest
Fraud_random <- randomForest(Fraud ~ ., data = fraud_train, mtry = 4, ntree = 2001, importance = TRUE)
Fraud_random
##
## Call:
## randomForest(formula = Fraud ~ ., data = fraud_train, mtry = 4, ntree = 2001, importance = TRUE)
## Type of random forest: classification
## Number of trees: 2001
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 48.29%
## Confusion matrix:
## 0 1 class.error
## 0 1589 1443 0.4759235
## 1 1467 1527 0.4899800
plot(Fraud_random)

Predict Test_result
Test_result <- data.frame(fraud_test$Fraud, predict(Fraud_random, fraud_test[ , c(-3)], type = "response"))
#Test_result
plot(Test_result)
