About the Earnings Manipulation Case

The objective of the study is to analyze 8 features of different companies in order to evaluate whether the companies have actually manipulated their earnings. This is a typical case of class imbalance: only a small fraction of the companies are manipulators.

Read CSV

raw_data <- read.csv("fraud_data.csv",
                     header = TRUE, na.strings = c("", " ", "NA"), sep = ",")

filter_data <- raw_data[, -1]   # drop the first column (an identifier, not a feature)
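Because na.strings converts empty and blank cells to NA on read, it is worth checking how much is missing before modeling; a minimal sketch:

# count missing values per column after the na.strings conversion
colSums(is.na(filter_data))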

Load libraries

library(caret)          #for split and model accuracy
## Loading required package: lattice
## Loading required package: ggplot2
library(DMwR)           #for SMOTE Sampling
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ROCR)           #for ROC Plot
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(e1071)
library(xgboost)        #to implement xgbTree
#library(rattle)        #print the business rules for the model
library(inTrees)        #to extract the business rules from rf model
## Registered S3 method overwritten by 'RRF':
##   method      from        
##   plot.margin randomForest
library(UBL)
## Loading required package: MBA
## Loading required package: gstat
## Loading required package: automap
## Loading required package: sp
library(tictoc)         #to record the time elapsed

Define a 70:30 split

set.seed(4121)
trainIndex <- createDataPartition(filter_data$Status, p = 0.70, list=FALSE)
train_df <- filter_data[ trainIndex,]
test_df <- filter_data[-trainIndex,]
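createDataPartition() samples within each level of Status, so the 70:30 split is stratified. A quick sanity check (a sketch, not part of the original output):

# class proportions should be nearly identical in both partitions
prop.table(table(train_df$Status))
prop.table(table(test_df$Status))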

Listwise deletion of missing values

summary(train_df) #summary of the data
##       DSRI              GMI                AQI                SGI          
##  Min.   : 0.0151   Min.   :-20.8118   Min.   :-32.8856   Min.   : 0.07692  
##  1st Qu.: 0.8908   1st Qu.:  0.9255   1st Qu.:  0.7904   1st Qu.: 0.96702  
##  Median : 1.0218   Median :  1.0000   Median :  1.0111   Median : 1.09389  
##  Mean   : 1.1330   Mean   :  0.9439   Mean   :  1.0340   Mean   : 1.12900  
##  3rd Qu.: 1.1863   3rd Qu.:  1.0574   3rd Qu.:  1.2162   3rd Qu.: 1.20012  
##  Max.   :14.0000   Max.   :  5.8551   Max.   : 52.8867   Max.   :13.08143  
##       DEPI              SGAI             ACCR               LEVI        
##  Min.   :0.06882   Min.   :0.0000   Min.   :-3.14350   Min.   : 0.1386  
##  1st Qu.:0.93711   1st Qu.:0.8996   1st Qu.:-0.07617   1st Qu.: 0.9221  
##  Median :1.00000   Median :1.0000   Median :-0.02928   Median : 1.0112  
##  Mean   :1.04472   Mean   :1.0674   Mean   :-0.03225   Mean   : 1.0623  
##  3rd Qu.:1.07886   3rd Qu.:1.1401   3rd Qu.: 0.02296   3rd Qu.: 1.1156  
##  Max.   :5.39387   Max.   :6.9075   Max.   : 0.71784   Max.   :13.0586  
##  Status   
##  No :840  
##  Yes: 28  
##           
##           
##           
## 
train_df <- na.omit(train_df) # listwise deletion of missing
test_df <- na.omit(test_df) # listwise deletion of missing
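To see how many rows listwise deletion removes, one could compare row counts around the na.omit() call; a sketch (shown for the training set only):

# capture the count before deletion to report how many rows were dropped
n_before <- nrow(train_df)
train_df <- na.omit(train_df)
cat("rows dropped from train_df:", n_before - nrow(train_df), "\n")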

The model uses only 5 of the 8 features; the remaining 3 (DSRI, GMI and SGI) are commented out below.

model_df <- as.data.frame(filter_data[,c(#"DSRI",
                                       #"GMI",
                                       "AQI",
                                       #"SGI",
                                       "DEPI",
                                       "SGAI",
                                       "ACCR",
                                       "LEVI",
                                       "Status"
)])

model_train_df <- as.data.frame(train_df[,c(#"DSRI",
                                          #"GMI",
                                          "AQI",
                                          #"SGI",
                                          "DEPI",
                                          "SGAI",
                                          "ACCR",
                                          "LEVI",
                                          "Status"
)])

model_test_df <- as.data.frame(test_df[,c(#"DSRI",
                                        #"GMI",
                                        "AQI",
                                        #"SGI",
                                        "DEPI",
                                        "SGAI",
                                        "ACCR",
                                        "LEVI",
                                        "Status"
)])

We check whether the selected features are correlated, but no strong pairwise correlations turn up.

correlation_matrix <- cor(model_df[,c(1:5)])
print(correlation_matrix)
##               AQI        DEPI         SGAI        ACCR        LEVI
## AQI   1.000000000 -0.02124161  0.003712316 -0.04542383  0.07027302
## DEPI -0.021241615  1.00000000 -0.067247329 -0.01661336 -0.01271157
## SGAI  0.003712316 -0.06724733  1.000000000 -0.09066795  0.02174950
## ACCR -0.045423827 -0.01661336 -0.090667950  1.00000000 -0.01163113
## LEVI  0.070273016 -0.01271157  0.021749500 -0.01163113  1.00000000
# find attributes that are highly correlated (here using a cutoff of 0.6)
highly_correlated <- findCorrelation(correlation_matrix, cutoff = 0.6, names = TRUE)
print(highly_correlated)
## character(0)
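findCorrelation() returned character(0), so there is nothing to drop; had any names come back, the flagged columns could be removed like this (a sketch):

# drop whichever columns findCorrelation() flags as too highly correlated
if (length(highly_correlated) > 0) {
  model_df <- model_df[, !(names(model_df) %in% highly_correlated)]
}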

trainControl() sets up the resampling environment that train() will use. Here the method is "boot", i.e. bootstrap sampling, with number = 1 resample; you could change it to "cv" with number = 5, or to "repeatedcv", which is worth considering here because the number of observations is small. There is also method = "LOOCV" (leave-one-out cross-validation). The tic()/toc() calls from the tictoc package simply report how long a code chunk takes to run. A custom tuning grid can later be passed to train() using expand.grid(), which comes from base R.

objControl <- trainControl(method='boot', number = 1,
                           returnResamp='none',
                           summaryFunction = twoClassSummary,
                           savePredictions = TRUE,
                           classProbs = TRUE,allowParallel=TRUE)
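For reference, a sketch of the repeated-CV variant mentioned above (the repeats value is an illustrative choice, not from the original):

# 5-fold cross-validation repeated 3 times instead of a single bootstrap resample
objControl_cv <- trainControl(method = 'repeatedcv', number = 5, repeats = 3,
                              returnResamp = 'none',
                              summaryFunction = twoClassSummary,
                              savePredictions = TRUE,
                              classProbs = TRUE, allowParallel = TRUE)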

Here we train with caret's method = 'rf', which wraps the randomForest package. The tuning metric is "ROC"; you could tune on specificity instead, but caret expects the column name "Spec" as produced by twoClassSummary, not the word "specificity".

tic("RF Bagging with Bootstrap Sample")
set.seed(4121)
rf_bootstrap_model <- train(model_train_df[,1:5], model_train_df[,6],
                  method='rf',
                  trControl=objControl, ntree = 500,
                  metric = "ROC")
toc()
## RF Bagging with Bootstrap Sample: 2.701 sec elapsed
#rf_bootstrap_model$finalModel #rf_bootstrap_model$results
print(rf_bootstrap_model)
## Random Forest 
## 
## 868 samples
##   5 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (1 reps) 
## Summary of sample sizes: 868 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens  Spec      
##   2     0.7867246  1     0.07692308
##   3     0.7882134  1     0.07692308
##   5     0.7512407  1     0.07692308
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.

It tried only mtry = 2, 3 and 5, not 1 and 4. By default train() evaluates tuneLength = 3 candidate values per tuning parameter, so with p tuning parameters the default grid has about 3^p combinations; random forest has a single tuning parameter, mtry. At this point the model building is done.
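To force caret to try every value of mtry from 1 to 5 rather than the default grid, an explicit tuneGrid can be passed to train(); a minimal sketch (not run here):

# explicit tuning grid built with base R's expand.grid()
rf_grid <- expand.grid(mtry = 1:5)
# then pass it along: train(x, y, method = 'rf', trControl = objControl, tuneGrid = rf_grid)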

confusionMatrix.train(rf_bootstrap_model)
## Bootstrapped (1 reps) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction   No  Yes
##        No  96.0  3.7
##        Yes  0.0  0.3
##                             
##  Accuracy (average) : 0.9628
plot(varImp(rf_bootstrap_model), main = "Variable importance from Bootstrap Random Forest", col = 2, lwd = 2)

caretPredictedClass <- predict(rf_bootstrap_model, model_test_df, type = "raw")
confusionMatrix(caretPredictedClass,model_test_df$Status)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  355   9
##        Yes   5   2
##                                           
##                Accuracy : 0.9623          
##                  95% CI : (0.9375, 0.9792)
##     No Information Rate : 0.9704          
##     P-Value [Acc > NIR] : 0.8573          
##                                           
##                   Kappa : 0.2039          
##                                           
##  Mcnemar's Test P-Value : 0.4227          
##                                           
##             Sensitivity : 0.9861          
##             Specificity : 0.1818          
##          Pos Pred Value : 0.9753          
##          Neg Pred Value : 0.2857          
##              Prevalence : 0.9704          
##          Detection Rate : 0.9569          
##    Detection Prevalence : 0.9811          
##       Balanced Accuracy : 0.5840          
##                                           
##        'Positive' Class : No              
## 
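Note that caret reported 'No' as the positive class. With this degree of imbalance, the minority 'Yes' class is usually the one of interest, and the same matrix can be recomputed around it (a sketch):

# recompute test-set metrics with the minority class treated as positive
confusionMatrix(caretPredictedClass, model_test_df$Status, positive = "Yes")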
rf_bootstrap_pred <- predict(rf_bootstrap_model, model_test_df, type = "prob")[,2]
rf_bootstrap_prediction <- prediction(rf_bootstrap_pred,model_test_df$Status)
rf_bootstrap_perf <- performance(rf_bootstrap_prediction, "tpr","fpr")

plot(rf_bootstrap_perf,main="ROC Curve for bootstrap Random Forest",col=2,lwd=2)
abline(a=0,b=1,lwd=2,lty=3,col="black")

#AUC for the ROC plot
performance(rf_bootstrap_prediction, "auc")
## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.807197
## 
## 
## Slot "alpha.values":
## list()
rf_bootstrap_model$bestTune
##   mtry
## 2    3
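The printed performance object is verbose; the AUC itself sits in the y.values slot of the S4 object and can be extracted directly (a sketch):

# pull the scalar AUC out of the ROCR performance object
auc_value <- performance(rf_bootstrap_prediction, "auc")@y.values[[1]]
print(auc_value)  # 0.807197 in this run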