Earning Manipulation

Load the data

library(readxl)
earn <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Complete Data")
sam <-  read_excel("IMB579-XLS-ENG.xlsx",sheet = "Sample for Model Development")

Basic checks

str(earn)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1239 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulater  : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ C-MANIPULATOR: num  1 1 1 1 1 1 1 1 1 1 ...

str(sam)

## Classes 'tbl_df', 'tbl' and 'data.frame':    220 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulator  : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ C-MANIPULATOR: num  1 1 1 1 1 1 1 1 1 1 ...

earn$Manipulater <- factor(earn$Manipulater)
sam$Manipulator <- factor(sam$Manipulator)

Checking Missing Values

dim(earn)

## [1] 1239   11

str(earn)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1239 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulater  : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ C-MANIPULATOR: num  1 1 1 1 1 1 1 1 1 1 ...

# Checking Missing Values 
library(VIM)
aggr_plot = aggr(earn, 
                 col = c("red", "blue"), 
                 numbers = TRUE, 
                 prop = TRUE, 
                 sortVars = TRUE, 
                 labels = names(earn), 
                 cex.axis = 1, 
                 gap = 0, 
                 ylab = c("Histogram of missing data", "Pattern"))

## 
##  Variables sorted by number of missings: 
##       Variable Count
##     Company ID     0
##           DSRI     0
##            GMI     0
##            AQI     0
##            SGI     0
##           DEPI     0
##           SGAI     0
##           ACCR     0
##           LEVI     0
##    Manipulater     0
##  C-MANIPULATOR     0

# This shows that there are no missing data in our Dataset, Good Start!

Let us see how our target variable is in the data

# converting all the variables into factor which have less than 4 unique values 

col_names <- sapply(earn, function(col) length(unique(col)) < 4)
earn[ , col_names] <- lapply(earn[ , col_names] , factor)
str(earn)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1239 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulater  : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ C-MANIPULATOR: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

library(Hmisc)
describe(earn$Manipulater)

## earn$Manipulater 
##        n  missing distinct 
##     1239        0        2 
##                       
## Value         No   Yes
## Frequency   1200    39
## Proportion 0.969 0.031

# this suggests that there are only 3.1% data avaiable for manipulator 
# Also, the Target Variable is highly unbalanced as the data is split in 1200:39 ratio 

# Let us see the data graphically 

ggplot(data = earn, mapping = aes(x =Manipulater)) +
geom_bar(color="black",fill="orange")+
ggtitle("How is the target variable is distributed")+
  xlab("No->Not Manipulated 1->Manipulated")+
  theme_bw()

Let us try some sampling techniques we know for solving the class imbalance problem

#### Let us solve the Class Imbalance Problem with SMOTE Technique

ncols<-ncol(earn)
 earn<-cbind(earn[1:ncols],earn[10])
 earn <- earn[,-c(1,10)]

set.seed(1234)
index <- sample(1:2, nrow(earn), replace = TRUE, prob = c(0.7,0.3))
train <- earn[index == 1, ]
test <- earn[index == 2, ]

prop.table(table(train$Manipulater))

## 
##         No        Yes 
## 0.96453089 0.03546911

prop.table(table(test$Manipulater)) # high level of Imbalance

## 
##         No        Yes 
## 0.97808219 0.02191781

library(DMwR)
smote_train<-SMOTE(Manipulater ~ . , train,k=5,perc.over = 1400,perc.under=140)

prop.table(table(smote_train$Manipulater))

## 
##        No       Yes 
## 0.5662313 0.4337687

Question 1

Let us try to put Beneish Model for Elimentary Analysis

Note: We should note that this model was not created for financial institutions like Banks

Using Manipulator and Non-Manipulator Sheet, building a Beneish Model

m <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Manipulator")
non_m <-  read_excel("IMB579-XLS-ENG.xlsx",sheet = "Non-Manipulator")

# MANIPULATORS 
dim(m)

## [1] 39 11

library(dplyr)
beneish_m <-m %>%
  mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI -  0.172*SGAI + 4.679*ACCR -  0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)

# let us see the results visually 
hist(beneish_m$m_score,  # Save histogram as object
     breaks = 30,
     freq = T,
     col = "thistle1", # Or use: col = colors() [626]
     main = "Histogram for M-Score of Manipulators")

c<- count(beneish_m[-beneish_m$m_score<=-1.78,])
print(c)

## # A tibble: 1 x 1
##       n
##   <int>
## 1     7

# thus Beneish model could correctly identify 7 manipulator out of 39 

Accuracy_predicting_manipulator <- c/nrow(m)
print(Accuracy_predicting_manipulator) # thus, we could predict manipulators with an accuracy of 17.94%

##           n
## 1 0.1794872

# NON-MANIPULATORS 

dim(non_m)

## [1] 1200   11

beneish_non_m <-non_m %>%
  mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI -  0.172*SGAI + 4.679*ACCR -  0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)

# let us see the results visually 
hist(beneish_non_m$m_score,  # Save histogram as object
     breaks = 30,
     freq = T,
     col = "thistle1", # Or use: col = colors() [626]
     main = "Histogram for M-Score of Non-Manipulators")

c<- count(beneish_non_m[beneish_non_m$m_score<=-1.78,])
print(c)

## # A tibble: 1 x 1
##       n
##   <int>
## 1  1029

# thus Beneish model could correctly identify 1029 non-manipulator out of 1200 

Accuracy_predicting_non_manipulator <- c/nrow(non_m)
print(Accuracy_predicting_non_manipulator) # thus, we could predict non-manipulators with an accuracy of 85.75%

##        n
## 1 0.8575

# However, we are more interested in correctly finding firms who are doing manipulation, thus an accuracy of 17.94% would be very less for a good model 

# THUS, BENEISH MODEL WOULD NOT BE A SUITABLE MODEL FOR PREDICTING EARNING MANIPULATIONS IN INDIA

Question 3

Logistic Regression for the sampled data

sam <-  read_excel("IMB579-XLS-ENG.xlsx",sheet = "Sample for Model Development")

# Under Sampling 

str(sam)

## Classes 'tbl_df', 'tbl' and 'data.frame':    220 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulator  : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ C-MANIPULATOR: num  1 1 1 1 1 1 1 1 1 1 ...

dim(sam)

## [1] 220  11

sam$`Company ID`<- NULL
sam$Manipulator<- NULL
names(sam)[9] <- "Target"
sam$Target <- factor(sam$Target)

set.seed(1234)
index <- sample(1:2, nrow(sam), replace = TRUE, prob = c(0.7,0.3))
train <- sam[index == 1, ]
test <- sam[index == 2, ]

prop.table(table(train$Target))

## 
##         0         1 
## 0.8037975 0.1962025

prop.table(table(test$Target)) # high level of Imbalance

## 
##         0         1 
## 0.8709677 0.1290323

library(ROSE)

under <- ovun.sample(Target~., data=train, method = "under", N = 62)$data
table(under$Target)

## 
##  0  1 
## 31 31

# Let us try to do variable selection  

full <- glm(Target~., data=under,family = "binomial")
null <- glm(Target~., data=under,family = "binomial")

# Forward selection

mod_forward<- step(null, scope=list(lower=null,upper=full),direction = "forward")

## Start:  AIC=49.32
## Target ~ DSRI + GMI + AQI + SGI + DEPI + SGAI + ACCR + LEVI

summary(mod_forward)

## 
## Call:
## glm(formula = Target ~ DSRI + GMI + AQI + SGI + DEPI + SGAI + 
##     ACCR + LEVI, family = "binomial", data = under)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.07018  -0.23945  -0.00006   0.18141   1.76210  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -16.4226     5.7093  -2.876  0.00402 **
## DSRI          5.0589     2.0096   2.517  0.01182 * 
## GMI           1.6291     0.9614   1.694  0.09019 . 
## AQI           2.4184     0.9520   2.540  0.01107 * 
## SGI           8.9339     3.2512   2.748  0.00600 **
## DEPI         -1.2134     1.0416  -1.165  0.24401   
## SGAI         -0.9488     0.4996  -1.899  0.05756 . 
## ACCR         16.7136     6.0078   2.782  0.00540 **
## LEVI         -3.2234     1.7600  -1.832  0.06702 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 85.950  on 61  degrees of freedom
## Residual deviance: 31.319  on 53  degrees of freedom
## AIC: 49.319
## 
## Number of Fisher Scoring iterations: 10

options(scipens=999)
exp(coef(mod_forward))

##  (Intercept)         DSRI          GMI          AQI          SGI         DEPI 
## 7.374546e-08 1.574198e+02 5.099191e+00 1.122782e+01 7.585086e+03 2.971750e-01 
##         SGAI         ACCR         LEVI 
## 3.871904e-01 1.813944e+07 3.981997e-02

exp(cbind(OR = coef(mod_forward), confint(mod_forward)))

##                       OR        2.5 %       97.5 %
## (Intercept) 7.374546e-08 1.376497e-14 4.350312e-04
## DSRI        1.574198e+02 9.233737e+00 4.914377e+04
## GMI         5.099191e+00 6.729709e-01 4.631054e+01
## AQI         1.122782e+01 2.899242e+00 1.504357e+02
## SGI         7.585086e+03 5.714217e+01 4.929157e+07
## DEPI        2.971750e-01 1.995543e-02 1.935026e+00
## SGAI        3.871904e-01 9.883301e-02 1.633154e+00
## ACCR        1.813944e+07 2.437803e+03 9.251244e+13
## LEVI        3.981997e-02 7.612595e-04 1.827783e+00

predicted <- predict(mod_forward, newData=test, type="response")

# let us calculate the probabilties 

probs <- exp(predicted)/(1+exp(predicted))

table(predicted, test$Target)

##                       
## predicted              0 1
##   8.14612176574424e-09 1 0
##   6.00658300781842e-05 1 0
##   0.000127550984281535 1 0
##   0.000276517266447538 1 0
##   0.000592902756840113 1 0
##   0.00151816267703678  1 0
##   0.00306947695982215  1 0
##   0.00448491530210303  1 0
##   0.00693782205087582  1 0
##   0.00813703786161421  0 1
##   0.0084108264237783   1 0
##   0.0086517959535934   0 1
##   0.0131710154828861   1 0
##   0.015949706589745    0 1
##   0.0257009226292954   1 0
##   0.0291403764734181   0 1
##   0.0681607999554298   1 0
##   0.098022565407098    0 1
##   0.129616755225215    1 0
##   0.154409601581103    0 1
##   0.211717163282882    1 0
##   0.243015813454845    1 0
##   0.25990269148561     1 0
##   0.287073946802408    0 1
##   0.297366151992658    1 0
##   0.299158668931278    0 1
##   0.307025016645982    1 0
##   0.321514923618005    1 0
##   0.340417111224406    1 0
##   0.364326897165939    1 0
##   0.371328420400767    1 0
##   0.371432010888463    1 0
##   0.382995925350319    1 0
##   0.4309840526756      1 0
##   0.471325378687997    1 0
##   0.639955906691414    1 0
##   0.726269296389403    1 0
##   0.822343871921667    1 0
##   0.878476004215433    1 0
##   0.882677391259849    1 0
##   0.889476435122441    1 0
##   0.893501177885515    1 0
##   0.900386667947128    1 0
##   0.960076361258178    1 0
##   0.964801483378529    1 0
##   0.965782692601884    1 0
##   0.982005342815065    1 0
##   0.988220542078743    1 0
##   0.989396207678045    1 0
##   0.991337635862956    1 0
##   0.991861094604328    1 0
##   0.9981119618788      1 0
##   0.999357636633555    1 0
##   0.999942778120127    1 0
##   0.99999720805383     1 0
##   0.999999305471319    1 0
##   0.99999999996668     1 0
##   1                    5 0

# Recode factors
y_pred_num <- ifelse(predicted > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1),order=T)
y_act <- test$Target

# Accuracy
#mean(y_pred == y_act)  
# 43.54% for 0.5 threshold 
# 40.32% for 0.4 threshold
# 29.03% for 0.3 threshold 
# 43.54% for 0.6 threshold
# 45.16% for 0.7 threshold 

# with a misclassification rate of 57.46% we cannot conclude the significance of a variable based on just one sample. You have to create several samples before deciding the variable importance.

#run anova for checking the model 
anova(mod_forward, test = 'Chisq')

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Target
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                    61     85.950              
## DSRI  1   5.9990        60     79.951   0.01431 *  
## GMI   1   0.0652        59     79.886   0.79849    
## AQI   1   2.6542        58     77.232   0.10328    
## SGI   1  20.5576        57     56.674 5.786e-06 ***
## DEPI  1   0.2450        56     56.429   0.62062    
## SGAI  1   0.1501        55     56.279   0.69840    
## ACCR  1  22.2174        54     34.062 2.435e-06 ***
## LEVI  1   2.7430        53     31.319   0.09768 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ROC Curve
roc.curve(test$Target, predicted)

## Area under the curve (AUC): 0.759

# Confusion Matrix
consufusion.1<-table(test$Target,y_pred)
consufusion.1

##    y_pred
##      0  1
##   0 27 27
##   1  8  0

#visualize the result
library(pROC)
roc.small<-roc(test$Target,predicted)
x<-1-roc.small$specificities
y<-roc.small$sensitivities
auc.small<-roc.small$auc

# roc with ggplot 
ggplot(data=NULL,mapping=aes(x=x,y=y))+
  geom_line(colour='pink')+
  geom_abline(intercept=0,slope=1)+
  annotate('text',x=0.4,y=0.4,label=paste('AUC=',round(auc.small,digit=2)))+
  labs(x='False positive rate',y='sensitivity',title='ROC curve')

XGBoost application

#library(xgboost)

Earning Manipulation

Chirag Ahluwalia, Ronald Pukadyil, Abirami V

4/15/2020

Load the data

Basic checks

Checking Missing Values

Let us see how our target variable is in the data

Let us try some sampling techniques we know for solving the class imbalance problem

Question 1

Let us try to put Beneish Model for Elimentary Analysis

Note: We should note that this model was not created for financial institutions like Banks

Using Manipulator and Non-Manipulator Sheet, building a Beneish Model

Question 3

Logistic Regression for the sampled data

XGBoost application