Load the data

library(readxl)
earn <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Complete Data")
sam <-  read_excel("IMB579-XLS-ENG.xlsx",sheet = "Sample for Model Development")

Basic checks

str(earn)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1239 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulater  : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ C-MANIPULATOR: num  1 1 1 1 1 1 1 1 1 1 ...
str(sam)
## Classes 'tbl_df', 'tbl' and 'data.frame':    220 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulator  : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ C-MANIPULATOR: num  1 1 1 1 1 1 1 1 1 1 ...
earn$Manipulater <- factor(earn$Manipulater)
sam$Manipulator <- factor(sam$Manipulator)

Checking Missing Values

dim(earn)
## [1] 1239   11
str(earn)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1239 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulater  : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ C-MANIPULATOR: num  1 1 1 1 1 1 1 1 1 1 ...
# Checking Missing Values 
library(VIM)
aggr_plot = aggr(earn, 
                 col = c("red", "blue"), 
                 numbers = TRUE, 
                 prop = TRUE, 
                 sortVars = TRUE, 
                 labels = names(earn), 
                 cex.axis = 1, 
                 gap = 0, 
                 ylab = c("Histogram of missing data", "Pattern"))

## 
##  Variables sorted by number of missings: 
##       Variable Count
##     Company ID     0
##           DSRI     0
##            GMI     0
##            AQI     0
##            SGI     0
##           DEPI     0
##           SGAI     0
##           ACCR     0
##           LEVI     0
##    Manipulater     0
##  C-MANIPULATOR     0
# This shows that there are no missing data in our Dataset, Good Start!

Let us see how our target variable is in the data

# converting all the variables into factor which have less than 4 unique values 

col_names <- sapply(earn, function(col) length(unique(col)) < 4)
earn[ , col_names] <- lapply(earn[ , col_names] , factor)
str(earn)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1239 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulater  : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ C-MANIPULATOR: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
library(Hmisc)
describe(earn$Manipulater)
## earn$Manipulater 
##        n  missing distinct 
##     1239        0        2 
##                       
## Value         No   Yes
## Frequency   1200    39
## Proportion 0.969 0.031
# this suggests that there are only 3.1% data avaiable for manipulator 
# Also, the Target Variable is highly unbalanced as the data is split in 1200:39 ratio 

# Let us see the data graphically 

ggplot(data = earn, mapping = aes(x =Manipulater)) +
geom_bar(color="black",fill="orange")+
ggtitle("How is the target variable is distributed")+
  xlab("No->Not Manipulated 1->Manipulated")+
  theme_bw()

Let us try some sampling techniques we know for solving the class imbalance problem
#### Let us solve the Class Imbalance Problem with SMOTE Technique

ncols<-ncol(earn)
 earn<-cbind(earn[1:ncols],earn[10])
 earn <- earn[,-c(1,10)]

set.seed(1234)
index <- sample(1:2, nrow(earn), replace = TRUE, prob = c(0.7,0.3))
train <- earn[index == 1, ]
test <- earn[index == 2, ]

prop.table(table(train$Manipulater))
## 
##         No        Yes 
## 0.96453089 0.03546911
prop.table(table(test$Manipulater)) # high level of Imbalance  
## 
##         No        Yes 
## 0.97808219 0.02191781
library(DMwR)
smote_train<-SMOTE(Manipulater ~ . , train,k=5,perc.over = 1400,perc.under=140)

prop.table(table(smote_train$Manipulater))
## 
##        No       Yes 
## 0.5662313 0.4337687

Question 1

Let us try to put Beneish Model for Elimentary Analysis

Note: We should note that this model was not created for financial institutions like Banks

Using Manipulator and Non-Manipulator Sheet, building a Beneish Model

m <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Manipulator")
non_m <-  read_excel("IMB579-XLS-ENG.xlsx",sheet = "Non-Manipulator")

# MANIPULATORS 
dim(m)
## [1] 39 11
library(dplyr)
beneish_m <-m %>%
  mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI -  0.172*SGAI + 4.679*ACCR -  0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)

# let us see the results visually 
hist(beneish_m$m_score,  # Save histogram as object
     breaks = 30,
     freq = T,
     col = "thistle1", # Or use: col = colors() [626]
     main = "Histogram for M-Score of Manipulators")

c<- count(beneish_m[-beneish_m$m_score<=-1.78,])
print(c)
## # A tibble: 1 x 1
##       n
##   <int>
## 1     7
# thus Beneish model could correctly identify 7 manipulator out of 39 

Accuracy_predicting_manipulator <- c/nrow(m)
print(Accuracy_predicting_manipulator) # thus, we could predict manipulators with an accuracy of 17.94% 
##           n
## 1 0.1794872
# NON-MANIPULATORS 

dim(non_m)
## [1] 1200   11
beneish_non_m <-non_m %>%
  mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI -  0.172*SGAI + 4.679*ACCR -  0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)

# let us see the results visually 
hist(beneish_non_m$m_score,  # Save histogram as object
     breaks = 30,
     freq = T,
     col = "thistle1", # Or use: col = colors() [626]
     main = "Histogram for M-Score of Non-Manipulators")

c<- count(beneish_non_m[beneish_non_m$m_score<=-1.78,])
print(c)
## # A tibble: 1 x 1
##       n
##   <int>
## 1  1029
# thus Beneish model could correctly identify 1029 non-manipulator out of 1200 

Accuracy_predicting_non_manipulator <- c/nrow(non_m)
print(Accuracy_predicting_non_manipulator) # thus, we could predict non-manipulators with an accuracy of 85.75% 
##        n
## 1 0.8575
# However, we are more interested in correctly finding firms who are doing manipulation, thus an accuracy of 17.94% would be very less for a good model 

# THUS, BENEISH MODEL WOULD NOT BE A SUITABLE MODEL FOR PREDICTING EARNING MANIPULATIONS IN INDIA

Question 3

Logistic Regression for the sampled data

sam <-  read_excel("IMB579-XLS-ENG.xlsx",sheet = "Sample for Model Development")

# Under Sampling 

str(sam)
## Classes 'tbl_df', 'tbl' and 'data.frame':    220 obs. of  11 variables:
##  $ Company ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ DSRI         : num  1.62 1 1 1.49 1 ...
##  $ GMI          : num  1.13 1.61 1.02 1 1.37 ...
##  $ AQI          : num  7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI          : num  0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI         : num  1.38 0.4 1.17 2 1.45 ...
##  $ SGAI         : num  1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR         : num  -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI         : num  1.161 0.987 1.264 0.681 0.939 ...
##  $ Manipulator  : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ C-MANIPULATOR: num  1 1 1 1 1 1 1 1 1 1 ...
dim(sam)
## [1] 220  11
sam$`Company ID`<- NULL
sam$Manipulator<- NULL
names(sam)[9] <- "Target"
sam$Target <- factor(sam$Target)

set.seed(1234)
index <- sample(1:2, nrow(sam), replace = TRUE, prob = c(0.7,0.3))
train <- sam[index == 1, ]
test <- sam[index == 2, ]

prop.table(table(train$Target))
## 
##         0         1 
## 0.8037975 0.1962025
prop.table(table(test$Target)) # high level of Imbalance 
## 
##         0         1 
## 0.8709677 0.1290323
library(ROSE)

under <- ovun.sample(Target~., data=train, method = "under", N = 62)$data
table(under$Target)
## 
##  0  1 
## 31 31
# Let us try to do variable selection  

full <- glm(Target~., data=under,family = "binomial")
null <- glm(Target~., data=under,family = "binomial")

# Forward selection

mod_forward<- step(null, scope=list(lower=null,upper=full),direction = "forward")
## Start:  AIC=49.32
## Target ~ DSRI + GMI + AQI + SGI + DEPI + SGAI + ACCR + LEVI
summary(mod_forward)
## 
## Call:
## glm(formula = Target ~ DSRI + GMI + AQI + SGI + DEPI + SGAI + 
##     ACCR + LEVI, family = "binomial", data = under)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.07018  -0.23945  -0.00006   0.18141   1.76210  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -16.4226     5.7093  -2.876  0.00402 **
## DSRI          5.0589     2.0096   2.517  0.01182 * 
## GMI           1.6291     0.9614   1.694  0.09019 . 
## AQI           2.4184     0.9520   2.540  0.01107 * 
## SGI           8.9339     3.2512   2.748  0.00600 **
## DEPI         -1.2134     1.0416  -1.165  0.24401   
## SGAI         -0.9488     0.4996  -1.899  0.05756 . 
## ACCR         16.7136     6.0078   2.782  0.00540 **
## LEVI         -3.2234     1.7600  -1.832  0.06702 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 85.950  on 61  degrees of freedom
## Residual deviance: 31.319  on 53  degrees of freedom
## AIC: 49.319
## 
## Number of Fisher Scoring iterations: 10
options(scipens=999)
exp(coef(mod_forward))
##  (Intercept)         DSRI          GMI          AQI          SGI         DEPI 
## 7.374546e-08 1.574198e+02 5.099191e+00 1.122782e+01 7.585086e+03 2.971750e-01 
##         SGAI         ACCR         LEVI 
## 3.871904e-01 1.813944e+07 3.981997e-02
exp(cbind(OR = coef(mod_forward), confint(mod_forward)))
##                       OR        2.5 %       97.5 %
## (Intercept) 7.374546e-08 1.376497e-14 4.350312e-04
## DSRI        1.574198e+02 9.233737e+00 4.914377e+04
## GMI         5.099191e+00 6.729709e-01 4.631054e+01
## AQI         1.122782e+01 2.899242e+00 1.504357e+02
## SGI         7.585086e+03 5.714217e+01 4.929157e+07
## DEPI        2.971750e-01 1.995543e-02 1.935026e+00
## SGAI        3.871904e-01 9.883301e-02 1.633154e+00
## ACCR        1.813944e+07 2.437803e+03 9.251244e+13
## LEVI        3.981997e-02 7.612595e-04 1.827783e+00
predicted <- predict(mod_forward, newData=test, type="response")

# let us calculate the probabilties 

probs <- exp(predicted)/(1+exp(predicted))

table(predicted, test$Target)
##                       
## predicted              0 1
##   8.14612176574424e-09 1 0
##   6.00658300781842e-05 1 0
##   0.000127550984281535 1 0
##   0.000276517266447538 1 0
##   0.000592902756840113 1 0
##   0.00151816267703678  1 0
##   0.00306947695982215  1 0
##   0.00448491530210303  1 0
##   0.00693782205087582  1 0
##   0.00813703786161421  0 1
##   0.0084108264237783   1 0
##   0.0086517959535934   0 1
##   0.0131710154828861   1 0
##   0.015949706589745    0 1
##   0.0257009226292954   1 0
##   0.0291403764734181   0 1
##   0.0681607999554298   1 0
##   0.098022565407098    0 1
##   0.129616755225215    1 0
##   0.154409601581103    0 1
##   0.211717163282882    1 0
##   0.243015813454845    1 0
##   0.25990269148561     1 0
##   0.287073946802408    0 1
##   0.297366151992658    1 0
##   0.299158668931278    0 1
##   0.307025016645982    1 0
##   0.321514923618005    1 0
##   0.340417111224406    1 0
##   0.364326897165939    1 0
##   0.371328420400767    1 0
##   0.371432010888463    1 0
##   0.382995925350319    1 0
##   0.4309840526756      1 0
##   0.471325378687997    1 0
##   0.639955906691414    1 0
##   0.726269296389403    1 0
##   0.822343871921667    1 0
##   0.878476004215433    1 0
##   0.882677391259849    1 0
##   0.889476435122441    1 0
##   0.893501177885515    1 0
##   0.900386667947128    1 0
##   0.960076361258178    1 0
##   0.964801483378529    1 0
##   0.965782692601884    1 0
##   0.982005342815065    1 0
##   0.988220542078743    1 0
##   0.989396207678045    1 0
##   0.991337635862956    1 0
##   0.991861094604328    1 0
##   0.9981119618788      1 0
##   0.999357636633555    1 0
##   0.999942778120127    1 0
##   0.99999720805383     1 0
##   0.999999305471319    1 0
##   0.99999999996668     1 0
##   1                    5 0
# Recode factors
y_pred_num <- ifelse(predicted > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1),order=T)
y_act <- test$Target

# Accuracy
#mean(y_pred == y_act)  
# 43.54% for 0.5 threshold 
# 40.32% for 0.4 threshold
# 29.03% for 0.3 threshold 
# 43.54% for 0.6 threshold
# 45.16% for 0.7 threshold 

# with a misclassification rate of 57.46% we cannot conclude the significance of a variable based on just one sample. You have to create several samples before deciding the variable importance.

#run anova for checking the model 
anova(mod_forward, test = 'Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Target
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                    61     85.950              
## DSRI  1   5.9990        60     79.951   0.01431 *  
## GMI   1   0.0652        59     79.886   0.79849    
## AQI   1   2.6542        58     77.232   0.10328    
## SGI   1  20.5576        57     56.674 5.786e-06 ***
## DEPI  1   0.2450        56     56.429   0.62062    
## SGAI  1   0.1501        55     56.279   0.69840    
## ACCR  1  22.2174        54     34.062 2.435e-06 ***
## LEVI  1   2.7430        53     31.319   0.09768 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ROC Curve
roc.curve(test$Target, predicted)

## Area under the curve (AUC): 0.759
# Confusion Matrix
consufusion.1<-table(test$Target,y_pred)
consufusion.1
##    y_pred
##      0  1
##   0 27 27
##   1  8  0
#visualize the result
library(pROC)
roc.small<-roc(test$Target,predicted)
x<-1-roc.small$specificities
y<-roc.small$sensitivities
auc.small<-roc.small$auc

# roc with ggplot 
ggplot(data=NULL,mapping=aes(x=x,y=y))+
  geom_line(colour='pink')+
  geom_abline(intercept=0,slope=1)+
  annotate('text',x=0.4,y=0.4,label=paste('AUC=',round(auc.small,digit=2)))+
  labs(x='False positive rate',y='sensitivity',title='ROC curve')

XGBoost application
#library(xgboost)