library(readxl)
earn <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Complete Data")
sam <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Sample for Model Development")
str(earn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1239 obs. of 11 variables:
## $ Company ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ DSRI : num 1.62 1 1 1.49 1 ...
## $ GMI : num 1.13 1.61 1.02 1 1.37 ...
## $ AQI : num 7.185 1.005 1.241 0.466 0.637 ...
## $ SGI : num 0.366 13.081 1.475 0.673 0.861 ...
## $ DEPI : num 1.38 0.4 1.17 2 1.45 ...
## $ SGAI : num 1.6241 5.1982 0.6477 0.0929 1.7415 ...
## $ ACCR : num -0.1668 0.0605 0.0367 0.2734 0.123 ...
## $ LEVI : num 1.161 0.987 1.264 0.681 0.939 ...
## $ Manipulater : chr "Yes" "Yes" "Yes" "Yes" ...
## $ C-MANIPULATOR: num 1 1 1 1 1 1 1 1 1 1 ...
str(sam)
## Classes 'tbl_df', 'tbl' and 'data.frame': 220 obs. of 11 variables:
## $ Company ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ DSRI : num 1.62 1 1 1.49 1 ...
## $ GMI : num 1.13 1.61 1.02 1 1.37 ...
## $ AQI : num 7.185 1.005 1.241 0.466 0.637 ...
## $ SGI : num 0.366 13.081 1.475 0.673 0.861 ...
## $ DEPI : num 1.38 0.4 1.17 2 1.45 ...
## $ SGAI : num 1.6241 5.1982 0.6477 0.0929 1.7415 ...
## $ ACCR : num -0.1668 0.0605 0.0367 0.2734 0.123 ...
## $ LEVI : num 1.161 0.987 1.264 0.681 0.939 ...
## $ Manipulator : chr "Yes" "Yes" "Yes" "Yes" ...
## $ C-MANIPULATOR: num 1 1 1 1 1 1 1 1 1 1 ...
earn$Manipulater <- factor(earn$Manipulater)
sam$Manipulator <- factor(sam$Manipulator)
dim(earn)
## [1] 1239 11
str(earn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1239 obs. of 11 variables:
## $ Company ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ DSRI : num 1.62 1 1 1.49 1 ...
## $ GMI : num 1.13 1.61 1.02 1 1.37 ...
## $ AQI : num 7.185 1.005 1.241 0.466 0.637 ...
## $ SGI : num 0.366 13.081 1.475 0.673 0.861 ...
## $ DEPI : num 1.38 0.4 1.17 2 1.45 ...
## $ SGAI : num 1.6241 5.1982 0.6477 0.0929 1.7415 ...
## $ ACCR : num -0.1668 0.0605 0.0367 0.2734 0.123 ...
## $ LEVI : num 1.161 0.987 1.264 0.681 0.939 ...
## $ Manipulater : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ C-MANIPULATOR: num 1 1 1 1 1 1 1 1 1 1 ...
# Checking Missing Values
library(VIM)
aggr_plot = aggr(earn,
col = c("red", "blue"),
numbers = TRUE,
prop = TRUE,
sortVars = TRUE,
labels = names(earn),
cex.axis = 1,
gap = 0,
ylab = c("Histogram of missing data", "Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Company ID 0
## DSRI 0
## GMI 0
## AQI 0
## SGI 0
## DEPI 0
## SGAI 0
## ACCR 0
## LEVI 0
## Manipulater 0
## C-MANIPULATOR 0
# This shows that there are no missing data in our Dataset, Good Start!
# converting all the variables into factor which have less than 4 unique values
col_names <- sapply(earn, function(col) length(unique(col)) < 4)
earn[ , col_names] <- lapply(earn[ , col_names] , factor)
str(earn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1239 obs. of 11 variables:
## $ Company ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ DSRI : num 1.62 1 1 1.49 1 ...
## $ GMI : num 1.13 1.61 1.02 1 1.37 ...
## $ AQI : num 7.185 1.005 1.241 0.466 0.637 ...
## $ SGI : num 0.366 13.081 1.475 0.673 0.861 ...
## $ DEPI : num 1.38 0.4 1.17 2 1.45 ...
## $ SGAI : num 1.6241 5.1982 0.6477 0.0929 1.7415 ...
## $ ACCR : num -0.1668 0.0605 0.0367 0.2734 0.123 ...
## $ LEVI : num 1.161 0.987 1.264 0.681 0.939 ...
## $ Manipulater : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ C-MANIPULATOR: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
library(Hmisc)
describe(earn$Manipulater)
## earn$Manipulater
## n missing distinct
## 1239 0 2
##
## Value No Yes
## Frequency 1200 39
## Proportion 0.969 0.031
# this suggests that there are only 3.1% data avaiable for manipulator
# Also, the Target Variable is highly unbalanced as the data is split in 1200:39 ratio
# Let us see the data graphically
ggplot(data = earn, mapping = aes(x =Manipulater)) +
geom_bar(color="black",fill="orange")+
ggtitle("How is the target variable is distributed")+
xlab("No->Not Manipulated 1->Manipulated")+
theme_bw()
#### Let us solve the Class Imbalance Problem with SMOTE Technique
ncols<-ncol(earn)
earn<-cbind(earn[1:ncols],earn[10])
earn <- earn[,-c(1,10)]
set.seed(1234)
index <- sample(1:2, nrow(earn), replace = TRUE, prob = c(0.7,0.3))
train <- earn[index == 1, ]
test <- earn[index == 2, ]
prop.table(table(train$Manipulater))
##
## No Yes
## 0.96453089 0.03546911
prop.table(table(test$Manipulater)) # high level of Imbalance
##
## No Yes
## 0.97808219 0.02191781
library(DMwR)
smote_train<-SMOTE(Manipulater ~ . , train,k=5,perc.over = 1400,perc.under=140)
prop.table(table(smote_train$Manipulater))
##
## No Yes
## 0.5662313 0.4337687
m <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Manipulator")
non_m <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Non-Manipulator")
# MANIPULATORS
dim(m)
## [1] 39 11
library(dplyr)
beneish_m <-m %>%
mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI - 0.172*SGAI + 4.679*ACCR - 0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)
# let us see the results visually
hist(beneish_m$m_score, # Save histogram as object
breaks = 30,
freq = T,
col = "thistle1", # Or use: col = colors() [626]
main = "Histogram for M-Score of Manipulators")
c<- count(beneish_m[-beneish_m$m_score<=-1.78,])
print(c)
## # A tibble: 1 x 1
## n
## <int>
## 1 7
# thus Beneish model could correctly identify 7 manipulator out of 39
Accuracy_predicting_manipulator <- c/nrow(m)
print(Accuracy_predicting_manipulator) # thus, we could predict manipulators with an accuracy of 17.94%
## n
## 1 0.1794872
# NON-MANIPULATORS
dim(non_m)
## [1] 1200 11
beneish_non_m <-non_m %>%
mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI - 0.172*SGAI + 4.679*ACCR - 0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)
# let us see the results visually
hist(beneish_non_m$m_score, # Save histogram as object
breaks = 30,
freq = T,
col = "thistle1", # Or use: col = colors() [626]
main = "Histogram for M-Score of Non-Manipulators")
c<- count(beneish_non_m[beneish_non_m$m_score<=-1.78,])
print(c)
## # A tibble: 1 x 1
## n
## <int>
## 1 1029
# thus Beneish model could correctly identify 1029 non-manipulator out of 1200
Accuracy_predicting_non_manipulator <- c/nrow(non_m)
print(Accuracy_predicting_non_manipulator) # thus, we could predict non-manipulators with an accuracy of 85.75%
## n
## 1 0.8575
# However, we are more interested in correctly finding firms who are doing manipulation, thus an accuracy of 17.94% would be very less for a good model
# THUS, BENEISH MODEL WOULD NOT BE A SUITABLE MODEL FOR PREDICTING EARNING MANIPULATIONS IN INDIA
sam <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Sample for Model Development")
# Under Sampling
str(sam)
## Classes 'tbl_df', 'tbl' and 'data.frame': 220 obs. of 11 variables:
## $ Company ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ DSRI : num 1.62 1 1 1.49 1 ...
## $ GMI : num 1.13 1.61 1.02 1 1.37 ...
## $ AQI : num 7.185 1.005 1.241 0.466 0.637 ...
## $ SGI : num 0.366 13.081 1.475 0.673 0.861 ...
## $ DEPI : num 1.38 0.4 1.17 2 1.45 ...
## $ SGAI : num 1.6241 5.1982 0.6477 0.0929 1.7415 ...
## $ ACCR : num -0.1668 0.0605 0.0367 0.2734 0.123 ...
## $ LEVI : num 1.161 0.987 1.264 0.681 0.939 ...
## $ Manipulator : chr "Yes" "Yes" "Yes" "Yes" ...
## $ C-MANIPULATOR: num 1 1 1 1 1 1 1 1 1 1 ...
dim(sam)
## [1] 220 11
sam$`Company ID`<- NULL
sam$Manipulator<- NULL
names(sam)[9] <- "Target"
sam$Target <- factor(sam$Target)
set.seed(1234)
index <- sample(1:2, nrow(sam), replace = TRUE, prob = c(0.7,0.3))
train <- sam[index == 1, ]
test <- sam[index == 2, ]
prop.table(table(train$Target))
##
## 0 1
## 0.8037975 0.1962025
prop.table(table(test$Target)) # high level of Imbalance
##
## 0 1
## 0.8709677 0.1290323
library(ROSE)
under <- ovun.sample(Target~., data=train, method = "under", N = 62)$data
table(under$Target)
##
## 0 1
## 31 31
# Let us try to do variable selection
full <- glm(Target~., data=under,family = "binomial")
null <- glm(Target~., data=under,family = "binomial")
# Forward selection
mod_forward<- step(null, scope=list(lower=null,upper=full),direction = "forward")
## Start: AIC=49.32
## Target ~ DSRI + GMI + AQI + SGI + DEPI + SGAI + ACCR + LEVI
summary(mod_forward)
##
## Call:
## glm(formula = Target ~ DSRI + GMI + AQI + SGI + DEPI + SGAI +
## ACCR + LEVI, family = "binomial", data = under)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.07018 -0.23945 -0.00006 0.18141 1.76210
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.4226 5.7093 -2.876 0.00402 **
## DSRI 5.0589 2.0096 2.517 0.01182 *
## GMI 1.6291 0.9614 1.694 0.09019 .
## AQI 2.4184 0.9520 2.540 0.01107 *
## SGI 8.9339 3.2512 2.748 0.00600 **
## DEPI -1.2134 1.0416 -1.165 0.24401
## SGAI -0.9488 0.4996 -1.899 0.05756 .
## ACCR 16.7136 6.0078 2.782 0.00540 **
## LEVI -3.2234 1.7600 -1.832 0.06702 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 85.950 on 61 degrees of freedom
## Residual deviance: 31.319 on 53 degrees of freedom
## AIC: 49.319
##
## Number of Fisher Scoring iterations: 10
options(scipens=999)
exp(coef(mod_forward))
## (Intercept) DSRI GMI AQI SGI DEPI
## 7.374546e-08 1.574198e+02 5.099191e+00 1.122782e+01 7.585086e+03 2.971750e-01
## SGAI ACCR LEVI
## 3.871904e-01 1.813944e+07 3.981997e-02
exp(cbind(OR = coef(mod_forward), confint(mod_forward)))
## OR 2.5 % 97.5 %
## (Intercept) 7.374546e-08 1.376497e-14 4.350312e-04
## DSRI 1.574198e+02 9.233737e+00 4.914377e+04
## GMI 5.099191e+00 6.729709e-01 4.631054e+01
## AQI 1.122782e+01 2.899242e+00 1.504357e+02
## SGI 7.585086e+03 5.714217e+01 4.929157e+07
## DEPI 2.971750e-01 1.995543e-02 1.935026e+00
## SGAI 3.871904e-01 9.883301e-02 1.633154e+00
## ACCR 1.813944e+07 2.437803e+03 9.251244e+13
## LEVI 3.981997e-02 7.612595e-04 1.827783e+00
predicted <- predict(mod_forward, newData=test, type="response")
# let us calculate the probabilties
probs <- exp(predicted)/(1+exp(predicted))
table(predicted, test$Target)
##
## predicted 0 1
## 8.14612176574424e-09 1 0
## 6.00658300781842e-05 1 0
## 0.000127550984281535 1 0
## 0.000276517266447538 1 0
## 0.000592902756840113 1 0
## 0.00151816267703678 1 0
## 0.00306947695982215 1 0
## 0.00448491530210303 1 0
## 0.00693782205087582 1 0
## 0.00813703786161421 0 1
## 0.0084108264237783 1 0
## 0.0086517959535934 0 1
## 0.0131710154828861 1 0
## 0.015949706589745 0 1
## 0.0257009226292954 1 0
## 0.0291403764734181 0 1
## 0.0681607999554298 1 0
## 0.098022565407098 0 1
## 0.129616755225215 1 0
## 0.154409601581103 0 1
## 0.211717163282882 1 0
## 0.243015813454845 1 0
## 0.25990269148561 1 0
## 0.287073946802408 0 1
## 0.297366151992658 1 0
## 0.299158668931278 0 1
## 0.307025016645982 1 0
## 0.321514923618005 1 0
## 0.340417111224406 1 0
## 0.364326897165939 1 0
## 0.371328420400767 1 0
## 0.371432010888463 1 0
## 0.382995925350319 1 0
## 0.4309840526756 1 0
## 0.471325378687997 1 0
## 0.639955906691414 1 0
## 0.726269296389403 1 0
## 0.822343871921667 1 0
## 0.878476004215433 1 0
## 0.882677391259849 1 0
## 0.889476435122441 1 0
## 0.893501177885515 1 0
## 0.900386667947128 1 0
## 0.960076361258178 1 0
## 0.964801483378529 1 0
## 0.965782692601884 1 0
## 0.982005342815065 1 0
## 0.988220542078743 1 0
## 0.989396207678045 1 0
## 0.991337635862956 1 0
## 0.991861094604328 1 0
## 0.9981119618788 1 0
## 0.999357636633555 1 0
## 0.999942778120127 1 0
## 0.99999720805383 1 0
## 0.999999305471319 1 0
## 0.99999999996668 1 0
## 1 5 0
# Recode factors
y_pred_num <- ifelse(predicted > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1),order=T)
y_act <- test$Target
# Accuracy
#mean(y_pred == y_act)
# 43.54% for 0.5 threshold
# 40.32% for 0.4 threshold
# 29.03% for 0.3 threshold
# 43.54% for 0.6 threshold
# 45.16% for 0.7 threshold
# with a misclassification rate of 57.46% we cannot conclude the significance of a variable based on just one sample. You have to create several samples before deciding the variable importance.
#run anova for checking the model
anova(mod_forward, test = 'Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Target
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 61 85.950
## DSRI 1 5.9990 60 79.951 0.01431 *
## GMI 1 0.0652 59 79.886 0.79849
## AQI 1 2.6542 58 77.232 0.10328
## SGI 1 20.5576 57 56.674 5.786e-06 ***
## DEPI 1 0.2450 56 56.429 0.62062
## SGAI 1 0.1501 55 56.279 0.69840
## ACCR 1 22.2174 54 34.062 2.435e-06 ***
## LEVI 1 2.7430 53 31.319 0.09768 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ROC Curve
roc.curve(test$Target, predicted)
## Area under the curve (AUC): 0.759
# Confusion Matrix
consufusion.1<-table(test$Target,y_pred)
consufusion.1
## y_pred
## 0 1
## 0 27 27
## 1 8 0
#visualize the result
library(pROC)
roc.small<-roc(test$Target,predicted)
x<-1-roc.small$specificities
y<-roc.small$sensitivities
auc.small<-roc.small$auc
# roc with ggplot
ggplot(data=NULL,mapping=aes(x=x,y=y))+
geom_line(colour='pink')+
geom_abline(intercept=0,slope=1)+
annotate('text',x=0.4,y=0.4,label=paste('AUC=',round(auc.small,digit=2)))+
labs(x='False positive rate',y='sensitivity',title='ROC curve')
#library(xgboost)