Load the data

library(readxl)
earn <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Complete Data")
sam <-  read_excel("IMB579-XLS-ENG.xlsx",
                   sheet = "Sample for Model Development")
# Backup of the data 
e<- earn
s<- sam

Basic checks and Manipulation

dim(earn)
## [1] 1239   11
dim(sam)
## [1] 220  11
# removing variables which are not required for analysis 
earn$`Company ID`<- NULL
earn$`C-MANIPULATOR`<- NULL

sam$`Company ID`<- NULL
sam$`C-MANIPULATOR`<- NULL

# changing name of the target variable 
names(earn)[9] <- "Target"
names(sam)[9] <- "Target"

str(earn)
## tibble [1,239 × 9] (S3: tbl_df/tbl/data.frame)
##  $ DSRI  : num [1:1239] 1.62 1 1 1.49 1 ...
##  $ GMI   : num [1:1239] 1.13 1.61 1.02 1 1.37 ...
##  $ AQI   : num [1:1239] 7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI   : num [1:1239] 0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI  : num [1:1239] 1.38 0.4 1.17 2 1.45 ...
##  $ SGAI  : num [1:1239] 1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR  : num [1:1239] -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI  : num [1:1239] 1.161 0.987 1.264 0.681 0.939 ...
##  $ Target: chr [1:1239] "Yes" "Yes" "Yes" "Yes" ...
str(sam)
## tibble [220 × 9] (S3: tbl_df/tbl/data.frame)
##  $ DSRI  : num [1:220] 1.62 1 1 1.49 1 ...
##  $ GMI   : num [1:220] 1.13 1.61 1.02 1 1.37 ...
##  $ AQI   : num [1:220] 7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI   : num [1:220] 0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI  : num [1:220] 1.38 0.4 1.17 2 1.45 ...
##  $ SGAI  : num [1:220] 1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR  : num [1:220] -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI  : num [1:220] 1.161 0.987 1.264 0.681 0.939 ...
##  $ Target: chr [1:220] "Yes" "Yes" "Yes" "Yes" ...
# Changing target variable to Factor 
earn$Target <- factor(earn$Target)
sam$Target <- factor(sam$Target)

# Checking Missing values in the data

colSums(is.na(earn))
##   DSRI    GMI    AQI    SGI   DEPI   SGAI   ACCR   LEVI Target 
##      0      0      0      0      0      0      0      0      0
colSums(is.na(sam))
##   DSRI    GMI    AQI    SGI   DEPI   SGAI   ACCR   LEVI Target 
##      0      0      0      0      0      0      0      0      0
# This shows that there are no missing data in our Dataset, Good Start!

Let us see how our target variable is in the data

library(Hmisc)
describe(earn$Target)
## earn$Target 
##        n  missing distinct 
##     1239        0        2 
##                       
## Value         No   Yes
## Frequency   1200    39
## Proportion 0.969 0.031
describe(sam$Target)
## sam$Target 
##        n  missing distinct 
##      220        0        2 
##                       
## Value         No   Yes
## Frequency    181    39
## Proportion 0.823 0.177
# Distribution of the Majority and Minority Class

table(earn$Target)
## 
##   No  Yes 
## 1200   39
table(sam$Target) # this shows that the sample data also contains the same no of minority class as that Complete data 
## 
##  No Yes 
## 181  39
# Let us see the data graphically 

ggplot(data = earn, mapping = aes(x =Target)) +
geom_bar(color="black",fill="orange")+
ggtitle("How is the target variable distributed")+
  xlab("No->Non Manipulators 1->Manipulators")+
  theme_bw()

# Insight: the Target Variable is highly unbalanced as the data is split in 1200:39 ratio and 181:39 in complete and sample data 

Let us solve the Class Imabalance using sampling techniques like under/over sampling and SMOTE

library(UBL)

# Complete Data 

# let us split the data into Training and Testing data

set.seed(1234)
index <- sample(1:2, nrow(earn), replace = TRUE, prob = c(0.7,0.3))
train_com <- earn[index == 1, ]
test_com <- earn[index == 2, ]

# sampling the data from 3 fronts 

table(train_com$Target)
## 
##  No Yes 
## 843  31
smote_com <- SmoteClassif(Target~.,as.data.frame(train_com), "balance")
table(smote_com$Target)
## 
##  No Yes 
## 437 437
prop.table(table(smote_com$Target))
## 
##  No Yes 
## 0.5 0.5
under_com <- RandUnderClassif(Target~.,as.data.frame(train_com), "balance")
table(under_com$Target)
## 
##  No Yes 
##  31  31
prop.table(table(under_com$Target))
## 
##  No Yes 
## 0.5 0.5
over_com <- RandOverClassif(Target~.,as.data.frame(train_com), "balance")
table(over_com$Target)
## 
##  No Yes 
## 843 843
prop.table(table(over_com$Target))
## 
##  No Yes 
## 0.5 0.5
# Sample Data 

# let us split the data into Training and Testing data 
set.seed(9999)
index1 <- sample(1:2, nrow(sam), replace = TRUE, prob = c(0.7,0.3))
train_sam <- sam[index1 == 1, ]
test_sam <- sam[index1 == 2, ]

# sampling the data from three fronts 

table(train_sam$Target)
## 
##  No Yes 
## 137  23
smote_sam <- SmoteClassif(Target~.,as.data.frame(train_sam), "balance")
table(smote_sam$Target)
## 
##  No Yes 
##  80  79
prop.table(table(smote_sam$Target))
## 
##        No       Yes 
## 0.5031447 0.4968553
under_sam <- RandUnderClassif(Target~.,as.data.frame(train_sam), "balance")
table(under_sam$Target)
## 
##  No Yes 
##  23  23
prop.table(table(under_sam$Target))
## 
##  No Yes 
## 0.5 0.5
over_sam <- RandOverClassif(Target~.,as.data.frame(train_sam), "balance")
table(over_sam$Target)
## 
##  No Yes 
## 137 137
prop.table(table(over_sam$Target))
## 
##  No Yes 
## 0.5 0.5

Question 1

Part (a)

Let us try to put Beneish Model for Elimentary Analysis

Note: We should note that this model was not created for financial institutions like Banks

Using Manipulator and Non-Manipulator Sheet, building a Beneish Model

m <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Manipulator")
non_m <-  read_excel("IMB579-XLS-ENG.xlsx",sheet = "Non-Manipulator")

# MANIPULATORS 
dim(m)
## [1] 39 11
library(dplyr)
beneish_m <-m %>%
  mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI -  0.172*SGAI + 4.679*ACCR -  0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)

# let us see the results visually 
hist(beneish_m$m_score,  # Save histogram as object
     breaks = 30,
     freq = T,
     col = "thistle1", # Or use: col = colors() [626]
     main = "Histogram for M-Score of Manipulators")

c<- count(beneish_m[-beneish_m$m_score<=-1.78,])
print(c)
## # A tibble: 1 x 1
##       n
##   <int>
## 1     7
# thus Beneish model could correctly identify 7 manipulator out of 39 

Accuracy_predicting_manipulator <- c/nrow(m)
print(Accuracy_predicting_manipulator) # thus, we could predict manipulators with an accuracy of 17.94% 
##           n
## 1 0.1794872
# NON-MANIPULATORS 

dim(non_m)
## [1] 1200   11
beneish_non_m <-non_m %>%
  mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI -  0.172*SGAI + 4.679*ACCR -  0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)

# let us see the results visually 
hist(beneish_non_m$m_score,  # Save histogram as object
     breaks = 30,
     freq = T,
     col = "thistle1", # Or use: col = colors() [626]
     main = "Histogram for M-Score of Non-Manipulators")

c<- count(beneish_non_m[beneish_non_m$m_score<=-1.78,])
print(c)
## # A tibble: 1 x 1
##       n
##   <int>
## 1  1029
# thus Beneish model could correctly identify 1029 non-manipulator out of 1200 

Accuracy_predicting_non_manipulator <- c/nrow(non_m)
print(Accuracy_predicting_non_manipulator) # thus, we could predict non-manipulators with an accuracy of 85.75% 
##        n
## 1 0.8575
# Insight : Since we are more interested in correctly finding firms who are doing manipulation, thus an accuracy of 17.94% would be very less for a good model THUS, BENEISH MODEL WOULD NOT BE A SUITABLE MODEL FOR PREDICTING EARNING MANIPULATIONS IN INDIA

Part (b)

The number of manipulators is usually much less than non-manipulators (in the accompanying spreadsheet, the percentage of manipulators is less than 4% in the complete data).

The type of modeling we can do here is as below:

  1. Logistic regression with balancing the data
  2. Random Forest
  3. Decision trees
  4. Boosting(Adaboost and XGBoost)

Other comments: a) We can handle unbalanced data with bootstraping as this emsemble technique will help in reassigning weights to the unalanced classes b) we can also do Cross Validation like 10-fold to reassign and get better models which can predict who is a manipulator

How can one handle unbalanced problems?

We can handle unbalanced problems with various sampling techniques like a) Undersampling b) Oversampling c) SMOTE

Part (c)

Let us build a Logistic regression model using Undersampling technique

# undersampling has been applied under the Class Imbalance section 

# Let us check the sampling and distribution 

# original Sample Data 
table(sam$Target)
## 
##  No Yes 
## 181  39
prop.table(table(sam$Target)) # High Imbalance 
## 
##        No       Yes 
## 0.8227273 0.1772727
# Training Data 
table(train_sam$Target)
## 
##  No Yes 
## 137  23
prop.table(table(train_sam$Target)) # further Imbalance 
## 
##      No     Yes 
## 0.85625 0.14375
# Undersampling Data 
table(under_sam$Target)
## 
##  No Yes 
##  23  23
prop.table(table(under_sam$Target)) # finally the data is balanced 
## 
##  No Yes 
## 0.5 0.5
# Insight : Undersampling is not an advisable technique as we lose alot of valuable information which was stored in the original dataset, we can further investigate this insight 

# Let us try to do implement stepwise logistic regression 

full <- glm(Target~., data=under_sam,family = "binomial")
null <- glm(Target~1, data=under_sam,family = "binomial")

# Forward selection

mod_und<- step(full,scope=list(lower=null,upper=full),
                       direction = "both")
## Start:  AIC=45.4
## Target ~ DSRI + GMI + AQI + SGI + DEPI + SGAI + ACCR + LEVI
## 
##        Df Deviance    AIC
## - GMI   1   27.523 43.523
## - DEPI  1   27.683 43.683
## - LEVI  1   27.789 43.789
## - SGAI  1   28.563 44.563
## <none>      27.399 45.399
## - SGI   1   32.963 48.963
## - DSRI  1   34.286 50.286
## - AQI   1   38.126 54.126
## - ACCR  1   47.305 63.305
## 
## Step:  AIC=43.52
## Target ~ DSRI + AQI + SGI + DEPI + SGAI + ACCR + LEVI
## 
##        Df Deviance    AIC
## - LEVI  1   27.834 41.834
## - DEPI  1   27.881 41.881
## - SGAI  1   28.658 42.658
## <none>      27.523 43.523
## + GMI   1   27.399 45.399
## - SGI   1   34.885 48.885
## - DSRI  1   35.129 49.129
## - AQI   1   39.447 53.447
## - ACCR  1   47.342 61.342
## 
## Step:  AIC=41.83
## Target ~ DSRI + AQI + SGI + DEPI + SGAI + ACCR
## 
##        Df Deviance    AIC
## - DEPI  1   28.155 40.155
## - SGAI  1   28.674 40.674
## <none>      27.834 41.834
## + LEVI  1   27.523 43.523
## + GMI   1   27.789 43.789
## - DSRI  1   35.130 47.130
## - SGI   1   35.188 47.188
## - AQI   1   42.860 54.860
## - ACCR  1   48.217 60.217
## 
## Step:  AIC=40.15
## Target ~ DSRI + AQI + SGI + SGAI + ACCR
## 
##        Df Deviance    AIC
## - SGAI  1   28.699 38.699
## <none>      28.155 40.155
## + DEPI  1   27.834 41.834
## + LEVI  1   27.881 41.881
## + GMI   1   28.065 42.065
## - DSRI  1   35.215 45.215
## - SGI   1   37.693 47.693
## - AQI   1   43.475 53.475
## - ACCR  1   50.649 60.649
## 
## Step:  AIC=38.7
## Target ~ DSRI + AQI + SGI + ACCR
## 
##        Df Deviance    AIC
## <none>      28.699 38.699
## + SGAI  1   28.155 40.155
## + GMI   1   28.614 40.614
## + DEPI  1   28.674 40.674
## + LEVI  1   28.675 40.675
## - SGI   1   37.846 45.846
## - DSRI  1   39.748 47.748
## - AQI   1   45.110 53.110
## - ACCR  1   52.094 60.094
summary(mod_und)
## 
## Call:
## glm(formula = Target ~ DSRI + AQI + SGI + ACCR, family = "binomial", 
##     data = under_sam)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.95491  -0.41745  -0.05036   0.34078   2.25001  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -6.3724     2.7114  -2.350  0.01876 * 
## DSRI          0.7267     0.5750   1.264  0.20633   
## AQI           0.3871     0.1724   2.246  0.02469 * 
## SGI           3.5195     1.7354   2.028  0.04255 * 
## ACCR         19.4569     7.3948   2.631  0.00851 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 63.770  on 45  degrees of freedom
## Residual deviance: 28.699  on 41  degrees of freedom
## AIC: 38.699
## 
## Number of Fisher Scoring iterations: 8
# Important Variables for Undersampling model 
# AQI
# SGI
# ACCR - Most Important 

# let us tidy the model 
library (devtools)
library (broom)
tidy(mod_und)
## # A tibble: 5 x 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   -6.37      2.71      -2.35 0.0188 
## 2 DSRI           0.727     0.575      1.26 0.206  
## 3 AQI            0.387     0.172      2.25 0.0247 
## 4 SGI            3.52      1.74       2.03 0.0425 
## 5 ACCR          19.5       7.39       2.63 0.00851
# Checking Odds Ratio 
exp(coef(mod_und))
##  (Intercept)         DSRI          AQI          SGI         ACCR 
## 1.708097e-03 2.068192e+00 1.472774e+00 3.376802e+01 2.818513e+08
exp(cbind(OR = coef(mod_und), confint(mod_und)))
##                       OR        2.5 %       97.5 %
## (Intercept) 1.708097e-03 1.358528e-06 9.701667e-02
## DSRI        2.068192e+00 1.159864e+00 9.331341e+00
## AQI         1.472774e+00 1.136226e+00 2.317320e+00
## SGI         3.376802e+01 2.455897e+00 2.832386e+03
## ACCR        2.818513e+08 4.351163e+03 3.333050e+16
# with every one unit increase in AQI, the odds of being a Manipulator increase by 1.47
# with every one unit increase in SGI, the odds of being a Manipulator increase by 33.76
# with every one unit increase in ACCR, the odds of being a Manipulator increase by 281851324

# Let us do some prediction
predicted <- predict(mod_und, newdata=test_sam, type="response")

# let us calculate the probabilties for Manipulator CLass 

probs <- exp(predicted)/(1+exp(predicted)) # this is the probability of 1 class - meaning Manipulators 
print(probs)
##         1         2         3         4         5         6         7         8 
## 0.5031334 0.6631134 0.7061923 0.7295942 0.5301354 0.7273306 0.5970467 0.6536270 
##         9        10        11        12        13        14        15        16 
## 0.7250499 0.6734699 0.7220515 0.7310586 0.5009815 0.7310578 0.5000020 0.7242812 
##        17        18        19        20        21        22        23        24 
## 0.6281579 0.5843709 0.5421352 0.5010349 0.5200838 0.5856523 0.5766283 0.5170665 
##        25        26        27        28        29        30        31        32 
## 0.5099944 0.5231746 0.5324276 0.5575018 0.5194507 0.5149051 0.5149108 0.5536051 
##        33        34        35        36        37        38        39        40 
## 0.5145833 0.5033140 0.5437360 0.6968706 0.5328364 0.5933025 0.5769681 0.5421692 
##        41        42        43        44        45        46        47        48 
## 0.5742079 0.6408721 0.5345717 0.5034593 0.5114125 0.5057671 0.5245171 0.6771183 
##        49        50        51        52        53        54        55        56 
## 0.5602461 0.5254146 0.6900247 0.6623328 0.5093897 0.5010648 0.5315097 0.5145201 
##        57        58        59        60 
## 0.6204486 0.5036962 0.5099797 0.5251903
# Insight: After Running the Stepwise logistic regression using undersampling technique, we could see that ACCR, SGI, AQI are the major predictors for finding out who will be the Manipulators 

Part (d) and Part (e)

   Reference

Prediction No Yes No 38 5 Yes 6 11

‘Positive’ Class : Yes

Hypothesis and Business Logic

Given the above matrix we can interpret it as:

Recall - Our model predicts people as manipulators and they are actually manipulators - Paramter 1 which needs to be best for our model

1 - Specificity - False Positive Rate - Our model predicts people as non - manipulators and they are actually manipulators - Paramter 2 which needs to be best for our model

Cost Evaluation

Let us say that th ebank loans 10,00,000 INR to a person

if the model says that this person can be a manipulator but in actual he is not - FALSE NEGATIVE RATE - the bank may lose a potential interest which they might earn from the person

if the interest is @10% then - the BANK LOSES 1,00,000 - in a very basic terminology

However, if the model says that this person is not a manipulator but in actual he - FALSE Positive RATE - the bank may lose the full loan amount - 10,00,000

Thus, COST of FPR to COST of FNR is 10:1

# Model Evaluation

#run anova for checking the model 
anova(mod_und, test = 'Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Target
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                    45     63.770             
## DSRI  1   2.5330        44     61.237 0.111488    
## AQI   1   2.4498        43     58.787 0.117542    
## SGI   1   6.6930        42     52.094 0.009679 ** 
## ACCR  1  23.3944        41     28.699 1.32e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA shows that SGI AND ACCR are the variables which show the most significance 

# Recode Predicted to factors
y_pred <- ifelse(predicted>0.5,"Yes","No") # let us recode with cut off of 0.5
table(y_pred,test_sam$Target)
##       
## y_pred No Yes
##    No  38   5
##    Yes  6  11
# Confusion Matrix 
library(caret)
confusionMatrix(factor(y_pred),test_sam$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  38   5
##        Yes  6  11
##                                           
##                Accuracy : 0.8167          
##                  95% CI : (0.6956, 0.9048)
##     No Information Rate : 0.7333          
##     P-Value [Acc > NIR] : 0.09107         
##                                           
##                   Kappa : 0.5404          
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.6875          
##             Specificity : 0.8636          
##          Pos Pred Value : 0.6471          
##          Neg Pred Value : 0.8837          
##              Prevalence : 0.2667          
##          Detection Rate : 0.1833          
##    Detection Prevalence : 0.2833          
##       Balanced Accuracy : 0.7756          
##                                           
##        'Positive' Class : Yes             
## 
# Let us try different probabilities to check various cut off points
library(caret)
for (i in seq(0.3,0.9,by=0.05))
  {
  y_pred <- ifelse(predicted>i,"Yes","No")
  c<- confusionMatrix(factor(y_pred),test_sam$Target, positive = "Yes")
       print(i)
       print(c$table)
       print(c$byClass[1])
       print(c$byClass[2])
}
## [1] 0.3
##           Reference
## Prediction No Yes
##        No  32   4
##        Yes 12  12
## Sensitivity 
##        0.75 
## Specificity 
##   0.7272727 
## [1] 0.35
##           Reference
## Prediction No Yes
##        No  36   4
##        Yes  8  12
## Sensitivity 
##        0.75 
## Specificity 
##   0.8181818 
## [1] 0.4
##           Reference
## Prediction No Yes
##        No  37   5
##        Yes  7  11
## Sensitivity 
##      0.6875 
## Specificity 
##   0.8409091 
## [1] 0.45
##           Reference
## Prediction No Yes
##        No  37   5
##        Yes  7  11
## Sensitivity 
##      0.6875 
## Specificity 
##   0.8409091 
## [1] 0.5
##           Reference
## Prediction No Yes
##        No  38   5
##        Yes  6  11
## Sensitivity 
##      0.6875 
## Specificity 
##   0.8636364 
## [1] 0.55
##           Reference
## Prediction No Yes
##        No  39   5
##        Yes  5  11
## Sensitivity 
##      0.6875 
## Specificity 
##   0.8863636 
## [1] 0.6
##           Reference
## Prediction No Yes
##        No  40   5
##        Yes  4  11
## Sensitivity 
##      0.6875 
## Specificity 
##   0.9090909 
## [1] 0.65
##           Reference
## Prediction No Yes
##        No  40   6
##        Yes  4  10
## Sensitivity 
##       0.625 
## Specificity 
##   0.9090909 
## [1] 0.7
##           Reference
## Prediction No Yes
##        No  41   7
##        Yes  3   9
## Sensitivity 
##      0.5625 
## Specificity 
##   0.9318182 
## [1] 0.75
##           Reference
## Prediction No Yes
##        No  42   8
##        Yes  2   8
## Sensitivity 
##         0.5 
## Specificity 
##   0.9545455 
## [1] 0.8
##           Reference
## Prediction No Yes
##        No  42   8
##        Yes  2   8
## Sensitivity 
##         0.5 
## Specificity 
##   0.9545455 
## [1] 0.85
##           Reference
## Prediction No Yes
##        No  44   8
##        Yes  0   8
## Sensitivity 
##         0.5 
## Specificity 
##           1 
## [1] 0.9
##           Reference
## Prediction No Yes
##        No  44   9
##        Yes  0   7
## Sensitivity 
##      0.4375 
## Specificity 
##           1
# Insight : Cutoff value of 0.6 bears the best result for the present case - BEST RECALL AND MIN False Positive Rate 

# ROC Curve with ROCR package 

library(ROCR)
pred <- predict(mod_und, test_sam, type="response")
pred1 <- prediction(pred,test_sam$Target)
eval <- performance(pred1, "tpr","fpr")
plot(eval) # ROC Curve 

# finding yonden's index best cut-off Probability with formula 

opt.cut <- function(ROCrf){
  cut.ind <- mapply(FUN = function(x,y,p){yi=(y+(1-x)-1)
  ind<- which(yi==max(yi))
  c(recall = y[[ind]], specificity = 1-x[[ind]],cutoff = p[[ind]])},ROCrf@x.values, ROCrf@y.values,ROCrf@alpha.values) }

print(opt.cut(eval)) # thus, the optimal cut off from Youden's Index is 0.635 
##                  [,1]
## recall      0.6875000
## specificity 0.9090909
## cutoff      0.6350208
# Cost Calculation 

cost.perf = performance(pred1, "cost",cost.fn=1,cost.fp=10)
pred1@cutoffs[[1]] [which.min(cost.perf@y.values[[1]])]
##         3 
## 0.8769623
# thus, the optimal cut off point comes out to be 0.8769 


# Confusion Matrix with Youden's index and Cost Cutoff

# Youden's - 0.635
y_pred_youden <- ifelse(predicted>0.635,"Yes","No") 
confusionMatrix(factor(y_pred_youden),test_sam$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  40   5
##        Yes  4  11
##                                          
##                Accuracy : 0.85           
##                  95% CI : (0.7343, 0.929)
##     No Information Rate : 0.7333         
##     P-Value [Acc > NIR] : 0.02411        
##                                          
##                   Kappa : 0.6087         
##                                          
##  Mcnemar's Test P-Value : 1.00000        
##                                          
##             Sensitivity : 0.6875         
##             Specificity : 0.9091         
##          Pos Pred Value : 0.7333         
##          Neg Pred Value : 0.8889         
##              Prevalence : 0.2667         
##          Detection Rate : 0.1833         
##    Detection Prevalence : 0.2500         
##       Balanced Accuracy : 0.7983         
##                                          
##        'Positive' Class : Yes            
## 
# this is good, we have further reduced our False Positive Rate 

# Cost Parameter - 0.8769
y_pred <- ifelse(predicted>0.8769,"Yes","No") 
confusionMatrix(factor(y_pred),test_sam$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  44   8
##        Yes  0   8
##                                           
##                Accuracy : 0.8667          
##                  95% CI : (0.7541, 0.9406)
##     No Information Rate : 0.7333          
##     P-Value [Acc > NIR] : 0.01050         
##                                           
##                   Kappa : 0.5946          
##                                           
##  Mcnemar's Test P-Value : 0.01333         
##                                           
##             Sensitivity : 0.5000          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.8462          
##              Prevalence : 0.2667          
##          Detection Rate : 0.1333          
##    Detection Prevalence : 0.1333          
##       Balanced Accuracy : 0.7500          
##                                           
##        'Positive' Class : Yes             
## 
# we have reduced our FPR to 0 however, the RECALL now is 50% 

# thus, we would choose Youden's Index as the cut-off for our model

Part(g) - CART Model

# original Complete Data 
table(earn$Target)
## 
##   No  Yes 
## 1200   39
prop.table(table(earn$Target)) # High Imbalance 
## 
##       No      Yes 
## 0.968523 0.031477
# Training Data 
table(train_com$Target)
## 
##  No Yes 
## 843  31
prop.table(table(train_sam$Target)) # further Imbalance 
## 
##      No     Yes 
## 0.85625 0.14375
# Testing Data 
table(test_com$Target)
## 
##  No Yes 
## 357   8
prop.table(table(test_sam$Target)) # further Imbalance 
## 
##        No       Yes 
## 0.7333333 0.2666667
# Sampled Data 
table(under_com$Target)
## 
##  No Yes 
##  31  31
table(over_com$Target)
## 
##  No Yes 
## 843 843
table(smote_com$Target)
## 
##  No Yes 
## 437 437
# Model 1 - Without sampling 

library(rpart)

rpart_tree1 <- rpart(Target~., 
                     data=train_com, 
                     method="class",
                     parms = list(split = "gini"),
                     control = rpart.control(mincriterion = 0.95,
                                              maxdepth = 4))

# Model 2 - Undersampling 

rpart_tree2 <- rpart(Target~., 
                     data=under_com, 
                     method="class",
                     parms = list(split = "gini"),
                     control = rpart.control(mincriterion = 0.95,
                                              maxdepth = 4))
# Model 3 - Oversampling 

rpart_tree3 <- rpart(Target~., 
                     data=over_com, 
                     method="class",
                     parms = list(split = "gini"),
                     control = rpart.control(mincriterion = 0.95,
                                              maxdepth = 4))
# Model 4 - SMOTE 

rpart_tree4 <- rpart(Target~., 
                     data=smote_com, 
                     method="class",
                     parms = list(split = "gini"),
                     control = rpart.control(mincriterion = 0.95,
                                              maxdepth = 4))


# Checking which model performs best on Testing data 

# Model 1 

pred <- predict(mod_und, test_sam, type="response")
pred1 <- prediction(pred,test_sam$Target)
eval <- performance(pred1, "tpr","fpr")
plot(eval) # ROC Curve 

predicted <- predict(rpart_tree1,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  352   5
##        Yes   5   3
##                                           
##                Accuracy : 0.9726          
##                  95% CI : (0.9502, 0.9868)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 0.8181          
##                                           
##                   Kappa : 0.361           
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.375000        
##             Specificity : 0.985994        
##          Pos Pred Value : 0.375000        
##          Neg Pred Value : 0.985994        
##              Prevalence : 0.021918        
##          Detection Rate : 0.008219        
##    Detection Prevalence : 0.021918        
##       Balanced Accuracy : 0.680497        
##                                           
##        'Positive' Class : Yes             
## 
# Model 2 - Under Sampling 

predicted <- predict(rpart_tree2,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  204   1
##        Yes 153   7
##                                           
##                Accuracy : 0.5781          
##                  95% CI : (0.5256, 0.6293)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0434          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.87500         
##             Specificity : 0.57143         
##          Pos Pred Value : 0.04375         
##          Neg Pred Value : 0.99512         
##              Prevalence : 0.02192         
##          Detection Rate : 0.01918         
##    Detection Prevalence : 0.43836         
##       Balanced Accuracy : 0.72321         
##                                           
##        'Positive' Class : Yes             
## 
# Model 3 - Over Sampling 

predicted <- predict(rpart_tree3,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  301   4
##        Yes  56   4
##                                           
##                Accuracy : 0.8356          
##                  95% CI : (0.7935, 0.8722)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0821          
##                                           
##  Mcnemar's Test P-Value : 4.577e-11       
##                                           
##             Sensitivity : 0.50000         
##             Specificity : 0.84314         
##          Pos Pred Value : 0.06667         
##          Neg Pred Value : 0.98689         
##              Prevalence : 0.02192         
##          Detection Rate : 0.01096         
##    Detection Prevalence : 0.16438         
##       Balanced Accuracy : 0.67157         
##                                           
##        'Positive' Class : Yes             
## 
# Model 4 - SMOTE 

predicted <- predict(rpart_tree4,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  278   2
##        Yes  79   6
##                                           
##                Accuracy : 0.7781          
##                  95% CI : (0.7319, 0.8197)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0927          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.75000         
##             Specificity : 0.77871         
##          Pos Pred Value : 0.07059         
##          Neg Pred Value : 0.99286         
##              Prevalence : 0.02192         
##          Detection Rate : 0.01644         
##    Detection Prevalence : 0.23288         
##       Balanced Accuracy : 0.76436         
##                                           
##        'Positive' Class : Yes             
## 
#Insight - the best trade off from the above model is coming from SMOTE 
# Where RECALL IS 75% and FPR is 22.2%

Part(h) - Logistic regression on Complete data

# original Complete Data 
table(earn$Target)
## 
##   No  Yes 
## 1200   39
prop.table(table(earn$Target)) # High Imbalance 
## 
##       No      Yes 
## 0.968523 0.031477
# Training Data 
table(train_com$Target)
## 
##  No Yes 
## 843  31
prop.table(table(train_sam$Target)) # further Imbalance 
## 
##      No     Yes 
## 0.85625 0.14375
# Testing Data 
table(test_com$Target)
## 
##  No Yes 
## 357   8
prop.table(table(test_sam$Target)) # further Imbalance 
## 
##        No       Yes 
## 0.7333333 0.2666667
# Sampled Data 
table(under_com$Target)
## 
##  No Yes 
##  31  31
table(over_com$Target)
## 
##  No Yes 
## 843 843
table(smote_com$Target)
## 
##  No Yes 
## 437 437
# Model 1 - without any sampling 

model_logit <- glm(Target~.,data=train_com,family = "binomial")

# Model 2 - Undersampling 

model_und_logit <- glm(Target~.,data=under_com,family = "binomial")

# Model 3 - Oversampling 

model_ovr_logit <- glm(Target~.,data=over_com,family = "binomial")

# Model 4 - SMOTE 

model_smo_logit <- glm(Target~.,data=smote_com,family = "binomial")


# Checking the Summary and performance of the model with AIC 
summary(model_logit) # AIC 199
## 
## Call:
## glm(formula = Target ~ ., family = "binomial", data = train_com)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0124  -0.2100  -0.1646  -0.1315   3.3716  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.21908    1.08201  -6.672 2.53e-11 ***
## DSRI         0.86708    0.23736   3.653 0.000259 ***
## GMI          0.05289    0.29781   0.178 0.859029    
## AQI          0.26773    0.08394   3.190 0.001424 ** 
## SGI          1.76046    0.40163   4.383 1.17e-05 ***
## DEPI        -0.08415    0.60158  -0.140 0.888750    
## SGAI         0.48359    0.28799   1.679 0.093119 .  
## ACCR         5.57172    1.43447   3.884 0.000103 ***
## LEVI        -0.40503    0.29307  -1.382 0.166955    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 267.91  on 873  degrees of freedom
## Residual deviance: 181.31  on 865  degrees of freedom
## AIC: 199.31
## 
## Number of Fisher Scoring iterations: 7
summary(model_und_logit) # AIC 47 
## 
## Call:
## glm(formula = Target ~ ., family = "binomial", data = under_com)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.40565  -0.20856  -0.00468   0.21791   1.64073  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -20.3065     8.5597  -2.372   0.0177 *
## DSRI          8.9638     3.7536   2.388   0.0169 *
## GMI           1.4028     1.4397   0.974   0.3299  
## AQI           0.9820     0.3823   2.569   0.0102 *
## SGI           7.2108     3.1904   2.260   0.0238 *
## DEPI         -0.5170     2.3183  -0.223   0.8235  
## SGAI         -0.5144     1.2203  -0.422   0.6734  
## ACCR         15.2658     6.1781   2.471   0.0135 *
## LEVI         -0.8619     1.2475  -0.691   0.4896  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 85.950  on 61  degrees of freedom
## Residual deviance: 29.429  on 53  degrees of freedom
## AIC: 47.429
## 
## Number of Fisher Scoring iterations: 11
summary(model_ovr_logit) # AIC 1290 
## 
## Call:
## glm(formula = Target ~ ., family = "binomial", data = over_com)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.6714  -0.5478   0.0000   0.6085   1.6515  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.59255    0.55405 -15.509  < 2e-16 ***
## DSRI         2.19141    0.16703  13.119  < 2e-16 ***
## GMI          1.22521    0.16487   7.431 1.08e-13 ***
## AQI          0.66398    0.04882  13.601  < 2e-16 ***
## SGI          3.14291    0.22922  13.712  < 2e-16 ***
## DEPI         0.42734    0.19636   2.176   0.0295 *  
## SGAI        -0.14083    0.12314  -1.144   0.2528    
## ACCR         8.65907    0.61299  14.126  < 2e-16 ***
## LEVI        -0.79353    0.14463  -5.487 4.10e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2337.3  on 1685  degrees of freedom
## Residual deviance: 1272.9  on 1677  degrees of freedom
## AIC: 1290.9
## 
## Number of Fisher Scoring iterations: 8
summary(model_smo_logit) # AIC 666
## 
## Call:
## glm(formula = Target ~ ., family = "binomial", data = smote_com)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2225  -0.5480  -0.0191   0.5567   1.6834  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.03608    0.90787  -9.953  < 2e-16 ***
## DSRI         2.58151    0.29095   8.873  < 2e-16 ***
## GMI          0.83156    0.21210   3.921 8.84e-05 ***
## AQI          0.58472    0.07195   8.127 4.42e-16 ***
## SGI          3.88480    0.39576   9.816  < 2e-16 ***
## DEPI         0.07938    0.27013   0.294  0.76887    
## SGAI        -0.10741    0.20496  -0.524  0.60023    
## ACCR        10.37650    1.04277   9.951  < 2e-16 ***
## LEVI        -0.78890    0.26436  -2.984  0.00284 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1211.62  on 873  degrees of freedom
## Residual deviance:  648.81  on 865  degrees of freedom
## AIC: 666.81
## 
## Number of Fisher Scoring iterations: 9
# Checking which model performs best on Testing data 

# Model 1 - without Sampling 

Pred1 <- predict(model_logit, test_com,type = "response")
y_pred <- ifelse(Pred1 > 0.6, "Yes","No")
confusionMatrix(factor(y_pred),test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  356   7
##        Yes   1   1
##                                           
##                Accuracy : 0.9781          
##                  95% CI : (0.9573, 0.9905)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 0.5926          
##                                           
##                   Kappa : 0.1929          
##                                           
##  Mcnemar's Test P-Value : 0.0771          
##                                           
##             Sensitivity : 0.125000        
##             Specificity : 0.997199        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.980716        
##              Prevalence : 0.021918        
##          Detection Rate : 0.002740        
##    Detection Prevalence : 0.005479        
##       Balanced Accuracy : 0.561099        
##                                           
##        'Positive' Class : Yes             
## 
# Model 2 - Under Sampling 

Pred2 <- predict(model_und_logit, test_com,type = "response")
y_pred <- ifelse(Pred2 > 0.6, "Yes","No")
confusionMatrix(factor(y_pred),test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  303   1
##        Yes  54   7
##                                           
##                Accuracy : 0.8493          
##                  95% CI : (0.8084, 0.8844)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1708          
##                                           
##  Mcnemar's Test P-Value : 2.355e-12       
##                                           
##             Sensitivity : 0.87500         
##             Specificity : 0.84874         
##          Pos Pred Value : 0.11475         
##          Neg Pred Value : 0.99671         
##              Prevalence : 0.02192         
##          Detection Rate : 0.01918         
##    Detection Prevalence : 0.16712         
##       Balanced Accuracy : 0.86187         
##                                           
##        'Positive' Class : Yes             
## 
# Model 3 - Over Sampling 

Pred3 <- predict(model_ovr_logit, test_com,type = "response")
y_pred <- ifelse(Pred3 > 0.6, "Yes","No")
confusionMatrix(factor(y_pred),test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  322   2
##        Yes  35   6
##                                          
##                Accuracy : 0.8986         
##                  95% CI : (0.863, 0.9276)
##     No Information Rate : 0.9781         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2161         
##                                          
##  Mcnemar's Test P-Value : 1.435e-07      
##                                          
##             Sensitivity : 0.75000        
##             Specificity : 0.90196        
##          Pos Pred Value : 0.14634        
##          Neg Pred Value : 0.99383        
##              Prevalence : 0.02192        
##          Detection Rate : 0.01644        
##    Detection Prevalence : 0.11233        
##       Balanced Accuracy : 0.82598        
##                                          
##        'Positive' Class : Yes            
## 
# Model 4 - SMOTE 

Pred4 <- predict(model_smo_logit, test_com,type = "response")
y_pred <- ifelse(Pred4 > 0.6, "Yes","No")
confusionMatrix(factor(y_pred),test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  319   2
##        Yes  38   6
##                                           
##                Accuracy : 0.8904          
##                  95% CI : (0.8538, 0.9205)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2011          
##                                           
##  Mcnemar's Test P-Value : 3.13e-08        
##                                           
##             Sensitivity : 0.75000         
##             Specificity : 0.89356         
##          Pos Pred Value : 0.13636         
##          Neg Pred Value : 0.99377         
##              Prevalence : 0.02192         
##          Detection Rate : 0.01644         
##    Detection Prevalence : 0.12055         
##       Balanced Accuracy : 0.82178         
##                                           
##        'Positive' Class : Yes             
## 
# Over Sampling Model performs best when put on Logistic Regression
# we get a best trade off with RECALL - 75%, FPR - 9.09% and Accuracy of 89.86%

Part(i) - Emsemble Techniques

Random Forest

# original Complete Data 
table(earn$Target)
## 
##   No  Yes 
## 1200   39
prop.table(table(earn$Target)) # High Imbalance 
## 
##       No      Yes 
## 0.968523 0.031477
# Training Data 
table(train_com$Target)
## 
##  No Yes 
## 843  31
prop.table(table(train_sam$Target)) # further Imbalance 
## 
##      No     Yes 
## 0.85625 0.14375
# Sampled Data 
table(under_com$Target)
## 
##  No Yes 
##  31  31
table(over_com$Target)
## 
##  No Yes 
## 843 843
table(smote_com$Target)
## 
##  No Yes 
## 437 437
# Model 1 - only on training data without any sampling technique 
library(randomForest)
model_random <- randomForest(Target ~ ., 
                          data = train_com,
                          ntree = 500, 
                          mtry = 6, 
                          importance = TRUE,
                          proximity= TRUE)
print(model_random)
## 
## Call:
##  randomForest(formula = Target ~ ., data = train_com, ntree = 500,      mtry = 6, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 3.43%
## Confusion matrix:
##      No Yes class.error
## No  837   6 0.007117438
## Yes  24   7 0.774193548
# Model 2 - only on training data with under Sampling Technique  
model_und_random <- randomForest(Target ~ ., 
                          data = under_com,
                          ntree = 500, 
                          mtry = 6, 
                          importance = TRUE,
                          proximity= TRUE)


# Model 3 - only on training data with over Sampling Technique  
model_ovr_random <- randomForest(Target ~ ., 
                          data = over_com,
                          ntree = 500, 
                          mtry = 6, 
                          importance = TRUE,
                          proximity= TRUE)


# Model 3 - only on training data with SMOTE Technique  
model_smo_random <- randomForest(Target ~ ., 
                          data = smote_com,
                          ntree = 500, 
                          mtry = 6, 
                          importance = TRUE,
                          proximity= TRUE)


# Checking which model performs best on Training Data 
print(model_random)     # Rank 4 
## 
## Call:
##  randomForest(formula = Target ~ ., data = train_com, ntree = 500,      mtry = 6, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 3.43%
## Confusion matrix:
##      No Yes class.error
## No  837   6 0.007117438
## Yes  24   7 0.774193548
print(model_und_random) # Rank 3 
## 
## Call:
##  randomForest(formula = Target ~ ., data = under_com, ntree = 500,      mtry = 6, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 19.35%
## Confusion matrix:
##     No Yes class.error
## No  25   6   0.1935484
## Yes  6  25   0.1935484
print(model_ovr_random) # RAnk 1 
## 
## Call:
##  randomForest(formula = Target ~ ., data = over_com, ntree = 500,      mtry = 6, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 0.36%
## Confusion matrix:
##      No Yes class.error
## No  837   6 0.007117438
## Yes   0 843 0.000000000
print(model_smo_random) # RAnk 2
## 
## Call:
##  randomForest(formula = Target ~ ., data = smote_com, ntree = 500,      mtry = 6, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 5.72%
## Confusion matrix:
##      No Yes class.error
## No  401  36  0.08237986
## Yes  14 423  0.03203661
# Checking which model performs best on Testing data 

# Model 1 
Pred1 <- predict(model_random, test_com)
confusionMatrix(Pred1,test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  354   5
##        Yes   3   3
##                                           
##                Accuracy : 0.9781          
##                  95% CI : (0.9573, 0.9905)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 0.5926          
##                                           
##                   Kappa : 0.4176          
##                                           
##  Mcnemar's Test P-Value : 0.7237          
##                                           
##             Sensitivity : 0.375000        
##             Specificity : 0.991597        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.986072        
##              Prevalence : 0.021918        
##          Detection Rate : 0.008219        
##    Detection Prevalence : 0.016438        
##       Balanced Accuracy : 0.683298        
##                                           
##        'Positive' Class : Yes             
## 
# Model 2 - Under Sampling 
Pred2 <- predict(model_und_random, test_com)
confusionMatrix(Pred2,test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  258   1
##        Yes  99   7
##                                           
##                Accuracy : 0.726           
##                  95% CI : (0.6772, 0.7712)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0855          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.87500         
##             Specificity : 0.72269         
##          Pos Pred Value : 0.06604         
##          Neg Pred Value : 0.99614         
##              Prevalence : 0.02192         
##          Detection Rate : 0.01918         
##    Detection Prevalence : 0.29041         
##       Balanced Accuracy : 0.79884         
##                                           
##        'Positive' Class : Yes             
## 
# Model 3 - Over Sampling 
Pred3 <- predict(model_ovr_random, test_com)
confusionMatrix(Pred3,test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  356   7
##        Yes   1   1
##                                           
##                Accuracy : 0.9781          
##                  95% CI : (0.9573, 0.9905)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 0.5926          
##                                           
##                   Kappa : 0.1929          
##                                           
##  Mcnemar's Test P-Value : 0.0771          
##                                           
##             Sensitivity : 0.125000        
##             Specificity : 0.997199        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.980716        
##              Prevalence : 0.021918        
##          Detection Rate : 0.002740        
##    Detection Prevalence : 0.005479        
##       Balanced Accuracy : 0.561099        
##                                           
##        'Positive' Class : Yes             
## 
# Model 4 - SMOTE 
Pred4 <- predict(model_smo_random, test_com)
confusionMatrix(Pred4,test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  332   5
##        Yes  25   3
##                                           
##                Accuracy : 0.9178          
##                  95% CI : (0.8847, 0.9439)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 1.0000000       
##                                           
##                   Kappa : 0.1373          
##                                           
##  Mcnemar's Test P-Value : 0.0005226       
##                                           
##             Sensitivity : 0.375000        
##             Specificity : 0.929972        
##          Pos Pred Value : 0.107143        
##          Neg Pred Value : 0.985163        
##              Prevalence : 0.021918        
##          Detection Rate : 0.008219        
##    Detection Prevalence : 0.076712        
##       Balanced Accuracy : 0.652486        
##                                           
##        'Positive' Class : Yes             
## 
# Looks like undersampling  technique, we are getting the best result
# where RECALL is 87.5%, FPR - 27.8% and Acc - 72.6%

# To check important variables
importance(model_und_random)        
##              No        Yes MeanDecreaseAccuracy MeanDecreaseGini
## DSRI 19.9859671 10.4927153           18.6137469         8.824965
## GMI   0.8699998 -0.3952660           -0.0717469         1.872862
## AQI  -4.2014921 -0.4546766           -2.4574979         1.281575
## SGI   9.5350293  7.7717381           11.1020745         4.872929
## DEPI -2.5088381 -5.0408946           -5.1777373         1.188867
## SGAI  3.7902269 -1.3646565            1.5398050         1.391985
## ACCR 16.8677842 13.6406390           19.6315002         9.485831
## LEVI  3.9216605 -0.3446201            2.2470759         1.545760
varImpPlot(model_und_random)  

# Top 4 Variable Importance
varImpPlot(model_und_random,
           sort = T,
           n.var = 4,
           main = "Top 4 Important Variables by Random Forest")

Part(i) - Emsemble Techniques

Adaboost

library("adabag")

adaboost <- boosting(Target ~ ., 
                            data = under_com, 
                            mfinal = 100, 
                            control = rpart.control(maxdepth = 1))  

summary(adaboost)
##            Length Class   Mode     
## formula      3    formula call     
## trees      100    -none-  list     
## weights    100    -none-  numeric  
## votes      124    -none-  numeric  
## prob       124    -none-  numeric  
## class       62    -none-  character
## importance   8    -none-  numeric  
## terms        3    terms   call     
## call         5    -none-  call
pred_boost = predict.boosting(adaboost, newdata = as.data.frame(test_com))
summary(pred_boost)
##           Length Class   Mode     
## formula     3    formula call     
## votes     730    -none-  numeric  
## prob      730    -none-  numeric  
## class     365    -none-  character
## confusion   4    table   numeric  
## error       1    -none-  numeric
print("Confusion Matrix using Adaboost")
## [1] "Confusion Matrix using Adaboost"
print(pred_boost$confusion)
##                Observed Class
## Predicted Class  No Yes
##             No  282   2
##             Yes  75   6
test_acc = 1-(pred_boost$error)
test_acc # Accuracy of 80.82% 
## [1] 0.7890411
# TPR is 75%% 
# FPR is 19.04%

# XGBOOST 

control = trainControl(
  method = 'cv',
  number = 10,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

xgb <- train(Target ~ .,
             data = train_com,
             method = "xgbTree",
             metric = "Spec",
             trControl = control)

confusionMatrix(predict(xgb, test_com), test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  356   7
##        Yes   1   1
##                                           
##                Accuracy : 0.9781          
##                  95% CI : (0.9573, 0.9905)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 0.5926          
##                                           
##                   Kappa : 0.1929          
##                                           
##  Mcnemar's Test P-Value : 0.0771          
##                                           
##             Sensitivity : 0.125000        
##             Specificity : 0.997199        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.980716        
##              Prevalence : 0.021918        
##          Detection Rate : 0.002740        
##    Detection Prevalence : 0.005479        
##       Balanced Accuracy : 0.561099        
##                                           
##        'Positive' Class : Yes             
## 

Cross Validation with SMOTE Technique

library(rpart.plot)
table(smote_com$Target)
## 
##  No Yes 
## 437 437
folds <- createFolds(y=smote_com$Target,k=10,list = TRUE,returnTrain = TRUE)
lapply(folds,length)
## $Fold01
## [1] 787
## 
## $Fold02
## [1] 786
## 
## $Fold03
## [1] 787
## 
## $Fold04
## [1] 786
## 
## $Fold05
## [1] 787
## 
## $Fold06
## [1] 787
## 
## $Fold07
## [1] 786
## 
## $Fold08
## [1] 787
## 
## $Fold09
## [1] 787
## 
## $Fold10
## [1] 786
fold1<-folds[[1]]
train1<-smote_com[fold1,]
dim(train1)
## [1] 787   9
table(train1$Target)
## 
##  No Yes 
## 393 394
rpart_tree1 <- rpart(Target~., 
                     data=train1, 
                     method="class",
                     parms = list(split = "gini"))

prp(rpart_tree1,extra=4)

predicted <- predict(rpart_tree1,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  300   2
##        Yes  57   6
##                                           
##                Accuracy : 0.8384          
##                  95% CI : (0.7965, 0.8746)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1354          
##                                           
##  Mcnemar's Test P-Value : 2.062e-12       
##                                           
##             Sensitivity : 0.75000         
##             Specificity : 0.84034         
##          Pos Pred Value : 0.09524         
##          Neg Pred Value : 0.99338         
##              Prevalence : 0.02192         
##          Detection Rate : 0.01644         
##    Detection Prevalence : 0.17260         
##       Balanced Accuracy : 0.79517         
##                                           
##        'Positive' Class : Yes             
## 
# RECALL - 62.5%, FPR - 14.3% and ACC - 85.21%

# Cross validation with Random Forest 

modelcv <- train(Target~., data=train_com,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10))

pred<- predict(modelcv, newdata = test_com)
confusionMatrix(pred,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  357   8
##        Yes   0   0
##                                           
##                Accuracy : 0.9781          
##                  95% CI : (0.9573, 0.9905)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 0.59255         
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.01333         
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.97808         
##              Prevalence : 0.02192         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : Yes             
## 
# This model does not perform well as RECALL is 37.5%

# Cross validation with Logistic Regression(After Balancing the data)

modelglm <- train(Target~., data=smote_com,
  method = "glm",
  trControl = trainControl(method = "cv", number = 10))

pred<- predict(modelglm, newdata = test_com)
confusionMatrix(pred,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  307   1
##        Yes  50   7
##                                           
##                Accuracy : 0.8603          
##                  95% CI : (0.8204, 0.8942)
##     No Information Rate : 0.9781          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.184           
##                                           
##  Mcnemar's Test P-Value : 1.801e-11       
##                                           
##             Sensitivity : 0.87500         
##             Specificity : 0.85994         
##          Pos Pred Value : 0.12281         
##          Neg Pred Value : 0.99675         
##              Prevalence : 0.02192         
##          Detection Rate : 0.01918         
##    Detection Prevalence : 0.15616         
##       Balanced Accuracy : 0.86747         
##                                           
##        'Positive' Class : Yes             
## 
# RECALL - 87.5%, FPR - 14.1%, ACC - 86% 

# This looks like a decent result which can be deployed 

Trying neural Network and SVM learnt in class on Earning Manipulation

# Neural network model

num_cols <- unlist(lapply(earn, is.numeric))  
num_cols
##   DSRI    GMI    AQI    SGI   DEPI   SGAI   ACCR   LEVI Target 
##   TRUE   TRUE   TRUE   TRUE   TRUE   TRUE   TRUE   TRUE  FALSE
Num <- earn[, num_cols]
str(Num)
## tibble [1,239 × 8] (S3: tbl_df/tbl/data.frame)
##  $ DSRI: num [1:1239] 1.62 1 1 1.49 1 ...
##  $ GMI : num [1:1239] 1.13 1.61 1.02 1 1.37 ...
##  $ AQI : num [1:1239] 7.185 1.005 1.241 0.466 0.637 ...
##  $ SGI : num [1:1239] 0.366 13.081 1.475 0.673 0.861 ...
##  $ DEPI: num [1:1239] 1.38 0.4 1.17 2 1.45 ...
##  $ SGAI: num [1:1239] 1.6241 5.1982 0.6477 0.0929 1.7415 ...
##  $ ACCR: num [1:1239] -0.1668 0.0605 0.0367 0.2734 0.123 ...
##  $ LEVI: num [1:1239] 1.161 0.987 1.264 0.681 0.939 ...
# Create vector of column Max and Min values
maxs <- apply(Num, 2, max) 
mins <- apply(Num, 2, min)

# Use scale() and convert the resulting matrix to a data frame
scaled.data = as.data.frame(scale(Num, center = mins, scale = maxs - mins))
head(scaled.data)
##         DSRI       GMI       AQI        SGI       DEPI        SGAI      ACCR
## 1 0.04476959 0.3261186 0.4671749 0.02593324 0.24651346 0.032942925 0.7254232
## 2 0.02755490 0.3332169 0.3951229 1.00000000 0.06219235 0.105436499 0.7808125
## 3 0.02755490 0.3244343 0.3978791 0.11087489 0.20667035 0.013136867 0.7750261
## 4 0.04095316 0.3242023 0.3888336 0.04942290 0.36265903 0.001884109 0.8327107
## 5 0.02755490 0.3296875 0.3908340 0.06386372 0.26025165 0.035322449 0.7960614
## 6 0.02495184 0.3295668 0.3925464 0.13525910 0.22711927 0.010248312 0.7793910
##         LEVI
## 1 0.08891346
## 2 0.07556213
## 3 0.09681811
## 4 0.05214779
## 5 0.07191048
## 6 0.11817055
summary(scaled.data)
##       DSRI              GMI              AQI              SGI         
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.02455   1st Qu.:0.3231   1st Qu.:0.3924   1st Qu.:0.07220  
##  Median :0.02818   Median :0.3242   Median :0.3951   Median :0.08130  
##  Mean   :0.03221   Mean   :0.3240   Mean   :0.3950   Mean   :0.08422  
##  3rd Qu.:0.03286   3rd Qu.:0.3251   3rd Qu.:0.3976   3rd Qu.:0.08980  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##       DEPI             SGAI              ACCR             LEVI        
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1630   1st Qu.:0.01823   1st Qu.:0.7475   1st Qu.:0.07070  
##  Median :0.1752   Median :0.02028   Median :0.7589   Median :0.07758  
##  Mean   :0.1824   Mean   :0.02246   Mean   :0.7582   Mean   :0.08095  
##  3rd Qu.:0.1901   3rd Qu.:0.02292   3rd Qu.:0.7716   3rd Qu.:0.08543  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000
data <- data.frame(scaled.data, earn[!num_cols])
str(data)
## 'data.frame':    1239 obs. of  9 variables:
##  $ DSRI  : num  0.0448 0.0276 0.0276 0.041 0.0276 ...
##  $ GMI   : num  0.326 0.333 0.324 0.324 0.33 ...
##  $ AQI   : num  0.467 0.395 0.398 0.389 0.391 ...
##  $ SGI   : num  0.0259 1 0.1109 0.0494 0.0639 ...
##  $ DEPI  : num  0.2465 0.0622 0.2067 0.3627 0.2603 ...
##  $ SGAI  : num  0.03294 0.10544 0.01314 0.00188 0.03532 ...
##  $ ACCR  : num  0.725 0.781 0.775 0.833 0.796 ...
##  $ LEVI  : num  0.0889 0.0756 0.0968 0.0521 0.0719 ...
##  $ Target: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
table(data$Target)
## 
##   No  Yes 
## 1200   39
set.seed(123)
indx <- sample(2, nrow(data), replace = T, prob = c(0.7, 0.3))
train <- data[indx == 1, ]
test <- data[indx == 2, ]

table(train$Target)
## 
##  No Yes 
## 849  25
table(test$Target)
## 
##  No Yes 
## 351  14
library(nnet)
nn  <- nnet(Target ~ ., data = train, linout=F, size=10, decay=0.01)
## # weights:  101
## initial  value 829.860175 
## iter  10 value 113.830215
## iter  20 value 103.753055
## iter  30 value 92.992084
## iter  40 value 88.238549
## iter  50 value 85.258894
## iter  60 value 84.352992
## iter  70 value 84.013274
## iter  80 value 83.739899
## iter  90 value 83.635222
## iter 100 value 83.579918
## final  value 83.579918 
## stopped after 100 iterations
#Neural network model out put
summary(nn) #Summary gives you the number units in each layer in addition to all the weights
## a 8-10-1 network with 101 weights
## options were - entropy fitting  decay=0.01
##  b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 
##  -1.33  -0.08  -0.50  -0.64  -0.23  -0.24  -0.09  -1.12  -0.09 
##  b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 
##   3.92  -4.47  -4.84  -3.22   0.16   3.83  -2.87  -3.01   0.00 
##  b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 
##   1.26  -0.38  -1.13  -1.12  -0.41   0.94  -1.08  -1.33   0.05 
##  b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 
##   6.39  -4.03  -2.59  -2.80  -8.16  -9.22  -2.56  -3.34   0.75 
##  b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5 
##  -0.57  -0.14  -0.51  -0.59  -0.27   0.07  -0.25  -0.83  -0.10 
##  b->h6 i1->h6 i2->h6 i3->h6 i4->h6 i5->h6 i6->h6 i7->h6 i8->h6 
##   0.25  -0.44  -0.89  -0.79  -0.29   0.45  -0.68  -0.96  -0.01 
##  b->h7 i1->h7 i2->h7 i3->h7 i4->h7 i5->h7 i6->h7 i7->h7 i8->h7 
##  -1.53   0.44   1.36   1.17   0.27  -1.05   1.26   1.50  -0.06 
##  b->h8 i1->h8 i2->h8 i3->h8 i4->h8 i5->h8 i6->h8 i7->h8 i8->h8 
##  -2.70   0.66   2.12   1.72   0.44  -1.52   2.04   2.25  -0.17 
##  b->h9 i1->h9 i2->h9 i3->h9 i4->h9 i5->h9 i6->h9 i7->h9 i8->h9 
##  -1.68  -0.04  -0.54  -0.69  -0.23  -0.28  -0.02  -1.28  -0.09 
##  b->h10 i1->h10 i2->h10 i3->h10 i4->h10 i5->h10 i6->h10 i7->h10 i8->h10 
##    2.40   -0.82   -2.60   -1.57   -0.30    1.34   -1.85   -1.88    0.04 
##   b->o  h1->o  h2->o  h3->o  h4->o  h5->o  h6->o  h7->o  h8->o  h9->o h10->o 
##   3.12  -0.74  -9.81  -2.93 -13.34  -1.38  -2.14   3.72   5.21  -0.74  -5.21
#To plot the neural network using nnet we need to use devtools
library(devtools)
source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')
plot.nnet(nn)

# The darker lines are associated to the higher weights and gray lines are for small weights

# Important Variables with Neural Network are - ACCR, SGAI, DEPI, LEVI

#SVM 

library(e1071)
classifier = svm(formula = Target ~ .,
                 data = train_com,
                 type = 'C-classification',
                 kernel = 'radial')

y_pred = predict(classifier, newdata = test_com)

cm = table(test_com$Target, y_pred)
cm
##      y_pred
##        No Yes
##   No  357   0
##   Yes   8   0
# SVM also is not yielding good results as FPR is very bad for this model