library(readxl)
earn <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Complete Data")
sam <- read_excel("IMB579-XLS-ENG.xlsx",
sheet = "Sample for Model Development")
# Backup of the data
e<- earn
s<- sam
dim(earn)
## [1] 1239 11
dim(sam)
## [1] 220 11
# removing variables which are not required for analysis
earn$`Company ID`<- NULL
earn$`C-MANIPULATOR`<- NULL
sam$`Company ID`<- NULL
sam$`C-MANIPULATOR`<- NULL
# changing name of the target variable
names(earn)[9] <- "Target"
names(sam)[9] <- "Target"
str(earn)
## tibble [1,239 × 9] (S3: tbl_df/tbl/data.frame)
## $ DSRI : num [1:1239] 1.62 1 1 1.49 1 ...
## $ GMI : num [1:1239] 1.13 1.61 1.02 1 1.37 ...
## $ AQI : num [1:1239] 7.185 1.005 1.241 0.466 0.637 ...
## $ SGI : num [1:1239] 0.366 13.081 1.475 0.673 0.861 ...
## $ DEPI : num [1:1239] 1.38 0.4 1.17 2 1.45 ...
## $ SGAI : num [1:1239] 1.6241 5.1982 0.6477 0.0929 1.7415 ...
## $ ACCR : num [1:1239] -0.1668 0.0605 0.0367 0.2734 0.123 ...
## $ LEVI : num [1:1239] 1.161 0.987 1.264 0.681 0.939 ...
## $ Target: chr [1:1239] "Yes" "Yes" "Yes" "Yes" ...
str(sam)
## tibble [220 × 9] (S3: tbl_df/tbl/data.frame)
## $ DSRI : num [1:220] 1.62 1 1 1.49 1 ...
## $ GMI : num [1:220] 1.13 1.61 1.02 1 1.37 ...
## $ AQI : num [1:220] 7.185 1.005 1.241 0.466 0.637 ...
## $ SGI : num [1:220] 0.366 13.081 1.475 0.673 0.861 ...
## $ DEPI : num [1:220] 1.38 0.4 1.17 2 1.45 ...
## $ SGAI : num [1:220] 1.6241 5.1982 0.6477 0.0929 1.7415 ...
## $ ACCR : num [1:220] -0.1668 0.0605 0.0367 0.2734 0.123 ...
## $ LEVI : num [1:220] 1.161 0.987 1.264 0.681 0.939 ...
## $ Target: chr [1:220] "Yes" "Yes" "Yes" "Yes" ...
# Changing target variable to Factor
earn$Target <- factor(earn$Target)
sam$Target <- factor(sam$Target)
# Checking Missing values in the data
colSums(is.na(earn))
## DSRI GMI AQI SGI DEPI SGAI ACCR LEVI Target
## 0 0 0 0 0 0 0 0 0
colSums(is.na(sam))
## DSRI GMI AQI SGI DEPI SGAI ACCR LEVI Target
## 0 0 0 0 0 0 0 0 0
# This shows that there are no missing data in our Dataset, Good Start!
library(Hmisc)
describe(earn$Target)
## earn$Target
## n missing distinct
## 1239 0 2
##
## Value No Yes
## Frequency 1200 39
## Proportion 0.969 0.031
describe(sam$Target)
## sam$Target
## n missing distinct
## 220 0 2
##
## Value No Yes
## Frequency 181 39
## Proportion 0.823 0.177
# Distribution of the Majority and Minority Class
table(earn$Target)
##
## No Yes
## 1200 39
table(sam$Target) # this shows that the sample data also contains the same no of minority class as that Complete data
##
## No Yes
## 181 39
# Let us see the data graphically
ggplot(data = earn, mapping = aes(x =Target)) +
geom_bar(color="black",fill="orange")+
ggtitle("How is the target variable distributed")+
xlab("No->Non Manipulators 1->Manipulators")+
theme_bw()
# Insight: the Target Variable is highly unbalanced as the data is split in 1200:39 ratio and 181:39 in complete and sample data
library(UBL)
# Complete Data
# let us split the data into Training and Testing data
set.seed(1234)
index <- sample(1:2, nrow(earn), replace = TRUE, prob = c(0.7,0.3))
train_com <- earn[index == 1, ]
test_com <- earn[index == 2, ]
# sampling the data from 3 fronts
table(train_com$Target)
##
## No Yes
## 843 31
smote_com <- SmoteClassif(Target~.,as.data.frame(train_com), "balance")
table(smote_com$Target)
##
## No Yes
## 437 437
prop.table(table(smote_com$Target))
##
## No Yes
## 0.5 0.5
under_com <- RandUnderClassif(Target~.,as.data.frame(train_com), "balance")
table(under_com$Target)
##
## No Yes
## 31 31
prop.table(table(under_com$Target))
##
## No Yes
## 0.5 0.5
over_com <- RandOverClassif(Target~.,as.data.frame(train_com), "balance")
table(over_com$Target)
##
## No Yes
## 843 843
prop.table(table(over_com$Target))
##
## No Yes
## 0.5 0.5
# Sample Data
# let us split the data into Training and Testing data
set.seed(9999)
index1 <- sample(1:2, nrow(sam), replace = TRUE, prob = c(0.7,0.3))
train_sam <- sam[index1 == 1, ]
test_sam <- sam[index1 == 2, ]
# sampling the data from three fronts
table(train_sam$Target)
##
## No Yes
## 137 23
smote_sam <- SmoteClassif(Target~.,as.data.frame(train_sam), "balance")
table(smote_sam$Target)
##
## No Yes
## 80 79
prop.table(table(smote_sam$Target))
##
## No Yes
## 0.5031447 0.4968553
under_sam <- RandUnderClassif(Target~.,as.data.frame(train_sam), "balance")
table(under_sam$Target)
##
## No Yes
## 23 23
prop.table(table(under_sam$Target))
##
## No Yes
## 0.5 0.5
over_sam <- RandOverClassif(Target~.,as.data.frame(train_sam), "balance")
table(over_sam$Target)
##
## No Yes
## 137 137
prop.table(table(over_sam$Target))
##
## No Yes
## 0.5 0.5
m <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Manipulator")
non_m <- read_excel("IMB579-XLS-ENG.xlsx",sheet = "Non-Manipulator")
# MANIPULATORS
dim(m)
## [1] 39 11
library(dplyr)
beneish_m <-m %>%
mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI - 0.172*SGAI + 4.679*ACCR - 0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)
# let us see the results visually
hist(beneish_m$m_score, # Save histogram as object
breaks = 30,
freq = T,
col = "thistle1", # Or use: col = colors() [626]
main = "Histogram for M-Score of Manipulators")
c<- count(beneish_m[-beneish_m$m_score<=-1.78,])
print(c)
## # A tibble: 1 x 1
## n
## <int>
## 1 7
# thus Beneish model could correctly identify 7 manipulator out of 39
Accuracy_predicting_manipulator <- c/nrow(m)
print(Accuracy_predicting_manipulator) # thus, we could predict manipulators with an accuracy of 17.94%
## n
## 1 0.1794872
# NON-MANIPULATORS
dim(non_m)
## [1] 1200 11
beneish_non_m <-non_m %>%
mutate(m_score= -4.84 + 0.92*DSRI + 0.528*GMI + 0.404*AQI + 0.892*SGI + 0.115*DEPI - 0.172*SGAI + 4.679*ACCR - 0.327*LEVI,probability_of_manipulation=pnorm(m_score)*100)
# let us see the results visually
hist(beneish_non_m$m_score, # Save histogram as object
breaks = 30,
freq = T,
col = "thistle1", # Or use: col = colors() [626]
main = "Histogram for M-Score of Non-Manipulators")
c<- count(beneish_non_m[beneish_non_m$m_score<=-1.78,])
print(c)
## # A tibble: 1 x 1
## n
## <int>
## 1 1029
# thus Beneish model could correctly identify 1029 non-manipulator out of 1200
Accuracy_predicting_non_manipulator <- c/nrow(non_m)
print(Accuracy_predicting_non_manipulator) # thus, we could predict non-manipulators with an accuracy of 85.75%
## n
## 1 0.8575
# Insight : Since we are more interested in correctly finding firms who are doing manipulation, thus an accuracy of 17.94% would be very less for a good model THUS, BENEISH MODEL WOULD NOT BE A SUITABLE MODEL FOR PREDICTING EARNING MANIPULATIONS IN INDIA
The number of manipulators is usually much less than non-manipulators (in the accompanying spreadsheet, the percentage of manipulators is less than 4% in the complete data).
The type of modeling we can do here is as below:
Other comments: a) We can handle unbalanced data with bootstraping as this emsemble technique will help in reassigning weights to the unalanced classes b) we can also do Cross Validation like 10-fold to reassign and get better models which can predict who is a manipulator
How can one handle unbalanced problems?
We can handle unbalanced problems with various sampling techniques like a) Undersampling b) Oversampling c) SMOTE
# undersampling has been applied under the Class Imbalance section
# Let us check the sampling and distribution
# original Sample Data
table(sam$Target)
##
## No Yes
## 181 39
prop.table(table(sam$Target)) # High Imbalance
##
## No Yes
## 0.8227273 0.1772727
# Training Data
table(train_sam$Target)
##
## No Yes
## 137 23
prop.table(table(train_sam$Target)) # further Imbalance
##
## No Yes
## 0.85625 0.14375
# Undersampling Data
table(under_sam$Target)
##
## No Yes
## 23 23
prop.table(table(under_sam$Target)) # finally the data is balanced
##
## No Yes
## 0.5 0.5
# Insight : Undersampling is not an advisable technique as we lose alot of valuable information which was stored in the original dataset, we can further investigate this insight
# Let us try to do implement stepwise logistic regression
full <- glm(Target~., data=under_sam,family = "binomial")
null <- glm(Target~1, data=under_sam,family = "binomial")
# Forward selection
mod_und<- step(full,scope=list(lower=null,upper=full),
direction = "both")
## Start: AIC=45.4
## Target ~ DSRI + GMI + AQI + SGI + DEPI + SGAI + ACCR + LEVI
##
## Df Deviance AIC
## - GMI 1 27.523 43.523
## - DEPI 1 27.683 43.683
## - LEVI 1 27.789 43.789
## - SGAI 1 28.563 44.563
## <none> 27.399 45.399
## - SGI 1 32.963 48.963
## - DSRI 1 34.286 50.286
## - AQI 1 38.126 54.126
## - ACCR 1 47.305 63.305
##
## Step: AIC=43.52
## Target ~ DSRI + AQI + SGI + DEPI + SGAI + ACCR + LEVI
##
## Df Deviance AIC
## - LEVI 1 27.834 41.834
## - DEPI 1 27.881 41.881
## - SGAI 1 28.658 42.658
## <none> 27.523 43.523
## + GMI 1 27.399 45.399
## - SGI 1 34.885 48.885
## - DSRI 1 35.129 49.129
## - AQI 1 39.447 53.447
## - ACCR 1 47.342 61.342
##
## Step: AIC=41.83
## Target ~ DSRI + AQI + SGI + DEPI + SGAI + ACCR
##
## Df Deviance AIC
## - DEPI 1 28.155 40.155
## - SGAI 1 28.674 40.674
## <none> 27.834 41.834
## + LEVI 1 27.523 43.523
## + GMI 1 27.789 43.789
## - DSRI 1 35.130 47.130
## - SGI 1 35.188 47.188
## - AQI 1 42.860 54.860
## - ACCR 1 48.217 60.217
##
## Step: AIC=40.15
## Target ~ DSRI + AQI + SGI + SGAI + ACCR
##
## Df Deviance AIC
## - SGAI 1 28.699 38.699
## <none> 28.155 40.155
## + DEPI 1 27.834 41.834
## + LEVI 1 27.881 41.881
## + GMI 1 28.065 42.065
## - DSRI 1 35.215 45.215
## - SGI 1 37.693 47.693
## - AQI 1 43.475 53.475
## - ACCR 1 50.649 60.649
##
## Step: AIC=38.7
## Target ~ DSRI + AQI + SGI + ACCR
##
## Df Deviance AIC
## <none> 28.699 38.699
## + SGAI 1 28.155 40.155
## + GMI 1 28.614 40.614
## + DEPI 1 28.674 40.674
## + LEVI 1 28.675 40.675
## - SGI 1 37.846 45.846
## - DSRI 1 39.748 47.748
## - AQI 1 45.110 53.110
## - ACCR 1 52.094 60.094
summary(mod_und)
##
## Call:
## glm(formula = Target ~ DSRI + AQI + SGI + ACCR, family = "binomial",
## data = under_sam)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.95491 -0.41745 -0.05036 0.34078 2.25001
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.3724 2.7114 -2.350 0.01876 *
## DSRI 0.7267 0.5750 1.264 0.20633
## AQI 0.3871 0.1724 2.246 0.02469 *
## SGI 3.5195 1.7354 2.028 0.04255 *
## ACCR 19.4569 7.3948 2.631 0.00851 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 63.770 on 45 degrees of freedom
## Residual deviance: 28.699 on 41 degrees of freedom
## AIC: 38.699
##
## Number of Fisher Scoring iterations: 8
# Important Variables for Undersampling model
# AQI
# SGI
# ACCR - Most Important
# let us tidy the model
library (devtools)
library (broom)
tidy(mod_und)
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -6.37 2.71 -2.35 0.0188
## 2 DSRI 0.727 0.575 1.26 0.206
## 3 AQI 0.387 0.172 2.25 0.0247
## 4 SGI 3.52 1.74 2.03 0.0425
## 5 ACCR 19.5 7.39 2.63 0.00851
# Checking Odds Ratio
exp(coef(mod_und))
## (Intercept) DSRI AQI SGI ACCR
## 1.708097e-03 2.068192e+00 1.472774e+00 3.376802e+01 2.818513e+08
exp(cbind(OR = coef(mod_und), confint(mod_und)))
## OR 2.5 % 97.5 %
## (Intercept) 1.708097e-03 1.358528e-06 9.701667e-02
## DSRI 2.068192e+00 1.159864e+00 9.331341e+00
## AQI 1.472774e+00 1.136226e+00 2.317320e+00
## SGI 3.376802e+01 2.455897e+00 2.832386e+03
## ACCR 2.818513e+08 4.351163e+03 3.333050e+16
# with every one unit increase in AQI, the odds of being a Manipulator increase by 1.47
# with every one unit increase in SGI, the odds of being a Manipulator increase by 33.76
# with every one unit increase in ACCR, the odds of being a Manipulator increase by 281851324
# Let us do some prediction
predicted <- predict(mod_und, newdata=test_sam, type="response")
# let us calculate the probabilties for Manipulator CLass
probs <- exp(predicted)/(1+exp(predicted)) # this is the probability of 1 class - meaning Manipulators
print(probs)
## 1 2 3 4 5 6 7 8
## 0.5031334 0.6631134 0.7061923 0.7295942 0.5301354 0.7273306 0.5970467 0.6536270
## 9 10 11 12 13 14 15 16
## 0.7250499 0.6734699 0.7220515 0.7310586 0.5009815 0.7310578 0.5000020 0.7242812
## 17 18 19 20 21 22 23 24
## 0.6281579 0.5843709 0.5421352 0.5010349 0.5200838 0.5856523 0.5766283 0.5170665
## 25 26 27 28 29 30 31 32
## 0.5099944 0.5231746 0.5324276 0.5575018 0.5194507 0.5149051 0.5149108 0.5536051
## 33 34 35 36 37 38 39 40
## 0.5145833 0.5033140 0.5437360 0.6968706 0.5328364 0.5933025 0.5769681 0.5421692
## 41 42 43 44 45 46 47 48
## 0.5742079 0.6408721 0.5345717 0.5034593 0.5114125 0.5057671 0.5245171 0.6771183
## 49 50 51 52 53 54 55 56
## 0.5602461 0.5254146 0.6900247 0.6623328 0.5093897 0.5010648 0.5315097 0.5145201
## 57 58 59 60
## 0.6204486 0.5036962 0.5099797 0.5251903
# Insight: After Running the Stepwise logistic regression using undersampling technique, we could see that ACCR, SGI, AQI are the major predictors for finding out who will be the Manipulators
Reference
Prediction No Yes No 38 5 Yes 6 11
‘Positive’ Class : Yes
Hypothesis and Business Logic
Given the above matrix we can interpret it as:
Recall - Our model predicts people as manipulators and they are actually manipulators - Paramter 1 which needs to be best for our model
1 - Specificity - False Positive Rate - Our model predicts people as non - manipulators and they are actually manipulators - Paramter 2 which needs to be best for our model
Let us say that th ebank loans 10,00,000 INR to a person
if the model says that this person can be a manipulator but in actual he is not - FALSE NEGATIVE RATE - the bank may lose a potential interest which they might earn from the person
if the interest is @10% then - the BANK LOSES 1,00,000 - in a very basic terminology
However, if the model says that this person is not a manipulator but in actual he - FALSE Positive RATE - the bank may lose the full loan amount - 10,00,000
Thus, COST of FPR to COST of FNR is 10:1
# Model Evaluation
#run anova for checking the model
anova(mod_und, test = 'Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Target
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 45 63.770
## DSRI 1 2.5330 44 61.237 0.111488
## AQI 1 2.4498 43 58.787 0.117542
## SGI 1 6.6930 42 52.094 0.009679 **
## ACCR 1 23.3944 41 28.699 1.32e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA shows that SGI AND ACCR are the variables which show the most significance
# Recode Predicted to factors
y_pred <- ifelse(predicted>0.5,"Yes","No") # let us recode with cut off of 0.5
table(y_pred,test_sam$Target)
##
## y_pred No Yes
## No 38 5
## Yes 6 11
# Confusion Matrix
library(caret)
confusionMatrix(factor(y_pred),test_sam$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 38 5
## Yes 6 11
##
## Accuracy : 0.8167
## 95% CI : (0.6956, 0.9048)
## No Information Rate : 0.7333
## P-Value [Acc > NIR] : 0.09107
##
## Kappa : 0.5404
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.6875
## Specificity : 0.8636
## Pos Pred Value : 0.6471
## Neg Pred Value : 0.8837
## Prevalence : 0.2667
## Detection Rate : 0.1833
## Detection Prevalence : 0.2833
## Balanced Accuracy : 0.7756
##
## 'Positive' Class : Yes
##
# Let us try different probabilities to check various cut off points
library(caret)
for (i in seq(0.3,0.9,by=0.05))
{
y_pred <- ifelse(predicted>i,"Yes","No")
c<- confusionMatrix(factor(y_pred),test_sam$Target, positive = "Yes")
print(i)
print(c$table)
print(c$byClass[1])
print(c$byClass[2])
}
## [1] 0.3
## Reference
## Prediction No Yes
## No 32 4
## Yes 12 12
## Sensitivity
## 0.75
## Specificity
## 0.7272727
## [1] 0.35
## Reference
## Prediction No Yes
## No 36 4
## Yes 8 12
## Sensitivity
## 0.75
## Specificity
## 0.8181818
## [1] 0.4
## Reference
## Prediction No Yes
## No 37 5
## Yes 7 11
## Sensitivity
## 0.6875
## Specificity
## 0.8409091
## [1] 0.45
## Reference
## Prediction No Yes
## No 37 5
## Yes 7 11
## Sensitivity
## 0.6875
## Specificity
## 0.8409091
## [1] 0.5
## Reference
## Prediction No Yes
## No 38 5
## Yes 6 11
## Sensitivity
## 0.6875
## Specificity
## 0.8636364
## [1] 0.55
## Reference
## Prediction No Yes
## No 39 5
## Yes 5 11
## Sensitivity
## 0.6875
## Specificity
## 0.8863636
## [1] 0.6
## Reference
## Prediction No Yes
## No 40 5
## Yes 4 11
## Sensitivity
## 0.6875
## Specificity
## 0.9090909
## [1] 0.65
## Reference
## Prediction No Yes
## No 40 6
## Yes 4 10
## Sensitivity
## 0.625
## Specificity
## 0.9090909
## [1] 0.7
## Reference
## Prediction No Yes
## No 41 7
## Yes 3 9
## Sensitivity
## 0.5625
## Specificity
## 0.9318182
## [1] 0.75
## Reference
## Prediction No Yes
## No 42 8
## Yes 2 8
## Sensitivity
## 0.5
## Specificity
## 0.9545455
## [1] 0.8
## Reference
## Prediction No Yes
## No 42 8
## Yes 2 8
## Sensitivity
## 0.5
## Specificity
## 0.9545455
## [1] 0.85
## Reference
## Prediction No Yes
## No 44 8
## Yes 0 8
## Sensitivity
## 0.5
## Specificity
## 1
## [1] 0.9
## Reference
## Prediction No Yes
## No 44 9
## Yes 0 7
## Sensitivity
## 0.4375
## Specificity
## 1
# Insight : Cutoff value of 0.6 bears the best result for the present case - BEST RECALL AND MIN False Positive Rate
# ROC Curve with ROCR package
library(ROCR)
pred <- predict(mod_und, test_sam, type="response")
pred1 <- prediction(pred,test_sam$Target)
eval <- performance(pred1, "tpr","fpr")
plot(eval) # ROC Curve
# finding yonden's index best cut-off Probability with formula
opt.cut <- function(ROCrf){
cut.ind <- mapply(FUN = function(x,y,p){yi=(y+(1-x)-1)
ind<- which(yi==max(yi))
c(recall = y[[ind]], specificity = 1-x[[ind]],cutoff = p[[ind]])},ROCrf@x.values, ROCrf@y.values,ROCrf@alpha.values) }
print(opt.cut(eval)) # thus, the optimal cut off from Youden's Index is 0.635
## [,1]
## recall 0.6875000
## specificity 0.9090909
## cutoff 0.6350208
# Cost Calculation
cost.perf = performance(pred1, "cost",cost.fn=1,cost.fp=10)
pred1@cutoffs[[1]] [which.min(cost.perf@y.values[[1]])]
## 3
## 0.8769623
# thus, the optimal cut off point comes out to be 0.8769
# Confusion Matrix with Youden's index and Cost Cutoff
# Youden's - 0.635
y_pred_youden <- ifelse(predicted>0.635,"Yes","No")
confusionMatrix(factor(y_pred_youden),test_sam$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 40 5
## Yes 4 11
##
## Accuracy : 0.85
## 95% CI : (0.7343, 0.929)
## No Information Rate : 0.7333
## P-Value [Acc > NIR] : 0.02411
##
## Kappa : 0.6087
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.6875
## Specificity : 0.9091
## Pos Pred Value : 0.7333
## Neg Pred Value : 0.8889
## Prevalence : 0.2667
## Detection Rate : 0.1833
## Detection Prevalence : 0.2500
## Balanced Accuracy : 0.7983
##
## 'Positive' Class : Yes
##
# this is good, we have further reduced our False Positive Rate
# Cost Parameter - 0.8769
y_pred <- ifelse(predicted>0.8769,"Yes","No")
confusionMatrix(factor(y_pred),test_sam$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 44 8
## Yes 0 8
##
## Accuracy : 0.8667
## 95% CI : (0.7541, 0.9406)
## No Information Rate : 0.7333
## P-Value [Acc > NIR] : 0.01050
##
## Kappa : 0.5946
##
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 0.5000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8462
## Prevalence : 0.2667
## Detection Rate : 0.1333
## Detection Prevalence : 0.1333
## Balanced Accuracy : 0.7500
##
## 'Positive' Class : Yes
##
# we have reduced our FPR to 0 however, the RECALL now is 50%
# thus, we would choose Youden's Index as the cut-off for our model
# original Complete Data
table(earn$Target)
##
## No Yes
## 1200 39
prop.table(table(earn$Target)) # High Imbalance
##
## No Yes
## 0.968523 0.031477
# Training Data
table(train_com$Target)
##
## No Yes
## 843 31
prop.table(table(train_sam$Target)) # further Imbalance
##
## No Yes
## 0.85625 0.14375
# Testing Data
table(test_com$Target)
##
## No Yes
## 357 8
prop.table(table(test_sam$Target)) # further Imbalance
##
## No Yes
## 0.7333333 0.2666667
# Sampled Data
table(under_com$Target)
##
## No Yes
## 31 31
table(over_com$Target)
##
## No Yes
## 843 843
table(smote_com$Target)
##
## No Yes
## 437 437
# Model 1 - Without sampling
library(rpart)
rpart_tree1 <- rpart(Target~.,
data=train_com,
method="class",
parms = list(split = "gini"),
control = rpart.control(mincriterion = 0.95,
maxdepth = 4))
# Model 2 - Undersampling
rpart_tree2 <- rpart(Target~.,
data=under_com,
method="class",
parms = list(split = "gini"),
control = rpart.control(mincriterion = 0.95,
maxdepth = 4))
# Model 3 - Oversampling
rpart_tree3 <- rpart(Target~.,
data=over_com,
method="class",
parms = list(split = "gini"),
control = rpart.control(mincriterion = 0.95,
maxdepth = 4))
# Model 4 - SMOTE
rpart_tree4 <- rpart(Target~.,
data=smote_com,
method="class",
parms = list(split = "gini"),
control = rpart.control(mincriterion = 0.95,
maxdepth = 4))
# Checking which model performs best on Testing data
# Model 1
pred <- predict(mod_und, test_sam, type="response")
pred1 <- prediction(pred,test_sam$Target)
eval <- performance(pred1, "tpr","fpr")
plot(eval) # ROC Curve
predicted <- predict(rpart_tree1,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 352 5
## Yes 5 3
##
## Accuracy : 0.9726
## 95% CI : (0.9502, 0.9868)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 0.8181
##
## Kappa : 0.361
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.375000
## Specificity : 0.985994
## Pos Pred Value : 0.375000
## Neg Pred Value : 0.985994
## Prevalence : 0.021918
## Detection Rate : 0.008219
## Detection Prevalence : 0.021918
## Balanced Accuracy : 0.680497
##
## 'Positive' Class : Yes
##
# Model 2 - Under Sampling
predicted <- predict(rpart_tree2,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 204 1
## Yes 153 7
##
## Accuracy : 0.5781
## 95% CI : (0.5256, 0.6293)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0434
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.87500
## Specificity : 0.57143
## Pos Pred Value : 0.04375
## Neg Pred Value : 0.99512
## Prevalence : 0.02192
## Detection Rate : 0.01918
## Detection Prevalence : 0.43836
## Balanced Accuracy : 0.72321
##
## 'Positive' Class : Yes
##
# Model 3 - Over Sampling
predicted <- predict(rpart_tree3,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 301 4
## Yes 56 4
##
## Accuracy : 0.8356
## 95% CI : (0.7935, 0.8722)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0821
##
## Mcnemar's Test P-Value : 4.577e-11
##
## Sensitivity : 0.50000
## Specificity : 0.84314
## Pos Pred Value : 0.06667
## Neg Pred Value : 0.98689
## Prevalence : 0.02192
## Detection Rate : 0.01096
## Detection Prevalence : 0.16438
## Balanced Accuracy : 0.67157
##
## 'Positive' Class : Yes
##
# Model 4 - SMOTE
predicted <- predict(rpart_tree4,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 278 2
## Yes 79 6
##
## Accuracy : 0.7781
## 95% CI : (0.7319, 0.8197)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0927
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.75000
## Specificity : 0.77871
## Pos Pred Value : 0.07059
## Neg Pred Value : 0.99286
## Prevalence : 0.02192
## Detection Rate : 0.01644
## Detection Prevalence : 0.23288
## Balanced Accuracy : 0.76436
##
## 'Positive' Class : Yes
##
#Insight - the best trade off from the above model is coming from SMOTE
# Where RECALL IS 75% and FPR is 22.2%
# original Complete Data
table(earn$Target)
##
## No Yes
## 1200 39
prop.table(table(earn$Target)) # High Imbalance
##
## No Yes
## 0.968523 0.031477
# Training Data
table(train_com$Target)
##
## No Yes
## 843 31
prop.table(table(train_sam$Target)) # further Imbalance
##
## No Yes
## 0.85625 0.14375
# Testing Data
table(test_com$Target)
##
## No Yes
## 357 8
prop.table(table(test_sam$Target)) # further Imbalance
##
## No Yes
## 0.7333333 0.2666667
# Sampled Data
table(under_com$Target)
##
## No Yes
## 31 31
table(over_com$Target)
##
## No Yes
## 843 843
table(smote_com$Target)
##
## No Yes
## 437 437
# Model 1 - without any sampling
model_logit <- glm(Target~.,data=train_com,family = "binomial")
# Model 2 - Undersampling
model_und_logit <- glm(Target~.,data=under_com,family = "binomial")
# Model 3 - Oversampling
model_ovr_logit <- glm(Target~.,data=over_com,family = "binomial")
# Model 4 - SMOTE
model_smo_logit <- glm(Target~.,data=smote_com,family = "binomial")
# Checking the Summary and performance of the model with AIC
summary(model_logit) # AIC 199
##
## Call:
## glm(formula = Target ~ ., family = "binomial", data = train_com)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0124 -0.2100 -0.1646 -0.1315 3.3716
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.21908 1.08201 -6.672 2.53e-11 ***
## DSRI 0.86708 0.23736 3.653 0.000259 ***
## GMI 0.05289 0.29781 0.178 0.859029
## AQI 0.26773 0.08394 3.190 0.001424 **
## SGI 1.76046 0.40163 4.383 1.17e-05 ***
## DEPI -0.08415 0.60158 -0.140 0.888750
## SGAI 0.48359 0.28799 1.679 0.093119 .
## ACCR 5.57172 1.43447 3.884 0.000103 ***
## LEVI -0.40503 0.29307 -1.382 0.166955
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 267.91 on 873 degrees of freedom
## Residual deviance: 181.31 on 865 degrees of freedom
## AIC: 199.31
##
## Number of Fisher Scoring iterations: 7
summary(model_und_logit) # AIC 47
##
## Call:
## glm(formula = Target ~ ., family = "binomial", data = under_com)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.40565 -0.20856 -0.00468 0.21791 1.64073
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -20.3065 8.5597 -2.372 0.0177 *
## DSRI 8.9638 3.7536 2.388 0.0169 *
## GMI 1.4028 1.4397 0.974 0.3299
## AQI 0.9820 0.3823 2.569 0.0102 *
## SGI 7.2108 3.1904 2.260 0.0238 *
## DEPI -0.5170 2.3183 -0.223 0.8235
## SGAI -0.5144 1.2203 -0.422 0.6734
## ACCR 15.2658 6.1781 2.471 0.0135 *
## LEVI -0.8619 1.2475 -0.691 0.4896
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 85.950 on 61 degrees of freedom
## Residual deviance: 29.429 on 53 degrees of freedom
## AIC: 47.429
##
## Number of Fisher Scoring iterations: 11
summary(model_ovr_logit) # AIC 1290
##
## Call:
## glm(formula = Target ~ ., family = "binomial", data = over_com)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.6714 -0.5478 0.0000 0.6085 1.6515
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.59255 0.55405 -15.509 < 2e-16 ***
## DSRI 2.19141 0.16703 13.119 < 2e-16 ***
## GMI 1.22521 0.16487 7.431 1.08e-13 ***
## AQI 0.66398 0.04882 13.601 < 2e-16 ***
## SGI 3.14291 0.22922 13.712 < 2e-16 ***
## DEPI 0.42734 0.19636 2.176 0.0295 *
## SGAI -0.14083 0.12314 -1.144 0.2528
## ACCR 8.65907 0.61299 14.126 < 2e-16 ***
## LEVI -0.79353 0.14463 -5.487 4.10e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2337.3 on 1685 degrees of freedom
## Residual deviance: 1272.9 on 1677 degrees of freedom
## AIC: 1290.9
##
## Number of Fisher Scoring iterations: 8
summary(model_smo_logit) # AIC 666
##
## Call:
## glm(formula = Target ~ ., family = "binomial", data = smote_com)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2225 -0.5480 -0.0191 0.5567 1.6834
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.03608 0.90787 -9.953 < 2e-16 ***
## DSRI 2.58151 0.29095 8.873 < 2e-16 ***
## GMI 0.83156 0.21210 3.921 8.84e-05 ***
## AQI 0.58472 0.07195 8.127 4.42e-16 ***
## SGI 3.88480 0.39576 9.816 < 2e-16 ***
## DEPI 0.07938 0.27013 0.294 0.76887
## SGAI -0.10741 0.20496 -0.524 0.60023
## ACCR 10.37650 1.04277 9.951 < 2e-16 ***
## LEVI -0.78890 0.26436 -2.984 0.00284 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1211.62 on 873 degrees of freedom
## Residual deviance: 648.81 on 865 degrees of freedom
## AIC: 666.81
##
## Number of Fisher Scoring iterations: 9
# Checking which model performs best on Testing data
# Model 1 - without Sampling
Pred1 <- predict(model_logit, test_com,type = "response")
y_pred <- ifelse(Pred1 > 0.6, "Yes","No")
confusionMatrix(factor(y_pred),test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 356 7
## Yes 1 1
##
## Accuracy : 0.9781
## 95% CI : (0.9573, 0.9905)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 0.5926
##
## Kappa : 0.1929
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 0.125000
## Specificity : 0.997199
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.980716
## Prevalence : 0.021918
## Detection Rate : 0.002740
## Detection Prevalence : 0.005479
## Balanced Accuracy : 0.561099
##
## 'Positive' Class : Yes
##
# Model 2 - Under Sampling
Pred2 <- predict(model_und_logit, test_com,type = "response")
y_pred <- ifelse(Pred2 > 0.6, "Yes","No")
confusionMatrix(factor(y_pred),test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 303 1
## Yes 54 7
##
## Accuracy : 0.8493
## 95% CI : (0.8084, 0.8844)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1708
##
## Mcnemar's Test P-Value : 2.355e-12
##
## Sensitivity : 0.87500
## Specificity : 0.84874
## Pos Pred Value : 0.11475
## Neg Pred Value : 0.99671
## Prevalence : 0.02192
## Detection Rate : 0.01918
## Detection Prevalence : 0.16712
## Balanced Accuracy : 0.86187
##
## 'Positive' Class : Yes
##
# Model 3 - Over Sampling
Pred3 <- predict(model_ovr_logit, test_com,type = "response")
y_pred <- ifelse(Pred3 > 0.6, "Yes","No")
confusionMatrix(factor(y_pred),test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 322 2
## Yes 35 6
##
## Accuracy : 0.8986
## 95% CI : (0.863, 0.9276)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2161
##
## Mcnemar's Test P-Value : 1.435e-07
##
## Sensitivity : 0.75000
## Specificity : 0.90196
## Pos Pred Value : 0.14634
## Neg Pred Value : 0.99383
## Prevalence : 0.02192
## Detection Rate : 0.01644
## Detection Prevalence : 0.11233
## Balanced Accuracy : 0.82598
##
## 'Positive' Class : Yes
##
# Model 4 - SMOTE
Pred4 <- predict(model_smo_logit, test_com,type = "response")
y_pred <- ifelse(Pred4 > 0.6, "Yes","No")
confusionMatrix(factor(y_pred),test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 319 2
## Yes 38 6
##
## Accuracy : 0.8904
## 95% CI : (0.8538, 0.9205)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2011
##
## Mcnemar's Test P-Value : 3.13e-08
##
## Sensitivity : 0.75000
## Specificity : 0.89356
## Pos Pred Value : 0.13636
## Neg Pred Value : 0.99377
## Prevalence : 0.02192
## Detection Rate : 0.01644
## Detection Prevalence : 0.12055
## Balanced Accuracy : 0.82178
##
## 'Positive' Class : Yes
##
# Over Sampling Model performs best when put on Logistic Regression
# we get a best trade off with RECALL - 75%, FPR - 9.09% and Accuracy of 89.86%
# original Complete Data
table(earn$Target)
##
## No Yes
## 1200 39
prop.table(table(earn$Target)) # High Imbalance
##
## No Yes
## 0.968523 0.031477
# Training Data
table(train_com$Target)
##
## No Yes
## 843 31
prop.table(table(train_sam$Target)) # further Imbalance
##
## No Yes
## 0.85625 0.14375
# Sampled Data
table(under_com$Target)
##
## No Yes
## 31 31
table(over_com$Target)
##
## No Yes
## 843 843
table(smote_com$Target)
##
## No Yes
## 437 437
# Model 1 - only on training data without any sampling technique
library(randomForest)
model_random <- randomForest(Target ~ .,
data = train_com,
ntree = 500,
mtry = 6,
importance = TRUE,
proximity= TRUE)
print(model_random)
##
## Call:
## randomForest(formula = Target ~ ., data = train_com, ntree = 500, mtry = 6, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 3.43%
## Confusion matrix:
## No Yes class.error
## No 837 6 0.007117438
## Yes 24 7 0.774193548
# Model 2 - only on training data with under Sampling Technique
model_und_random <- randomForest(Target ~ .,
data = under_com,
ntree = 500,
mtry = 6,
importance = TRUE,
proximity= TRUE)
# Model 3 - only on training data with over Sampling Technique
model_ovr_random <- randomForest(Target ~ .,
data = over_com,
ntree = 500,
mtry = 6,
importance = TRUE,
proximity= TRUE)
# Model 3 - only on training data with SMOTE Technique
model_smo_random <- randomForest(Target ~ .,
data = smote_com,
ntree = 500,
mtry = 6,
importance = TRUE,
proximity= TRUE)
# Checking which model performs best on Training Data
print(model_random) # Rank 4
##
## Call:
## randomForest(formula = Target ~ ., data = train_com, ntree = 500, mtry = 6, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 3.43%
## Confusion matrix:
## No Yes class.error
## No 837 6 0.007117438
## Yes 24 7 0.774193548
print(model_und_random) # Rank 3
##
## Call:
## randomForest(formula = Target ~ ., data = under_com, ntree = 500, mtry = 6, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 19.35%
## Confusion matrix:
## No Yes class.error
## No 25 6 0.1935484
## Yes 6 25 0.1935484
print(model_ovr_random) # RAnk 1
##
## Call:
## randomForest(formula = Target ~ ., data = over_com, ntree = 500, mtry = 6, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 0.36%
## Confusion matrix:
## No Yes class.error
## No 837 6 0.007117438
## Yes 0 843 0.000000000
print(model_smo_random) # RAnk 2
##
## Call:
## randomForest(formula = Target ~ ., data = smote_com, ntree = 500, mtry = 6, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 5.72%
## Confusion matrix:
## No Yes class.error
## No 401 36 0.08237986
## Yes 14 423 0.03203661
# Checking which model performs best on Testing data
# Model 1
Pred1 <- predict(model_random, test_com)
confusionMatrix(Pred1,test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 354 5
## Yes 3 3
##
## Accuracy : 0.9781
## 95% CI : (0.9573, 0.9905)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 0.5926
##
## Kappa : 0.4176
##
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.375000
## Specificity : 0.991597
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.986072
## Prevalence : 0.021918
## Detection Rate : 0.008219
## Detection Prevalence : 0.016438
## Balanced Accuracy : 0.683298
##
## 'Positive' Class : Yes
##
# Model 2 - Under Sampling
Pred2 <- predict(model_und_random, test_com)
confusionMatrix(Pred2,test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 258 1
## Yes 99 7
##
## Accuracy : 0.726
## 95% CI : (0.6772, 0.7712)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0855
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.87500
## Specificity : 0.72269
## Pos Pred Value : 0.06604
## Neg Pred Value : 0.99614
## Prevalence : 0.02192
## Detection Rate : 0.01918
## Detection Prevalence : 0.29041
## Balanced Accuracy : 0.79884
##
## 'Positive' Class : Yes
##
# Model 3 - Over Sampling
Pred3 <- predict(model_ovr_random, test_com)
confusionMatrix(Pred3,test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 356 7
## Yes 1 1
##
## Accuracy : 0.9781
## 95% CI : (0.9573, 0.9905)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 0.5926
##
## Kappa : 0.1929
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 0.125000
## Specificity : 0.997199
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.980716
## Prevalence : 0.021918
## Detection Rate : 0.002740
## Detection Prevalence : 0.005479
## Balanced Accuracy : 0.561099
##
## 'Positive' Class : Yes
##
# Model 4 - SMOTE
Pred4 <- predict(model_smo_random, test_com)
confusionMatrix(Pred4,test_com$Target, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 332 5
## Yes 25 3
##
## Accuracy : 0.9178
## 95% CI : (0.8847, 0.9439)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1.0000000
##
## Kappa : 0.1373
##
## Mcnemar's Test P-Value : 0.0005226
##
## Sensitivity : 0.375000
## Specificity : 0.929972
## Pos Pred Value : 0.107143
## Neg Pred Value : 0.985163
## Prevalence : 0.021918
## Detection Rate : 0.008219
## Detection Prevalence : 0.076712
## Balanced Accuracy : 0.652486
##
## 'Positive' Class : Yes
##
# Looks like undersampling technique, we are getting the best result
# where RECALL is 87.5%, FPR - 27.8% and Acc - 72.6%
# To check important variables
importance(model_und_random)
## No Yes MeanDecreaseAccuracy MeanDecreaseGini
## DSRI 19.9859671 10.4927153 18.6137469 8.824965
## GMI 0.8699998 -0.3952660 -0.0717469 1.872862
## AQI -4.2014921 -0.4546766 -2.4574979 1.281575
## SGI 9.5350293 7.7717381 11.1020745 4.872929
## DEPI -2.5088381 -5.0408946 -5.1777373 1.188867
## SGAI 3.7902269 -1.3646565 1.5398050 1.391985
## ACCR 16.8677842 13.6406390 19.6315002 9.485831
## LEVI 3.9216605 -0.3446201 2.2470759 1.545760
varImpPlot(model_und_random)
# Top 4 Variable Importance
varImpPlot(model_und_random,
sort = T,
n.var = 4,
main = "Top 4 Important Variables by Random Forest")
library("adabag")
adaboost <- boosting(Target ~ .,
data = under_com,
mfinal = 100,
control = rpart.control(maxdepth = 1))
summary(adaboost)
## Length Class Mode
## formula 3 formula call
## trees 100 -none- list
## weights 100 -none- numeric
## votes 124 -none- numeric
## prob 124 -none- numeric
## class 62 -none- character
## importance 8 -none- numeric
## terms 3 terms call
## call 5 -none- call
pred_boost = predict.boosting(adaboost, newdata = as.data.frame(test_com))
summary(pred_boost)
## Length Class Mode
## formula 3 formula call
## votes 730 -none- numeric
## prob 730 -none- numeric
## class 365 -none- character
## confusion 4 table numeric
## error 1 -none- numeric
print("Confusion Matrix using Adaboost")
## [1] "Confusion Matrix using Adaboost"
print(pred_boost$confusion)
## Observed Class
## Predicted Class No Yes
## No 282 2
## Yes 75 6
test_acc = 1-(pred_boost$error)
test_acc # Accuracy of 80.82%
## [1] 0.7890411
# TPR is 75%%
# FPR is 19.04%
# XGBOOST
control = trainControl(
method = 'cv',
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
xgb <- train(Target ~ .,
data = train_com,
method = "xgbTree",
metric = "Spec",
trControl = control)
confusionMatrix(predict(xgb, test_com), test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 356 7
## Yes 1 1
##
## Accuracy : 0.9781
## 95% CI : (0.9573, 0.9905)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 0.5926
##
## Kappa : 0.1929
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 0.125000
## Specificity : 0.997199
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.980716
## Prevalence : 0.021918
## Detection Rate : 0.002740
## Detection Prevalence : 0.005479
## Balanced Accuracy : 0.561099
##
## 'Positive' Class : Yes
##
library(rpart.plot)
table(smote_com$Target)
##
## No Yes
## 437 437
folds <- createFolds(y=smote_com$Target,k=10,list = TRUE,returnTrain = TRUE)
lapply(folds,length)
## $Fold01
## [1] 787
##
## $Fold02
## [1] 786
##
## $Fold03
## [1] 787
##
## $Fold04
## [1] 786
##
## $Fold05
## [1] 787
##
## $Fold06
## [1] 787
##
## $Fold07
## [1] 786
##
## $Fold08
## [1] 787
##
## $Fold09
## [1] 787
##
## $Fold10
## [1] 786
fold1<-folds[[1]]
train1<-smote_com[fold1,]
dim(train1)
## [1] 787 9
table(train1$Target)
##
## No Yes
## 393 394
rpart_tree1 <- rpart(Target~.,
data=train1,
method="class",
parms = list(split = "gini"))
prp(rpart_tree1,extra=4)
predicted <- predict(rpart_tree1,newdata=test_com, type="class")
confusionMatrix(predicted,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 300 2
## Yes 57 6
##
## Accuracy : 0.8384
## 95% CI : (0.7965, 0.8746)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1354
##
## Mcnemar's Test P-Value : 2.062e-12
##
## Sensitivity : 0.75000
## Specificity : 0.84034
## Pos Pred Value : 0.09524
## Neg Pred Value : 0.99338
## Prevalence : 0.02192
## Detection Rate : 0.01644
## Detection Prevalence : 0.17260
## Balanced Accuracy : 0.79517
##
## 'Positive' Class : Yes
##
# RECALL - 62.5%, FPR - 14.3% and ACC - 85.21%
# Cross validation with Random Forest
modelcv <- train(Target~., data=train_com,
method = "rf",
trControl = trainControl(method = "cv", number = 10))
pred<- predict(modelcv, newdata = test_com)
confusionMatrix(pred,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 357 8
## Yes 0 0
##
## Accuracy : 0.9781
## 95% CI : (0.9573, 0.9905)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 0.59255
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.97808
## Prevalence : 0.02192
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : Yes
##
# This model does not perform well as RECALL is 37.5%
# Cross validation with Logistic Regression(After Balancing the data)
modelglm <- train(Target~., data=smote_com,
method = "glm",
trControl = trainControl(method = "cv", number = 10))
pred<- predict(modelglm, newdata = test_com)
confusionMatrix(pred,test_com$Target,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 307 1
## Yes 50 7
##
## Accuracy : 0.8603
## 95% CI : (0.8204, 0.8942)
## No Information Rate : 0.9781
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.184
##
## Mcnemar's Test P-Value : 1.801e-11
##
## Sensitivity : 0.87500
## Specificity : 0.85994
## Pos Pred Value : 0.12281
## Neg Pred Value : 0.99675
## Prevalence : 0.02192
## Detection Rate : 0.01918
## Detection Prevalence : 0.15616
## Balanced Accuracy : 0.86747
##
## 'Positive' Class : Yes
##
# RECALL - 87.5%, FPR - 14.1%, ACC - 86%
# This looks like a decent result which can be deployed
# Neural network model
num_cols <- unlist(lapply(earn, is.numeric))
num_cols
## DSRI GMI AQI SGI DEPI SGAI ACCR LEVI Target
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
Num <- earn[, num_cols]
str(Num)
## tibble [1,239 × 8] (S3: tbl_df/tbl/data.frame)
## $ DSRI: num [1:1239] 1.62 1 1 1.49 1 ...
## $ GMI : num [1:1239] 1.13 1.61 1.02 1 1.37 ...
## $ AQI : num [1:1239] 7.185 1.005 1.241 0.466 0.637 ...
## $ SGI : num [1:1239] 0.366 13.081 1.475 0.673 0.861 ...
## $ DEPI: num [1:1239] 1.38 0.4 1.17 2 1.45 ...
## $ SGAI: num [1:1239] 1.6241 5.1982 0.6477 0.0929 1.7415 ...
## $ ACCR: num [1:1239] -0.1668 0.0605 0.0367 0.2734 0.123 ...
## $ LEVI: num [1:1239] 1.161 0.987 1.264 0.681 0.939 ...
# Create vector of column Max and Min values
maxs <- apply(Num, 2, max)
mins <- apply(Num, 2, min)
# Use scale() and convert the resulting matrix to a data frame
scaled.data = as.data.frame(scale(Num, center = mins, scale = maxs - mins))
head(scaled.data)
## DSRI GMI AQI SGI DEPI SGAI ACCR
## 1 0.04476959 0.3261186 0.4671749 0.02593324 0.24651346 0.032942925 0.7254232
## 2 0.02755490 0.3332169 0.3951229 1.00000000 0.06219235 0.105436499 0.7808125
## 3 0.02755490 0.3244343 0.3978791 0.11087489 0.20667035 0.013136867 0.7750261
## 4 0.04095316 0.3242023 0.3888336 0.04942290 0.36265903 0.001884109 0.8327107
## 5 0.02755490 0.3296875 0.3908340 0.06386372 0.26025165 0.035322449 0.7960614
## 6 0.02495184 0.3295668 0.3925464 0.13525910 0.22711927 0.010248312 0.7793910
## LEVI
## 1 0.08891346
## 2 0.07556213
## 3 0.09681811
## 4 0.05214779
## 5 0.07191048
## 6 0.11817055
summary(scaled.data)
## DSRI GMI AQI SGI
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.02455 1st Qu.:0.3231 1st Qu.:0.3924 1st Qu.:0.07220
## Median :0.02818 Median :0.3242 Median :0.3951 Median :0.08130
## Mean :0.03221 Mean :0.3240 Mean :0.3950 Mean :0.08422
## 3rd Qu.:0.03286 3rd Qu.:0.3251 3rd Qu.:0.3976 3rd Qu.:0.08980
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## DEPI SGAI ACCR LEVI
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1630 1st Qu.:0.01823 1st Qu.:0.7475 1st Qu.:0.07070
## Median :0.1752 Median :0.02028 Median :0.7589 Median :0.07758
## Mean :0.1824 Mean :0.02246 Mean :0.7582 Mean :0.08095
## 3rd Qu.:0.1901 3rd Qu.:0.02292 3rd Qu.:0.7716 3rd Qu.:0.08543
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.00000
data <- data.frame(scaled.data, earn[!num_cols])
str(data)
## 'data.frame': 1239 obs. of 9 variables:
## $ DSRI : num 0.0448 0.0276 0.0276 0.041 0.0276 ...
## $ GMI : num 0.326 0.333 0.324 0.324 0.33 ...
## $ AQI : num 0.467 0.395 0.398 0.389 0.391 ...
## $ SGI : num 0.0259 1 0.1109 0.0494 0.0639 ...
## $ DEPI : num 0.2465 0.0622 0.2067 0.3627 0.2603 ...
## $ SGAI : num 0.03294 0.10544 0.01314 0.00188 0.03532 ...
## $ ACCR : num 0.725 0.781 0.775 0.833 0.796 ...
## $ LEVI : num 0.0889 0.0756 0.0968 0.0521 0.0719 ...
## $ Target: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
table(data$Target)
##
## No Yes
## 1200 39
set.seed(123)
indx <- sample(2, nrow(data), replace = T, prob = c(0.7, 0.3))
train <- data[indx == 1, ]
test <- data[indx == 2, ]
table(train$Target)
##
## No Yes
## 849 25
table(test$Target)
##
## No Yes
## 351 14
library(nnet)
nn <- nnet(Target ~ ., data = train, linout=F, size=10, decay=0.01)
## # weights: 101
## initial value 829.860175
## iter 10 value 113.830215
## iter 20 value 103.753055
## iter 30 value 92.992084
## iter 40 value 88.238549
## iter 50 value 85.258894
## iter 60 value 84.352992
## iter 70 value 84.013274
## iter 80 value 83.739899
## iter 90 value 83.635222
## iter 100 value 83.579918
## final value 83.579918
## stopped after 100 iterations
#Neural network model out put
summary(nn) #Summary gives you the number units in each layer in addition to all the weights
## a 8-10-1 network with 101 weights
## options were - entropy fitting decay=0.01
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1
## -1.33 -0.08 -0.50 -0.64 -0.23 -0.24 -0.09 -1.12 -0.09
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2
## 3.92 -4.47 -4.84 -3.22 0.16 3.83 -2.87 -3.01 0.00
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3
## 1.26 -0.38 -1.13 -1.12 -0.41 0.94 -1.08 -1.33 0.05
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4
## 6.39 -4.03 -2.59 -2.80 -8.16 -9.22 -2.56 -3.34 0.75
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5
## -0.57 -0.14 -0.51 -0.59 -0.27 0.07 -0.25 -0.83 -0.10
## b->h6 i1->h6 i2->h6 i3->h6 i4->h6 i5->h6 i6->h6 i7->h6 i8->h6
## 0.25 -0.44 -0.89 -0.79 -0.29 0.45 -0.68 -0.96 -0.01
## b->h7 i1->h7 i2->h7 i3->h7 i4->h7 i5->h7 i6->h7 i7->h7 i8->h7
## -1.53 0.44 1.36 1.17 0.27 -1.05 1.26 1.50 -0.06
## b->h8 i1->h8 i2->h8 i3->h8 i4->h8 i5->h8 i6->h8 i7->h8 i8->h8
## -2.70 0.66 2.12 1.72 0.44 -1.52 2.04 2.25 -0.17
## b->h9 i1->h9 i2->h9 i3->h9 i4->h9 i5->h9 i6->h9 i7->h9 i8->h9
## -1.68 -0.04 -0.54 -0.69 -0.23 -0.28 -0.02 -1.28 -0.09
## b->h10 i1->h10 i2->h10 i3->h10 i4->h10 i5->h10 i6->h10 i7->h10 i8->h10
## 2.40 -0.82 -2.60 -1.57 -0.30 1.34 -1.85 -1.88 0.04
## b->o h1->o h2->o h3->o h4->o h5->o h6->o h7->o h8->o h9->o h10->o
## 3.12 -0.74 -9.81 -2.93 -13.34 -1.38 -2.14 3.72 5.21 -0.74 -5.21
#To plot the neural network using nnet we need to use devtools
library(devtools)
source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')
plot.nnet(nn)
# The darker lines are associated to the higher weights and gray lines are for small weights
# Important Variables with Neural Network are - ACCR, SGAI, DEPI, LEVI
#SVM
library(e1071)
classifier = svm(formula = Target ~ .,
data = train_com,
type = 'C-classification',
kernel = 'radial')
y_pred = predict(classifier, newdata = test_com)
cm = table(test_com$Target, y_pred)
cm
## y_pred
## No Yes
## No 357 0
## Yes 8 0
# SVM also is not yielding good results as FPR is very bad for this model