Problem Definition
Predict the Median_Recommendation class of a company from its financial indicators

Setup

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra) 
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(Deducer)
## Loading required package: JGR
## Loading required package: rJava
## Loading required package: JavaGD
## Loading required package: iplots
## 
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Note Non-JGR console detected:
##  Deducer is best used from within JGR (http://jgr.markushelbig.org/).
##  To Bring up GUI dialogs, type deducer().
library(caret)
## Loading required package: lattice
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(nnet)
## Warning: package 'nnet' was built under R version 3.4.3

Functions

Dataset

# Load the competition dataset into dfrModel and preview its first rows.
# NOTE(review): setwd() ties the script to one machine's directory layout; it is
# kept so that any later relative paths still resolve - consider a
# project-relative path instead.
setwd("D:/Welingkar/Competitions/Anaholix/Round2/Q1")
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, which would silently change how the file is read.
dfrModel <- read.csv("./Question1.csv", header = TRUE, stringsAsFactors = FALSE)
head(dfrModel)
##   Market_Cap Beta PE_Ratio  ROE  ROA Asset_Turnover Leverage Rev_Growth
## 1      68.44 0.32     24.7 26.4 11.8            0.7     0.42       7.54
## 2       7.58 0.41     82.5 12.9  5.5            0.9     0.60       9.16
## 3       6.30 0.46     20.7 14.9  7.8            0.9     0.27       7.05
## 4      67.63 0.52     21.5 27.4 15.4            0.9     0.00      15.00
## 5      47.16 0.32     20.1 21.8  7.5            0.6     0.34      26.81
## 6      16.90 1.11     27.9  3.9  1.4            0.6     0.00      -3.17
##   Net_Profit_Margin Median_Recommendation Location Exchange
## 1              16.1                     3        1        1
## 2               5.5                     3        2        1
## 3              11.2                     4        3        1
## 4              18.0                     2        3        1
## 5              12.9                     3        4        1
## 6               2.6                     1        5        1

Observation
Only numeric data is present, so no data changes are required.

Missing Data

# Apply detect_na (defined elsewhere) to every column of dfrModel; the result
# is a list with one entry per column.
# NOTE(review): presumably detect_na reports the number of NA values in a
# column - confirm against its definition.
lapply(X = dfrModel, FUN = detect_na)
## $Market_Cap
## [1] 0
## 
## $Beta
## [1] 0
## 
## $PE_Ratio
## [1] 0
## 
## $ROE
## [1] 0
## 
## $ROA
## [1] 0
## 
## $Asset_Turnover
## [1] 0
## 
## $Leverage
## [1] 0
## 
## $Rev_Growth
## [1] 0
## 
## $Net_Profit_Margin
## [1] 0
## 
## $Median_Recommendation
## [1] 0
## 
## $Location
## [1] 0
## 
## $Exchange
## [1] 0

Observation
1. There are no NA records in dataset.

Outliers Data

# Apply detect_outliers (defined elsewhere) to every column of dfrModel; the
# result is a list with one entry per column.
# NOTE(review): presumably each entry holds the values flagged as outliers -
# confirm the rule used against the helper's definition.
lapply(X = dfrModel, FUN = detect_outliers)
## $Market_Cap
## numeric(0)
## 
## $Beta
## numeric(0)
## 
## $PE_Ratio
## [1] 82.5 56.5
## 
## $ROE
## numeric(0)
## 
## $ROA
## numeric(0)
## 
## $Asset_Turnover
## numeric(0)
## 
## $Leverage
## [1] 3.51
## 
## $Rev_Growth
## numeric(0)
## 
## $Net_Profit_Margin
## numeric(0)
## 
## $Median_Recommendation
## integer(0)
## 
## $Location
## integer(0)
## 
## $Exchange
## [1] 2 3

Observations
1. There are only a few outliers, so we will keep them in the data.

Outliers Graph

# Call Graph_Boxplot (defined elsewhere) once per column of dfrModel.
# NOTE(review): presumably draws one boxplot per feature - confirm against the
# helper's definition.
lapply(X = dfrModel, FUN = Graph_Boxplot)
## $Market_Cap

## 
## $Beta

## 
## $PE_Ratio

## 
## $ROE

## 
## $ROA

## 
## $Asset_Turnover

## 
## $Leverage

## 
## $Rev_Growth

## 
## $Net_Profit_Margin

## 
## $Median_Recommendation

## 
## $Location

## 
## $Exchange

Observations
There are a few outliers.
We will keep the outliers in the data.

Train Data

# Report the size of the dataset as (rows, columns).
c(nrow(dfrModel), ncol(dfrModel))
## [1] 21 12

Missing Data

# Re-run detect_na (defined elsewhere) on every column of dfrModel before
# modelling; yields a list with one entry per column.
lapply(X = dfrModel, FUN = detect_na)
## $Market_Cap
## [1] 0
## 
## $Beta
## [1] 0
## 
## $PE_Ratio
## [1] 0
## 
## $ROE
## [1] 0
## 
## $ROA
## [1] 0
## 
## $Asset_Turnover
## [1] 0
## 
## $Leverage
## [1] 0
## 
## $Rev_Growth
## [1] 0
## 
## $Net_Profit_Margin
## [1] 0
## 
## $Median_Recommendation
## [1] 0
## 
## $Location
## [1] 0
## 
## $Exchange
## [1] 0

Observation
No column in the dataset contains NA values.

Summary

# Print summary() for each column separately; the result is a list keyed by
# column name rather than the single table that summary(dfrModel) would give.
lapply(X = dfrModel, FUN = summary)
## $Market_Cap
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.41    6.30   48.19   57.65   73.84  199.47 
## 
## $Beta
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1800  0.3500  0.4600  0.5257  0.6500  1.1100 
## 
## $PE_Ratio
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.60   18.90   21.50   25.46   27.90   82.50 
## 
## $ROE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.9    14.9    22.6    25.8    31.0    62.9 
## 
## $ROA
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.40    5.70   11.20   10.51   15.00   20.30 
## 
## $Asset_Turnover
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.3     0.6     0.6     0.7     0.9     1.1 
## 
## $Leverage
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1600  0.3400  0.5857  0.6000  3.5100 
## 
## $Rev_Growth
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -3.17    6.38    9.37   13.37   21.87   34.21 
## 
## $Net_Profit_Margin
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.6    11.2    16.1    15.7    21.1    25.5 
## 
## $Median_Recommendation
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       2       2       3       4 
## 
## $Location
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    1.00    2.19    3.00    7.00 
## 
## $Exchange
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.143   1.000   3.000

Correlation

# Pearson correlation of Median_Recommendation with every column of dfrModel
# (including itself, which yields 1).
# vapply() replaces the loop that grew the vector with c() on each pass; it
# returns one numeric value per column and carries the column names over.
dfrCorr <- vapply(
    dfrModel,
    function(col) cor(as.numeric(dfrModel$Median_Recommendation), as.numeric(col)),
    numeric(1)
)
# Unnamed copy kept under the original name for any later code that uses it.
vctCorr <- unname(dfrCorr)
dfrCorr
##            Market_Cap                  Beta              PE_Ratio 
##           -0.06177162            0.03894983            0.16124823 
##                   ROE                   ROA        Asset_Turnover 
##           -0.22638420           -0.14094038            0.04612656 
##              Leverage            Rev_Growth     Net_Profit_Margin 
##            0.15806781            0.23437920           -0.12343707 
## Median_Recommendation              Location              Exchange 
##            1.00000000           -0.10749108           -0.10458250

Data For Visualization

# Reshape to long format for plotting: every column except
# Median_Recommendation is stacked into a (variable, value) pair.
dfrGraph <- dfrModel %>%
    gather(key = variable, value = value, -Median_Recommendation)
head(dfrGraph, n = 6)
##   Median_Recommendation   variable value
## 1                     3 Market_Cap 68.44
## 2                     3 Market_Cap  7.58
## 3                     4 Market_Cap  6.30
## 4                     2 Market_Cap 67.63
## 5                     3 Market_Cap 47.16
## 6                     1 Market_Cap 16.90

Data Visualization

# Jittered scatter of each feature's value against Median_Recommendation, one
# facet per feature. scales = "free_x" gives every facet its own x-axis range.
# The title now names the variable actually plotted; the previous text was a
# leftover from a different analysis.
ggplot(dfrGraph) +
    geom_jitter(aes(value, Median_Recommendation, colour = variable)) +
    facet_wrap(~variable, scales = "free_x") +
    labs(title = "Relation Of Median Recommendation With Other Features")

Observation
As per the graph, there is some correlation between the outcome variable (Median_Recommendation) and the other variables.

Summary

# Print summary() for each column of dfrModel again ahead of model fitting.
lapply(X = dfrModel, FUN = summary)
## $Market_Cap
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.41    6.30   48.19   57.65   73.84  199.47 
## 
## $Beta
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1800  0.3500  0.4600  0.5257  0.6500  1.1100 
## 
## $PE_Ratio
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.60   18.90   21.50   25.46   27.90   82.50 
## 
## $ROE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.9    14.9    22.6    25.8    31.0    62.9 
## 
## $ROA
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.40    5.70   11.20   10.51   15.00   20.30 
## 
## $Asset_Turnover
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.3     0.6     0.6     0.7     0.9     1.1 
## 
## $Leverage
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1600  0.3400  0.5857  0.6000  3.5100 
## 
## $Rev_Growth
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -3.17    6.38    9.37   13.37   21.87   34.21 
## 
## $Net_Profit_Margin
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.6    11.2    16.1    15.7    21.1    25.5 
## 
## $Median_Recommendation
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       2       2       3       4 
## 
## $Location
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    1.00    2.19    3.00    7.00 
## 
## $Exchange
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.143   1.000   3.000

Observation
Mean and median are nearly equal after data imputation, which helps to reduce outliers.
The data summary looks good, so we can continue with the logistic model.

Multinomial Logistic Model
Choose the best logistic model by using step().

# Fit a multinomial logistic regression of Median_Recommendation on the listed
# predictors (Market_Cap is not part of the formula).
mymodel <- multinom(
    formula = Median_Recommendation ~ Beta + PE_Ratio + ROE + ROA +
        Asset_Turnover + Leverage + Rev_Growth + Net_Profit_Margin +
        Location + Exchange,
    data = dfrModel
)
## # weights:  48 (33 variable)
## initial  value 29.112182 
## iter  10 value 20.518427
## iter  20 value 9.314342
## iter  30 value 1.910117
## iter  40 value 0.007032
## final  value 0.000075 
## converged
# Print the fitted model's call, coefficients, standard errors, residual
# deviance and AIC.
summary(object = mymodel)
## Call:
## multinom(formula = Median_Recommendation ~ Beta + PE_Ratio + 
##     ROE + ROA + Asset_Turnover + Leverage + Rev_Growth + Net_Profit_Margin + 
##     Location + Exchange, data = dfrModel)
## 
## Coefficients:
##   (Intercept)       Beta    PE_Ratio       ROE      ROA Asset_Turnover
## 2    31.12473   9.140249 -29.3979785 -25.38890 86.80701       86.35753
## 3   -65.84934  -2.307581   0.1584095 -20.59774 79.15472      -81.40181
## 4    79.87170 -61.936233  -7.8910773 -18.26666 38.83841      256.98086
##   Leverage Rev_Growth Net_Profit_Margin Location  Exchange
## 2 312.4966  13.204994         -9.483626 56.77904 -152.0096
## 3 460.5909   8.450410        -15.694926 27.00512 -291.0530
## 4 122.5953   5.116294        -18.190461 29.03111  -26.8979
## 
## Std. Errors:
##   (Intercept)        Beta  PE_Ratio       ROE        ROA Asset_Turnover
## 2  306.445091  185.107686 1964.1258 3624.9487  548.80282     135.278690
## 3 2330.320283 1047.235339 1644.5566 2180.7877 8731.49691    1242.988557
## 4    5.764179    6.398238  160.8206   22.4803    8.06985       3.458507
##       Leverage Rev_Growth Net_Profit_Margin   Location    Exchange
## 2 1.487197e+02 9480.12351        2279.42338 6590.08398  306.445094
## 3 1.139967e+03 7848.02283        4118.17270 1407.76646 2330.320283
## 4 3.924808e-08   18.27245          14.98686   28.82089    5.764179
## 
## Residual Deviance: 0.0001505056 
## AIC: 66.00015