knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE
)
setwd("C:/Users/aksha/OneDrive/Documents/Shilpi_xtras/Shilpi_extras/GL_BAPI/Rprgm")
suppressMessages(library(kableExtra))
## Warning: package 'kableExtra' was built under R version 3.5.3
suppressMessages(library(gridExtra))
## Warning: package 'gridExtra' was built under R version 3.5.3
suppressMessages(library(VIF))
suppressMessages(library(car))
suppressMessages(library(corrplot))
## Warning: package 'corrplot' was built under R version 3.5.3
suppressMessages(library(polycor))
## Warning: package 'polycor' was built under R version 3.5.3
suppressMessages(library(caret))
## Warning: package 'caret' was built under R version 3.5.3
## Warning: package 'ggplot2' was built under R version 3.5.3
suppressMessages(library(gridExtra))
suppressMessages(library(MASS))
## Warning: package 'MASS' was built under R version 3.5.3
suppressMessages(library(dplyr))
## Warning: package 'dplyr' was built under R version 3.5.3
suppressMessages(library(caTools))
## Warning: package 'caTools' was built under R version 3.5.3
suppressMessages(library(tidyverse))
## Warning: package 'tidyverse' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
suppressMessages(library(ggplot2))
suppressMessages(library(mice))
## Warning: package 'mice' was built under R version 3.5.3
set.seed(1234)
Data Import for Raw and Validation set
FRA_raw_read_df <- read.csv("raw_data_FRA.csv" ,na.strings = c("","?", "NA"))
test_raw_read_df <- read.csv("validation_data_FRAcsv.csv" ,na.strings = c("","?", "NA"))
####Target variable is "Default" column which is integer and binary in nature.
####1 means Default and 0 means not a Default
FRA_raw_read_df<-as_tibble(FRA_raw_read_df)
test_raw_read_df<-as_tibble(test_raw_read_df)
kable(head(FRA_raw_read_df)) %>% kable_styling(latex_options = "striped","bordered", position = "float_left")
|
Num
|
Default
|
Total.assets
|
Net.worth
|
Total.income
|
Change.in.stock
|
Total.expenses
|
Profit.after.tax
|
PBDITA
|
PBT
|
Cash.profit
|
PBDITA.as…of.total.income
|
PBT.as…of.total.income
|
PAT.as…of.total.income
|
Cash.profit.as…of.total.income
|
PAT.as…of.net.worth
|
Sales
|
Income.from.financial.services
|
Other.income
|
Total.capital
|
Reserves.and.funds
|
Deposits..accepted.by.commercial.banks.
|
Borrowings
|
Current.liabilities…provisions
|
Deferred.tax.liability
|
Shareholders.funds
|
Cumulative.retained.profits
|
Capital.employed
|
TOL.TNW
|
Total.term.liabilities…tangible.net.worth
|
Contingent.liabilities…Net.worth….
|
Contingent.liabilities
|
Net.fixed.assets
|
Investments
|
Current.assets
|
Net.working.capital
|
Quick.ratio..times.
|
Current.ratio..times.
|
Debt.to.equity.ratio..times.
|
Cash.to.current.liabilities..times.
|
Cash.to.average.cost.of.sales.per.day
|
Creditors.turnover
|
Debtors.turnover
|
Finished.goods.turnover
|
WIP.turnover
|
Raw.material.turnover
|
Shares.outstanding
|
Equity.face.value
|
EPS
|
Adjusted.EPS
|
Total.liabilities
|
PE.on.BSE
|
|
1
|
0
|
17512.3
|
7093.2
|
24965.2
|
235.8
|
23657.8
|
1543.2
|
2860.2
|
2417.2
|
1872.8
|
11.46
|
9.68
|
6.18
|
7.50
|
23.78
|
24458.0
|
158.0
|
297.2
|
423.8
|
6822.8
|
NA
|
14.9
|
9965.9
|
284.9
|
7093.2
|
6263.3
|
7108.1
|
1.33
|
0.00
|
14.80
|
1049.7
|
1900.2
|
1069.6
|
13277.5
|
3588.5
|
1.18
|
1.37
|
0.00
|
0.43
|
68.21
|
3.62
|
3.85
|
200.55
|
21.78
|
7.71
|
42381675
|
10
|
35.52
|
7.10
|
17512.3
|
27.31
|
|
2
|
0
|
941.0
|
351.5
|
1527.4
|
42.7
|
1454.9
|
115.2
|
283.0
|
188.4
|
158.6
|
18.53
|
12.33
|
7.54
|
10.38
|
38.08
|
1504.3
|
4.0
|
15.9
|
115.5
|
257.8
|
NA
|
272.5
|
210.0
|
85.2
|
351.5
|
247.4
|
624.0
|
1.23
|
0.34
|
19.23
|
67.6
|
286.4
|
2.2
|
563.9
|
203.5
|
0.95
|
1.56
|
0.78
|
0.06
|
5.96
|
9.80
|
5.70
|
14.21
|
7.49
|
11.46
|
11550000
|
10
|
9.97
|
9.97
|
941.0
|
8.17
|
|
3
|
0
|
232.8
|
100.6
|
477.3
|
-5.2
|
478.7
|
-6.6
|
5.8
|
-6.6
|
0.3
|
1.22
|
-1.38
|
-1.38
|
0.06
|
-6.35
|
475.6
|
1.5
|
0.2
|
81.4
|
19.2
|
NA
|
35.4
|
96.8
|
NA
|
100.6
|
32.4
|
136.0
|
1.44
|
0.29
|
45.83
|
46.1
|
38.7
|
4.3
|
167.5
|
59.6
|
1.11
|
1.55
|
0.35
|
0.21
|
17.07
|
5.28
|
5.07
|
9.24
|
0.23
|
NA
|
8149090
|
10
|
-0.50
|
-0.50
|
232.8
|
-5.76
|
|
4
|
0
|
2.7
|
2.7
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
0.00
|
0.00
|
0.00
|
0.00
|
0.00
|
NA
|
NA
|
NA
|
0.5
|
2.2
|
NA
|
NA
|
NA
|
NA
|
2.7
|
2.2
|
2.7
|
0.00
|
0.00
|
0.00
|
NA
|
2.5
|
NA
|
0.2
|
0.2
|
NA
|
NA
|
0.00
|
NA
|
NA
|
0.00
|
0.00
|
NA
|
NA
|
0.00
|
52404
|
10
|
0.00
|
0.00
|
2.7
|
NA
|
|
5
|
0
|
478.5
|
107.6
|
1580.5
|
-17.0
|
1558.0
|
5.5
|
31.0
|
6.3
|
11.9
|
1.96
|
0.40
|
0.35
|
0.75
|
5.25
|
1575.1
|
3.9
|
0.9
|
6.2
|
161.8
|
NA
|
193.1
|
112.8
|
4.6
|
107.6
|
82.7
|
300.7
|
2.83
|
1.59
|
34.94
|
37.6
|
94.8
|
7.4
|
349.7
|
215.8
|
1.41
|
2.54
|
1.79
|
0.00
|
0.00
|
13.00
|
9.46
|
12.68
|
7.90
|
17.03
|
619635
|
10
|
7.91
|
7.91
|
478.5
|
NA
|
|
6
|
0
|
2434.4
|
675.8
|
2648.6
|
62.3
|
2636.4
|
74.5
|
200.1
|
74.5
|
146.9
|
7.55
|
2.81
|
2.81
|
5.55
|
21.78
|
2639.5
|
6.4
|
0.2
|
33.8
|
972.0
|
NA
|
717.1
|
555.9
|
54.4
|
698.2
|
317.7
|
1415.3
|
1.80
|
0.37
|
36.28
|
245.2
|
864.9
|
22.7
|
1296.2
|
278.5
|
0.48
|
1.27
|
1.09
|
0.11
|
15.78
|
6.50
|
21.13
|
10.14
|
8.38
|
4.74
|
1141718
|
10
|
30.57
|
15.28
|
2434.4
|
NA
|
kable(head(test_raw_read_df)) %>% kable_styling(latex_options = "striped","bordered", position = "float_left")
|
Num
|
Default…1
|
Total.assets
|
Net.worth
|
Total.income
|
Change.in.stock
|
Total.expenses
|
Profit.after.tax
|
PBDITA
|
PBT
|
Cash.profit
|
PBDITA.as…of.total.income
|
PBT.as…of.total.income
|
PAT.as…of.total.income
|
Cash.profit.as…of.total.income
|
PAT.as…of.net.worth
|
Sales
|
Income.from.financial.services
|
Other.income
|
Total.capital
|
Reserves.and.funds
|
Deposits..accepted.by.commercial.banks.
|
Borrowings
|
Current.liabilities…provisions
|
Deferred.tax.liability
|
Shareholders.funds
|
Cumulative.retained.profits
|
Capital.employed
|
TOL.TNW
|
Total.term.liabilities…tangible.net.worth
|
Contingent.liabilities…Net.worth….
|
Contingent.liabilities
|
Net.fixed.assets
|
Investments
|
Current.assets
|
Net.working.capital
|
Quick.ratio..times.
|
Current.ratio..times.
|
Debt.to.equity.ratio..times.
|
Cash.to.current.liabilities..times.
|
Cash.to.average.cost.of.sales.per.day
|
Creditors.turnover
|
Debtors.turnover
|
Finished.goods.turnover
|
WIP.turnover
|
Raw.material.turnover
|
Shares.outstanding
|
Equity.face.value
|
EPS
|
Adjusted.EPS
|
Total.liabilities
|
PE.on.BSE
|
|
1545
|
0
|
970.6
|
275.8
|
2185.2
|
14.2
|
2099.2
|
100.2
|
285.6
|
152.1
|
182.3
|
13.07
|
6.96
|
4.59
|
8.34
|
42.11
|
2171.1
|
2.3
|
NA
|
48.0
|
413.1
|
NA
|
177.3
|
328.5
|
3.7
|
275.8
|
227.8
|
453.1
|
1.80
|
0.27
|
112.94
|
311.5
|
332.3
|
NA
|
559.7
|
134.2
|
0.92
|
1.31
|
0.64
|
0.09
|
7.56
|
5.94
|
5.74
|
25.11
|
20.01
|
17.58
|
4800000
|
10
|
18.60
|
18.60
|
970.6
|
NA
|
|
735
|
0
|
675.0
|
211.6
|
819.2
|
10.4
|
809.9
|
19.7
|
116.0
|
33.7
|
50.5
|
14.16
|
4.11
|
2.40
|
6.16
|
10.66
|
817.0
|
0.8
|
0.2
|
114.0
|
97.6
|
NA
|
339.8
|
100.5
|
23.1
|
211.6
|
97.6
|
551.4
|
2.01
|
0.72
|
5.77
|
12.2
|
199.1
|
NA
|
407.3
|
123.6
|
0.48
|
1.39
|
1.61
|
0.03
|
3.88
|
10.59
|
6.03
|
28.96
|
18.65
|
2.67
|
11400000
|
10
|
1.65
|
1.65
|
675.0
|
NA
|
|
2605
|
1
|
532.1
|
120.2
|
563.5
|
-28.1
|
577.8
|
-42.4
|
-31.0
|
-56.0
|
-35.3
|
-5.50
|
-9.94
|
-7.52
|
-6.26
|
-31.20
|
552.2
|
9.1
|
2.1
|
47.1
|
227.4
|
NA
|
17.5
|
240.1
|
NA
|
120.2
|
69.9
|
137.7
|
1.73
|
0.09
|
102.83
|
123.6
|
270.0
|
0.7
|
147.8
|
-97.1
|
0.32
|
0.60
|
0.15
|
0.04
|
4.63
|
2.35
|
9.60
|
8.23
|
6.60
|
3.77
|
471285
|
100
|
-90.39
|
-90.39
|
532.1
|
-15.50
|
|
1292
|
0
|
857.5
|
201.4
|
3576.5
|
-0.6
|
3613.4
|
-37.5
|
68.2
|
25.7
|
37.3
|
1.91
|
0.72
|
-1.05
|
1.04
|
0.00
|
3573.3
|
1.0
|
1.5
|
50.5
|
150.9
|
NA
|
524.2
|
75.2
|
56.7
|
201.4
|
150.9
|
725.6
|
2.94
|
0.81
|
0.65
|
1.3
|
262.8
|
NA
|
535.6
|
99.6
|
0.51
|
1.23
|
2.60
|
0.08
|
3.71
|
NA
|
NA
|
NA
|
NA
|
NA
|
5050000
|
10
|
-7.09
|
-7.09
|
857.5
|
-0.16
|
|
134
|
0
|
823.3
|
349.2
|
1034.3
|
28.9
|
1041.8
|
21.4
|
90.1
|
29.7
|
62.7
|
8.71
|
2.87
|
2.07
|
6.06
|
6.31
|
1026.7
|
0.7
|
2.3
|
33.0
|
316.2
|
NA
|
162.3
|
299.6
|
12.2
|
349.2
|
316.2
|
511.5
|
1.02
|
0.10
|
28.78
|
100.5
|
190.6
|
NA
|
471.8
|
75.3
|
0.58
|
1.19
|
0.46
|
0.08
|
11.15
|
5.48
|
4.78
|
6.28
|
6.70
|
3.70
|
3205946
|
10
|
5.90
|
5.90
|
823.3
|
NA
|
|
911
|
0
|
166.2
|
88.0
|
201.6
|
-0.5
|
200.2
|
0.9
|
10.6
|
3.4
|
3.9
|
5.26
|
1.69
|
0.45
|
1.93
|
0.00
|
201.0
|
0.5
|
0.1
|
23.3
|
56.8
|
NA
|
28.1
|
48.0
|
2.1
|
88.0
|
8.2
|
116.1
|
0.86
|
0.11
|
0.00
|
NA
|
17.1
|
NA
|
143.3
|
76.9
|
0.97
|
1.86
|
0.32
|
0.00
|
0.57
|
NA
|
NA
|
NA
|
NA
|
NA
|
146385
|
100
|
6.83
|
6.83
|
166.2
|
NA
|
Data type of Raw
str(FRA_raw_read_df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3541 obs. of 52 variables:
## $ Num : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Default : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Total.assets : num 17512.3 941 232.8 2.7 478.5 ...
## $ Net.worth : num 7093.2 351.5 100.6 2.7 107.6 ...
## $ Total.income : num 24965 1527 477 NA 1580 ...
## $ Change.in.stock : num 235.8 42.7 -5.2 NA -17 ...
## $ Total.expenses : num 23658 1455 479 NA 1558 ...
## $ Profit.after.tax : num 1543.2 115.2 -6.6 NA 5.5 ...
## $ PBDITA : num 2860.2 283 5.8 NA 31 ...
## $ PBT : num 2417.2 188.4 -6.6 NA 6.3 ...
## $ Cash.profit : num 1872.8 158.6 0.3 NA 11.9 ...
## $ PBDITA.as...of.total.income : num 11.46 18.53 1.22 0 1.96 ...
## $ PBT.as...of.total.income : num 9.68 12.33 -1.38 0 0.4 ...
## $ PAT.as...of.total.income : num 6.18 7.54 -1.38 0 0.35 2.81 0 0.72 8.29 -2.88 ...
## $ Cash.profit.as...of.total.income : num 7.5 10.38 0.06 0 0.75 ...
## $ PAT.as...of.net.worth : num 23.78 38.08 -6.35 0 5.25 ...
## $ Sales : num 24458 1504 476 NA 1575 ...
## $ Income.from.financial.services : num 158 4 1.5 NA 3.9 6.4 NA NA 7.3 NA ...
## $ Other.income : num 297.2 15.9 0.2 NA 0.9 ...
## $ Total.capital : num 423.8 115.5 81.4 0.5 6.2 ...
## $ Reserves.and.funds : num 6822.8 257.8 19.2 2.2 161.8 ...
## $ Deposits..accepted.by.commercial.banks. : logi NA NA NA NA NA NA ...
## $ Borrowings : num 14.9 272.5 35.4 NA 193.1 ...
## $ Current.liabilities...provisions : num 9965.9 210 96.8 NA 112.8 ...
## $ Deferred.tax.liability : num 284.9 85.2 NA NA 4.6 ...
## $ Shareholders.funds : num 7093.2 351.5 100.6 2.7 107.6 ...
## $ Cumulative.retained.profits : num 6263.3 247.4 32.4 2.2 82.7 ...
## $ Capital.employed : num 7108.1 624 136 2.7 300.7 ...
## $ TOL.TNW : num 1.33 1.23 1.44 0 2.83 1.8 0.03 5.17 1.05 3.25 ...
## $ Total.term.liabilities...tangible.net.worth: num 0 0.34 0.29 0 1.59 0.37 0.03 0.94 0.3 0.54 ...
## $ Contingent.liabilities...Net.worth.... : num 14.8 19.2 45.8 0 34.9 ...
## $ Contingent.liabilities : num 1049.7 67.6 46.1 NA 37.6 ...
## $ Net.fixed.assets : num 1900.2 286.4 38.7 2.5 94.8 ...
## $ Investments : num 1069.6 2.2 4.3 NA 7.4 ...
## $ Current.assets : num 13277.5 563.9 167.5 0.2 349.7 ...
## $ Net.working.capital : num 3588.5 203.5 59.6 0.2 215.8 ...
## $ Quick.ratio..times. : num 1.18 0.95 1.11 NA 1.41 0.48 NA 0.54 0.59 0.39 ...
## $ Current.ratio..times. : num 1.37 1.56 1.55 NA 2.54 1.27 NA 1.15 1.58 0.5 ...
## $ Debt.to.equity.ratio..times. : num 0 0.78 0.35 0 1.79 1.09 0.32 2.31 0.94 3.13 ...
## $ Cash.to.current.liabilities..times. : num 0.43 0.06 0.21 NA 0 0.11 NA 0.04 0.19 0 ...
## $ Cash.to.average.cost.of.sales.per.day : num 68.21 5.96 17.07 NA 0 ...
## $ Creditors.turnover : num 3.62 9.8 5.28 0 13 ...
## $ Debtors.turnover : num 3.85 5.7 5.07 0 9.46 ...
## $ Finished.goods.turnover : num 200.55 14.21 9.24 NA 12.68 ...
## $ WIP.turnover : num 21.78 7.49 0.23 NA 7.9 ...
## $ Raw.material.turnover : num 7.71 11.46 NA 0 17.03 ...
## $ Shares.outstanding : num 42381675 11550000 8149090 52404 619635 ...
## $ Equity.face.value : num 10 10 10 10 10 10 10 NA 10 10 ...
## $ EPS : num 35.52 9.97 -0.5 0 7.91 ...
## $ Adjusted.EPS : num 7.1 9.97 -0.5 0 7.91 ...
## $ Total.liabilities : num 17512.3 941 232.8 2.7 478.5 ...
## $ PE.on.BSE : num 27.31 8.17 -5.76 NA NA ...
Summary of Raw data
summary(FRA_raw_read_df)
## Num Default Total.assets Net.worth
## Min. : 1 Min. :0.00000 Min. : 0.1 Min. : 0.0
## 1st Qu.: 886 1st Qu.:0.00000 1st Qu.: 91.3 1st Qu.: 31.3
## Median :1773 Median :0.00000 Median : 309.7 Median : 102.3
## Mean :1772 Mean :0.06608 Mean : 3443.4 Mean : 1295.9
## 3rd Qu.:2658 3rd Qu.:0.00000 3rd Qu.: 1098.7 3rd Qu.: 377.3
## Max. :3545 Max. :1.00000 Max. :1176509.2 Max. :613151.6
##
## Total.income Change.in.stock Total.expenses Profit.after.tax
## Min. : 0.0 Min. :-3029.40 Min. : -0.1 Min. : -3908.30
## 1st Qu.: 106.5 1st Qu.: -1.80 1st Qu.: 95.8 1st Qu.: 0.50
## Median : 444.9 Median : 1.60 Median : 407.7 Median : 8.80
## Mean : 4582.8 Mean : 41.49 Mean : 4262.9 Mean : 277.36
## 3rd Qu.: 1440.9 3rd Qu.: 18.05 3rd Qu.: 1359.8 3rd Qu.: 52.27
## Max. :2442828.2 Max. :14185.50 Max. :2366035.3 Max. :119439.10
## NA's :198 NA's :458 NA's :139 NA's :131
## PBDITA PBT Cash.profit
## Min. : -440.7 Min. : -3894.80 Min. : -2245.70
## 1st Qu.: 6.9 1st Qu.: 0.70 1st Qu.: 2.90
## Median : 35.4 Median : 12.40 Median : 18.85
## Mean : 578.1 Mean : 383.81 Mean : 392.07
## 3rd Qu.: 150.2 3rd Qu.: 71.97 3rd Qu.: 93.20
## Max. :208576.5 Max. :145292.60 Max. :176911.80
## NA's :131 NA's :131 NA's :131
## PBDITA.as...of.total.income PBT.as...of.total.income PAT.as...of.total.income
## Min. :-6400.000 Min. :-21340.00 Min. :-21340.00
## 1st Qu.: 5.000 1st Qu.: 0.55 1st Qu.: 0.35
## Median : 9.660 Median : 3.31 Median : 2.34
## Mean : 4.571 Mean : -17.28 Mean : -19.20
## 3rd Qu.: 16.390 3rd Qu.: 8.80 3rd Qu.: 6.34
## Max. : 100.000 Max. : 100.00 Max. : 150.00
## NA's :68 NA's :68 NA's :68
## Cash.profit.as...of.total.income PAT.as...of.net.worth Sales
## Min. :-15020.000 Min. :-748.72 Min. : 0.1
## 1st Qu.: 2.020 1st Qu.: 0.00 1st Qu.: 112.7
## Median : 5.640 Median : 7.92 Median : 453.1
## Mean : -8.229 Mean : 10.27 Mean : 4549.5
## 3rd Qu.: 10.700 3rd Qu.: 20.19 3rd Qu.: 1433.5
## Max. : 100.000 Max. :2466.67 Max. :2384984.4
## NA's :68 NA's :259
## Income.from.financial.services Other.income Total.capital
## Min. : 0.00 Min. : 0.00 Min. : 0.1
## 1st Qu.: 0.40 1st Qu.: 0.40 1st Qu.: 13.1
## Median : 1.80 Median : 1.40 Median : 42.1
## Mean : 80.84 Mean : 41.36 Mean : 216.6
## 3rd Qu.: 9.68 3rd Qu.: 5.97 3rd Qu.: 100.3
## Max. :51938.20 Max. :42856.70 Max. :78273.2
## NA's :935 NA's :1295 NA's :4
## Reserves.and.funds Deposits..accepted.by.commercial.banks. Borrowings
## Min. : -6525.9 Mode:logical Min. : 0.10
## 1st Qu.: 5.0 NA's:3541 1st Qu.: 23.95
## Median : 54.8 Median : 99.20
## Mean : 1163.8 Mean : 1122.28
## 3rd Qu.: 277.3 3rd Qu.: 352.60
## Max. :625137.8 Max. :278257.30
## NA's :85 NA's :366
## Current.liabilities...provisions Deferred.tax.liability Shareholders.funds
## Min. : 0.1 Min. : 0.1 Min. : 0.0
## 1st Qu.: 17.8 1st Qu.: 3.2 1st Qu.: 32.0
## Median : 69.4 Median : 13.4 Median : 105.6
## Mean : 940.6 Mean : 227.2 Mean : 1322.1
## 3rd Qu.: 261.7 3rd Qu.: 50.0 3rd Qu.: 393.2
## Max. :352240.3 Max. :72796.6 Max. :613151.6
## NA's :96 NA's :1140
## Cumulative.retained.profits Capital.employed TOL.TNW
## Min. : -6534.3 Min. : 0.0 Min. :-350.480
## 1st Qu.: 1.1 1st Qu.: 60.8 1st Qu.: 0.600
## Median : 37.1 Median : 214.7 Median : 1.430
## Mean : 890.5 Mean : 2328.3 Mean : 3.994
## 3rd Qu.: 202.3 3rd Qu.: 767.3 3rd Qu.: 2.830
## Max. :390133.8 Max. :891408.9 Max. : 473.000
## NA's :38
## Total.term.liabilities...tangible.net.worth
## Min. :-325.600
## 1st Qu.: 0.050
## Median : 0.340
## Mean : 1.844
## 3rd Qu.: 1.000
## Max. : 456.000
##
## Contingent.liabilities...Net.worth.... Contingent.liabilities
## Min. : 0.00 Min. : 0.1
## 1st Qu.: 0.00 1st Qu.: 6.3
## Median : 5.33 Median : 38.0
## Mean : 53.94 Mean : 932.9
## 3rd Qu.: 30.76 3rd Qu.: 192.7
## Max. :14704.27 Max. :559506.8
## NA's :1188
## Net.fixed.assets Investments Current.assets Net.working.capital
## Min. : 0.0 Min. : 0.00 Min. : 0.1 Min. :-63839.0
## 1st Qu.: 26.0 1st Qu.: 1.00 1st Qu.: 36.2 1st Qu.: -1.1
## Median : 93.5 Median : 8.35 Median : 145.1 Median : 16.2
## Mean : 1189.7 Mean : 694.73 Mean : 1293.4 Mean : 138.6
## 3rd Qu.: 344.9 3rd Qu.: 64.30 3rd Qu.: 502.2 3rd Qu.: 84.2
## Max. :636604.6 Max. :199978.60 Max. :354815.2 Max. : 85782.8
## NA's :118 NA's :1435 NA's :66 NA's :32
## Quick.ratio..times. Current.ratio..times. Debt.to.equity.ratio..times.
## Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.410 1st Qu.: 0.93 1st Qu.: 0.22
## Median : 0.670 Median : 1.23 Median : 0.79
## Mean : 1.401 Mean : 2.13 Mean : 2.78
## 3rd Qu.: 1.030 3rd Qu.: 1.71 3rd Qu.: 1.75
## Max. :341.000 Max. :505.00 Max. :456.00
## NA's :93 NA's :93
## Cash.to.current.liabilities..times. Cash.to.average.cost.of.sales.per.day
## Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.0200 1st Qu.: 2.79
## Median : 0.0700 Median : 8.03
## Mean : 0.4904 Mean : 158.44
## 3rd Qu.: 0.1900 3rd Qu.: 21.79
## Max. :165.0000 Max. :128040.76
## NA's :93 NA's :85
## Creditors.turnover Debtors.turnover Finished.goods.turnover WIP.turnover
## Min. : 0.000 Min. : 0.00 Min. : -0.09 Min. : -0.18
## 1st Qu.: 3.700 1st Qu.: 3.76 1st Qu.: 8.20 1st Qu.: 5.10
## Median : 6.095 Median : 6.32 Median : 17.27 Median : 9.76
## Mean : 15.446 Mean : 17.04 Mean : 87.08 Mean : 27.93
## 3rd Qu.: 11.490 3rd Qu.: 11.68 3rd Qu.: 40.35 3rd Qu.: 20.24
## Max. :2401.000 Max. :3135.20 Max. :17947.60 Max. :5651.40
## NA's :333 NA's :328 NA's :740 NA's :640
## Raw.material.turnover Shares.outstanding Equity.face.value
## Min. : -2.00 Min. :-2.147e+09 Min. :-999999
## 1st Qu.: 2.99 1st Qu.: 1.316e+06 1st Qu.: 10
## Median : 6.40 Median : 4.672e+06 Median : 10
## Mean : 19.09 Mean : 2.207e+07 Mean : -1334
## 3rd Qu.: 11.85 3rd Qu.: 1.065e+07 3rd Qu.: 10
## Max. :21092.00 Max. : 4.130e+09 Max. : 100000
## NA's :361 NA's :692 NA's :692
## EPS Adjusted.EPS Total.liabilities PE.on.BSE
## Min. :-843181.8 Min. :-843181.8 Min. : 0.1 Min. :-1116.64
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 91.3 1st Qu.: 3.27
## Median : 1.4 Median : 1.2 Median : 309.7 Median : 9.10
## Mean : -220.3 Mean : -221.5 Mean : 3443.4 Mean : 63.91
## 3rd Qu.: 9.6 3rd Qu.: 7.5 3rd Qu.: 1098.7 3rd Qu.: 17.79
## Max. : 34522.5 Max. : 34522.5 Max. :1176509.2 Max. :51002.74
## NA's :2194
Dimension of Raw data
dim(FRA_raw_read_df) ####3541*52
## [1] 3541 52
Data type of Validation
str(test_raw_read_df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 715 obs. of 52 variables:
## $ Num : int 1545 735 2605 1292 134 911 3425 2124 2066 1996 ...
## $ Default...1 : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Total.assets : num 971 675 532 858 823 ...
## $ Net.worth : num 276 212 120 201 349 ...
## $ Total.income : num 2185 819 564 3576 1034 ...
## $ Change.in.stock : num 14.2 10.4 -28.1 -0.6 28.9 -0.5 NA -7.7 27.2 -0.2 ...
## $ Total.expenses : num 2099 810 578 3613 1042 ...
## $ Profit.after.tax : num 100.2 19.7 -42.4 -37.5 21.4 ...
## $ PBDITA : num 285.6 116 -31 68.2 90.1 ...
## $ PBT : num 152.1 33.7 -56 25.7 29.7 ...
## $ Cash.profit : num 182.3 50.5 -35.3 37.3 62.7 ...
## $ PBDITA.as...of.total.income : num 13.07 14.16 -5.5 1.91 8.71 ...
## $ PBT.as...of.total.income : num 6.96 4.11 -9.94 0.72 2.87 ...
## $ PAT.as...of.total.income : num 4.59 2.4 -7.52 -1.05 2.07 ...
## $ Cash.profit.as...of.total.income : num 8.34 6.16 -6.26 1.04 6.06 ...
## $ PAT.as...of.net.worth : num 42.11 10.66 -31.2 0 6.31 ...
## $ Sales : num 2171 817 552 3573 1027 ...
## $ Income.from.financial.services : num 2.3 0.8 9.1 1 0.7 ...
## $ Other.income : num NA 0.2 2.1 1.5 2.3 0.1 NA NA 0.1 0.1 ...
## $ Total.capital : num 48 114 47.1 50.5 33 ...
## $ Reserves.and.funds : num 413.1 97.6 227.4 150.9 316.2 ...
## $ Deposits..accepted.by.commercial.banks. : logi NA NA NA NA NA NA ...
## $ Borrowings : num 177.3 339.8 17.5 524.2 162.3 ...
## $ Current.liabilities...provisions : num 328.5 100.5 240.1 75.2 299.6 ...
## $ Deferred.tax.liability : num 3.7 23.1 NA 56.7 12.2 2.1 1.9 4.4 2.9 NA ...
## $ Shareholders.funds : num 276 212 120 201 349 ...
## $ Cumulative.retained.profits : num 227.8 97.6 69.9 150.9 316.2 ...
## $ Capital.employed : num 453 551 138 726 512 ...
## $ TOL.TNW : num 1.8 2.01 1.73 2.94 1.02 0.86 0.06 1.92 0.37 1.96 ...
## $ Total.term.liabilities...tangible.net.worth: num 0.27 0.72 0.09 0.81 0.1 0.11 0.05 0.78 0 1.81 ...
## $ Contingent.liabilities...Net.worth.... : num 112.94 5.77 102.83 0.65 28.78 ...
## $ Contingent.liabilities : num 311.5 12.2 123.6 1.3 100.5 ...
## $ Net.fixed.assets : num 332 199 270 263 191 ...
## $ Investments : num NA NA 0.7 NA NA NA 17.3 2.6 NA NA ...
## $ Current.assets : num 560 407 148 536 472 ...
## $ Net.working.capital : num 134.2 123.6 -97.1 99.6 75.3 ...
## $ Quick.ratio..times. : num 0.92 0.48 0.32 0.51 0.58 0.97 166 0.52 0.88 0.6 ...
## $ Current.ratio..times. : num 1.31 1.39 0.6 1.23 1.19 1.86 166 1.56 1.19 0.55 ...
## $ Debt.to.equity.ratio..times. : num 0.64 1.61 0.15 2.6 0.46 0.32 0.05 1.24 0 1.81 ...
## $ Cash.to.current.liabilities..times. : num 0.09 0.03 0.04 0.08 0.08 0 165 0.03 0.35 0.23 ...
## $ Cash.to.average.cost.of.sales.per.day : num 7.56 3.88 4.63 3.71 11.15 ...
## $ Creditors.turnover : num 5.94 10.59 2.35 NA 5.48 ...
## $ Debtors.turnover : num 5.74 6.03 9.6 NA 4.78 ...
## $ Finished.goods.turnover : num 25.11 28.96 8.23 NA 6.28 ...
## $ WIP.turnover : num 20 18.6 6.6 NA 6.7 ...
## $ Raw.material.turnover : num 17.58 2.67 3.77 NA 3.7 ...
## $ Shares.outstanding : num 4800000 11400000 471285 5050000 3205946 ...
## $ Equity.face.value : num 10 10 100 10 10 100 10 NA 10 10 ...
## $ EPS : num 18.6 1.65 -90.39 -7.09 5.9 ...
## $ Adjusted.EPS : num 18.6 1.65 -90.39 -7.09 5.9 ...
## $ Total.liabilities : num 971 675 532 858 823 ...
## $ PE.on.BSE : num NA NA -15.5 -0.16 NA NA NA NA NA NA ...
Summary of Validation data
summary(test_raw_read_df)
## Num Default...1 Total.assets Net.worth
## Min. : 1.0 Min. :0.00000 Min. : 0.1 Min. : 0.1
## 1st Qu.: 869.5 1st Qu.:0.00000 1st Qu.: 93.2 1st Qu.: 34.4
## Median :1741.0 Median :0.00000 Median : 347.7 Median : 120.9
## Mean :1756.3 Mean :0.07552 Mean : 4218.6 Mean : 1629.7
## 3rd Qu.:2588.0 3rd Qu.:0.00000 3rd Qu.: 1315.3 3rd Qu.: 451.5
## Max. :3543.0 Max. :1.00000 Max. :354727.3 Max. :171840.0
##
## Total.income Change.in.stock Total.expenses Profit.after.tax
## Min. : 0.0 Min. :-488.10 Min. : 0.0 Min. : -998.00
## 1st Qu.: 110.8 1st Qu.: -1.90 1st Qu.: 104.1 1st Qu.: 0.68
## Median : 536.0 Median : 1.80 Median : 511.1 Median : 10.20
## Mean : 5204.7 Mean : 54.66 Mean : 4817.3 Mean : 382.22
## 3rd Qu.: 1727.1 3rd Qu.: 19.35 3rd Qu.: 1642.3 3rd Qu.: 68.95
## Max. :1028087.4 Max. :7540.00 Max. :1014813.1 Max. :62022.90
## NA's :33 NA's :92 NA's :26 NA's :23
## PBDITA PBT Cash.profit
## Min. : -393.90 Min. : -993.90 Min. : -894.60
## 1st Qu.: 7.15 1st Qu.: 1.00 1st Qu.: 3.27
## Median : 42.20 Median : 14.25 Median : 22.05
## Mean : 743.35 Mean : 540.59 Mean : 488.11
## 3rd Qu.: 192.82 3rd Qu.: 90.50 3rd Qu.: 120.30
## Max. :110557.10 Max. :94565.20 Max. :71581.60
## NA's :23 NA's :23 NA's :23
## PBDITA.as...of.total.income PBT.as...of.total.income PAT.as...of.total.income
## Min. :-6400.000 Min. :-9700.000 Min. :-9700.000
## 1st Qu.: 4.702 1st Qu.: 0.622 1st Qu.: 0.390
## Median : 9.780 Median : 3.450 Median : 2.405
## Mean : -3.681 Mean : -22.725 Mean : -24.147
## 3rd Qu.: 16.753 3rd Qu.: 9.725 3rd Qu.: 6.790
## Max. : 100.000 Max. : 100.000 Max. : 100.000
## NA's :11 NA's :11 NA's :11
## Cash.profit.as...of.total.income PAT.as...of.net.worth Sales
## Min. :-6400.000 Min. :-194.520 Min. : 0.1
## 1st Qu.: 1.930 1st Qu.: 0.000 1st Qu.: 120.8
## Median : 5.835 Median : 8.710 Median : 552.5
## Mean : -12.929 Mean : 9.666 Mean : 5117.5
## 3rd Qu.: 10.982 3rd Qu.: 20.215 3rd Qu.: 1721.3
## Max. : 100.000 Max. : 441.670 Max. :976884.0
## NA's :11 NA's :46
## Income.from.financial.services Other.income Total.capital
## Min. : 0.10 Min. : 0.00 Min. : 0.1
## 1st Qu.: 0.50 1st Qu.: 0.32 1st Qu.: 14.1
## Median : 2.00 Median : 1.65 Median : 45.3
## Mean : 83.86 Mean : 128.16 Mean : 263.9
## 3rd Qu.: 10.10 3rd Qu.: 7.25 3rd Qu.: 121.1
## Max. :8097.20 Max. :42856.70 Max. :41304.0
## NA's :176 NA's :261 NA's :1
## Reserves.and.funds Deposits..accepted.by.commercial.banks.
## Min. : -1125.00 Mode:logical
## 1st Qu.: 7.33 NA's:715
## Median : 57.45
## Mean : 1440.70
## 3rd Qu.: 334.80
## Max. :133684.20
## NA's :13
## Borrowings Current.liabilities...provisions Deferred.tax.liability
## Min. : 0.20 Min. : 0.1 Min. : 0.10
## 1st Qu.: 25.93 1st Qu.: 16.8 1st Qu.: 3.10
## Median : 105.50 Median : 75.2 Median : 14.70
## Mean : 1439.86 Mean : 1058.9 Mean : 270.45
## 3rd Qu.: 391.82 3rd Qu.: 300.4 3rd Qu.: 62.42
## Max. :105175.30 Max. :112712.7 Max. :27077.90
## NA's :65 NA's :14 NA's :229
## Shareholders.funds Cumulative.retained.profits Capital.employed
## Min. : 0.1 Min. : -2582.4 Min. : 0.10
## 1st Qu.: 35.5 1st Qu.: 0.8 1st Qu.: 64.35
## Median : 124.0 Median : 40.6 Median : 246.10
## Mean : 1646.0 Mean : 1168.1 Mean : 2954.96
## 3rd Qu.: 478.4 3rd Qu.: 244.5 3rd Qu.: 913.65
## Max. :171840.0 Max. :128183.1 Max. :235389.50
## NA's :7
## TOL.TNW Total.term.liabilities...tangible.net.worth
## Min. :-350.480 Min. :-325.600
## 1st Qu.: 0.595 1st Qu.: 0.060
## Median : 1.400 Median : 0.350
## Mean : 4.181 Mean : 1.906
## 3rd Qu.: 2.800 3rd Qu.: 1.005
## Max. : 411.270 Max. : 292.020
##
## Contingent.liabilities...Net.worth.... Contingent.liabilities
## Min. : 0.00 Min. : 0.1
## 1st Qu.: 0.00 1st Qu.: 5.1
## Median : 5.52 Median : 37.5
## Mean : 64.47 Mean : 1022.0
## 3rd Qu.: 31.49 3rd Qu.: 217.1
## Max. :6295.24 Max. :72620.8
## NA's :214
## Net.fixed.assets Investments Current.assets Net.working.capital
## Min. : 0.1 Min. : 0.0 Min. : 0.1 Min. :-41908.3
## 1st Qu.: 27.2 1st Qu.: 0.9 1st Qu.: 38.9 1st Qu.: -1.3
## Median : 95.0 Median : 7.8 Median : 165.6 Median : 20.1
## Mean : 1306.2 Mean : 853.2 Mean : 1632.9 Mean : 283.0
## 3rd Qu.: 409.2 3rd Qu.: 61.6 3rd Qu.: 578.0 3rd Qu.: 99.2
## Max. :115737.5 Max. :88047.8 Max. :196614.6 Max. : 85782.8
## NA's :14 NA's :280 NA's :14 NA's :5
## Quick.ratio..times. Current.ratio..times. Debt.to.equity.ratio..times.
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.410 1st Qu.: 0.920 1st Qu.: 0.220
## Median : 0.660 Median : 1.230 Median : 0.800
## Mean : 1.968 Mean : 2.880 Mean : 3.327
## 3rd Qu.: 1.020 3rd Qu.: 1.725 3rd Qu.: 1.700
## Max. :341.000 Max. :505.000 Max. :341.180
## NA's :12 NA's :12
## Cash.to.current.liabilities..times. Cash.to.average.cost.of.sales.per.day
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0300 1st Qu.: 3.248
## Median : 0.0800 Median : 8.130
## Mean : 0.7149 Mean : 79.565
## 3rd Qu.: 0.1900 3rd Qu.: 22.645
## Max. :165.0000 Max. :15999.170
## NA's :12 NA's :15
## Creditors.turnover Debtors.turnover Finished.goods.turnover
## Min. : 0.00 Min. : 0.000 Min. : -0.09
## 1st Qu.: 3.84 1st Qu.: 4.133 1st Qu.: 8.06
## Median : 6.49 Median : 7.050 Median : 17.49
## Mean : 23.48 Mean : 22.264 Mean : 71.31
## 3rd Qu.: 12.90 3rd Qu.: 12.920 3rd Qu.: 38.67
## Max. :1934.00 Max. :2473.040 Max. :5614.80
## NA's :58 NA's :57 NA's :134
## WIP.turnover Raw.material.turnover Shares.outstanding Equity.face.value
## Min. : 0.000 Min. : 0.000 Min. :1.280e+02 Min. : 1.0
## 1st Qu.: 5.135 1st Qu.: 3.190 1st Qu.:1.262e+06 1st Qu.: 10.0
## Median : 10.710 Median : 6.445 Median :4.940e+06 Median : 10.0
## Mean : 32.386 Mean : 11.087 Mean :3.187e+07 Mean : 45.1
## 3rd Qu.: 20.130 3rd Qu.: 11.650 3rd Qu.:1.282e+07 3rd Qu.: 10.0
## Max. :5651.400 Max. :279.960 Max. :4.130e+09 Max. :10000.0
## NA's :124 NA's :67 NA's :118 NA's :118
## EPS Adjusted.EPS Total.liabilities PE.on.BSE
## Min. :-72750.00 Min. :-72750.00 Min. : 0.1 Min. :-263.920
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 93.2 1st Qu.: 1.863
## Median : 1.83 Median : 1.50 Median : 347.7 Median : 7.670
## Mean : -76.87 Mean : -78.74 Mean : 4218.6 Mean : 15.132
## 3rd Qu.: 11.46 3rd Qu.: 8.35 3rd Qu.: 1315.3 3rd Qu.: 14.665
## Max. : 8784.00 Max. : 8784.00 Max. :354727.3 Max. :1478.420
## NA's :433
Dimension of Raw data
dim(test_raw_read_df) ####715*52
## [1] 715 52
Here all 52 variables are appearing as numeric including Default(1,0 values) and Num(company) which seems to be incorrect. Num and Default should be factors; Thus casted to factors;
#colnames(dplyr::select_if(FRA_raw_read_df, is.numeric))
FRA_raw_df<-FRA_raw_read_df
colnames(FRA_raw_df) <- make.names(colnames(FRA_raw_df))####Correcting Column names
####Target Column 'Default' and Num casted to factor
FRA_raw_df$Num<-as.factor(FRA_raw_read_df$Num)
FRA_raw_df$Default<-as.factor(FRA_raw_read_df$Default)
In Raw dataset, Default column - yes means Company can Default and no means company cannot Default
levels(FRA_raw_df$Default) <- c("no", "yes") #### 0 as no and 1 as yes
table(FRA_raw_df$Default)
##
## no yes
## 3307 234
Post casting, datatype issue is fixed; there are now 2 factors along with 50 numeric columns
colnames(dplyr::select_if(FRA_raw_df, is.numeric)) #### 50 as numeric
## [1] "Total.assets"
## [2] "Net.worth"
## [3] "Total.income"
## [4] "Change.in.stock"
## [5] "Total.expenses"
## [6] "Profit.after.tax"
## [7] "PBDITA"
## [8] "PBT"
## [9] "Cash.profit"
## [10] "PBDITA.as...of.total.income"
## [11] "PBT.as...of.total.income"
## [12] "PAT.as...of.total.income"
## [13] "Cash.profit.as...of.total.income"
## [14] "PAT.as...of.net.worth"
## [15] "Sales"
## [16] "Income.from.financial.services"
## [17] "Other.income"
## [18] "Total.capital"
## [19] "Reserves.and.funds"
## [20] "Borrowings"
## [21] "Current.liabilities...provisions"
## [22] "Deferred.tax.liability"
## [23] "Shareholders.funds"
## [24] "Cumulative.retained.profits"
## [25] "Capital.employed"
## [26] "TOL.TNW"
## [27] "Total.term.liabilities...tangible.net.worth"
## [28] "Contingent.liabilities...Net.worth...."
## [29] "Contingent.liabilities"
## [30] "Net.fixed.assets"
## [31] "Investments"
## [32] "Current.assets"
## [33] "Net.working.capital"
## [34] "Quick.ratio..times."
## [35] "Current.ratio..times."
## [36] "Debt.to.equity.ratio..times."
## [37] "Cash.to.current.liabilities..times."
## [38] "Cash.to.average.cost.of.sales.per.day"
## [39] "Creditors.turnover"
## [40] "Debtors.turnover"
## [41] "Finished.goods.turnover"
## [42] "WIP.turnover"
## [43] "Raw.material.turnover"
## [44] "Shares.outstanding"
## [45] "Equity.face.value"
## [46] "EPS"
## [47] "Adjusted.EPS"
## [48] "Total.liabilities"
## [49] "PE.on.BSE"
colnames(dplyr::select_if(FRA_raw_df, is.factor))#### 2 as factors
## [1] "Num" "Default"
Dropping the columns: “Deposits..accepted.by.commercial.banks.” as that column has all nulls from Raw and Validation dataset
FRA_new_df<-dplyr::select(FRA_raw_df,-Deposits..accepted.by.commercial.banks.)
##Same treatement to validation/test set
test_raw_df<-test_raw_read_df
str(test_raw_df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 715 obs. of 52 variables:
## $ Num : int 1545 735 2605 1292 134 911 3425 2124 2066 1996 ...
## $ Default...1 : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Total.assets : num 971 675 532 858 823 ...
## $ Net.worth : num 276 212 120 201 349 ...
## $ Total.income : num 2185 819 564 3576 1034 ...
## $ Change.in.stock : num 14.2 10.4 -28.1 -0.6 28.9 -0.5 NA -7.7 27.2 -0.2 ...
## $ Total.expenses : num 2099 810 578 3613 1042 ...
## $ Profit.after.tax : num 100.2 19.7 -42.4 -37.5 21.4 ...
## $ PBDITA : num 285.6 116 -31 68.2 90.1 ...
## $ PBT : num 152.1 33.7 -56 25.7 29.7 ...
## $ Cash.profit : num 182.3 50.5 -35.3 37.3 62.7 ...
## $ PBDITA.as...of.total.income : num 13.07 14.16 -5.5 1.91 8.71 ...
## $ PBT.as...of.total.income : num 6.96 4.11 -9.94 0.72 2.87 ...
## $ PAT.as...of.total.income : num 4.59 2.4 -7.52 -1.05 2.07 ...
## $ Cash.profit.as...of.total.income : num 8.34 6.16 -6.26 1.04 6.06 ...
## $ PAT.as...of.net.worth : num 42.11 10.66 -31.2 0 6.31 ...
## $ Sales : num 2171 817 552 3573 1027 ...
## $ Income.from.financial.services : num 2.3 0.8 9.1 1 0.7 ...
## $ Other.income : num NA 0.2 2.1 1.5 2.3 0.1 NA NA 0.1 0.1 ...
## $ Total.capital : num 48 114 47.1 50.5 33 ...
## $ Reserves.and.funds : num 413.1 97.6 227.4 150.9 316.2 ...
## $ Deposits..accepted.by.commercial.banks. : logi NA NA NA NA NA NA ...
## $ Borrowings : num 177.3 339.8 17.5 524.2 162.3 ...
## $ Current.liabilities...provisions : num 328.5 100.5 240.1 75.2 299.6 ...
## $ Deferred.tax.liability : num 3.7 23.1 NA 56.7 12.2 2.1 1.9 4.4 2.9 NA ...
## $ Shareholders.funds : num 276 212 120 201 349 ...
## $ Cumulative.retained.profits : num 227.8 97.6 69.9 150.9 316.2 ...
## $ Capital.employed : num 453 551 138 726 512 ...
## $ TOL.TNW : num 1.8 2.01 1.73 2.94 1.02 0.86 0.06 1.92 0.37 1.96 ...
## $ Total.term.liabilities...tangible.net.worth: num 0.27 0.72 0.09 0.81 0.1 0.11 0.05 0.78 0 1.81 ...
## $ Contingent.liabilities...Net.worth.... : num 112.94 5.77 102.83 0.65 28.78 ...
## $ Contingent.liabilities : num 311.5 12.2 123.6 1.3 100.5 ...
## $ Net.fixed.assets : num 332 199 270 263 191 ...
## $ Investments : num NA NA 0.7 NA NA NA 17.3 2.6 NA NA ...
## $ Current.assets : num 560 407 148 536 472 ...
## $ Net.working.capital : num 134.2 123.6 -97.1 99.6 75.3 ...
## $ Quick.ratio..times. : num 0.92 0.48 0.32 0.51 0.58 0.97 166 0.52 0.88 0.6 ...
## $ Current.ratio..times. : num 1.31 1.39 0.6 1.23 1.19 1.86 166 1.56 1.19 0.55 ...
## $ Debt.to.equity.ratio..times. : num 0.64 1.61 0.15 2.6 0.46 0.32 0.05 1.24 0 1.81 ...
## $ Cash.to.current.liabilities..times. : num 0.09 0.03 0.04 0.08 0.08 0 165 0.03 0.35 0.23 ...
## $ Cash.to.average.cost.of.sales.per.day : num 7.56 3.88 4.63 3.71 11.15 ...
## $ Creditors.turnover : num 5.94 10.59 2.35 NA 5.48 ...
## $ Debtors.turnover : num 5.74 6.03 9.6 NA 4.78 ...
## $ Finished.goods.turnover : num 25.11 28.96 8.23 NA 6.28 ...
## $ WIP.turnover : num 20 18.6 6.6 NA 6.7 ...
## $ Raw.material.turnover : num 17.58 2.67 3.77 NA 3.7 ...
## $ Shares.outstanding : num 4800000 11400000 471285 5050000 3205946 ...
## $ Equity.face.value : num 10 10 100 10 10 100 10 NA 10 10 ...
## $ EPS : num 18.6 1.65 -90.39 -7.09 5.9 ...
## $ Adjusted.EPS : num 18.6 1.65 -90.39 -7.09 5.9 ...
## $ Total.liabilities : num 971 675 532 858 823 ...
## $ PE.on.BSE : num NA NA -15.5 -0.16 NA NA NA NA NA NA ...
colnames(test_raw_df) <- make.names(colnames(test_raw_df))
test_raw_df$Num<-as.factor(test_raw_df$Num)
##Rename column from Default...1 to Default
test_raw_df<-dplyr::rename(test_raw_df,Default = Default...1)
colnames(test_raw_df)
## [1] "Num"
## [2] "Default"
## [3] "Total.assets"
## [4] "Net.worth"
## [5] "Total.income"
## [6] "Change.in.stock"
## [7] "Total.expenses"
## [8] "Profit.after.tax"
## [9] "PBDITA"
## [10] "PBT"
## [11] "Cash.profit"
## [12] "PBDITA.as...of.total.income"
## [13] "PBT.as...of.total.income"
## [14] "PAT.as...of.total.income"
## [15] "Cash.profit.as...of.total.income"
## [16] "PAT.as...of.net.worth"
## [17] "Sales"
## [18] "Income.from.financial.services"
## [19] "Other.income"
## [20] "Total.capital"
## [21] "Reserves.and.funds"
## [22] "Deposits..accepted.by.commercial.banks."
## [23] "Borrowings"
## [24] "Current.liabilities...provisions"
## [25] "Deferred.tax.liability"
## [26] "Shareholders.funds"
## [27] "Cumulative.retained.profits"
## [28] "Capital.employed"
## [29] "TOL.TNW"
## [30] "Total.term.liabilities...tangible.net.worth"
## [31] "Contingent.liabilities...Net.worth...."
## [32] "Contingent.liabilities"
## [33] "Net.fixed.assets"
## [34] "Investments"
## [35] "Current.assets"
## [36] "Net.working.capital"
## [37] "Quick.ratio..times."
## [38] "Current.ratio..times."
## [39] "Debt.to.equity.ratio..times."
## [40] "Cash.to.current.liabilities..times."
## [41] "Cash.to.average.cost.of.sales.per.day"
## [42] "Creditors.turnover"
## [43] "Debtors.turnover"
## [44] "Finished.goods.turnover"
## [45] "WIP.turnover"
## [46] "Raw.material.turnover"
## [47] "Shares.outstanding"
## [48] "Equity.face.value"
## [49] "EPS"
## [50] "Adjusted.EPS"
## [51] "Total.liabilities"
## [52] "PE.on.BSE"
test_raw_df$Default<-as.factor(test_raw_df$Default)
levels(test_raw_df$Default) <- c("no", "yes") ## 0 as no and 1 as yes
table(test_raw_df$Default)
##
## no yes
## 661 54
colnames(dplyr::select_if(test_raw_df, is.numeric))
## [1] "Total.assets"
## [2] "Net.worth"
## [3] "Total.income"
## [4] "Change.in.stock"
## [5] "Total.expenses"
## [6] "Profit.after.tax"
## [7] "PBDITA"
## [8] "PBT"
## [9] "Cash.profit"
## [10] "PBDITA.as...of.total.income"
## [11] "PBT.as...of.total.income"
## [12] "PAT.as...of.total.income"
## [13] "Cash.profit.as...of.total.income"
## [14] "PAT.as...of.net.worth"
## [15] "Sales"
## [16] "Income.from.financial.services"
## [17] "Other.income"
## [18] "Total.capital"
## [19] "Reserves.and.funds"
## [20] "Borrowings"
## [21] "Current.liabilities...provisions"
## [22] "Deferred.tax.liability"
## [23] "Shareholders.funds"
## [24] "Cumulative.retained.profits"
## [25] "Capital.employed"
## [26] "TOL.TNW"
## [27] "Total.term.liabilities...tangible.net.worth"
## [28] "Contingent.liabilities...Net.worth...."
## [29] "Contingent.liabilities"
## [30] "Net.fixed.assets"
## [31] "Investments"
## [32] "Current.assets"
## [33] "Net.working.capital"
## [34] "Quick.ratio..times."
## [35] "Current.ratio..times."
## [36] "Debt.to.equity.ratio..times."
## [37] "Cash.to.current.liabilities..times."
## [38] "Cash.to.average.cost.of.sales.per.day"
## [39] "Creditors.turnover"
## [40] "Debtors.turnover"
## [41] "Finished.goods.turnover"
## [42] "WIP.turnover"
## [43] "Raw.material.turnover"
## [44] "Shares.outstanding"
## [45] "Equity.face.value"
## [46] "EPS"
## [47] "Adjusted.EPS"
## [48] "Total.liabilities"
## [49] "PE.on.BSE"
colnames(dplyr::select_if(test_raw_df, is.factor))
## [1] "Num" "Default"
colnames(test_raw_df)
## [1] "Num"
## [2] "Default"
## [3] "Total.assets"
## [4] "Net.worth"
## [5] "Total.income"
## [6] "Change.in.stock"
## [7] "Total.expenses"
## [8] "Profit.after.tax"
## [9] "PBDITA"
## [10] "PBT"
## [11] "Cash.profit"
## [12] "PBDITA.as...of.total.income"
## [13] "PBT.as...of.total.income"
## [14] "PAT.as...of.total.income"
## [15] "Cash.profit.as...of.total.income"
## [16] "PAT.as...of.net.worth"
## [17] "Sales"
## [18] "Income.from.financial.services"
## [19] "Other.income"
## [20] "Total.capital"
## [21] "Reserves.and.funds"
## [22] "Deposits..accepted.by.commercial.banks."
## [23] "Borrowings"
## [24] "Current.liabilities...provisions"
## [25] "Deferred.tax.liability"
## [26] "Shareholders.funds"
## [27] "Cumulative.retained.profits"
## [28] "Capital.employed"
## [29] "TOL.TNW"
## [30] "Total.term.liabilities...tangible.net.worth"
## [31] "Contingent.liabilities...Net.worth...."
## [32] "Contingent.liabilities"
## [33] "Net.fixed.assets"
## [34] "Investments"
## [35] "Current.assets"
## [36] "Net.working.capital"
## [37] "Quick.ratio..times."
## [38] "Current.ratio..times."
## [39] "Debt.to.equity.ratio..times."
## [40] "Cash.to.current.liabilities..times."
## [41] "Cash.to.average.cost.of.sales.per.day"
## [42] "Creditors.turnover"
## [43] "Debtors.turnover"
## [44] "Finished.goods.turnover"
## [45] "WIP.turnover"
## [46] "Raw.material.turnover"
## [47] "Shares.outstanding"
## [48] "Equity.face.value"
## [49] "EPS"
## [50] "Adjusted.EPS"
## [51] "Total.liabilities"
## [52] "PE.on.BSE"
##Dropping col Deposits for test datasets
test_new_df<-dplyr::select(test_raw_df,-Deposits..accepted.by.commercial.banks.)
Post dropping column, there are now 51 columns in total(49 numeric + 2 factors) in Raw dataset
dim(FRA_new_df)
## [1] 3541 51
Post dropping column, there are now 51 columns in total(49 numeric + 2 factors) in Validation dataset
dim(test_new_df)
## [1] 715 51
Outlier removal for Raw and Validation dataset; The idea is mark outlier below 5% and above 95% percentile as outlier and mark it as NA. The original dataset already has missing data. The idea is if any column or rows has more than 50% missing data, will be dropped as it does not make sense to impute them.
#dim(FRA_num_outasNA_dframe) ##49 Numeric cols
#summary(FRA_num_outasNA_dframe)
#dim(test_num_outasNA_dframe) ##49 Numeric cols
#summary(test_num_outasNA_dframe)
Know percenatge of missing data with NA in data column wise in Raw dataset
p<-function(x) {sum(is.na(x))/length(x)*100}
apply(FRA_num_outasNA_dframe,2,p) ## for raw
## Total.assets
## 9.912454
## Net.worth
## 9.968935
## Total.income
## 15.052245
## Change.in.stock
## 21.688788
## Total.expenses
## 13.583733
## Profit.after.tax
## 13.357809
## PBDITA
## 13.329568
## PBT
## 13.357809
## Cash.profit
## 13.329568
## PBDITA.as...of.total.income
## 11.719853
## PBT.as...of.total.income
## 11.719853
## PAT.as...of.total.income
## 11.719853
## Cash.profit.as...of.total.income
## 11.748094
## PAT.as...of.net.worth
## 9.997176
## Sales
## 16.633719
## Income.from.financial.services
## 30.132731
## Other.income
## 39.819260
## Total.capital
## 9.997176
## Reserves.and.funds
## 12.171703
## Borrowings
## 19.175374
## Current.liabilities...provisions
## 12.397628
## Deferred.tax.liability
## 38.435470
## Shareholders.funds
## 9.997176
## Cumulative.retained.profits
## 11.013838
## Capital.employed
## 9.940695
## TOL.TNW
## 9.630048
## Total.term.liabilities...tangible.net.worth
## 5.393957
## Contingent.liabilities...Net.worth....
## 4.998588
## Contingent.liabilities
## 39.762779
## Net.fixed.assets
## 13.018921
## Investments
## 43.970630
## Current.assets
## 11.635131
## Net.working.capital
## 10.844394
## Quick.ratio..times.
## 12.115222
## Current.ratio..times.
## 12.369387
## Debt.to.equity.ratio..times.
## 4.998588
## Cash.to.current.liabilities..times.
## 7.370799
## Cash.to.average.cost.of.sales.per.day
## 12.143462
## Creditors.turnover
## 13.950861
## Debtors.turnover
## 13.809658
## Finished.goods.turnover
## 28.777182
## WIP.turnover
## 26.263767
## Raw.material.turnover
## 14.769839
## Shares.outstanding
## 27.619317
## Equity.face.value
## 24.597571
## EPS
## 9.968935
## Adjusted.EPS
## 9.997176
## Total.liabilities
## 9.912454
## PE.on.BSE
## 65.800621
Know percenatge of missing data with NA in data column wise in Validation dataset
apply(test_num_outasNA_dframe,2,p) ## for test
## Total.assets
## 10.069930
## Net.worth
## 9.930070
## Total.income
## 14.405594
## Change.in.stock
## 21.818182
## Total.expenses
## 13.426573
## Profit.after.tax
## 13.006993
## PBDITA
## 13.006993
## PBT
## 13.006993
## Cash.profit
## 13.006993
## PBDITA.as...of.total.income
## 11.608392
## PBT.as...of.total.income
## 11.608392
## PAT.as...of.total.income
## 11.608392
## Cash.profit.as...of.total.income
## 11.608392
## PAT.as...of.net.worth
## 10.069930
## Sales
## 15.944056
## Income.from.financial.services
## 28.391608
## Other.income
## 39.860140
## Total.capital
## 10.069930
## Reserves.and.funds
## 11.888112
## Borrowings
## 18.321678
## Current.liabilities...provisions
## 11.608392
## Deferred.tax.liability
## 38.741259
## Shareholders.funds
## 9.930070
## Cumulative.retained.profits
## 11.048951
## Capital.employed
## 10.069930
## TOL.TNW
## 9.930070
## Total.term.liabilities...tangible.net.worth
## 5.734266
## Contingent.liabilities...Net.worth....
## 5.034965
## Contingent.liabilities
## 36.503497
## Net.fixed.assets
## 11.608392
## Investments
## 42.797203
## Current.assets
## 11.748252
## Net.working.capital
## 10.769231
## Quick.ratio..times.
## 11.328671
## Current.ratio..times.
## 11.748252
## Debt.to.equity.ratio..times.
## 5.034965
## Cash.to.current.liabilities..times.
## 6.713287
## Cash.to.average.cost.of.sales.per.day
## 11.748252
## Creditors.turnover
## 17.342657
## Debtors.turnover
## 12.587413
## Finished.goods.turnover
## 26.853147
## WIP.turnover
## 25.734266
## Raw.material.turnover
## 13.986014
## Shares.outstanding
## 24.895105
## Equity.face.value
## 21.678322
## EPS
## 10.069930
## Adjusted.EPS
## 10.069930
## Total.liabilities
## 10.069930
## PE.on.BSE
## 64.755245
In raw dataset, PE.on.BSE 65% missing data; it has more than 50% missing value, so dropping it and new dimension is:
FRA_num_outasNA_df<-dplyr::select(FRA_num_outasNA_dframe,-PE.on.BSE)
dim(FRA_num_outasNA_df) ##3541*48
## [1] 3541 48
In Validation dataset, PE.on.BSE 64% missing data, it has more than 50% missing value, so dropping it and new deimsion is
test_num_outasNA_df<-dplyr::select(test_num_outasNA_dframe,-PE.on.BSE)
dim(test_num_outasNA_df) ##715*48
## [1] 715 48
Even after application of mice with restrictive maxit has left 351 as nulls in Total.liabilities; It is then replaced with median of that column
md.pattern(newFRA_num_imputed_df,rotate.names = T)

## Total.assets Net.worth Total.income Change.in.stock Total.expenses
## 3190 1 1 1 1 1
## 351 1 1 1 1 1
## 0 0 0 0 0
## Profit.after.tax PBDITA PBT Cash.profit PBDITA.as...of.total.income
## 3190 1 1 1 1 1
## 351 1 1 1 1 1
## 0 0 0 0 0
## PBT.as...of.total.income PAT.as...of.total.income
## 3190 1 1
## 351 1 1
## 0 0
## Cash.profit.as...of.total.income PAT.as...of.net.worth Sales
## 3190 1 1 1
## 351 1 1 1
## 0 0 0
## Income.from.financial.services Other.income Total.capital
## 3190 1 1 1
## 351 1 1 1
## 0 0 0
## Reserves.and.funds Borrowings Current.liabilities...provisions
## 3190 1 1 1
## 351 1 1 1
## 0 0 0
## Deferred.tax.liability Shareholders.funds Cumulative.retained.profits
## 3190 1 1 1
## 351 1 1 1
## 0 0 0
## Capital.employed TOL.TNW Total.term.liabilities...tangible.net.worth
## 3190 1 1 1
## 351 1 1 1
## 0 0 0
## Contingent.liabilities...Net.worth.... Contingent.liabilities
## 3190 1 1
## 351 1 1
## 0 0
## Net.fixed.assets Investments Current.assets Net.working.capital
## 3190 1 1 1 1
## 351 1 1 1 1
## 0 0 0 0
## Quick.ratio..times. Current.ratio..times. Debt.to.equity.ratio..times.
## 3190 1 1 1
## 351 1 1 1
## 0 0 0
## Cash.to.current.liabilities..times. Cash.to.average.cost.of.sales.per.day
## 3190 1 1
## 351 1 1
## 0 0
## Creditors.turnover Debtors.turnover Finished.goods.turnover WIP.turnover
## 3190 1 1 1 1
## 351 1 1 1 1
## 0 0 0 0
## Raw.material.turnover Shares.outstanding Equity.face.value EPS
## 3190 1 1 1 1
## 351 1 1 1 1
## 0 0 0 0
## Adjusted.EPS Total.liabilities
## 3190 1 1 0
## 351 1 0 1
## 0 351 351
newFRA_num_imputed_df$Total.liabilities[is.na(newFRA_num_imputed_df$Total.liabilities)]=median(newFRA_num_imputed_df$Total.liabilities,na.rm=TRUE)
sum(is.na(newFRA_num_imputed_df)) ## 0
## [1] 0
There are now no Nulls in Raw dataset.
## creating a dataframe with 48 numeric and 2 categorical df for raw
FRA_complete_df<-cbind(FRA_fac_df,newFRA_num_imputed_df)
# Its a healthy practice to write the data to disk post imputation as its a costly process.
write.csv(FRA_complete_df,"FRA_complete.csv",row.names=FALSE)
dim(FRA_complete_df)##48+2 cols
## [1] 3541 50
Similar treatment will be done in Validation dataset for removing Nulls
##Same missing data treatement (Actual NA + Outlier NA) for Validation/test dataset
#md.pattern(test_num_outasNA_df,rotate.names = T)
imputed_test<-mice(test_num_outasNA_df, m=1,maxit=3, method='rf',printFlag = TRUE, seed=1234 )
newtest_num_imputed_df=complete(imputed_test,1)
sum(is.na(newtest_num_imputed_df)) ##72 still NA in test post mice
#md.pattern(newtest_num_imputed_df,rotate.names = T)
##Total.liabilities has 72 missing value; using median to replace
summary(newtest_num_imputed_df$Total.liabilities)
newtest_num_imputed_df$Total.liabilities[is.na(newtest_num_imputed_df$Total.liabilities)]=median(newtest_num_imputed_df$Total.liabilities,na.rm=TRUE)
sum(is.na(newtest_num_imputed_df)) ## 0
## [1] 0
## creating a dataframe with 48 numeric and 2 categorical df
test_complete_df<-cbind(test_fac_df,newtest_num_imputed_df) ##df+df using cbind
dim(test_complete_df)
## [1] 715 50
write.csv(test_complete_df,"test_complete.csv",row.names=FALSE)
dim(test_complete_df)##48+2 cols
## [1] 715 50
########test_complete_df has test_fac_df and newtest_num_imputed_df 715*50
####Correlation only for Raw data for 48 numeric columns
##checking multicollinearity using polycor
corHet <- hetcor(as.data.frame(newFRA_num_imputed_df %>% mutate_if(is.character,as.factor)))
hetCorrPlot <- function(corHet){
melted_cormat <- reshape::melt(round(corHet,2), na.rm = TRUE)
colnames(melted_cormat) <- c('Var1', 'Var2', 'value')
melted_cormat <- melted_cormat %>% filter(!is.na(value))
## Plot the corelation matrix
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "black")+
scale_fill_gradient2(low = "red", high = "green", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Heterogeneous Correlation Matrix") +
theme_minimal()+ ## minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1))+
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4)+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank()
)
print(ggheatmap)
}
#hetCorrPlot(corHet$correlations)
Checking VIF to check the magnitude for multicollinearity in Train/Raw data
dim(newFRA_num_imputed_df)
## [1] 3541 48
vif(lm(EPS~.,data=newFRA_num_imputed_df))
## Total.assets
## 16.104301
## Net.worth
## 25.584765
## Total.income
## 13.256508
## Change.in.stock
## 1.353992
## Total.expenses
## 24.442873
## Profit.after.tax
## 15.373892
## PBDITA
## 14.888468
## PBT
## 17.209319
## Cash.profit
## 14.331497
## PBDITA.as...of.total.income
## 4.156362
## PBT.as...of.total.income
## 10.389566
## PAT.as...of.total.income
## 8.915065
## Cash.profit.as...of.total.income
## 5.351971
## PAT.as...of.net.worth
## 1.897147
## Sales
## 27.106274
## Income.from.financial.services
## 1.774513
## Other.income
## 1.394993
## Total.capital
## 2.753443
## Reserves.and.funds
## 9.382020
## Borrowings
## 4.527701
## Current.liabilities...provisions
## 5.447240
## Deferred.tax.liability
## 3.303739
## Shareholders.funds
## 25.044217
## Cumulative.retained.profits
## 6.231180
## Capital.employed
## 14.393205
## TOL.TNW
## 3.216945
## Total.term.liabilities...tangible.net.worth
## 3.372679
## Contingent.liabilities...Net.worth....
## 1.181573
## Contingent.liabilities
## 1.457910
## Net.fixed.assets
## 5.820851
## Investments
## 1.612928
## Current.assets
## 8.088887
## Net.working.capital
## 2.231537
## Quick.ratio..times.
## 2.790158
## Current.ratio..times.
## 2.442653
## Debt.to.equity.ratio..times.
## 5.386956
## Cash.to.current.liabilities..times.
## 2.014403
## Cash.to.average.cost.of.sales.per.day
## 1.640008
## Creditors.turnover
## 1.456456
## Debtors.turnover
## 1.465840
## Finished.goods.turnover
## 1.557690
## WIP.turnover
## 1.674422
## Raw.material.turnover
## 1.274257
## Shares.outstanding
## 2.737578
## Equity.face.value
## 1.113920
## Adjusted.EPS
## 1.356729
## Total.liabilities
## 3.819062
##Dropped PBT.as...of.total.income,Total.income,PBT,PBDITA,Capital.employed,Net.worth,Total.expenses,Total.assets due to huge Vif values
new_num_imputed_df<-dplyr::select(newFRA_num_imputed_df,-c(PBT.as...of.total.income,Total.income,PBT,PBDITA,Capital.employed,Net.worth,Total.expenses,Total.assets))
vif(lm(EPS~.,data=new_num_imputed_df))
## Change.in.stock
## 1.349473
## Profit.after.tax
## 7.112117
## Cash.profit
## 9.090822
## PBDITA.as...of.total.income
## 4.022844
## PAT.as...of.total.income
## 3.021874
## Cash.profit.as...of.total.income
## 5.207516
## PAT.as...of.net.worth
## 1.855383
## Sales
## 6.251357
## Income.from.financial.services
## 1.741950
## Other.income
## 1.365593
## Total.capital
## 2.737904
## Reserves.and.funds
## 8.745352
## Borrowings
## 3.489782
## Current.liabilities...provisions
## 4.997676
## Deferred.tax.liability
## 3.250509
## Shareholders.funds
## 8.301318
## Cumulative.retained.profits
## 6.100243
## TOL.TNW
## 3.196418
## Total.term.liabilities...tangible.net.worth
## 3.355634
## Contingent.liabilities...Net.worth....
## 1.176171
## Contingent.liabilities
## 1.435148
## Net.fixed.assets
## 5.424518
## Investments
## 1.575276
## Current.assets
## 7.461207
## Net.working.capital
## 2.192806
## Quick.ratio..times.
## 2.787518
## Current.ratio..times.
## 2.438603
## Debt.to.equity.ratio..times.
## 5.345303
## Cash.to.current.liabilities..times.
## 2.008149
## Cash.to.average.cost.of.sales.per.day
## 1.629659
## Creditors.turnover
## 1.441942
## Debtors.turnover
## 1.459670
## Finished.goods.turnover
## 1.555277
## WIP.turnover
## 1.671010
## Raw.material.turnover
## 1.266103
## Shares.outstanding
## 2.704633
## Equity.face.value
## 1.111895
## Adjusted.EPS
## 1.349915
## Total.liabilities
## 2.816435
VIF is below 10 in Raw; the issue of multicollinearity is resolved. The final dimension of the Raw dataset is:
dim(new_num_imputed_df)
## [1] 3541 40
Same process followed for Validation dataset and 8 columns are dropped due to huge VIF
FRA_complete_df<-cbind(FRA_fac_df,new_num_imputed_df)
#dim(FRA_complete_df)
##Dropping columns from test data too:
test_pvif_num_imputed_df<-dplyr::select(newtest_num_imputed_df,-c(PBT.as...of.total.income,Total.income,PBT,PBDITA,Capital.employed,Net.worth,Total.expenses,Total.assets))
test_complete_df<-cbind(test_fac_df,test_pvif_num_imputed_df) ##df+df using cbind
The final dimension of the Validation dataset is:
dim(test_complete_df)
## [1] 715 42
Dropped Predictor list along with reason of dropping it (only 10 numeric predictors + 1 Factor)
Total.income,
PBT,
PBDITA,
Capital.employed,
Net.worth,
Total.expenses,
Total.assets,
PE.on.BSE
All above due to greater than 10 vif
Deposits..accepted.by.commercial.banks.-all nulls
Total.liabilities - 65% missing
Num - Factor Company name
Dropping Num factor column as not a valid predictor
FRA_complete_df<-as_tibble(FRA_complete_df)
## Change target,set to correct level & Num column to factor
imputed_nonum_df=dplyr::select(FRA_complete_df,-Num)
colnames(imputed_nonum_df) <- make.names(colnames(imputed_nonum_df))
table(imputed_nonum_df$Default) ##7% of yes no need of smote
##
## no yes
## 3307 234
Dimension of Train/Raw dataset
dim(imputed_nonum_df)##3541*41
## [1] 3541 41
Model Building Starts here for Raw data
train <- imputed_nonum_df
dim(train) ##2479*41
anyNA(train)
levels(imputed_nonum_df$Default)
table(train$Default)
##Set up Seed
set.seed(1234)
objControl <- trainControl(method = "none",
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE)
## to increase iterations
control=glm.control(maxit=50)
model <- caret::train(Default ~ ., data = train,
method = 'glmStepAIC',
trControl = objControl,
metric = "ROC",
direction = 'both'
)
model
## Generalized Linear Model with Stepwise Feature Selection
##
## 3541 samples
## 40 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: None
summary(model$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5222 -0.3254 -0.1588 -0.0301 4.4748
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.5160220 0.2475164 -6.125
## PAT.as...of.total.income -0.0780114 0.0176135 -4.429
## Cash.profit.as...of.total.income -0.0418059 0.0177989 -2.349
## PAT.as...of.net.worth -0.0127376 0.0085778 -1.485
## Sales -0.0003605 0.0001514 -2.382
## Other.income -0.0276980 0.0174306 -1.589
## Borrowings 0.0011461 0.0003831 2.992
## Cumulative.retained.profits -0.0030246 0.0010567 -2.862
## TOL.TNW 0.1869976 0.0441101 4.239
## Total.term.liabilities...tangible.net.worth -0.2828562 0.1174974 -2.407
## Contingent.liabilities...Net.worth.... 0.0056798 0.0025075 2.265
## Current.assets 0.0009641 0.0004444 2.170
## Net.working.capital -0.0026375 0.0013595 -1.940
## Current.ratio..times. -0.4713044 0.1427510 -3.302
## Debt.to.equity.ratio..times. 0.1254032 0.0832678 1.506
## Cash.to.average.cost.of.sales.per.day 0.0030318 0.0019823 1.529
## Creditors.turnover -0.0327195 0.0156490 -2.091
## Raw.material.turnover -0.0470851 0.0146487 -3.214
## Equity.face.value 0.0055728 0.0034030 1.638
## Adjusted.EPS -0.2077995 0.0511028 -4.066
## Total.liabilities -0.0004812 0.0001798 -2.677
## Pr(>|z|)
## (Intercept) 9.07e-10 ***
## PAT.as...of.total.income 9.46e-06 ***
## Cash.profit.as...of.total.income 0.018834 *
## PAT.as...of.net.worth 0.137555
## Sales 0.017233 *
## Other.income 0.112050
## Borrowings 0.002775 **
## Cumulative.retained.profits 0.004206 **
## TOL.TNW 2.24e-05 ***
## Total.term.liabilities...tangible.net.worth 0.016069 *
## Contingent.liabilities...Net.worth.... 0.023502 *
## Current.assets 0.030042 *
## Net.working.capital 0.052374 .
## Current.ratio..times. 0.000961 ***
## Debt.to.equity.ratio..times. 0.132061
## Cash.to.average.cost.of.sales.per.day 0.126164
## Creditors.turnover 0.036542 *
## Raw.material.turnover 0.001308 **
## Equity.face.value 0.101500
## Adjusted.EPS 4.78e-05 ***
## Total.liabilities 0.007439 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1723.7 on 3540 degrees of freedom
## Residual deviance: 1182.6 on 3520 degrees of freedom
## AIC: 1224.6
##
## Number of Fisher Scoring iterations: 9
##How good the model is when it predicts its own Default on which it has learnt
pred_train <- predict( model, train)
table(pred_train)
## pred_train
## no yes
## 3474 67
##at cutoff of 50%
caret::confusionMatrix(pred_train,train$Default,positive="yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 3282 192
## yes 25 42
##
## Accuracy : 0.9387
## 95% CI : (0.9303, 0.9464)
## No Information Rate : 0.9339
## P-Value [Acc > NIR] : 0.1316
##
## Kappa : 0.2572
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.17949
## Specificity : 0.99244
## Pos Pred Value : 0.62687
## Neg Pred Value : 0.94473
## Prevalence : 0.06608
## Detection Rate : 0.01186
## Detection Prevalence : 0.01892
## Balanced Accuracy : 0.58596
##
## 'Positive' Class : yes
##
Note: False Negative i.e Predicting company has not defaulted, when the company has actually defaulted is more risky; Therefore, here Sensitivity is more important. For train dataset at 50% cutoff - sensitivity is 18% which is quite low when compared to 90+ % accuracy
Model Interpretation with respect to Beta/coefficient of the final model:
Probability of a company will default increases as the Borrowings,Contingent.liabilities…Net.worth….,TOL.TNW and Current.assets increases
Probability of a company will default decreases as the PAT.as…of.total.income,Sales,Cumulative.retained.profits,Total.term.liabilities…tangible.net.worth,Current.ratio..times.,Creditors.turnover,Raw.material.turnover ,Adjusted.EPS , Total.liabilities increases
Residual analysis for the model to check for hetroscadicity.
library(arm)
binnedplot(fitted(model$finalModel),
residuals(model$finalModel, type = "response"),
nclass = NULL,
xlab = "Expected Values",
ylab = "Average residual",
main = "Binned residual plot",
cex.pts = 0.8,
col.pts = 1,
col.int = "gray")

The grey lines represent plus minus side of 2 standard devaiation bands, which we would expect to contain about 95% of the observations. This model does look reasonable as the majority of the fitted values seem to fall inside the bands and are randomly distributed.
Checking for multi collinearity in the final model.
vif(model$finalModel)
## PAT.as...of.total.income
## 1.512013
## Cash.profit.as...of.total.income
## 1.427629
## PAT.as...of.net.worth
## 1.502264
## Sales
## 3.340292
## Other.income
## 1.298118
## Borrowings
## 4.096163
## Cumulative.retained.profits
## 1.465548
## TOL.TNW
## 2.400515
## Total.term.liabilities...tangible.net.worth
## 2.331058
## Contingent.liabilities...Net.worth....
## 1.150472
## Current.assets
## 4.584840
## Net.working.capital
## 1.449347
## Current.ratio..times.
## 1.239409
## Debt.to.equity.ratio..times.
## 3.642694
## Cash.to.average.cost.of.sales.per.day
## 1.173554
## Creditors.turnover
## 1.317369
## Raw.material.turnover
## 1.303051
## Equity.face.value
## 1.028925
## Adjusted.EPS
## 1.249975
## Total.liabilities
## 3.202171
No multicollinearity in the final model as all vif of predictors below 10
How well is the model able to predict the new test set.
test_complete_df<-as_tibble(test_complete_df)
## Change target,set to correct level & Num column to factor
test_imputed_nonum_df=dplyr::select(test_complete_df,-Num)
colnames(test_imputed_nonum_df) <- make.names(colnames(test_imputed_nonum_df))
table(test_imputed_nonum_df$Default)
##
## no yes
## 661 54
pred_test <- predict(model, newdata =test_imputed_nonum_df)
caret::confusionMatrix(pred_test,test_imputed_nonum_df$Default,positive="yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 648 40
## yes 13 14
##
## Accuracy : 0.9259
## 95% CI : (0.9042, 0.944)
## No Information Rate : 0.9245
## P-Value [Acc > NIR] : 0.4797595
##
## Kappa : 0.311
##
## Mcnemar's Test P-Value : 0.0003551
##
## Sensitivity : 0.25926
## Specificity : 0.98033
## Pos Pred Value : 0.51852
## Neg Pred Value : 0.94186
## Prevalence : 0.07552
## Detection Rate : 0.01958
## Detection Prevalence : 0.03776
## Balanced Accuracy : 0.61980
##
## 'Positive' Class : yes
##
The Confusion Matrix is at cutoff of 50% ;
The good news is sensitivity has incraesed in test dataset (more than train’s sensitivity) and accuracy is 90+%. Still it is not best; Another approach could be to find the Optimal Cutoff and then check the sensitivity. Using ROC curve of train dataset, we will try to find the cutoff at highest accuracy.
library(ROCR)
train_pred_prob = predict(model,train , type = "prob")
ROC_pred_train <- prediction(train_pred_prob[2], train$Default)
##Identifying the Best Accuracy & cutoff for this accuracy
ROC_acc_train <- performance(ROC_pred_train, "acc")
plot(ROC_acc_train,main="For getting best cutoff at highest accuracy using train set")
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

max<-which.max(slot(ROC_acc_train,"y.values")[[1]])
max_accuracy<-slot(ROC_acc_train,"y.values")[[1]][max]
cut_off <-slot(ROC_acc_train,"x.values")[[1]][max]
print(c(Accuracy=max_accuracy, cutoff=cut_off))
## Accuracy cutoff
## 0.9392827 0.4892911
Hence highest accuracy that can be acheived by this model for train data is 93% & will be achieved at cutoff of 0.489 (quite near to 0.5 threshold). Hence, no need to create another model at different threshold since by default confusion matrix is created at 50% probability.

Train and Test data AUC is almost same. No case of overfitting.
test_pred_prob = predict(model,test_imputed_nonum_df , type = "prob")
ROC_pred_test <- prediction(test_pred_prob[2], test_imputed_nonum_df$Default)
ROC_perf <- performance(ROC_pred_test, "tpr", "fpr")
plot(ROC_perf, main = "ROC & AUC for Test(Validation) data", col = 2, lwd = 2)
auc<-performance(ROC_pred_test,"auc")
auc<-unlist(slot(auc,"y.values"))
auc<-round(auc,4)
legend(0.6,0.2,auc,title="AUC,cex=1.2")##89%
