knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
setwd("C:/Users/aksha/OneDrive/Documents/Shilpi_xtras/Shilpi_extras/GL_BAPI/Rprgm")
suppressMessages(library(kableExtra))
## Warning: package 'kableExtra' was built under R version 3.5.3
suppressMessages(library(gridExtra))
## Warning: package 'gridExtra' was built under R version 3.5.3
suppressMessages(library(VIF))
suppressMessages(library(car))
suppressMessages(library(corrplot))
## Warning: package 'corrplot' was built under R version 3.5.3
suppressMessages(library(polycor))
## Warning: package 'polycor' was built under R version 3.5.3
suppressMessages(library(caret))
## Warning: package 'caret' was built under R version 3.5.3
## Warning: package 'ggplot2' was built under R version 3.5.3
suppressMessages(library(gridExtra))
suppressMessages(library(MASS))
## Warning: package 'MASS' was built under R version 3.5.3
suppressMessages(library(dplyr))
## Warning: package 'dplyr' was built under R version 3.5.3
suppressMessages(library(caTools))
## Warning: package 'caTools' was built under R version 3.5.3
suppressMessages(library(tidyverse))
## Warning: package 'tidyverse' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
suppressMessages(library(ggplot2))
suppressMessages(library(mice))
## Warning: package 'mice' was built under R version 3.5.3
set.seed(1234)

Data Import for Raw and Validation set

FRA_raw_read_df <- read.csv("raw_data_FRA.csv" ,na.strings = c("","?", "NA"))
test_raw_read_df <- read.csv("validation_data_FRAcsv.csv" ,na.strings = c("","?", "NA"))
####Target variable is "Default" column which is integer and binary in nature.
####1 means Default and 0 means not a Default 
FRA_raw_read_df<-as_tibble(FRA_raw_read_df)
test_raw_read_df<-as_tibble(test_raw_read_df)

kable(head(FRA_raw_read_df)) %>% kable_styling(latex_options = "striped","bordered", position = "float_left")
Num Default Total.assets Net.worth Total.income Change.in.stock Total.expenses Profit.after.tax PBDITA PBT Cash.profit PBDITA.as…of.total.income PBT.as…of.total.income PAT.as…of.total.income Cash.profit.as…of.total.income PAT.as…of.net.worth Sales Income.from.financial.services Other.income Total.capital Reserves.and.funds Deposits..accepted.by.commercial.banks. Borrowings Current.liabilities…provisions Deferred.tax.liability Shareholders.funds Cumulative.retained.profits Capital.employed TOL.TNW Total.term.liabilities…tangible.net.worth Contingent.liabilities…Net.worth…. Contingent.liabilities Net.fixed.assets Investments Current.assets Net.working.capital Quick.ratio..times. Current.ratio..times. Debt.to.equity.ratio..times. Cash.to.current.liabilities..times. Cash.to.average.cost.of.sales.per.day Creditors.turnover Debtors.turnover Finished.goods.turnover WIP.turnover Raw.material.turnover Shares.outstanding Equity.face.value EPS Adjusted.EPS Total.liabilities PE.on.BSE
1 0 17512.3 7093.2 24965.2 235.8 23657.8 1543.2 2860.2 2417.2 1872.8 11.46 9.68 6.18 7.50 23.78 24458.0 158.0 297.2 423.8 6822.8 NA 14.9 9965.9 284.9 7093.2 6263.3 7108.1 1.33 0.00 14.80 1049.7 1900.2 1069.6 13277.5 3588.5 1.18 1.37 0.00 0.43 68.21 3.62 3.85 200.55 21.78 7.71 42381675 10 35.52 7.10 17512.3 27.31
2 0 941.0 351.5 1527.4 42.7 1454.9 115.2 283.0 188.4 158.6 18.53 12.33 7.54 10.38 38.08 1504.3 4.0 15.9 115.5 257.8 NA 272.5 210.0 85.2 351.5 247.4 624.0 1.23 0.34 19.23 67.6 286.4 2.2 563.9 203.5 0.95 1.56 0.78 0.06 5.96 9.80 5.70 14.21 7.49 11.46 11550000 10 9.97 9.97 941.0 8.17
3 0 232.8 100.6 477.3 -5.2 478.7 -6.6 5.8 -6.6 0.3 1.22 -1.38 -1.38 0.06 -6.35 475.6 1.5 0.2 81.4 19.2 NA 35.4 96.8 NA 100.6 32.4 136.0 1.44 0.29 45.83 46.1 38.7 4.3 167.5 59.6 1.11 1.55 0.35 0.21 17.07 5.28 5.07 9.24 0.23 NA 8149090 10 -0.50 -0.50 232.8 -5.76
4 0 2.7 2.7 NA NA NA NA NA NA NA 0.00 0.00 0.00 0.00 0.00 NA NA NA 0.5 2.2 NA NA NA NA 2.7 2.2 2.7 0.00 0.00 0.00 NA 2.5 NA 0.2 0.2 NA NA 0.00 NA NA 0.00 0.00 NA NA 0.00 52404 10 0.00 0.00 2.7 NA
5 0 478.5 107.6 1580.5 -17.0 1558.0 5.5 31.0 6.3 11.9 1.96 0.40 0.35 0.75 5.25 1575.1 3.9 0.9 6.2 161.8 NA 193.1 112.8 4.6 107.6 82.7 300.7 2.83 1.59 34.94 37.6 94.8 7.4 349.7 215.8 1.41 2.54 1.79 0.00 0.00 13.00 9.46 12.68 7.90 17.03 619635 10 7.91 7.91 478.5 NA
6 0 2434.4 675.8 2648.6 62.3 2636.4 74.5 200.1 74.5 146.9 7.55 2.81 2.81 5.55 21.78 2639.5 6.4 0.2 33.8 972.0 NA 717.1 555.9 54.4 698.2 317.7 1415.3 1.80 0.37 36.28 245.2 864.9 22.7 1296.2 278.5 0.48 1.27 1.09 0.11 15.78 6.50 21.13 10.14 8.38 4.74 1141718 10 30.57 15.28 2434.4 NA
kable(head(test_raw_read_df)) %>% kable_styling(latex_options = "striped","bordered", position = "float_left")
Num Default…1 Total.assets Net.worth Total.income Change.in.stock Total.expenses Profit.after.tax PBDITA PBT Cash.profit PBDITA.as…of.total.income PBT.as…of.total.income PAT.as…of.total.income Cash.profit.as…of.total.income PAT.as…of.net.worth Sales Income.from.financial.services Other.income Total.capital Reserves.and.funds Deposits..accepted.by.commercial.banks. Borrowings Current.liabilities…provisions Deferred.tax.liability Shareholders.funds Cumulative.retained.profits Capital.employed TOL.TNW Total.term.liabilities…tangible.net.worth Contingent.liabilities…Net.worth…. Contingent.liabilities Net.fixed.assets Investments Current.assets Net.working.capital Quick.ratio..times. Current.ratio..times. Debt.to.equity.ratio..times. Cash.to.current.liabilities..times. Cash.to.average.cost.of.sales.per.day Creditors.turnover Debtors.turnover Finished.goods.turnover WIP.turnover Raw.material.turnover Shares.outstanding Equity.face.value EPS Adjusted.EPS Total.liabilities PE.on.BSE
1545 0 970.6 275.8 2185.2 14.2 2099.2 100.2 285.6 152.1 182.3 13.07 6.96 4.59 8.34 42.11 2171.1 2.3 NA 48.0 413.1 NA 177.3 328.5 3.7 275.8 227.8 453.1 1.80 0.27 112.94 311.5 332.3 NA 559.7 134.2 0.92 1.31 0.64 0.09 7.56 5.94 5.74 25.11 20.01 17.58 4800000 10 18.60 18.60 970.6 NA
735 0 675.0 211.6 819.2 10.4 809.9 19.7 116.0 33.7 50.5 14.16 4.11 2.40 6.16 10.66 817.0 0.8 0.2 114.0 97.6 NA 339.8 100.5 23.1 211.6 97.6 551.4 2.01 0.72 5.77 12.2 199.1 NA 407.3 123.6 0.48 1.39 1.61 0.03 3.88 10.59 6.03 28.96 18.65 2.67 11400000 10 1.65 1.65 675.0 NA
2605 1 532.1 120.2 563.5 -28.1 577.8 -42.4 -31.0 -56.0 -35.3 -5.50 -9.94 -7.52 -6.26 -31.20 552.2 9.1 2.1 47.1 227.4 NA 17.5 240.1 NA 120.2 69.9 137.7 1.73 0.09 102.83 123.6 270.0 0.7 147.8 -97.1 0.32 0.60 0.15 0.04 4.63 2.35 9.60 8.23 6.60 3.77 471285 100 -90.39 -90.39 532.1 -15.50
1292 0 857.5 201.4 3576.5 -0.6 3613.4 -37.5 68.2 25.7 37.3 1.91 0.72 -1.05 1.04 0.00 3573.3 1.0 1.5 50.5 150.9 NA 524.2 75.2 56.7 201.4 150.9 725.6 2.94 0.81 0.65 1.3 262.8 NA 535.6 99.6 0.51 1.23 2.60 0.08 3.71 NA NA NA NA NA 5050000 10 -7.09 -7.09 857.5 -0.16
134 0 823.3 349.2 1034.3 28.9 1041.8 21.4 90.1 29.7 62.7 8.71 2.87 2.07 6.06 6.31 1026.7 0.7 2.3 33.0 316.2 NA 162.3 299.6 12.2 349.2 316.2 511.5 1.02 0.10 28.78 100.5 190.6 NA 471.8 75.3 0.58 1.19 0.46 0.08 11.15 5.48 4.78 6.28 6.70 3.70 3205946 10 5.90 5.90 823.3 NA
911 0 166.2 88.0 201.6 -0.5 200.2 0.9 10.6 3.4 3.9 5.26 1.69 0.45 1.93 0.00 201.0 0.5 0.1 23.3 56.8 NA 28.1 48.0 2.1 88.0 8.2 116.1 0.86 0.11 0.00 NA 17.1 NA 143.3 76.9 0.97 1.86 0.32 0.00 0.57 NA NA NA NA NA 146385 100 6.83 6.83 166.2 NA

Data type of Raw

str(FRA_raw_read_df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    3541 obs. of  52 variables:
##  $ Num                                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Default                                    : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Total.assets                               : num  17512.3 941 232.8 2.7 478.5 ...
##  $ Net.worth                                  : num  7093.2 351.5 100.6 2.7 107.6 ...
##  $ Total.income                               : num  24965 1527 477 NA 1580 ...
##  $ Change.in.stock                            : num  235.8 42.7 -5.2 NA -17 ...
##  $ Total.expenses                             : num  23658 1455 479 NA 1558 ...
##  $ Profit.after.tax                           : num  1543.2 115.2 -6.6 NA 5.5 ...
##  $ PBDITA                                     : num  2860.2 283 5.8 NA 31 ...
##  $ PBT                                        : num  2417.2 188.4 -6.6 NA 6.3 ...
##  $ Cash.profit                                : num  1872.8 158.6 0.3 NA 11.9 ...
##  $ PBDITA.as...of.total.income                : num  11.46 18.53 1.22 0 1.96 ...
##  $ PBT.as...of.total.income                   : num  9.68 12.33 -1.38 0 0.4 ...
##  $ PAT.as...of.total.income                   : num  6.18 7.54 -1.38 0 0.35 2.81 0 0.72 8.29 -2.88 ...
##  $ Cash.profit.as...of.total.income           : num  7.5 10.38 0.06 0 0.75 ...
##  $ PAT.as...of.net.worth                      : num  23.78 38.08 -6.35 0 5.25 ...
##  $ Sales                                      : num  24458 1504 476 NA 1575 ...
##  $ Income.from.financial.services             : num  158 4 1.5 NA 3.9 6.4 NA NA 7.3 NA ...
##  $ Other.income                               : num  297.2 15.9 0.2 NA 0.9 ...
##  $ Total.capital                              : num  423.8 115.5 81.4 0.5 6.2 ...
##  $ Reserves.and.funds                         : num  6822.8 257.8 19.2 2.2 161.8 ...
##  $ Deposits..accepted.by.commercial.banks.    : logi  NA NA NA NA NA NA ...
##  $ Borrowings                                 : num  14.9 272.5 35.4 NA 193.1 ...
##  $ Current.liabilities...provisions           : num  9965.9 210 96.8 NA 112.8 ...
##  $ Deferred.tax.liability                     : num  284.9 85.2 NA NA 4.6 ...
##  $ Shareholders.funds                         : num  7093.2 351.5 100.6 2.7 107.6 ...
##  $ Cumulative.retained.profits                : num  6263.3 247.4 32.4 2.2 82.7 ...
##  $ Capital.employed                           : num  7108.1 624 136 2.7 300.7 ...
##  $ TOL.TNW                                    : num  1.33 1.23 1.44 0 2.83 1.8 0.03 5.17 1.05 3.25 ...
##  $ Total.term.liabilities...tangible.net.worth: num  0 0.34 0.29 0 1.59 0.37 0.03 0.94 0.3 0.54 ...
##  $ Contingent.liabilities...Net.worth....     : num  14.8 19.2 45.8 0 34.9 ...
##  $ Contingent.liabilities                     : num  1049.7 67.6 46.1 NA 37.6 ...
##  $ Net.fixed.assets                           : num  1900.2 286.4 38.7 2.5 94.8 ...
##  $ Investments                                : num  1069.6 2.2 4.3 NA 7.4 ...
##  $ Current.assets                             : num  13277.5 563.9 167.5 0.2 349.7 ...
##  $ Net.working.capital                        : num  3588.5 203.5 59.6 0.2 215.8 ...
##  $ Quick.ratio..times.                        : num  1.18 0.95 1.11 NA 1.41 0.48 NA 0.54 0.59 0.39 ...
##  $ Current.ratio..times.                      : num  1.37 1.56 1.55 NA 2.54 1.27 NA 1.15 1.58 0.5 ...
##  $ Debt.to.equity.ratio..times.               : num  0 0.78 0.35 0 1.79 1.09 0.32 2.31 0.94 3.13 ...
##  $ Cash.to.current.liabilities..times.        : num  0.43 0.06 0.21 NA 0 0.11 NA 0.04 0.19 0 ...
##  $ Cash.to.average.cost.of.sales.per.day      : num  68.21 5.96 17.07 NA 0 ...
##  $ Creditors.turnover                         : num  3.62 9.8 5.28 0 13 ...
##  $ Debtors.turnover                           : num  3.85 5.7 5.07 0 9.46 ...
##  $ Finished.goods.turnover                    : num  200.55 14.21 9.24 NA 12.68 ...
##  $ WIP.turnover                               : num  21.78 7.49 0.23 NA 7.9 ...
##  $ Raw.material.turnover                      : num  7.71 11.46 NA 0 17.03 ...
##  $ Shares.outstanding                         : num  42381675 11550000 8149090 52404 619635 ...
##  $ Equity.face.value                          : num  10 10 10 10 10 10 10 NA 10 10 ...
##  $ EPS                                        : num  35.52 9.97 -0.5 0 7.91 ...
##  $ Adjusted.EPS                               : num  7.1 9.97 -0.5 0 7.91 ...
##  $ Total.liabilities                          : num  17512.3 941 232.8 2.7 478.5 ...
##  $ PE.on.BSE                                  : num  27.31 8.17 -5.76 NA NA ...

Summary of Raw data

summary(FRA_raw_read_df)
##       Num          Default         Total.assets         Net.worth       
##  Min.   :   1   Min.   :0.00000   Min.   :      0.1   Min.   :     0.0  
##  1st Qu.: 886   1st Qu.:0.00000   1st Qu.:     91.3   1st Qu.:    31.3  
##  Median :1773   Median :0.00000   Median :    309.7   Median :   102.3  
##  Mean   :1772   Mean   :0.06608   Mean   :   3443.4   Mean   :  1295.9  
##  3rd Qu.:2658   3rd Qu.:0.00000   3rd Qu.:   1098.7   3rd Qu.:   377.3  
##  Max.   :3545   Max.   :1.00000   Max.   :1176509.2   Max.   :613151.6  
##                                                                         
##   Total.income       Change.in.stock    Total.expenses      Profit.after.tax   
##  Min.   :      0.0   Min.   :-3029.40   Min.   :     -0.1   Min.   : -3908.30  
##  1st Qu.:    106.5   1st Qu.:   -1.80   1st Qu.:     95.8   1st Qu.:     0.50  
##  Median :    444.9   Median :    1.60   Median :    407.7   Median :     8.80  
##  Mean   :   4582.8   Mean   :   41.49   Mean   :   4262.9   Mean   :   277.36  
##  3rd Qu.:   1440.9   3rd Qu.:   18.05   3rd Qu.:   1359.8   3rd Qu.:    52.27  
##  Max.   :2442828.2   Max.   :14185.50   Max.   :2366035.3   Max.   :119439.10  
##  NA's   :198         NA's   :458        NA's   :139         NA's   :131        
##      PBDITA              PBT             Cash.profit       
##  Min.   :  -440.7   Min.   : -3894.80   Min.   : -2245.70  
##  1st Qu.:     6.9   1st Qu.:     0.70   1st Qu.:     2.90  
##  Median :    35.4   Median :    12.40   Median :    18.85  
##  Mean   :   578.1   Mean   :   383.81   Mean   :   392.07  
##  3rd Qu.:   150.2   3rd Qu.:    71.97   3rd Qu.:    93.20  
##  Max.   :208576.5   Max.   :145292.60   Max.   :176911.80  
##  NA's   :131        NA's   :131         NA's   :131        
##  PBDITA.as...of.total.income PBT.as...of.total.income PAT.as...of.total.income
##  Min.   :-6400.000           Min.   :-21340.00        Min.   :-21340.00       
##  1st Qu.:    5.000           1st Qu.:     0.55        1st Qu.:     0.35       
##  Median :    9.660           Median :     3.31        Median :     2.34       
##  Mean   :    4.571           Mean   :   -17.28        Mean   :   -19.20       
##  3rd Qu.:   16.390           3rd Qu.:     8.80        3rd Qu.:     6.34       
##  Max.   :  100.000           Max.   :   100.00        Max.   :   150.00       
##  NA's   :68                  NA's   :68               NA's   :68              
##  Cash.profit.as...of.total.income PAT.as...of.net.worth     Sales          
##  Min.   :-15020.000               Min.   :-748.72       Min.   :      0.1  
##  1st Qu.:     2.020               1st Qu.:   0.00       1st Qu.:    112.7  
##  Median :     5.640               Median :   7.92       Median :    453.1  
##  Mean   :    -8.229               Mean   :  10.27       Mean   :   4549.5  
##  3rd Qu.:    10.700               3rd Qu.:  20.19       3rd Qu.:   1433.5  
##  Max.   :   100.000               Max.   :2466.67       Max.   :2384984.4  
##  NA's   :68                                             NA's   :259        
##  Income.from.financial.services  Other.income      Total.capital    
##  Min.   :    0.00               Min.   :    0.00   Min.   :    0.1  
##  1st Qu.:    0.40               1st Qu.:    0.40   1st Qu.:   13.1  
##  Median :    1.80               Median :    1.40   Median :   42.1  
##  Mean   :   80.84               Mean   :   41.36   Mean   :  216.6  
##  3rd Qu.:    9.68               3rd Qu.:    5.97   3rd Qu.:  100.3  
##  Max.   :51938.20               Max.   :42856.70   Max.   :78273.2  
##  NA's   :935                    NA's   :1295       NA's   :4        
##  Reserves.and.funds Deposits..accepted.by.commercial.banks.   Borrowings       
##  Min.   : -6525.9   Mode:logical                            Min.   :     0.10  
##  1st Qu.:     5.0   NA's:3541                               1st Qu.:    23.95  
##  Median :    54.8                                           Median :    99.20  
##  Mean   :  1163.8                                           Mean   :  1122.28  
##  3rd Qu.:   277.3                                           3rd Qu.:   352.60  
##  Max.   :625137.8                                           Max.   :278257.30  
##  NA's   :85                                                 NA's   :366        
##  Current.liabilities...provisions Deferred.tax.liability Shareholders.funds
##  Min.   :     0.1                 Min.   :    0.1        Min.   :     0.0  
##  1st Qu.:    17.8                 1st Qu.:    3.2        1st Qu.:    32.0  
##  Median :    69.4                 Median :   13.4        Median :   105.6  
##  Mean   :   940.6                 Mean   :  227.2        Mean   :  1322.1  
##  3rd Qu.:   261.7                 3rd Qu.:   50.0        3rd Qu.:   393.2  
##  Max.   :352240.3                 Max.   :72796.6        Max.   :613151.6  
##  NA's   :96                       NA's   :1140                             
##  Cumulative.retained.profits Capital.employed      TOL.TNW        
##  Min.   : -6534.3            Min.   :     0.0   Min.   :-350.480  
##  1st Qu.:     1.1            1st Qu.:    60.8   1st Qu.:   0.600  
##  Median :    37.1            Median :   214.7   Median :   1.430  
##  Mean   :   890.5            Mean   :  2328.3   Mean   :   3.994  
##  3rd Qu.:   202.3            3rd Qu.:   767.3   3rd Qu.:   2.830  
##  Max.   :390133.8            Max.   :891408.9   Max.   : 473.000  
##  NA's   :38                                                       
##  Total.term.liabilities...tangible.net.worth
##  Min.   :-325.600                           
##  1st Qu.:   0.050                           
##  Median :   0.340                           
##  Mean   :   1.844                           
##  3rd Qu.:   1.000                           
##  Max.   : 456.000                           
##                                             
##  Contingent.liabilities...Net.worth.... Contingent.liabilities
##  Min.   :    0.00                       Min.   :     0.1      
##  1st Qu.:    0.00                       1st Qu.:     6.3      
##  Median :    5.33                       Median :    38.0      
##  Mean   :   53.94                       Mean   :   932.9      
##  3rd Qu.:   30.76                       3rd Qu.:   192.7      
##  Max.   :14704.27                       Max.   :559506.8      
##                                         NA's   :1188          
##  Net.fixed.assets    Investments        Current.assets     Net.working.capital
##  Min.   :     0.0   Min.   :     0.00   Min.   :     0.1   Min.   :-63839.0   
##  1st Qu.:    26.0   1st Qu.:     1.00   1st Qu.:    36.2   1st Qu.:    -1.1   
##  Median :    93.5   Median :     8.35   Median :   145.1   Median :    16.2   
##  Mean   :  1189.7   Mean   :   694.73   Mean   :  1293.4   Mean   :   138.6   
##  3rd Qu.:   344.9   3rd Qu.:    64.30   3rd Qu.:   502.2   3rd Qu.:    84.2   
##  Max.   :636604.6   Max.   :199978.60   Max.   :354815.2   Max.   : 85782.8   
##  NA's   :118        NA's   :1435        NA's   :66         NA's   :32         
##  Quick.ratio..times. Current.ratio..times. Debt.to.equity.ratio..times.
##  Min.   :  0.000     Min.   :  0.00        Min.   :  0.00              
##  1st Qu.:  0.410     1st Qu.:  0.93        1st Qu.:  0.22              
##  Median :  0.670     Median :  1.23        Median :  0.79              
##  Mean   :  1.401     Mean   :  2.13        Mean   :  2.78              
##  3rd Qu.:  1.030     3rd Qu.:  1.71        3rd Qu.:  1.75              
##  Max.   :341.000     Max.   :505.00        Max.   :456.00              
##  NA's   :93          NA's   :93                                        
##  Cash.to.current.liabilities..times. Cash.to.average.cost.of.sales.per.day
##  Min.   :  0.0000                    Min.   :     0.00                    
##  1st Qu.:  0.0200                    1st Qu.:     2.79                    
##  Median :  0.0700                    Median :     8.03                    
##  Mean   :  0.4904                    Mean   :   158.44                    
##  3rd Qu.:  0.1900                    3rd Qu.:    21.79                    
##  Max.   :165.0000                    Max.   :128040.76                    
##  NA's   :93                          NA's   :85                           
##  Creditors.turnover Debtors.turnover  Finished.goods.turnover  WIP.turnover    
##  Min.   :   0.000   Min.   :   0.00   Min.   :   -0.09        Min.   :  -0.18  
##  1st Qu.:   3.700   1st Qu.:   3.76   1st Qu.:    8.20        1st Qu.:   5.10  
##  Median :   6.095   Median :   6.32   Median :   17.27        Median :   9.76  
##  Mean   :  15.446   Mean   :  17.04   Mean   :   87.08        Mean   :  27.93  
##  3rd Qu.:  11.490   3rd Qu.:  11.68   3rd Qu.:   40.35        3rd Qu.:  20.24  
##  Max.   :2401.000   Max.   :3135.20   Max.   :17947.60        Max.   :5651.40  
##  NA's   :333        NA's   :328       NA's   :740             NA's   :640      
##  Raw.material.turnover Shares.outstanding   Equity.face.value
##  Min.   :   -2.00      Min.   :-2.147e+09   Min.   :-999999  
##  1st Qu.:    2.99      1st Qu.: 1.316e+06   1st Qu.:     10  
##  Median :    6.40      Median : 4.672e+06   Median :     10  
##  Mean   :   19.09      Mean   : 2.207e+07   Mean   :  -1334  
##  3rd Qu.:   11.85      3rd Qu.: 1.065e+07   3rd Qu.:     10  
##  Max.   :21092.00      Max.   : 4.130e+09   Max.   : 100000  
##  NA's   :361           NA's   :692          NA's   :692      
##       EPS             Adjusted.EPS       Total.liabilities     PE.on.BSE       
##  Min.   :-843181.8   Min.   :-843181.8   Min.   :      0.1   Min.   :-1116.64  
##  1st Qu.:      0.0   1st Qu.:      0.0   1st Qu.:     91.3   1st Qu.:    3.27  
##  Median :      1.4   Median :      1.2   Median :    309.7   Median :    9.10  
##  Mean   :   -220.3   Mean   :   -221.5   Mean   :   3443.4   Mean   :   63.91  
##  3rd Qu.:      9.6   3rd Qu.:      7.5   3rd Qu.:   1098.7   3rd Qu.:   17.79  
##  Max.   :  34522.5   Max.   :  34522.5   Max.   :1176509.2   Max.   :51002.74  
##                                                              NA's   :2194

Dimension of Raw data

dim(FRA_raw_read_df) ####3541*52
## [1] 3541   52

Data type of Validation

str(test_raw_read_df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    715 obs. of  52 variables:
##  $ Num                                        : int  1545 735 2605 1292 134 911 3425 2124 2066 1996 ...
##  $ Default...1                                : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Total.assets                               : num  971 675 532 858 823 ...
##  $ Net.worth                                  : num  276 212 120 201 349 ...
##  $ Total.income                               : num  2185 819 564 3576 1034 ...
##  $ Change.in.stock                            : num  14.2 10.4 -28.1 -0.6 28.9 -0.5 NA -7.7 27.2 -0.2 ...
##  $ Total.expenses                             : num  2099 810 578 3613 1042 ...
##  $ Profit.after.tax                           : num  100.2 19.7 -42.4 -37.5 21.4 ...
##  $ PBDITA                                     : num  285.6 116 -31 68.2 90.1 ...
##  $ PBT                                        : num  152.1 33.7 -56 25.7 29.7 ...
##  $ Cash.profit                                : num  182.3 50.5 -35.3 37.3 62.7 ...
##  $ PBDITA.as...of.total.income                : num  13.07 14.16 -5.5 1.91 8.71 ...
##  $ PBT.as...of.total.income                   : num  6.96 4.11 -9.94 0.72 2.87 ...
##  $ PAT.as...of.total.income                   : num  4.59 2.4 -7.52 -1.05 2.07 ...
##  $ Cash.profit.as...of.total.income           : num  8.34 6.16 -6.26 1.04 6.06 ...
##  $ PAT.as...of.net.worth                      : num  42.11 10.66 -31.2 0 6.31 ...
##  $ Sales                                      : num  2171 817 552 3573 1027 ...
##  $ Income.from.financial.services             : num  2.3 0.8 9.1 1 0.7 ...
##  $ Other.income                               : num  NA 0.2 2.1 1.5 2.3 0.1 NA NA 0.1 0.1 ...
##  $ Total.capital                              : num  48 114 47.1 50.5 33 ...
##  $ Reserves.and.funds                         : num  413.1 97.6 227.4 150.9 316.2 ...
##  $ Deposits..accepted.by.commercial.banks.    : logi  NA NA NA NA NA NA ...
##  $ Borrowings                                 : num  177.3 339.8 17.5 524.2 162.3 ...
##  $ Current.liabilities...provisions           : num  328.5 100.5 240.1 75.2 299.6 ...
##  $ Deferred.tax.liability                     : num  3.7 23.1 NA 56.7 12.2 2.1 1.9 4.4 2.9 NA ...
##  $ Shareholders.funds                         : num  276 212 120 201 349 ...
##  $ Cumulative.retained.profits                : num  227.8 97.6 69.9 150.9 316.2 ...
##  $ Capital.employed                           : num  453 551 138 726 512 ...
##  $ TOL.TNW                                    : num  1.8 2.01 1.73 2.94 1.02 0.86 0.06 1.92 0.37 1.96 ...
##  $ Total.term.liabilities...tangible.net.worth: num  0.27 0.72 0.09 0.81 0.1 0.11 0.05 0.78 0 1.81 ...
##  $ Contingent.liabilities...Net.worth....     : num  112.94 5.77 102.83 0.65 28.78 ...
##  $ Contingent.liabilities                     : num  311.5 12.2 123.6 1.3 100.5 ...
##  $ Net.fixed.assets                           : num  332 199 270 263 191 ...
##  $ Investments                                : num  NA NA 0.7 NA NA NA 17.3 2.6 NA NA ...
##  $ Current.assets                             : num  560 407 148 536 472 ...
##  $ Net.working.capital                        : num  134.2 123.6 -97.1 99.6 75.3 ...
##  $ Quick.ratio..times.                        : num  0.92 0.48 0.32 0.51 0.58 0.97 166 0.52 0.88 0.6 ...
##  $ Current.ratio..times.                      : num  1.31 1.39 0.6 1.23 1.19 1.86 166 1.56 1.19 0.55 ...
##  $ Debt.to.equity.ratio..times.               : num  0.64 1.61 0.15 2.6 0.46 0.32 0.05 1.24 0 1.81 ...
##  $ Cash.to.current.liabilities..times.        : num  0.09 0.03 0.04 0.08 0.08 0 165 0.03 0.35 0.23 ...
##  $ Cash.to.average.cost.of.sales.per.day      : num  7.56 3.88 4.63 3.71 11.15 ...
##  $ Creditors.turnover                         : num  5.94 10.59 2.35 NA 5.48 ...
##  $ Debtors.turnover                           : num  5.74 6.03 9.6 NA 4.78 ...
##  $ Finished.goods.turnover                    : num  25.11 28.96 8.23 NA 6.28 ...
##  $ WIP.turnover                               : num  20 18.6 6.6 NA 6.7 ...
##  $ Raw.material.turnover                      : num  17.58 2.67 3.77 NA 3.7 ...
##  $ Shares.outstanding                         : num  4800000 11400000 471285 5050000 3205946 ...
##  $ Equity.face.value                          : num  10 10 100 10 10 100 10 NA 10 10 ...
##  $ EPS                                        : num  18.6 1.65 -90.39 -7.09 5.9 ...
##  $ Adjusted.EPS                               : num  18.6 1.65 -90.39 -7.09 5.9 ...
##  $ Total.liabilities                          : num  971 675 532 858 823 ...
##  $ PE.on.BSE                                  : num  NA NA -15.5 -0.16 NA NA NA NA NA NA ...

Summary of Validation data

summary(test_raw_read_df)
##       Num          Default...1       Total.assets        Net.worth       
##  Min.   :   1.0   Min.   :0.00000   Min.   :     0.1   Min.   :     0.1  
##  1st Qu.: 869.5   1st Qu.:0.00000   1st Qu.:    93.2   1st Qu.:    34.4  
##  Median :1741.0   Median :0.00000   Median :   347.7   Median :   120.9  
##  Mean   :1756.3   Mean   :0.07552   Mean   :  4218.6   Mean   :  1629.7  
##  3rd Qu.:2588.0   3rd Qu.:0.00000   3rd Qu.:  1315.3   3rd Qu.:   451.5  
##  Max.   :3543.0   Max.   :1.00000   Max.   :354727.3   Max.   :171840.0  
##                                                                          
##   Total.income       Change.in.stock   Total.expenses      Profit.after.tax  
##  Min.   :      0.0   Min.   :-488.10   Min.   :      0.0   Min.   : -998.00  
##  1st Qu.:    110.8   1st Qu.:  -1.90   1st Qu.:    104.1   1st Qu.:    0.68  
##  Median :    536.0   Median :   1.80   Median :    511.1   Median :   10.20  
##  Mean   :   5204.7   Mean   :  54.66   Mean   :   4817.3   Mean   :  382.22  
##  3rd Qu.:   1727.1   3rd Qu.:  19.35   3rd Qu.:   1642.3   3rd Qu.:   68.95  
##  Max.   :1028087.4   Max.   :7540.00   Max.   :1014813.1   Max.   :62022.90  
##  NA's   :33          NA's   :92        NA's   :26          NA's   :23        
##      PBDITA               PBT            Cash.profit      
##  Min.   :  -393.90   Min.   : -993.90   Min.   : -894.60  
##  1st Qu.:     7.15   1st Qu.:    1.00   1st Qu.:    3.27  
##  Median :    42.20   Median :   14.25   Median :   22.05  
##  Mean   :   743.35   Mean   :  540.59   Mean   :  488.11  
##  3rd Qu.:   192.82   3rd Qu.:   90.50   3rd Qu.:  120.30  
##  Max.   :110557.10   Max.   :94565.20   Max.   :71581.60  
##  NA's   :23          NA's   :23         NA's   :23        
##  PBDITA.as...of.total.income PBT.as...of.total.income PAT.as...of.total.income
##  Min.   :-6400.000           Min.   :-9700.000        Min.   :-9700.000       
##  1st Qu.:    4.702           1st Qu.:    0.622        1st Qu.:    0.390       
##  Median :    9.780           Median :    3.450        Median :    2.405       
##  Mean   :   -3.681           Mean   :  -22.725        Mean   :  -24.147       
##  3rd Qu.:   16.753           3rd Qu.:    9.725        3rd Qu.:    6.790       
##  Max.   :  100.000           Max.   :  100.000        Max.   :  100.000       
##  NA's   :11                  NA's   :11               NA's   :11              
##  Cash.profit.as...of.total.income PAT.as...of.net.worth     Sales         
##  Min.   :-6400.000                Min.   :-194.520      Min.   :     0.1  
##  1st Qu.:    1.930                1st Qu.:   0.000      1st Qu.:   120.8  
##  Median :    5.835                Median :   8.710      Median :   552.5  
##  Mean   :  -12.929                Mean   :   9.666      Mean   :  5117.5  
##  3rd Qu.:   10.982                3rd Qu.:  20.215      3rd Qu.:  1721.3  
##  Max.   :  100.000                Max.   : 441.670      Max.   :976884.0  
##  NA's   :11                                             NA's   :46        
##  Income.from.financial.services  Other.income      Total.capital    
##  Min.   :   0.10                Min.   :    0.00   Min.   :    0.1  
##  1st Qu.:   0.50                1st Qu.:    0.32   1st Qu.:   14.1  
##  Median :   2.00                Median :    1.65   Median :   45.3  
##  Mean   :  83.86                Mean   :  128.16   Mean   :  263.9  
##  3rd Qu.:  10.10                3rd Qu.:    7.25   3rd Qu.:  121.1  
##  Max.   :8097.20                Max.   :42856.70   Max.   :41304.0  
##  NA's   :176                    NA's   :261        NA's   :1        
##  Reserves.and.funds  Deposits..accepted.by.commercial.banks.
##  Min.   : -1125.00   Mode:logical                           
##  1st Qu.:     7.33   NA's:715                               
##  Median :    57.45                                          
##  Mean   :  1440.70                                          
##  3rd Qu.:   334.80                                          
##  Max.   :133684.20                                          
##  NA's   :13                                                 
##    Borrowings        Current.liabilities...provisions Deferred.tax.liability
##  Min.   :     0.20   Min.   :     0.1                 Min.   :    0.10      
##  1st Qu.:    25.93   1st Qu.:    16.8                 1st Qu.:    3.10      
##  Median :   105.50   Median :    75.2                 Median :   14.70      
##  Mean   :  1439.86   Mean   :  1058.9                 Mean   :  270.45      
##  3rd Qu.:   391.82   3rd Qu.:   300.4                 3rd Qu.:   62.42      
##  Max.   :105175.30   Max.   :112712.7                 Max.   :27077.90      
##  NA's   :65          NA's   :14                       NA's   :229           
##  Shareholders.funds Cumulative.retained.profits Capital.employed   
##  Min.   :     0.1   Min.   : -2582.4            Min.   :     0.10  
##  1st Qu.:    35.5   1st Qu.:     0.8            1st Qu.:    64.35  
##  Median :   124.0   Median :    40.6            Median :   246.10  
##  Mean   :  1646.0   Mean   :  1168.1            Mean   :  2954.96  
##  3rd Qu.:   478.4   3rd Qu.:   244.5            3rd Qu.:   913.65  
##  Max.   :171840.0   Max.   :128183.1            Max.   :235389.50  
##                     NA's   :7                                      
##     TOL.TNW         Total.term.liabilities...tangible.net.worth
##  Min.   :-350.480   Min.   :-325.600                           
##  1st Qu.:   0.595   1st Qu.:   0.060                           
##  Median :   1.400   Median :   0.350                           
##  Mean   :   4.181   Mean   :   1.906                           
##  3rd Qu.:   2.800   3rd Qu.:   1.005                           
##  Max.   : 411.270   Max.   : 292.020                           
##                                                                
##  Contingent.liabilities...Net.worth.... Contingent.liabilities
##  Min.   :   0.00                        Min.   :    0.1       
##  1st Qu.:   0.00                        1st Qu.:    5.1       
##  Median :   5.52                        Median :   37.5       
##  Mean   :  64.47                        Mean   : 1022.0       
##  3rd Qu.:  31.49                        3rd Qu.:  217.1       
##  Max.   :6295.24                        Max.   :72620.8       
##                                         NA's   :214           
##  Net.fixed.assets    Investments      Current.assets     Net.working.capital
##  Min.   :     0.1   Min.   :    0.0   Min.   :     0.1   Min.   :-41908.3   
##  1st Qu.:    27.2   1st Qu.:    0.9   1st Qu.:    38.9   1st Qu.:    -1.3   
##  Median :    95.0   Median :    7.8   Median :   165.6   Median :    20.1   
##  Mean   :  1306.2   Mean   :  853.2   Mean   :  1632.9   Mean   :   283.0   
##  3rd Qu.:   409.2   3rd Qu.:   61.6   3rd Qu.:   578.0   3rd Qu.:    99.2   
##  Max.   :115737.5   Max.   :88047.8   Max.   :196614.6   Max.   : 85782.8   
##  NA's   :14         NA's   :280       NA's   :14         NA's   :5          
##  Quick.ratio..times. Current.ratio..times. Debt.to.equity.ratio..times.
##  Min.   :  0.000     Min.   :  0.000       Min.   :  0.000             
##  1st Qu.:  0.410     1st Qu.:  0.920       1st Qu.:  0.220             
##  Median :  0.660     Median :  1.230       Median :  0.800             
##  Mean   :  1.968     Mean   :  2.880       Mean   :  3.327             
##  3rd Qu.:  1.020     3rd Qu.:  1.725       3rd Qu.:  1.700             
##  Max.   :341.000     Max.   :505.000       Max.   :341.180             
##  NA's   :12          NA's   :12                                        
##  Cash.to.current.liabilities..times. Cash.to.average.cost.of.sales.per.day
##  Min.   :  0.0000                    Min.   :    0.000                    
##  1st Qu.:  0.0300                    1st Qu.:    3.248                    
##  Median :  0.0800                    Median :    8.130                    
##  Mean   :  0.7149                    Mean   :   79.565                    
##  3rd Qu.:  0.1900                    3rd Qu.:   22.645                    
##  Max.   :165.0000                    Max.   :15999.170                    
##  NA's   :12                          NA's   :15                           
##  Creditors.turnover Debtors.turnover   Finished.goods.turnover
##  Min.   :   0.00    Min.   :   0.000   Min.   :  -0.09        
##  1st Qu.:   3.84    1st Qu.:   4.133   1st Qu.:   8.06        
##  Median :   6.49    Median :   7.050   Median :  17.49        
##  Mean   :  23.48    Mean   :  22.264   Mean   :  71.31        
##  3rd Qu.:  12.90    3rd Qu.:  12.920   3rd Qu.:  38.67        
##  Max.   :1934.00    Max.   :2473.040   Max.   :5614.80        
##  NA's   :58         NA's   :57         NA's   :134            
##   WIP.turnover      Raw.material.turnover Shares.outstanding  Equity.face.value
##  Min.   :   0.000   Min.   :  0.000       Min.   :1.280e+02   Min.   :    1.0  
##  1st Qu.:   5.135   1st Qu.:  3.190       1st Qu.:1.262e+06   1st Qu.:   10.0  
##  Median :  10.710   Median :  6.445       Median :4.940e+06   Median :   10.0  
##  Mean   :  32.386   Mean   : 11.087       Mean   :3.187e+07   Mean   :   45.1  
##  3rd Qu.:  20.130   3rd Qu.: 11.650       3rd Qu.:1.282e+07   3rd Qu.:   10.0  
##  Max.   :5651.400   Max.   :279.960       Max.   :4.130e+09   Max.   :10000.0  
##  NA's   :124        NA's   :67            NA's   :118         NA's   :118      
##       EPS             Adjusted.EPS       Total.liabilities    PE.on.BSE       
##  Min.   :-72750.00   Min.   :-72750.00   Min.   :     0.1   Min.   :-263.920  
##  1st Qu.:     0.00   1st Qu.:     0.00   1st Qu.:    93.2   1st Qu.:   1.863  
##  Median :     1.83   Median :     1.50   Median :   347.7   Median :   7.670  
##  Mean   :   -76.87   Mean   :   -78.74   Mean   :  4218.6   Mean   :  15.132  
##  3rd Qu.:    11.46   3rd Qu.:     8.35   3rd Qu.:  1315.3   3rd Qu.:  14.665  
##  Max.   :  8784.00   Max.   :  8784.00   Max.   :354727.3   Max.   :1478.420  
##                                                             NA's   :433

Dimension of Raw data

dim(test_raw_read_df) ####715*52
## [1] 715  52
Here all 52 variables are appearing as numeric including Default(1,0 values) and Num(company) which seems to be incorrect. Num and Default should be factors; Thus casted to factors;
#colnames(dplyr::select_if(FRA_raw_read_df, is.numeric))
FRA_raw_df<-FRA_raw_read_df
colnames(FRA_raw_df) <- make.names(colnames(FRA_raw_df))####Correcting Column names
####Target Column 'Default' and Num casted to factor
FRA_raw_df$Num<-as.factor(FRA_raw_read_df$Num)
FRA_raw_df$Default<-as.factor(FRA_raw_read_df$Default)
In Raw dataset, Default column - yes means Company can Default and no means company cannot Default
levels(FRA_raw_df$Default) <- c("no", "yes") #### 0 as no and 1 as yes
table(FRA_raw_df$Default)
## 
##   no  yes 
## 3307  234
Post casting, datatype issue is fixed; there are now 2 factors along with 50 numeric columns
colnames(dplyr::select_if(FRA_raw_df, is.numeric)) #### 50  as numeric
##  [1] "Total.assets"                               
##  [2] "Net.worth"                                  
##  [3] "Total.income"                               
##  [4] "Change.in.stock"                            
##  [5] "Total.expenses"                             
##  [6] "Profit.after.tax"                           
##  [7] "PBDITA"                                     
##  [8] "PBT"                                        
##  [9] "Cash.profit"                                
## [10] "PBDITA.as...of.total.income"                
## [11] "PBT.as...of.total.income"                   
## [12] "PAT.as...of.total.income"                   
## [13] "Cash.profit.as...of.total.income"           
## [14] "PAT.as...of.net.worth"                      
## [15] "Sales"                                      
## [16] "Income.from.financial.services"             
## [17] "Other.income"                               
## [18] "Total.capital"                              
## [19] "Reserves.and.funds"                         
## [20] "Borrowings"                                 
## [21] "Current.liabilities...provisions"           
## [22] "Deferred.tax.liability"                     
## [23] "Shareholders.funds"                         
## [24] "Cumulative.retained.profits"                
## [25] "Capital.employed"                           
## [26] "TOL.TNW"                                    
## [27] "Total.term.liabilities...tangible.net.worth"
## [28] "Contingent.liabilities...Net.worth...."     
## [29] "Contingent.liabilities"                     
## [30] "Net.fixed.assets"                           
## [31] "Investments"                                
## [32] "Current.assets"                             
## [33] "Net.working.capital"                        
## [34] "Quick.ratio..times."                        
## [35] "Current.ratio..times."                      
## [36] "Debt.to.equity.ratio..times."               
## [37] "Cash.to.current.liabilities..times."        
## [38] "Cash.to.average.cost.of.sales.per.day"      
## [39] "Creditors.turnover"                         
## [40] "Debtors.turnover"                           
## [41] "Finished.goods.turnover"                    
## [42] "WIP.turnover"                               
## [43] "Raw.material.turnover"                      
## [44] "Shares.outstanding"                         
## [45] "Equity.face.value"                          
## [46] "EPS"                                        
## [47] "Adjusted.EPS"                               
## [48] "Total.liabilities"                          
## [49] "PE.on.BSE"
colnames(dplyr::select_if(FRA_raw_df, is.factor))#### 2 as factors
## [1] "Num"     "Default"
Dropping the columns: “Deposits..accepted.by.commercial.banks.” as that column has all nulls from Raw and Validation dataset
FRA_new_df<-dplyr::select(FRA_raw_df,-Deposits..accepted.by.commercial.banks.)
##Same treatement to validation/test set
test_raw_df<-test_raw_read_df
str(test_raw_df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    715 obs. of  52 variables:
##  $ Num                                        : int  1545 735 2605 1292 134 911 3425 2124 2066 1996 ...
##  $ Default...1                                : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Total.assets                               : num  971 675 532 858 823 ...
##  $ Net.worth                                  : num  276 212 120 201 349 ...
##  $ Total.income                               : num  2185 819 564 3576 1034 ...
##  $ Change.in.stock                            : num  14.2 10.4 -28.1 -0.6 28.9 -0.5 NA -7.7 27.2 -0.2 ...
##  $ Total.expenses                             : num  2099 810 578 3613 1042 ...
##  $ Profit.after.tax                           : num  100.2 19.7 -42.4 -37.5 21.4 ...
##  $ PBDITA                                     : num  285.6 116 -31 68.2 90.1 ...
##  $ PBT                                        : num  152.1 33.7 -56 25.7 29.7 ...
##  $ Cash.profit                                : num  182.3 50.5 -35.3 37.3 62.7 ...
##  $ PBDITA.as...of.total.income                : num  13.07 14.16 -5.5 1.91 8.71 ...
##  $ PBT.as...of.total.income                   : num  6.96 4.11 -9.94 0.72 2.87 ...
##  $ PAT.as...of.total.income                   : num  4.59 2.4 -7.52 -1.05 2.07 ...
##  $ Cash.profit.as...of.total.income           : num  8.34 6.16 -6.26 1.04 6.06 ...
##  $ PAT.as...of.net.worth                      : num  42.11 10.66 -31.2 0 6.31 ...
##  $ Sales                                      : num  2171 817 552 3573 1027 ...
##  $ Income.from.financial.services             : num  2.3 0.8 9.1 1 0.7 ...
##  $ Other.income                               : num  NA 0.2 2.1 1.5 2.3 0.1 NA NA 0.1 0.1 ...
##  $ Total.capital                              : num  48 114 47.1 50.5 33 ...
##  $ Reserves.and.funds                         : num  413.1 97.6 227.4 150.9 316.2 ...
##  $ Deposits..accepted.by.commercial.banks.    : logi  NA NA NA NA NA NA ...
##  $ Borrowings                                 : num  177.3 339.8 17.5 524.2 162.3 ...
##  $ Current.liabilities...provisions           : num  328.5 100.5 240.1 75.2 299.6 ...
##  $ Deferred.tax.liability                     : num  3.7 23.1 NA 56.7 12.2 2.1 1.9 4.4 2.9 NA ...
##  $ Shareholders.funds                         : num  276 212 120 201 349 ...
##  $ Cumulative.retained.profits                : num  227.8 97.6 69.9 150.9 316.2 ...
##  $ Capital.employed                           : num  453 551 138 726 512 ...
##  $ TOL.TNW                                    : num  1.8 2.01 1.73 2.94 1.02 0.86 0.06 1.92 0.37 1.96 ...
##  $ Total.term.liabilities...tangible.net.worth: num  0.27 0.72 0.09 0.81 0.1 0.11 0.05 0.78 0 1.81 ...
##  $ Contingent.liabilities...Net.worth....     : num  112.94 5.77 102.83 0.65 28.78 ...
##  $ Contingent.liabilities                     : num  311.5 12.2 123.6 1.3 100.5 ...
##  $ Net.fixed.assets                           : num  332 199 270 263 191 ...
##  $ Investments                                : num  NA NA 0.7 NA NA NA 17.3 2.6 NA NA ...
##  $ Current.assets                             : num  560 407 148 536 472 ...
##  $ Net.working.capital                        : num  134.2 123.6 -97.1 99.6 75.3 ...
##  $ Quick.ratio..times.                        : num  0.92 0.48 0.32 0.51 0.58 0.97 166 0.52 0.88 0.6 ...
##  $ Current.ratio..times.                      : num  1.31 1.39 0.6 1.23 1.19 1.86 166 1.56 1.19 0.55 ...
##  $ Debt.to.equity.ratio..times.               : num  0.64 1.61 0.15 2.6 0.46 0.32 0.05 1.24 0 1.81 ...
##  $ Cash.to.current.liabilities..times.        : num  0.09 0.03 0.04 0.08 0.08 0 165 0.03 0.35 0.23 ...
##  $ Cash.to.average.cost.of.sales.per.day      : num  7.56 3.88 4.63 3.71 11.15 ...
##  $ Creditors.turnover                         : num  5.94 10.59 2.35 NA 5.48 ...
##  $ Debtors.turnover                           : num  5.74 6.03 9.6 NA 4.78 ...
##  $ Finished.goods.turnover                    : num  25.11 28.96 8.23 NA 6.28 ...
##  $ WIP.turnover                               : num  20 18.6 6.6 NA 6.7 ...
##  $ Raw.material.turnover                      : num  17.58 2.67 3.77 NA 3.7 ...
##  $ Shares.outstanding                         : num  4800000 11400000 471285 5050000 3205946 ...
##  $ Equity.face.value                          : num  10 10 100 10 10 100 10 NA 10 10 ...
##  $ EPS                                        : num  18.6 1.65 -90.39 -7.09 5.9 ...
##  $ Adjusted.EPS                               : num  18.6 1.65 -90.39 -7.09 5.9 ...
##  $ Total.liabilities                          : num  971 675 532 858 823 ...
##  $ PE.on.BSE                                  : num  NA NA -15.5 -0.16 NA NA NA NA NA NA ...
colnames(test_raw_df) <- make.names(colnames(test_raw_df))
test_raw_df$Num<-as.factor(test_raw_df$Num)
##Rename column from Default...1 to Default
test_raw_df<-dplyr::rename(test_raw_df,Default = Default...1)
colnames(test_raw_df)
##  [1] "Num"                                        
##  [2] "Default"                                    
##  [3] "Total.assets"                               
##  [4] "Net.worth"                                  
##  [5] "Total.income"                               
##  [6] "Change.in.stock"                            
##  [7] "Total.expenses"                             
##  [8] "Profit.after.tax"                           
##  [9] "PBDITA"                                     
## [10] "PBT"                                        
## [11] "Cash.profit"                                
## [12] "PBDITA.as...of.total.income"                
## [13] "PBT.as...of.total.income"                   
## [14] "PAT.as...of.total.income"                   
## [15] "Cash.profit.as...of.total.income"           
## [16] "PAT.as...of.net.worth"                      
## [17] "Sales"                                      
## [18] "Income.from.financial.services"             
## [19] "Other.income"                               
## [20] "Total.capital"                              
## [21] "Reserves.and.funds"                         
## [22] "Deposits..accepted.by.commercial.banks."    
## [23] "Borrowings"                                 
## [24] "Current.liabilities...provisions"           
## [25] "Deferred.tax.liability"                     
## [26] "Shareholders.funds"                         
## [27] "Cumulative.retained.profits"                
## [28] "Capital.employed"                           
## [29] "TOL.TNW"                                    
## [30] "Total.term.liabilities...tangible.net.worth"
## [31] "Contingent.liabilities...Net.worth...."     
## [32] "Contingent.liabilities"                     
## [33] "Net.fixed.assets"                           
## [34] "Investments"                                
## [35] "Current.assets"                             
## [36] "Net.working.capital"                        
## [37] "Quick.ratio..times."                        
## [38] "Current.ratio..times."                      
## [39] "Debt.to.equity.ratio..times."               
## [40] "Cash.to.current.liabilities..times."        
## [41] "Cash.to.average.cost.of.sales.per.day"      
## [42] "Creditors.turnover"                         
## [43] "Debtors.turnover"                           
## [44] "Finished.goods.turnover"                    
## [45] "WIP.turnover"                               
## [46] "Raw.material.turnover"                      
## [47] "Shares.outstanding"                         
## [48] "Equity.face.value"                          
## [49] "EPS"                                        
## [50] "Adjusted.EPS"                               
## [51] "Total.liabilities"                          
## [52] "PE.on.BSE"
test_raw_df$Default<-as.factor(test_raw_df$Default)
levels(test_raw_df$Default) <- c("no", "yes") ## 0 as no and 1 as yes
table(test_raw_df$Default)
## 
##  no yes 
## 661  54
colnames(dplyr::select_if(test_raw_df, is.numeric)) 
##  [1] "Total.assets"                               
##  [2] "Net.worth"                                  
##  [3] "Total.income"                               
##  [4] "Change.in.stock"                            
##  [5] "Total.expenses"                             
##  [6] "Profit.after.tax"                           
##  [7] "PBDITA"                                     
##  [8] "PBT"                                        
##  [9] "Cash.profit"                                
## [10] "PBDITA.as...of.total.income"                
## [11] "PBT.as...of.total.income"                   
## [12] "PAT.as...of.total.income"                   
## [13] "Cash.profit.as...of.total.income"           
## [14] "PAT.as...of.net.worth"                      
## [15] "Sales"                                      
## [16] "Income.from.financial.services"             
## [17] "Other.income"                               
## [18] "Total.capital"                              
## [19] "Reserves.and.funds"                         
## [20] "Borrowings"                                 
## [21] "Current.liabilities...provisions"           
## [22] "Deferred.tax.liability"                     
## [23] "Shareholders.funds"                         
## [24] "Cumulative.retained.profits"                
## [25] "Capital.employed"                           
## [26] "TOL.TNW"                                    
## [27] "Total.term.liabilities...tangible.net.worth"
## [28] "Contingent.liabilities...Net.worth...."     
## [29] "Contingent.liabilities"                     
## [30] "Net.fixed.assets"                           
## [31] "Investments"                                
## [32] "Current.assets"                             
## [33] "Net.working.capital"                        
## [34] "Quick.ratio..times."                        
## [35] "Current.ratio..times."                      
## [36] "Debt.to.equity.ratio..times."               
## [37] "Cash.to.current.liabilities..times."        
## [38] "Cash.to.average.cost.of.sales.per.day"      
## [39] "Creditors.turnover"                         
## [40] "Debtors.turnover"                           
## [41] "Finished.goods.turnover"                    
## [42] "WIP.turnover"                               
## [43] "Raw.material.turnover"                      
## [44] "Shares.outstanding"                         
## [45] "Equity.face.value"                          
## [46] "EPS"                                        
## [47] "Adjusted.EPS"                               
## [48] "Total.liabilities"                          
## [49] "PE.on.BSE"
colnames(dplyr::select_if(test_raw_df, is.factor))
## [1] "Num"     "Default"
colnames(test_raw_df)
##  [1] "Num"                                        
##  [2] "Default"                                    
##  [3] "Total.assets"                               
##  [4] "Net.worth"                                  
##  [5] "Total.income"                               
##  [6] "Change.in.stock"                            
##  [7] "Total.expenses"                             
##  [8] "Profit.after.tax"                           
##  [9] "PBDITA"                                     
## [10] "PBT"                                        
## [11] "Cash.profit"                                
## [12] "PBDITA.as...of.total.income"                
## [13] "PBT.as...of.total.income"                   
## [14] "PAT.as...of.total.income"                   
## [15] "Cash.profit.as...of.total.income"           
## [16] "PAT.as...of.net.worth"                      
## [17] "Sales"                                      
## [18] "Income.from.financial.services"             
## [19] "Other.income"                               
## [20] "Total.capital"                              
## [21] "Reserves.and.funds"                         
## [22] "Deposits..accepted.by.commercial.banks."    
## [23] "Borrowings"                                 
## [24] "Current.liabilities...provisions"           
## [25] "Deferred.tax.liability"                     
## [26] "Shareholders.funds"                         
## [27] "Cumulative.retained.profits"                
## [28] "Capital.employed"                           
## [29] "TOL.TNW"                                    
## [30] "Total.term.liabilities...tangible.net.worth"
## [31] "Contingent.liabilities...Net.worth...."     
## [32] "Contingent.liabilities"                     
## [33] "Net.fixed.assets"                           
## [34] "Investments"                                
## [35] "Current.assets"                             
## [36] "Net.working.capital"                        
## [37] "Quick.ratio..times."                        
## [38] "Current.ratio..times."                      
## [39] "Debt.to.equity.ratio..times."               
## [40] "Cash.to.current.liabilities..times."        
## [41] "Cash.to.average.cost.of.sales.per.day"      
## [42] "Creditors.turnover"                         
## [43] "Debtors.turnover"                           
## [44] "Finished.goods.turnover"                    
## [45] "WIP.turnover"                               
## [46] "Raw.material.turnover"                      
## [47] "Shares.outstanding"                         
## [48] "Equity.face.value"                          
## [49] "EPS"                                        
## [50] "Adjusted.EPS"                               
## [51] "Total.liabilities"                          
## [52] "PE.on.BSE"
##Dropping col Deposits for test datasets
test_new_df<-dplyr::select(test_raw_df,-Deposits..accepted.by.commercial.banks.)
Post dropping column, there are now 51 columns in total(49 numeric + 2 factors) in Raw dataset
dim(FRA_new_df)
## [1] 3541   51
Post dropping column, there are now 51 columns in total(49 numeric + 2 factors) in Validation dataset
dim(test_new_df)
## [1] 715  51
Outlier removal for Raw and Validation dataset; The idea is mark outlier below 5% and above 95% percentile as outlier and mark it as NA. The original dataset already has missing data. The idea is if any column or rows has more than 50% missing data, will be dropped as it does not make sense to impute them.
#dim(FRA_num_outasNA_dframe) ##49 Numeric cols 
#summary(FRA_num_outasNA_dframe)

#dim(test_num_outasNA_dframe) ##49 Numeric cols
#summary(test_num_outasNA_dframe)

Know percenatge of missing data with NA in data column wise in Raw dataset

p<-function(x) {sum(is.na(x))/length(x)*100}
apply(FRA_num_outasNA_dframe,2,p) ## for raw
##                                Total.assets 
##                                    9.912454 
##                                   Net.worth 
##                                    9.968935 
##                                Total.income 
##                                   15.052245 
##                             Change.in.stock 
##                                   21.688788 
##                              Total.expenses 
##                                   13.583733 
##                            Profit.after.tax 
##                                   13.357809 
##                                      PBDITA 
##                                   13.329568 
##                                         PBT 
##                                   13.357809 
##                                 Cash.profit 
##                                   13.329568 
##                 PBDITA.as...of.total.income 
##                                   11.719853 
##                    PBT.as...of.total.income 
##                                   11.719853 
##                    PAT.as...of.total.income 
##                                   11.719853 
##            Cash.profit.as...of.total.income 
##                                   11.748094 
##                       PAT.as...of.net.worth 
##                                    9.997176 
##                                       Sales 
##                                   16.633719 
##              Income.from.financial.services 
##                                   30.132731 
##                                Other.income 
##                                   39.819260 
##                               Total.capital 
##                                    9.997176 
##                          Reserves.and.funds 
##                                   12.171703 
##                                  Borrowings 
##                                   19.175374 
##            Current.liabilities...provisions 
##                                   12.397628 
##                      Deferred.tax.liability 
##                                   38.435470 
##                          Shareholders.funds 
##                                    9.997176 
##                 Cumulative.retained.profits 
##                                   11.013838 
##                            Capital.employed 
##                                    9.940695 
##                                     TOL.TNW 
##                                    9.630048 
## Total.term.liabilities...tangible.net.worth 
##                                    5.393957 
##      Contingent.liabilities...Net.worth.... 
##                                    4.998588 
##                      Contingent.liabilities 
##                                   39.762779 
##                            Net.fixed.assets 
##                                   13.018921 
##                                 Investments 
##                                   43.970630 
##                              Current.assets 
##                                   11.635131 
##                         Net.working.capital 
##                                   10.844394 
##                         Quick.ratio..times. 
##                                   12.115222 
##                       Current.ratio..times. 
##                                   12.369387 
##                Debt.to.equity.ratio..times. 
##                                    4.998588 
##         Cash.to.current.liabilities..times. 
##                                    7.370799 
##       Cash.to.average.cost.of.sales.per.day 
##                                   12.143462 
##                          Creditors.turnover 
##                                   13.950861 
##                            Debtors.turnover 
##                                   13.809658 
##                     Finished.goods.turnover 
##                                   28.777182 
##                                WIP.turnover 
##                                   26.263767 
##                       Raw.material.turnover 
##                                   14.769839 
##                          Shares.outstanding 
##                                   27.619317 
##                           Equity.face.value 
##                                   24.597571 
##                                         EPS 
##                                    9.968935 
##                                Adjusted.EPS 
##                                    9.997176 
##                           Total.liabilities 
##                                    9.912454 
##                                   PE.on.BSE 
##                                   65.800621

Know percenatge of missing data with NA in data column wise in Validation dataset

apply(test_num_outasNA_dframe,2,p) ## for test
##                                Total.assets 
##                                   10.069930 
##                                   Net.worth 
##                                    9.930070 
##                                Total.income 
##                                   14.405594 
##                             Change.in.stock 
##                                   21.818182 
##                              Total.expenses 
##                                   13.426573 
##                            Profit.after.tax 
##                                   13.006993 
##                                      PBDITA 
##                                   13.006993 
##                                         PBT 
##                                   13.006993 
##                                 Cash.profit 
##                                   13.006993 
##                 PBDITA.as...of.total.income 
##                                   11.608392 
##                    PBT.as...of.total.income 
##                                   11.608392 
##                    PAT.as...of.total.income 
##                                   11.608392 
##            Cash.profit.as...of.total.income 
##                                   11.608392 
##                       PAT.as...of.net.worth 
##                                   10.069930 
##                                       Sales 
##                                   15.944056 
##              Income.from.financial.services 
##                                   28.391608 
##                                Other.income 
##                                   39.860140 
##                               Total.capital 
##                                   10.069930 
##                          Reserves.and.funds 
##                                   11.888112 
##                                  Borrowings 
##                                   18.321678 
##            Current.liabilities...provisions 
##                                   11.608392 
##                      Deferred.tax.liability 
##                                   38.741259 
##                          Shareholders.funds 
##                                    9.930070 
##                 Cumulative.retained.profits 
##                                   11.048951 
##                            Capital.employed 
##                                   10.069930 
##                                     TOL.TNW 
##                                    9.930070 
## Total.term.liabilities...tangible.net.worth 
##                                    5.734266 
##      Contingent.liabilities...Net.worth.... 
##                                    5.034965 
##                      Contingent.liabilities 
##                                   36.503497 
##                            Net.fixed.assets 
##                                   11.608392 
##                                 Investments 
##                                   42.797203 
##                              Current.assets 
##                                   11.748252 
##                         Net.working.capital 
##                                   10.769231 
##                         Quick.ratio..times. 
##                                   11.328671 
##                       Current.ratio..times. 
##                                   11.748252 
##                Debt.to.equity.ratio..times. 
##                                    5.034965 
##         Cash.to.current.liabilities..times. 
##                                    6.713287 
##       Cash.to.average.cost.of.sales.per.day 
##                                   11.748252 
##                          Creditors.turnover 
##                                   17.342657 
##                            Debtors.turnover 
##                                   12.587413 
##                     Finished.goods.turnover 
##                                   26.853147 
##                                WIP.turnover 
##                                   25.734266 
##                       Raw.material.turnover 
##                                   13.986014 
##                          Shares.outstanding 
##                                   24.895105 
##                           Equity.face.value 
##                                   21.678322 
##                                         EPS 
##                                   10.069930 
##                                Adjusted.EPS 
##                                   10.069930 
##                           Total.liabilities 
##                                   10.069930 
##                                   PE.on.BSE 
##                                   64.755245
In raw dataset, PE.on.BSE 65% missing data; it has more than 50% missing value, so dropping it and new dimension is:
FRA_num_outasNA_df<-dplyr::select(FRA_num_outasNA_dframe,-PE.on.BSE)
dim(FRA_num_outasNA_df) ##3541*48
## [1] 3541   48
In Validation dataset, PE.on.BSE 64% missing data, it has more than 50% missing value, so dropping it and new deimsion is
test_num_outasNA_df<-dplyr::select(test_num_outasNA_dframe,-PE.on.BSE)
dim(test_num_outasNA_df) ##715*48
## [1] 715  48

Missing data treatement (Actual Nulls in the data + Outlier has been set as Nulls) for Train/Raw by using mice() with method as ‘RandformForest’

#md.pattern(FRA_num_outasNA_df,rotate.names = T) 
##Checking this diagram shows no row or column has more than 50%
imputed_FRA<-mice(FRA_num_outasNA_df, m=1,maxit=3, method='rf',printFlag = TRUE, seed=1234 )
newFRA_num_imputed_df=complete(imputed_FRA,1)
sum(is.na(newFRA_num_imputed_df))

Even after application of mice with restrictive maxit has left 351 as nulls in Total.liabilities; It is then replaced with median of that column

md.pattern(newFRA_num_imputed_df,rotate.names = T) 

##      Total.assets Net.worth Total.income Change.in.stock Total.expenses
## 3190            1         1            1               1              1
## 351             1         1            1               1              1
##                 0         0            0               0              0
##      Profit.after.tax PBDITA PBT Cash.profit PBDITA.as...of.total.income
## 3190                1      1   1           1                           1
## 351                 1      1   1           1                           1
##                     0      0   0           0                           0
##      PBT.as...of.total.income PAT.as...of.total.income
## 3190                        1                        1
## 351                         1                        1
##                             0                        0
##      Cash.profit.as...of.total.income PAT.as...of.net.worth Sales
## 3190                                1                     1     1
## 351                                 1                     1     1
##                                     0                     0     0
##      Income.from.financial.services Other.income Total.capital
## 3190                              1            1             1
## 351                               1            1             1
##                                   0            0             0
##      Reserves.and.funds Borrowings Current.liabilities...provisions
## 3190                  1          1                                1
## 351                   1          1                                1
##                       0          0                                0
##      Deferred.tax.liability Shareholders.funds Cumulative.retained.profits
## 3190                      1                  1                           1
## 351                       1                  1                           1
##                           0                  0                           0
##      Capital.employed TOL.TNW Total.term.liabilities...tangible.net.worth
## 3190                1       1                                           1
## 351                 1       1                                           1
##                     0       0                                           0
##      Contingent.liabilities...Net.worth.... Contingent.liabilities
## 3190                                      1                      1
## 351                                       1                      1
##                                           0                      0
##      Net.fixed.assets Investments Current.assets Net.working.capital
## 3190                1           1              1                   1
## 351                 1           1              1                   1
##                     0           0              0                   0
##      Quick.ratio..times. Current.ratio..times. Debt.to.equity.ratio..times.
## 3190                   1                     1                            1
## 351                    1                     1                            1
##                        0                     0                            0
##      Cash.to.current.liabilities..times. Cash.to.average.cost.of.sales.per.day
## 3190                                   1                                     1
## 351                                    1                                     1
##                                        0                                     0
##      Creditors.turnover Debtors.turnover Finished.goods.turnover WIP.turnover
## 3190                  1                1                       1            1
## 351                   1                1                       1            1
##                       0                0                       0            0
##      Raw.material.turnover Shares.outstanding Equity.face.value EPS
## 3190                     1                  1                 1   1
## 351                      1                  1                 1   1
##                          0                  0                 0   0
##      Adjusted.EPS Total.liabilities    
## 3190            1                 1   0
## 351             1                 0   1
##                 0               351 351
newFRA_num_imputed_df$Total.liabilities[is.na(newFRA_num_imputed_df$Total.liabilities)]=median(newFRA_num_imputed_df$Total.liabilities,na.rm=TRUE)
sum(is.na(newFRA_num_imputed_df)) ## 0 
## [1] 0
There are now no Nulls in Raw dataset.
## creating a dataframe with 48 numeric and 2 categorical df for raw
FRA_complete_df<-cbind(FRA_fac_df,newFRA_num_imputed_df)
# Its a healthy practice to write the data to disk post imputation as its a costly process.
write.csv(FRA_complete_df,"FRA_complete.csv",row.names=FALSE)
dim(FRA_complete_df)##48+2 cols
## [1] 3541   50
Similar treatment will be done in Validation dataset for removing Nulls
##Same missing data treatement (Actual NA + Outlier NA) for Validation/test dataset
#md.pattern(test_num_outasNA_df,rotate.names = T) 
imputed_test<-mice(test_num_outasNA_df, m=1,maxit=3, method='rf',printFlag = TRUE, seed=1234 )
newtest_num_imputed_df=complete(imputed_test,1)
sum(is.na(newtest_num_imputed_df)) ##72 still NA in test post mice
#md.pattern(newtest_num_imputed_df,rotate.names = T) 
##Total.liabilities has 72 missing value; using median to replace
summary(newtest_num_imputed_df$Total.liabilities)
newtest_num_imputed_df$Total.liabilities[is.na(newtest_num_imputed_df$Total.liabilities)]=median(newtest_num_imputed_df$Total.liabilities,na.rm=TRUE)
sum(is.na(newtest_num_imputed_df)) ## 0 
## [1] 0
## creating a dataframe with 48 numeric and 2 categorical df
test_complete_df<-cbind(test_fac_df,newtest_num_imputed_df) ##df+df using cbind
dim(test_complete_df)
## [1] 715  50
write.csv(test_complete_df,"test_complete.csv",row.names=FALSE)
dim(test_complete_df)##48+2 cols
## [1] 715  50
########test_complete_df has test_fac_df and newtest_num_imputed_df 715*50
####Correlation only for Raw data for 48 numeric columns
##checking multicollinearity using polycor
corHet <- hetcor(as.data.frame(newFRA_num_imputed_df %>% mutate_if(is.character,as.factor)))
hetCorrPlot <- function(corHet){
  melted_cormat <- reshape::melt(round(corHet,2), na.rm = TRUE)
  colnames(melted_cormat) <- c('Var1', 'Var2', 'value')
  melted_cormat <- melted_cormat %>% filter(!is.na(value))
  
  ## Plot the corelation matrix
  ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
    geom_tile(color = "black")+
    scale_fill_gradient2(low = "red", high = "green", mid = "white", 
                         midpoint = 0, limit = c(-1,1), space = "Lab", 
                         name="Heterogeneous Correlation Matrix") +
    theme_minimal()+ ## minimal theme
    theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1))+
    geom_text(aes(Var2, Var1, label = value), color = "black", size = 4)+
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      panel.grid.major = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank(),
      axis.ticks = element_blank()
    )
  print(ggheatmap)
}
#hetCorrPlot(corHet$correlations)

Checking VIF to check the magnitude for multicollinearity in Train/Raw data

dim(newFRA_num_imputed_df)
## [1] 3541   48
vif(lm(EPS~.,data=newFRA_num_imputed_df))
##                                Total.assets 
##                                   16.104301 
##                                   Net.worth 
##                                   25.584765 
##                                Total.income 
##                                   13.256508 
##                             Change.in.stock 
##                                    1.353992 
##                              Total.expenses 
##                                   24.442873 
##                            Profit.after.tax 
##                                   15.373892 
##                                      PBDITA 
##                                   14.888468 
##                                         PBT 
##                                   17.209319 
##                                 Cash.profit 
##                                   14.331497 
##                 PBDITA.as...of.total.income 
##                                    4.156362 
##                    PBT.as...of.total.income 
##                                   10.389566 
##                    PAT.as...of.total.income 
##                                    8.915065 
##            Cash.profit.as...of.total.income 
##                                    5.351971 
##                       PAT.as...of.net.worth 
##                                    1.897147 
##                                       Sales 
##                                   27.106274 
##              Income.from.financial.services 
##                                    1.774513 
##                                Other.income 
##                                    1.394993 
##                               Total.capital 
##                                    2.753443 
##                          Reserves.and.funds 
##                                    9.382020 
##                                  Borrowings 
##                                    4.527701 
##            Current.liabilities...provisions 
##                                    5.447240 
##                      Deferred.tax.liability 
##                                    3.303739 
##                          Shareholders.funds 
##                                   25.044217 
##                 Cumulative.retained.profits 
##                                    6.231180 
##                            Capital.employed 
##                                   14.393205 
##                                     TOL.TNW 
##                                    3.216945 
## Total.term.liabilities...tangible.net.worth 
##                                    3.372679 
##      Contingent.liabilities...Net.worth.... 
##                                    1.181573 
##                      Contingent.liabilities 
##                                    1.457910 
##                            Net.fixed.assets 
##                                    5.820851 
##                                 Investments 
##                                    1.612928 
##                              Current.assets 
##                                    8.088887 
##                         Net.working.capital 
##                                    2.231537 
##                         Quick.ratio..times. 
##                                    2.790158 
##                       Current.ratio..times. 
##                                    2.442653 
##                Debt.to.equity.ratio..times. 
##                                    5.386956 
##         Cash.to.current.liabilities..times. 
##                                    2.014403 
##       Cash.to.average.cost.of.sales.per.day 
##                                    1.640008 
##                          Creditors.turnover 
##                                    1.456456 
##                            Debtors.turnover 
##                                    1.465840 
##                     Finished.goods.turnover 
##                                    1.557690 
##                                WIP.turnover 
##                                    1.674422 
##                       Raw.material.turnover 
##                                    1.274257 
##                          Shares.outstanding 
##                                    2.737578 
##                           Equity.face.value 
##                                    1.113920 
##                                Adjusted.EPS 
##                                    1.356729 
##                           Total.liabilities 
##                                    3.819062
##Dropped PBT.as...of.total.income,Total.income,PBT,PBDITA,Capital.employed,Net.worth,Total.expenses,Total.assets due to huge Vif values
new_num_imputed_df<-dplyr::select(newFRA_num_imputed_df,-c(PBT.as...of.total.income,Total.income,PBT,PBDITA,Capital.employed,Net.worth,Total.expenses,Total.assets))
vif(lm(EPS~.,data=new_num_imputed_df))
##                             Change.in.stock 
##                                    1.349473 
##                            Profit.after.tax 
##                                    7.112117 
##                                 Cash.profit 
##                                    9.090822 
##                 PBDITA.as...of.total.income 
##                                    4.022844 
##                    PAT.as...of.total.income 
##                                    3.021874 
##            Cash.profit.as...of.total.income 
##                                    5.207516 
##                       PAT.as...of.net.worth 
##                                    1.855383 
##                                       Sales 
##                                    6.251357 
##              Income.from.financial.services 
##                                    1.741950 
##                                Other.income 
##                                    1.365593 
##                               Total.capital 
##                                    2.737904 
##                          Reserves.and.funds 
##                                    8.745352 
##                                  Borrowings 
##                                    3.489782 
##            Current.liabilities...provisions 
##                                    4.997676 
##                      Deferred.tax.liability 
##                                    3.250509 
##                          Shareholders.funds 
##                                    8.301318 
##                 Cumulative.retained.profits 
##                                    6.100243 
##                                     TOL.TNW 
##                                    3.196418 
## Total.term.liabilities...tangible.net.worth 
##                                    3.355634 
##      Contingent.liabilities...Net.worth.... 
##                                    1.176171 
##                      Contingent.liabilities 
##                                    1.435148 
##                            Net.fixed.assets 
##                                    5.424518 
##                                 Investments 
##                                    1.575276 
##                              Current.assets 
##                                    7.461207 
##                         Net.working.capital 
##                                    2.192806 
##                         Quick.ratio..times. 
##                                    2.787518 
##                       Current.ratio..times. 
##                                    2.438603 
##                Debt.to.equity.ratio..times. 
##                                    5.345303 
##         Cash.to.current.liabilities..times. 
##                                    2.008149 
##       Cash.to.average.cost.of.sales.per.day 
##                                    1.629659 
##                          Creditors.turnover 
##                                    1.441942 
##                            Debtors.turnover 
##                                    1.459670 
##                     Finished.goods.turnover 
##                                    1.555277 
##                                WIP.turnover 
##                                    1.671010 
##                       Raw.material.turnover 
##                                    1.266103 
##                          Shares.outstanding 
##                                    2.704633 
##                           Equity.face.value 
##                                    1.111895 
##                                Adjusted.EPS 
##                                    1.349915 
##                           Total.liabilities 
##                                    2.816435
VIF is below 10 in Raw; the issue of multicollinearity is resolved. The final dimension of the Raw dataset is:
dim(new_num_imputed_df)
## [1] 3541   40

Same process followed for Validation dataset and 8 columns are dropped due to huge VIF

FRA_complete_df<-cbind(FRA_fac_df,new_num_imputed_df)
#dim(FRA_complete_df)
##Dropping columns from test data too:
test_pvif_num_imputed_df<-dplyr::select(newtest_num_imputed_df,-c(PBT.as...of.total.income,Total.income,PBT,PBDITA,Capital.employed,Net.worth,Total.expenses,Total.assets))
test_complete_df<-cbind(test_fac_df,test_pvif_num_imputed_df) ##df+df using cbind
The final dimension of the Validation dataset is:
dim(test_complete_df)
## [1] 715  42
Dropped Predictor list along with reason of dropping it (only 10 numeric predictors + 1 Factor)
Total.income,
PBT,
PBDITA,
Capital.employed,
Net.worth,
Total.expenses,
Total.assets,
PE.on.BSE
All above due to greater than 10 vif
Deposits..accepted.by.commercial.banks.-all nulls
Total.liabilities - 65% missing
Num - Factor Company name

Dropping Num factor column as not a valid predictor

FRA_complete_df<-as_tibble(FRA_complete_df)
## Change target,set to correct level & Num column to factor
imputed_nonum_df=dplyr::select(FRA_complete_df,-Num)
colnames(imputed_nonum_df) <- make.names(colnames(imputed_nonum_df))
table(imputed_nonum_df$Default) ##7% of yes no need of smote
## 
##   no  yes 
## 3307  234
Dimension of Train/Raw dataset
dim(imputed_nonum_df)##3541*41
## [1] 3541   41

Model Building Starts here for Raw data

train <- imputed_nonum_df
dim(train) ##2479*41
anyNA(train)
levels(imputed_nonum_df$Default)
table(train$Default)

##Set up Seed
set.seed(1234)
objControl <- trainControl(method = "none",
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           savePredictions = TRUE)
## to increase iterations
control=glm.control(maxit=50) 

model <- caret::train(Default ~ ., data = train,
                      method = 'glmStepAIC',
                      trControl = objControl,
                      metric = "ROC",
                      direction = 'both'
)
model
## Generalized Linear Model with Stepwise Feature Selection 
## 
## 3541 samples
##   40 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: None
summary(model$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5222  -0.3254  -0.1588  -0.0301   4.4748  
## 
## Coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                 -1.5160220  0.2475164  -6.125
## PAT.as...of.total.income                    -0.0780114  0.0176135  -4.429
## Cash.profit.as...of.total.income            -0.0418059  0.0177989  -2.349
## PAT.as...of.net.worth                       -0.0127376  0.0085778  -1.485
## Sales                                       -0.0003605  0.0001514  -2.382
## Other.income                                -0.0276980  0.0174306  -1.589
## Borrowings                                   0.0011461  0.0003831   2.992
## Cumulative.retained.profits                 -0.0030246  0.0010567  -2.862
## TOL.TNW                                      0.1869976  0.0441101   4.239
## Total.term.liabilities...tangible.net.worth -0.2828562  0.1174974  -2.407
## Contingent.liabilities...Net.worth....       0.0056798  0.0025075   2.265
## Current.assets                               0.0009641  0.0004444   2.170
## Net.working.capital                         -0.0026375  0.0013595  -1.940
## Current.ratio..times.                       -0.4713044  0.1427510  -3.302
## Debt.to.equity.ratio..times.                 0.1254032  0.0832678   1.506
## Cash.to.average.cost.of.sales.per.day        0.0030318  0.0019823   1.529
## Creditors.turnover                          -0.0327195  0.0156490  -2.091
## Raw.material.turnover                       -0.0470851  0.0146487  -3.214
## Equity.face.value                            0.0055728  0.0034030   1.638
## Adjusted.EPS                                -0.2077995  0.0511028  -4.066
## Total.liabilities                           -0.0004812  0.0001798  -2.677
##                                             Pr(>|z|)    
## (Intercept)                                 9.07e-10 ***
## PAT.as...of.total.income                    9.46e-06 ***
## Cash.profit.as...of.total.income            0.018834 *  
## PAT.as...of.net.worth                       0.137555    
## Sales                                       0.017233 *  
## Other.income                                0.112050    
## Borrowings                                  0.002775 ** 
## Cumulative.retained.profits                 0.004206 ** 
## TOL.TNW                                     2.24e-05 ***
## Total.term.liabilities...tangible.net.worth 0.016069 *  
## Contingent.liabilities...Net.worth....      0.023502 *  
## Current.assets                              0.030042 *  
## Net.working.capital                         0.052374 .  
## Current.ratio..times.                       0.000961 ***
## Debt.to.equity.ratio..times.                0.132061    
## Cash.to.average.cost.of.sales.per.day       0.126164    
## Creditors.turnover                          0.036542 *  
## Raw.material.turnover                       0.001308 ** 
## Equity.face.value                           0.101500    
## Adjusted.EPS                                4.78e-05 ***
## Total.liabilities                           0.007439 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1723.7  on 3540  degrees of freedom
## Residual deviance: 1182.6  on 3520  degrees of freedom
## AIC: 1224.6
## 
## Number of Fisher Scoring iterations: 9
##How good the model is when it predicts its own Default on which it has learnt
pred_train <- predict( model, train)
table(pred_train)
## pred_train
##   no  yes 
## 3474   67
##at cutoff of 50%
caret::confusionMatrix(pred_train,train$Default,positive="yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3282  192
##        yes   25   42
##                                           
##                Accuracy : 0.9387          
##                  95% CI : (0.9303, 0.9464)
##     No Information Rate : 0.9339          
##     P-Value [Acc > NIR] : 0.1316          
##                                           
##                   Kappa : 0.2572          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.17949         
##             Specificity : 0.99244         
##          Pos Pred Value : 0.62687         
##          Neg Pred Value : 0.94473         
##              Prevalence : 0.06608         
##          Detection Rate : 0.01186         
##    Detection Prevalence : 0.01892         
##       Balanced Accuracy : 0.58596         
##                                           
##        'Positive' Class : yes             
## 
Note: False Negative i.e Predicting company has not defaulted, when the company has actually defaulted is more risky; Therefore, here Sensitivity is more important. For train dataset at 50% cutoff - sensitivity is 18% which is quite low when compared to 90+ % accuracy

Model Interpretation with respect to Beta/coefficient of the final model:

Probability of a company will default increases as the Borrowings,Contingent.liabilities…Net.worth….,TOL.TNW and Current.assets increases
Probability of a company will default decreases as the PAT.as…of.total.income,Sales,Cumulative.retained.profits,Total.term.liabilities…tangible.net.worth,Current.ratio..times.,Creditors.turnover,Raw.material.turnover ,Adjusted.EPS , Total.liabilities increases

Residual analysis for the model to check for hetroscadicity.

library(arm)
binnedplot(fitted(model$finalModel), 
           residuals(model$finalModel, type = "response"), 
           nclass = NULL, 
           xlab = "Expected Values", 
           ylab = "Average residual", 
           main = "Binned residual plot", 
           cex.pts = 0.8, 
           col.pts = 1, 
           col.int = "gray")

The grey lines represent plus minus side of 2 standard devaiation bands, which we would expect to contain about 95% of the observations. This model does look reasonable as the majority of the fitted values seem to fall inside the bands and are randomly distributed.

Checking for multi collinearity in the final model.

vif(model$finalModel)
##                    PAT.as...of.total.income 
##                                    1.512013 
##            Cash.profit.as...of.total.income 
##                                    1.427629 
##                       PAT.as...of.net.worth 
##                                    1.502264 
##                                       Sales 
##                                    3.340292 
##                                Other.income 
##                                    1.298118 
##                                  Borrowings 
##                                    4.096163 
##                 Cumulative.retained.profits 
##                                    1.465548 
##                                     TOL.TNW 
##                                    2.400515 
## Total.term.liabilities...tangible.net.worth 
##                                    2.331058 
##      Contingent.liabilities...Net.worth.... 
##                                    1.150472 
##                              Current.assets 
##                                    4.584840 
##                         Net.working.capital 
##                                    1.449347 
##                       Current.ratio..times. 
##                                    1.239409 
##                Debt.to.equity.ratio..times. 
##                                    3.642694 
##       Cash.to.average.cost.of.sales.per.day 
##                                    1.173554 
##                          Creditors.turnover 
##                                    1.317369 
##                       Raw.material.turnover 
##                                    1.303051 
##                           Equity.face.value 
##                                    1.028925 
##                                Adjusted.EPS 
##                                    1.249975 
##                           Total.liabilities 
##                                    3.202171
No multicollinearity in the final model as all vif of predictors below 10

How well is the model able to predict the new test set.

test_complete_df<-as_tibble(test_complete_df)
## Change target,set to correct level & Num column to factor 
test_imputed_nonum_df=dplyr::select(test_complete_df,-Num)
colnames(test_imputed_nonum_df) <- make.names(colnames(test_imputed_nonum_df))
table(test_imputed_nonum_df$Default)
## 
##  no yes 
## 661  54
pred_test <- predict(model, newdata =test_imputed_nonum_df) 
caret::confusionMatrix(pred_test,test_imputed_nonum_df$Default,positive="yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  648  40
##        yes  13  14
##                                          
##                Accuracy : 0.9259         
##                  95% CI : (0.9042, 0.944)
##     No Information Rate : 0.9245         
##     P-Value [Acc > NIR] : 0.4797595      
##                                          
##                   Kappa : 0.311          
##                                          
##  Mcnemar's Test P-Value : 0.0003551      
##                                          
##             Sensitivity : 0.25926        
##             Specificity : 0.98033        
##          Pos Pred Value : 0.51852        
##          Neg Pred Value : 0.94186        
##              Prevalence : 0.07552        
##          Detection Rate : 0.01958        
##    Detection Prevalence : 0.03776        
##       Balanced Accuracy : 0.61980        
##                                          
##        'Positive' Class : yes            
## 
The Confusion Matrix is at cutoff of 50% ;
The good news is sensitivity has incraesed in test dataset (more than train’s sensitivity) and accuracy is 90+%. Still it is not best; Another approach could be to find the Optimal Cutoff and then check the sensitivity. Using ROC curve of train dataset, we will try to find the cutoff at highest accuracy.
library(ROCR)
train_pred_prob = predict(model,train , type = "prob")
ROC_pred_train <- prediction(train_pred_prob[2], train$Default)
##Identifying the Best Accuracy & cutoff for this accuracy
ROC_acc_train <- performance(ROC_pred_train, "acc")
plot(ROC_acc_train,main="For getting best cutoff at highest accuracy using train set")
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

max<-which.max(slot(ROC_acc_train,"y.values")[[1]])
max_accuracy<-slot(ROC_acc_train,"y.values")[[1]][max]
cut_off <-slot(ROC_acc_train,"x.values")[[1]][max] 
print(c(Accuracy=max_accuracy, cutoff=cut_off)) 
##  Accuracy    cutoff 
## 0.9392827 0.4892911
Hence highest accuracy that can be acheived by this model for train data is 93% & will be achieved at cutoff of 0.489 (quite near to 0.5 threshold). Hence, no need to create another model at different threshold since by default confusion matrix is created at 50% probability.

Train and Test data AUC is almost same. No case of overfitting.

test_pred_prob = predict(model,test_imputed_nonum_df , type = "prob")
ROC_pred_test <- prediction(test_pred_prob[2], test_imputed_nonum_df$Default)
ROC_perf <- performance(ROC_pred_test, "tpr", "fpr")
plot(ROC_perf, main = "ROC & AUC for Test(Validation) data", col = 2, lwd = 2)
auc<-performance(ROC_pred_test,"auc")
auc<-unlist(slot(auc,"y.values"))
auc<-round(auc,4)
legend(0.6,0.2,auc,title="AUC,cex=1.2")##89%