library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
bankruptcy <- read.csv("C:/Users/khann/Desktop/COLLEGE/R/bankruptcy.csv",h=T)

bankruptcy <- select(bankruptcy, -c(CUSIP, FYEAR))
str(bankruptcy)
## 'data.frame':    5436 obs. of  11 variables:
##  $ DLRSN: int  0 0 0 1 0 0 0 0 1 0 ...
##  $ R1   : num  0.307 0.761 -0.514 -0.466 2.023 ...
##  $ R2   : num  0.887 0.592 0.338 0.371 0.215 ...
##  $ R3   : num  1.648 0.453 0.299 0.496 0.183 ...
##  $ R4   : num  -0.1992 -0.3699 -0.0291 -0.3734 6.6954 ...
##  $ R5   : num  1.093 0.186 -0.433 -0.267 -1.148 ...
##  $ R6   : num  -0.3133 0.0396 0.83 0.9778 -1.5059 ...
##  $ R7   : num  -0.197 0.327 -0.708 -0.611 2.876 ...
##  $ R8   : num  1.207 0.428 0.476 0.457 0.287 ...
##  $ R9   : num  0.282 1.107 2.179 0.152 -0.986 ...
##  $ R10  : num  0.1589 0.7934 2.4846 0.0478 0.7911 ...
head(bankruptcy)
##   DLRSN         R1        R2        R3          R4         R5          R6
## 1     0  0.3071395 0.8870057 1.6476808 -0.19915760  1.0929640 -0.31328867
## 2     0  0.7607365 0.5924934 0.4530028 -0.36989014  0.1861539  0.03961907
## 3     0 -0.5135961 0.3376148 0.2990145 -0.02907996 -0.4326047  0.82999324
## 4     1 -0.4661293 0.3707467 0.4960667 -0.37342897 -0.2674240  0.97779888
## 5     0  2.0234223 0.2148764 0.1825948  6.69536040 -1.1483381 -1.50588927
## 6     0  0.9074985 0.3868797 0.4778914 -0.34715872  1.4079893 -0.48390230
##            R7        R8         R9         R10
## 1 -0.19679332 1.2067628  0.2824709  0.15889625
## 2  0.32749732 0.4284181  1.1069652  0.79344276
## 3 -0.70778613 0.4761533  2.1791755  2.48458450
## 4 -0.61097507 0.4568098  0.1519511  0.04778851
## 5  2.87647687 0.2873749 -0.9864421  0.79107673
## 6  0.07025944 0.5278110  0.5024659 -0.16464802
colnames(bankruptcy)
##  [1] "DLRSN" "R1"    "R2"    "R3"    "R4"    "R5"    "R6"    "R7"   
##  [9] "R8"    "R9"    "R10"
#EDA
#From summary analysis, we see that none of the variables are having factors. Also, with domain understanding, it is clear that no variables should be converted into factors. DLRSN is the only variable which is 0/1 and this variable will be used as a predictor for logistic regression.
#R10 which is the market cap and R9 which is the sales data is in log-scale. 
#We have also checked for null values. Checking the null values in dataset is very important, and if we find any, then we have to impute the values, either with mean or median or depending upon the domain. There are no Null Values in the dataset.


#We now check the distributions of all independent variables.

par(mfrow = c(3,4))
par(mfrow = c(3,4))
i <- 4
for (i in 2:11) 
{
  hist((bankruptcy[,i]), main = paste("Distibution of ", colnames(bankruptcy[i])), xlab = colnames(bankruptcy[i]))
}


#R9 and R10 seem to be close to a normal distribution and nothing concrete can be commented about the others. R2 and R4 have heavy tails.

#Let us see what is the distribution of the bankruptcy indicator in the dataset.

par(mfrow = c(1,1))

barplot(table(bankruptcy$DLRSN), main = "Frequency of Bankruptcy Flag")

#Number of Non- Bankrupt Companies are way more than the Bankrupt ones in the data set.

#Now let us check the correlation between independent variables and the bankruptcy flag.
install.packages("GGally")
## Installing package into 'C:/Users/khann/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'GGally' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\khann\AppData\Local\Temp\Rtmp6t6mRI\downloaded_packages
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa

ggcorr(bankruptcy, label = TRUE)

#We can see that R10 has the highest negative correlation with the bankruptcy flag with the correlation coefficient as -0.4 followed by R6 and R3 with a correlation coefficient of 0.3 and -0.3 respectively. We see a high correlation between R8 and R3 and moderate correlation among other variables.
#
set.seed(13232767)
sample_index <- sample(nrow(bankruptcy),nrow(bankruptcy)*0.70)
bankruptcy.train <- bankruptcy[sample_index,]
bankruptcy.test <- bankruptcy[-sample_index,]
colnames(bankruptcy.test)
##  [1] "DLRSN" "R1"    "R2"    "R3"    "R4"    "R5"    "R6"    "R7"   
##  [9] "R8"    "R9"    "R10"
#Training a logistic regression model
bankruptcy.glm.logit<- glm(DLRSN~., family=binomial(link="logit"), data=bankruptcy.train)
summary(bankruptcy.glm.logit)
## 
## Call:
## glm(formula = DLRSN ~ ., family = binomial(link = "logit"), data = bankruptcy.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1633  -0.4588  -0.2320  -0.0981   3.4576  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.57251    0.08584 -29.969  < 2e-16 ***
## R1           0.28684    0.08080   3.550 0.000385 ***
## R2           0.54664    0.08474   6.450 1.12e-10 ***
## R3          -0.41780    0.10789  -3.873 0.000108 ***
## R4          -0.29132    0.14980  -1.945 0.051812 .  
## R5          -0.01881    0.05958  -0.316 0.752196    
## R6           0.32199    0.06115   5.266 1.40e-07 ***
## R7          -0.49603    0.11147  -4.450 8.60e-06 ***
## R8          -0.31465    0.09386  -3.352 0.000801 ***
## R9           0.21650    0.11648   1.859 0.063069 .  
## R10         -1.51574    0.10927 -13.871  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3133.2  on 3804  degrees of freedom
## Residual deviance: 2096.6  on 3794  degrees of freedom
## AIC: 2118.6
## 
## Number of Fisher Scoring iterations: 7
bankruptcy.glm.logit$deviance
## [1] 2096.574
AIC(bankruptcy.glm.logit)
## [1] 2118.574
BIC(bankruptcy.glm.logit)
## [1] 2187.259
#Model selection using the BIC criterion
#We build two models. First model is the null model containing only the bankruptcy indicator and no predictors. The second model is the full model containing all predictors.

#Now we select the best model using the stepwise selection method - with BIC criterion.

model.full <- glm(DLRSN ~ R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8 + R9 + R10, family = "binomial" ,data = bankruptcy.train)
model.null <- glm(DLRSN ~ 1, family = "binomial" ,data = bankruptcy.train)
step.BIC <- step(model.null, scope = list(lower = model.null, upper = model.full), direction = "both", k = log(nrow(bankruptcy.train)))
## Start:  AIC=3141.49
## DLRSN ~ 1
## 
##        Df Deviance    AIC
## + R10   1   2325.9 2342.4
## + R6    1   2818.8 2835.3
## + R8    1   2851.2 2867.7
## + R3    1   2868.5 2885.0
## + R1    1   2923.7 2940.1
## + R4    1   2928.3 2944.8
## + R9    1   2955.6 2972.1
## + R7    1   2969.9 2986.4
## + R2    1   2976.4 2992.9
## + R5    1   3052.0 3068.4
## <none>      3133.2 3141.5
## 
## Step:  AIC=2342.39
## DLRSN ~ R10
## 
##        Df Deviance    AIC
## + R6    1   2256.0 2280.7
## + R7    1   2257.7 2282.5
## + R4    1   2262.1 2286.8
## + R8    1   2280.1 2304.8
## + R9    1   2286.7 2311.4
## + R3    1   2292.9 2317.6
## + R1    1   2293.1 2317.8
## + R5    1   2310.9 2335.6
## <none>      2325.9 2342.4
## + R2    1   2325.6 2350.3
## - R10   1   3133.2 3141.5
## 
## Step:  AIC=2280.74
## DLRSN ~ R10 + R6
## 
##        Df Deviance    AIC
## + R4    1   2219.5 2252.4
## + R9    1   2224.9 2257.9
## + R7    1   2234.9 2267.9
## + R2    1   2236.5 2269.4
## + R8    1   2238.5 2271.5
## + R3    1   2242.4 2275.4
## <none>      2256.0 2280.7
## + R5    1   2252.1 2285.1
## + R1    1   2255.4 2288.4
## - R6    1   2325.9 2342.4
## - R10   1   2818.8 2835.3
## 
## Step:  AIC=2252.45
## DLRSN ~ R10 + R6 + R4
## 
##        Df Deviance    AIC
## + R8    1   2181.7 2222.9
## + R3    1   2183.9 2225.1
## <none>      2219.5 2252.4
## + R9    1   2211.7 2253.0
## + R7    1   2212.3 2253.6
## + R2    1   2213.3 2254.5
## + R5    1   2217.8 2259.0
## + R1    1   2218.6 2259.8
## - R4    1   2256.0 2280.7
## - R6    1   2262.1 2286.8
## - R10   1   2734.9 2759.6
## 
## Step:  AIC=2222.87
## DLRSN ~ R10 + R6 + R4 + R8
## 
##        Df Deviance    AIC
## + R2    1   2138.3 2187.8
## + R9    1   2164.5 2214.0
## <none>      2181.7 2222.9
## - R6    1   2192.7 2225.7
## + R7    1   2177.8 2227.2
## + R5    1   2178.0 2227.4
## + R1    1   2178.3 2227.7
## + R3    1   2178.5 2227.9
## - R8    1   2219.5 2252.4
## - R4    1   2238.5 2271.5
## - R10   1   2532.9 2565.8
## 
## Step:  AIC=2187.81
## DLRSN ~ R10 + R6 + R4 + R8 + R2
## 
##        Df Deviance    AIC
## + R3    1   2127.6 2185.3
## + R7    1   2129.6 2187.3
## <none>      2138.3 2187.8
## + R9    1   2133.6 2191.3
## + R1    1   2136.4 2194.1
## + R5    1   2137.0 2194.7
## - R4    1   2167.6 2208.8
## - R6    1   2168.7 2209.9
## - R2    1   2181.7 2222.9
## - R8    1   2213.3 2254.5
## - R10   1   2532.5 2573.8
## 
## Step:  AIC=2185.29
## DLRSN ~ R10 + R6 + R4 + R8 + R2 + R3
## 
##        Df Deviance    AIC
## + R7    1   2117.9 2183.9
## + R9    1   2119.2 2185.2
## <none>      2127.6 2185.3
## - R3    1   2138.3 2187.8
## - R8    1   2141.4 2190.8
## + R5    1   2125.2 2191.2
## + R1    1   2125.6 2191.6
## - R6    1   2160.0 2209.5
## - R4    1   2160.8 2210.3
## - R2    1   2178.5 2227.9
## - R10   1   2491.9 2541.4
## 
## Step:  AIC=2183.88
## DLRSN ~ R10 + R6 + R4 + R8 + R2 + R3 + R7
## 
##        Df Deviance    AIC
## + R1    1   2100.7 2174.9
## + R9    1   2109.3 2183.6
## <none>      2117.9 2183.9
## - R7    1   2127.6 2185.3
## - R3    1   2129.6 2187.3
## - R8    1   2130.7 2188.4
## + R5    1   2115.8 2189.9
## - R4    1   2135.4 2193.2
## - R6    1   2137.2 2194.9
## - R2    1   2174.5 2232.2
## - R10   1   2491.3 2549.1
## 
## Step:  AIC=2174.88
## DLRSN ~ R10 + R6 + R4 + R8 + R2 + R3 + R7 + R1
## 
##        Df Deviance    AIC
## <none>      2100.7 2174.9
## - R4    1   2111.8 2177.7
## + R9    1   2096.7 2179.1
## - R3    1   2113.3 2179.3
## - R8    1   2114.3 2180.2
## + R5    1   2100.1 2182.5
## - R1    1   2117.9 2183.9
## - R7    1   2125.6 2191.6
## - R6    1   2136.7 2202.6
## - R2    1   2159.0 2224.9
## - R10   1   2488.2 2554.2
summary(step.BIC)
## 
## Call:
## glm(formula = DLRSN ~ R10 + R6 + R4 + R8 + R2 + R3 + R7 + R1, 
##     family = "binomial", data = bankruptcy.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0903  -0.4608  -0.2280  -0.0939   3.6100  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.57759    0.08677 -29.707  < 2e-16 ***
## R10         -1.39220    0.08131 -17.123  < 2e-16 ***
## R6           0.34665    0.05839   5.936 2.92e-09 ***
## R4          -0.40914    0.15032  -2.722 0.006493 ** 
## R8          -0.34015    0.09299  -3.658 0.000254 ***
## R2           0.59480    0.08154   7.295 2.99e-13 ***
## R3          -0.37377    0.10523  -3.552 0.000382 ***
## R7          -0.52305    0.11065  -4.727 2.28e-06 ***
## R1           0.32172    0.07831   4.108 3.99e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3133.2  on 3804  degrees of freedom
## Residual deviance: 2100.7  on 3796  degrees of freedom
## AIC: 2118.7
## 
## Number of Fisher Scoring iterations: 7
#The model with BIC criterion has 8 predictors namely R1, R2, R3, R4, R6, R7, R8 and R10.
#In-Sample performance of the model
#Once we have the final logistic regression model, we will plot the ROC curve to get the area under the curve. The ROC curve is plot of Sensitivity (True Positives) vs Specificity (False Positives). Higher area under the curve, better is the model performance.

#install.packages("fields")
library(fields)
## Loading required package: spam
## Loading required package: dotCall64
## Loading required package: grid
## Spam version 2.5-1 (2019-12-12) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## Loading required package: maps
## See https://github.com/NCAR/Fields for
##  an extensive vignette, other supplements and source code
#install.packages("verification")
library(verification)
## Loading required package: boot
## Loading required package: CircStats
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: dtw
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:spam':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loaded dtw v1.21-3. See ?dtw for help, citation("dtw") for use in publication.
final.model <- glm(DLRSN ~ R1 + R2 + R3 + R4+ R6 + R7 + R8 + R10, family = "binomial" ,data = bankruptcy.train)

prob.insample <- predict(final.model, type = "response")
roc.plot(bankruptcy.train$DLRSN == "1", prob.insample)$roc.vol
## Warning in roc.plot.default(bankruptcy.train$DLRSN == "1", prob.insample):
## Large amount of unique predictions used as thresholds. Consider specifying
## thresholds.

##      Model      Area       p.value binorm.area
## 1 Model  1 0.8831149 1.137188e-181          NA
#The above figure shows the ROC Curve for the training Dataset and the Area under the Curve (AUC) obtained is 88.3%.
#Cost Function
#For obtaining the misclassification rate, we need to decide the threshold for categorizing the bankruptcy status. By using the cost function, we will plot a graph of Cost of Training data vs threshold value and choose the value with lowest cost as the threshold for classifying the bankruptcy status. The ratio of weightage for False Positive and False Negative is 1:35. Here we are trying to say that wrongly predicting the companies which will go bankrupt as Non-Bankrupt is more serious and risky than predicting the vis versa. Therefore, it is important to reduce the False Negatives.

#Optimal Cutoff
searchgrid = seq(0.01, 0.99, 0.01)
result = cbind(searchgrid, NA)

cost1 <- function(r, pi) {
  weight1 = 35
  weight0 = 1
  c1 = (r == 1) & (pi < cutoff)  #logical vector - true if actual 1 but predict 0
  c0 = (r == 0) & (pi > cutoff)  #logical vecotr - true if actual 0 but predict 1
  return(mean(weight1 * c1 + weight0 * c0))
}

for (i in 1:length(searchgrid)) {
  cutoff <- result[i, 1]
  # assign the cost to the 2nd col
  result[i, 2] <- cost1(bankruptcy.train$DLRSN, predict(final.model, type = "response"))
}
plot(result, ylab = "Cost in Training Set", main = "Asymmetric Cost Function")

opt.cutoff <- searchgrid[which(result[,2]==min(result[,2]))]
opt.cutoff
## [1] 0.03
#The optimal cutoff probability comes out to be 0.03
#In-Sample performance of the model
#We now calculate the misclassification rate of the training dataset using this threshold value.
bankruptcy.train$pred <-  ifelse(prob.insample>opt.cutoff,1,0)
table(pred = bankruptcy.train$pred , true = bankruptcy.train$DLRSN)
##     true
## pred    0    1
##    0 1466   16
##    1 1792  531
mean(bankruptcy.train$pred != bankruptcy.train$DLRSN)
## [1] 0.4751643
#The misclassification rate obtained for training dataset is 0.4751643.This means our model is around 53% accurate in predicting the bankruptcy status.
#Out-of-Sample Performance of the Model
#Now, let us test the performance of the model on the test dataset. The 30% of the data sampled initially will be used as the test dataset. To gauge the model performance on the test data, we will predict the bankruptcy status for the companies present in the test dataset using the Final Logistic Regression Model built above. We will check for the asymmetric misclassification rate and the Area under the Curve for test dataset.

#Let us plot the ROC Curve for test dataset.
prob.outsample <- predict(final.model, bankruptcy.test, type = "response")
roc.plot(bankruptcy.test$DLRSN == "1", prob.outsample)$roc.vol
## Warning in roc.plot.default(bankruptcy.test$DLRSN == "1", prob.outsample):
## Large amount of unique predictions used as thresholds. Consider specifying
## thresholds.

##      Model     Area      p.value binorm.area
## 1 Model  1 0.872372 1.827252e-73          NA
#The Area Under the Curve (AUC) for the test dataset is 87%
#We now calculate the asymmetric classification rate for the test data set. It is to be noted that we will be using the value of cut-off probability as 0.03(the value we had derived using the cost function).

bankruptcy.test$pred <-  ifelse(prob.outsample>opt.cutoff,1,0)
table(pred = bankruptcy.test$pred , true = bankruptcy.test$DLRSN)
##     true
## pred   0   1
##    0 624   9
##    1 778 220
mean(bankruptcy.test$pred != bankruptcy.test$DLRSN)
## [1] 0.4825261
#The misclassification rate obtained for the test dataset is 0.4825261. This means our model shows nearly 52% accuracy in predicting the bankruptcy status for observations in the test dataset.

The misclassification rate and AUC (Area Under Curve) is close to each other for the training as well as the test dataset. The AUC for training dataset is slightly higher than the test data while the misclassification rate for the test data is slightly higher than the training dataset.

From the above process, we can say that the Logistic Regression model built has a decent performance for training as well as test dataset and is able to predict the bankruptcy status of the companies with an accuracy of around 52%.Therefore, for out-of-sample model is performing good considering our domain here. It is better to predict a company as bankrupt initially rather than loose out on the principal amount after the company actually goes bankrupt.

#logistic regression using probit link, just for comparision with logit link 
bankruptcy.glm.probit<- glm(DLRSN~., family=binomial(link="probit"), data=bankruptcy.train)
summary(bankruptcy.glm.probit)
## 
## Call:
## glm(formula = DLRSN ~ ., family = binomial(link = "probit"), 
##     data = bankruptcy.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1266  -0.4734  -0.2353  -0.0674   3.5601  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.30389    0.11092 -11.756  < 2e-16 ***
## R1           0.14442    0.04582   3.152 0.001622 ** 
## R2           0.31799    0.04708   6.754 1.44e-11 ***
## R3          -0.22616    0.06076  -3.722 0.000198 ***
## R4          -0.11834    0.06507  -1.819 0.068956 .  
## R5          -0.01350    0.03369  -0.401 0.688752    
## R6           0.19424    0.03445   5.638 1.72e-08 ***
## R7          -0.24098    0.05855  -4.116 3.86e-05 ***
## R8          -0.16361    0.05377  -3.043 0.002344 ** 
## R9           0.10600    0.06486   1.634 0.102207    
## R10         -0.85912    0.06181 -13.898  < 2e-16 ***
## pred        -0.14613    0.13535  -1.080 0.280325    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3133.2  on 3804  degrees of freedom
## Residual deviance: 2098.9  on 3793  degrees of freedom
## AIC: 2122.9
## 
## Number of Fisher Scoring iterations: 7
bankruptcy.glm.probit$deviance
## [1] 2098.945
AIC(bankruptcy.glm.probit)
## [1] 2122.945
BIC(bankruptcy.glm.probit)
## [1] 2197.874
#Classification Tree
library(rpart)
library(rpart.plot)

bankruptcy.rpart <- rpart(formula = DLRSN ~ ., data = bankruptcy.train, method = "class", 
                          parms = list(loss=matrix(c(0,35,1,0), nrow = 2)))
prp(bankruptcy.rpart, extra = 1)

bankruptcy.train.pred.tree = predict(bankruptcy.rpart)
bankruptcy.test.pred.tree = predict(bankruptcy.rpart,bankruptcy.test)

MSE.tree <- mean((bankruptcy.train.pred.tree-bankruptcy.train$DLRSN)^2)

MSPE.tree <- mean((bankruptcy.test.pred.tree-bankruptcy.test$DLRSN)^2)
#Pruning with cp
#cannot work with bushy trees;)
plotcp(bankruptcy.rpart)

#MSE for pruned tree
#Using complexity parameter obtained from plotcp
bankruptcy.pruned.tree <- prune(bankruptcy.rpart, cp = 0.011)
prp(bankruptcy.pruned.tree,digits = 4, extra = 1)

mean((predict(bankruptcy.pruned.tree) - bankruptcy.train$DLRSN)^2)
## [1] 0.3999889
#MSPE for pruned tree
bankruptcy.test.pred.pruned.tree = predict(bankruptcy.pruned.tree,bankruptcy.test)
mean((bankruptcy.test.pred.pruned.tree - bankruptcy.test$DLRSN)^2)
## [1] 0.3987942
##-----------------2. Random Forest-----------------------##
#The idea of random forests is to randomly select m out of p predictors as candidate variables for each split in each tree. The reason of doing this is that it can decorrelates the trees such that it reduces variance when we aggregate the trees.

#install.packages("randomForest")
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
bankruptcy.rf<- randomForest(as.factor(DLRSN)~., data = bankruptcy.train)
bankruptcy.rf
## 
## Call:
##  randomForest(formula = as.factor(DLRSN) ~ ., data = bankruptcy.train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 11.35%
## Confusion matrix:
##      0   1 class.error
## 0 3137 121  0.03713935
## 1  311 236  0.56855576
#The out-of-bag prediction is similar to LOOCV. We use full sample. In every bootstrap, the unused sample serves as testing sample, and testing error is calculated. In the end, OOB error, root mean squared error by default, is obtained. In this case, our OOB estimate of error rate is 11.25%.

#Variable Importance: The top 2 most important features are R10 and R4

varImpPlot(bankruptcy.rf)

#Plotting the error rate vs. ntree. 
plot(bankruptcy.rf, lwd=rep(2, 3))
legend("right", legend = c("OOB Error", "FPR", "FNR"), lwd=rep(2, 3), lty = c(1,2,3), col = c("black", "red", "green"))

#Misclassification rate- MSE
bankruptcy.train.pred.tree = predict(bankruptcy.rf)
mean(ifelse(bankruptcy.train$DLRSN != bankruptcy.train.pred.tree, 1, 0))
## [1] 0.1135348
#Misclassification rate- MSPE
bankruptcy.test.pred.tree = predict(bankruptcy.rf, bankruptcy.test)
mean(ifelse(bankruptcy.test$DLRSN != bankruptcy.test.pred.tree, 1, 0))
## [1] 0.1085224
#Neural Network
#install.packages("nnet")
#install.packages("neuralnet")

library(nnet)
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
bankruptcy.nnet <- nnet(DLRSN ~ ., data = bankruptcy.train, size = 1, maxit = 500, type='class')
## # weights:  14
## initial  value 927.943826 
## final  value 547.000000 
## converged
#In-sample confusion matrix for NN
prob.nnet.train = predict(bankruptcy.nnet, bankruptcy.train)
pred.nnet.train = as.numeric(prob.nnet.train > opt.cutoff)
table(bankruptcy.train$DLRSN, pred.nnet.train, dnn = c("Observed","Predicted"))
##         Predicted
## Observed    0
##        0 3258
##        1  547
#Out-of-sample confusion matrix for NN
prob.nnet.test = predict(bankruptcy.nnet, bankruptcy.test)
pred.nnet.test = as.numeric(prob.nnet.test > opt.cutoff)
table(bankruptcy.test$DLRSN, pred.nnet.test, dnn = c("Observation", "Prediction"))
##            Prediction
## Observation    0
##           0 1402
##           1  229
#Misclassification Rate - MSE
mean(ifelse(bankruptcy.train$DLRSN != pred.nnet.train, 1, 0))
## [1] 0.1437582
#Misclassification Rate - MSPE
mean(ifelse(bankruptcy.test$DLRSN != pred.nnet.test, 1, 0))
## [1] 0.1404047
nn <- neuralnet(DLRSN ~ R1+R2+R3+R4+R5+R6+R7+R8+R9+R10,
                data=bankruptcy.train,hidden=c(8,1),linear.output=FALSE,stepmax = 1e6) 
plot(nn)
#------------------------Model Comparison------------------------#
model = factor(c("Logit", "Tree", "RF", "NN"),
               levels=c("Logit", "Tree", "RF", "NN"))

 

train_mse <- c(
  mean(bankruptcy.train$pred != bankruptcy.train$DLRSN),
  mean((predict(bankruptcy.pruned.tree) - bankruptcy.train$DLRSN)^2),
  mean(ifelse(bankruptcy.train$DLRSN != bankruptcy.train.pred.tree, 1, 0)),
  mean(ifelse(bankruptcy.train$DLRSN != pred.nnet.train, 1, 0))
  )

test_mspe <- c(
  mean(bankruptcy.test$pred != bankruptcy.test$DLRSN),
  mean((bankruptcy.test.pred.pruned.tree - bankruptcy.test$DLRSN)^2),
  mean(ifelse(bankruptcy.test$DLRSN != bankruptcy.test.pred.tree, 1, 0)),
  mean(ifelse(bankruptcy.test$DLRSN != pred.nnet.test, 1, 0))
)

comparison_table <- data.frame(model=model,
                               train = train_mse,
                               test = test_mspe)

 

comparison_table$train <- round(comparison_table$train,2)
comparison_table$test <- round(comparison_table$test,2)

#install.packages("tidyr")
library(tidyr)
comparison_table1 <- gather(comparison_table, subset, mspe, 2:3)

library(knitr)
kable(comparison_table)
model train test
Logit 0.48 0.48
Tree 0.40 0.40
RF 0.11 0.11
NN 0.14 0.14
install.packages("ggplot2")
## Installing package into 'C:/Users/khann/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## Warning: package 'ggplot2' is in use and will not be installed
library(ggplot2)
ggplot(comparison_table1, aes(x=model, y=mspe, group=subset, label=mspe)) +
  geom_line(linetype="dashed", size=1.2)+
  geom_point(size=3) +
  geom_label(show_guide  = F) 
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.