club <- read.csv("LC_Data.csv")
head(club)
##        X loan_amnt       term int_rate grade sub_grade        emp_title
## 1  74670     10775  36 months   10.64%     B        B2      US Govt SBA
## 2  67945     24000  60 months   16.55%     D        D2          Teacher
## 3  47114      5000  36 months   11.49%     B        B5    HR Generalist
## 4  67453     14000  36 months    9.99%     B        B3 HR Administrator
## 5 412679     25000  60 months   10.49%     B        B3            owner
## 6 183222      3475  36 months   14.16%     C        C2                 
##   emp_length home_ownership annual_inc issue_d loan_status            purpose
## 1    7 years       MORTGAGE      87500  Jul-13  Fully Paid debt_consolidation
## 2    9 years            OWN      51500  Nov-15  Fully Paid debt_consolidation
## 3    9 years       MORTGAGE      47000  Nov-15  Fully Paid            medical
## 4  10+ years       MORTGAGE     129000  Nov-15  Fully Paid debt_consolidation
## 5  10+ years       MORTGAGE     140000  Jan-15  Fully Paid debt_consolidation
## 6                      RENT      15000  Apr-14  Fully Paid        credit_card
##                     title addr_state   dti delinq_2yrs inq_last_6mths
## 1         Chase debt loan         FL 12.59           1              1
## 2      Debt consolidation         NJ 15.26           0              3
## 3        Medical expenses         TN 27.91           0              1
## 4      Debt consolidation         CT 15.92           0              0
## 5      Debt consolidation         OH  9.63           0              0
## 6 Credit card refinancing         GA  7.36           1              2
##   mths_since_last_delinq open_acc total_pymnt total_rec_int last_pymnt_d
## 1                     21        8   12554.340       1779.34       Jan-16
## 2                     NA        8   26389.860       2389.86       Jun-16
## 3                     NA       16    5926.820        926.82       Nov-18
## 4                     NA       13   16304.237       2304.24       Dec-18
## 5                     NA       12   32028.878       7028.88       Apr-19
## 6                     11       10    3670.708        195.71       Sep-14
##   last_pymnt_amnt last_credit_pull_d application_type tot_cur_bal
## 1         1277.37             Jun-19       Individual      366119
## 2        22878.88             Feb-18       Individual      205741
## 3          164.70             Jun-19       Individual       17651
## 4            1.03             Jun-19       Individual      251327
## 5         5181.95             Jun-19       Individual      173388
## 6         3194.55             Jul-18       Individual        3733
##   acc_open_past_24mths pub_rec_bankruptcies Orig..Index issue_Month issue_Year
## 1                    5                    0       74670         Jul       2013
## 2                    0                    0       67945         Nov       2015
## 3                    7                    1       47114         Nov       2015
## 4                    5                    0       67453         Nov       2015
## 5                    4                    0      412679         Jan       2015
## 6                    5                    0      183222         Apr       2014
summary(club)
##        X            loan_amnt         term             int_rate        
##  Min.   :     0   Min.   : 1000   Length:430723      Length:430723     
##  1st Qu.: 75110   1st Qu.: 8000   Class :character   Class :character  
##  Median :168025   Median :12800   Mode  :character   Mode  :character  
##  Mean   :185236   Mean   :14874                                        
##  3rd Qu.:288922   3rd Qu.:20000                                        
##  Max.   :443576   Max.   :40000                                        
##                                                                        
##     grade            sub_grade          emp_title          emp_length       
##  Length:430723      Length:430723      Length:430723      Length:430723     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  home_ownership       annual_inc        issue_d          loan_status       
##  Length:430723      Min.   :      0   Length:430723      Length:430723     
##  Class :character   1st Qu.:  46600   Class :character   Class :character  
##  Mode  :character   Median :  65000   Mode  :character   Mode  :character  
##                     Mean   :  77519                                        
##                     3rd Qu.:  92000                                        
##                     Max.   :9573072                                        
##                                                                            
##    purpose             title            addr_state             dti        
##  Length:430723      Length:430723      Length:430723      Min.   :  0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 12.12  
##  Mode  :character   Mode  :character   Mode  :character   Median : 18.00  
##                                                           Mean   : 18.73  
##                                                           3rd Qu.: 24.51  
##                                                           Max.   :999.00  
##                                                           NA's   :159     
##   delinq_2yrs     inq_last_6mths   mths_since_last_delinq    open_acc    
##  Min.   : 0.000   Min.   :0.0000   Min.   :  0.00         Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.:0.0000   1st Qu.: 15.00         1st Qu.: 8.00  
##  Median : 0.000   Median :0.0000   Median : 31.00         Median :11.00  
##  Mean   : 0.334   Mean   :0.6061   Mean   : 33.99         Mean   :11.69  
##  3rd Qu.: 0.000   3rd Qu.:1.0000   3rd Qu.: 49.00         3rd Qu.:14.00  
##  Max.   :39.000   Max.   :7.0000   Max.   :188.00         Max.   :90.00  
##                   NA's   :1        NA's   :213683                        
##   total_pymnt    total_rec_int     last_pymnt_d       last_pymnt_amnt  
##  Min.   :    0   Min.   :    0.0   Length:430723      Min.   :    0.0  
##  1st Qu.: 6883   1st Qu.:  922.6   Class :character   1st Qu.:  329.5  
##  Median :11995   Median : 1862.3   Mode  :character   Median :  743.1  
##  Mean   :14603   Mean   : 2859.6                      Mean   : 4087.8  
##  3rd Qu.:20012   3rd Qu.: 3741.6                      3rd Qu.: 5602.4  
##  Max.   :63297   Max.   :28285.6                      Max.   :42192.1  
##                                                                        
##  last_credit_pull_d application_type    tot_cur_bal      acc_open_past_24mths
##  Length:430723      Length:430723      Min.   :      0   Min.   : 0.000      
##  Class :character   Class :character   1st Qu.:  29557   1st Qu.: 2.000      
##  Mode  :character   Mode  :character   Median :  80215   Median : 4.000      
##                                        Mean   : 141668   Mean   : 4.547      
##                                        3rd Qu.: 211315   3rd Qu.: 6.000      
##                                        Max.   :5445012   Max.   :64.000      
##                                        NA's   :6878      NA's   :1800        
##  pub_rec_bankruptcies  Orig..Index     issue_Month          issue_Year  
##  Min.   :0.0000       Min.   :     0   Length:430723      Min.   :2012  
##  1st Qu.:0.0000       1st Qu.: 39093   Class :character   1st Qu.:2015  
##  Median :0.0000       Median : 78220   Mode  :character   Median :2016  
##  Mean   :0.1329       Mean   :106100                      Mean   :2015  
##  3rd Qu.:0.0000       3rd Qu.:137919                      3rd Qu.:2017  
##  Max.   :8.0000       Max.   :421091                      Max.   :2017  
## 
## a) Comb_Risk_One: Create a binary column by combining categories A and B (Low Risk) into one category and all the remaining categories in another (High Risk).
## b) Comb_Risk_Two: Create a binary column by combining categories A, B and C (Low Risk) into one category and all the remaining categories in another (High Risk).

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
club<-club %>%
  mutate(Comb_Risk_One = ifelse(grade=='A' | grade=='B', 0, 1))
head(club)
##        X loan_amnt       term int_rate grade sub_grade        emp_title
## 1  74670     10775  36 months   10.64%     B        B2      US Govt SBA
## 2  67945     24000  60 months   16.55%     D        D2          Teacher
## 3  47114      5000  36 months   11.49%     B        B5    HR Generalist
## 4  67453     14000  36 months    9.99%     B        B3 HR Administrator
## 5 412679     25000  60 months   10.49%     B        B3            owner
## 6 183222      3475  36 months   14.16%     C        C2                 
##   emp_length home_ownership annual_inc issue_d loan_status            purpose
## 1    7 years       MORTGAGE      87500  Jul-13  Fully Paid debt_consolidation
## 2    9 years            OWN      51500  Nov-15  Fully Paid debt_consolidation
## 3    9 years       MORTGAGE      47000  Nov-15  Fully Paid            medical
## 4  10+ years       MORTGAGE     129000  Nov-15  Fully Paid debt_consolidation
## 5  10+ years       MORTGAGE     140000  Jan-15  Fully Paid debt_consolidation
## 6                      RENT      15000  Apr-14  Fully Paid        credit_card
##                     title addr_state   dti delinq_2yrs inq_last_6mths
## 1         Chase debt loan         FL 12.59           1              1
## 2      Debt consolidation         NJ 15.26           0              3
## 3        Medical expenses         TN 27.91           0              1
## 4      Debt consolidation         CT 15.92           0              0
## 5      Debt consolidation         OH  9.63           0              0
## 6 Credit card refinancing         GA  7.36           1              2
##   mths_since_last_delinq open_acc total_pymnt total_rec_int last_pymnt_d
## 1                     21        8   12554.340       1779.34       Jan-16
## 2                     NA        8   26389.860       2389.86       Jun-16
## 3                     NA       16    5926.820        926.82       Nov-18
## 4                     NA       13   16304.237       2304.24       Dec-18
## 5                     NA       12   32028.878       7028.88       Apr-19
## 6                     11       10    3670.708        195.71       Sep-14
##   last_pymnt_amnt last_credit_pull_d application_type tot_cur_bal
## 1         1277.37             Jun-19       Individual      366119
## 2        22878.88             Feb-18       Individual      205741
## 3          164.70             Jun-19       Individual       17651
## 4            1.03             Jun-19       Individual      251327
## 5         5181.95             Jun-19       Individual      173388
## 6         3194.55             Jul-18       Individual        3733
##   acc_open_past_24mths pub_rec_bankruptcies Orig..Index issue_Month issue_Year
## 1                    5                    0       74670         Jul       2013
## 2                    0                    0       67945         Nov       2015
## 3                    7                    1       47114         Nov       2015
## 4                    5                    0       67453         Nov       2015
## 5                    4                    0      412679         Jan       2015
## 6                    5                    0      183222         Apr       2014
##   Comb_Risk_One
## 1             0
## 2             1
## 3             0
## 4             0
## 5             0
## 6             1
club<-club%>%mutate(Comb_Risk_Two = ifelse(grade=='A' | grade=='B' |grade=='C',
0, 1))


##break the file into two files filtering out data for 2012, 13, and 14 in one file and 2015, 16 and 17 in another file.

club_12_13_14<- subset(club,club$issue_Year %in% c(2012,2013,2014))
club_15_16_17<- subset(club,club$issue_Year %in% c(2015,2016,2017))
##Check for null values.Impute the missing values.

summary(club_12_13_14)
##        X            loan_amnt         term             int_rate        
##  Min.   :     1   Min.   : 1000   Length:105814      Length:105814     
##  1st Qu.: 34772   1st Qu.: 8000   Class :character   Class :character  
##  Median : 78381   Median :12800   Mode  :character   Mode  :character  
##  Mean   : 89995   Mean   :14630                                        
##  3rd Qu.:131891   3rd Qu.:20000                                        
##  Max.   :235628   Max.   :35000                                        
##                                                                        
##     grade            sub_grade          emp_title          emp_length       
##  Length:105814      Length:105814      Length:105814      Length:105814     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  home_ownership       annual_inc        issue_d          loan_status       
##  Length:105814      Min.   :   4000   Length:105814      Length:105814     
##  Class :character   1st Qu.:  45000   Class :character   Class :character  
##  Mode  :character   Median :  63000   Mode  :character   Mode  :character  
##                     Mean   :  73794                                        
##                     3rd Qu.:  89500                                        
##                     Max.   :7446395                                        
##                                                                            
##    purpose             title            addr_state             dti       
##  Length:105814      Length:105814      Length:105814      Min.   : 0.00  
##  Class :character   Class :character   Class :character   1st Qu.:11.71  
##  Mode  :character   Mode  :character   Mode  :character   Median :17.24  
##                                                           Mean   :17.62  
##                                                           3rd Qu.:23.26  
##                                                           Max.   :39.99  
##                                                                          
##   delinq_2yrs      inq_last_6mths   mths_since_last_delinq    open_acc    
##  Min.   : 0.0000   Min.   :0.0000   Min.   :  0.00         Min.   : 1.00  
##  1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.: 15.00         1st Qu.: 8.00  
##  Median : 0.0000   Median :0.0000   Median : 31.00         Median :11.00  
##  Mean   : 0.2955   Mean   :0.7784   Mean   : 34.04         Mean   :11.37  
##  3rd Qu.: 0.0000   3rd Qu.:1.0000   3rd Qu.: 49.00         3rd Qu.:14.00  
##  Max.   :22.0000   Max.   :7.0000   Max.   :180.00         Max.   :84.00  
##                                     NA's   :55873                         
##   total_pymnt    total_rec_int   last_pymnt_d       last_pymnt_amnt  
##  Min.   :    0   Min.   :    0   Length:105814      Min.   :    0.0  
##  1st Qu.: 8249   1st Qu.: 1118   Class :character   1st Qu.:  367.3  
##  Median :13813   Median : 2116   Mode  :character   Median : 1052.7  
##  Mean   :16438   Mean   : 3314                      Mean   : 4352.6  
##  3rd Qu.:22385   3rd Qu.: 4196                      3rd Qu.: 6446.9  
##  Max.   :63297   Max.   :28193                      Max.   :36170.1  
##                                                                      
##  last_credit_pull_d application_type    tot_cur_bal      acc_open_past_24mths
##  Length:105814      Length:105814      Min.   :      0   Min.   : 0.000      
##  Class :character   Class :character   1st Qu.:  28485   1st Qu.: 2.000      
##  Mode  :character   Mode  :character   Median :  81304   Median : 4.000      
##                                        Mean   : 138499   Mean   : 4.195      
##                                        3rd Qu.: 208202   3rd Qu.: 6.000      
##                                        Max.   :3610743   Max.   :53.000      
##                                        NA's   :6878      NA's   :1800        
##  pub_rec_bankruptcies  Orig..Index     issue_Month          issue_Year  
##  Min.   :0.0000       Min.   :     1   Length:105814      Min.   :2012  
##  1st Qu.:0.0000       1st Qu.: 52488   Class :character   1st Qu.:2013  
##  Median :0.0000       Median :105865   Mode  :character   Median :2014  
##  Mean   :0.1131       Mean   :107057                      Mean   :2013  
##  3rd Qu.:0.0000       3rd Qu.:158785                      3rd Qu.:2014  
##  Max.   :7.0000       Max.   :235628                      Max.   :2014  
##                                                                         
##  Comb_Risk_One    Comb_Risk_Two  
##  Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.000  
##  Median :1.0000   Median :0.000  
##  Mean   :0.5535   Mean   :0.278  
##  3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :1.000  
## 
summary(club_15_16_17)
##        X            loan_amnt         term             int_rate        
##  Min.   :     0   Min.   : 1000   Length:324909      Length:324909     
##  1st Qu.:107872   1st Qu.: 8000   Class :character   Class :character  
##  Median :215894   Median :12800   Mode  :character   Mode  :character  
##  Mean   :216253   Mean   :14953                                        
##  3rd Qu.:324189   3rd Qu.:20000                                        
##  Max.   :443576   Max.   :40000                                        
##                                                                        
##     grade            sub_grade          emp_title          emp_length       
##  Length:324909      Length:324909      Length:324909      Length:324909     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  home_ownership       annual_inc        issue_d          loan_status       
##  Length:324909      Min.   :      0   Length:324909      Length:324909     
##  Class :character   1st Qu.:  47000   Class :character   Class :character  
##  Mode  :character   Median :  65000   Mode  :character   Mode  :character  
##                     Mean   :  78732                                        
##                     3rd Qu.:  95000                                        
##                     Max.   :9573072                                        
##                                                                            
##    purpose             title            addr_state             dti        
##  Length:324909      Length:324909      Length:324909      Min.   :  0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 12.27  
##  Mode  :character   Mode  :character   Mode  :character   Median : 18.27  
##                                                           Mean   : 19.09  
##                                                           3rd Qu.: 24.94  
##                                                           Max.   :999.00  
##                                                           NA's   :159     
##   delinq_2yrs      inq_last_6mths mths_since_last_delinq    open_acc   
##  Min.   : 0.0000   Min.   :0.00   Min.   :  0.00         Min.   : 0.0  
##  1st Qu.: 0.0000   1st Qu.:0.00   1st Qu.: 15.00         1st Qu.: 8.0  
##  Median : 0.0000   Median :0.00   Median : 31.00         Median :11.0  
##  Mean   : 0.3466   Mean   :0.55   Mean   : 33.98         Mean   :11.8  
##  3rd Qu.: 0.0000   3rd Qu.:1.00   3rd Qu.: 50.00         3rd Qu.:15.0  
##  Max.   :39.0000   Max.   :6.00   Max.   :188.00         Max.   :90.0  
##                    NA's   :1      NA's   :157810                       
##   total_pymnt    total_rec_int     last_pymnt_d       last_pymnt_amnt  
##  Min.   :    0   Min.   :    0.0   Length:324909      Min.   :    0.0  
##  1st Qu.: 6554   1st Qu.:  868.6   Class :character   1st Qu.:  320.1  
##  Median :11461   Median : 1769.9   Mode  :character   Median :  683.5  
##  Mean   :14006   Mean   : 2711.5                      Mean   : 4001.5  
##  3rd Qu.:19075   3rd Qu.: 3616.2                      3rd Qu.: 5229.3  
##  Max.   :62942   Max.   :28285.6                      Max.   :42192.1  
##                                                                        
##  last_credit_pull_d application_type    tot_cur_bal      acc_open_past_24mths
##  Length:324909      Length:324909      Min.   :      0   Min.   : 0.00       
##  Class :character   Class :character   1st Qu.:  29903   1st Qu.: 2.00       
##  Mode  :character   Mode  :character   Median :  79809   Median : 4.00       
##                                        Mean   : 142633   Mean   : 4.66       
##                                        3rd Qu.: 212247   3rd Qu.: 6.00       
##                                        Max.   :5445012   Max.   :64.00       
##                                                                              
##  pub_rec_bankruptcies  Orig..Index     issue_Month          issue_Year  
##  Min.   :0.0000       Min.   :     0   Length:324909      Min.   :2015  
##  1st Qu.:0.0000       1st Qu.: 36132   Class :character   1st Qu.:2015  
##  Median :0.0000       Median : 72230   Mode  :character   Median :2016  
##  Mean   :0.1393       Mean   :105788                      Mean   :2016  
##  3rd Qu.:0.0000       3rd Qu.:117884                      3rd Qu.:2017  
##  Max.   :8.0000       Max.   :421091                      Max.   :2017  
##                                                                         
##  Comb_Risk_One    Comb_Risk_Two   
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000  
##  Mean   :0.5327   Mean   :0.2268  
##  3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000  
## 
club_12_13_14$mths_since_last_delinq[is.na(club_12_13_14$mths_since_last_delinq)
]=
mean(club_12_13_14$mths_since_last_delinq,na.rm=T)
club_15_16_17$mths_since_last_delinq[is.na(club_15_16_17$mths_since_last_delinq)
]=
mean(club_15_16_17$mths_since_last_delinq,na.rm=T)

## predict Low and High-risk categories (for the two new response variables)
## using various modeling techniques like Naïve Bayes’, KNN, Logistic Regression, and CART model

library(dplyr)
club_12_13_14 <- club_12_13_14 %>% mutate(int_rate=as.numeric(gsub("%", "",
int_rate)))
club_12_13_14<-club_12_13_14[complete.cases(club_12_13_14), ]

## BASIC EDA

## Relation between annual income and total payment.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
ggplot(club_12_13_14) + geom_point(aes(x = annual_inc, y = total_pymnt), colour
= "navy", alpha = 0.7)

## barchart of home ownership vs loan amount

data.for.plot <- aggregate(club_12_13_14$loan_amnt, by =
list(club_12_13_14$home_ownership), FUN = mean)
barplot(data.for.plot$x, names.arg = data.for.plot$Group.1,
xlab = "Home ownership", ylab = "Mean loan amount")

## Histogram of loan amount

hist(club_12_13_14$loan_amnt, xlab = "Loan Amount")

## Find the outliers

FindOutliers <- function(data) {
lowerq = quantile(data)[2]
upperq = quantile(data)[4]
iqr = upperq - lowerq
### identify extreme outliers
extreme.threshold.upper = (iqr * 3) + upperq
extreme.threshold.lower = lowerq - (iqr * 3)
result <- which(data > extreme.threshold.upper | data <
extreme.threshold.lower)
length(result)
}
df <- subset(club_12_13_14,select=c(loan_amnt,int_rate,annual_inc,dti,total_pymnt,total_rec_int,last_pymnt_amnt,tot_cur_bal,Orig..Index)) 
apply(df, 2, FindOutliers)
##       loan_amnt        int_rate      annual_inc             dti     total_pymnt 
##               0               0            1241               0               0 
##   total_rec_int last_pymnt_amnt     tot_cur_bal     Orig..Index 
##            2244            1284             538               0
## Check for multicollinearity
### Use correlation function for correlation analysis.Variables with high correlation leads to multicollinearity.These variables need to be dropped.

M<-cor(df)

## total_pymnt,loan_amnt and total_rec_int are correlated.We can remove 2 of the 3 variables.

df<-select(df,-total_pymnt,-total_rec_int)

###Scaling and standardization of predictors.

df.scaled = scale(df, center= TRUE, scale=TRUE)
df.scaled<-as.data.frame(df.scaled)

##combine scaled numerical columns with categorical column in original data frame 
club_12_13_14.dummy<- select(club_12_13_14,term,emp_length,home_ownership,loan_status,purpose,addr_state,application_type)

library(forcats)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(dplyr)
club_12_13_14.dummy$term<-(fct_lump(club_12_13_14.dummy$term, n=5))
club_12_13_14.dummy$emp_length<-fct_lump(club_12_13_14.dummy$emp_length, n=5)
club_12_13_14.dummy$home_ownership<-fct_lump(club_12_13_14.dummy$home_ownership,
n = 5)
club_12_13_14.dummy$loan_status<-fct_lump(club_12_13_14.dummy$loan_status, n =
5)
club_12_13_14.dummy$purpose<-fct_lump(club_12_13_14.dummy$purpose, n=5)
club_12_13_14.dummy$addr_state<-fct_lump(club_12_13_14.dummy$addr_state,n=5)
club_12_13_14.dummy$application_type<-
fct_lump(club_12_13_14.dummy$application_type, n=5)
df.dummy<-dummy.data.frame(club_12_13_14.dummy)
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
club_12_13_14.nw<-
cbind(df.dummy,df.scaled,club_12_13_14$Comb_Risk_One,club_12_13_14$Comb_Risk_Two
)
club_12_13_14.nw$`club_12_13_14$Comb_Risk_One`<-
as.factor(club_12_13_14.nw$`club_12_13_14$Comb_Risk_One`)
club_12_13_14.nw$`club_12_13_14$Comb_Risk_Two`<-
as.factor(club_12_13_14.nw$`club_12_13_14$Comb_Risk_Two`)
club_12_13_14.nw<-club_12_13_14.nw %>%mutate_if(is.integer,as.factor)

## train-test split and build different models.

set.seed(111)
train.index <- sample(row.names(club_12_13_14.nw), 0.6*dim(club_12_13_14.nw)[1])
valid.index <- setdiff(row.names(club_12_13_14.nw), train.index)
train.df <- club_12_13_14.nw[train.index, ]
valid.df <- club_12_13_14.nw[valid.index, ]

## KNN for a) Comb_Risk_One

library(class)
nn <- knn(train.df,valid.df,cl=train.df[, 39],k=13)
tab <- table(nn,valid.df[, 39])
accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}
accuracy(tab)
## [1] 99.45925
## KNN for b) Comb_Risk_Two

library(class)
nn1 <- knn(train.df,valid.df,cl=train.df[, 40],k=13)
tab1 <- table(nn1,valid.df[, 40])
accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}
accuracy(tab1)
## [1] 99.42388
##Naïve Bayes’ for a) Comb_Risk_One

library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
nb<- naiveBayes(train.df[, 39] ~ ., data = train.df)
pred.class <- predict(nb, newdata = valid.df,type="class")
confusionMatrix(pred.class, valid.df[, 39])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 17329    86
##          1    13 22147
##                                         
##                Accuracy : 0.9975        
##                  95% CI : (0.997, 0.998)
##     No Information Rate : 0.5618        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.9949        
##                                         
##  Mcnemar's Test P-Value : 4.612e-13     
##                                         
##             Sensitivity : 0.9993        
##             Specificity : 0.9961        
##          Pos Pred Value : 0.9951        
##          Neg Pred Value : 0.9994        
##              Prevalence : 0.4382        
##          Detection Rate : 0.4379        
##    Detection Prevalence : 0.4401        
##       Balanced Accuracy : 0.9977        
##                                         
##        'Positive' Class : 0             
## 
##Naïve Bayes’ for a) Comb_Risk_Two

library(caret)
nb<- naiveBayes(train.df[, 40] ~ ., data = train.df)
pred.class <- predict(nb, newdata = valid.df)
confusionMatrix(pred.class, valid.df[, 40])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 28332    35
##          1    26 11182
##                                          
##                Accuracy : 0.9985         
##                  95% CI : (0.998, 0.9988)
##     No Information Rate : 0.7166         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9962         
##                                          
##  Mcnemar's Test P-Value : 0.3057         
##                                          
##             Sensitivity : 0.9991         
##             Specificity : 0.9969         
##          Pos Pred Value : 0.9988         
##          Neg Pred Value : 0.9977         
##              Prevalence : 0.7166         
##          Detection Rate : 0.7159         
##    Detection Prevalence : 0.7168         
##       Balanced Accuracy : 0.9980         
##                                          
##        'Positive' Class : 0              
## 
##Logistic regression for a)Comb_Risk_One

log <- glm(`club_12_13_14$Comb_Risk_One` ~ ., data = train.df, family =
"binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
options(scipen=999)
summary(log)
## 
## Call:
## glm(formula = `club_12_13_14$Comb_Risk_One` ~ ., family = "binomial", 
##     data = train.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8426  -0.0139   0.0000   0.0182   6.5128  
## 
## Coefficients: (6 not defined because of singularities)
##                                       Estimate    Std. Error z value
## (Intercept)                           4.271350      0.973039   4.390
## `term 36 months`1                    -1.135147      0.065877 -17.231
## `term 60 months`1                           NA            NA      NA
## `emp_length< 1 year`1                 0.131427      0.090140   1.458
## `emp_length10+ years`1                0.079974      0.054015   1.481
## `emp_length2 years`1                  0.162556      0.081755   1.988
## `emp_length3 years`1                 -0.009620      0.085630  -0.112
## `emp_length5 years`1                 -0.195925      0.097236  -2.015
## emp_lengthOther1                            NA            NA      NA
## home_ownershipMORTGAGE1              -0.233234      0.057253  -4.074
## home_ownershipNONE1                  -2.204006      1.432948  -1.538
## home_ownershipOTHER1                 -3.585638      5.163079  -0.694
## home_ownershipOWN1                   -0.021199      0.082093  -0.258
## home_ownershipRENT1                         NA            NA      NA
## `loan_statusCharged Off`1            -0.272893      0.965963  -0.283
## loan_statusCurrent1                   0.732117      0.977940   0.749
## `loan_statusFully Paid`1             -0.545828      0.965318  -0.565
## `loan_statusIn Grace Period`1         0.046238      2.438590   0.019
## `loan_statusLate (16-30 days)`1      13.037180    193.213203   0.067
## `loan_statusLate (31-120 days)`1            NA            NA      NA
## purposecredit_card1                  -0.763437      0.130678  -5.842
## purposedebt_consolidation1           -0.370449      0.125213  -2.959
## purposehome_improvement1             -0.081582      0.155479  -0.525
## purposemajor_purchase1               -0.057675      0.210999  -0.273
## purposeother1                         0.240540      0.169938   1.415
## purposeOther1                               NA            NA      NA
## addr_stateCA1                        -0.122681      0.066184  -1.854
## addr_stateFL1                         0.159390      0.091863   1.735
## addr_stateIL1                         0.137504      0.118484   1.161
## addr_stateNY1                        -0.093689      0.083287  -1.125
## addr_stateTX1                        -0.001494      0.082034  -0.018
## addr_stateOther1                            NA            NA      NA
## loan_amnt                            -0.095510      0.028942  -3.300
## int_rate                             12.146254      0.158122  76.815
## annual_inc                            0.023645      0.016279   1.453
## dti                                   0.099273      0.022863   4.342
## last_pymnt_amnt                       0.088968      0.029459   3.020
## tot_cur_bal                          -0.005803      0.028076  -0.207
## Orig..Index                          -0.481437      0.022935 -20.992
## `club_12_13_14$Comb_Risk_Two`1       51.302730 507373.435564   0.000
##                                              Pr(>|z|)    
## (Intercept)                             0.00001135056 ***
## `term 36 months`1                < 0.0000000000000002 ***
## `term 60 months`1                                  NA    
## `emp_length< 1 year`1                        0.144831    
## `emp_length10+ years`1                       0.138717    
## `emp_length2 years`1                         0.046775 *  
## `emp_length3 years`1                         0.910548    
## `emp_length5 years`1                         0.043909 *  
## emp_lengthOther1                                   NA    
## home_ownershipMORTGAGE1                 0.00004626274 ***
## home_ownershipNONE1                          0.124026    
## home_ownershipOTHER1                         0.487383    
## home_ownershipOWN1                           0.796229    
## home_ownershipRENT1                                NA    
## `loan_statusCharged Off`1                    0.777553    
## loan_statusCurrent1                          0.454079    
## `loan_statusFully Paid`1                     0.571775    
## `loan_statusIn Grace Period`1                0.984872    
## `loan_statusLate (16-30 days)`1              0.946203    
## `loan_statusLate (31-120 days)`1                   NA    
## purposecredit_card1                     0.00000000515 ***
## purposedebt_consolidation1                   0.003091 ** 
## purposehome_improvement1                     0.599780    
## purposemajor_purchase1                       0.784592    
## purposeother1                                0.156934    
## purposeOther1                                      NA    
## addr_stateCA1                                0.063793 .  
## addr_stateFL1                                0.082724 .  
## addr_stateIL1                                0.245833    
## addr_stateNY1                                0.260631    
## addr_stateTX1                                0.985467    
## addr_stateOther1                                   NA    
## loan_amnt                                    0.000967 ***
## int_rate                         < 0.0000000000000002 ***
## annual_inc                                   0.146362    
## dti                                     0.00001410975 ***
## last_pymnt_amnt                              0.002527 ** 
## tot_cur_bal                                  0.836240    
## Orig..Index                      < 0.0000000000000002 ***
## `club_12_13_14$Comb_Risk_Two`1               0.999919    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 81347  on 59360  degrees of freedom
## Residual deviance: 13247  on 59327  degrees of freedom
## AIC: 13315
## 
## Number of Fisher Scoring iterations: 12
pred <- predict(log, valid.df, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
confusionMatrix(as.factor(ifelse(pred > 0.5, 1, 0)),
valid.df$`club_12_13_14$Comb_Risk_One`)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 16475  1089
##          1   867 21144
##                                                
##                Accuracy : 0.9506               
##                  95% CI : (0.9484, 0.9527)     
##     No Information Rate : 0.5618               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.8998               
##                                                
##  Mcnemar's Test P-Value : 0.0000005823         
##                                                
##             Sensitivity : 0.9500               
##             Specificity : 0.9510               
##          Pos Pred Value : 0.9380               
##          Neg Pred Value : 0.9606               
##              Prevalence : 0.4382               
##          Detection Rate : 0.4163               
##    Detection Prevalence : 0.4438               
##       Balanced Accuracy : 0.9505               
##                                                
##        'Positive' Class : 0                    
## 
##Logistic regression for b)Comb_Risk_Two

log <- glm(`club_12_13_14$Comb_Risk_Two` ~ ., data = train.df, family =
"binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
options(scipen=999)
summary(log)
## 
## Call:
## glm(formula = `club_12_13_14$Comb_Risk_Two` ~ ., family = "binomial", 
##     data = train.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3122  -0.0630   0.0000   0.0018   6.3826  
## 
## Coefficients: (6 not defined because of singularities)
##                                     Estimate  Std. Error z value
## (Intercept)                      -16.1064942 136.3729857  -0.118
## `term 36 months`1                 -0.2959404   0.0530117  -5.583
## `term 60 months`1                         NA          NA      NA
## `emp_length< 1 year`1              0.1368701   0.0896253   1.527
## `emp_length10+ years`1            -0.0295975   0.0530489  -0.558
## `emp_length2 years`1               0.0854241   0.0843317   1.013
## `emp_length3 years`1               0.0954012   0.0877952   1.087
## `emp_length5 years`1              -0.4281566   0.0982763  -4.357
## emp_lengthOther1                          NA          NA      NA
## home_ownershipMORTGAGE1           -0.1114470   0.0582006  -1.915
## home_ownershipNONE1               -5.6428155  14.9448168  -0.378
## home_ownershipOTHER1              -2.4751657   2.2223750  -1.114
## home_ownershipOWN1                 0.0751672   0.0814497   0.923
## home_ownershipRENT1                       NA          NA      NA
## `loan_statusCharged Off`1         -0.7775067   0.7412507  -1.049
## loan_statusCurrent1                1.0032358   0.7497336   1.338
## `loan_statusFully Paid`1          -1.1222327   0.7412750  -1.514
## `loan_statusIn Grace Period`1      1.6675006   1.2933286   1.289
## `loan_statusLate (16-30 days)`1   -5.3800810 140.9438430  -0.038
## `loan_statusLate (31-120 days)`1          NA          NA      NA
## purposecredit_card1               -0.8645642   0.1155635  -7.481
## purposedebt_consolidation1        -0.5651192   0.1062272  -5.320
## purposehome_improvement1          -0.4478086   0.1380899  -3.243
## purposemajor_purchase1            -0.0004094   0.1946646  -0.002
## purposeother1                     -0.0303945   0.1399472  -0.217
## purposeOther1                             NA          NA      NA
## addr_stateCA1                     -0.1694840   0.0666801  -2.542
## addr_stateFL1                      0.1866297   0.0877697   2.126
## addr_stateIL1                      0.0716518   0.1129772   0.634
## addr_stateNY1                      0.0250077   0.0827985   0.302
## addr_stateTX1                     -0.1747501   0.0844757  -2.069
## addr_stateOther1                          NA          NA      NA
## loan_amnt                         -0.1202968   0.0297146  -4.048
## int_rate                           8.3566683   0.1039750  80.372
## annual_inc                         0.0700708   0.0414379   1.691
## dti                                0.1868972   0.0232316   8.045
## last_pymnt_amnt                    0.1436006   0.0258449   5.556
## tot_cur_bal                       -0.0123406   0.0325589  -0.379
## Orig..Index                       -0.0344380   0.0224856  -1.532
## `club_12_13_14$Comb_Risk_One`1    13.3757991 136.3709396   0.098
##                                              Pr(>|z|)    
## (Intercept)                                   0.90598    
## `term 36 months`1                0.000000023702396143 ***
## `term 60 months`1                                  NA    
## `emp_length< 1 year`1                         0.12673    
## `emp_length10+ years`1                        0.57689    
## `emp_length2 years`1                          0.31108    
## `emp_length3 years`1                          0.27720    
## `emp_length5 years`1             0.000013206059995968 ***
## emp_lengthOther1                                   NA    
## home_ownershipMORTGAGE1                       0.05551 .  
## home_ownershipNONE1                           0.70575    
## home_ownershipOTHER1                          0.26539    
## home_ownershipOWN1                            0.35608    
## home_ownershipRENT1                                NA    
## `loan_statusCharged Off`1                     0.29422    
## loan_statusCurrent1                           0.18086    
## `loan_statusFully Paid`1                      0.13005    
## `loan_statusIn Grace Period`1                 0.19729    
## `loan_statusLate (16-30 days)`1               0.96955    
## `loan_statusLate (31-120 days)`1                   NA    
## purposecredit_card1              0.000000000000073596 ***
## purposedebt_consolidation1       0.000000103818082074 ***
## purposehome_improvement1                      0.00118 ** 
## purposemajor_purchase1                        0.99832    
## purposeother1                                 0.82806    
## purposeOther1                                      NA    
## addr_stateCA1                                 0.01103 *  
## addr_stateFL1                                 0.03347 *  
## addr_stateIL1                                 0.52594    
## addr_stateNY1                                 0.76263    
## addr_stateTX1                                 0.03858 *  
## addr_stateOther1                                   NA    
## loan_amnt                        0.000051566026350301 ***
## int_rate                         < 0.0000000000000002 ***
## annual_inc                                    0.09084 .  
## dti                              0.000000000000000863 ***
## last_pymnt_amnt                  0.000000027564979126 ***
## tot_cur_bal                                   0.70467    
## Orig..Index                                   0.12563    
## `club_12_13_14$Comb_Risk_One`1                0.92187    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70706  on 59360  degrees of freedom
## Residual deviance: 13621  on 59327  degrees of freedom
## AIC: 13689
## 
## Number of Fisher Scoring iterations: 20
pred <- predict(log, valid.df, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
confusionMatrix(as.factor(ifelse(pred > 0.5, 1, 0)),
valid.df$`club_12_13_14$Comb_Risk_Two`)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 27457  1235
##          1   901  9982
##                                                
##                Accuracy : 0.946                
##                  95% CI : (0.9438, 0.9482)     
##     No Information Rate : 0.7166               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.8659               
##                                                
##  Mcnemar's Test P-Value : 0.0000000000005798   
##                                                
##             Sensitivity : 0.9682               
##             Specificity : 0.8899               
##          Pos Pred Value : 0.9570               
##          Neg Pred Value : 0.9172               
##              Prevalence : 0.7166               
##          Detection Rate : 0.6938               
##    Detection Prevalence : 0.7250               
##       Balanced Accuracy : 0.9291               
##                                                
##        'Positive' Class : 0                    
## 
##CART for a)Comb_Risk_One

library(rpart)
library(caret)
ct <- rpart(`club_12_13_14$Comb_Risk_One` ~ ., data = train.df, method =
"class", cp = 0, minsplit = 1)
pred.val <- predict(ct,valid.df,type = "class")
# generate confusion matrix for training data
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_One`)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 17240   101
##          1   102 22132
##                                              
##                Accuracy : 0.9949             
##                  95% CI : (0.9941, 0.9956)   
##     No Information Rate : 0.5618             
##     P-Value [Acc > NIR] : <0.0000000000000002
##                                              
##                   Kappa : 0.9896             
##                                              
##  Mcnemar's Test P-Value : 1                  
##                                              
##             Sensitivity : 0.9941             
##             Specificity : 0.9955             
##          Pos Pred Value : 0.9942             
##          Neg Pred Value : 0.9954             
##              Prevalence : 0.4382             
##          Detection Rate : 0.4356             
##    Detection Prevalence : 0.4382             
##       Balanced Accuracy : 0.9948             
##                                              
##        'Positive' Class : 0                  
## 
##CART for a)Comb_Risk_Two


library(rpart)
ct <- rpart(`club_12_13_14$Comb_Risk_Two` ~ ., data = train.df, method =
"class", cp = 0, minsplit = 1)
pred.val <- predict(ct,valid.df,type = "class")
# generate confusion matrix for training data
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_Two`)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 28178   169
##          1   180 11048
##                                              
##                Accuracy : 0.9912             
##                  95% CI : (0.9902, 0.9921)   
##     No Information Rate : 0.7166             
##     P-Value [Acc > NIR] : <0.0000000000000002
##                                              
##                   Kappa : 0.9783             
##                                              
##  Mcnemar's Test P-Value : 0.5925             
##                                              
##             Sensitivity : 0.9937             
##             Specificity : 0.9849             
##          Pos Pred Value : 0.9940             
##          Neg Pred Value : 0.9840             
##              Prevalence : 0.7166             
##          Detection Rate : 0.7120             
##    Detection Prevalence : 0.7163             
##       Balanced Accuracy : 0.9893             
##                                              
##        'Positive' Class : 0                  
## 
## From the accuracy results it can be seen that models Naive Bayes,KNN and CART perform really well.Of this Naive Bayes classifier performs slightly better.
#For period 2015-2017
## predict Low and High-risk categories (for the two new response variables) using various modeling techniques like Naïve Bayes’, KNN, Logistic Regression, and CART model

club_15_16_17 <- club_15_16_17 %>% mutate(int_rate=as.numeric(gsub("%", "",
int_rate)))
club_15_16_17<-club_15_16_17[complete.cases(club_15_16_17), ]

## BASIC EDA
## Relation between annual income and total payment.

library(ggplot2)
ggplot(club_15_16_17) + geom_point(aes(x = annual_inc, y = total_pymnt), colour
= "navy", alpha = 0.7)

## barchart of home ownership vs loan amount

data.for.plot <- aggregate(club_15_16_17$loan_amnt, by =
list(club_15_16_17$home_ownership), FUN = mean)
barplot(data.for.plot$x, names.arg = data.for.plot$Group.1,
xlab = "Home ownership", ylab = "Mean loan amount")

## Histogram of loan amount

hist(club_15_16_17$loan_amnt, xlab = "Loan Amount")

## Find the outliers

FindOutliers <- function(data) {
lowerq = quantile(data)[2]
upperq = quantile(data)[4]
iqr = upperq - lowerq
### identify extreme outliers
extreme.threshold.upper = (iqr * 3) + upperq
extreme.threshold.lower = lowerq - (iqr * 3)
result <- which(data > extreme.threshold.upper | data <
extreme.threshold.lower)
length(result)
}
df <- subset(club_15_16_17,select=c(loan_amnt,int_rate,annual_inc,dti,total_pymnt,total_rec_int,last_pymnt_amnt,tot_cur_bal,Orig..Index))

apply(df, 2, FindOutliers)
##       loan_amnt        int_rate      annual_inc             dti     total_pymnt 
##               0               0            5189             771              22 
##   total_rec_int last_pymnt_amnt     tot_cur_bal     Orig..Index 
##            4836           13306            2204           14333
## Check for multicollinearity
### Use correlation function for correlation analysis.Variables with high correlation leads to multicollinearity.These variables need to be dropped.

M<-cor(df)

## total_pymnt,loan_amnt and total_rec_int are correlated.We can remove 2 of the 3 variables.
df<-select(df,-total_pymnt,-total_rec_int)

###Scaling and standardization of predictors.

df.scaled = scale(df, center= TRUE, scale=TRUE)
df.scaled=as.data.frame(df.scaled)

##combine scaled numerical columns with categorical column in original data frame 

club_15_16_17.dummy<- select(club_15_16_17,term,emp_length,home_ownership,loan_status,purpose,addr_state,application_type)
library(forcats)
library(dummies)
library(dplyr)
club_15_16_17.dummy$term<-(fct_lump(club_15_16_17.dummy$term, n=5))
club_15_16_17.dummy$emp_length<-fct_lump(club_15_16_17.dummy$emp_length, n=5)
club_15_16_17.dummy$home_ownership<-fct_lump(club_15_16_17.dummy$home_ownership,
n = 5)
club_15_16_17.dummy$loan_status<-fct_lump(club_15_16_17.dummy$loan_status, n =
5)
club_15_16_17.dummy$purpose<-fct_lump(club_15_16_17.dummy$purpose, n=5)
club_15_16_17.dummy$addr_state<-fct_lump(club_15_16_17.dummy$addr_state,n=5)
club_15_16_17.dummy$application_type<-
fct_lump(club_15_16_17.dummy$application_type, n=5)
df.dummy<-dummy.data.frame(club_15_16_17.dummy)
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
club_15_16_17.nw<-cbind(df.dummy,df.scaled,club_15_16_17$Comb_Risk_One,club_15_16_17$Comb_Risk_Two
)
club_15_16_17.nw$`club_15_16_17$Comb_Risk_One`<-
as.factor(club_15_16_17.nw$`club_15_16_17$Comb_Risk_One`)
club_15_16_17.nw$`club_15_16_17$Comb_Risk_Two`<-
as.factor(club_15_16_17.nw$`club_15_16_17$Comb_Risk_Two`)
club_15_16_17.nw<-club_15_16_17.nw %>%mutate_if(is.integer,as.factor)

## train-test split and build different models.

set.seed(111)
train.index <- sample(row.names(club_15_16_17.nw), 0.6*dim(club_15_16_17.nw)[1])
valid.index <- setdiff(row.names(club_15_16_17.nw), train.index)
train.df <- club_15_16_17.nw[train.index, ]
valid.df <- club_15_16_17.nw[valid.index, ]