# Folder that holds cs-training.csv and cs-test.csv (and receives any output
# written with a relative path). It defaults to the original author's folder;
# set the RISK_ANALYSIS_DIR environment variable to run the script on another
# machine without editing the source.
data_dir <- Sys.getenv(
  "RISK_ANALYSIS_DIR",
  unset = "C:/Users/Atul/Desktop/risk analysis"
)
setwd(data_dir)

# Training rows carry the SeriousDlqin2yrs label; in the test rows it is NA.
train1 <- read.csv("cs-training.csv")
test1 <- read.csv("cs-test.csv")
nrow(test1)
## [1] 101503
nrow(train1)
## [1] 150000
#View(train)
#View(test)

# Stack training rows first and test rows second so that every imputation and
# derived column below is computed on both sets in one pass.
combi <- rbind(train1, test1)
# Now it is time to look closely at the data and decide what else is required.
str(combi)
## 'data.frame':    251503 obs. of  12 variables:
##  $ X                                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SeriousDlqin2yrs                    : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ RevolvingUtilizationOfUnsecuredLines: num  0.766 0.957 0.658 0.234 0.907 ...
##  $ age                                 : int  45 40 38 30 49 74 57 39 27 57 ...
##  $ NumberOfTime30.59DaysPastDueNotWorse: int  2 0 1 0 1 0 0 0 0 0 ...
##  $ DebtRatio                           : num  0.803 0.1219 0.0851 0.036 0.0249 ...
##  $ MonthlyIncome                       : int  9120 2600 3042 3300 63588 3500 NA 3500 NA 23684 ...
##  $ NumberOfOpenCreditLinesAndLoans     : int  13 4 2 5 7 3 8 8 2 9 ...
##  $ NumberOfTimes90DaysLate             : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ NumberRealEstateLoansOrLines        : int  6 0 0 0 1 1 3 0 0 4 ...
##  $ NumberOfTime60.89DaysPastDueNotWorse: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NumberOfDependents                  : int  2 1 0 0 0 1 0 0 NA 2 ...
# Per-column summary of the stacked data. The output below shows which columns
# contain NAs (the response, MonthlyIncome, NumberOfDependents) and how far the
# maxima sit from the quartiles.
summary(combi)
##        X          SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines
##  Min.   :     1   Min.   :0.00     Min.   :    0.00                    
##  1st Qu.: 31439   1st Qu.:0.00     1st Qu.:    0.03                    
##  Median : 62876   Median :0.00     Median :    0.15                    
##  Mean   : 65214   Mean   :0.07     Mean   :    5.75                    
##  3rd Qu.: 94314   3rd Qu.:0.00     3rd Qu.:    0.56                    
##  Max.   :150000   Max.   :1.00     Max.   :50708.00                    
##                   NA's   :101503                                       
##       age         NumberOfTime30.59DaysPastDueNotWorse   DebtRatio       
##  Min.   :  0.00   Min.   : 0.0000                      Min.   :     0.0  
##  1st Qu.: 41.00   1st Qu.: 0.0000                      1st Qu.:     0.2  
##  Median : 52.00   Median : 0.0000                      Median :     0.4  
##  Mean   : 52.34   Mean   : 0.4343                      Mean   :   349.6  
##  3rd Qu.: 63.00   3rd Qu.: 0.0000                      3rd Qu.:     0.9  
##  Max.   :109.00   Max.   :98.0000                      Max.   :329664.0  
##                                                                          
##  MonthlyIncome     NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
##  Min.   :      0   Min.   : 0.000                  Min.   : 0.0000        
##  1st Qu.:   3400   1st Qu.: 5.000                  1st Qu.: 0.0000        
##  Median :   5400   Median : 8.000                  Median : 0.0000        
##  Mean   :   6745   Mean   : 8.453                  Mean   : 0.2784        
##  3rd Qu.:   8212   3rd Qu.:11.000                  3rd Qu.: 0.0000        
##  Max.   :7727000   Max.   :85.000                  Max.   :98.0000        
##  NA's   :49834                                                            
##  NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
##  Min.   : 0.000               Min.   : 0.0000                     
##  1st Qu.: 0.000               1st Qu.: 0.0000                     
##  Median : 1.000               Median : 0.0000                     
##  Mean   : 1.016               Mean   : 0.2525                     
##  3rd Qu.: 2.000               3rd Qu.: 0.0000                     
##  Max.   :54.000               Max.   :98.0000                     
##                                                                   
##  NumberOfDependents
##  Min.   : 0.000    
##  1st Qu.: 0.000    
##  Median : 0.000    
##  Mean   : 0.762    
##  3rd Qu.: 1.000    
##  Max.   :43.000    
##  NA's   :6550
#After looking at the summary and going through the data dictionary (xls file)
#it is clear that the model requires more categorical data to make better sense of it.
library(rpart)
# Bucket age into four bands: (0, 40], (40, 52], (52, 64] and above 64.
# The third band is labelled '53-64' to match the range it actually covers
# (it used to read '43-54', which overlapped the '41-52' label).
age_band <- cut(
  combi$age,
  breaks = c(0, 40, 52, 64, Inf),
  labels = c("0-40", "41-52", "53-64", "64+")
)
# Ages that match no band (age 0, negative or missing) fall back to '64+',
# which is what the initial default assignment did, so the column never
# contains NA.
age_band[is.na(age_band)] <- "64+"
# Drop bands that do not occur, as as.factor() on the labels would.
combi$AgeClass <- droplevels(age_band)
#combi$AgeClass[is.na(combi$age)] <- NA


library(gmodels)
# One-way frequency table of the response (rows with an NA response are not
# counted, see the table total below).
CrossTable(x = combi$SeriousDlqin2yrs)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150000 
## 
##  
##           |         0 |         1 | 
##           |-----------|-----------|
##           |    139974 |     10026 | 
##           |     0.933 |     0.067 | 
##           |-----------|-----------|
## 
## 
## 
## 
#This function gave me the distribution of the variable to be predicted.
#A staggering 93.3% of people did not have delinquencies past 90 days.
#This means that if I were to predict 0 for everyone in the test data,
#I would be right for about 93% of them.


#Let's test the response against the remaining variables.
# Response by age band; only row proportions are printed.
CrossTable(
  x = combi$SeriousDlqin2yrs,
  y = combi$AgeClass,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.c = FALSE,
  prop.r = TRUE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150000 
## 
##  
##                        | combi$AgeClass 
## combi$SeriousDlqin2yrs |      0-40 |     41-52 |     43-54 |       64+ | Row Total | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|
##                      0 |     31462 |     38830 |     39270 |     30412 |    139974 | 
##                        |     0.225 |     0.277 |     0.281 |     0.217 |     0.933 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|
##                      1 |      3634 |      3443 |      2167 |       782 |     10026 | 
##                        |     0.362 |     0.343 |     0.216 |     0.078 |     0.067 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|
##           Column Total |     35096 |     42273 |     41437 |     31194 |    150000 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|
## 
## 
# Well, this makes sense: the output shows that 36.2% and 34.3% of the serious delinquencies
# come from the age groups 0-40 and 41-52 respectively, and the remaining groups are not as significant.
# This is plausible, because this is the age at which people go through serious life events: they may lose loved ones or fall ill,
# while their children are starting careers or getting married. Let's make some more categorical data from the rest of the features.


# Impute missing NumberOfDependents with a regression tree (method = "anova")
# trained on the rows where the value is known.
DependentsFit <- rpart(
  NumberOfDependents ~ RevolvingUtilizationOfUnsecuredLines + age +
    NumberOfOpenCreditLinesAndLoans + MonthlyIncome +
    NumberRealEstateLoansOrLines + DebtRatio,
  data = subset(combi, !is.na(NumberOfDependents)),
  method = "anova"
)

# Positions of the rows that still need a value.
dependents_missing <- which(is.na(combi$NumberOfDependents))
combi$NumberOfDependents[dependents_missing] <- predict(
  DependentsFit,
  newdata = combi[dependents_missing, ]
)
# Should print 0: every gap has been filled.
sum(is.na(combi$NumberOfDependents))
## [1] 0
# combi$NumberOfDependents <- as.factor(combi$NumberOfDependents)
# levels(combi$NumberOfDependents) <- c("0", "1", "2", "3",  "4-5", "4-5","6-10","6-10","6-10","6-10","6-10" , "10+","10+")
# CrossTable(combi$NumberOfDependents)
# CrossTable(combi$SeriousDlqin2yrs,combi$NumberOfDependents, prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE)
# 

#Well, this seems counter-intuitive: people who do not have any dependents constitute 51.7%
#of the total delinquencies. But there is a catch: that share is lower than their share of the whole population, which is 59.5%,
#whereas the share of delinquencies is greater than the population share as the number of dependents increases.

#Let's now look at another predictor variable: the number of times people were late by 30-59 days.

#Let's change the data type of the variable from integer to factor.

# Treat the 30-59 day past-due count as a categorical variable: each distinct
# count becomes its own level.
combi[["NumberOfTime30.59DaysPastDueNotWorse"]] <- as.factor(
  combi[["NumberOfTime30.59DaysPastDueNotWorse"]]
)

# Response by past-due count; only row proportions are printed.
CrossTable(
  x = combi$SeriousDlqin2yrs,
  y = combi$NumberOfTime30.59DaysPastDueNotWorse,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.c = FALSE,
  prop.r = TRUE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150000 
## 
##  
##                        | combi$NumberOfTime30.59DaysPastDueNotWorse 
## combi$SeriousDlqin2yrs |         0 |         1 |         2 |         3 |         4 |         5 |         6 |         7 |         8 |         9 |        10 |        11 |        12 |        13 |        19 |        96 |        98 | Row Total | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      0 |    120977 |     13624 |      3379 |      1136 |       429 |       188 |        66 |        26 |        17 |         8 |         1 |         0 |         1 |         0 |         0 |         1 |       121 |    139974 | 
##                        |     0.864 |     0.097 |     0.024 |     0.008 |     0.003 |     0.001 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.001 |     0.933 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      1 |      5041 |      2409 |      1219 |       618 |       318 |       154 |        74 |        28 |         8 |         4 |         3 |         1 |         1 |         1 |         0 |         4 |       143 |     10026 | 
##                        |     0.503 |     0.240 |     0.122 |     0.062 |     0.032 |     0.015 |     0.007 |     0.003 |     0.001 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.014 |     0.067 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##           Column Total |    126018 |     16033 |      4598 |      1754 |       747 |       342 |       140 |        54 |        25 |        12 |         4 |         1 |         2 |         1 |         0 |         5 |       264 |    150000 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
## 
#Let's apply the same treatment to the number-of-times 60-89 days past due column.
# Same conversion for the 60-89 day past-due count: integer -> factor.
combi[["NumberOfTime60.89DaysPastDueNotWorse"]] <- as.factor(
  combi[["NumberOfTime60.89DaysPastDueNotWorse"]]
)


# Response by 60-89 day past-due count; only row proportions are printed.
CrossTable(
  x = combi$SeriousDlqin2yrs,
  y = combi$NumberOfTime60.89DaysPastDueNotWorse,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.c = FALSE,
  prop.r = TRUE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150000 
## 
##  
##                        | combi$NumberOfTime60.89DaysPastDueNotWorse 
## combi$SeriousDlqin2yrs |         0 |         1 |         2 |         3 |         4 |         5 |         6 |         7 |         8 |         9 |        11 |        96 |        98 | Row Total | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      0 |    135140 |      3954 |       557 |       138 |        40 |        13 |         4 |         4 |         1 |         1 |         0 |         1 |       121 |    139974 | 
##                        |     0.965 |     0.028 |     0.004 |     0.001 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.001 |     0.933 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      1 |      7256 |      1777 |       561 |       180 |        65 |        21 |        12 |         5 |         1 |         0 |         1 |         4 |       143 |     10026 | 
##                        |     0.724 |     0.177 |     0.056 |     0.018 |     0.006 |     0.002 |     0.001 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.014 |     0.067 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##           Column Total |    142396 |      5731 |      1118 |       318 |       105 |        34 |        16 |         9 |         2 |         1 |         1 |         5 |       264 |    150000 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
## 
#Number of times 90 days late.
# Same conversion for the 90-days-late count: integer -> factor.
combi[["NumberOfTimes90DaysLate"]] <- as.factor(
  combi[["NumberOfTimes90DaysLate"]]
)
# Response by 90-days-late count; only row proportions are printed.
CrossTable(
  x = combi$SeriousDlqin2yrs,
  y = combi$NumberOfTimes90DaysLate,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.c = FALSE,
  prop.r = TRUE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150000 
## 
##  
##                        | combi$NumberOfTimes90DaysLate 
## combi$SeriousDlqin2yrs |         0 |         1 |         2 |         3 |         4 |         5 |         6 |         7 |         8 |         9 |        10 |        11 |        12 |        13 |        14 |        15 |        16 |        17 |        18 |        96 |        98 | Row Total | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      0 |    135108 |      3478 |       779 |       282 |        96 |        48 |        32 |         7 |         6 |         5 |         3 |         2 |         1 |         2 |         1 |         2 |         0 |         0 |         0 |         1 |       121 |    139974 | 
##                        |     0.965 |     0.025 |     0.006 |     0.002 |     0.001 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.001 |     0.933 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      1 |      6554 |      1765 |       776 |       385 |       195 |        83 |        48 |        31 |        15 |        14 |         5 |         3 |         1 |         2 |         1 |         0 |         0 |         1 |         0 |         4 |       143 |     10026 | 
##                        |     0.654 |     0.176 |     0.077 |     0.038 |     0.019 |     0.008 |     0.005 |     0.003 |     0.001 |     0.001 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.000 |     0.014 |     0.067 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##           Column Total |    141662 |      5243 |      1555 |       667 |       291 |       131 |        80 |        38 |        21 |        19 |         8 |         5 |         2 |         4 |         2 |         2 |         0 |         1 |         0 |         5 |       264 |    150000 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
## 
#Filling the NAs in monthly income using regression anova method.


# Impute missing MonthlyIncome with a regression tree (method = "anova")
# trained on the rows where income is known. NumberOfDependents is used as a
# predictor here.
IncomeFit <- rpart(
  MonthlyIncome ~ RevolvingUtilizationOfUnsecuredLines + age +
    NumberOfOpenCreditLinesAndLoans + NumberOfDependents +
    NumberRealEstateLoansOrLines + DebtRatio,
  data = subset(combi, !is.na(MonthlyIncome)),
  method = "anova"
)

# Logical mask of the rows whose income is still unknown.
income_missing <- is.na(combi$MonthlyIncome)
combi$MonthlyIncome <- replace(
  combi$MonthlyIncome,
  income_missing,
  predict(IncomeFit, newdata = combi[income_missing, ])
)
# Should print 0: every gap has been filled.
sum(is.na(combi$MonthlyIncome))
## [1] 0
#Voila


#Lets now go to the salary data and create categorical variables for the salary data

# Bucket monthly income into nine bands. Each band is open on the left and
# closed on the right, except the first, which also includes 0.
#
# This corrects two boundary slips: the '6001-8000' band used to start at
# > 6001, and the '8001-10000' band used to start at > 6000, so it overwrote
# every '6001-8000' row and that class never appeared in the factor.
income_band <- cut(
  combi$MonthlyIncome,
  breaks = c(0, 1000, 2000, 3000, 4000, 6000, 8000, 10000, 20000, Inf),
  labels = c(
    "0-1000", "1001-2000", "2001-3000", "3001-4000", "4001-6000",
    "6001-8000", "8001-10000", "10001-20000", "20000+"
  ),
  include.lowest = TRUE
)
# Incomes outside every band (negative or missing) stay NA, as before.
# Going through character keeps the levels sorted as text and limited to the
# bands that occur, which is what as.factor() on the labels produced.
combi$IncomeClass <- as.factor(as.character(income_band))


#One of the most insightful variables, in my opinion, is the debt ratio; it should make a lot of sense once we build the CrossTable of the debt ratio.
# Bucket DebtRatio into bands [0, 0.5], (0.5, 1], (1, 2], (2, 10], (10, 100];
# everything else ends up in '100+'.
combi$DebtRatioClass <- local({
  band <- as.character(cut(
    combi$DebtRatio,
    breaks = c(0, 0.5, 1, 2, 10, 100),
    labels = c("0-0.5", "0.5-1", "1-2", "2-10", "10-100"),
    include.lowest = TRUE
  ))
  # Not captured by any band above: ratio > 100, negative or missing.
  band[is.na(band)] <- "100+"
  as.factor(band)
})
summary(combi$DebtRatioClass)
##  0-0.5  0.5-1    1-2 10-100   100+   2-10 
## 157240  35548   6855   7521  40753   3586
# One-way frequency table of the debt-ratio bands over all rows.
CrossTable(x = combi$DebtRatioClass)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  251503 
## 
##  
##           |     0-0.5 |     0.5-1 |       1-2 |    10-100 |      100+ | 
##           |-----------|-----------|-----------|-----------|-----------|
##           |    157240 |     35548 |      6855 |      7521 |     40753 | 
##           |     0.625 |     0.141 |     0.027 |     0.030 |     0.162 | 
##           |-----------|-----------|-----------|-----------|-----------|
## 
## 
##           |      2-10 | 
##           |-----------|
##           |      3586 | 
##           |     0.014 | 
##           |-----------|
## 
## 
## 
## 
# Response by debt-ratio band; only row proportions are printed.
CrossTable(
  x = combi$SeriousDlqin2yrs,
  y = combi$DebtRatioClass,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.c = FALSE,
  prop.r = TRUE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150000 
## 
##  
##                        | combi$DebtRatioClass 
## combi$SeriousDlqin2yrs |     0-0.5 |     0.5-1 |       1-2 |    10-100 |      100+ |      2-10 | Row Total | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      0 |     88053 |     19075 |      3553 |      4298 |     22970 |      2025 |    139974 | 
##                        |     0.629 |     0.136 |     0.025 |     0.031 |     0.164 |     0.014 |     0.933 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      1 |      5655 |      2080 |       539 |       199 |      1410 |       143 |     10026 | 
##                        |     0.564 |     0.207 |     0.054 |     0.020 |     0.141 |     0.014 |     0.067 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##           Column Total |     93708 |     21155 |      4092 |      4497 |     24380 |      2168 |    150000 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
## 
#As can be seen using the comparison, this makes perfect sense..

#The next feature of the dataset to be considered is RevolvingUtilizationOfUnsecuredLines; this should be along the same lines as the debt ratio.
summary(combi$RevolvingUtilizationOfUnsecuredLines)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.03     0.15     5.75     0.56 50708.00
# Bucket revolving utilization into bands [0, 0.15], (0.15, 0.5], (0.5, 1],
# (1, 10], (10, 100]; other non-missing values go to '100+', and a missing
# utilization stays NA.
combi$RUUCCLass <- local({
  utilization <- combi$RevolvingUtilizationOfUnsecuredLines
  band <- as.character(cut(
    utilization,
    breaks = c(0, 0.15, 0.5, 1, 10, 100),
    labels = c("0-0.15", "0.15-0.5", "0.5-1", "1-10", "10-100"),
    include.lowest = TRUE
  ))
  # Known value that matched no band (above 100 or negative).
  band[is.na(band) & !is.na(utilization)] <- "100+"
  as.factor(band)
})
# Number of rows with missing utilization.
sum(is.na(combi$RevolvingUtilizationOfUnsecuredLines))
## [1] 0
# One-way frequency table of the utilization bands over all rows.
CrossTable(x = combi$RUUCCLass)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  251503 
## 
##  
##           |    0-0.15 |  0.15-0.5 |     0.5-1 |      1-10 |    10-100 | 
##           |-----------|-----------|-----------|-----------|-----------|
##           |    124783 |     57307 |     63911 |      5108 |        29 | 
##           |     0.496 |     0.228 |     0.254 |     0.020 |     0.000 | 
##           |-----------|-----------|-----------|-----------|-----------|
## 
## 
##           |      100+ | 
##           |-----------|
##           |       365 | 
##           |     0.001 | 
##           |-----------|
## 
## 
## 
## 
# Response by utilization band; only row proportions are printed.
CrossTable(
  x = combi$SeriousDlqin2yrs,
  y = combi$RUUCCLass,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.c = FALSE,
  prop.r = TRUE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150000 
## 
##  
##                        | combi$RUUCCLass 
## combi$SeriousDlqin2yrs |    0-0.15 |  0.15-0.5 |     0.5-1 |      1-10 |    10-100 |      100+ | Row Total | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      0 |     72900 |     32825 |     32165 |      1860 |        12 |       212 |    139974 | 
##                        |     0.521 |     0.235 |     0.230 |     0.013 |     0.000 |     0.002 |     0.933 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      1 |      1422 |      1565 |      5802 |      1220 |         6 |        11 |     10026 | 
##                        |     0.142 |     0.156 |     0.579 |     0.122 |     0.001 |     0.001 |     0.067 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##           Column Total |     74322 |     34390 |     37967 |      3080 |        18 |       223 |    150000 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
## 
#As predicted, the RevolvingUtilizationOfUnsecuredLines is on the similar lines of that of the debt ratio. 

#Let's move on to the number of open credit lines and loans. One might predict that the fewer open credit lines a person has, the lower the chance that the person defaults on the loan.
#But that would be a wrong assertion, given that we don't know how much he/she is earning and how large the loan is.
summary(combi$NumberOfOpenCreditLinesAndLoans)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.000   8.000   8.453  11.000  85.000
plot(density(combi$NumberOfOpenCreditLinesAndLoans))

# Let us model this variable categorically.
# Bands are [0, 5], (5, 10], (10, 15], (15, 20]; everything else is '20+'.
# Note that the counts are integers, so e.g. '5-10' holds the values 6 to 10.
combi$OpenCreditClass <- local({
  band <- as.character(cut(
    combi$NumberOfOpenCreditLinesAndLoans,
    breaks = c(0, 5, 10, 15, 20),
    labels = c("0-5", "5-10", "10-15", "15-20"),
    include.lowest = TRUE
  ))
  # Not captured by any band above: more than 20, negative or missing.
  band[is.na(band)] <- "20+"
  as.factor(band)
})
CrossTable(x = combi$OpenCreditClass)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  251503 
## 
##  
##           |       0-5 |     10-15 |     15-20 |       20+ |      5-10 | 
##           |-----------|-----------|-----------|-----------|-----------|
##           |     78018 |     48912 |     16442 |      6679 |    101452 | 
##           |     0.310 |     0.194 |     0.065 |     0.027 |     0.403 | 
##           |-----------|-----------|-----------|-----------|-----------|
## 
## 
## 
## 
# Response by open-credit band; only row proportions are printed.
CrossTable(
  x = combi$SeriousDlqin2yrs,
  y = combi$OpenCreditClass,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.c = FALSE,
  prop.r = TRUE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150000 
## 
##  
##                        | combi$OpenCreditClass 
## combi$SeriousDlqin2yrs |       0-5 |     10-15 |     15-20 |       20+ |      5-10 | Row Total | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      0 |     42668 |     27380 |      9170 |      3701 |     57055 |    139974 | 
##                        |     0.305 |     0.196 |     0.066 |     0.026 |     0.408 |     0.933 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                      1 |      3922 |      1804 |       676 |       279 |      3345 |     10026 | 
##                        |     0.391 |     0.180 |     0.067 |     0.028 |     0.334 |     0.067 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##           Column Total |     46590 |     29184 |      9846 |      3980 |     60400 |    150000 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
## 
#Independently, this variable does not seem as insightful as the others, but I am certain that collectively it will be very important.

summary(combi$NumberRealEstateLoansOrLines)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.016   2.000  54.000
# Bucket the number of real-estate loans/lines into [0, 1], (1, 2], (2, 3];
# everything else is '3+'. The counts are integers, so '1-2' holds the value 2,
# '2-3' holds the value 3 and '3+' holds 4 and above.
combi$RealtyLinesClass <- local({
  band <- as.character(cut(
    combi$NumberRealEstateLoansOrLines,
    breaks = c(0, 1, 2, 3),
    labels = c("0-1", "1-2", "2-3"),
    include.lowest = TRUE
  ))
  # Not captured by any band above: more than 3, negative or missing.
  band[is.na(band)] <- "3+"
  as.factor(band)
})
CrossTable(x = combi$RealtyLinesClass)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  251503 
## 
##  
##           |       0-1 |       1-2 |       2-3 |        3+ | 
##           |-----------|-----------|-----------|-----------|
##           |    182262 |     52477 |     10723 |      6041 | 
##           |     0.725 |     0.209 |     0.043 |     0.024 | 
##           |-----------|-----------|-----------|-----------|
## 
## 
## 
## 
# Response by real-estate-lines band; only row proportions are printed.
CrossTable(
  x = combi$SeriousDlqin2yrs,
  y = combi$RealtyLinesClass,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.c = FALSE,
  prop.r = TRUE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150000 
## 
##  
##                        | combi$RealtyLinesClass 
## combi$SeriousDlqin2yrs |       0-1 |       1-2 |       2-3 |        3+ | Row Total | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|
##                      0 |    101106 |     29757 |      5878 |      3233 |    139974 | 
##                        |     0.722 |     0.213 |     0.042 |     0.023 |     0.933 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|
##                      1 |      7420 |      1765 |       422 |       419 |     10026 | 
##                        |     0.740 |     0.176 |     0.042 |     0.042 |     0.067 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|
##           Column Total |    108526 |     31522 |      6300 |      3652 |    150000 | 
## -----------------------|-----------|-----------|-----------|-----------|-----------|
## 
## 
# Structure check after feature engineering: the past-due counts are now
# factors and the derived *Class columns have been appended.
str(combi)
## 'data.frame':    251503 obs. of  18 variables:
##  $ X                                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SeriousDlqin2yrs                    : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ RevolvingUtilizationOfUnsecuredLines: num  0.766 0.957 0.658 0.234 0.907 ...
##  $ age                                 : int  45 40 38 30 49 74 57 39 27 57 ...
##  $ NumberOfTime30.59DaysPastDueNotWorse: Factor w/ 17 levels "0","1","2","3",..: 3 1 2 1 2 1 1 1 1 1 ...
##  $ DebtRatio                           : num  0.803 0.1219 0.0851 0.036 0.0249 ...
##  $ MonthlyIncome                       : num  9120 2600 3042 3300 63588 ...
##  $ NumberOfOpenCreditLinesAndLoans     : int  13 4 2 5 7 3 8 8 2 9 ...
##  $ NumberOfTimes90DaysLate             : Factor w/ 21 levels "0","1","2","3",..: 1 1 2 1 1 1 1 1 1 1 ...
##  $ NumberRealEstateLoansOrLines        : int  6 0 0 0 1 1 3 0 0 4 ...
##  $ NumberOfTime60.89DaysPastDueNotWorse: Factor w/ 13 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ NumberOfDependents                  : num  2 1 0 0 0 ...
##  $ AgeClass                            : Factor w/ 4 levels "0-40","41-52",..: 2 1 1 1 2 4 3 1 1 3 ...
##  $ IncomeClass                         : Factor w/ 8 levels "0-1000","10001-20000",..: 8 5 6 6 4 6 8 6 6 4 ...
##  $ DebtRatioClass                      : Factor w/ 6 levels "0-0.5","0.5-1",..: 2 1 1 1 1 1 5 1 4 2 ...
##  $ RUUCCLass                           : Factor w/ 6 levels "0-0.15","0.15-0.5",..: 3 3 3 2 3 2 2 3 1 2 ...
##  $ OpenCreditClass                     : Factor w/ 5 levels "0-5","10-15",..: 2 1 1 1 5 1 5 5 1 5 ...
##  $ RealtyLinesClass                    : Factor w/ 4 levels "0-1","1-2","2-3",..: 4 1 1 1 1 1 3 1 1 4 ...
#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
# Fix the RNG state so the random forest below is reproducible.
set.seed(888)
nrow(combi)
## [1] 251503
# Undo the earlier rbind(train1, test1): the first nrow(train1) rows are the
# training set and the remainder is the test set. The boundary is derived from
# the data instead of hard-coded row numbers, so the split stays correct if
# the input files change size.
is_train_row <- seq_len(nrow(combi)) <= nrow(train1)
train <- combi[is_train_row, ]
test <- combi[!is_train_row, ]
#I used the randomForest algorithm (my favourite) to get the probability; I did not do a lot of parameter tuning.

# Model formula: the delinquency flag as a two-class factor, explained by the
# three past-due count factors, the imputed dependents count and the derived
# class columns. The predictor order is unchanged.
delinquency_formula <- as.factor(SeriousDlqin2yrs) ~
  NumberOfTimes90DaysLate +
  NumberOfTime60.89DaysPastDueNotWorse +
  NumberOfTime30.59DaysPastDueNotWorse +
  NumberOfDependents +
  AgeClass +
  DebtRatioClass +
  RUUCCLass +
  OpenCreditClass +
  RealtyLinesClass

# Small forest (25 trees); the forest is kept so it can be used by predict().
fit <- randomForest(
  delinquency_formula,
  data = train,
  ntree = 25,
  keep.forest = TRUE,
  importance = TRUE
)

# Number of training rows without a label.
sum(is.na(train$SeriousDlqin2yrs))
## [1] 0
# Class probabilities for the test rows.
Prediction <- predict(fit, newdata = test, type = "prob")
# data.frame() expands the probability columns to Probability.<class>;
# the submission keeps only the probability of class 1.
submit <- data.frame(Id = test$X, Probability = Prediction)
entry <- with(submit, data.frame(Id = Id, Probability = Probability.1))
write.csv(entry, file = "Entry.csv", row.names = FALSE)
nrow(test)
## [1] 101503