# Load the Kaggle training and test CSVs and stack them into one data frame.
#
# The data folder below is specific to the original author's machine. It is
# only entered when it actually exists; on any other machine the script keeps
# the current working directory instead of aborting at setwd(), so it can be
# run from whatever folder holds the two CSV files.
data_dir <- "C:/Users/Atul/Desktop/risk analysis"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Data folder '", data_dir, "' not found; reading CSVs from ", getwd())
}
train1 <- read.csv("cs-training.csv")
test1 <- read.csv("cs-test.csv")
nrow(test1)
## [1] 101503
nrow(train1)
## [1] 150000
#View(train1)
#View(test1)
# Training rows come first, test rows second, so that every feature-engineering
# step below is applied to both sets in exactly the same way.
combi <- rbind(train1, test1)
# Now it is time to look closely at the data and decide what else is required.
str(combi)
## 'data.frame': 251503 obs. of 12 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SeriousDlqin2yrs : int 1 0 0 0 0 0 0 0 0 0 ...
## $ RevolvingUtilizationOfUnsecuredLines: num 0.766 0.957 0.658 0.234 0.907 ...
## $ age : int 45 40 38 30 49 74 57 39 27 57 ...
## $ NumberOfTime30.59DaysPastDueNotWorse: int 2 0 1 0 1 0 0 0 0 0 ...
## $ DebtRatio : num 0.803 0.1219 0.0851 0.036 0.0249 ...
## $ MonthlyIncome : int 9120 2600 3042 3300 63588 3500 NA 3500 NA 23684 ...
## $ NumberOfOpenCreditLinesAndLoans : int 13 4 2 5 7 3 8 8 2 9 ...
## $ NumberOfTimes90DaysLate : int 0 0 1 0 0 0 0 0 0 0 ...
## $ NumberRealEstateLoansOrLines : int 6 0 0 0 1 1 3 0 0 4 ...
## $ NumberOfTime60.89DaysPastDueNotWorse: int 0 0 0 0 0 0 0 0 0 0 ...
## $ NumberOfDependents : int 2 1 0 0 0 1 0 0 NA 2 ...
# Per-column summary statistics of the stacked data; the recorded output below
# also shows the NA count of each column that has missing values.
summary(combi)
## X SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines
## Min. : 1 Min. :0.00 Min. : 0.00
## 1st Qu.: 31439 1st Qu.:0.00 1st Qu.: 0.03
## Median : 62876 Median :0.00 Median : 0.15
## Mean : 65214 Mean :0.07 Mean : 5.75
## 3rd Qu.: 94314 3rd Qu.:0.00 3rd Qu.: 0.56
## Max. :150000 Max. :1.00 Max. :50708.00
## NA's :101503
## age NumberOfTime30.59DaysPastDueNotWorse DebtRatio
## Min. : 0.00 Min. : 0.0000 Min. : 0.0
## 1st Qu.: 41.00 1st Qu.: 0.0000 1st Qu.: 0.2
## Median : 52.00 Median : 0.0000 Median : 0.4
## Mean : 52.34 Mean : 0.4343 Mean : 349.6
## 3rd Qu.: 63.00 3rd Qu.: 0.0000 3rd Qu.: 0.9
## Max. :109.00 Max. :98.0000 Max. :329664.0
##
## MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## Min. : 0 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 3400 1st Qu.: 5.000 1st Qu.: 0.0000
## Median : 5400 Median : 8.000 Median : 0.0000
## Mean : 6745 Mean : 8.453 Mean : 0.2784
## 3rd Qu.: 8212 3rd Qu.:11.000 3rd Qu.: 0.0000
## Max. :7727000 Max. :85.000 Max. :98.0000
## NA's :49834
## NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 1.000 Median : 0.0000
## Mean : 1.016 Mean : 0.2525
## 3rd Qu.: 2.000 3rd Qu.: 0.0000
## Max. :54.000 Max. :98.0000
##
## NumberOfDependents
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 0.762
## 3rd Qu.: 1.000
## Max. :43.000
## NA's :6550
#After looking at the summary and going through the data dictionary (xls file),
#it is clear that the model needs more categorical features to make better sense of the data.
library(rpart)
# Bin age into four classes: [0, 40] -> "0-40", (40, 52] -> "41-52",
# (52, 64] -> "53-64", everything else -> "64+".
#
# Two fixes relative to the previous version of this block:
#   * the (52, 64] bin was labelled "43-54", which does not describe the ages
#     it contains; it is now "53-64";
#   * the lowest bin used `age > 0`, so an age of exactly 0 fell through to
#     "64+"; the lower bound is now inclusive, like every other binned
#     variable in this script.
# NOTE: the recorded console outputs further down were produced before this
# fix and therefore still show the old "43-54" label.
combi$AgeClass <- local({
  age_bin <- cut(
    combi$age,
    breaks = c(0, 40, 52, 64),
    labels = c("0-40", "41-52", "53-64"),
    right = TRUE,
    include.lowest = TRUE
  )
  age_label <- as.character(age_bin)
  # cut() returns NA for anything outside [0, 64]; as before, those rows
  # (ages above 64, and any negative or missing age) fall into "64+".
  age_label[is.na(age_label)] <- "64+"
  as.factor(age_label)
})
#combi$AgeClass[is.na(combi$age)] <- NA
# gmodels supplies CrossTable(), used for all frequency tables in this script.
library(gmodels)
# One-way table of the target: count and share of each SeriousDlqin2yrs value.
CrossTable(combi$SeriousDlqin2yrs)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 150000
##
##
## | 0 | 1 |
## |-----------|-----------|
## | 139974 | 10026 |
## | 0.933 | 0.067 |
## |-----------|-----------|
##
##
##
##
# The table above summarises the variable to be predicted:
# a staggering 93.3% of people had no serious delinquency (90+ days past due).
# This means that predicting 0 for every person in the test data would be right
# for about 93% of them, assuming the test data has a similar class balance.
# Let's test the response against the remaining variables.
# Two-way table of the target by age class; only row proportions are printed.
CrossTable(combi$SeriousDlqin2yrs,combi$AgeClass, prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 150000
##
##
## | combi$AgeClass
## combi$SeriousDlqin2yrs | 0-40 | 41-52 | 43-54 | 64+ | Row Total |
## -----------------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 31462 | 38830 | 39270 | 30412 | 139974 |
## | 0.225 | 0.277 | 0.281 | 0.217 | 0.933 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 3634 | 3443 | 2167 | 782 | 10026 |
## | 0.362 | 0.343 | 0.216 | 0.078 | 0.067 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 35096 | 42273 | 41437 | 31194 | 150000 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|
##
##
# Well, this makes sense: the row proportions show that 36.2% and 34.3% of the serious delinquencies
# come from the 0-40 and 41-52 age groups respectively, while the remaining groups contribute far less.
# This is plausible, because these are the ages at which people go through serious life events: they may lose loved ones or fall ill,
# while their children are starting careers or getting married. Let's make some more categorical data from the rest of the features.
# Impute missing NumberOfDependents with a regression tree (rpart,
# method = "anova") that is fitted on the rows where the value is known and
# then used to predict the rows where it is NA.
DependentsFit <- rpart(
  NumberOfDependents ~ RevolvingUtilizationOfUnsecuredLines + age +
    NumberOfOpenCreditLinesAndLoans + MonthlyIncome +
    NumberRealEstateLoansOrLines + DebtRatio,
  data = subset(combi, !is.na(NumberOfDependents)),
  method = "anova"
)
combi$NumberOfDependents[is.na(combi$NumberOfDependents)] <- predict(
  DependentsFit,
  newdata = subset(combi, is.na(NumberOfDependents))
)
# Sanity check: number of NAs left after imputation (printed to the console).
sum(is.na(combi$NumberOfDependents))
## [1] 0
# combi$NumberOfDependents <- as.factor(combi$NumberOfDependents)
# levels(combi$NumberOfDependents) <- c("0", "1", "2", "3", "4-5", "4-5","6-10","6-10","6-10","6-10","6-10" , "10+","10+")
# CrossTable(combi$NumberOfDependents)
# CrossTable(combi$SeriousDlqin2yrs,combi$NumberOfDependents, prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE)
#
#Well, this seems counterintuitive: people who do not have any dependents seem to constitute 51.7%
#of the total delinquencies. That would not seem to make sense, but there's a catch: that share is lower than their share of the population (59.5% of people have 0 dependents),
#whereas the share of delinquencies is greater than the population share as the number of dependents starts increasing.
#Let's now look at another predictor variable: the number of times people were late by 30-59 days.
#Let's change the datatype of the variable from integer to factor and change the levels.
# Treat the 30-59-days-past-due count as a categorical variable, then tabulate
# it against the target (row proportions only).
# NOTE(review): the recorded output shows the values 96 and 98, which look like
# special codes rather than real counts; confirm against the data dictionary.
combi[["NumberOfTime30.59DaysPastDueNotWorse"]] <- as.factor(
  combi[["NumberOfTime30.59DaysPastDueNotWorse"]]
)
CrossTable(
  combi$SeriousDlqin2yrs,
  combi$NumberOfTime30.59DaysPastDueNotWorse,
  prop.r = TRUE,
  prop.c = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 150000
##
##
## | combi$NumberOfTime30.59DaysPastDueNotWorse
## combi$SeriousDlqin2yrs | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 19 | 96 | 98 | Row Total |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 120977 | 13624 | 3379 | 1136 | 429 | 188 | 66 | 26 | 17 | 8 | 1 | 0 | 1 | 0 | 0 | 1 | 121 | 139974 |
## | 0.864 | 0.097 | 0.024 | 0.008 | 0.003 | 0.001 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.001 | 0.933 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 5041 | 2409 | 1219 | 618 | 318 | 154 | 74 | 28 | 8 | 4 | 3 | 1 | 1 | 1 | 0 | 4 | 143 | 10026 |
## | 0.503 | 0.240 | 0.122 | 0.062 | 0.032 | 0.015 | 0.007 | 0.003 | 0.001 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.014 | 0.067 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 126018 | 16033 | 4598 | 1754 | 747 | 342 | 140 | 54 | 25 | 12 | 4 | 1 | 2 | 1 | 0 | 5 | 264 | 150000 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
# Apply the same treatment to the 60-89-days-past-due column: convert the
# count to a factor and tabulate it against the target (row proportions only).
combi[["NumberOfTime60.89DaysPastDueNotWorse"]] <- as.factor(
  combi[["NumberOfTime60.89DaysPastDueNotWorse"]]
)
CrossTable(
  combi$SeriousDlqin2yrs,
  combi$NumberOfTime60.89DaysPastDueNotWorse,
  prop.r = TRUE,
  prop.c = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 150000
##
##
## | combi$NumberOfTime60.89DaysPastDueNotWorse
## combi$SeriousDlqin2yrs | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 96 | 98 | Row Total |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 135140 | 3954 | 557 | 138 | 40 | 13 | 4 | 4 | 1 | 1 | 0 | 1 | 121 | 139974 |
## | 0.965 | 0.028 | 0.004 | 0.001 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.001 | 0.933 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 7256 | 1777 | 561 | 180 | 65 | 21 | 12 | 5 | 1 | 0 | 1 | 4 | 143 | 10026 |
## | 0.724 | 0.177 | 0.056 | 0.018 | 0.006 | 0.002 | 0.001 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.014 | 0.067 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 142396 | 5731 | 1118 | 318 | 105 | 34 | 16 | 9 | 2 | 1 | 1 | 5 | 264 | 150000 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
# Number of times 90 days late: same treatment again, factor conversion
# followed by a two-way table against the target (row proportions only).
combi[["NumberOfTimes90DaysLate"]] <- as.factor(
  combi[["NumberOfTimes90DaysLate"]]
)
CrossTable(
  combi$SeriousDlqin2yrs,
  combi$NumberOfTimes90DaysLate,
  prop.r = TRUE,
  prop.c = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 150000
##
##
## | combi$NumberOfTimes90DaysLate
## combi$SeriousDlqin2yrs | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 96 | 98 | Row Total |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 135108 | 3478 | 779 | 282 | 96 | 48 | 32 | 7 | 6 | 5 | 3 | 2 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 1 | 121 | 139974 |
## | 0.965 | 0.025 | 0.006 | 0.002 | 0.001 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.001 | 0.933 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 6554 | 1765 | 776 | 385 | 195 | 83 | 48 | 31 | 15 | 14 | 5 | 3 | 1 | 2 | 1 | 0 | 0 | 1 | 0 | 4 | 143 | 10026 |
## | 0.654 | 0.176 | 0.077 | 0.038 | 0.019 | 0.008 | 0.005 | 0.003 | 0.001 | 0.001 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.014 | 0.067 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 141662 | 5243 | 1555 | 667 | 291 | 131 | 80 | 38 | 21 | 19 | 8 | 5 | 2 | 4 | 2 | 2 | 0 | 1 | 0 | 5 | 264 | 150000 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
# Fill the NAs in MonthlyIncome with predictions from an rpart regression tree
# (method = "anova") fitted on the rows where the income is known.
IncomeFit <- rpart(
  MonthlyIncome ~ RevolvingUtilizationOfUnsecuredLines + age +
    NumberOfOpenCreditLinesAndLoans + NumberOfDependents +
    NumberRealEstateLoansOrLines + DebtRatio,
  data = subset(combi, !is.na(MonthlyIncome)),
  method = "anova"
)
combi$MonthlyIncome[is.na(combi$MonthlyIncome)] <- predict(
  IncomeFit,
  newdata = subset(combi, is.na(MonthlyIncome))
)
# Sanity check: number of NAs left after imputation (printed to the console).
sum(is.na(combi$MonthlyIncome))
## [1] 0
#Voilà
#Let's now go to the salary data and create categorical variables for it
# Bin MonthlyIncome into nine classes. Intervals are closed on the right:
#   [0, 1000], (1000, 2000], (2000, 3000], (3000, 4000], (4000, 6000],
#   (6000, 8000], (8000, 10000], (10000, 20000], (20000, Inf)
# Negative or missing incomes stay NA, as before.
#
# Two fixes relative to the previous version of this block:
#   * the "6001-8000" bin started at `> 6001`, leaving a gap for incomes in
#     (6000, 6001] (possible because imputed incomes are not whole numbers);
#   * the "8001-10000" bin started at `> 6000`, so it overwrote the whole
#     "6001-8000" bin. That is why the recorded str() output further down shows
#     only 8 levels for IncomeClass.
combi$IncomeClass <- local({
  income_bin <- cut(
    combi$MonthlyIncome,
    breaks = c(0, 1000, 2000, 3000, 4000, 6000, 8000, 10000, 20000, Inf),
    labels = c(
      "0-1000", "1001-2000", "2001-3000", "3001-4000", "4001-6000",
      "6001-8000", "8001-10000", "10001-20000", "20000+"
    ),
    right = TRUE,
    include.lowest = TRUE
  )
  # Round-trip through character so the factor gets the same alphabetical,
  # observed-values-only levels as the other *Class columns in this script.
  as.factor(as.character(income_bin))
})
# One of the most insightful variables, in my opinion, is the debt ratio; it
# should make a lot of sense once we look at the cross table of its classes.
# Bins (closed on the right): [0, 0.5], (0.5, 1], (1, 2], (2, 10], (10, 100].
# Every value outside [0, 100] is labelled "100+".
combi$DebtRatioClass <- local({
  ratio_bin <- cut(
    combi$DebtRatio,
    breaks = c(0, 0.5, 1, 2, 10, 100),
    labels = c("0-0.5", "0.5-1", "1-2", "2-10", "10-100"),
    right = TRUE,
    include.lowest = TRUE
  )
  ratio_label <- as.character(ratio_bin)
  # cut() yields NA for anything it could not place; those rows take the
  # catch-all label.
  ratio_label[is.na(ratio_label)] <- "100+"
  as.factor(ratio_label)
})
summary(combi$DebtRatioClass)
## 0-0.5 0.5-1 1-2 10-100 100+ 2-10
## 157240 35548 6855 7521 40753 3586
# One-way table: count and share of each debt-ratio class over all rows.
CrossTable(combi$DebtRatioClass)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 251503
##
##
## | 0-0.5 | 0.5-1 | 1-2 | 10-100 | 100+ |
## |-----------|-----------|-----------|-----------|-----------|
## | 157240 | 35548 | 6855 | 7521 | 40753 |
## | 0.625 | 0.141 | 0.027 | 0.030 | 0.162 |
## |-----------|-----------|-----------|-----------|-----------|
##
##
## | 2-10 |
## |-----------|
## | 3586 |
## | 0.014 |
## |-----------|
##
##
##
##
# Target vs. debt-ratio class; only row proportions are printed, i.e. how each
# outcome (0 / 1) is distributed across the classes.
CrossTable(combi$SeriousDlqin2yrs,combi$DebtRatioClass, prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 150000
##
##
## | combi$DebtRatioClass
## combi$SeriousDlqin2yrs | 0-0.5 | 0.5-1 | 1-2 | 10-100 | 100+ | 2-10 | Row Total |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 88053 | 19075 | 3553 | 4298 | 22970 | 2025 | 139974 |
## | 0.629 | 0.136 | 0.025 | 0.031 | 0.164 | 0.014 | 0.933 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 5655 | 2080 | 539 | 199 | 1410 | 143 | 10026 |
## | 0.564 | 0.207 | 0.054 | 0.020 | 0.141 | 0.014 | 0.067 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 93708 | 21155 | 4092 | 4497 | 24380 | 2168 | 150000 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
#As can be seen from the comparison, this makes perfect sense.
#The next feature of the dataset to be considered is RevolvingUtilizationOfUnsecuredLines; this should be along the same lines as the debt ratio.
summary(combi$RevolvingUtilizationOfUnsecuredLines)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.03 0.15 5.75 0.56 50708.00
# Bin the revolving utilization (intervals closed on the right):
#   [0, 0.15], (0.15, 0.5], (0.5, 1], (1, 10], (10, 100]
# Any other non-missing value is labelled "100+"; a missing utilization keeps
# a missing class.
combi$RUUCCLass <- local({
  utilization <- combi$RevolvingUtilizationOfUnsecuredLines
  util_bin <- cut(
    utilization,
    breaks = c(0, 0.15, 0.5, 1, 10, 100),
    labels = c("0-0.15", "0.15-0.5", "0.5-1", "1-10", "10-100"),
    right = TRUE,
    include.lowest = TRUE
  )
  util_label <- as.character(util_bin)
  util_label[is.na(util_label)] <- "100+"
  # Undo the catch-all for rows whose source value was itself missing.
  util_label[is.na(utilization)] <- NA
  as.factor(util_label)
})
# Number of missing source values (printed to the console).
sum(is.na(combi$RevolvingUtilizationOfUnsecuredLines))
## [1] 0
# One-way table: count and share of each utilization class over all rows.
CrossTable(combi$RUUCCLass)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 251503
##
##
## | 0-0.15 | 0.15-0.5 | 0.5-1 | 1-10 | 10-100 |
## |-----------|-----------|-----------|-----------|-----------|
## | 124783 | 57307 | 63911 | 5108 | 29 |
## | 0.496 | 0.228 | 0.254 | 0.020 | 0.000 |
## |-----------|-----------|-----------|-----------|-----------|
##
##
## | 100+ |
## |-----------|
## | 365 |
## | 0.001 |
## |-----------|
##
##
##
##
# Target vs. revolving-utilization class; only row proportions are printed.
CrossTable(combi$SeriousDlqin2yrs,combi$RUUCCLass, prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 150000
##
##
## | combi$RUUCCLass
## combi$SeriousDlqin2yrs | 0-0.15 | 0.15-0.5 | 0.5-1 | 1-10 | 10-100 | 100+ | Row Total |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 72900 | 32825 | 32165 | 1860 | 12 | 212 | 139974 |
## | 0.521 | 0.235 | 0.230 | 0.013 | 0.000 | 0.002 | 0.933 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 1422 | 1565 | 5802 | 1220 | 6 | 11 | 10026 |
## | 0.142 | 0.156 | 0.579 | 0.122 | 0.001 | 0.001 | 0.067 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 74322 | 34390 | 37967 | 3080 | 18 | 223 | 150000 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
#As predicted, RevolvingUtilizationOfUnsecuredLines behaves along similar lines to the debt ratio.
#Let's go ahead to the number of open credit lines and loans. One might predict that the fewer open credit lines a person has, the lower the chance that the person defaults on the loan.
#But that would be a wrong assertion, given that we don't know how much he/she is earning and how large the loan is.
summary(combi$NumberOfOpenCreditLinesAndLoans)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.000 8.000 8.453 11.000 85.000
# Visual check of how the number of open credit lines is distributed.
plot(density(combi$NumberOfOpenCreditLinesAndLoans))

# Let us model this variable categorically.
# Bins (closed on the right): [0, 5], (5, 10], (10, 15], (15, 20];
# everything else is labelled "20+".
combi$OpenCreditClass <- local({
  credit_bin <- cut(
    combi$NumberOfOpenCreditLinesAndLoans,
    breaks = c(0, 5, 10, 15, 20),
    labels = c("0-5", "5-10", "10-15", "15-20"),
    right = TRUE,
    include.lowest = TRUE
  )
  credit_label <- as.character(credit_bin)
  credit_label[is.na(credit_label)] <- "20+"
  as.factor(credit_label)
})
# One-way table: count and share of each open-credit class over all rows.
CrossTable(combi$OpenCreditClass)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 251503
##
##
## | 0-5 | 10-15 | 15-20 | 20+ | 5-10 |
## |-----------|-----------|-----------|-----------|-----------|
## | 78018 | 48912 | 16442 | 6679 | 101452 |
## | 0.310 | 0.194 | 0.065 | 0.027 | 0.403 |
## |-----------|-----------|-----------|-----------|-----------|
##
##
##
##
# Target vs. open-credit class; only row proportions are printed.
CrossTable(combi$SeriousDlqin2yrs,combi$OpenCreditClass, prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 150000
##
##
## | combi$OpenCreditClass
## combi$SeriousDlqin2yrs | 0-5 | 10-15 | 15-20 | 20+ | 5-10 | Row Total |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 42668 | 27380 | 9170 | 3701 | 57055 | 139974 |
## | 0.305 | 0.196 | 0.066 | 0.026 | 0.408 | 0.933 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 3922 | 1804 | 676 | 279 | 3345 | 10026 |
## | 0.391 | 0.180 | 0.067 | 0.028 | 0.334 | 0.067 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 46590 | 29184 | 9846 | 3980 | 60400 | 150000 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
# On its own, the open-credit variable does not seem as insightful as the
# others, but I am certain that collectively it will be very important.
summary(combi$NumberRealEstateLoansOrLines)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.016 2.000 54.000
# Bin the number of real-estate loans or lines (closed on the right):
#   [0, 1] -> "0-1", (1, 2] -> "1-2", (2, 3] -> "2-3", everything else -> "3+".
# The column is an integer count, so "1-2" holds exactly 2 and "2-3" exactly 3.
combi$RealtyLinesClass <- local({
  realty_bin <- cut(
    combi$NumberRealEstateLoansOrLines,
    breaks = c(0, 1, 2, 3),
    labels = c("0-1", "1-2", "2-3"),
    right = TRUE,
    include.lowest = TRUE
  )
  realty_label <- as.character(realty_bin)
  realty_label[is.na(realty_label)] <- "3+"
  as.factor(realty_label)
})
# One-way table: count and share of each realty-lines class over all rows.
CrossTable(combi$RealtyLinesClass)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 251503
##
##
## | 0-1 | 1-2 | 2-3 | 3+ |
## |-----------|-----------|-----------|-----------|
## | 182262 | 52477 | 10723 | 6041 |
## | 0.725 | 0.209 | 0.043 | 0.024 |
## |-----------|-----------|-----------|-----------|
##
##
##
##
# Target vs. realty-lines class; only row proportions are printed.
CrossTable(combi$SeriousDlqin2yrs,combi$RealtyLinesClass, prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 150000
##
##
## | combi$RealtyLinesClass
## combi$SeriousDlqin2yrs | 0-1 | 1-2 | 2-3 | 3+ | Row Total |
## -----------------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 101106 | 29757 | 5878 | 3233 | 139974 |
## | 0.722 | 0.213 | 0.042 | 0.023 | 0.933 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 7420 | 1765 | 422 | 419 | 10026 |
## | 0.740 | 0.176 | 0.042 | 0.042 | 0.067 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 108526 | 31522 | 6300 | 3652 | 150000 |
## -----------------------|-----------|-----------|-----------|-----------|-----------|
##
##
# Structure of the data after feature engineering: the recorded output below
# lists the converted factor columns and the new *Class columns.
str(combi)
## 'data.frame': 251503 obs. of 18 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SeriousDlqin2yrs : int 1 0 0 0 0 0 0 0 0 0 ...
## $ RevolvingUtilizationOfUnsecuredLines: num 0.766 0.957 0.658 0.234 0.907 ...
## $ age : int 45 40 38 30 49 74 57 39 27 57 ...
## $ NumberOfTime30.59DaysPastDueNotWorse: Factor w/ 17 levels "0","1","2","3",..: 3 1 2 1 2 1 1 1 1 1 ...
## $ DebtRatio : num 0.803 0.1219 0.0851 0.036 0.0249 ...
## $ MonthlyIncome : num 9120 2600 3042 3300 63588 ...
## $ NumberOfOpenCreditLinesAndLoans : int 13 4 2 5 7 3 8 8 2 9 ...
## $ NumberOfTimes90DaysLate : Factor w/ 21 levels "0","1","2","3",..: 1 1 2 1 1 1 1 1 1 1 ...
## $ NumberRealEstateLoansOrLines : int 6 0 0 0 1 1 3 0 0 4 ...
## $ NumberOfTime60.89DaysPastDueNotWorse: Factor w/ 13 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ NumberOfDependents : num 2 1 0 0 0 ...
## $ AgeClass : Factor w/ 4 levels "0-40","41-52",..: 2 1 1 1 2 4 3 1 1 3 ...
## $ IncomeClass : Factor w/ 8 levels "0-1000","10001-20000",..: 8 5 6 6 4 6 8 6 6 4 ...
## $ DebtRatioClass : Factor w/ 6 levels "0-0.5","0.5-1",..: 2 1 1 1 1 1 5 1 4 2 ...
## $ RUUCCLass : Factor w/ 6 levels "0-0.15","0.15-0.5",..: 3 3 3 2 3 2 2 3 1 2 ...
## $ OpenCreditClass : Factor w/ 5 levels "0-5","10-15",..: 2 1 1 1 5 1 5 5 1 5 ...
## $ RealtyLinesClass : Factor w/ 4 levels "0-1","1-2","2-3",..: 4 1 1 1 1 1 3 1 1 4 ...
#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
# Fix the RNG seed so the random forest below is reproducible.
set.seed(888)
nrow(combi)
## [1] 251503
# Split the engineered data back into its two parts. combi was built as
# rbind(train1, test1), so the first nrow(train1) rows are the training set and
# the rest are the test set. The sizes are derived from the data instead of
# being hard-coded, so the split stays correct if the input files change.
n_train <- nrow(train1)
train <- combi[seq_len(n_train), ]
test <- combi[n_train + seq_len(nrow(combi) - n_train), ]
# I used the randomForest algorithm (my favourite) to get the probabilities;
# I did not do much parameter tuning.
fit <- randomForest(
  as.factor(SeriousDlqin2yrs) ~ NumberOfTimes90DaysLate +
    NumberOfTime60.89DaysPastDueNotWorse +
    NumberOfTime30.59DaysPastDueNotWorse + NumberOfDependents + AgeClass +
    DebtRatioClass + RUUCCLass + OpenCreditClass + RealtyLinesClass,
  data = train,
  importance = TRUE,
  ntree = 25,
  keep.forest = TRUE
)
# Number of training rows without a label (printed to the console).
sum(is.na(train$SeriousDlqin2yrs))
## [1] 0
# Class-probability matrix for the test rows, one column per target class.
Prediction <- predict(fit, test, type = "prob")
submit <- data.frame(Id = test$X, Probability = Prediction)
# Take the probability of class "1" (serious delinquency) by its column name in
# the prediction matrix, instead of relying on the "Probability.1" name that
# data.frame() generates when it expands the matrix.
stopifnot("1" %in% colnames(Prediction))
entry <- data.frame(
  Id = test$X,
  Probability = as.numeric(Prediction[, "1"])
)
write.csv(entry, file = "Entry.csv", row.names = FALSE)
nrow(test)
## [1] 101503