library(openxlsx)
library(dplyr)
library(randomForest)
library(tree)
library(ggplot2)
library(ggpubr)
dta <- read.xlsx("DataSet.xlsx")
dim(dta)
[1] 21662 34
summary(dta)
ID Requested_Amount Emi_Amount Age
Min. : 1 Min. :7.200e+01 Min. : 0 Min. :18.10
1st Qu.: 6466 1st Qu.:2.750e+05 1st Qu.: 8664 1st Qu.:31.20
Median :13046 Median :4.050e+05 Median : 12696 Median :37.58
Mean :13145 Mean :7.324e+07 Mean : 17882 Mean :39.09
3rd Qu.:19760 3rd Qu.:6.500e+05 3rd Qu.: 19800 3rd Qu.:45.67
Max. :26508 Max. :4.630e+11 Max. :1379000 Max. :83.80
NA's :602
Applicant_Postal_Code Duration_Of_Current_Emp Product_Name1
Min. :110001 Min. : 0.000 Length:21662
1st Qu.:208006 1st Qu.: 2.000 Class :character
Median :400610 Median : 4.000 Mode :character
Mean :394240 Mean : 6.858
3rd Qu.:500090 3rd Qu.: 8.000
Max. :835103 Max. :50.000
NA's :3 NA's :16648
Product_Line Loan_Term Variant_Code Manufacturer_Desc
Length:21662 Min. : 3.00 Length:21662 Length:21662
Class :character 1st Qu.: 36.00 Class :character Class :character
Mode :character Median : 36.00 Mode :character Mode :character
Mean : 43.98
3rd Qu.: 60.00
Max. :960.00
NA's :2
Gender_Desc Marital_Status_Desc Applicant_State_Desc
Length:21662 Length:21662 Length:21662
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Applicant_City_Desc Resid_Owned_By_Desc Employment_Type_Desc
Length:21662 Length:21662 Length:21662
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Total_Work_Experience application_creation_date Ex_Showroom_Price
Min. : 0.000 Min. :41277 Min. : 0
1st Qu.: 2.700 1st Qu.:41753 1st Qu.: 335440
Median : 5.000 Median :41988 Median : 488544
Mean : 8.573 Mean :41904 Mean : 686893
3rd Qu.: 10.000 3rd Qu.:42105 3rd Qu.: 744948
Max. :1212.000 Max. :42186 Max. :81882000
NA's :16708 NA's :2779
Segment Current.Valuation Years_In_Current_Business
Length:21662 Min. :1.000e+00 Min. : 0.000
Class :character 1st Qu.:3.349e+05 1st Qu.: 4.000
Mode :character Median :4.879e+05 Median : 7.000
Mean :2.302e+07 Mean : 8.888
3rd Qu.:7.444e+05 3rd Qu.:10.000
Max. :3.470e+11 Max. :54.000
NA's :2787 NA's :7455
No_Of_Years_At_Residence No_Of_Years_In_City No_Of_Years_At_Business
Min. : 0.00 Min. :0.000e+00 Min. : NA
1st Qu.: 5.00 1st Qu.:5.000e+00 1st Qu.: NA
Median :10.00 Median :1.000e+01 Median : NA
Mean :13.93 Mean :4.361e+05 Mean :NaN
3rd Qu.:20.00 3rd Qu.:2.000e+01 3rd Qu.: NA
Max. :69.00 Max. :9.291e+09 Max. : NA
NA's :2723 NA's :354 NA's :21662
Segment_Desc Cost_Of_Vehicle Average_Bank_Balance
Length:21662 Min. : 48000 Min. :-1.798e+07
Class :character 1st Qu.: 350000 1st Qu.: 6.307e+03
Mode :character Median : 506730 Median : 1.704e+04
Mean : 771360 Mean : 2.226e+06
3rd Qu.: 800000 3rd Qu.: 5.069e+04
Max. :504486334 Max. : 1.864e+10
NA's :2 NA's :12689
cibil_score Disbursed IRR MAX_OD_F_12M
Min. : -1.0 Length:21662 Min. :10.69 Min. : 0.000
1st Qu.:597.0 Class :character 1st Qu.:15.71 1st Qu.: 0.000
Median :757.0 Mode :character Median :17.51 Median : 0.000
Mean :606.2 Mean :18.23 Mean : 0.349
3rd Qu.:796.0 3rd Qu.:20.60 3rd Qu.: 0.000
Max. :897.0 Max. :26.50 Max. :12.000
NA's :8 NA's :13356
TOTAL_NTECH_BNC_F12M
Min. : 0.000
1st Qu.: 0.000
Median : 0.000
Mean : 1.259
3rd Qu.: 1.000
Max. :12.000
NA's :13356
str(dta)
'data.frame': 21662 obs. of 34 variables:
$ ID : num 1 2 3 4 5 6 7 8 9 10 ...
$ Requested_Amount : num 200000 1900000 1600000 300000 250000 150000 430000 405000 276000 234000 ...
$ Emi_Amount : num 5750 NA NA 9050 8750 ...
$ Age : num 40.5 41.2 39.5 35 34 ...
$ Applicant_Postal_Code : num 400091 641002 641002 390019 390023 ...
$ Duration_Of_Current_Emp : num 5 NA NA 16 NA NA NA NA NA NA ...
$ Product_Name1 : chr "CHEVROLET SPARK" "FORTUNER 4 WD" NA "NISSAN MICRA" ...
$ Product_Line : chr "CHEVROLET SPARK LT 1.0 BS-IV OBDII" "TOYOTA - FORTUNER 4 WD" "TOYOTA - FORTUNER 3.0 L" "NISSAN MICRA" ...
$ Loan_Term : num 48 48 48 48 36 36 36 36 36 36 ...
$ Variant_Code : chr "LT 1.0 BS-IV OBDII" "FORTUNER 4 WD" "FORTUNER 3.0 L" "NISSAN MICRA XL" ...
$ Manufacturer_Desc : chr "CHEVROLET INDIA LTD" "TOYOTA" "TOYOTA" "NISSAN" ...
$ Gender_Desc : chr "Male" "Male" "Male" "Male" ...
$ Marital_Status_Desc : chr NA "Married" "Married" "Married" ...
$ Applicant_State_Desc : chr "MAHARASHTRA" "TAMILNADU" "TAMILNADU" "GUJRAT" ...
$ Applicant_City_Desc : chr "MUMBAI" "COIMBATORE" "COIMBATORE" "BARODA" ...
$ Resid_Owned_By_Desc : chr "SELF" "SELF" "SELF" "SELF" ...
$ Employment_Type_Desc : chr "SALARIED" "RETIRED" "RETIRED" "SALARIED" ...
$ Total_Work_Experience : num 5 NA NA 16 NA NA NA NA NA NA ...
$ application_creation_date: num 42112 41753 41755 41277 41287 ...
$ Ex_Showroom_Price : num NA NA NA 365750 548750 ...
$ Segment : chr "A2" "SUV" "SUV" "A2" ...
$ Current.Valuation : num NA NA NA 365750 548750 ...
$ Years_In_Current_Business: num NA NA NA NA 3 6 15 10 5 5 ...
$ No_Of_Years_At_Residence : num NA NA NA 7 2 25 40 9 10 25 ...
$ No_Of_Years_In_City : num 35 1 1 7 2 25 40 9 10 25 ...
$ No_Of_Years_At_Business : num NA NA NA NA NA NA NA NA NA NA ...
$ Segment_Desc : chr "Compact" "SUV" "SUV" "Compact" ...
$ Cost_Of_Vehicle : num 237342 2400000 2200000 365750 548750 ...
$ Average_Bank_Balance : num NA NA NA NA NA NA NA NA NA NA ...
$ cibil_score : num 594 754 779 818 853 777 817 789 828 837 ...
$ Disbursed : chr "No" "Yes" "Yes" "Yes" ...
$ IRR : num 24.7 24.7 16.5 18.1 17.7 ...
$ MAX_OD_F_12M : num NA 0 0 0 0 0 0 0 0 0 ...
$ TOTAL_NTECH_BNC_F12M : num NA 0 0 0 1 0 0 0 0 0 ...
dta$IRRcut <- cut(dta$IRR,breaks = c(0,10,15,20,25,30))
Every sample has more than 2 missing, therefore we can’t merely use complete data. Therefore we use those variables with missing rate <0.25
na.omit(dta)
[1] ID Requested_Amount
[3] Emi_Amount Age
[5] Applicant_Postal_Code Duration_Of_Current_Emp
[7] Product_Name1 Product_Line
[9] Loan_Term Variant_Code
[11] Manufacturer_Desc Gender_Desc
[13] Marital_Status_Desc Applicant_State_Desc
[15] Applicant_City_Desc Resid_Owned_By_Desc
[17] Employment_Type_Desc Total_Work_Experience
[19] application_creation_date Ex_Showroom_Price
[21] Segment Current.Valuation
[23] Years_In_Current_Business No_Of_Years_At_Residence
[25] No_Of_Years_In_City No_Of_Years_At_Business
[27] Segment_Desc Cost_Of_Vehicle
[29] Average_Bank_Balance cibil_score
[31] Disbursed IRR
[33] MAX_OD_F_12M TOTAL_NTECH_BNC_F12M
[35] IRRcut
<0 rows> (or 0-length row.names)
is.na(dta) %>% apply(.,1,sum) %>% `<`(.,2) %>% sum
[1] 0
less_missing_variables <- is.na(dta) %>% apply(.,2,sum) %>% `<`(.,ncol(dta)*0.25)
less_missing_dta <- dta[,less_missing_variables]
dta <- less_missing_dta
str(dta)
'data.frame': 21662 obs. of 17 variables:
$ ID : num 1 2 3 4 5 6 7 8 9 10 ...
$ Requested_Amount : num 200000 1900000 1600000 300000 250000 150000 430000 405000 276000 234000 ...
$ Age : num 40.5 41.2 39.5 35 34 ...
$ Applicant_Postal_Code : num 400091 641002 641002 390019 390023 ...
$ Product_Line : chr "CHEVROLET SPARK LT 1.0 BS-IV OBDII" "TOYOTA - FORTUNER 4 WD" "TOYOTA - FORTUNER 3.0 L" "NISSAN MICRA" ...
$ Loan_Term : num 48 48 48 48 36 36 36 36 36 36 ...
$ Variant_Code : chr "LT 1.0 BS-IV OBDII" "FORTUNER 4 WD" "FORTUNER 3.0 L" "NISSAN MICRA XL" ...
$ Manufacturer_Desc : chr "CHEVROLET INDIA LTD" "TOYOTA" "TOYOTA" "NISSAN" ...
$ Gender_Desc : chr "Male" "Male" "Male" "Male" ...
$ Applicant_State_Desc : chr "MAHARASHTRA" "TAMILNADU" "TAMILNADU" "GUJRAT" ...
$ Employment_Type_Desc : chr "SALARIED" "RETIRED" "RETIRED" "SALARIED" ...
$ application_creation_date: num 42112 41753 41755 41277 41287 ...
$ Cost_Of_Vehicle : num 237342 2400000 2200000 365750 548750 ...
$ cibil_score : num 594 754 779 818 853 777 817 789 828 837 ...
$ Disbursed : chr "No" "Yes" "Yes" "Yes" ...
$ IRR : num 24.7 24.7 16.5 18.1 17.7 ...
$ IRRcut : Factor w/ 5 levels "(0,10]","(10,15]",..: 4 4 3 3 3 2 2 2 2 2 ...
rabefore <-ggplot(dta, aes(x=Requested_Amount,y = IRR))+ layer(
geom = "point",
stat = "identity",
position = "identity") +theme_classic()
(dta$Requested_Amount > 10^8) %>% which
[1] 369 1397 1751 11802 17098
dta <- dta[-c((dta$Requested_Amount > 10^8) %>% which),]
rabafter <- ggplot(dta, aes(x=Requested_Amount/10000,y=IRR,color=Disbursed))+
geom_point(stat="identity")+
geom_smooth(method='lm')+
theme_classic()
ggarrange(rabefore, rabafter,
labels = c("Requested Amount(with outliers)", "Requested Amount(outliers removed)"),
ncol = 1, nrow = 2)
ggplot(dta, aes(y = IRR,x = Age,color=Disbursed))+
geom_smooth(method="lm",na.rm = TRUE)+
theme_classic()
unique(dta$Product_Line) %>% length
[1] 1443
#to get the short product line (first word) of the car
position <- gregexpr(" ",dta$Product_Line)
endposition <- sapply(position, function(x) x[[1]][1])
dta$Product_Line_group <- substring(dta$Product_Line,first = 1,last = endposition-1)
unique(dta$Product_Line_group) %>% length
[1] 67
plgfill <- ggplot(data=dta, aes(x=Product_Line_group, y=IRR, fill=IRRcut)) +
geom_bar(stat="identity", position="fill")+
theme_classic()+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
plgstack <-ggplot(data=dta, aes(x=Product_Line_group, y=IRR, fill=IRRcut)) +
geom_bar(stat="identity", position="stack")+
theme_classic()+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
ggarrange(plgfill, plgstack,
labels = c("P. Line(Partiton)", "P. Line(count)"),
ncol = 1, nrow = 2)
lt <- ggplot(data=dta,aes(x=Loan_Term,y=IRR)) +
geom_point()+
theme_classic()
(dta$Loan_Term > 250) %>% which
[1] 3325
dta <- dta[-c((dta$Loan_Term > 250) %>% which),]
lt2 <-ggplot(data=dta,aes(x=Loan_Term,y=IRR,color=Disbursed)) +
geom_point()+
geom_smooth(method='lm')+
theme_classic()
ggarrange(lt, lt2,
labels = c("Loan Term(with outliers)", "Loan Term(outliers removed)"),
ncol = 1, nrow = 2)
unique(dta$Variant_Code) %>% length
[1] 1259
#to get the brand name (first word) of the car
position <- gregexpr(" ",dta$Variant_Code)
endposition <- sapply(position, function(x) x[[1]][1])
dta$Variant_Code_Brand <- substring(dta$Variant_Code,first = 1,last = endposition-1)
unique(dta$Variant_Code_Brand) %>% length
[1] 291
Variant_Code_Brand1 <- ggplot(data=dta, aes(x=Variant_Code_Brand, y=IRR, fill=IRRcut)) +
geom_bar(stat="identity", position="fill")+
theme_classic()+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
Variant_Code_Brand2 <- ggplot(data=dta, aes(x=Variant_Code_Brand, y=IRR, fill=IRRcut)) +
geom_bar(stat="identity", position="stack")+
theme_classic()+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
ggarrange(Variant_Code_Brand1, Variant_Code_Brand2,
labels = c("Variant Code (Partition)", "Variant Code (counts)"),
ncol = 1, nrow = 2)
unique(dta$Manufacturer_Desc) %>% length
[1] 33
manufacturer1 <- ggplot(data=dta, aes(x=Manufacturer_Desc, y=IRR, fill=IRRcut)) +
geom_bar(stat="identity", position="fill")+
theme_classic()+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
manufacturer2 <- ggplot(data=dta, aes(x=Manufacturer_Desc, y=IRR, fill=IRRcut)) +
geom_bar(stat="identity", position="stack")+
theme_classic()+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
ggarrange(manufacturer1, manufacturer2,
labels = c("Manufacturer (Counts)", "Manufacturer (Partition)"),
ncol = 1, nrow = 2)
ggplot(data=dta, aes(x=Gender_Desc, y=IRR, fill=IRRcut)) +
geom_bar(stat="identity", position="stack")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
asd1 <- ggplot(data=dta, aes(x=Applicant_State_Desc, y=IRR, fill=IRRcut)) +
geom_bar(stat="identity", position="stack")+
theme_classic()+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
asd2 <- ggplot(data=dta, aes(x=Applicant_State_Desc, y=IRR, fill=IRRcut)) +
geom_bar(stat="identity", position="fill")+
theme_classic()+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
ggarrange(asd2, asd1,
labels = c("Applicant State(count)", "Applicant State(fill)"),
ncol = 1, nrow = 2)
cost1 <- ggplot(data=dta, aes(x=Cost_Of_Vehicle , y=IRR)) +
geom_point(stat="identity")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
dta <- dta[-c((dta$Cost_Of_Vehicle> 10^8) %>% which),]
cost2 <- ggplot(data=dta, aes(x=Cost_Of_Vehicle , y=IRR)) +
geom_point(stat="identity")+
geom_smooth(method='lm')+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
dta2 <- dta[-c((dta$Cost_Of_Vehicle> 10^7) %>% which),]
cost3 <-ggplot(data=dta2, aes(x=Cost_Of_Vehicle , y=IRR,color=Disbursed)) +
geom_point(stat="identity")+
geom_smooth(method='lm')+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggarrange(cost1, cost2, cost3,
labels = c("Cost of Vihicle", "Cost of Vihicle","Cost of Vihicle"),ncol = 1, nrow = 3)
ggplot(data=dta, aes(x=Employment_Type_Desc, y=IRR, color=IRRcut)) +
geom_bar(stat="identity", position="stack")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(data=dta, aes(x=cibil_score, y=IRR, color=Disbursed)) +
geom_point(stat="identity")+
geom_smooth(method = "lm")+
theme_classic()
3. Generate IRR model using variables: Requested Amount/Age/Loan_Term/Gender_Desc/Employment_Type_Desc/Cost_Of_Vehicle/cibil_score
## set a regression model for IRR
IRRdta <- dta[,c("Requested_Amount",
"Age",
"Loan_Term",
"Gender_Desc",
"Cost_Of_Vehicle",
"cibil_score",
"IRR")]
IRRdta$cibil_score[(IRRdta$cibil_score == -1)] <- NA
IRRdta$Requested_Amount <- IRRdta$Requested_Amount/100000
IRRdta$Cost_Of_Vehicle <- IRRdta$Cost_Of_Vehicle/100000
IRRdta <- na.omit(IRRdta)
IRRfit <- lm(IRR~.,IRRdta)
summary(IRRfit)
Call:
lm(formula = IRR ~ ., data = IRRdta)
Residuals:
Min 1Q Median 3Q Max
-7.8707 -2.5235 -0.6947 2.3566 8.7315
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 19.013434 0.227452 83.593 < 2e-16 ***
Requested_Amount -0.009175 0.005069 -1.810 0.0703 .
Age -0.005278 0.002812 -1.877 0.0605 .
Loan_Term 0.006240 0.002479 2.517 0.0118 *
Gender_DescMale -0.035151 0.090426 -0.389 0.6975
Cost_Of_Vehicle 0.002626 0.003076 0.854 0.3933
cibil_score -0.001135 0.000192 -5.913 3.42e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.699 on 17993 degrees of freedom
Multiple R-squared: 0.002955, Adjusted R-squared: 0.002622
F-statistic: 8.887 on 6 and 17993 DF, p-value: 1.045e-09
IRRfit_simple <- lm(IRR ~ Requested_Amount +
Age+
Loan_Term+
cibil_score, data= IRRdta)
summary(IRRfit_simple)
Call:
lm(formula = IRR ~ Requested_Amount + Age + Loan_Term + cibil_score,
data = IRRdta)
Residuals:
Min 1Q Median 3Q Max
-7.8733 -2.5242 -0.6962 2.3650 8.8337
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18.9801656 0.2070477 91.671 < 2e-16 ***
Requested_Amount -0.0065412 0.0040293 -1.623 0.1045
Age -0.0052105 0.0028081 -1.856 0.0635 .
Loan_Term 0.0063040 0.0024775 2.545 0.0110 *
cibil_score -0.0011348 0.0001919 -5.914 3.39e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.699 on 17995 degrees of freedom
Multiple R-squared: 0.002905, Adjusted R-squared: 0.002683
F-statistic: 13.11 on 4 and 17995 DF, p-value: 1.163e-10
Disburseddta <- dta[,c("Requested_Amount",
"Age",
"Loan_Term",
"Gender_Desc",
"Cost_Of_Vehicle",
"cibil_score",
"IRR",
"Disbursed")]
Disburseddta$cibil_score[(Disburseddta$cibil_score == -1)] <- NA
Disburseddta$Requested_Amount <- Disburseddta$Requested_Amount/100000
Disburseddta$Cost_Of_Vehicle <- Disburseddta$Cost_Of_Vehicle/100000
Disburseddta$Disbursed <- as.factor(Disburseddta$Disbursed)
Disburseddta <- na.omit(Disburseddta)
disbursefit <- glm(Disbursed~.,
family = binomial,
data = Disburseddta)
summary(disbursefit)
Call:
glm(formula = Disbursed ~ ., family = binomial, data = Disburseddta)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7749 -0.9962 -0.6285 1.2185 2.6554
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.0996547 0.1732910 6.346 2.21e-10 ***
Requested_Amount 0.0027963 0.0033042 0.846 0.3974
Age 0.0082572 0.0016302 5.065 4.08e-07 ***
Loan_Term -0.0120967 0.0014543 -8.318 < 2e-16 ***
Gender_DescMale 0.0327846 0.0527666 0.621 0.5344
Cost_Of_Vehicle -0.0049477 0.0024017 -2.060 0.0394 *
cibil_score 0.0022760 0.0001465 15.532 < 2e-16 ***
IRR -0.1689325 0.0048179 -35.063 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 24109 on 17999 degrees of freedom
Residual deviance: 22212 on 17992 degrees of freedom
AIC: 22228
Number of Fisher Scoring iterations: 4