library(readxl)
mortgage<-read_excel("d:/ds/mortgage.xlsx")
x<-mortgage
The data set mortgage is in panel form and reports origination and performance observations for 50,000 residential U.S. mortgage borrowers over 60 periods.
str(mortgage)
## Classes 'tbl_df', 'tbl' and 'data.frame': 622489 obs. of 23 variables:
## $ id : num 1 1 1 1 1 1 1 1 1 1 ...
## $ time : num 25 26 27 28 29 30 31 32 33 34 ...
## $ orig_time : num -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 ...
## $ first_time : num 25 25 25 25 25 25 25 25 25 25 ...
## $ mat_time : num 113 113 113 113 113 113 113 113 113 113 ...
## $ balance_time : num 41303 41062 40804 40484 40367 ...
## $ LTV_time : num 24.5 24.5 24.6 24.7 24.9 ...
## $ interest_rate_time : num 9.2 9.2 9.2 9.2 9.2 9.2 9.2 9.2 9.2 9.2 ...
## $ hpi_time : num 226 225 222 220 217 ...
## $ gdp_time : num 2.9 2.15 2.36 1.23 1.69 ...
## $ uer_time : num 4.7 4.7 4.4 4.6 4.5 4.7 4.7 5 5 5.8 ...
## $ REtype_CO_orig_time : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REtype_PU_orig_time : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REtype_SF_orig_time : num 1 1 1 1 1 1 1 1 1 1 ...
## $ investor_orig_time : num 0 0 0 0 0 0 0 0 0 0 ...
## $ balance_orig_time : num 45000 45000 45000 45000 45000 45000 45000 45000 45000 45000 ...
## $ FICO_orig_time : num 715 715 715 715 715 715 715 715 715 715 ...
## $ LTV_orig_time : num 69.4 69.4 69.4 69.4 69.4 69.4 69.4 69.4 69.4 69.4 ...
## $ Interest_Rate_orig_time: num 9.2 9.2 9.2 9.2 9.2 9.2 9.2 9.2 9.2 9.2 ...
## $ hpi_orig_time : num 87 87 87 87 87 ...
## $ default_time : num 0 0 0 0 0 0 0 0 0 0 ...
## $ payoff_time : num 0 0 0 0 0 0 0 0 0 0 ...
## $ status_time : num 0 0 0 0 0 0 0 0 0 0 ...
x$status_time<-as.factor(x$status_time)
x<-x[,-1]
x<-subset(x,select = -c(default_time,payoff_time))
summary(mortgage)
## id time orig_time first_time
## Min. : 1 Min. : 1.0 Min. :-40.00 Min. : 1.00
## 1st Qu.:13580 1st Qu.:27.0 1st Qu.: 18.00 1st Qu.:21.00
## Median :24881 Median :34.0 Median : 22.00 Median :25.00
## Mean :25147 Mean :35.8 Mean : 20.57 Mean :24.61
## 3rd Qu.:37045 3rd Qu.:44.0 3rd Qu.: 25.00 3rd Qu.:28.00
## Max. :50000 Max. :60.0 Max. : 60.00 Max. :60.00
##
## mat_time balance_time LTV_time interest_rate_time
## Min. : 18.0 Min. : 0 Min. : 0.00 Min. : 0.000
## 1st Qu.:137.0 1st Qu.: 102017 1st Qu.: 67.11 1st Qu.: 5.650
## Median :142.0 Median : 180618 Median : 82.25 Median : 6.625
## Mean :137.2 Mean : 245965 Mean : 83.08 Mean : 6.702
## 3rd Qu.:145.0 3rd Qu.: 337495 3rd Qu.:100.63 3rd Qu.: 7.875
## Max. :229.0 Max. :8701859 Max. :803.51 Max. :37.500
## NA's :270
## hpi_time gdp_time uer_time REtype_CO_orig_time
## Min. :107.8 Min. :-4.147 Min. : 3.800 Min. :0.0000
## 1st Qu.:158.6 1st Qu.: 1.104 1st Qu.: 4.700 1st Qu.:0.0000
## Median :180.5 Median : 1.851 Median : 5.700 Median :0.0000
## Mean :184.1 Mean : 1.381 Mean : 6.517 Mean :0.0676
## 3rd Qu.:212.7 3rd Qu.: 2.694 3rd Qu.: 8.200 3rd Qu.:0.0000
## Max. :226.3 Max. : 5.132 Max. :10.000 Max. :1.0000
##
## REtype_PU_orig_time REtype_SF_orig_time investor_orig_time
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.1248 Mean :0.6121 Mean :0.1382
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## balance_orig_time FICO_orig_time LTV_orig_time
## Min. : 0 Min. :400.0 Min. : 50.10
## 1st Qu.: 108000 1st Qu.:626.0 1st Qu.: 75.00
## Median : 188000 Median :678.0 Median : 80.00
## Mean : 256254 Mean :673.6 Mean : 78.98
## 3rd Qu.: 352000 3rd Qu.:729.0 3rd Qu.: 80.00
## Max. :8000000 Max. :840.0 Max. :218.50
##
## Interest_Rate_orig_time hpi_orig_time default_time
## Min. : 0.000 Min. : 75.71 Min. :0.00000
## 1st Qu.: 5.000 1st Qu.:179.45 1st Qu.:0.00000
## Median : 6.290 Median :216.77 Median :0.00000
## Mean : 5.650 Mean :198.12 Mean :0.02435
## 3rd Qu.: 7.456 3rd Qu.:222.39 3rd Qu.:0.00000
## Max. :19.750 Max. :226.29 Max. :1.00000
##
## payoff_time status_time
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.04271 Mean :0.1098
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :2.0000
##
barplot(table(mortgage$status_time))
Customers who are nondefault/non payoff are more compared to other types of categories.
counts<-table(x$status_time,x$uer_time)
barplot(counts,legend=rownames(counts),col = c("red","yellow","green"),main = "Unemployee Rate vs Status of payment")
counts1<-table(x$status_time,x$gdp_time)
barplot(counts,legend=rownames(counts),col = c("red","yellow","green"),main = "GDP Rate vs Status of payment")
counts2<-table(x$status_time,x$FICO_orig_time)
barplot(counts,legend=rownames(counts),col = c("red","yellow","green"),main = "Fico score vs Status of payment")
Spliting the data
x1<-x[1:200000,]
x2<-x[200000:300000,]
x3<-x[300000:400000,]
x4<-x[400000:500000,]
x5<-x[500000:nrow(x),]
x1[,1:19]<-scale(x1[,1:19])
x2[,1:19]<-scale(x2[,1:19])
x3[,1:19]<-scale(x3[,1:19])
x4[,1:19]<-scale(x4[,1:19])
x5[,1:19]<-scale(x5[,1:19])
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
control=trainControl(method = "cv",number = 5)
model<-train(status_time ~ .,data = x1,method="multinom",trControl=control)
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 117690.876959
## iter 20 value 111521.847778
## iter 30 value 106926.957288
## iter 40 value 98518.438488
## iter 50 value 55866.153139
## iter 60 value 46565.550544
## iter 70 value 46208.753454
## final value 46207.948382
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 117740.960423
## iter 20 value 111597.549586
## iter 30 value 106975.411273
## iter 40 value 98767.562989
## iter 50 value 61593.640048
## iter 60 value 46767.326766
## iter 70 value 46211.101584
## final value 46210.750433
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 117690.927049
## iter 20 value 111521.923354
## iter 30 value 106926.984359
## iter 40 value 98518.208306
## iter 50 value 55841.353628
## iter 60 value 46384.378006
## iter 70 value 46208.082191
## final value 46207.951417
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 110888.945636
## iter 20 value 101861.623767
## iter 30 value 100028.696660
## iter 40 value 95952.750229
## iter 50 value 60108.524981
## iter 60 value 47052.286577
## iter 70 value 46175.669135
## iter 80 value 46169.229558
## final value 46169.227919
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 110950.414089
## iter 20 value 101997.723881
## iter 30 value 100189.579115
## iter 40 value 96131.193898
## iter 50 value 55493.201632
## iter 60 value 46377.944428
## iter 70 value 46172.261332
## final value 46172.019925
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 110889.007102
## iter 20 value 101861.739575
## iter 30 value 100028.820550
## iter 40 value 95952.752790
## iter 50 value 60116.678889
## iter 60 value 47058.110064
## iter 70 value 46175.672160
## iter 80 value 46169.232393
## final value 46169.230713
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 111424.500973
## iter 20 value 105364.002812
## iter 30 value 103475.015946
## iter 40 value 96206.289673
## iter 50 value 58035.527924
## iter 60 value 46339.993088
## iter 70 value 46160.937776
## final value 46160.782776
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 111480.226589
## iter 20 value 105450.235206
## iter 30 value 103630.271884
## iter 40 value 96007.185391
## iter 50 value 58863.266324
## iter 60 value 46345.744843
## iter 70 value 46163.621625
## final value 46163.574833
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 111424.556697
## iter 20 value 105364.088965
## iter 30 value 103475.260533
## iter 40 value 96208.001796
## iter 50 value 58316.310461
## iter 60 value 46345.526109
## iter 70 value 46160.905410
## final value 46160.785542
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 116147.356849
## iter 20 value 110738.399205
## iter 30 value 108050.141196
## iter 40 value 98253.518494
## iter 50 value 55745.348724
## iter 60 value 46251.094576
## iter 70 value 46191.551577
## final value 46191.477926
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 116204.864313
## iter 20 value 110810.163833
## iter 30 value 108025.737495
## iter 40 value 98704.629967
## iter 50 value 64429.459243
## iter 60 value 46302.296799
## iter 70 value 46194.507669
## final value 46194.280434
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 116147.414361
## iter 20 value 110738.470922
## iter 30 value 108050.144674
## iter 40 value 98242.341047
## iter 50 value 53759.077833
## iter 60 value 46281.202170
## iter 70 value 46191.810311
## final value 46191.480702
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 109288.134648
## iter 20 value 102747.725951
## iter 30 value 101151.829455
## iter 40 value 94445.135253
## iter 50 value 53481.543426
## iter 60 value 46083.408000
## iter 70 value 46040.322733
## final value 46040.251039
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 109343.572294
## iter 20 value 102813.001733
## iter 30 value 101122.787652
## iter 40 value 93502.485671
## iter 50 value 55580.084030
## iter 60 value 46101.674570
## iter 70 value 46043.276235
## final value 46043.144370
## converged
## # weights: 63 (40 variable)
## initial value 175777.966187
## iter 10 value 109288.190083
## iter 20 value 102747.790970
## iter 30 value 101151.785904
## iter 40 value 94444.495152
## iter 50 value 53501.745176
## iter 60 value 46085.813447
## iter 70 value 46040.326692
## final value 46040.253820
## converged
## # weights: 63 (40 variable)
## initial value 219722.457735
## iter 10 value 58258.532496
## iter 20 value 58211.913884
## iter 30 value 58174.495999
## iter 40 value 58026.785025
## final value 57700.066127
## converged
model
## Penalized Multinomial Regression
##
## 200000 samples
## 19 predictor
## 3 classes: '0', '1', '2'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 160000, 160000, 160000, 160000, 160000
## Resampling results across tuning parameters:
##
## decay Accuracy Kappa
## 0e+00 0.92677 0.0004110008
## 1e-04 0.92677 0.0004110008
## 1e-01 0.92677 0.0004110008
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was decay = 0.1.
pred<-predict(model,x2)
confusionMatrix(x2$status_time,pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 94346 1 3
## 1 2038 0 0
## 2 3609 0 4
##
## Overall Statistics
##
## Accuracy : 0.9435
## 95% CI : (0.942, 0.9449)
## No Information Rate : 0.9999
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0013
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.9435260 0.00000 0.571429
## Specificity 0.5000000 0.97962 0.963908
## Pos Pred Value 0.9999576 0.00000 0.001107
## Neg Pred Value 0.0007078 0.99999 0.999969
## Prevalence 0.9999200 0.00001 0.000070
## Detection Rate 0.9434506 0.00000 0.000040
## Detection Prevalence 0.9434906 0.02038 0.036130
## Balanced Accuracy 0.7217630 0.48981 0.767668
x1<-x[1:200000,]
x2<-x[200000:300000,]
x3<-x[300000:400000,]
x4<-x[400000:500000,]
x5<-x[500000:nrow(x),]
library(C50)
## Warning: package 'C50' was built under R version 3.5.1
model2<-C5.0(status_time ~ .,data=x1)
pred2<-predict(model2,x2)
confusionMatrix(x2$status_time,pred2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 94306 0 44
## 1 2037 0 1
## 2 3569 0 44
##
## Overall Statistics
##
## Accuracy : 0.9435
## 95% CI : (0.942, 0.9449)
## No Information Rate : 0.9991
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0141
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.943891 NA 0.49438
## Specificity 0.505618 0.97962 0.96428
## Pos Pred Value 0.999534 NA 0.01218
## Neg Pred Value 0.007963 NA 0.99953
## Prevalence 0.999110 0.00000 0.00089
## Detection Rate 0.943051 0.00000 0.00044
## Detection Prevalence 0.943491 0.02038 0.03613
## Balanced Accuracy 0.724754 NA 0.72933
library(caret)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
model4<-train(status_time ~ .,data=x1,method="rf",ntree=20)
pred4<-predict(model4,x2)
confusionMatrix(x2$status_time,pred4)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 94250 1 99
## 1 2036 0 2
## 2 3562 0 51
##
## Overall Statistics
##
## Accuracy : 0.943
## 95% CI : (0.9415, 0.9444)
## No Information Rate : 0.9985
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0155
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.943935 0.00000 0.33553
## Specificity 0.346405 0.97962 0.96433
## Pos Pred Value 0.998940 0.00000 0.01412
## Neg Pred Value 0.009379 0.99999 0.99895
## Prevalence 0.998470 0.00001 0.00152
## Detection Rate 0.942491 0.00000 0.00051
## Detection Prevalence 0.943491 0.02038 0.03613
## Balanced Accuracy 0.645170 0.48981 0.64993
pred5<-predict(model4,x3)
confusionMatrix(x3$status_time,pred5)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 92579 4 154
## 1 2777 0 11
## 2 4407 1 68
##
## Overall Statistics
##
## Accuracy : 0.9265
## 95% CI : (0.9248, 0.9281)
## No Information Rate : 0.9976
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0161
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.92799 0.00000 0.29185
## Specificity 0.33613 0.97212 0.95582
## Pos Pred Value 0.99830 0.00000 0.01519
## Neg Pred Value 0.01101 0.99995 0.99827
## Prevalence 0.99762 0.00005 0.00233
## Detection Rate 0.92578 0.00000 0.00068
## Detection Prevalence 0.92736 0.02788 0.04476
## Balanced Accuracy 0.63206 0.48606 0.62383
pred6<-predict(model4,x4)
confusionMatrix(x4$status_time,pred6)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 94125 1 19
## 1 2029 0 1
## 2 3798 0 28
##
## Overall Statistics
##
## Accuracy : 0.9415
## 95% CI : (0.94, 0.943)
## No Information Rate : 0.9995
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0089
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.941702 0.00000 0.583333
## Specificity 0.591837 0.97970 0.962002
## Pos Pred Value 0.999788 0.00000 0.007318
## Neg Pred Value 0.004952 0.99999 0.999792
## Prevalence 0.999510 0.00001 0.000480
## Detection Rate 0.941241 0.00000 0.000280
## Detection Prevalence 0.941441 0.02030 0.038260
## Balanced Accuracy 0.766769 0.48985 0.772668
This model is very consistent through out the data and has accuracy of 100% (It might be over fitted, so we need new data to check it once again)
Although it predicting the status of customers who is going to payoff/default/none.