library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
library(class)
## Warning: package 'class' was built under R version 3.5.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(Metrics)
## Warning: package 'Metrics' was built under R version 3.5.3
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(pROC)
## Warning: package 'pROC' was built under R version 3.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:Metrics':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
Reading the data
# Load the training and test sets from the working directory.
# stringsAsFactors is set explicitly because later cleaning steps call
# levels() on text columns (e.g. origination_date), which requires factors.
# Since R 4.0.0 read.csv() no longer converts strings to factors by default,
# so relying on the default would break those steps on current R versions.
Train_data <- read.csv("train.csv", stringsAsFactors = TRUE)
Test_data <- read.csv("test.csv", stringsAsFactors = TRUE)
Looking at data
# Announce and list the column names of the training set.
print("The column names of Train data are:")
## [1] "The column names of Train data are:"
colnames(Train_data)
## [1] "loan_id" "source"
## [3] "financial_institution" "interest_rate"
## [5] "unpaid_principal_bal" "loan_term"
## [7] "origination_date" "first_payment_date"
## [9] "loan_to_value" "number_of_borrowers"
## [11] "debt_to_income_ratio" "borrower_credit_score"
## [13] "loan_purpose" "insurance_percent"
## [15] "co.borrower_credit_score" "insurance_type"
## [17] "m1" "m2"
## [19] "m3" "m4"
## [21] "m5" "m6"
## [23] "m7" "m8"
## [25] "m9" "m10"
## [27] "m11" "m12"
## [29] "m13"
# Announce and list the column names of the test set.
print("The column names of Test data are:")
## [1] "The column names of Test data are:"
colnames(Test_data)
## [1] "loan_id" "source"
## [3] "financial_institution" "interest_rate"
## [5] "unpaid_principal_bal" "loan_term"
## [7] "origination_date" "first_payment_date"
## [9] "loan_to_value" "number_of_borrowers"
## [11] "debt_to_income_ratio" "borrower_credit_score"
## [13] "loan_purpose" "insurance_percent"
## [15] "co.borrower_credit_score" "insurance_type"
## [17] "m1" "m2"
## [19] "m3" "m4"
## [21] "m5" "m6"
## [23] "m7" "m8"
## [25] "m9" "m10"
## [27] "m11" "m12"
Looking at Dimensions
# Report the number of rows and columns of each raw data set.
# (Fixes the "od" -> "of" typo in the printed message.)
print("Dimensions of Train data:")
## [1] "Dimensions of Train data:"
dim(Train_data)
## [1] 116058 29
print("Dimensions of Test data")
## [1] "Dimensions of Test data"
dim(Test_data)
## [1] 35866 28
Looks like m13 is our response variable. Adding variables so that we can combine our Test and Train data.
# Give the test set a placeholder response column so both frames share the
# same columns, then tag every row with the set it came from.
Test_data[["m13"]] <- NA
Test_data[["Set"]] <- "Test"
Train_data[["Set"]] <- "Train"
Combining the Test and Train data for better cleaning
# Stack train on top of test so cleaning is applied to both at once.
data <- do.call(rbind, list(Train_data, Test_data))
Looking at the structure first
# Compact overview of column types in the combined data.
str(object = data)
## 'data.frame': 151924 obs. of 30 variables:
## $ loan_id : num 2.68e+11 6.73e+11 7.43e+11 6.01e+11 2.74e+11 ...
## $ source : Factor w/ 3 levels "X","Y","Z": 3 2 3 1 1 2 1 2 1 1 ...
## $ financial_institution : Factor w/ 19 levels "Anderson-Taylor",..: 19 16 18 9 9 2 9 2 2 9 ...
## $ interest_rate : num 4.25 4.88 3.25 4.75 4.75 ...
## $ unpaid_principal_bal : int 214000 144000 366000 135000 124000 150000 59000 319000 520000 214000 ...
## $ loan_term : int 360 360 180 360 360 360 360 300 360 360 ...
## $ origination_date : Factor w/ 6 levels "2012-01-01","2012-02-01",..: 3 1 1 2 2 2 2 1 3 1 ...
## $ first_payment_date : Factor w/ 8 levels "02/2012","03/2012",..: 4 2 2 3 3 3 3 2 4 2 ...
## $ loan_to_value : int 95 72 49 46 80 80 95 62 76 95 ...
## $ number_of_borrowers : num 1 1 1 2 1 1 1 1 1 2 ...
## $ debt_to_income_ratio : num 22 44 33 44 43 46 44 45 35 41 ...
## $ borrower_credit_score : num 694 697 780 633 681 675 723 652 808 702 ...
## $ loan_purpose : Factor w/ 3 levels "A23","B12","C86": 3 2 2 2 3 3 3 1 3 1 ...
## $ insurance_percent : num 30 0 0 0 0 0 30 0 0 30 ...
## $ co.borrower_credit_score: num 0 0 0 638 0 0 0 0 0 700 ...
## $ insurance_type : num 0 0 0 0 0 0 0 0 0 0 ...
## $ m1 : int 0 0 0 0 0 1 0 0 0 0 ...
## $ m2 : int 0 0 0 0 1 0 0 1 0 0 ...
## $ m3 : int 0 0 0 0 2 0 0 0 0 0 ...
## $ m4 : int 0 0 0 0 3 0 0 0 0 0 ...
## $ m5 : int 0 0 0 0 4 0 0 0 1 0 ...
## $ m6 : int 0 0 0 0 5 0 0 0 0 1 ...
## $ m7 : int 1 0 0 0 6 0 0 0 1 1 ...
## $ m8 : int 0 0 0 0 7 0 0 0 0 1 ...
## $ m9 : int 0 0 0 1 8 0 0 0 1 1 ...
## $ m10 : int 0 0 0 1 9 0 0 0 2 1 ...
## $ m11 : int 0 1 0 1 10 0 0 0 0 2 ...
## $ m12 : int 0 0 0 1 11 0 0 0 1 2 ...
## $ m13 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Set : chr "Train" "Train" "Train" "Train" ...
# Share of missing values per column, plotted as a horizontal bar chart.
# colMeans(is.na(.)) replaces the deprecated dplyr::funs() call and the
# superseded tidyr::gather(); it yields the same proportion
# (number of NAs / number of rows) for every column.
missing_share <- colMeans(is.na(data))
missing <- data.frame(
  Feature = names(missing_share),
  Missing = unname(missing_share),
  stringsAsFactors = FALSE
)
# Bars are ordered by the missing share; coord_flip() puts the feature names
# on the vertical axis so they stay readable.
g <- ggplot(data = missing, aes(x = reorder(Feature, -Missing), y = Missing)) +
  geom_col() +
  coord_flip()
g
# Inspect the raw date formats present in origination_date.
levels(data[["origination_date"]])
## [1] "2012-01-01" "2012-02-01" "2012-03-01" "01/01/12" "01/02/12"
## [6] "01/03/12"
Converting levels into same format
# Positional recode: the first three levels are already ISO dates and the
# last three are mapped onto the same three ISO dates, in the same order.
# NOTE(review): this depends on the exact level order shown by levels().
levels(data$origination_date) <- rep(
  c("2012-01-01", "2012-02-01", "2012-03-01"),
  times = 2
)
Looking at it after changing levels
# Confirm that only the three ISO-formatted levels remain.
levels(data[["origination_date"]])
## [1] "2012-01-01" "2012-02-01" "2012-03-01"
Converting to date format
# Parse the ISO date labels into date-time values (strptime returns POSIXlt).
data$origination_date <- strptime(
  as.character(data$origination_date),
  format = "%Y-%m-%d"
)
Converting the parsed date-time to POSIXct
# Store origination_date as POSIXct rather than the POSIXlt from strptime.
data$origination_date <- as.POSIXct(x = data$origination_date)
# Inspect the raw formats present in first_payment_date.
levels(data[["first_payment_date"]])
## [1] "02/2012" "03/2012" "04/2012" "05/2012" "Apr-12" "Feb-12" "Mar-12"
## [8] "May-12"
Converting levels into same format
# Positional recode of the eight raw levels onto first-of-month ISO dates.
# Levels 1-4 are "MM/YYYY" for Feb..May; levels 5-8 are the month-name forms
# in alphabetical order (Apr, Feb, Mar, May), hence the index shuffle.
# NOTE(review): this depends on the exact level order shown by levels().
first_payment_iso <- c("2012-02-01", "2012-03-01", "2012-04-01", "2012-05-01")
levels(data$first_payment_date) <- c(
  first_payment_iso,
  first_payment_iso[c(3, 1, 2, 4)]
)
Looking at levels after changing
# Confirm that only the four ISO-formatted levels remain.
levels(data[["first_payment_date"]])
## [1] "2012-02-01" "2012-03-01" "2012-04-01" "2012-05-01"
Converting into date format
# Parse the ISO date labels into date-time values (strptime returns POSIXlt).
data$first_payment_date <- strptime(
  as.character(data$first_payment_date),
  format = "%Y-%m-%d"
)
Converting the parsed date-time to POSIXct
# Store first_payment_date as POSIXct rather than the POSIXlt from strptime.
data$first_payment_date <- as.POSIXct(data$first_payment_date)

# Paid_After: number of calendar days between origination and first payment.
# Both columns are reduced to calendar dates first and the unit is fixed to
# days. Subtracting the POSIXct values directly lets difftime pick its unit
# automatically and produces fractional days when a daylight-saving change
# falls between the two dates.
data$Paid_After <- as.numeric(difftime(
  as.Date(format(data$first_payment_date, "%Y-%m-%d")),
  as.Date(format(data$origination_date, "%Y-%m-%d")),
  units = "days"
))

# The raw date columns are not used after this point.
data$origination_date <- NULL
data$first_payment_date <- NULL
# List the lender names, then drop the column from the data.
levels(data[["financial_institution"]])
## [1] "Anderson-Taylor" "Browning-Hart"
## [3] "Chapman-Mcmahon" "Cole, Brooks and Vincent"
## [5] "Edwards-Hoffman" "Martinez, Duffy and Bird"
## [7] "Miller, Mcclure and Allen" "Nicholson Group"
## [9] "OTHER" "Richards-Walters"
## [11] "Richardson Ltd" "Romero, Woods and Johnson"
## [13] "Sanchez-Robinson" "Sanchez, Hays and Wilkerson"
## [15] "Suarez Inc" "Swanson, Newton and Miller"
## [17] "Taylor, Hunt and Rodriguez" "Thornton-Davis"
## [19] "Turner, Baldwin and Rhodes"
data[["financial_institution"]] <- NULL
Looking at the variables again
# Column overview after the date and lender clean-up.
str(object = data)
## 'data.frame': 151924 obs. of 28 variables:
## $ loan_id : num 2.68e+11 6.73e+11 7.43e+11 6.01e+11 2.74e+11 ...
## $ source : Factor w/ 3 levels "X","Y","Z": 3 2 3 1 1 2 1 2 1 1 ...
## $ interest_rate : num 4.25 4.88 3.25 4.75 4.75 ...
## $ unpaid_principal_bal : int 214000 144000 366000 135000 124000 150000 59000 319000 520000 214000 ...
## $ loan_term : int 360 360 180 360 360 360 360 300 360 360 ...
## $ loan_to_value : int 95 72 49 46 80 80 95 62 76 95 ...
## $ number_of_borrowers : num 1 1 1 2 1 1 1 1 1 2 ...
## $ debt_to_income_ratio : num 22 44 33 44 43 46 44 45 35 41 ...
## $ borrower_credit_score : num 694 697 780 633 681 675 723 652 808 702 ...
## $ loan_purpose : Factor w/ 3 levels "A23","B12","C86": 3 2 2 2 3 3 3 1 3 1 ...
## $ insurance_percent : num 30 0 0 0 0 0 30 0 0 30 ...
## $ co.borrower_credit_score: num 0 0 0 638 0 0 0 0 0 700 ...
## $ insurance_type : num 0 0 0 0 0 0 0 0 0 0 ...
## $ m1 : int 0 0 0 0 0 1 0 0 0 0 ...
## $ m2 : int 0 0 0 0 1 0 0 1 0 0 ...
## $ m3 : int 0 0 0 0 2 0 0 0 0 0 ...
## $ m4 : int 0 0 0 0 3 0 0 0 0 0 ...
## $ m5 : int 0 0 0 0 4 0 0 0 1 0 ...
## $ m6 : int 0 0 0 0 5 0 0 0 0 1 ...
## $ m7 : int 1 0 0 0 6 0 0 0 1 1 ...
## $ m8 : int 0 0 0 0 7 0 0 0 0 1 ...
## $ m9 : int 0 0 0 1 8 0 0 0 1 1 ...
## $ m10 : int 0 0 0 1 9 0 0 0 2 1 ...
## $ m11 : int 0 1 0 1 10 0 0 0 0 2 ...
## $ m12 : int 0 0 0 1 11 0 0 0 1 2 ...
## $ m13 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Set : chr "Train" "Train" "Train" "Train" ...
## $ Paid_After : num 61 60 60 60 60 60 60 60 61 60 ...
# insurance_type only ranges between 0 and 1 (see summary), so it is turned
# into a factor instead of being kept as a number.
summary(data[["insurance_type"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.003239 0.000000 1.000000
data$insurance_type <- factor(data$insurance_type)
levels(data[["insurance_type"]])
## [1] "0" "1"
# loan_performance: per-loan sum of the twelve monthly columns m1..m12,
# after which the individual monthly columns are dropped.
# rowSums() computes all rows in one vectorised call; the previous row-by-row
# loop assigned into the data frame once per row, which is extremely slow on
# ~150k rows. Columns are selected by name (m1..m12 are the columns that sat
# at positions 14:25) so the step no longer depends on column order.
monthly_cols <- paste0("m", 1:12)
data$loan_performance <- unname(rowSums(data[, monthly_cols]))
data[, monthly_cols] <- NULL
# Treat the borrower count as a categorical variable.
data$number_of_borrowers <- factor(data$number_of_borrowers)
# "income" here is unpaid principal balance divided by debt-to-income ratio.
# NOTE(review): a ratio of 0 would yield Inf - confirm it cannot occur.
data$income <- with(data, unpaid_principal_bal / debt_to_income_ratio)
# Drop the two inputs of the derived feature together with insurance_percent.
data[c("unpaid_principal_bal", "debt_to_income_ratio", "insurance_percent")] <- NULL
# Centre and scale every column except the ones at the positions below.
# NOTE(review): positions are expected to be the id, factor, response and Set
# columns - verify against str(data) if the column layout ever changes.
unscaled_pos <- c(1, 2, 6, 8, 10, 11, 12)
data_scale <- data[, -unscaled_pos]
# Learn centring/scaling parameters, then apply them to the same columns.
preproc <- preProcess(data_scale, method = c("center", "scale"))
data_scale <- predict(preproc, newdata = data_scale)
# Write the standardised values back in place.
data[, -unscaled_pos] <- data_scale
# Disabled experiment, kept for reference:
#data$loan_performance<-ifelse(data$loan_performance>2,mean(data$loan_performance),data$loan_performance)
Looking at the data again
# Column overview after feature engineering and scaling.
str(object = data)
## 'data.frame': 151924 obs. of 15 variables:
## $ loan_id : num 2.68e+11 6.73e+11 7.43e+11 6.01e+11 2.74e+11 ...
## $ source : Factor w/ 3 levels "X","Y","Z": 3 2 3 1 1 2 1 2 1 1 ...
## $ interest_rate : num 0.825 2.181 -1.345 1.91 1.91 ...
## $ loan_term : num 0.753 0.753 -1.255 0.753 0.753 ...
## $ loan_to_value : num 1.596 0.265 -1.066 -1.24 0.728 ...
## $ number_of_borrowers : Factor w/ 2 levels "1","2": 1 1 1 2 1 1 1 1 1 2 ...
## $ borrower_credit_score : num -1.803 -1.732 0.239 -3.252 -2.112 ...
## $ loan_purpose : Factor w/ 3 levels "A23","B12","C86": 3 2 2 2 3 3 3 1 3 1 ...
## $ co.borrower_credit_score: num -1.206 -1.206 -1.206 0.465 -1.206 ...
## $ insurance_type : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ m13 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Set : chr "Train" "Train" "Train" "Train" ...
## $ Paid_After : num 0.0926 -0.0452 -0.0452 -0.0452 -0.0452 ...
## $ loan_performance : num 1.0292 1.0292 -0.0549 4.2814 71.4931 ...
## $ income : num 0.324 -0.692 0.538 -0.724 -0.753 ...
# Correlation plot of the centred/scaled numeric predictors.
co <- cor(data_scale)
corrplot(co)
# Disabled: data[,c(16:25)]<-NULL

# loan_term is excluded from modelling.
data$loan_term <- NULL

# Separate the combined frame back into its labelled and unlabelled parts.
# (A redundant earlier filter of Train, overwritten before use, was removed.)
Train <- dplyr::filter(data, Set == "Train")
Train$m13 <- as.factor(Train$m13)
Test <- dplyr::filter(data, Set == "Test")
Train$Set <- NULL
Test$Set <- NULL

# 70/30 split of the labelled data into training and validation rows.
# The validation rows must be taken BEFORE Train is overwritten: previously
# Train was subset first and Train[-split, ] was then drawn from the rows
# already used for training, so the validation metrics were measured on
# training data. A fixed seed makes the split and the forest reproducible.
set.seed(2019)
split <- sample(nrow(Train), floor(nrow(Train) * 0.7))
Train_Validated <- Train[-split, ]
Train <- Train[split, ]

# Random forest on all remaining predictors; loan_id is only an identifier.
rf <- randomForest(m13 ~ . - loan_id, data = Train, ntree = 500)
Predicting for Validated set
# Predicted classes for the validation rows.
pred_valid <- predict(rf, newdata = Train_Validated, type = "response")
# caret::confusionMatrix() takes the predictions as `data` and the true
# labels as `reference`. The arguments were previously swapped, which
# transposes the table and mislabels sensitivity/specificity and the
# predictive values.
confusionMatrix(data = pred_valid, reference = Train_Validated$m13)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 24281 0
## 1 25 106
##
## Accuracy : 0.999
## 95% CI : (0.9985, 0.9993)
## No Information Rate : 0.9957
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.894
##
## Mcnemar's Test P-Value : 1.587e-06
##
## Sensitivity : 0.9990
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8092
## Prevalence : 0.9957
## Detection Rate : 0.9946
## Detection Prevalence : 0.9946
## Balanced Accuracy : 0.9995
##
## 'Positive' Class : 0
##
Predicting for test
# Score the unlabelled test rows with the fitted random forest.
# rpart.predict() is meant for rpart trees; `rf` is a randomForest object,
# so the generic predict() (dispatching to predict.randomForest) is used.
dt_Test <- predict(rf, newdata = Test, type = "response")
# Submission file: one row per loan with its predicted m13 class.
sub <- data.frame(loan_id = Test$loan_id, m13 = dt_Test)
write.csv(sub, "subrf.csv", row.names = FALSE)