R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(reshape2)
library(class)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
library(rpart)
library(rpart.plot)
getwd()
## [1] "C:/Users/prasnaya/Desktop/Personal/python/Data_Science/DS With R-Saharan/Projects/Decision_Tree"
# Load the loans data set from the working directory printed above.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 no longer turns
# text columns into factors by default; the recorded str() output shows factor
# columns, so this keeps the result the same on current R versions.
data <- read.csv("loans.csv", stringsAsFactors = TRUE)
str(data)
## 'data.frame':    39732 obs. of  16 variables:
##  $ keep              : int  1 1 0 0 0 0 0 1 1 1 ...
##  $ rand              : num  0.13 0.998 0.628 0.252 0.474 ...
##  $ default           : int  0 1 0 0 0 0 0 0 1 1 ...
##  $ loan_amount       : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 2 2 2 3 2 2 3 2 3 2 ...
##  $ emp_length        : Factor w/ 5 levels "< 2 years","10+ years",..: 2 1 2 2 1 3 4 4 3 1 ...
##  $ home_ownership    : Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 4 4 4 4 4 3 4 ...
##  $ income            : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 2 2 2 3 1 2 3 3 3 2 ...
##  $ loan_purpose      : Factor w/ 14 levels "car","credit_card",..: 2 1 12 10 10 14 3 1 12 10 ...
##  $ debt_to_income    : Factor w/ 3 levels "AVERAGE","HIGH",..: 2 3 1 2 1 1 2 3 3 1 ...
##  $ credit_score      : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 1 1 1 1 1 1 3 1 1 ...
##  $ recent_inquiry    : Factor w/ 2 levels "NO","YES": 2 2 2 2 1 2 2 2 2 1 ...
##  $ delinquent        : Factor w/ 3 levels "IN PAST 2 YEARS",..: 3 3 3 2 2 3 3 3 3 3 ...
##  $ credit_accounts   : Factor w/ 3 levels "AVERAGE","FEW",..: 2 2 2 1 3 1 1 2 1 2 ...
##  $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
##  $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 1 2 1 2 3 3 1 1 3 3 ...
##  $ past_bankrupt     : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...

The rows may be stored in a non-random (homogeneous) order, so we reshuffle them to make sure the data set is well mixed.

# Shuffle the rows reproducibly: fix the RNG seed, draw one uniform random
# key per row, then reorder the data frame by those keys.
set.seed(1000)
rand_run <- runif(n = nrow(data))
data <- data[order(rand_run), ]


# Peek at the first rows of the data.
head(data)

# Number of loans for every income / loan_amount combination. The count
# column is named avg_count, although it holds plain row counts.
data_barchat <- count(data, income, loan_amount, name = "avg_count")

data_barchat

# Grouped bars: income on the x axis, one bar per loan_amount level.
barchat1 <- ggplot(
  data = data_barchat,
  mapping = aes(x = income, y = avg_count, fill = loan_amount)
)
barchat1 + geom_col(position = "dodge")

# Number of loans for every loan_amount / loan_purpose combination.
data_barchat2 <- count(data, loan_amount, loan_purpose, name = "avg_count")

data_barchat2

# Grouped bars: loan_amount on the x axis, one bar per loan_purpose level.
data_barchat3 <- ggplot(
  data = data_barchat2,
  mapping = aes(x = loan_amount, y = avg_count, fill = loan_purpose)
)
data_barchat3 + geom_col(position = "dodge")

Look at how the applicants are distributed across the loan amount, income and credit score levels.

# Frequency of each loan_amount level.
table(data$loan_amount)
## 
##   HIGH    LOW MEDIUM 
##  11138   9851  18743
# Frequency of each income level.
table(data$income)
## 
##   HIGH    LOW MEDIUM 
##  11051   8976  19705
# Frequency of each credit_score level.
table(data$credit_score)
## 
## AVERAGE    HIGH     LOW 
##   27098    7931    4703
# Column types and first values of the data frame at this point in the script.
str(data)
## 'data.frame':    39732 obs. of  16 variables:
##  $ keep              : int  1 0 0 0 0 0 0 1 0 0 ...
##  $ rand              : num  0.132 0.92 0.299 0.172 0.895 ...
##  $ default           : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ loan_amount       : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 2 3 2 3 2 1 3 3 2 3 ...
##  $ emp_length        : Factor w/ 5 levels "< 2 years","10+ years",..: 2 1 2 4 1 1 3 2 2 3 ...
##  $ home_ownership    : Factor w/ 4 levels "MORTGAGE","OTHER",..: 1 1 1 1 4 1 1 3 1 4 ...
##  $ income            : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 1 3 3 2 3 1 3 3 3 ...
##  $ loan_purpose      : Factor w/ 14 levels "car","credit_card",..: 4 12 3 8 9 3 5 3 10 3 ...
##  $ debt_to_income    : Factor w/ 3 levels "AVERAGE","HIGH",..: 2 3 2 1 3 1 1 2 1 1 ...
##  $ credit_score      : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 3 1 1 1 1 2 1 2 1 ...
##  $ recent_inquiry    : Factor w/ 2 levels "NO","YES": 2 2 1 1 2 1 1 2 2 1 ...
##  $ delinquent        : Factor w/ 3 levels "IN PAST 2 YEARS",..: 2 1 3 1 3 3 3 3 3 3 ...
##  $ credit_accounts   : Factor w/ 3 levels "AVERAGE","FEW",..: 3 1 2 1 2 1 1 3 1 3 ...
##  $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 2 1 1 ...
##  $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 1 1 3 3 2 3 2 3 2 2 ...
##  $ past_bankrupt     : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...

creating the model

# Small classification tree on two predictors only. cp = 0 removes the
# complexity-parameter threshold, so no split is rejected for improving the
# fit too little.
loan_model1 <- rpart(
  default ~ loan_amount + credit_score,
  data = data,
  method = "class",
  control = rpart.control(cp = 0)
)

# Classification tree on the remaining columns. keep and rand are excluded
# from the predictors: they look like sampling bookkeeping (a 0/1 keep flag
# and a random draw) rather than borrower attributes, and if they were derived
# from default they would leak the label into the model.
# NOTE(review): confirm the meaning of keep/rand against the data source.
loan_model <- rpart(
  default ~ . - keep - rand,
  data = data,
  method = "class",
  control = rpart.control(cp = 0.15)
)

rpart.plot(loan_model1)

rpart.plot(loan_model)

rpart.plot(
  loan_model,
  type = 3,
  box.palette = c("red", "green"),
  fallen.leaves = TRUE
)

nrow(data)
## [1] 39732
# Train/test split by row position: the first n_train rows are used for
# training and every remaining row is held out for testing. The upper bound
# comes from nrow(data) instead of a hard-coded 39732, so the split stays
# correct if the input file changes size.
n_train <- 29000
stopifnot(n_train < nrow(data))
data_training <- data[seq_len(n_train), ]
data_test <- data[(n_train + 1):nrow(data), ]
str(data_training)
## 'data.frame':    29000 obs. of  16 variables:
##  $ keep              : int  1 0 0 0 0 0 0 1 0 0 ...
##  $ rand              : num  0.132 0.92 0.299 0.172 0.895 ...
##  $ default           : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ loan_amount       : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 2 3 2 3 2 1 3 3 2 3 ...
##  $ emp_length        : Factor w/ 5 levels "< 2 years","10+ years",..: 2 1 2 4 1 1 3 2 2 3 ...
##  $ home_ownership    : Factor w/ 4 levels "MORTGAGE","OTHER",..: 1 1 1 1 4 1 1 3 1 4 ...
##  $ income            : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 1 3 3 2 3 1 3 3 3 ...
##  $ loan_purpose      : Factor w/ 14 levels "car","credit_card",..: 4 12 3 8 9 3 5 3 10 3 ...
##  $ debt_to_income    : Factor w/ 3 levels "AVERAGE","HIGH",..: 2 3 2 1 3 1 1 2 1 1 ...
##  $ credit_score      : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 3 1 1 1 1 2 1 2 1 ...
##  $ recent_inquiry    : Factor w/ 2 levels "NO","YES": 2 2 1 1 2 1 1 2 2 1 ...
##  $ delinquent        : Factor w/ 3 levels "IN PAST 2 YEARS",..: 2 1 3 1 3 3 3 3 3 3 ...
##  $ credit_accounts   : Factor w/ 3 levels "AVERAGE","FEW",..: 3 1 2 1 2 1 1 3 1 3 ...
##  $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 2 1 1 ...
##  $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 1 1 3 3 2 3 2 3 2 2 ...
##  $ past_bankrupt     : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
# Column types and first values of the held-out test rows.
str(data_test)
## 'data.frame':    10732 obs. of  16 variables:
##  $ keep              : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ rand              : num  0.5942 0.6413 0.9471 0.0715 0.6168 ...
##  $ default           : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ loan_amount       : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 3 2 1 2 3 3 3 2 2 ...
##  $ emp_length        : Factor w/ 5 levels "< 2 years","10+ years",..: 4 4 1 2 3 2 1 1 4 3 ...
##  $ home_ownership    : Factor w/ 4 levels "MORTGAGE","OTHER",..: 1 4 4 1 4 1 1 3 1 1 ...
##  $ income            : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 2 3 1 2 3 2 2 3 3 ...
##  $ loan_purpose      : Factor w/ 14 levels "car","credit_card",..: 3 7 9 3 3 3 3 10 5 3 ...
##  $ debt_to_income    : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 3 2 3 1 2 3 1 3 2 ...
##  $ credit_score      : Factor w/ 3 levels "AVERAGE","HIGH",..: 3 2 1 2 3 1 1 1 2 1 ...
##  $ recent_inquiry    : Factor w/ 2 levels "NO","YES": 2 2 1 2 2 2 1 2 2 1 ...
##  $ delinquent        : Factor w/ 3 levels "IN PAST 2 YEARS",..: 2 3 3 3 2 2 2 3 3 1 ...
##  $ credit_accounts   : Factor w/ 3 levels "AVERAGE","FEW",..: 3 1 3 1 2 1 1 2 1 3 ...
##  $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
##  $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 2 3 3 3 1 2 3 2 3 ...
##  $ past_bankrupt     : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
# Fit the classification tree on the training rows only, with no
# complexity-parameter threshold (cp = 0).
# keep and rand are excluded from the predictors: they look like sampling
# bookkeeping (a 0/1 keep flag and a random draw) rather than borrower
# attributes, and if they were derived from default they would leak the label
# and inflate the test accuracy.
# NOTE(review): confirm the meaning of keep/rand against the data source.
# Output recorded in this document was produced with the all-columns formula
# `default ~ .`; re-knit to refresh it.
loan_model <- rpart(
  default ~ . - keep - rand,
  data = data_training,
  method = "class",
  control = rpart.control(cp = 0)
)
rpart.plot(
  loan_model,
  type = 3,
  box.palette = c("red", "green"),
  fallen.leaves = TRUE
)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Predicted class label for every test row, stored next to the actual values.
data_test$pred <- predict(loan_model, data_test, type = "class")

str(data_test)
## 'data.frame':    10732 obs. of  17 variables:
##  $ keep              : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ rand              : num  0.5942 0.6413 0.9471 0.0715 0.6168 ...
##  $ default           : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ loan_amount       : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 3 2 1 2 3 3 3 2 2 ...
##  $ emp_length        : Factor w/ 5 levels "< 2 years","10+ years",..: 4 4 1 2 3 2 1 1 4 3 ...
##  $ home_ownership    : Factor w/ 4 levels "MORTGAGE","OTHER",..: 1 4 4 1 4 1 1 3 1 1 ...
##  $ income            : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 2 3 1 2 3 2 2 3 3 ...
##  $ loan_purpose      : Factor w/ 14 levels "car","credit_card",..: 3 7 9 3 3 3 3 10 5 3 ...
##  $ debt_to_income    : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 3 2 3 1 2 3 1 3 2 ...
##  $ credit_score      : Factor w/ 3 levels "AVERAGE","HIGH",..: 3 2 1 2 3 1 1 1 2 1 ...
##  $ recent_inquiry    : Factor w/ 2 levels "NO","YES": 2 2 1 2 2 2 1 2 2 1 ...
##  $ delinquent        : Factor w/ 3 levels "IN PAST 2 YEARS",..: 2 3 3 3 2 2 2 3 3 1 ...
##  $ credit_accounts   : Factor w/ 3 levels "AVERAGE","FEW",..: 3 1 3 1 2 1 1 2 1 3 ...
##  $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
##  $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 2 3 3 3 1 2 3 2 3 ...
##  $ past_bankrupt     : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
##  $ pred              : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 1 ...
# Confusion matrix: rows are the predicted class, columns the actual default.
table(data_test$pred, data_test$default)
##    
##        0    1
##   0 9126  218
##   1  101 1287
# Accuracy: share of test rows whose predicted class equals the actual label.
# NOTE(review): the figures recorded here were produced when the tree was
# fitted with `default ~ .`, i.e. with the keep and rand columns as
# predictors; if those columns were derived from default, this accuracy is
# optimistic - verify before relying on it.
mean(data_test$pred == data_test$default)
## [1] 0.9702758