This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(reshape2)
library(class)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
library(rpart)
library(rpart.plot)
# Show the working directory; the relative path "loans.csv" below is
# resolved against it.
getwd()
## [1] "C:/Users/prasnaya/Desktop/Personal/python/Data_Science/DS With R-Saharan/Projects/Decision_Tree"
# Load the loan records. stringsAsFactors is set explicitly because its
# default changed from TRUE to FALSE in R 4.0.0; the structure recorded below
# shows Factor columns, so the analysis was written against factor input.
data <- read.csv("loans.csv", stringsAsFactors = TRUE)
str(data)
## 'data.frame': 39732 obs. of 16 variables:
## $ keep : int 1 1 0 0 0 0 0 1 1 1 ...
## $ rand : num 0.13 0.998 0.628 0.252 0.474 ...
## $ default : int 0 1 0 0 0 0 0 0 1 1 ...
## $ loan_amount : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 2 2 2 3 2 2 3 2 3 2 ...
## $ emp_length : Factor w/ 5 levels "< 2 years","10+ years",..: 2 1 2 2 1 3 4 4 3 1 ...
## $ home_ownership : Factor w/ 4 levels "MORTGAGE","OTHER",..: 4 4 4 4 4 4 4 4 3 4 ...
## $ income : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 2 2 2 3 1 2 3 3 3 2 ...
## $ loan_purpose : Factor w/ 14 levels "car","credit_card",..: 2 1 12 10 10 14 3 1 12 10 ...
## $ debt_to_income : Factor w/ 3 levels "AVERAGE","HIGH",..: 2 3 1 2 1 1 2 3 3 1 ...
## $ credit_score : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 1 1 1 1 1 1 3 1 1 ...
## $ recent_inquiry : Factor w/ 2 levels "NO","YES": 2 2 2 2 1 2 2 2 2 1 ...
## $ delinquent : Factor w/ 3 levels "IN PAST 2 YEARS",..: 3 3 3 2 2 3 3 3 3 3 ...
## $ credit_accounts : Factor w/ 3 levels "AVERAGE","FEW",..: 2 2 2 1 3 1 1 2 1 2 ...
## $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
## $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 1 2 1 2 3 3 1 1 3 3 ...
## $ past_bankrupt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
# Shuffle the rows reproducibly: under a fixed seed, draw one uniform random
# key per row and reorder the data frame by those keys.
set.seed(1000)
rand_run <- runif(n = nrow(data))
data <- data[order(rand_run), , drop = FALSE]
head(data)
# Number of loans for every income / loan_amount combination. The column is
# called avg_count, but it holds plain row counts (nothing is averaged).
data_barchat <- data %>%
  count(income, loan_amount) %>%
  rename(avg_count = n)
data_barchat
# Dodged bars: one cluster per income level, one bar per loan_amount level.
barchat1 <- ggplot(
  data = data_barchat,
  mapping = aes(x = income, y = avg_count, fill = loan_amount)
)
barchat1 + geom_col(position = "dodge")
# Number of loans for every loan_amount / loan_purpose combination (again
# plain row counts, stored under the name avg_count).
data_barchat2 <- data %>%
  count(loan_amount, loan_purpose) %>%
  rename(avg_count = n)
data_barchat2
# Despite its name, data_barchat3 holds the ggplot object, not a data frame.
# Dodged bars: one cluster per loan_amount level, one bar per loan_purpose.
data_barchat3 <- ggplot(
  data = data_barchat2,
  mapping = aes(x = loan_amount, y = avg_count, fill = loan_purpose)
)
data_barchat3 + geom_col(position = "dodge")
# Frequency of each level for three of the categorical columns. `[[` is used
# for exact-name column extraction.
table(data[["loan_amount"]])
##
## HIGH LOW MEDIUM
## 11138 9851 18743
table(data[["income"]])
##
## HIGH LOW MEDIUM
## 11051 8976 19705
table(data[["credit_score"]])
##
## AVERAGE HIGH LOW
## 27098 7931 4703
# Structure of the data frame after the row shuffle.
str(data)
## 'data.frame': 39732 obs. of 16 variables:
## $ keep : int 1 0 0 0 0 0 0 1 0 0 ...
## $ rand : num 0.132 0.92 0.299 0.172 0.895 ...
## $ default : int 1 0 0 0 0 0 0 0 0 0 ...
## $ loan_amount : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 2 3 2 3 2 1 3 3 2 3 ...
## $ emp_length : Factor w/ 5 levels "< 2 years","10+ years",..: 2 1 2 4 1 1 3 2 2 3 ...
## $ home_ownership : Factor w/ 4 levels "MORTGAGE","OTHER",..: 1 1 1 1 4 1 1 3 1 4 ...
## $ income : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 1 3 3 2 3 1 3 3 3 ...
## $ loan_purpose : Factor w/ 14 levels "car","credit_card",..: 4 12 3 8 9 3 5 3 10 3 ...
## $ debt_to_income : Factor w/ 3 levels "AVERAGE","HIGH",..: 2 3 2 1 3 1 1 2 1 1 ...
## $ credit_score : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 3 1 1 1 1 2 1 2 1 ...
## $ recent_inquiry : Factor w/ 2 levels "NO","YES": 2 2 1 1 2 1 1 2 2 1 ...
## $ delinquent : Factor w/ 3 levels "IN PAST 2 YEARS",..: 2 1 3 1 3 3 3 3 3 3 ...
## $ credit_accounts : Factor w/ 3 levels "AVERAGE","FEW",..: 3 1 2 1 2 1 1 3 1 3 ...
## $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 2 1 1 ...
## $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 1 1 3 3 2 3 2 3 2 2 ...
## $ past_bankrupt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
# Exploratory classification trees fitted on the full (shuffled) data set.

# Two-predictor tree grown with cp = 0, i.e. no complexity-based pruning.
loan_model1 <- rpart(
  default ~ loan_amount + credit_score,
  data = data,
  method = "class",
  control = rpart.control(cp = 0)
)

# All-predictor tree with a high complexity threshold (cp = 0.15).
# `keep` and `rand` are removed from the `.` expansion: they look like
# sampling/bookkeeping columns rather than borrower attributes, and in every
# str() preview of this data each default == 1 row also has keep == 1, so
# leaving them in would leak the outcome into the model.
# NOTE(review): confirm the meaning of `keep` and `rand` against the data
# dictionary for loans.csv.
loan_model <- rpart(
  default ~ . - keep - rand,
  data = data,
  method = "class",
  control = rpart.control(cp = 0.15)
)

rpart.plot(loan_model1)
rpart.plot(loan_model)
rpart.plot(
  loan_model,
  type = 3,
  box.palette = c("red", "green"),
  fallen.leaves = TRUE
)
nrow(data)
## [1] 39732
# Hold-out split on the already-shuffled rows: the first n_train rows are
# used for training and every remaining row for testing. The upper bound is
# taken from nrow(data) instead of a hard-coded 39732 so the split stays
# valid if loans.csv changes size (indexing past the last row would
# otherwise yield all-NA rows, and extra rows would be silently dropped).
n_train <- 29000L
stopifnot(nrow(data) > n_train)
data_training <- data[seq_len(n_train), ]
data_test <- data[seq.int(n_train + 1L, nrow(data)), ]
str(data_training)
## 'data.frame': 29000 obs. of 16 variables:
## $ keep : int 1 0 0 0 0 0 0 1 0 0 ...
## $ rand : num 0.132 0.92 0.299 0.172 0.895 ...
## $ default : int 1 0 0 0 0 0 0 0 0 0 ...
## $ loan_amount : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 2 3 2 3 2 1 3 3 2 3 ...
## $ emp_length : Factor w/ 5 levels "< 2 years","10+ years",..: 2 1 2 4 1 1 3 2 2 3 ...
## $ home_ownership : Factor w/ 4 levels "MORTGAGE","OTHER",..: 1 1 1 1 4 1 1 3 1 4 ...
## $ income : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 1 3 3 2 3 1 3 3 3 ...
## $ loan_purpose : Factor w/ 14 levels "car","credit_card",..: 4 12 3 8 9 3 5 3 10 3 ...
## $ debt_to_income : Factor w/ 3 levels "AVERAGE","HIGH",..: 2 3 2 1 3 1 1 2 1 1 ...
## $ credit_score : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 3 1 1 1 1 2 1 2 1 ...
## $ recent_inquiry : Factor w/ 2 levels "NO","YES": 2 2 1 1 2 1 1 2 2 1 ...
## $ delinquent : Factor w/ 3 levels "IN PAST 2 YEARS",..: 2 1 3 1 3 3 3 3 3 3 ...
## $ credit_accounts : Factor w/ 3 levels "AVERAGE","FEW",..: 3 1 2 1 2 1 1 3 1 3 ...
## $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 2 1 1 ...
## $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 1 1 3 3 2 3 2 3 2 2 ...
## $ past_bankrupt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
str(data_test)
## 'data.frame': 10732 obs. of 16 variables:
## $ keep : int 0 0 0 1 1 0 0 1 0 0 ...
## $ rand : num 0.5942 0.6413 0.9471 0.0715 0.6168 ...
## $ default : int 0 0 0 0 1 0 0 1 0 0 ...
## $ loan_amount : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 3 2 1 2 3 3 3 2 2 ...
## $ emp_length : Factor w/ 5 levels "< 2 years","10+ years",..: 4 4 1 2 3 2 1 1 4 3 ...
## $ home_ownership : Factor w/ 4 levels "MORTGAGE","OTHER",..: 1 4 4 1 4 1 1 3 1 1 ...
## $ income : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 2 3 1 2 3 2 2 3 3 ...
## $ loan_purpose : Factor w/ 14 levels "car","credit_card",..: 3 7 9 3 3 3 3 10 5 3 ...
## $ debt_to_income : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 3 2 3 1 2 3 1 3 2 ...
## $ credit_score : Factor w/ 3 levels "AVERAGE","HIGH",..: 3 2 1 2 3 1 1 1 2 1 ...
## $ recent_inquiry : Factor w/ 2 levels "NO","YES": 2 2 1 2 2 2 1 2 2 1 ...
## $ delinquent : Factor w/ 3 levels "IN PAST 2 YEARS",..: 2 3 3 3 2 2 2 3 3 1 ...
## $ credit_accounts : Factor w/ 3 levels "AVERAGE","FEW",..: 3 1 3 1 2 1 1 2 1 3 ...
## $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
## $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 2 3 3 3 1 2 3 2 3 ...
## $ past_bankrupt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
# Fit the final tree on the training rows only, grown with cp = 0 (no
# complexity-based pruning).
# `keep` and `rand` are removed from the `.` expansion: they look like
# sampling/bookkeeping columns rather than borrower attributes, and in every
# str() preview of this data each default == 1 row also has keep == 1, so
# using them as predictors would leak the outcome and inflate test accuracy.
# NOTE(review): confirm the meaning of `keep` and `rand` against the data
# dictionary. The output recorded after this point was produced with those
# columns included and will change once the document is re-knitted.
loan_model <- rpart(
  default ~ . - keep - rand,
  data = data_training,
  method = "class",
  control = rpart.control(cp = 0)
)
rpart.plot(
  loan_model,
  type = 3,
  box.palette = c("red", "green"),
  fallen.leaves = TRUE
)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Score the held-out rows with the fitted tree; type = "class" returns the
# predicted class label for each row, stored as a new `pred` column.
data_test[["pred"]] <- predict(
  loan_model,
  newdata = data_test,
  type = "class"
)
str(data_test)
## 'data.frame': 10732 obs. of 17 variables:
## $ keep : int 0 0 0 1 1 0 0 1 0 0 ...
## $ rand : num 0.5942 0.6413 0.9471 0.0715 0.6168 ...
## $ default : int 0 0 0 0 1 0 0 1 0 0 ...
## $ loan_amount : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 3 2 1 2 3 3 3 2 2 ...
## $ emp_length : Factor w/ 5 levels "< 2 years","10+ years",..: 4 4 1 2 3 2 1 1 4 3 ...
## $ home_ownership : Factor w/ 4 levels "MORTGAGE","OTHER",..: 1 4 4 1 4 1 1 3 1 1 ...
## $ income : Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 2 3 1 2 3 2 2 3 3 ...
## $ loan_purpose : Factor w/ 14 levels "car","credit_card",..: 3 7 9 3 3 3 3 10 5 3 ...
## $ debt_to_income : Factor w/ 3 levels "AVERAGE","HIGH",..: 1 3 2 3 1 2 3 1 3 2 ...
## $ credit_score : Factor w/ 3 levels "AVERAGE","HIGH",..: 3 2 1 2 3 1 1 1 2 1 ...
## $ recent_inquiry : Factor w/ 2 levels "NO","YES": 2 2 1 2 2 2 1 2 2 1 ...
## $ delinquent : Factor w/ 3 levels "IN PAST 2 YEARS",..: 2 3 3 3 2 2 2 3 3 1 ...
## $ credit_accounts : Factor w/ 3 levels "AVERAGE","FEW",..: 3 1 3 1 2 1 1 2 1 3 ...
## $ bad_public_record : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
## $ credit_utilization: Factor w/ 3 levels "HIGH","LOW","MEDIUM": 3 2 3 3 3 1 2 3 2 3 ...
## $ past_bankrupt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
## $ pred : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 1 ...
# Confusion matrix: rows are the predicted class (first argument), columns
# are the observed `default` value (second argument).
table(data_test[["pred"]], data_test[["default"]])
##
## 0 1
## 0 9126 218
## 1 101 1287
# Overall accuracy: share of test rows whose predicted label equals the
# observed one. `pred` is a factor and `default` an integer, so both are
# compared through their character labels.
mean(as.character(data_test[["pred"]]) == as.character(data_test[["default"]]))
## [1] 0.9702758