###
# Directory that holds the input data. The path is joined onto the file name
# below instead of calling setwd(), so the session's working directory is
# left untouched.
data_dir <- "C:/Users/rpandey/Desktop/Classes"
library(rpart)
## Read the customer data from the CSV file
cust_data <- read.csv(file.path(data_dir, "Default_On_Payment.csv"))
# Grow a classification tree (method = "class") for Default_On_Payment from
# the two predictors. A node needs at least 50 observations before a split is
# attempted (minsplit), and a split must improve the fit by at least
# cp = 0.001 to be kept.
fit <- rpart(
  Default_On_Payment ~ Status_Checking_Acc + Credit_History,
  data = cust_data,
  method = "class",
  control = rpart.control(minsplit = 50, cp = 0.001)
)
## Display the complexity parameter (CP) table of the fitted tree:
## one row per candidate tree size, with relative and cross-validated error
printcp(fit)
##
## Classification tree:
## rpart(formula = Default_On_Payment ~ Status_Checking_Acc + Credit_History,
## data = cust_data, method = "class", control = rpart.control(minsplit = 50,
## cp = 0.001))
##
## Variables actually used in tree construction:
## [1] Credit_History Status_Checking_Acc
##
## Root node error: 12001/40119 = 0.29914
##
## n=40119 (2 observations deleted due to missingness)
##
## CP nsplit rel error xerror xstd
## 1 0.043288 0 1.00000 1.00000 0.0076420
## 2 0.016790 2 0.91342 0.91342 0.0074374
## 3 0.001000 4 0.87984 0.87984 0.0073497
### Plot the cross-validation results for the fitted tree
plotcp(fit)
### Detailed results: CP table, variable importance, and for every node its
### class counts plus the primary and surrogate splits considered
summary(fit)
## Call:
## rpart(formula = Default_On_Payment ~ Status_Checking_Acc + Credit_History,
## data = cust_data, method = "class", control = rpart.control(minsplit = 50,
## cp = 0.001))
## n=40119 (2 observations deleted due to missingness)
##
## CP nsplit rel error xerror xstd
## 1 0.04328806 0 1.0000000 1.0000000 0.007642025
## 2 0.01679027 2 0.9134239 0.9134239 0.007437446
## 3 0.00100000 4 0.8798433 0.8798433 0.007349726
##
## Variable importance
## Status_Checking_Acc Credit_History
## 73 27
##
## Node number 1: 40119 observations, complexity param=0.04328806
## predicted class=0 expected loss=0.2991351 P(node) =1
## class counts: 28118 12001
## probabilities: 0.701 0.299
## left son=2 (18330 obs) right son=3 (21789 obs)
## Primary splits:
## Status_Checking_Acc splits as -RRLL, improve=1898.5820, (0 missing)
## Credit_History splits as -RRLLL, improve= 685.6503, (0 missing)
## Surrogate splits:
## Credit_History splits as -RRRRL, agree=0.592, adj=0.107, (0 split)
##
## Node number 2: 18330 observations
## predicted class=0 expected loss=0.1314239 P(node) =0.4568908
## class counts: 15921 2409
## probabilities: 0.869 0.131
##
## Node number 3: 21789 observations, complexity param=0.04328806
## predicted class=0 expected loss=0.4402221 P(node) =0.5431092
## class counts: 12197 9592
## probabilities: 0.560 0.440
## left son=6 (19058 obs) right son=7 (2731 obs)
## Primary splits:
## Credit_History splits as -RRLLL, improve=390.2977, (0 missing)
## Status_Checking_Acc splits as -RL--, improve=123.0783, (0 missing)
##
## Node number 6: 19058 observations, complexity param=0.01679027
## predicted class=0 expected loss=0.4043971 P(node) =0.4750368
## class counts: 11351 7707
## probabilities: 0.596 0.404
## left son=12 (4900 obs) right son=13 (14158 obs)
## Primary splits:
## Credit_History splits as ---RRL, improve=155.8200, (0 missing)
## Status_Checking_Acc splits as -RL--, improve=103.8786, (0 missing)
##
## Node number 7: 2731 observations
## predicted class=1 expected loss=0.3097766 P(node) =0.06807248
## class counts: 846 1885
## probabilities: 0.310 0.690
##
## Node number 12: 4900 observations
## predicted class=0 expected loss=0.2957143 P(node) =0.1221366
## class counts: 3451 1449
## probabilities: 0.704 0.296
##
## Node number 13: 14158 observations, complexity param=0.01679027
## predicted class=0 expected loss=0.4420116 P(node) =0.3529001
## class counts: 7900 6258
## probabilities: 0.558 0.442
## left son=26 (7261 obs) right son=27 (6897 obs)
## Primary splits:
## Status_Checking_Acc splits as -RL--, improve=204.5355, (0 missing)
## Credit_History splits as ---RL-, improve= 15.2918, (0 missing)
## Surrogate splits:
## Credit_History splits as ---RL-, agree=0.552, adj=0.081, (0 split)
##
## Node number 26: 7261 observations
## predicted class=0 expected loss=0.3591792 P(node) =0.1809866
## class counts: 4653 2608
## probabilities: 0.641 0.359
##
## Node number 27: 6897 observations
## predicted class=1 expected loss=0.4707844 P(node) =0.1719136
## class counts: 3247 3650
## probabilities: 0.471 0.529
# Show the complexity parameter table again for reference
printcp(fit)
##
## Classification tree:
## rpart(formula = Default_On_Payment ~ Status_Checking_Acc + Credit_History,
## data = cust_data, method = "class", control = rpart.control(minsplit = 50,
## cp = 0.001))
##
## Variables actually used in tree construction:
## [1] Credit_History Status_Checking_Acc
##
## Root node error: 12001/40119 = 0.29914
##
## n=40119 (2 observations deleted due to missingness)
##
## CP nsplit rel error xerror xstd
## 1 0.043288 0 1.00000 1.00000 0.0076420
## 2 0.016790 2 0.91342 0.91342 0.0074374
## 3 0.001000 4 0.87984 0.87984 0.0073497
### Prune the tree at the complexity parameter with the lowest
### cross-validated error (the "xerror" column of the CP table), rather than
### a hard-coded value that must be edited whenever the data or tree changes.
# which.min() returns the first minimum, so ties resolve to the earliest row
# of the CP table.
best_cp <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
pfit <- prune(fit, cp = best_cp)
# Draw the skeleton of the pruned tree.
plot(
  pfit,
  uniform = TRUE,
  main = "Classification Tree for Default_on_payment"
)
# Add text labels to the plot: split rules, per-node counts (use.n), and
# labels on interior nodes as well as leaves (all = TRUE), in a small font.
text(
  pfit,
  splits = TRUE,
  use.n = TRUE,
  all = TRUE,
  cex = 0.5,
  pretty = 1
)
# Print the split label attached to each node of the pruned tree.
labels(pfit)
## [1] "root" "Status_Checking_Acc=de"
## [3] "Status_Checking_Acc=bc" "Credit_History=def"
## [5] "Credit_History=f" "Credit_History=de"
## [7] "Status_Checking_Acc=c" "Status_Checking_Acc=b"
## [9] "Credit_History=bc"
# Redraw the pruned tree with rpart.plot's prp().
# The type/extra codes select the label layout and the per-node statistics
# (see ?prp for their meaning); under = TRUE puts that extra text below the box.
library(rpart.plot)
prp(
  pfit,
  type = 3,
  extra = 2,
  under = TRUE
)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see <http://rmarkdown.rstudio.com>.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Template example chunk: summarise the `cars` data (columns speed and dist)
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.