CART_R

###
# Directory that holds the course data. The path is combined with file.path()
# below instead of calling setwd(), so running this script does not change
# the working directory of the R session.
data_dir <- "C:/Users/rpandey/Desktop/Classes"
library(rpart)

## Read the customer data
cust_data <- read.csv(file.path(data_dir, "Default_On_Payment.csv"))

## Fit a classification tree (method = "class") for Default_On_Payment using
## the checking-account status and the credit history as predictors.
## The control settings passed to rpart.control() are minsplit = 50 and
## cp = 0.001; see ?rpart.control for their exact meaning.
fit <- rpart(
  Default_On_Payment ~ Status_Checking_Acc + Credit_History,
  data = cust_data,
  method = "class",
  control = rpart.control(minsplit = 50, cp = 0.001)
)


## Show the complexity parameter (CP) table of the fitted tree
printcp(x = fit)

## 
## Classification tree:
## rpart(formula = Default_On_Payment ~ Status_Checking_Acc + Credit_History, 
##     data = cust_data, method = "class", control = rpart.control(minsplit = 50, 
##         cp = 0.001))
## 
## Variables actually used in tree construction:
## [1] Credit_History      Status_Checking_Acc
## 
## Root node error: 12001/40119 = 0.29914
## 
## n=40119 (2 observations deleted due to missingness)
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.043288      0   1.00000 1.00000 0.0076420
## 2 0.016790      2   0.91342 0.91342 0.0074374
## 3 0.001000      4   0.87984 0.87984 0.0073497

### Plot the cross-validation results for the fitted tree
plotcp(x = fit)

### Print the detailed results, including the splits at each node
summary(object = fit)

## Call:
## rpart(formula = Default_On_Payment ~ Status_Checking_Acc + Credit_History, 
##     data = cust_data, method = "class", control = rpart.control(minsplit = 50, 
##         cp = 0.001))
##   n=40119 (2 observations deleted due to missingness)
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.04328806      0 1.0000000 1.0000000 0.007642025
## 2 0.01679027      2 0.9134239 0.9134239 0.007437446
## 3 0.00100000      4 0.8798433 0.8798433 0.007349726
## 
## Variable importance
## Status_Checking_Acc      Credit_History 
##                  73                  27 
## 
## Node number 1: 40119 observations,    complexity param=0.04328806
##   predicted class=0  expected loss=0.2991351  P(node) =1
##     class counts: 28118 12001
##    probabilities: 0.701 0.299 
##   left son=2 (18330 obs) right son=3 (21789 obs)
##   Primary splits:
##       Status_Checking_Acc splits as  -RRLL,  improve=1898.5820, (0 missing)
##       Credit_History      splits as  -RRLLL, improve= 685.6503, (0 missing)
##   Surrogate splits:
##       Credit_History splits as  -RRRRL, agree=0.592, adj=0.107, (0 split)
## 
## Node number 2: 18330 observations
##   predicted class=0  expected loss=0.1314239  P(node) =0.4568908
##     class counts: 15921  2409
##    probabilities: 0.869 0.131 
## 
## Node number 3: 21789 observations,    complexity param=0.04328806
##   predicted class=0  expected loss=0.4402221  P(node) =0.5431092
##     class counts: 12197  9592
##    probabilities: 0.560 0.440 
##   left son=6 (19058 obs) right son=7 (2731 obs)
##   Primary splits:
##       Credit_History      splits as  -RRLLL, improve=390.2977, (0 missing)
##       Status_Checking_Acc splits as  -RL--,  improve=123.0783, (0 missing)
## 
## Node number 6: 19058 observations,    complexity param=0.01679027
##   predicted class=0  expected loss=0.4043971  P(node) =0.4750368
##     class counts: 11351  7707
##    probabilities: 0.596 0.404 
##   left son=12 (4900 obs) right son=13 (14158 obs)
##   Primary splits:
##       Credit_History      splits as  ---RRL, improve=155.8200, (0 missing)
##       Status_Checking_Acc splits as  -RL--,  improve=103.8786, (0 missing)
## 
## Node number 7: 2731 observations
##   predicted class=1  expected loss=0.3097766  P(node) =0.06807248
##     class counts:   846  1885
##    probabilities: 0.310 0.690 
## 
## Node number 12: 4900 observations
##   predicted class=0  expected loss=0.2957143  P(node) =0.1221366
##     class counts:  3451  1449
##    probabilities: 0.704 0.296 
## 
## Node number 13: 14158 observations,    complexity param=0.01679027
##   predicted class=0  expected loss=0.4420116  P(node) =0.3529001
##     class counts:  7900  6258
##    probabilities: 0.558 0.442 
##   left son=26 (7261 obs) right son=27 (6897 obs)
##   Primary splits:
##       Status_Checking_Acc splits as  -RL--,  improve=204.5355, (0 missing)
##       Credit_History      splits as  ---RL-, improve= 15.2918, (0 missing)
##   Surrogate splits:
##       Credit_History splits as  ---RL-, agree=0.552, adj=0.081, (0 split)
## 
## Node number 26: 7261 observations
##   predicted class=0  expected loss=0.3591792  P(node) =0.1809866
##     class counts:  4653  2608
##    probabilities: 0.641 0.359 
## 
## Node number 27: 6897 observations
##   predicted class=1  expected loss=0.4707844  P(node) =0.1719136
##     class counts:  3247  3650
##    probabilities: 0.471 0.529

## Show the CP table again, right before choosing the pruning cp
printcp(x = fit)

## 
## Classification tree:
## rpart(formula = Default_On_Payment ~ Status_Checking_Acc + Credit_History, 
##     data = cust_data, method = "class", control = rpart.control(minsplit = 50, 
##         cp = 0.001))
## 
## Variables actually used in tree construction:
## [1] Credit_History      Status_Checking_Acc
## 
## Root node error: 12001/40119 = 0.29914
## 
## n=40119 (2 observations deleted due to missingness)
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.043288      0   1.00000 1.00000 0.0076420
## 2 0.016790      2   0.91342 0.91342 0.0074374
## 3 0.001000      4   0.87984 0.87984 0.0073497

### Prune the tree at the cp with the lowest cross-validated error (xerror).
### The value is looked up in the fitted CP table rather than typed in by
### hand, so it stays correct if the data or the rpart.control() settings
### change. which.min() returns the first minimum, so ties in xerror resolve
### to the row with the fewest splits.
min_xerror_row <- which.min(fit$cptable[, "xerror"])
pfit <- prune(fit, cp = fit$cptable[min_xerror_row, "CP"])

### Draw the pruned decision tree
plot(
  pfit,
  main = "Classification Tree for Default_on_payment",
  uniform = TRUE
)

### Add the text labels to the tree plot drawn above
text(
  pfit,
  all = TRUE,
  use.n = TRUE,
  splits = TRUE,
  pretty = 1,
  cex = 0.5
)

### Print the labels of the pruned tree
labels(pfit)

## [1] "root"                   "Status_Checking_Acc=de"
## [3] "Status_Checking_Acc=bc" "Credit_History=def"    
## [5] "Credit_History=f"       "Credit_History=de"     
## [7] "Status_Checking_Acc=c"  "Status_Checking_Acc=b" 
## [9] "Credit_History=bc"

library(rpart.plot)

# Redraw the pruned tree with rpart.plot; see ?prp for the meaning of the
# `type` and `extra` codes used here.
prp(
  pfit,
  under = TRUE,
  extra = 2,
  type = 3
)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Template example chunk: summarise the built-in `cars` data set
summary(object = cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.

CART_R

Ratnakar_Pandey

September 21, 2017

R Markdown

Including Plots