##regression tree
library(MASS) #this data is in MASS package
boston.data <- data(Boston)
sample_index <- sample(nrow(Boston),nrow(Boston)*0.90)
boston.train <- Boston[sample_index,]
boston.test <- Boston[-sample_index,]
library(rpart)
library(rpart.plot)
##fit a regression tree model##
boston.rpart <- rpart(formula = medv~., data = boston.train)
##Prediction using regression tree##
###in-sample###
boston.train.pred.tree <- predict(boston.rpart,boston.train)
###out of sample###
boston.test.pred.tree <- predict(boston.rpart,boston.test)
(MSE.tree <- mean((boston.train.pred.tree-boston.train$medv)^2))
## [1] 15.80618
(MSPE.tree <- mean((boston.test.pred.tree-boston.test$medv)^2))
## [1] 21.12643
###Classification Tree (Credit Card Default data)
credit.data <- read.csv(file = "C:/Users/yizha/Desktop/Spring 2020/Data Mining-I/Module-4/credit_default.csv", header=T)
library(dplyr)
library(rpart)
library(rpart.plot)
credit.data<- rename(credit.data, default=default.payment.next.month)
# convert categorical data to factor
credit.data$SEX<- as.factor(credit.data$SEX)
credit.data$EDUCATION<- as.factor(credit.data$EDUCATION)
credit.data$MARRIAGE<- as.factor(credit.data$MARRIAGE)
index <- sample(nrow(credit.data),nrow(credit.data)*0.80)
credit.train = credit.data[index,]
credit.test = credit.data[-index,]
###fit a classsification tree
credit.rpart0 <- rpart(default ~., data=credit.train, method = "class")
credit.rpart <- rpart(formula = default~., data=credit.train,method = "class",
parms = list(loss=matrix(c(0,10,1,0), nrow = 2)))
##c(0, False Negative, False Positive, 0).
##If you have symmetric cost, you can ignore the parms argument.
#this tree with defaul cost minimizes the symmetric cost,
#which is misclassification rate. We can take a look at the confusion matrix.
pred0<- predict(credit.rpart0, type="class")
table(credit.train$default, pred0, dnn = c("True", "Pred"))#we need type="class" in order to get binary prediction
## Pred
## True 0 1
## 0 7202 324
## 1 1391 683
#we need type="class" in order to get binary prediction.
###Prediction using classification tree###
#In-sample prediction
credit.train.pred.tree1<- predict(credit.rpart, credit.train, type="class")
table(credit.train$default, credit.train.pred.tree1, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 2889 4637
## 1 196 1878
#out of-sample prediction
credit.test.pred.tree1 <- predict(credit.rpart,credit.test,type="class")
table(credit.test$default,credit.test.pred.tree1,dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 687 1155
## 1 61 497
We can get the expected loss for this tree model by defining a cost function that has the correct weights:
cost <- function(r, pi){
weight1 = 10
weight0 = 1
c1 = (r==1)&(pi==0) #logical vector - true if actual 1 but predict 0
c0 = (r==0)&(pi==1) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
cost(credit.train$default,credit.train.pred.tree1)
## [1] 0.6871875
##Comparing this classification tree with logistic regression
#Fit logistic regression model
credit.glm<- glm(default~., data = credit.train, family=binomial)
#Get binary prediction
credit.test.pred.glm<- as.numeric(predict(credit.glm, credit.test, type="response")>0.21)
#Calculate cost using test set
cost(credit.test$default,credit.test.pred.glm)
## [1] 0.9595833
table(credit.test$default, credit.test.pred.glm, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 1179 663
## 1 164 394
###ROC Curve and Cut-off Probability for Classification Tree
credit.rpart <- rpart(formula = default ~ ., data = credit.train,
method = "class", parms = list(loss=matrix(c(0,10,1,0), nrow = 2)))
#Probability of getting 1
credit.test.prob.rpart = predict(credit.rpart,credit.test, type="prob")
library(ROCR)
pred = prediction(credit.test.prob.rpart[,2], credit.test$default)
perf = performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
slot(performance(pred, "auc"), "y.values")[[1]]
## [1] 0.7519794
credit.test.pred.rpart = as.numeric(credit.test.prob.rpart[,2] > 1/11)
table(credit.test$default, credit.test.pred.rpart, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 687 1155
## 1 61 497
##Pruning
the smaller the cp value,the larger (complex) tree rpart will attempt to fit. The default value for cp is 0.01.
boston.largetree <- rpart(formula = medv ~ ., data = boston.train, cp = 0.001)
prp(boston.largetree)
plotcp(boston.largetree)
##You can see that the rel error (in-sample error) is always decreasing as model is
#more complex, while the cross-validation error (measure of performance on future
#observations) is not. That is why we prune the tree to avoid overfitting the training data.
prune(boston.largetree, cp = 0.008)
## n= 455
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 455 40279.0000 22.575380
## 2) lstat>=9.725 269 6542.7490 17.232340
## 4) lstat>=16.085 131 2290.4970 13.990080
## 8) nox>=0.6615 75 887.2995 11.930670
## 16) crim>=9.87002 37 256.2508 9.656757 *
## 17) crim< 9.87002 38 253.4539 14.144740 *
## 9) nox< 0.6615 56 659.0998 16.748210 *
## 5) lstat< 16.085 138 1567.8860 20.310140 *
## 3) lstat< 9.725 186 14950.5300 30.302690
## 6) rm< 7.437 157 6242.0520 27.552230
## 12) rm< 6.797 113 3661.3600 25.445130
## 24) tax< 548 106 1151.3680 24.519810
## 48) rm< 6.542 73 492.8556 23.275340 *
## 49) rm>=6.542 33 295.3655 27.272730 *
## 25) tax>=548 7 1044.8770 39.457140 *
## 13) rm>=6.797 44 790.5218 32.963640 *
## 7) rm>=7.437 29 1090.7590 45.193100
## 14) ptratio>=17.6 7 465.9686 38.885710 *
## 15) ptratio< 17.6 22 257.7000 47.200000 *
###logistic regression
###chi-square test
table.edu<- table(credit.data$EDUCATION,credit.data$default)
chisq.test(table.edu)
##
## Pearson's Chi-squared test
##
## data: table.edu
## X-squared = 49.19, df = 3, p-value = 1.189e-10
##Train a logistic regression model with all X variables
credit.glm0 <- glm(default~., family = binomial,data= credit.train)
summary(credit.glm0)
##
## Call:
## glm(formula = default ~ ., family = binomial, data = credit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0889 -0.6962 -0.5414 -0.2903 3.2008
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.393e-01 1.542e-01 -6.092 1.12e-09 ***
## LIMIT_BAL -9.466e-07 2.849e-07 -3.322 0.000893 ***
## SEX2 -1.486e-01 5.462e-02 -2.721 0.006512 **
## EDUCATION2 -1.721e-01 6.323e-02 -2.722 0.006480 **
## EDUCATION3 -1.999e-01 8.467e-02 -2.361 0.018207 *
## EDUCATION4 -1.178e+00 3.202e-01 -3.681 0.000233 ***
## MARRIAGE2 -1.884e-01 6.201e-02 -3.038 0.002379 **
## MARRIAGE3 -2.904e-01 2.425e-01 -1.198 0.231073
## AGE 6.389e-03 3.334e-03 1.916 0.055335 .
## PAY_0 5.516e-01 3.146e-02 17.537 < 2e-16 ***
## PAY_2 9.547e-02 3.625e-02 2.633 0.008451 **
## PAY_3 3.681e-02 4.054e-02 0.908 0.363959
## PAY_4 4.121e-02 4.512e-02 0.913 0.361060
## PAY_5 6.237e-02 4.766e-02 1.309 0.190673
## PAY_6 1.228e-03 3.978e-02 0.031 0.975384
## BILL_AMT1 -7.703e-06 2.053e-06 -3.753 0.000175 ***
## BILL_AMT2 5.140e-06 2.550e-06 2.015 0.043865 *
## BILL_AMT3 2.618e-06 2.161e-06 1.211 0.225712
## BILL_AMT4 -4.929e-06 2.473e-06 -1.993 0.046238 *
## BILL_AMT5 5.885e-06 2.802e-06 2.101 0.035683 *
## BILL_AMT6 -1.786e-06 2.079e-06 -0.859 0.390283
## PAY_AMT1 -1.375e-05 3.793e-06 -3.624 0.000290 ***
## PAY_AMT2 -9.065e-06 3.348e-06 -2.708 0.006770 **
## PAY_AMT3 -3.909e-06 3.531e-06 -1.107 0.268289
## PAY_AMT4 -4.209e-06 2.884e-06 -1.459 0.144442
## PAY_AMT5 2.470e-06 2.761e-06 0.895 0.370858
## PAY_AMT6 -3.069e-06 2.155e-06 -1.424 0.154505
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10019.6 on 9599 degrees of freedom
## Residual deviance: 8815.2 on 9573 degrees of freedom
## AIC: 8869.2
##
## Number of Fisher Scoring iterations: 6
credit.glm0$deviance
## [1] 8815.201
AIC(credit.glm0)
## [1] 8869.201
BIC(credit.glm0)
## [1] 9062.778
##Prediction
#in sample prediction
library(ROCR)
pred.glm0.train <- predict(credit.glm0,type = "response")
pred <- prediction(pred.glm0.train,credit.train$default)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)
unlist(slot(performance(pred,"auc"),"y.values"))
## [1] 0.7238995
##Precision-Recall Curve #Precision and recall curve and its AUC is more appropriate for imbalanced data.
library(PRROC)
score1 <- pred.glm0.train[credit.train$default==1]
score0 <- pred.glm0.train[credit.train$default==0]
roc = roc.curve(score1,score0,curve = T)
roc$auc
## [1] 0.7238995
pr= pr.curve(score1, score0, curve = T)
pr
##
## Precision-recall curve
##
## Area under curve (Integral):
## 0.4998509
##
## Area under curve (Davis & Goadrich):
## 0.4998408
##
## Curve for scores from 1.910549e-08 to 0.9933227
## ( can be plotted with plot(x) )
plot(pr)
##Out-of-sample prediction
pred.glm0.test <- predict(credit.glm0,newdata=credit.test,type="response")
##ROC curve
pred <- prediction(pred.glm0.test,credit.test$default)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)
unlist(slot(performance(pred,"auc"),"y.values"))
## [1] 0.7461579
##Precision-Recall Curve
score1.test<- pred.glm0.test[credit.test$default==1]
score0.test<- pred.glm0.test[credit.test$default==0]
roc.test<- roc.curve(score1.test, score0.test, curve = T)
roc.test$auc
## [1] 0.7461579
pr.test<-pr.curve(score1.test, score0.test, curve = T)
##Binary Classification
#Choosing a large cut-off probability will result in few
#cases being predicted as 1, and chossing a small cut-off probability
#will result in many cases being predicted as 1.
table((pred.glm0.train > 0.9)*1)
##
## 0 1
## 9581 19
##Naive Choice of Cut-off probability
pcut1 <- mean(credit.train$default)
class.glm0.train <- (pred.glm0.train >pcut1)
table(credit.train$default,class.glm0.train,dnn=c("Ture","predicted"))
## predicted
## Ture FALSE TRUE
## 0 5160 2366
## 1 713 1361
##MR,FPR,FNR
(MR<- mean(credit.train$default!=class.glm0.train))
## [1] 0.3207292
(FPR <- sum(credit.train$default==0&class.glm0.train==1)/sum(credit.train$default==0))
## [1] 0.3143768
(FNR <- sum(credit.train$default==1&class.glm0.train==0)/sum(credit.train$default==1))
## [1] 0.3437801
##Determine Optimal cut-off Probability using Grid Search Method
costfunc=function(obs,pred.p,pcut){
weight1=5
weight0=1
c1= (obs==1 & pred.p<pcut)
c0= (obs==0 & pred.p>pcut)
cost=mean(weight1*c1 + weight0*c0)
return(cost)}
#define a sequence of probability (you need to search the optimal p-cut from
#this sequence)
p.seq = seq(0.01,1,0.01)
cost = rep(0,length(p.seq))
for(i in 1:length(p.seq)){
cost[i]=costfunc(obs=credit.train$default,pred.p=pred.glm0.train,pcut = p.seq[i])}
optimal.pcut.glm0 = p.seq[which(cost==min(cost))]
##Use the optimal cut-off probability
class.glm0.train.opt <- pred.glm0.train>optimal.pcut.glm0
table(credit.train$default,class.glm0.train.opt,dnn = c("True", "Predicted"))
## Predicted
## True FALSE TRUE
## 0 5281 2245
## 1 733 1341
MR<- mean(credit.train$default!= class.glm0.train.opt)
FPR<- sum(credit.train$default==0 & class.glm0.train.opt==1)/sum(credit.train$default==0)
FNR<- sum(credit.train$default==1 & class.glm0.train.opt==0)/sum(credit.train$default==1)
cost<- costfunc(obs = credit.train$default, pred.p = pred.glm0.train, pcut = optimal.pcut.glm0)