##regression tree

library(MASS) #this data is in MASS package
boston.data <- data(Boston)
sample_index <- sample(nrow(Boston),nrow(Boston)*0.90)
boston.train <- Boston[sample_index,]
boston.test <- Boston[-sample_index,]

library(rpart)
library(rpart.plot)
##fit a regression tree model##
boston.rpart <- rpart(formula = medv~., data = boston.train)
##Prediction using regression tree##
###in-sample###
boston.train.pred.tree <- predict(boston.rpart,boston.train)
###out of sample###
boston.test.pred.tree <- predict(boston.rpart,boston.test)
(MSE.tree <- mean((boston.train.pred.tree-boston.train$medv)^2))
## [1] 15.80618
(MSPE.tree <- mean((boston.test.pred.tree-boston.test$medv)^2))
## [1] 21.12643

###Classification Tree (Credit Card Default data)

credit.data <- read.csv(file = "C:/Users/yizha/Desktop/Spring 2020/Data Mining-I/Module-4/credit_default.csv", header=T)
library(dplyr)
library(rpart)
library(rpart.plot)
credit.data<- rename(credit.data, default=default.payment.next.month)
# convert categorical data to factor
credit.data$SEX<- as.factor(credit.data$SEX)
credit.data$EDUCATION<- as.factor(credit.data$EDUCATION)
credit.data$MARRIAGE<- as.factor(credit.data$MARRIAGE)
index <- sample(nrow(credit.data),nrow(credit.data)*0.80)
credit.train = credit.data[index,]
credit.test = credit.data[-index,]

###fit a classsification tree

credit.rpart0 <- rpart(default ~., data=credit.train, method = "class")
credit.rpart <- rpart(formula = default~., data=credit.train,method = "class",
                      parms = list(loss=matrix(c(0,10,1,0), nrow = 2)))
##c(0, False Negative, False Positive, 0). 
##If you have symmetric cost, you can ignore the parms argument.
#this tree with defaul cost minimizes the symmetric cost, 
#which is misclassification rate. We can take a look at the confusion matrix.
pred0<- predict(credit.rpart0, type="class")
table(credit.train$default, pred0, dnn = c("True", "Pred"))#we need type="class" in order to get binary prediction
##     Pred
## True    0    1
##    0 7202  324
##    1 1391  683
#we need type="class" in order to get binary prediction.

###Prediction using classification tree###
#In-sample prediction
credit.train.pred.tree1<- predict(credit.rpart, credit.train, type="class")
table(credit.train$default, credit.train.pred.tree1, dnn=c("Truth","Predicted"))
##      Predicted
## Truth    0    1
##     0 2889 4637
##     1  196 1878
#out of-sample prediction
credit.test.pred.tree1 <- predict(credit.rpart,credit.test,type="class")
table(credit.test$default,credit.test.pred.tree1,dnn=c("Truth","Predicted"))
##      Predicted
## Truth    0    1
##     0  687 1155
##     1   61  497

We can get the expected loss for this tree model by defining a cost function that has the correct weights:

cost <- function(r, pi){
  weight1 = 10
  weight0 = 1
  c1 = (r==1)&(pi==0) #logical vector - true if actual 1 but predict 0
  c0 = (r==0)&(pi==1) #logical vector - true if actual 0 but predict 1
  return(mean(weight1*c1+weight0*c0))
}
cost(credit.train$default,credit.train.pred.tree1)
## [1] 0.6871875
##Comparing this classification tree with logistic regression
#Fit logistic regression model
credit.glm<- glm(default~., data = credit.train, family=binomial)
#Get binary prediction
credit.test.pred.glm<- as.numeric(predict(credit.glm, credit.test, type="response")>0.21)
#Calculate cost using test set
cost(credit.test$default,credit.test.pred.glm)
## [1] 0.9595833
table(credit.test$default, credit.test.pred.glm, dnn=c("Truth","Predicted"))
##      Predicted
## Truth    0    1
##     0 1179  663
##     1  164  394
###ROC Curve and Cut-off Probability for Classification Tree
credit.rpart <- rpart(formula = default ~ ., data = credit.train, 
                      method = "class", parms = list(loss=matrix(c(0,10,1,0), nrow = 2)))
#Probability of getting 1
credit.test.prob.rpart = predict(credit.rpart,credit.test, type="prob")
library(ROCR)
pred = prediction(credit.test.prob.rpart[,2], credit.test$default)
perf = performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

slot(performance(pred, "auc"), "y.values")[[1]]
## [1] 0.7519794
credit.test.pred.rpart = as.numeric(credit.test.prob.rpart[,2] > 1/11)
table(credit.test$default, credit.test.pred.rpart, dnn=c("Truth","Predicted"))
##      Predicted
## Truth    0    1
##     0  687 1155
##     1   61  497

##Pruning

the smaller the cp value,the larger (complex) tree rpart will attempt to fit. The default value for cp is 0.01.

boston.largetree <- rpart(formula = medv ~ ., data = boston.train, cp = 0.001)
prp(boston.largetree)

plotcp(boston.largetree)

##You can see that the rel error (in-sample error) is always decreasing as model is 
#more complex, while the cross-validation error (measure of performance on future 
#observations) is not. That is why we prune the tree to avoid overfitting the training data.
prune(boston.largetree, cp = 0.008)
## n= 455 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 455 40279.0000 22.575380  
##    2) lstat>=9.725 269  6542.7490 17.232340  
##      4) lstat>=16.085 131  2290.4970 13.990080  
##        8) nox>=0.6615 75   887.2995 11.930670  
##         16) crim>=9.87002 37   256.2508  9.656757 *
##         17) crim< 9.87002 38   253.4539 14.144740 *
##        9) nox< 0.6615 56   659.0998 16.748210 *
##      5) lstat< 16.085 138  1567.8860 20.310140 *
##    3) lstat< 9.725 186 14950.5300 30.302690  
##      6) rm< 7.437 157  6242.0520 27.552230  
##       12) rm< 6.797 113  3661.3600 25.445130  
##         24) tax< 548 106  1151.3680 24.519810  
##           48) rm< 6.542 73   492.8556 23.275340 *
##           49) rm>=6.542 33   295.3655 27.272730 *
##         25) tax>=548 7  1044.8770 39.457140 *
##       13) rm>=6.797 44   790.5218 32.963640 *
##      7) rm>=7.437 29  1090.7590 45.193100  
##       14) ptratio>=17.6 7   465.9686 38.885710 *
##       15) ptratio< 17.6 22   257.7000 47.200000 *

###logistic regression

###chi-square test
table.edu<- table(credit.data$EDUCATION,credit.data$default)
chisq.test(table.edu)
## 
##  Pearson's Chi-squared test
## 
## data:  table.edu
## X-squared = 49.19, df = 3, p-value = 1.189e-10
##Train a logistic regression model with all X variables
credit.glm0 <- glm(default~., family = binomial,data= credit.train)
summary(credit.glm0)
## 
## Call:
## glm(formula = default ~ ., family = binomial, data = credit.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0889  -0.6962  -0.5414  -0.2903   3.2008  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.393e-01  1.542e-01  -6.092 1.12e-09 ***
## LIMIT_BAL   -9.466e-07  2.849e-07  -3.322 0.000893 ***
## SEX2        -1.486e-01  5.462e-02  -2.721 0.006512 ** 
## EDUCATION2  -1.721e-01  6.323e-02  -2.722 0.006480 ** 
## EDUCATION3  -1.999e-01  8.467e-02  -2.361 0.018207 *  
## EDUCATION4  -1.178e+00  3.202e-01  -3.681 0.000233 ***
## MARRIAGE2   -1.884e-01  6.201e-02  -3.038 0.002379 ** 
## MARRIAGE3   -2.904e-01  2.425e-01  -1.198 0.231073    
## AGE          6.389e-03  3.334e-03   1.916 0.055335 .  
## PAY_0        5.516e-01  3.146e-02  17.537  < 2e-16 ***
## PAY_2        9.547e-02  3.625e-02   2.633 0.008451 ** 
## PAY_3        3.681e-02  4.054e-02   0.908 0.363959    
## PAY_4        4.121e-02  4.512e-02   0.913 0.361060    
## PAY_5        6.237e-02  4.766e-02   1.309 0.190673    
## PAY_6        1.228e-03  3.978e-02   0.031 0.975384    
## BILL_AMT1   -7.703e-06  2.053e-06  -3.753 0.000175 ***
## BILL_AMT2    5.140e-06  2.550e-06   2.015 0.043865 *  
## BILL_AMT3    2.618e-06  2.161e-06   1.211 0.225712    
## BILL_AMT4   -4.929e-06  2.473e-06  -1.993 0.046238 *  
## BILL_AMT5    5.885e-06  2.802e-06   2.101 0.035683 *  
## BILL_AMT6   -1.786e-06  2.079e-06  -0.859 0.390283    
## PAY_AMT1    -1.375e-05  3.793e-06  -3.624 0.000290 ***
## PAY_AMT2    -9.065e-06  3.348e-06  -2.708 0.006770 ** 
## PAY_AMT3    -3.909e-06  3.531e-06  -1.107 0.268289    
## PAY_AMT4    -4.209e-06  2.884e-06  -1.459 0.144442    
## PAY_AMT5     2.470e-06  2.761e-06   0.895 0.370858    
## PAY_AMT6    -3.069e-06  2.155e-06  -1.424 0.154505    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10019.6  on 9599  degrees of freedom
## Residual deviance:  8815.2  on 9573  degrees of freedom
## AIC: 8869.2
## 
## Number of Fisher Scoring iterations: 6
credit.glm0$deviance
## [1] 8815.201
AIC(credit.glm0)
## [1] 8869.201
BIC(credit.glm0)
## [1] 9062.778
##Prediction
#in sample prediction
library(ROCR)
pred.glm0.train <- predict(credit.glm0,type = "response")
pred <- prediction(pred.glm0.train,credit.train$default)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)

unlist(slot(performance(pred,"auc"),"y.values"))
## [1] 0.7238995

##Precision-Recall Curve #Precision and recall curve and its AUC is more appropriate for imbalanced data.

library(PRROC)
score1 <- pred.glm0.train[credit.train$default==1]
score0 <- pred.glm0.train[credit.train$default==0]
roc = roc.curve(score1,score0,curve = T)
roc$auc
## [1] 0.7238995
pr= pr.curve(score1, score0, curve = T)
pr
## 
##   Precision-recall curve
## 
##     Area under curve (Integral):
##      0.4998509 
## 
##     Area under curve (Davis & Goadrich):
##      0.4998408 
## 
##     Curve for scores from  1.910549e-08  to  0.9933227 
##     ( can be plotted with plot(x) )
plot(pr)

##Out-of-sample prediction
pred.glm0.test <- predict(credit.glm0,newdata=credit.test,type="response")
##ROC curve
pred <- prediction(pred.glm0.test,credit.test$default)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)

unlist(slot(performance(pred,"auc"),"y.values"))
## [1] 0.7461579
##Precision-Recall Curve
score1.test<- pred.glm0.test[credit.test$default==1]
score0.test<- pred.glm0.test[credit.test$default==0]
roc.test<- roc.curve(score1.test, score0.test, curve = T)
roc.test$auc
## [1] 0.7461579
pr.test<-pr.curve(score1.test, score0.test, curve = T)
##Binary Classification
#Choosing a large cut-off probability will result in few 
#cases being predicted as 1, and chossing a small cut-off probability 
#will result in many cases being predicted as 1.
table((pred.glm0.train > 0.9)*1)
## 
##    0    1 
## 9581   19
##Naive Choice of Cut-off probability
pcut1 <- mean(credit.train$default)
class.glm0.train <- (pred.glm0.train >pcut1)
table(credit.train$default,class.glm0.train,dnn=c("Ture","predicted"))
##     predicted
## Ture FALSE TRUE
##    0  5160 2366
##    1   713 1361
##MR,FPR,FNR
(MR<- mean(credit.train$default!=class.glm0.train))
## [1] 0.3207292
(FPR <- sum(credit.train$default==0&class.glm0.train==1)/sum(credit.train$default==0))
## [1] 0.3143768
(FNR <- sum(credit.train$default==1&class.glm0.train==0)/sum(credit.train$default==1))
## [1] 0.3437801
##Determine Optimal cut-off Probability using Grid Search Method
costfunc=function(obs,pred.p,pcut){
  weight1=5
  weight0=1
  c1= (obs==1 & pred.p<pcut)
  c0= (obs==0 & pred.p>pcut)
  cost=mean(weight1*c1 + weight0*c0)
  return(cost)} 
#define a sequence of probability (you need to search the optimal p-cut from 
#this sequence)
p.seq = seq(0.01,1,0.01)
cost = rep(0,length(p.seq))
for(i in 1:length(p.seq)){
  cost[i]=costfunc(obs=credit.train$default,pred.p=pred.glm0.train,pcut = p.seq[i])}
optimal.pcut.glm0 = p.seq[which(cost==min(cost))]
##Use the optimal cut-off probability
class.glm0.train.opt <- pred.glm0.train>optimal.pcut.glm0
table(credit.train$default,class.glm0.train.opt,dnn = c("True", "Predicted"))
##     Predicted
## True FALSE TRUE
##    0  5281 2245
##    1   733 1341
MR<- mean(credit.train$default!= class.glm0.train.opt)
FPR<- sum(credit.train$default==0 & class.glm0.train.opt==1)/sum(credit.train$default==0)
FNR<- sum(credit.train$default==1 & class.glm0.train.opt==0)/sum(credit.train$default==1)
cost<- costfunc(obs = credit.train$default, pred.p = pred.glm0.train, pcut = optimal.pcut.glm0)