#install.packages(c("ROCR", "ipred", "gbm"))
# packages used in this module: ipred (bagging), ROCR (ROC/AUC), gbm (boosting)
library(ipred)
library(ROCR)
library(gbm)

622 Code Algorithms

Bagging and Boosting

#Path of data file used in this module
B_B_path <- "/Users/josemawyin/Library/Mobile Documents/com~apple~CloudDocs/Data Science Masters /622/Bagging_Boosting/binary.csv"
admit_data <- read.csv(B_B_path, header=TRUE)
head(admit_data)
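The admit and rank columns load as plain integers; a quick structure check (a hypothetical extra step, not part of the original script) shows this and motivates the factor conversion in the next chunk.

# inspect column types before converting admit and rank to factors
str(admit_data)
table(admit_data$admit)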

Model

#make some columns factors 
fad<-data.frame(as.factor(admit_data$admit),admit_data$gre, admit_data$gpa,as.factor(admit_data$rank)) 
names(fad)<-names(admit_data) 
# create test and training set 
set.seed(43) 
tstset<-sample(400,120,replace=FALSE) 
# 30% hold out test set 
admit_trdata<-fad[-tstset,] 
admit_tstdata<-fad[tstset,]
# generate model 
set.seed(43) 
iter=10 
bagfit.admit <-bagging(admit~.,data=admit_trdata,coob=T,nbagg=iter)
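Because coob=T requests an out-of-bag error estimate, the fitted object can be inspected before touching the hold-out set. A small sketch; the $err slot is my reading of ipred's classbagg object and should be treated as an assumption.

# out-of-bag misclassification error estimate (requested via coob = TRUE)
print(bagfit.admit)
bagfit.admit$err  # assumed slot holding the OOB error estimate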
#Performance
bag.pred <-predict(bagfit.admit,admit_tstdata[,-1]) 
# predicted class labels (not probabilities) for the hold-out test set
bag.pred.result<-data.frame(actual=admit_tstdata[,1],predicted=bag.pred) 
# confusion matrix (aka contingency table) 
table(actual=admit_tstdata[,1],predicted = bag.pred) 
##       predicted
## actual  0  1
##      0 57 26
##      1 24 13
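The usual summary rates can be read straight off this confusion matrix; a minimal sketch (the object names below are illustrative, not from the original):

cm <- table(actual = admit_tstdata[,1], predicted = bag.pred)
accuracy    <- sum(diag(cm)) / sum(cm)        # (57 + 13) / 120 from the table above
sensitivity <- cm["1", "1"] / sum(cm["1", ])  # recall on the admitted class
accuracy; sensitivity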
#pradmit.number<-as.numeric(predicted_admit)
prediction.admit.bag<-prediction(as.numeric(bag.pred),admit_tstdata$admit) 
performance.admit.bag<- performance(prediction.admit.bag,measure='tpr', x.measure='fpr') 

auc.admit.bag<-performance(prediction.admit.bag,measure='auc')

# plot and display AUC 
plot(performance.admit.bag, main="ROC Curve for Bagged ADMIT data") 

auc.admit.bag@y.values[[1]]
## [1] 0.5190492
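The AUC sits near 0.5 partly because the ROC curve is built from hard class labels, which gives a single operating point. ipred's predict method can also return class probabilities via type = "prob"; a hedged sketch of a probability-based ROC for the same fitted model:

# class probabilities instead of hard class labels
bag.prob <- predict(bagfit.admit, admit_tstdata[,-1], type = "prob")
# second column is taken as P(admit = 1)
pred.prob <- prediction(bag.prob[, 2], admit_tstdata$admit)
plot(performance(pred.prob, 'tpr', 'fpr'), main = "ROC Curve from bagged class probabilities")
performance(pred.prob, measure = 'auc')@y.values[[1]]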

How does the AUC change with the number of iterations?

# generate model 
set.seed(43) 
# collect (iterations, AUC) pairs across runs
AUC_value <- data.frame()

for (iter in seq(10,100,10)){
  bagfit.admit <-bagging(admit~.,data=admit_trdata,coob=T,nbagg=iter)
  #Performance
  bag.pred <-predict(bagfit.admit,admit_tstdata[,-1]) 
  # predicted class labels for the hold-out test set
  bag.pred.result<-data.frame(actual=admit_tstdata[,1],predicted=bag.pred) 
  # confusion matrix (aka contingency table) 
  table(actual=admit_tstdata[,1],predicted = bag.pred) 
  #pradmit.number<-as.numeric(predicted_admit)
  prediction.admit.bag<-prediction(as.numeric(bag.pred),admit_tstdata$admit) 
  performance.admit.bag<- performance(prediction.admit.bag,measure='tpr', x.measure='fpr') 
  
  auc.admit.bag<-performance(prediction.admit.bag,measure='auc')
  
  
  AUC_value <- rbind(AUC_value, data.frame(iterations = iter, AUC = auc.admit.bag@y.values[[1]]))
}
plot(AUC_value, type = "b", main = "AUC vs. number of bagging iterations")

==========================================

XGBoost and gbm::gbm: data load

path<-B_B_path
admit_data<-read.csv(path, header=TRUE) 
head(admit_data) 
#make some columns factors 
fad<-data.frame(as.factor(admit_data$admit),admit_data$gre,
admit_data$gpa,as.factor(admit_data$rank)) 
names(fad)<-names(admit_data) 
set.seed(43) 
tstset<-sample(400,120,replace=FALSE) 
# 30% hold out test set
admit_trdata<-fad[-tstset,] 
admit_tstdata<-fad[tstset,]
# model 
mod_gbm = gbm(admit ~.,               
            data = admit_trdata,               
            distribution = "multinomial",               
            cv.folds = 10,               
            shrinkage = .01,               
            n.minobsinnode = 10,               
            n.trees = 200) 
print(mod_gbm)
## gbm(formula = admit ~ ., distribution = "multinomial", data = admit_trdata, 
##     n.trees = 200, n.minobsinnode = 10, shrinkage = 0.01, cv.folds = 10)
## A gradient boosted model with multinomial loss function.
## 200 iterations were performed.
## The best cross-validation iteration was 167.
## There were 3 predictors of which 3 had non-zero influence.
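The printout reports the best cross-validation iteration (167 here). gbm.perf() can recover that number so the prediction step uses it instead of the full 200 trees; a short sketch, assuming the fitted model above:

# CV-optimal number of trees (also draws the CV error curve)
best_iter <- gbm.perf(mod_gbm, method = "cv")
pred_best <- predict(mod_gbm, newdata = admit_tstdata, n.trees = best_iter, type = "response")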
pred = predict.gbm(object = mod_gbm,                    
                   newdata = admit_tstdata,                    
                   n.trees = 200,                    
                   type = "response")
labels<-colnames(pred)[apply(pred,1,which.max)] 
result<-data.frame(admit_tstdata$admit,labels)
confusionMatrix<- table(actual=result$admit_tstdata.admit, predicted=result$labels) 

pradmit.number<-as.numeric(result$labels) 
actual.number<-as.numeric(result$admit_tstdata.admit) 
pr<-prediction(pradmit.number,actual.number) 
auc_data<-performance(pr,"tpr","fpr") 
plot(auc_data,main="ROC Curve for GBM ADMIT data") 

aucval<-performance(pr,measure="auc") 
aucval@y.values[[1]]
## [1] 0.4923478
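The section title mentions XGBoost, but only gbm is fitted above. A rough equivalent with the xgboost package might look like the sketch below; xgboost needs a numeric design matrix, so the rank factor is one-hot encoded with model.matrix, and all object names here are illustrative.

library(xgboost)
# numeric design matrices; model.matrix one-hot encodes the rank factor
X_tr  <- model.matrix(admit ~ . - 1, data = admit_trdata)
X_tst <- model.matrix(admit ~ . - 1, data = admit_tstdata)
y_tr  <- as.numeric(as.character(admit_trdata$admit))

set.seed(43)
xgb_fit <- xgboost(data = X_tr, label = y_tr, nrounds = 200,
                   eta = 0.01, max_depth = 3,
                   objective = "binary:logistic", verbose = 0)

# predicted probabilities of admission on the hold-out set
xgb_prob <- predict(xgb_fit, X_tst)
pr_xgb <- prediction(xgb_prob, admit_tstdata$admit)
plot(performance(pr_xgb, 'tpr', 'fpr'), main = "ROC Curve for XGBoost ADMIT data")
performance(pr_xgb, measure = "auc")@y.values[[1]]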