#install.packages("ROCR")
#Path of data file used in this module
B_B_path <- "/Users/josemawyin/Library/Mobile Documents/com~apple~CloudDocs/Data Science Masters /622/Bagging_Boosting/binary.csv"
admit_data <- read.csv(B_B_path,head=TRUE)
head(admit_data)
Model
#make some columns factors
fad<-data.frame(as.factor(admit_data$admit),admit_data$gre, admit_data$gpa,as.factor(admit_data$rank))
names(fad)<-names(admit_data)
# create test and training set
set.seed(43)
tstset<-sample(400,120,replace=FALSE)
# 30% hold out test set
admit_trdata<-fad[-tstset,]
admit_tstdata<-fad[tstset,]
# generate model
set.seed(43)
iter=10
bagfit.admit <-bagging(admit~.,data=admit_trdata,coob=T,nbagg=iter)
#Performance
bag.pred <-predict(bagfit.admit,admit_tstdata[,-1])
# probabilities -->
bag.pred.result<-data.frame(actual=admit_tstdata[,1],predicted=bag.pred)
# confusion matrix (aka contingency table)
table(actual=admit_tstdata[,1],predicted = bag.pred)
## predicted
## actual 0 1
## 0 57 26
## 1 24 13
#pradmit.number<-as.numeric(predicted_admit)
prediction.admit.bag<-prediction(as.numeric(bag.pred),admit_tstdata$admit)
performance.admit.bag<- performance(prediction.admit.bag,measure='tpr', x.measure='fpr')
auc.admit.bag<-performance(prediction.admit.bag,measure='auc')
# plot and display AUC
plot(performance.admit.bag, main="ROC Curve for Bagged ADMIT data")
auc.admit.bag@y.values[[1]]
## [1] 0.5190492
How does the AUC change given the nhmber of iterations?
# generate model
set.seed(43)
AUC_value = data.frame()
for (iter in seq(10,100,10)){
bagfit.admit <-bagging(admit~.,data=admit_trdata,coob=T,nbagg=iter)
#Performance
bag.pred <-predict(bagfit.admit,admit_tstdata[,-1])
# probabilities -->
bag.pred.result<-data.frame(actual=admit_tstdata[,1],predicted=bag.pred)
# confusion matrix (aka contingency table)
table(actual=admit_tstdata[,1],predicted = bag.pred)
#pradmit.number<-as.numeric(predicted_admit)
prediction.admit.bag<-prediction(as.numeric(bag.pred),admit_tstdata$admit)
performance.admit.bag<- performance(prediction.admit.bag,measure='tpr', x.measure='fpr')
auc.admit.bag<-performance(prediction.admit.bag,measure='auc')
AUC_value <- rbind(AUC_value,c(iter,auc.admit.bag@y.values[[1]]))
}
plot(AUC_value)
XGBoost and gbm:gbm data load
path<-B_B_path
#ad<-read.csv(path,head=T)
#fad<-data.frame(as.factor(ad$admit),ad$gre,ad$gpa,as.factor(ad$rank))
admit_data<-read.csv(path,head=TRUE);
head(admit_data)
#make some columns factors
fad<-data.frame(as.factor(admit_data$admit),admit_data$gre,
admit_data$gpa,as.factor(admit_data$rank))
names(fad)<-names(admit_data)
set.seed(43)
tstset<-sample(400,120,replace=FALSE)
# 30% hold out test set
admit_trdata<-fad[-tstset,]
admit_tstdata<-fad[tstset,]
# model
mod_gbm = gbm(admit ~.,
data = admit_trdata,
distribution = "multinomial",
cv.folds = 10,
shrinkage = .01,
n.minobsinnode = 10,
n.trees = 200)
print(mod_gbm)
## gbm(formula = admit ~ ., distribution = "multinomial", data = admit_trdata,
## n.trees = 200, n.minobsinnode = 10, shrinkage = 0.01, cv.folds = 10)
## A gradient boosted model with multinomial loss function.
## 200 iterations were performed.
## The best cross-validation iteration was 167.
## There were 3 predictors of which 3 had non-zero influence.
pred = predict.gbm(object = mod_gbm,
newdata = admit_tstdata,
n.trees = 200,
type = "response")
labels<-colnames(pred)[apply(pred,1,which.max)]
result<-data.frame(admit_tstdata$admit,labels)
confusionMatrix<- table(actual=result$admit_tstdata.admit, predicted=result$labels)
pradmit.number<-as.numeric(result$labels)
actual.number<-as.numeric(result$admit_tstdata.admit)
pr<-prediction(pradmit.number,actual.number)
auc_data<-performance(pr,"tpr","fpr")
plot(auc_data,main="ROC Curve for GBM ADMIT data")
aucval<-performance(pr,measure="auc")
aucval@y.values[[1]]
## [1] 0.4923478