'setwd("/Users/dominator/Desktop/CSSA/DataMining&Visualization/Fundrasing Project/Fundrasing Project")'
fundrasing<-read.csv("Fundraising.csv")
future<-read.csv("FutureFundraising.csv")
set.seed(12345)
fundrasing<-fundrasing[c(-1,-2,-24)]
fundrasing$TARGET_B<-as.factor(fundrasing$TARGET_B)
with(fundrasing,table(fundrasing$TARGET_B))

Here we can notice that records of TARGET_B=0 and TARGET_B=1 are the same number. There are both 1560 records.

Firstly, Select Logistic Model as 1st classification model here

library(dplyr)
library(aod)
library(ggplot2)
library(car)
library(Hmisc)
library(rms)
library(ResourceSelection)
library(QuantPsyc)
library(InformationValue)
library(readxl)
library(corrplot)
res<-rcorr(as.matrix(fundrasing))
corrplot(res$P, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)

Consider rcorr of each input variables to TARGET_B, P-value for WEALTH and IC15 too large, drop these two variables

Initial model:Logistic Regression for Model w/No Predictors

logit0<-glm(TARGET_B~1,data=fundrasing,family='binomial')
summary(logit0)
wald.test(b = coef(logit0), Sigma = vcov(logit0), Terms = 1)
ClassLog(logit0, fundrasing$TARGET_B, cut = .5)

split train(60%) and valid(40%) dataset

train.index=sample(1:nrow(fundrasing),dim(fundrasing)[1]*0.6)
train=fundrasing[train.index,]
valid=fundrasing[-train.index,]

run a logistic full model without WALTH and IC15

logitfull<-glm(TARGET_B~zipconvert_2+zipconvert_3+zipconvert_4+zipconvert_5+homeowner.dummy+NUMCHLD
               +INCOME+gender.dummy+HV+Icmed+Icavg+NUMPROM+RAMNTALL+MAXRAMNT+LASTGIFT+totalmonths+
                 TIMELAG+AVGGIFT,data=train, family = "binomial")
summary(logitfull)

from P-value of all input variables, we shall drop ZIP for all of these dummy variables is not significant and hommeowner.dummy and RAMNTALL,MAXRAMNT will also be dropped for these P-value nearly 0.5

logit1<-glm(TARGET_B~NUMCHLD+INCOME+gender.dummy+HV+Icmed+Icavg+NUMPROM+LASTGIFT+totalmonths+
                 TIMELAG+AVGGIFT,data=train, family = "binomial")
summary(logit1)

drop the input variables:NUMCHLD, Icmed, Icavg,TIMELAG, AVGGIFT which above significance 0.1

logit2<-glm(TARGET_B~INCOME+gender.dummy+HV+NUMPROM+LASTGIFT+totalmonths,data=train, family = "binomial")
summary(logit2)

drop the input variables:HV which above significance 0.05

logit3<-glm(TARGET_B~INCOME+gender.dummy+NUMPROM+LASTGIFT+totalmonths,data=train, family = "binomial")
summary(logit3)

drop gender.dummy which significance is above 0.05

logit4<-glm(TARGET_B~INCOME+NUMPROM+LASTGIFT+totalmonths,data=train, family = "binomial")
summary(logit4)
anova(logit4,test = "Chisq")

Logit4 model can be our final optimal logist model that all input variables are significant. Then carry out Wald test for best version

wt<-matrix(nrow=length(coef(logit4)),ncol=3)
for( i in c(1:length(coef(logit4))) ){
  wttemp<-wald.test(b = coef(logit4), Sigma = vcov(logit4), Terms =i)
  wt[i,2]<-wttemp$result$chi2[3]
  wt[i,3]<-wttemp$result$chi2[1]
  wt[i,1]<-names(logit4$coefficients[i])
}
wt 

Since logit4 model is our final optimal logistic model, we shall use valid dataset to do prediction and compared with actual response

prob<-predict(logit4,valid,type='response')
pred_logit<-ifelse(prob>=0.5,1,0)
table(valid$TARGET_B,pred_logit)

Get an optimal Cutoff Point Drow and Draw ROC plot.

p<-plogis(predict(logit4,train))
optcutoff<-optimalCutoff(train$TARGET_B,p)[1]
optcutoff
ClassLog(logit4, train$TARGET_B, cut =optcutoff)
plotROC(train$TARGET_B,p)

As we can see the optimal cutoff point is 0.494912 and AUROC=0.5879

Second model: AdaBoost (adaptive boost) algorithm

library(adabag)
model <- boosting(TARGET_B~., data =train,mfinal = 20,boos = TRUE)

boos=TRUE means a bootstrap sample of the training set is drawn using the weights for each observation on that iteration.

mfinal=100 means the model will run 100 iterations

boost_pred=predict(model,valid)$class
table(boost_pred,valid$TARGET_B)
library(caret)
caret::confusionMatrix(factor(boost_pred),valid$TARGET_B)

The predict accuracy in Valid dataset is 0.5329

Part2 Calculate Net Profit

first create a dataframe in advance to save the net profit later.

Net_profit=data.frame('Model'=c('Logistic Model','AdaBoost'),'Net.Profit.train'=NA,'Net.Profit.valid'=NA)

Then we calculate the net profit, and we need to undo the weighted sampling

train_pred<-predict(model,train)$class
x<-as.numeric(train_pred)
Net_profit[2,2]<-sum(x)*(13-0.68)/9.8-(length(x)-sum(x))*0.68/0.53

valid_pred<-predict(model,valid)$class
x<-as.numeric(valid_pred)
Net_profit[2,3]<-sum(x)*(13-0.68)/9.8-(length(x)-sum(x))*0.68/0.53

train_pred2<-as.matrix(predict(logit4,train,type='response'))
x<-ifelse(train_pred2[,1]>optcutoff,1,0)
Net_profit[1,2]<-sum(x)*(13-0.68)/9.8-(length(x)-sum(x))*0.68/0.53

valid_pred2<-as.matrix(predict(logit4,valid,type='response'))
x<-ifelse(valid_pred2[,1]>optcutoff,1,0)
Net_profit[1,3]<-sum(x)*(13-0.68)/9.8-(length(x)-sum(x))*0.68/0.53
Net_profit

Draw Net Profit lift curves

donar <- (13 - 0.68)/9.8
non.donar <- (-0.68)/0.53
## for Logistic Regression
lift.table<-data.frame(cbind(as.numeric(as.character(valid$TARGET_B)),valid_pred2))
names(lift.table)=c('TARGET_B','pred')
lift.table=lift.table[order(-lift.table$pred),]
lift.table=lift.table%>%mutate(net.profit=ifelse(as.numeric(TARGET_B)==1,donar,non.donar))
lift.table$Cum.net.profit=lift.table$net.profit
for (i in 2:nrow(lift.table)){
  lift.table[i,4]<-lift.table[i-1,4]+lift.table[i,3]
}
g1=ggplot(lift.table,aes(y=Cum.net.profit,x=1:nrow(lift.table),color='red'))+geom_line()

valid_pred_p<-predict(model,valid)$prob
lift.table2=data.frame(cbind(as.numeric(as.character(valid$TARGET_B)),valid_pred_p[,2]))
names(lift.table2)=c('TARGET_B','pred')
lift.table2=lift.table2[order(-lift.table2$pred),]
lift.table2=lift.table2%>%mutate(net.profit=ifelse(as.numeric(TARGET_B==1),donar,non.donar))
lift.table2$Cum.net.profit=lift.table2$net.profit
for (i in 2:nrow(lift.table2)){
  lift.table2[i,4]<-lift.table2[i-1,4]+lift.table2[i,3]
}
g2=ggplot(lift.table2,aes(y=Cum.net.profit,x=1:nrow(lift.table2),color='red'))+geom_line()
lift=data.frame('logistic Model'=lift.table$Cum.net.profit,'AdaBoost'=lift.table2$Cum.net.profit)
lift$row=c(1:nrow(lift))
g=ggplot() + 
  geom_line(data = lift.table, aes(x = 1:nrow(lift.table), y = Cum.net.profit), color = "blue") +
  geom_line(data = lift.table2, aes(x = 1:nrow(lift.table2), y = Cum.net.profit), color = "red") +
  xlab('row') +
  ylab('Cum.net.profit')+ggtitle("Net Profit lift Curve: Blue-Logistic, Red-Adaboost")
g
lift.actual=data.frame(as.numeric(as.character(valid$TARGET_B)))
names(lift.actual)<-c('TARGET_B')
lift.actual=lift.actual%>%
  mutate(net.profit=ifelse(TARGET_B==1,donar,non.donar))
lift.actual=lift.actual[order(-lift.actual$TARGET_B),]
lift.actual$Cum.net.profit=lift.actual$net.profit
for (i in 2:nrow(lift.actual)){
  lift.actual[i,3]<-lift.actual[i-1,3]+lift.actual[i,2]
}
g_actual=ggplot(lift.actual,aes(x=1:nrow(lift.actual),y=Cum.net.profit),color='green')+geom_line()+xlab('row') +ylab('Cum.net.profit')+ggtitle("Actual Net Profit lift Curve")
library(ggpubr)
ggarrange(g,g_actual,ncol=1,nrow = 2)

From Lift curve, I prefer to select logistic model as our best model. The reasons are showed below:

1.Supposing all the predictors are correct then we sorted the probabilities in an decreasing order, the net profit lift curves should firstlt increase smoothly to the top peek point and then decease to the end as showed in above. Considering two cases, the blue one fits the actual lift curve chart well. Thatโ€™s why logistical model fits well 2. The predicting accuracy in valid datasets of logistic model is better.

future<-future[,-c(2)]
pred_future=as.matrix(predict(logit4,future,type='response'))
future$pred=pred_future
future=future %>% mutate(TARGET_B=ifelse(pred>optcutoff,1,0))
future=future[order(-future$pred),]
head(future[,c(2,22,24)],10)
table(future$TARGET_B)

Therefore, 997 mails can be donors.

---
title: "Fundrasing report"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

```


```{r read csv}
'setwd("/Users/dominator/Desktop/CSSA/DataMining&Visualization/Fundrasing Project/Fundrasing Project")'
fundrasing<-read.csv("Fundraising.csv")
future<-read.csv("FutureFundraising.csv")
set.seed(12345)
fundrasing<-fundrasing[c(-1,-2,-24)]
fundrasing$TARGET_B<-as.factor(fundrasing$TARGET_B)
with(fundrasing,table(fundrasing$TARGET_B))
```
## Here we can notice that records of TARGET_B=0 and TARGET_B=1 are the same number. There are both 1560 records.
## Firstly, Select Logistic Model as 1st classification model here
```{r}
library(dplyr)
library(aod)
library(ggplot2)
library(car)
library(Hmisc)
library(rms)
library(ResourceSelection)
library(QuantPsyc)
library(InformationValue)
library(readxl)
library(corrplot)
```
```{r}
res<-rcorr(as.matrix(fundrasing))
corrplot(res$P, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)
```
#Consider rcorr of each input variables to TARGET_B, P-value for WEALTH and IC15 too large, drop these two variables

#Initial model:Logistic Regression for Model w/No Predictors#
```{r logistic model}
logit0<-glm(TARGET_B~1,data=fundrasing,family='binomial')
summary(logit0)
wald.test(b = coef(logit0), Sigma = vcov(logit0), Terms = 1)
ClassLog(logit0, fundrasing$TARGET_B, cut = .5)
```
#split train(60%) and valid(40%) dataset
```{r}
train.index=sample(1:nrow(fundrasing),dim(fundrasing)[1]*0.6)
train=fundrasing[train.index,]
valid=fundrasing[-train.index,]
```
##run a logistic full model without WALTH and IC15
```{r}
logitfull<-glm(TARGET_B~zipconvert_2+zipconvert_3+zipconvert_4+zipconvert_5+homeowner.dummy+NUMCHLD
               +INCOME+gender.dummy+HV+Icmed+Icavg+NUMPROM+RAMNTALL+MAXRAMNT+LASTGIFT+totalmonths+
                 TIMELAG+AVGGIFT,data=train, family = "binomial")
summary(logitfull)
```
## from P-value of all input variables, we shall drop ZIP for all of these dummy variables is not significant and hommeowner.dummy and RAMNTALL,MAXRAMNT will also be dropped for these P-value nearly 0.5

```{r}
logit1<-glm(TARGET_B~NUMCHLD+INCOME+gender.dummy+HV+Icmed+Icavg+NUMPROM+LASTGIFT+totalmonths+
                 TIMELAG+AVGGIFT,data=train, family = "binomial")
summary(logit1)
```
###drop the input variables:NUMCHLD, Icmed, Icavg,TIMELAG, AVGGIFT which above significance 0.1
```{r}
logit2<-glm(TARGET_B~INCOME+gender.dummy+HV+NUMPROM+LASTGIFT+totalmonths,data=train, family = "binomial")
summary(logit2)
```
###drop the input variables:HV which above significance 0.05
```{r}
logit3<-glm(TARGET_B~INCOME+gender.dummy+NUMPROM+LASTGIFT+totalmonths,data=train, family = "binomial")
summary(logit3)
```
## drop gender.dummy which significance is above 0.05
```{r}
logit4<-glm(TARGET_B~INCOME+NUMPROM+LASTGIFT+totalmonths,data=train, family = "binomial")
summary(logit4)
anova(logit4,test = "Chisq")
```
##Logit4 model can be our final optimal logist model that all input variables are significant. Then carry out Wald test for best version
```{r}
wt<-matrix(nrow=length(coef(logit4)),ncol=3)
for( i in c(1:length(coef(logit4))) ){
  wttemp<-wald.test(b = coef(logit4), Sigma = vcov(logit4), Terms =i)
  wt[i,2]<-wttemp$result$chi2[3]
  wt[i,3]<-wttemp$result$chi2[1]
  wt[i,1]<-names(logit4$coefficients[i])
}
wt 
```
### Since logit4 model is our final optimal logistic model, we shall use valid dataset to do prediction and compared with actual response
```{r}
prob<-predict(logit4,valid,type='response')
pred_logit<-ifelse(prob>=0.5,1,0)
table(valid$TARGET_B,pred_logit)
```

##Get an optimal Cutoff Point Drow and Draw ROC plot.
```{r}
p<-plogis(predict(logit4,train))
optcutoff<-optimalCutoff(train$TARGET_B,p)[1]
optcutoff
ClassLog(logit4, train$TARGET_B, cut =optcutoff)
plotROC(train$TARGET_B,p)
```
##As we can see the optimal cutoff point is  0.494912 and AUROC=0.5879

### Second model: AdaBoost (adaptive boost) algorithm 
```{r}
library(adabag)
model <- boosting(TARGET_B~., data =train,mfinal = 20,boos = TRUE)
```
##boos=TRUE means a bootstrap sample of the training set is drawn using the weights for each observation on that iteration. 
mfinal=100 means the model will run 100 iterations
```{r}
boost_pred=predict(model,valid)$class
table(boost_pred,valid$TARGET_B)
library(caret)
caret::confusionMatrix(factor(boost_pred),valid$TARGET_B)
```
## The predict accuracy in Valid dataset is 0.5329

##Part2 Calculate Net Profit
##first create a dataframe in advance to save the net profit later.
```{r}
Net_profit=data.frame('Model'=c('Logistic Model','AdaBoost'),'Net.Profit.train'=NA,'Net.Profit.valid'=NA)
```
## Then we calculate the net profit, and we need to undo the weighted sampling 
```{r}
train_pred<-predict(model,train)$class
x<-as.numeric(train_pred)
Net_profit[2,2]<-sum(x)*(13-0.68)/9.8-(length(x)-sum(x))*0.68/0.53

valid_pred<-predict(model,valid)$class
x<-as.numeric(valid_pred)
Net_profit[2,3]<-sum(x)*(13-0.68)/9.8-(length(x)-sum(x))*0.68/0.53

train_pred2<-as.matrix(predict(logit4,train,type='response'))
x<-ifelse(train_pred2[,1]>optcutoff,1,0)
Net_profit[1,2]<-sum(x)*(13-0.68)/9.8-(length(x)-sum(x))*0.68/0.53

valid_pred2<-as.matrix(predict(logit4,valid,type='response'))
x<-ifelse(valid_pred2[,1]>optcutoff,1,0)
Net_profit[1,3]<-sum(x)*(13-0.68)/9.8-(length(x)-sum(x))*0.68/0.53
Net_profit
```
###Draw Net Profit lift curves
```{r}
donar <- (13 - 0.68)/9.8
non.donar <- (-0.68)/0.53
## for Logistic Regression
lift.table<-data.frame(cbind(as.numeric(as.character(valid$TARGET_B)),valid_pred2))
names(lift.table)=c('TARGET_B','pred')
lift.table=lift.table[order(-lift.table$pred),]
lift.table=lift.table%>%mutate(net.profit=ifelse(as.numeric(TARGET_B)==1,donar,non.donar))
lift.table$Cum.net.profit=lift.table$net.profit
for (i in 2:nrow(lift.table)){
  lift.table[i,4]<-lift.table[i-1,4]+lift.table[i,3]
}
g1=ggplot(lift.table,aes(y=Cum.net.profit,x=1:nrow(lift.table),color='red'))+geom_line()

valid_pred_p<-predict(model,valid)$prob
lift.table2=data.frame(cbind(as.numeric(as.character(valid$TARGET_B)),valid_pred_p[,2]))
names(lift.table2)=c('TARGET_B','pred')
lift.table2=lift.table2[order(-lift.table2$pred),]
lift.table2=lift.table2%>%mutate(net.profit=ifelse(as.numeric(TARGET_B==1),donar,non.donar))
lift.table2$Cum.net.profit=lift.table2$net.profit
for (i in 2:nrow(lift.table2)){
  lift.table2[i,4]<-lift.table2[i-1,4]+lift.table2[i,3]
}
g2=ggplot(lift.table2,aes(y=Cum.net.profit,x=1:nrow(lift.table2),color='red'))+geom_line()
lift=data.frame('logistic Model'=lift.table$Cum.net.profit,'AdaBoost'=lift.table2$Cum.net.profit)
lift$row=c(1:nrow(lift))
g=ggplot() + 
  geom_line(data = lift.table, aes(x = 1:nrow(lift.table), y = Cum.net.profit), color = "blue") +
  geom_line(data = lift.table2, aes(x = 1:nrow(lift.table2), y = Cum.net.profit), color = "red") +
  xlab('row') +
  ylab('Cum.net.profit')+ggtitle("Net Profit lift Curve: Blue-Logistic, Red-Adaboost")
g
lift.actual=data.frame(as.numeric(as.character(valid$TARGET_B)))
names(lift.actual)<-c('TARGET_B')
lift.actual=lift.actual%>%
  mutate(net.profit=ifelse(TARGET_B==1,donar,non.donar))
lift.actual=lift.actual[order(-lift.actual$TARGET_B),]
lift.actual$Cum.net.profit=lift.actual$net.profit
for (i in 2:nrow(lift.actual)){
  lift.actual[i,3]<-lift.actual[i-1,3]+lift.actual[i,2]
}
g_actual=ggplot(lift.actual,aes(x=1:nrow(lift.actual),y=Cum.net.profit),color='green')+geom_line()+xlab('row') +ylab('Cum.net.profit')+ggtitle("Actual Net Profit lift Curve")
library(ggpubr)
ggarrange(g,g_actual,ncol=1,nrow = 2)
```
##From Lift curve, I prefer to select logistic model as our best model. The reasons are showed below:
  1.Supposing all the predictors are correct then we sorted the probabilities in an decreasing order, the net profit lift curves should firstlt increase smoothly to the top peek point and then decease to the end as showed in above. Considering two cases, the blue one fits the actual lift curve chart well. That's why logistical model fits well
  2. The predicting accuracy in valid datasets of logistic model is better.
```{r}
future<-future[,-c(2)]
pred_future=as.matrix(predict(logit4,future,type='response'))
future$pred=pred_future
future=future %>% mutate(TARGET_B=ifelse(pred>optcutoff,1,0))
future=future[order(-future$pred),]
head(future[,c(2,22,24)],10)
table(future$TARGET_B)
```
Therefore, 997 mails can be donors.