Train Test Split
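
No chunk below attaches packages explicitly, so here is the set implied by the functions used (a sketch inferred from the calls in this document; adjust as needed):

library(SMCRM)           # acquisitionRetention data
library(ROCR)            # prediction(), performance()
library(rpart)           # decision trees
library(randomForestSRC) # rfsrc(), max.subtree(), find.interaction()
library(car)             # Anova()
library(ggplot2)         # plots
library(dplyr)           # %>% pipe
library(Metrics)         # mse() (assumed source; ModelMetrics provides an mse() as well)
# caret, rattle, tibble, reshape2 and viridis are called with :: and only need to be installed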

data("acquisitionRetention")
acquisitionRetention$acquisition = as.factor(acquisitionRetention$acquisition) # code the response as a factor for classification
#anyNA(acquisitionRetention)
# note: no seed is set here, so the split (and everything downstream) varies between runs
index = sample(nrow(acquisitionRetention), 0.8 * nrow(acquisitionRetention))
train = acquisitionRetention[index, ]
test = acquisitionRetention[-index, ]
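
Class balance can drift between unstratified random splits; if that matters, a stratified 80/20 split via caret is one alternative (a sketch; the seed value is arbitrary):

# optional: stratified split that preserves the acquisition class ratio
set.seed(1) # arbitrary seed, for reproducibility only
idx <- caret::createDataPartition(acquisitionRetention$acquisition, p = 0.8, list = FALSE)
train <- acquisitionRetention[idx, ]
test <- acquisitionRetention[-idx, ]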

Logistic Regression

model <- glm(acquisition ~ 
                            industry +
                            acq_exp + 
                            revenue +
                            employees, data = train, family = binomial)

#residual diagnostics (the Cook's distance plot appears later, under #diagnostics)

dev <- residuals(model, type = "deviance") # raw deviance residuals
pea <- residuals(model, type = "pearson")  # raw Pearson residuals
devstd <- dev/sqrt(1 - hatvalues(model)) # standardized deviance residuals
peastd <- pea/sqrt(1 - hatvalues(model)) # standardized Pearson residuals

dev.new(width = 10, height = 5) # fresh device wide enough for two side-by-side panels (screen devices size in inches)
par(mfrow=c(1,2))

plot(devstd[model$model$acquisition==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[model$model$acquisition==0], col = "blue")

plot(peastd[model$model$acquisition==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[model$model$acquisition==0], col = "blue")


#taking out outliers flagged by Cook's distance (> 3 SDs)

cooks_sd_cutoff <- 3 * sd(cooks.distance(model))
out <- which(cooks.distance(model) > cooks_sd_cutoff)
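
One caveat: if out comes back empty, train[-out, ] below returns zero rows rather than all of them, because subsetting with a zero-length index keeps nothing. A defensive variant (a sketch):

# guard against which() returning integer(0), which would empty the data frame
train_clean <- if (length(out) > 0) train[-out, ] else train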

model <- glm(acquisition ~ 
                            industry +
                            acq_exp + 
                            revenue +
                            employees, data = train[-out,], family = binomial)

summary(model)
## 
## Call:
## glm(formula = acquisition ~ industry + acq_exp + revenue + employees, 
##     family = binomial, data = train[-out, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2189  -0.3795   0.1791   0.5384   2.3296  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.4494662  1.2223138  -7.731 1.07e-14 ***
## industry     1.7265686  0.3341271   5.167 2.37e-07 ***
## acq_exp     -0.0004982  0.0009393  -0.530    0.596    
## revenue      0.0957225  0.0178870   5.352 8.72e-08 ***
## employees    0.0097873  0.0011185   8.750  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 474.77  on 389  degrees of freedom
## Residual deviance: 264.02  on 385  degrees of freedom
## AIC: 274.02
## 
## Number of Fisher Scoring iterations: 6
#Maximized cutoff: build the ROCR prediction object from test-set probabilities
pred <- prediction(predict(model, test, type = "response"),
                   test$acquisition) 

#Predicted Probability and True Classification

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

#ROC plot
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

#Confusion Matrix at the optimized cutoff
pred_prob = predict(model, test, type = "response") # renamed so the base predict() generic is not shadowed
#optcutoff = optimalCutoff(test$acquisition, pred_prob)

predict_choice = ifelse(pred_prob > optimal, 1, 0)


caret::confusionMatrix(as.factor(predict_choice),as.factor(test$acquisition), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 29 17
##          1 10 44
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.61           
##     P-Value [Acc > NIR] : 0.008104       
##                                          
##                   Kappa : 0.4503         
##                                          
##  Mcnemar's Test P-Value : 0.248213       
##                                          
##             Sensitivity : 0.7213         
##             Specificity : 0.7436         
##          Pos Pred Value : 0.8148         
##          Neg Pred Value : 0.6304         
##              Prevalence : 0.6100         
##          Detection Rate : 0.4400         
##    Detection Prevalence : 0.5400         
##       Balanced Accuracy : 0.7325         
##                                          
##        'Positive' Class : 1              
## 
#diagnostics: Cook's distance for the refitted model
cooks_distance = cooks.distance(model)
plot(cooks_distance, col = "pink", pch = 19, cex = 1)
text(cooks_distance, labels = names(cooks_distance)) # rownames(train) no longer lines up once out is dropped


exp(coef(model))
##  (Intercept)     industry      acq_exp      revenue    employees 
## 7.873158e-05 5.621332e+00 9.995019e-01 1.100454e+00 1.009835e+00
LR = Anova(model, test = "LR")
Wald = Anova(model, test = "Wald")
score = anova(model, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[-1], Wald = Wald$Chisq) # drop the NULL-model row so each term keeps its own Rao statistic
##            LR    Score       Wald
## 1  30.7481715 25.72755 26.7020287
## 2   0.2819222 25.72755  0.2812974
## 3  34.1474828 25.72755 28.6388012
## 4 158.5480790 25.72755 76.5652389
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[-1], p.Wald = Wald$`Pr(>Chisq)`)
##           p.LR      p.Score       p.Wald
## 1 2.937805e-08 3.931739e-07 2.373722e-07
## 2 5.954441e-01 3.931739e-07 5.958521e-01
## 3 5.108929e-09 3.931739e-07 8.721692e-08
## 4 2.349007e-36 3.931739e-07 2.130570e-18

Using a cutoff value of 0.719, logistic regression reaches 73% accuracy on the test set.

Decision Tree

Make sure the response is a factor; with a numeric 0/1 response, rpart would grow a regression tree instead. A quick guard is sketched below.
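
A minimal guard (a sketch):

stopifnot(is.factor(train$acquisition)) # abort early if the coercion at load time was skipped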

model <- rpart(as.factor(acquisition) ~ 
                            industry +
                            acq_exp + 
                            revenue +
                            employees,
                            data = train) # simple DT model

rattle::fancyRpartPlot(model, sub = "") # visualize the decision tree

# save predictions from the decision tree
preds = predict(model, test, type = "class")


caret::confusionMatrix(as.factor(preds),as.factor(test$acquisition), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 21 14
##          1 18 47
##                                           
##                Accuracy : 0.68            
##                  95% CI : (0.5792, 0.7698)
##     No Information Rate : 0.61            
##     P-Value [Acc > NIR] : 0.09018         
##                                           
##                   Kappa : 0.3148          
##                                           
##  Mcnemar's Test P-Value : 0.59588         
##                                           
##             Sensitivity : 0.7705          
##             Specificity : 0.5385          
##          Pos Pred Value : 0.7231          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.6100          
##          Detection Rate : 0.4700          
##    Detection Prevalence : 0.6500          
##       Balanced Accuracy : 0.6545          
##                                           
##        'Positive' Class : 1               
## 

We get an accuracy of 68% with the decision tree, below the 73% from logistic regression.

Random Forest

set.seed(100)
model <- rfsrc(acquisition ~ 
                            industry +
                            acq_exp + 
                            revenue +
                            employees,
                            data = train, 
                            importance = TRUE, 
                            ntree = 1000)

model
##                          Sample size: 400
##            Frequency of class labels: 123, 277
##                      Number of trees: 1000
##            Forest terminal node size: 1
##        Average no. of terminal nodes: 57.611
## No. of variables tried at each split: 2
##               Total no. of variables: 4
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 253
##                             Analysis: RF-C
##                               Family: class
##                       Splitting rule: gini *random*
##        Number of random split points: 10
##               Normalized brier score: 50.31 
##                                  AUC: 88.62 
##                           Error rate: 0.18, 0.38, 0.09
## 
## Confusion matrix:
## 
##           predicted
##   observed  0   1 class.error
##          0 76  47      0.3821
##          1 26 251      0.0939
## 
##  Overall error rate: 18.25%

Variable Importance

The bar-chart code below was written for a regression forest and no longer works here: for a classification forest, model$importance is a matrix (an overall column plus one column per class) rather than a named vector. A working variant follows the commented-out original.

# model$importance # values vary a lot

# data.frame(importance = model$importance) %>%
#   tibble::rownames_to_column(var = "variable") %>%
#   ggplot(aes(x = reorder(variable, importance), y = importance)) +
#     geom_bar(stat = "identity", fill = "orange", color = "black") +
#     coord_flip() +
#     labs(x = "Variables", y = "Variable importance")
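
A variant that should work for the classification forest (a sketch: it assumes the rfsrc importance matrix's first column, "all", holds the overall VIMP, with per-class columns after it):

# plot overall variable importance (VIMP) for the classification forest
imp <- model$importance[, "all"] # named vector: one overall VIMP value per variable
data.frame(variable = names(imp), importance = imp) %>%
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
    geom_bar(stat = "identity", fill = "orange", color = "black") +
    coord_flip() +
    labs(x = "Variables", y = "Variable importance")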

Minimum Depth

mindepth <- max.subtree(model,
                        sub.order = TRUE)

# first order depths
print(round(mindepth$order, 3)[,1])
##  industry   acq_exp   revenue employees 
##     1.586     1.440     1.335     0.663
# visualize minimal depth (MD)
data.frame(md = round(mindepth$order, 3)[,1]) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
    geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.2)+
    coord_flip() +
     labs(x = "Variables", y = "Minimal Depth")

# interactions
mindepth$sub.order
##            industry   acq_exp   revenue  employees
## industry  0.1400193 0.1714415 0.1865178 0.17208282
## acq_exp   0.4324688 0.1298402 0.1719492 0.15711450
## revenue   0.4505814 0.1595010 0.1188671 0.15172322
## employees 0.3424350 0.1460361 0.1525367 0.05853769
as.matrix(mindepth$sub.order) %>%
  reshape2::melt() %>%
  data.frame() %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
    scale_x_discrete(position = "top") +
    geom_tile(color = "white") +
    viridis::scale_fill_viridis("Relative min. depth") +
    labs(x = "", y = "") +
    theme_bw()

# cross-check with vimp
find.interaction(model,
                      method = "vimp",
                      importance = "permute")
## Pairing employees with acq_exp 
## Pairing employees with industry 
## Pairing employees with revenue 
## Pairing acq_exp with industry 
## Pairing acq_exp with revenue 
## Pairing industry with revenue 
## 
##                               Method: vimp
##                     No. of variables: 4
##            Variables sorted by VIMP?: TRUE
##    No. of variables used for pairing: 4
##     Total no. of paired interactions: 6
##             Monte Carlo replications: 1
##     Type of noising up used for VIMP: permute
## 
##                     Var 1  Var 2 Paired Additive Difference
## employees:acq_exp  0.0850 0.0303 0.1147   0.1153    -0.0006
## employees:industry 0.0850 0.0274 0.1047   0.1124    -0.0077
## employees:revenue  0.0850 0.0230 0.1014   0.1081    -0.0067
## acq_exp:industry   0.0298 0.0286 0.0441   0.0584    -0.0143
## acq_exp:revenue    0.0298 0.0236 0.0569   0.0534     0.0035
## industry:revenue   0.0287 0.0218 0.0511   0.0504     0.0007

All of the paired-vs-additive VIMP differences are negligible, so we exclude interaction terms.

Tuning the forest's hyper-parameters for predictive accuracy

# Establish a list of possible values for hyper-parameters
mtry.values <- seq(1,4,1)
nodesize.values <- seq(2,16,2)
ntree.values <- seq(1e3,10e3,1e3)

# Create a data frame containing all combinations 
hyper_grid <- expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values)

# Create an empty vector to store OOB error values
oob_err <- c()

# Write a loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {

    # Train a Random Forest model
   model <- rfsrc(acquisition ~ 
                            industry +
                            acq_exp + 
                            revenue +
                            employees,
                            data = train, 
                            mtry = hyper_grid$mtry[i],
                            nodesize = hyper_grid$nodesize[i],
                            ntree = hyper_grid$ntree[i])  
  
                          
    # Store the overall OOB error for the model; err.rate is a matrix for a
    # classification forest (overall error in column 1, then one column per class)
    oob_err[i] <- model$err.rate[nrow(model$err.rate), 1]
}

# Identify optimal set of hyperparameters based on OOB error
opt_i <- which.min(oob_err)
print(hyper_grid[opt_i,])
##   mtry nodesize ntree
## 5    1        4  1000

The forest rebuilt below instead uses mtry = 2, nodesize = 8, and ntree = 10000, apparently the optimum from an earlier tuning run.

Rebuild training forest with optimal hyper-parameters
set.seed(100)
model <- rfsrc(as.factor(acquisition) ~ 
                            industry +
                            acq_exp + 
                            revenue +
                            employees,
                            data = train, 
                            mtry = 2,
                            nodesize = 8,
                            ntree = 10000)  
model
##                          Sample size: 400
##            Frequency of class labels: 123, 277
##                      Number of trees: 10000
##            Forest terminal node size: 8
##        Average no. of terminal nodes: 23.0283
## No. of variables tried at each split: 2
##               Total no. of variables: 4
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 253
##                             Analysis: RF-C
##                               Family: class
##                       Splitting rule: gini *random*
##        Number of random split points: 10
##               Normalized brier score: 49.9 
##                                  AUC: 89.16 
##                           Error rate: 0.19, 0.37, 0.11
## 
## Confusion matrix:
## 
##           predicted
##   observed  0   1 class.error
##          0 78  45      0.3659
##          1 30 247      0.1083
## 
##  Overall error rate: 18.75%

Predicting with the Random Forest model on the test set

preds = predict(model, test) # predict.rfsrc returns an object; the class labels are in preds$class

caret::confusionMatrix(preds$class,as.factor(test$acquisition), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 20 13
##          1 19 48
##                                           
##                Accuracy : 0.68            
##                  95% CI : (0.5792, 0.7698)
##     No Information Rate : 0.61            
##     P-Value [Acc > NIR] : 0.09018         
##                                           
##                   Kappa : 0.3083          
##                                           
##  Mcnemar's Test P-Value : 0.37676         
##                                           
##             Sensitivity : 0.7869          
##             Specificity : 0.5128          
##          Pos Pred Value : 0.7164          
##          Neg Pred Value : 0.6061          
##              Prevalence : 0.6100          
##          Detection Rate : 0.4800          
##    Detection Prevalence : 0.6700          
##       Balanced Accuracy : 0.6499          
##                                           
##        'Positive' Class : 1               
## 

Predict acquisition on the entire data set using the random forest, then train/test split the subset where acquisition and the prediction both equal 1

data = acquisitionRetention
# predict on all 500 rows; reusing the 100 test-set predictions here would
# silently recycle them across the full data frame
preds_all = predict(model, data)
data2 = data.frame(data, prediction = preds_all$class)
data2 = data2[data2$prediction == 1 & data2$acquisition == 1, ]

index2 = sample(nrow(data2),0.8*nrow(data2))
train2 = data2[index2,]
test2 = data2[-index2,]

Linear Regression on Duration (no squared terms)

model <- lm(duration ~ profit + acq_exp + ret_exp + freq + crossbuy +
              sow + industry + revenue + employees, data = train2)

preds = predict(model, test2)

sq_err = (preds - test2$duration)^2 # per-observation squared errors (named sq_err so it does not shadow mse())

summary(model)
## 
## Call:
## lm(formula = duration ~ profit + acq_exp + ret_exp + freq + crossbuy + 
##     sow + industry + revenue + employees, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -199.880  -19.249    2.385   26.870  103.183 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 418.851748  33.264256  12.592  < 2e-16 ***
## profit        0.166750   0.029596   5.634 6.96e-08 ***
## acq_exp      -0.314110   0.065558  -4.791 3.53e-06 ***
## ret_exp       0.730575   0.105709   6.911 8.73e-11 ***
## freq         -6.168016   0.849070  -7.264 1.21e-11 ***
## crossbuy     -0.991080   1.684312  -0.588 0.557015    
## sow          -0.009601   0.167230  -0.057 0.954281    
## industry    -27.144076   7.139536  -3.802 0.000198 ***
## revenue      -0.400482   0.369725  -1.083 0.280223    
## employees    -0.085291   0.017430  -4.893 2.25e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.48 on 174 degrees of freedom
## Multiple R-squared:  0.9616, Adjusted R-squared:  0.9596 
## F-statistic: 484.5 on 9 and 174 DF,  p-value: < 2.2e-16
print(mean(sq_err))
## [1] 2008.668
print(mse(test2$duration, preds)) # Metrics::mse(actual, predicted) agrees
## [1] 2008.668

Random Forest on Duration

set.seed(100)
model <- rfsrc(duration ~ profit + acq_exp + ret_exp + freq + crossbuy +
                 sow + industry + revenue + employees,
               data = train2,
               mtry = 2,
               nodesize = 8,
               ntree = 10000)

preds = predict(model,test2)

sq_err = (preds$predicted - test2$duration)^2 # per-observation squared errors
print(mean(sq_err))
## [1] 5673.281
print(mse(test2$duration, preds$predicted)) # Metrics::mse(actual, predicted) agrees
## [1] 5673.281

Something looks off here: the random forest's test MSE (about 5673) is nearly triple the linear regression's (about 2009). Given how well a linear fit describes duration (R-squared of roughly 0.96 above), a forest can legitimately trail a linear model, but a gap this large deserves a closer look.
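
One way to start investigating is to plot predicted against observed duration for both models (a sketch; lm_model is a refit of the linear model above so that both sets of predictions coexist):

# refit the linear model, since the name model now holds the duration forest
lm_model <- lm(duration ~ profit + acq_exp + ret_exp + freq + crossbuy +
                 sow + industry + revenue + employees, data = train2)
plot(test2$duration, predict(lm_model, test2), col = "blue", pch = 19,
     xlab = "observed duration", ylab = "predicted duration",
     main = "Linear model vs. random forest on the test set")
points(test2$duration, preds$predicted, col = "red", pch = 19) # forest predictions from above
abline(0, 1, lty = 2) # perfect-prediction line
legend("topleft", legend = c("linear model", "random forest"), col = c("blue", "red"), pch = 19)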