R Markdown

library(caret)
library(tree)
library(SMCRM)
library(PerformanceAnalytics)
library(xgboost)
library(randomForestSRC)
library(rpart)
library(car)
library(dplyr)

# theme for nice plotting
theme_nice <- theme_classic()+
                theme(
                  axis.line.y.left = element_line(colour = "black"),
                  axis.line.y.right = element_line(colour = "black"),
                  axis.line.x.bottom = element_line(colour = "black"),
                  axis.line.x.top = element_line(colour = "black"),
                  axis.text.y = element_text(colour = "black", size = 12),
                  axis.text.x = element_text(color = "black", size = 12),
                  axis.ticks = element_line(color = "black")) +
                theme(
                  axis.ticks.length = unit(-0.25, "cm"), 
                  axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")), 
                  axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
Load the dataset and get summary statistics
#  Load the data
data(acquisitionRetention)
#Loading dataset
 data1  = data.frame(acquisitionRetention)
# Get the summary
summary(data1)
##     customer      acquisition       duration          profit       
##  Min.   :  1.0   Min.   :0.000   Min.   :   0.0   Min.   :-1027.0  
##  1st Qu.:125.8   1st Qu.:0.000   1st Qu.:   0.0   1st Qu.: -316.3  
##  Median :250.5   Median :1.000   Median : 957.5   Median : 3369.9  
##  Mean   :250.5   Mean   :0.676   Mean   : 742.5   Mean   : 2403.8  
##  3rd Qu.:375.2   3rd Qu.:1.000   3rd Qu.:1146.2   3rd Qu.: 3931.6  
##  Max.   :500.0   Max.   :1.000   Max.   :1673.0   Max.   : 6134.3  
##     acq_exp           ret_exp         acq_exp_sq          ret_exp_sq     
##  Min.   :   1.21   Min.   :   0.0   Min.   :      1.5   Min.   :      0  
##  1st Qu.: 384.14   1st Qu.:   0.0   1st Qu.: 147562.0   1st Qu.:      0  
##  Median : 491.66   Median : 398.1   Median : 241729.7   Median : 158480  
##  Mean   : 493.35   Mean   : 336.3   Mean   : 271211.1   Mean   : 184000  
##  3rd Qu.: 600.21   3rd Qu.: 514.3   3rd Qu.: 360246.0   3rd Qu.: 264466  
##  Max.   :1027.04   Max.   :1095.0   Max.   :1054811.2   Max.   :1198937  
##       freq          freq_sq          crossbuy           sow        
##  Min.   : 0.00   Min.   :  0.00   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.: 0.00   1st Qu.:  0.00   1st Qu.: 0.000   1st Qu.:  0.00  
##  Median : 6.00   Median : 36.00   Median : 5.000   Median : 44.00  
##  Mean   : 6.22   Mean   : 69.25   Mean   : 4.052   Mean   : 38.88  
##  3rd Qu.:11.00   3rd Qu.:121.00   3rd Qu.: 7.000   3rd Qu.: 66.00  
##  Max.   :21.00   Max.   :441.00   Max.   :11.000   Max.   :116.00  
##     industry        revenue        employees     
##  Min.   :0.000   Min.   :14.49   Min.   :  18.0  
##  1st Qu.:0.000   1st Qu.:33.53   1st Qu.: 503.0  
##  Median :1.000   Median :41.43   Median : 657.5  
##  Mean   :0.522   Mean   :40.54   Mean   : 671.5  
##  3rd Qu.:1.000   3rd Qu.:47.52   3rd Qu.: 826.0  
##  Max.   :1.000   Max.   :65.10   Max.   :1461.0
# Create a copy
data2 <- data1
str(data1)
## 'data.frame':    500 obs. of  15 variables:
##  $ customer   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ acquisition: num  1 1 1 0 1 1 1 1 0 0 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : num  1 0 0 0 0 1 0 1 0 1 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
Correlation Analysis
# Run Correlation chart including the yvar
chart.Correlation(data1, histogram=TRUE, pch=19)

Most of the variables are heavily correlated with response variable.
duration, profit, ret_exp, ret_exp_sq, freq, freq_sq, crossbuy and sow are heavily correlated.
Lets run some box plots to see if this makes sense.

Variable exclusion based on correlation
# Create box plots to understand variable correlation with the response variable Acquistion
par(mfrow = c(2, 3))
boxplot(duration~ acquisition, data = data1, xlab = "acquisition", ylab = "Duration")
boxplot(ret_exp~ acquisition, data = data1, xlab = "acquisition", ylab = "ret_exp")
boxplot(freq~ acquisition, data = data1, xlab = "acquisition", ylab = "freq")
boxplot(crossbuy~ acquisition, data = data1, xlab = "acquisition", ylab = "crossbuy")
boxplot(sow~ acquisition, data = data1, xlab = "acquisition", ylab = "sow")
boxplot(profit~ acquisition, data = data1, xlab = "acquisition", ylab = "profit")

It is clear that whenever the fields Duration, ret_exp, freq, crossbuy, sow are “0”, acquisition is always biased as 0.

Similarly, whenever profit is negative, acquisition is always 0 indicating that marketing costs are included for acqusition.

We will exclude these variables in model.

we will also exclude the “customer” variable in the model as it is just a number and doesnt have any influence on the model.

Transform the variables
# Convert the acquisition as a factor
data1$acquisition<- as.factor(data1$acquisition)
# See if any missing values and remove them
data1 <- na.omit(data1)
Task1 - Use acquisitionRetention data set to predict which customers will be acquired and for how long (duration) based on a feature set using a random forest.
  1. Use acquisitionRetention data set to predict which customers will be acquired
# Run a random forest model to predict acquisition using only the kept fields based on the above correlation analysis

# set the seed
set.seed(123)

# Set a tune grid to run at different mtry's
tunegrid <- expand.grid(.mtry=c(1:15))

# Set ntree to 1000 and importance to True to see the variable importance

# Run a randomforest with only the fields industry, revenue, employees and acq_exp
rf.forest1 <- rfsrc(acquisition~acq_exp+industry+revenue+employees, 
                 data = data1,
                 tuneGrid=tunegrid,
                 importance = TRUE,
                 ntree = 1000)

rf.forest1
##                          Sample size: 500
##            Frequency of class labels: 162, 338
##                      Number of trees: 1000
##            Forest terminal node size: 1
##        Average no. of terminal nodes: 76.809
## No. of variables tried at each split: 2
##               Total no. of variables: 4
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 316
##                             Analysis: RF-C
##                               Family: class
##                       Splitting rule: gini *random*
##        Number of random split points: 10
##               Normalized brier score: 57.26 
##                                  AUC: 85.97 
##                           Error rate: 0.21, 0.41, 0.12
## 
## Confusion matrix:
## 
##           predicted
##   observed  0   1 class.error
##          0 96  66      0.4074
##          1 40 298      0.1183
## 
##  Overall error rate: 21.2%

This gave an accuracy rate of 79.8%.

  1. for how long (duration) based on a feature set using a random forest.
# create a dataframe from the prediction values
predictedAcqusition <- rf.forest1$class.oob
data2<-cbind(data1,predictedAcqusition)
# Keep only the predicted acquisition customers only in a new dataframe
data3 <- data2 %>% filter(predictedAcqusition == 1)
summary(data3)
##     customer     acquisition    duration          profit         acq_exp       
##  Min.   :  1.0   0: 66       Min.   :   0.0   Min.   :-1027   Min.   :  43.15  
##  1st Qu.:130.8   1:298       1st Qu.: 813.5   1st Qu.: 3028   1st Qu.: 407.69  
##  Median :258.0               Median :1020.5   Median : 3555   Median : 496.07  
##  Mean   :256.7               Mean   : 894.2   Mean   : 3028   Mean   : 492.25  
##  3rd Qu.:386.2               3rd Qu.:1175.2   3rd Qu.: 4049   3rd Qu.: 579.63  
##  Max.   :500.0               Max.   :1673.0   Max.   : 6134   Max.   :1027.04  
##     ret_exp         acq_exp_sq        ret_exp_sq           freq       
##  Min.   :   0.0   Min.   :   1862   Min.   :      0   Min.   : 0.000  
##  1st Qu.: 322.1   1st Qu.: 166208   1st Qu.: 103779   1st Qu.: 3.000  
##  Median : 439.1   Median : 246087   Median : 192835   Median : 8.000  
##  Mean   : 404.9   Mean   : 262301   Mean   : 219752   Mean   : 7.527  
##  3rd Qu.: 538.4   3rd Qu.: 335971   3rd Qu.: 289859   3rd Qu.:11.000  
##  Max.   :1082.4   Max.   :1054811   Max.   :1171525   Max.   :21.000  
##     freq_sq          crossbuy          sow            industry     
##  Min.   :  0.00   Min.   : 0.00   Min.   :  0.00   Min.   :0.0000  
##  1st Qu.:  9.00   1st Qu.: 3.00   1st Qu.: 29.75   1st Qu.:0.0000  
##  Median : 64.00   Median : 5.00   Median : 52.00   Median :1.0000  
##  Mean   : 84.27   Mean   : 4.92   Mean   : 46.78   Mean   :0.5907  
##  3rd Qu.:121.00   3rd Qu.: 7.00   3rd Qu.: 68.00   3rd Qu.:1.0000  
##  Max.   :441.00   Max.   :11.00   Max.   :116.00   Max.   :1.0000  
##     revenue        employees      predictedAcqusition
##  Min.   :15.86   Min.   :  56.0   0:  0              
##  1st Qu.:35.98   1st Qu.: 606.8   1:364              
##  Median :43.06   Median : 744.5                      
##  Mean   :42.21   Mean   : 753.7                      
##  3rd Qu.:48.16   3rd Qu.: 895.2                      
##  Max.   :65.10   Max.   :1461.0
# Run Correlation chart including all variables
chart.Correlation(data3[,c(3,5:6,9,11:15)], histogram=TRUE, pch=19)

# Lets run a GLM model to see the variable collinearity with all variables 
set.seed(123)
glm.fit1 <- glm(duration~profit+acq_exp+freq+ret_exp+crossbuy+sow+industry+revenue+employees, data=data3)
preds.glm1 = predict(glm.fit1, newdata = data3)
glm1.mse = mean((preds.glm1 - data3$duration)^2)
glm1.mse
## [1] 2647.914
car::vif(glm.fit1)
##    profit   acq_exp      freq   ret_exp  crossbuy       sow  industry   revenue 
## 18.315825  1.131901  1.737211 10.056359  2.808369  2.559769  1.079648  1.091787 
## employees 
##  1.265496

Here profit and ret_exp are highly collinear as indicated in the correlation chart. Lets remove ret_exp as we are dealing with acquisition and see how the model collinerity is.

# Lets run a GLM model to see the variable collinearity with all variables 
set.seed(123)
glm.fit2 <- glm(duration~profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees, data=data3)
preds.glm2 = predict(glm.fit2, newdata = data3)
glm2.mse = mean((preds.glm2 - data3$duration)^2)
glm2.mse
## [1] 6219.244
car::vif(glm.fit2)
##    profit   acq_exp      freq  crossbuy       sow  industry   revenue employees 
##  4.000792  1.039238  1.736314  2.617264  2.331227  1.058091  1.087949  1.245028

Even though the MSE is high, there is no collinearity involved.

# Lets create a random forest model to predict duration only for the predicted acquisition customers

# Run a random forest REGRESSION model to predict duration with all fields except the transformed squared variables and ret_exp

# set the seed
set.seed(123)

# Set a tune grid to run at different mtry's
tunegrid <- expand.grid(.mtry=c(1:15))

# Set ntree to 1000 and importance to True to see the variable importance

# Run a randomforest with all the fields
rf.forest2 <- rfsrc(duration~profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees, 
                 data = data3,
                 tuneGrid=tunegrid,
                 importance = TRUE,
                 ntree = 1000)

rf.forest2
##                          Sample size: 364
##                      Number of trees: 1000
##            Forest terminal node size: 5
##        Average no. of terminal nodes: 39.23
## No. of variables tried at each split: 3
##               Total no. of variables: 8
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 230
##                             Analysis: RF-R
##                               Family: regr
##                       Splitting rule: mse *random*
##        Number of random split points: 10
##                 % variance explained: 97.1
##                           Error rate: 6232.35
# Get the predicted duration and write into a dataframe
predictedDuration <- predict(rf.forest2,data3)$predicted
data4 <- cbind(data3,predictedDuration)

# Calculate the actual duration and the predicted duration (how long) for the customers predicted to be acquired
TotalActualDuration <- sum(data4$duration)
TotalPredictedDuration <- sum(data4$predictedDuration)

cat("TotalActualDuration = ", TotalActualDuration)
## TotalActualDuration =  325490
cat("\nTotalPredictedDuration = ", TotalPredictedDuration)
## 
## TotalPredictedDuration =  325819.2
Task2 - Compute variable importance to detect interactions and optimize hyperparameters for acquired customers.
Compute Variable Importance and Minimal Depth to detect interactions
rf.forest2$importance
##      profit     acq_exp        freq    crossbuy         sow    industry 
## 62157.56443  3547.47906 37110.98554 56139.20775 22182.75501    95.40617 
##     revenue   employees 
##    24.98874   127.09285
  1. Variable importance
# Creating a plot of variable importance
rf.forest2$importance # values vary a lot
##      profit     acq_exp        freq    crossbuy         sow    industry 
## 62157.56443  3547.47906 37110.98554 56139.20775 22182.75501    95.40617 
##     revenue   employees 
##    24.98874   127.09285
# Use this if the question is related to variable importance on a classification model
# importance = rf.forest1$importance[,c(1)] # values vary a lot

data.frame(importance = rf.forest2$importance) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,importance), y = importance)) +
    geom_bar(stat = "identity", fill = "orange", color = "black")+
    coord_flip() +
     labs(x = "Variables", y = "Variable importance")+
     theme_nice          # where's industry? difficult to see

rf.forest2$importance %>% log() # log transform
##    profit   acq_exp      freq  crossbuy       sow  industry   revenue employees 
## 11.037428  8.173993 10.521668 10.935590 10.007070  4.558143  3.218425  4.844918
data.frame(importance = rf.forest2$importance) %>% # add a large +ve constant
  log() %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,importance), y = importance)) +
    geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.5)+
    coord_flip() +
    labs(x = "Variables", y = "Log-transformed variable importance") +
    theme_nice       

As all the values are less than 1 and positive, we dont need any transformation such as log or adding a large constant etc.

  1. Minimal Depth
mindepth <- max.subtree(rf.forest2,
                        sub.order = TRUE)

# first order depths
print(round(mindepth$order, 3)[,1])
##    profit   acq_exp      freq  crossbuy       sow  industry   revenue employees 
##     1.635     3.297     2.279     2.274     3.097     7.579     4.109     3.778
# vizualise MD
data.frame(md = round(mindepth$order, 3)[,1]) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
    geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.2)+
    coord_flip() +
     labs(x = "Variables", y = "Minimal Depth")+
     theme_nice

# interactions
mindepth$sub.order
##              profit   acq_exp      freq  crossbuy       sow  industry   revenue
## profit    0.1399196 0.2264888 0.3205463 0.4089828 0.3510775 0.6802229 0.3576138
## acq_exp   0.2516335 0.2862922 0.4637345 0.5535774 0.4576455 0.7624997 0.4606001
## freq      0.3002827 0.3989477 0.2003254 0.5225195 0.4786879 0.7711915 0.4733314
## crossbuy  0.3387873 0.4275252 0.4883598 0.1999977 0.5227186 0.7921141 0.5137861
## sow       0.3905999 0.4891231 0.5645602 0.6259963 0.2717470 0.8379984 0.5698369
## industry  0.7872336 0.8231147 0.8627345 0.8951583 0.8713991 0.6518157 0.8634710
## revenue   0.4425062 0.5181028 0.6428629 0.6992784 0.6370630 0.8839927 0.3598838
## employees 0.3929604 0.4813238 0.6074370 0.6585310 0.5811056 0.8555074 0.5887954
##           employees
## profit    0.3300100
## acq_exp   0.4277176
## freq      0.4782207
## crossbuy  0.5056605
## sow       0.5782995
## industry  0.8591563
## revenue   0.6262199
## employees 0.3291201
as.matrix(mindepth$sub.order) %>%
  reshape2::melt() %>%
  data.frame() %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
    scale_x_discrete(position = "top") +
    geom_tile(color = "white") +
    viridis::scale_fill_viridis("Relative min. depth") +
    labs(x = "", y = "") +
    theme_bw()

# cross-check with vimp
find.interaction(rf.forest2,
                      method = "vimp",
                      importance = "permute")
## Pairing profit with crossbuy 
## Pairing profit with freq 
## Pairing profit with sow 
## Pairing profit with acq_exp 
## Pairing profit with employees 
## Pairing profit with industry 
## Pairing profit with revenue 
## Pairing crossbuy with freq 
## Pairing crossbuy with sow 
## Pairing crossbuy with acq_exp 
## Pairing crossbuy with employees 
## Pairing crossbuy with industry 
## Pairing crossbuy with revenue 
## Pairing freq with sow 
## Pairing freq with acq_exp 
## Pairing freq with employees 
## Pairing freq with industry 
## Pairing freq with revenue 
## Pairing sow with acq_exp 
## Pairing sow with employees 
## Pairing sow with industry 
## Pairing sow with revenue 
## Pairing acq_exp with employees 
## Pairing acq_exp with industry 
## Pairing acq_exp with revenue 
## Pairing employees with industry 
## Pairing employees with revenue 
## Pairing industry with revenue 
## 
##                               Method: vimp
##                     No. of variables: 8
##            Variables sorted by VIMP?: TRUE
##    No. of variables used for pairing: 8
##     Total no. of paired interactions: 28
##             Monte Carlo replications: 1
##     Type of noising up used for VIMP: permute
## 
##                         Var 1      Var 2      Paired    Additive Difference
## profit:crossbuy    62815.0616 54534.4378 145923.3560 117349.4995 28573.8565
## profit:freq        62815.0616 37649.3683 117829.5065 100464.4299 17365.0765
## profit:sow         62815.0616 22348.0172  96452.8229  85163.0788 11289.7441
## profit:acq_exp     62815.0616  3499.5851  61310.1644  66314.6467 -5004.4823
## profit:employees   62815.0616   191.0569  62470.2628  63006.1186  -535.8558
## profit:industry    62815.0616    32.6256  62306.4309  62847.6872  -541.2564
## profit:revenue     62815.0616    -3.4741  62776.0117  62811.5875   -35.5758
## crossbuy:freq      56125.4744 36503.0245 111990.4631  92628.4988 19361.9643
## crossbuy:sow       56125.4744 22746.6868  89041.9794  78872.1612 10169.8182
## crossbuy:acq_exp   56125.4744  3508.1688  58641.0382  59633.6432  -992.6050
## crossbuy:employees 56125.4744   131.9008  55122.2762  56257.3752 -1135.0990
## crossbuy:industry  56125.4744    20.4461  56258.3166  56145.9205   112.3961
## crossbuy:revenue   56125.4744     9.3166  55820.9310  56134.7910  -313.8600
## freq:sow           37592.2795 21712.9363  63614.1120  59305.2158  4308.8962
## freq:acq_exp       37592.2795  3450.4637  40311.5799  41042.7431  -731.1632
## freq:employees     37592.2795   141.5016  36916.5016  37733.7811  -817.2795
## freq:industry      37592.2795     0.1015  36411.0921  37592.3810 -1181.2889
## freq:revenue       37592.2795   -55.3336  36559.9083  37536.9459  -977.0376
## sow:acq_exp        21492.6912  3480.3746  25194.6917  24973.0658   221.6259
## sow:employees      21492.6912   125.0988  22086.2213  21617.7900   468.4314
## sow:industry       21492.6912    37.0040  21159.8598  21529.6952  -369.8355
## sow:revenue        21492.6912    25.1279  21418.5762  21517.8191   -99.2429
## acq_exp:employees   3525.2137    92.2344   3544.1762   3617.4482   -73.2719
## acq_exp:industry    3525.2137    96.5666   3476.1859   3621.7803  -145.5944
## acq_exp:revenue     3525.2137   -29.7336   3505.6993   3495.4801    10.2192
## employees:industry   -19.9197    56.0231     59.7672     36.1034    23.6638
## employees:revenue    -19.9197    50.5517      2.2189     30.6320   -28.4131
## industry:revenue      51.4946    40.3536    111.8956     91.8481    20.0474
  1. Tuning a forest hyper-parameters for predictive accuracy without any interaction terms
set.seed(123)
# Establish a list of possible values for hyper-parameters
mtry.values <- seq(4,6,1)
nodesize.values <- seq(4,8,2)
ntree.values <- seq(4e3,6e3,1e3)

# Create a data frame containing all combinations 
hyper_grid <- expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values)

# Create an empty vector to store OOB error values
oob_err <- c()

# Write a loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {

    # Train a Random Forest model
   model <- rfsrc(duration ~ profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees,
                            data = data3,
                            mtry = hyper_grid$mtry[i],
                            nodesize = hyper_grid$nodesize[i],
                            ntree = hyper_grid$ntree[i])  
  
                          
    # Store OOB error for the model                      
    oob_err[i] <- model$err.rate[length(model$err.rate)]
}
# Identify optimal set of hyperparmeters based on OOB error

opt_i <- which.min(oob_err)
print(hyper_grid[opt_i,])
##    mtry nodesize ntree
## 12    6        4  5000
Compare the accuracy of model with a decision trees and logistic regression model for acquiring customers
# Run a decision tree model

dt.model1 <- rpart(acquisition~acq_exp+industry+revenue+employees,
                             data = data1) # simple DT model

rattle::fancyRpartPlot(dt.model1, sub = "") # vizualize the DT

t_pred = predict(dt.model1,data1,type="class")
confMat <- table(data1$acquisition,t_pred)
confMat
##    t_pred
##       0   1
##   0 112  50
##   1  27 311
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.846

Decision Tree gave 80.6% accuracy.

# Run a logistic regression model
set.seed(123)
train.control <- trainControl(method = "cv")
glm.model1 <- train(acquisition~acq_exp_sq+industry+revenue+employees,
                  data=data1,method= 'glm', family= 'binomial',trControl = train.control)
summary(glm.model1)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2982  -0.6250   0.2985   0.6434   2.3767  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.718e+00  7.927e-01  -8.475  < 2e-16 ***
## acq_exp_sq  -1.331e-06  6.858e-07  -1.942   0.0522 .  
## industry     1.599e+00  2.605e-01   6.138 8.35e-10 ***
## revenue      7.215e-02  1.337e-02   5.397 6.76e-08 ***
## employees    6.782e-03  7.222e-04   9.391  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 629.85  on 499  degrees of freedom
## Residual deviance: 418.36  on 495  degrees of freedom
## AIC: 428.36
## 
## Number of Fisher Scoring iterations: 5
pred.glm.model1<-predict(glm.model1,data1)
confusionMatrix(pred.glm.model1,data1$acquisition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 102  37
##          1  60 301
##                                           
##                Accuracy : 0.806           
##                  95% CI : (0.7686, 0.8398)
##     No Information Rate : 0.676           
##     P-Value [Acc > NIR] : 5.918e-11       
##                                           
##                   Kappa : 0.5401          
##                                           
##  Mcnemar's Test P-Value : 0.0255          
##                                           
##             Sensitivity : 0.6296          
##             Specificity : 0.8905          
##          Pos Pred Value : 0.7338          
##          Neg Pred Value : 0.8338          
##              Prevalence : 0.3240          
##          Detection Rate : 0.2040          
##    Detection Prevalence : 0.2780          
##       Balanced Accuracy : 0.7601          
##                                           
##        'Positive' Class : 0               
## 

Logistic regression gave 80.6% accuracy.