R Markdown

library(caret)
library(tree)
library(SMCRM)
library(PerformanceAnalytics)
library(xgboost)
library(randomForestSRC)
library(rpart)
library(car)
library(dplyr)
library(ggplot2) # needed for theme_nice and the ggplot-based charts below

# theme for nice plotting
theme_nice <- theme_classic()+
                theme(
                  axis.line.y.left = element_line(colour = "black"),
                  axis.line.y.right = element_line(colour = "black"),
                  axis.line.x.bottom = element_line(colour = "black"),
                  axis.line.x.top = element_line(colour = "black"),
                  axis.text.y = element_text(colour = "black", size = 12),
                  axis.text.x = element_text(color = "black", size = 12),
                  axis.ticks = element_line(color = "black")) +
                theme(
                  axis.ticks.length = unit(-0.25, "cm"), 
                  axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")), 
                  axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
Load the dataset and get summary statistics
# Load the data into a data frame
data(acquisitionRetention)
data1 <- data.frame(acquisitionRetention)
str(acquisitionRetention)
## 'data.frame':    500 obs. of  15 variables:
##  $ customer   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ acquisition: num  1 1 1 0 1 1 1 1 0 0 ...
##  $ duration   : num  1635 1039 1288 0 1631 ...
##  $ profit     : num  6134 3524 4081 -638 5446 ...
##  $ acq_exp    : num  694 460 249 638 589 ...
##  $ ret_exp    : num  972 450 805 0 920 ...
##  $ acq_exp_sq : num  480998 211628 62016 407644 346897 ...
##  $ ret_exp_sq : num  943929 202077 648089 0 846106 ...
##  $ freq       : num  6 11 21 0 2 7 15 13 0 0 ...
##  $ freq_sq    : num  36 121 441 0 4 49 225 169 0 0 ...
##  $ crossbuy   : num  5 6 6 0 9 4 5 5 0 0 ...
##  $ sow        : num  95 22 90 0 80 48 51 23 0 0 ...
##  $ industry   : num  1 0 0 0 0 1 0 1 0 1 ...
##  $ revenue    : num  47.2 45.1 29.1 40.6 48.7 ...
##  $ employees  : num  898 686 1423 181 631 ...
# Get the summary
summary(data1)
##     customer      acquisition       duration          profit       
##  Min.   :  1.0   Min.   :0.000   Min.   :   0.0   Min.   :-1027.0  
##  1st Qu.:125.8   1st Qu.:0.000   1st Qu.:   0.0   1st Qu.: -316.3  
##  Median :250.5   Median :1.000   Median : 957.5   Median : 3369.9  
##  Mean   :250.5   Mean   :0.676   Mean   : 742.5   Mean   : 2403.8  
##  3rd Qu.:375.2   3rd Qu.:1.000   3rd Qu.:1146.2   3rd Qu.: 3931.6  
##  Max.   :500.0   Max.   :1.000   Max.   :1673.0   Max.   : 6134.3  
##     acq_exp           ret_exp         acq_exp_sq          ret_exp_sq     
##  Min.   :   1.21   Min.   :   0.0   Min.   :      1.5   Min.   :      0  
##  1st Qu.: 384.14   1st Qu.:   0.0   1st Qu.: 147562.0   1st Qu.:      0  
##  Median : 491.66   Median : 398.1   Median : 241729.7   Median : 158480  
##  Mean   : 493.35   Mean   : 336.3   Mean   : 271211.1   Mean   : 184000  
##  3rd Qu.: 600.21   3rd Qu.: 514.3   3rd Qu.: 360246.0   3rd Qu.: 264466  
##  Max.   :1027.04   Max.   :1095.0   Max.   :1054811.2   Max.   :1198937  
##       freq          freq_sq          crossbuy           sow        
##  Min.   : 0.00   Min.   :  0.00   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.: 0.00   1st Qu.:  0.00   1st Qu.: 0.000   1st Qu.:  0.00  
##  Median : 6.00   Median : 36.00   Median : 5.000   Median : 44.00  
##  Mean   : 6.22   Mean   : 69.25   Mean   : 4.052   Mean   : 38.88  
##  3rd Qu.:11.00   3rd Qu.:121.00   3rd Qu.: 7.000   3rd Qu.: 66.00  
##  Max.   :21.00   Max.   :441.00   Max.   :11.000   Max.   :116.00  
##     industry        revenue        employees     
##  Min.   :0.000   Min.   :14.49   Min.   :  18.0  
##  1st Qu.:0.000   1st Qu.:33.53   1st Qu.: 503.0  
##  Median :1.000   Median :41.43   Median : 657.5  
##  Mean   :0.522   Mean   :40.54   Mean   : 671.5  
##  3rd Qu.:1.000   3rd Qu.:47.52   3rd Qu.: 826.0  
##  Max.   :1.000   Max.   :65.10   Max.   :1461.0
# Create a copy
data2 <- data1
Correlation Analysis
# Run the correlation chart including the response variable (acquisition)
chart.Correlation(data1, histogram=TRUE, pch=19)

Most of the variables are heavily correlated with the response variable: duration, profit, ret_exp, ret_exp_sq, freq, freq_sq, crossbuy and sow all show strong correlations with acquisition. Let's confirm this numerically, then run some box plots to see if it makes sense.
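A quick numeric cross-check (a sketch, not part of the original output): since acquisition is still numeric at this point, the correlation of each variable with it can be read straight off the correlation matrix.

# Correlation of every variable with acquisition (customer id column dropped)
sort(cor(data1[, -1])[, "acquisition"], decreasing = TRUE)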

Variable exclusion based on correlation
# Create box plots to understand variable correlation with the response variable acquisition
par(mfrow = c(2, 3))
boxplot(duration~ acquisition, data = data1, xlab = "acquisition", ylab = "duration")
boxplot(ret_exp~ acquisition, data = data1, xlab = "acquisition", ylab = "ret_exp")
boxplot(freq~ acquisition, data = data1, xlab = "acquisition", ylab = "freq")
boxplot(crossbuy~ acquisition, data = data1, xlab = "acquisition", ylab = "crossbuy")
boxplot(sow~ acquisition, data = data1, xlab = "acquisition", ylab = "sow")
boxplot(profit~ acquisition, data = data1, xlab = "acquisition", ylab = "profit")

It is clear that whenever duration, ret_exp, freq, crossbuy or sow is 0, acquisition is always 0 as well: these fields are only populated once a customer has actually been acquired.

Similarly, whenever profit is negative, acquisition is always 0, indicating that marketing costs are incurred even when acquisition fails. (A quick tabulation confirming both patterns follows below.)

We will therefore exclude these variables from the acquisition model.

We will also exclude the “customer” variable, as it is just an identifier and has no influence on the response.
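Here is that check (a sketch; this tabulation is not part of the original output):

# Zero duration and negative profit should only occur when acquisition == 0
table(zero_duration = data1$duration == 0, acquisition = data1$acquisition)
table(negative_profit = data1$profit < 0, acquisition = data1$acquisition)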

Transform the variables
# Convert acquisition to a factor
data1$acquisition <- as.factor(data1$acquisition)
# Check for missing values and remove them
data1 <- na.omit(data1)
Task 1 - Use the acquisitionRetention data set to predict which customers will be acquired, and for how long (duration), based on a feature set, using a random forest.
  1. Predict which customers will be acquired
# Run a random forest model to predict acquisition using only the kept fields based on the above correlation analysis

# set the seed for reproducibility
set.seed(123)

# Run a random forest (classification) with only the fields acq_exp, industry,
# revenue and employees; ntree = 1000 and importance = TRUE so we can inspect
# variable importance. (mtry is tuned by a grid search later, in Task 2;
# caret-style tuneGrid arguments are not used by rfsrc.)
rf.forest1 <- rfsrc(acquisition~acq_exp+industry+revenue+employees, 
                 data = data1,
                 importance = TRUE,
                 ntree = 1000)

rf.forest1
##                          Sample size: 500
##            Frequency of class labels: 162, 338
##                      Number of trees: 1000
##            Forest terminal node size: 1
##        Average no. of terminal nodes: 76.809
## No. of variables tried at each split: 2
##               Total no. of variables: 4
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 316
##                             Analysis: RF-C
##                               Family: class
##                       Splitting rule: gini *random*
##        Number of random split points: 10
##               Normalized brier score: 57.26 
##                                  AUC: 85.97 
##                           Error rate: 0.21, 0.41, 0.12
## 
## Confusion matrix:
## 
##           predicted
##   observed  0   1 class.error
##          0 96  66      0.4074
##          1 40 298      0.1183
## 
##  Overall error rate: 21.2%

This gave an OOB accuracy of 78.8% (overall error rate 21.2%).
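The same figure can be recovered directly from the out-of-bag class predictions (a quick cross-check, not part of the original output; rf.forest1$class.oob is reused in the next step):

# OOB accuracy = share of OOB predicted classes matching the observed classes
mean(rf.forest1$class.oob == data1$acquisition)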

  2. Predict for how long (duration), based on a feature set, using a random forest.
# Create a data frame from the OOB predicted acquisition values
predictedAcqusition <- rf.forest1$class.oob
data2<-cbind(data1,predictedAcqusition)
# Keep only the customers predicted to be acquired in a new dataframe
data3 <- data2 %>% filter(predictedAcqusition == 1)
summary(data3)
##     customer     acquisition    duration          profit         acq_exp       
##  Min.   :  1.0   0: 66       Min.   :   0.0   Min.   :-1027   Min.   :  43.15  
##  1st Qu.:130.8   1:298       1st Qu.: 813.5   1st Qu.: 3028   1st Qu.: 407.69  
##  Median :258.0               Median :1020.5   Median : 3555   Median : 496.07  
##  Mean   :256.7               Mean   : 894.2   Mean   : 3028   Mean   : 492.25  
##  3rd Qu.:386.2               3rd Qu.:1175.2   3rd Qu.: 4049   3rd Qu.: 579.63  
##  Max.   :500.0               Max.   :1673.0   Max.   : 6134   Max.   :1027.04  
##     ret_exp         acq_exp_sq        ret_exp_sq           freq       
##  Min.   :   0.0   Min.   :   1862   Min.   :      0   Min.   : 0.000  
##  1st Qu.: 322.1   1st Qu.: 166208   1st Qu.: 103779   1st Qu.: 3.000  
##  Median : 439.1   Median : 246087   Median : 192835   Median : 8.000  
##  Mean   : 404.9   Mean   : 262301   Mean   : 219752   Mean   : 7.527  
##  3rd Qu.: 538.4   3rd Qu.: 335971   3rd Qu.: 289858   3rd Qu.:11.000  
##  Max.   :1082.4   Max.   :1054811   Max.   :1171525   Max.   :21.000  
##     freq_sq          crossbuy          sow            industry     
##  Min.   :  0.00   Min.   : 0.00   Min.   :  0.00   Min.   :0.0000  
##  1st Qu.:  9.00   1st Qu.: 3.00   1st Qu.: 29.75   1st Qu.:0.0000  
##  Median : 64.00   Median : 5.00   Median : 52.00   Median :1.0000  
##  Mean   : 84.27   Mean   : 4.92   Mean   : 46.78   Mean   :0.5907  
##  3rd Qu.:121.00   3rd Qu.: 7.00   3rd Qu.: 68.00   3rd Qu.:1.0000  
##  Max.   :441.00   Max.   :11.00   Max.   :116.00   Max.   :1.0000  
##     revenue        employees      predictedAcqusition
##  Min.   :15.86   Min.   :  56.0   0:  0              
##  1st Qu.:35.98   1st Qu.: 606.8   1:364              
##  Median :43.06   Median : 744.5                      
##  Mean   :42.21   Mean   : 753.7                      
##  3rd Qu.:48.16   3rd Qu.: 895.2                      
##  Max.   :65.10   Max.   :1461.0
# Run the correlation chart on duration and the candidate predictors
chart.Correlation(data3[,c(3,5:6,9,11:15)], histogram=TRUE, pch=19)

# Let's run a GLM to check collinearity among all candidate predictors
set.seed(123)
glm.fit1 <- glm(duration~profit+acq_exp+freq+ret_exp+crossbuy+sow+industry+revenue+employees, data=data3)
preds.glm1 = predict(glm.fit1, newdata = data3)
glm1.mse = mean((preds.glm1 - data3$duration)^2)
glm1.mse
## [1] 2647.914
car::vif(glm.fit1)
##    profit   acq_exp      freq   ret_exp  crossbuy       sow  industry   revenue 
## 18.315825  1.131901  1.737211 10.056359  2.808369  2.559769  1.079648  1.091787 
## employees 
##  1.265496

Here profit and ret_exp are highly collinear (both have VIFs above 10), as the correlation chart also indicated. Let's remove ret_exp, since we are dealing with acquisition, and see how the collinearity looks.

# Rerun the GLM without ret_exp and check collinearity again
set.seed(123)
glm.fit2 <- glm(duration~profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees, data=data3)
preds.glm2 = predict(glm.fit2, newdata = data3)
glm2.mse = mean((preds.glm2 - data3$duration)^2)
glm2.mse
## [1] 6219.244
car::vif(glm.fit2)
##    profit   acq_exp      freq  crossbuy       sow  industry   revenue employees 
##  4.000792  1.039238  1.736314  2.617264  2.331227  1.058091  1.087949  1.245028

Although the MSE rises from about 2,648 to 6,219, all VIFs are now below 5, so collinearity is no longer a concern.

# Let's create a random forest model to predict duration, only for the customers
# predicted to be acquired

# Run a random forest REGRESSION model to predict duration with all fields except
# the transformed squared variables and ret_exp

# set the seed for reproducibility
set.seed(123)

# ntree = 1000 and importance = TRUE so we can inspect variable importance
# (mtry and nodesize are tuned by the grid search below)
rf.forest2 <- rfsrc(duration~profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees, 
                 data = data3,
                 importance = TRUE,
                 ntree = 1000)

rf.forest2
##                          Sample size: 364
##                      Number of trees: 1000
##            Forest terminal node size: 5
##        Average no. of terminal nodes: 39.23
## No. of variables tried at each split: 3
##               Total no. of variables: 8
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 230
##                             Analysis: RF-R
##                               Family: regr
##                       Splitting rule: mse *random*
##        Number of random split points: 10
##                 % variance explained: 97.1
##                           Error rate: 6232.35
# Get the predicted duration and write into a dataframe
predictedDuration <- predict(rf.forest2,data3)$predicted
data4 <- cbind(data3,predictedDuration)

# Calculate the actual duration and the predicted duration (how long) for the customers predicted to be acquired
TotalActualDuration <- sum(data4$duration)
TotalPredictedDuration <- sum(data4$predictedDuration)

cat("TotalActualDuration = ", TotalActualDuration)
## TotalActualDuration =  325490
cat("\nTotalPredictedDuration = ", TotalPredictedDuration)
## 
## TotalPredictedDuration =  325819.2
# Write the customers predicted to be acquired, with their predicted duration, to a CSV file
# (write.csv always uses a comma separator, so no sep argument is needed)
write.csv(data4, file = "casestudy4.csv")
Task 2 - Compute variable importance to detect interactions and optimize hyperparameters for acquired customers.
Compute Variable Importance and Minimal Depth to detect interactions
  1. Variable importance
# Creating a plot of variable importance
rf.forest2$importance # values vary a lot
##      profit     acq_exp        freq    crossbuy         sow    industry 
## 62919.52006  3598.16319 37340.18739 54590.92936 22353.78101   125.35100 
##     revenue   employees 
##   -27.07561    41.13362
# Use this if the question is related to variable importance on a classification model
# importance = rf.forest1$importance[,c(1)] # values vary a lot

data.frame(importance = rf.forest2$importance) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,importance), y = importance)) +
    geom_bar(stat = "identity", fill = "orange", color = "black")+
    coord_flip() +
     labs(x = "Variables", y = "Variable importance")+
     theme_nice          # where's industry? difficult to see

rf.forest2$importance %>% log() # log transform
## Warning in log(.): NaNs produced
##    profit   acq_exp      freq  crossbuy       sow  industry   revenue employees 
## 11.049612  8.188179 10.527825 10.907623 10.014751  4.831118       NaN  3.716826
data.frame(importance = rf.forest2$importance) %>% # could shift by a large +ve constant first; see below
  log() %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,importance), y = importance)) +
    geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.5)+
    coord_flip() +
    labs(x = "Variables", y = "Log-transformed variable importance") +
    theme_nice       
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 1 rows containing missing values (position_stack).

The log transform produces a NaN because the importance of revenue is slightly negative, so that bar is dropped from the plot. Since the raw importance plot above already separates the variables clearly, we keep the untransformed values.
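If the log view is wanted with all eight variables, a constant shift keeps the negative revenue importance in range (a sketch of the "add a large +ve constant" idea noted in the comment above):

# Shift by a constant so log() is defined for every variable, including revenue
shift <- abs(min(rf.forest2$importance)) + 1
log(rf.forest2$importance + shift)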

  2. Minimal Depth
mindepth <- max.subtree(rf.forest2,
                        sub.order = TRUE)

# first order depths
print(round(mindepth$order, 3)[,1])
##    profit   acq_exp      freq  crossbuy       sow  industry   revenue employees 
##     1.635     3.297     2.279     2.274     3.097     7.579     4.109     3.778
# visualize MD
data.frame(md = round(mindepth$order, 3)[,1]) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
    geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.2)+
    coord_flip() +
     labs(x = "Variables", y = "Minimal Depth")+
     theme_nice

# interactions
mindepth$sub.order
##              profit   acq_exp      freq  crossbuy       sow  industry   revenue
## profit    0.1399196 0.2264888 0.3205463 0.4089828 0.3510775 0.6802229 0.3576138
## acq_exp   0.2516335 0.2862922 0.4637345 0.5535774 0.4576455 0.7624997 0.4606001
## freq      0.3002827 0.3989477 0.2003254 0.5225195 0.4786879 0.7711915 0.4733314
## crossbuy  0.3387873 0.4275252 0.4883598 0.1999977 0.5227186 0.7921141 0.5137861
## sow       0.3905999 0.4891231 0.5645602 0.6259963 0.2717470 0.8379984 0.5698369
## industry  0.7872336 0.8231147 0.8627345 0.8951583 0.8713991 0.6518157 0.8634710
## revenue   0.4425062 0.5181028 0.6428629 0.6992784 0.6370630 0.8839927 0.3598838
## employees 0.3929604 0.4813238 0.6074370 0.6585310 0.5811056 0.8555074 0.5887954
##           employees
## profit    0.3300100
## acq_exp   0.4277176
## freq      0.4782207
## crossbuy  0.5056605
## sow       0.5782995
## industry  0.8591563
## revenue   0.6262199
## employees 0.3291201
as.matrix(mindepth$sub.order) %>%
  reshape2::melt() %>%
  data.frame() %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
    scale_x_discrete(position = "top") +
    geom_tile(color = "white") +
    viridis::scale_fill_viridis("Relative min. depth") +
    labs(x = "", y = "") +
    theme_bw()

# cross-check with vimp
find.interaction(rf.forest2,
                      method = "vimp",
                      importance = "permute")
## Pairing profit with crossbuy 
## Pairing profit with freq 
## Pairing profit with sow 
## Pairing profit with acq_exp 
## Pairing profit with industry 
## Pairing profit with employees 
## Pairing profit with revenue 
## Pairing crossbuy with freq 
## Pairing crossbuy with sow 
## Pairing crossbuy with acq_exp 
## Pairing crossbuy with industry 
## Pairing crossbuy with employees 
## Pairing crossbuy with revenue 
## Pairing freq with sow 
## Pairing freq with acq_exp 
## Pairing freq with industry 
## Pairing freq with employees 
## Pairing freq with revenue 
## Pairing sow with acq_exp 
## Pairing sow with industry 
## Pairing sow with employees 
## Pairing sow with revenue 
## Pairing acq_exp with industry 
## Pairing acq_exp with employees 
## Pairing acq_exp with revenue 
## Pairing industry with employees 
## Pairing industry with revenue 
## Pairing employees with revenue 
## 
##                               Method: vimp
##                     No. of variables: 8
##            Variables sorted by VIMP?: TRUE
##    No. of variables used for pairing: 8
##     Total no. of paired interactions: 28
##             Monte Carlo replications: 1
##     Type of noising up used for VIMP: permute
## 
##                         Var 1      Var 2      Paired    Additive Difference
## profit:crossbuy    63075.9562 55542.0686 144710.2118 118618.0248 26092.1870
## profit:freq        63075.9562 37727.4432 118575.7512 100803.3994 17772.3518
## profit:sow         63075.9562 21976.4935  97062.6351  85052.4497 12010.1854
## profit:acq_exp     63075.9562  3469.3948  61971.0645  66545.3510 -4574.2865
## profit:industry    63075.9562    52.9427  63277.5128  63128.8989   148.6139
## profit:employees   63075.9562   130.7931  62952.0255  63206.7493  -254.7238
## profit:revenue     63075.9562   -12.2320  63162.9360  63063.7242    99.2117
## crossbuy:freq      55482.8485 37614.0256 113120.6145  93096.8742 20023.7403
## crossbuy:sow       55482.8485 22088.6401  87676.6559  77571.4887 10105.1673
## crossbuy:acq_exp   55482.8485  3530.7137  58758.6047  59013.5622  -254.9576
## crossbuy:industry  55482.8485    55.3493  56288.5705  55538.1978   750.3727
## crossbuy:employees 55482.8485   135.8067  55891.8118  55618.6552   273.1566
## crossbuy:revenue   55482.8485    53.3064  55719.0531  55536.1549   182.8982
## freq:sow           37784.7137 21637.8438  63436.6884  59422.5575  4014.1309
## freq:acq_exp       37784.7137  3523.1716  40690.1692  41307.8852  -617.7160
## freq:industry      37784.7137    38.7850  37528.1332  37823.4986  -295.3654
## freq:employees     37784.7137   192.7261  37144.1947  37977.4398  -833.2451
## freq:revenue       37784.7137   -22.8234  37699.7775  37761.8902   -62.1128
## sow:acq_exp        21742.8296  3462.1572  24222.9264  25204.9868  -982.0604
## sow:industry       21742.8296    56.5913  21183.8711  21799.4209  -615.5498
## sow:employees      21742.8296   188.9560  22719.7225  21931.7856   787.9369
## sow:revenue        21742.8296   -45.1399  21742.7874  21697.6897    45.0978
## acq_exp:industry    3505.8289    23.3140   3555.2209   3529.1429    26.0780
## acq_exp:employees   3505.8289   223.4505   3527.1586   3729.2794  -202.1209
## acq_exp:revenue     3505.8289   -19.1429   3592.9939   3486.6860   106.3080
## industry:employees    65.8362   184.1673    270.9310    250.0035    20.9275
## industry:revenue      65.8362   -15.5537     74.2656     50.2826    23.9830
## employees:revenue    -16.0790    68.7050     -6.5460     52.6260   -59.1720
  3. Tuning the forest hyper-parameters for predictive accuracy, without any interaction terms
set.seed(123)
# Establish a list of possible values for hyper-parameters
mtry.values <- seq(4,6,1)
nodesize.values <- seq(4,8,2)
ntree.values <- seq(4e3,6e3,1e3)

# Create a data frame containing all combinations 
hyper_grid <- expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values)

# Create an empty vector to store OOB error values
oob_err <- c()

# Write a loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {

    # Train a Random Forest model
   model <- rfsrc(duration ~ profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees,
                            data = data3,
                            mtry = hyper_grid$mtry[i],
                            nodesize = hyper_grid$nodesize[i],
                            ntree = hyper_grid$ntree[i])  
  
                          
    # Store OOB error for the model                      
    oob_err[i] <- model$err.rate[length(model$err.rate)]
}
# Identify the optimal set of hyperparameters based on OOB error

opt_i <- which.min(oob_err)
print(hyper_grid[opt_i,])
##    mtry nodesize ntree
## 12    6        4  5000
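The forest can then be refit at these settings (a sketch; this refit was not part of the original run, so its output is not shown):

# Refit the duration forest with the optimal hyper-parameters found above
set.seed(123)
rf.forest2.tuned <- rfsrc(duration~profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees,
                          data = data3,
                          mtry = 6, nodesize = 4, ntree = 5000,
                          importance = TRUE)
rf.forest2.tuned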
Task 3 - Compare the accuracy of the random forest model with decision tree and logistic regression models for acquiring customers
# Run a decision tree model

dt.model1 <- rpart(acquisition~acq_exp+industry+revenue+employees,
                             data = data1) # simple DT model

rattle::fancyRpartPlot(dt.model1, sub = "") # visualize the DT

# Run the decision tree with 10-fold cross-validation
set.seed(123)
train.control <- trainControl(method = "CV", number=10)
dt.model2 <- train(acquisition~acq_exp+industry+revenue+employees,
                    data = data1,
                    trControl = train.control
                   ,method = 'rpart')

dt.model2
## CART 
## 
## 500 samples
##   4 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 449, 450, 450, 451, 450, 450, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa     
##   0.05555556  0.7533605  0.41364355
##   0.06790123  0.7533605  0.40826208
##   0.25308642  0.6698639  0.04969707
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.06790123.

The decision tree gave 75.33% accuracy with 10-fold cross-validation.

t_pred = predict(dt.model1,data1,type="class")
confMat <- table(data1$acquisition,t_pred)
confMat
##    t_pred
##       0   1
##   0 112  50
##   1  27 311
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.846

The decision tree gave 84.6% accuracy without CV, i.e. on the training data, so this estimate is optimistic.

# Run a logistic regression model
set.seed(123)
train.control <- trainControl(method = "cv")
glm.model1 <- train(acquisition~acq_exp_sq+industry+revenue+employees,
                  data=data1,method= 'glm', family= 'binomial',trControl = train.control)
summary(glm.model1)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2982  -0.6250   0.2985   0.6434   2.3767  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.718e+00  7.927e-01  -8.475  < 2e-16 ***
## acq_exp_sq  -1.331e-06  6.858e-07  -1.942   0.0522 .  
## industry     1.599e+00  2.605e-01   6.138 8.35e-10 ***
## revenue      7.215e-02  1.337e-02   5.397 6.76e-08 ***
## employees    6.782e-03  7.222e-04   9.391  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 629.85  on 499  degrees of freedom
## Residual deviance: 418.36  on 495  degrees of freedom
## AIC: 428.36
## 
## Number of Fisher Scoring iterations: 5
pred.glm.model1<-predict(glm.model1,data1)
confusionMatrix(pred.glm.model1,data1$acquisition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 102  37
##          1  60 301
##                                           
##                Accuracy : 0.806           
##                  95% CI : (0.7686, 0.8398)
##     No Information Rate : 0.676           
##     P-Value [Acc > NIR] : 5.918e-11       
##                                           
##                   Kappa : 0.5401          
##                                           
##  Mcnemar's Test P-Value : 0.0255          
##                                           
##             Sensitivity : 0.6296          
##             Specificity : 0.8905          
##          Pos Pred Value : 0.7338          
##          Neg Pred Value : 0.8338          
##              Prevalence : 0.3240          
##          Detection Rate : 0.2040          
##    Detection Prevalence : 0.2780          
##       Balanced Accuracy : 0.7601          
##                                           
##        'Positive' Class : 0               
## 

Logistic regression gave 80.6% accuracy on the training data.
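Collecting the accuracies reported above into a single table makes the comparison explicit (a sketch; the numbers are transcribed from the outputs above, with the random forest accuracy taken as 1 minus its OOB error rate):

# Summary of classifier accuracy (values transcribed from the outputs above)
data.frame(model    = c("Random forest (OOB)",
                        "Decision tree (10-fold CV)",
                        "Decision tree (training data)",
                        "Logistic regression (training data)"),
           accuracy = c(0.788, 0.753, 0.846, 0.806))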

PDP charts
# Inspect the relationship of acq_exp with predicted duration using a partial dependence plot (PDP)
min(rf.forest2$xvar$acq_exp)
## [1] 43.15
max(rf.forest2$xvar$acq_exp)
## [1] 1027.04
acq_exp_seq = seq(40,1040,50)
# extract marginal effect using partial dependence
marginal.effect.rf2 <- partial(rf.forest2,
                           partial.xvar = "acq_exp",
                           partial.values = acq_exp_seq)

means.exp.rf2 <- marginal.effect.rf2$regrOutput$duration %>% colMeans()
marginal.effect.df.rf2 <-
  data.frame(pred.duration.rf2 = means.exp.rf2, acq_exp_seq = acq_exp_seq)
ggplot(marginal.effect.df.rf2, aes(x = acq_exp_seq, y = pred.duration.rf2)) +
  geom_point(shape = 21, color = "purple", size = 2, stroke = 1.2)+
  geom_smooth(method = "lm", formula = y ~ poly(x,3), se = FALSE, color = "black")+ # try with other values 
  labs(x = "Acquisition expense in $", y = "Predicted duration") +
  scale_x_continuous(breaks = seq(40,1040,50))+
  theme_nice 

# Cross-check against the relationship between actual duration and acq_exp

ggplot(data3, aes(x = acq_exp, y = duration)) +
  geom_point(shape = 21, col = "purple", size = 3) +
  stat_smooth(method = "lm", se = FALSE, color = "black") +
  scale_x_continuous(breaks = seq(40,1040,50)) +
  scale_y_continuous(breaks = seq(0,1680,120)) +
  geom_rug(sides = "b", col = "red", alpha = 0.2) +
  labs(y = "Actual duration", x = "Acquisition expense in $") +
  theme_nice
## `geom_smooth()` using formula 'y ~ x'

# Repeat over a narrower acq_exp range (440-590) with a finer step
acq_exp_seq2 = seq(440,590,5)

# extract marginal effect using partial dependence
marginal.effect.rf2.new <- partial(rf.forest2,
                           partial.xvar = "acq_exp",
                           partial.values = acq_exp_seq2)

means.exp.rf2.new <- marginal.effect.rf2.new$regrOutput$duration %>% colMeans()

marginal.effect.df.rf2.new <-
  data.frame(pred.duration.rf2.new = means.exp.rf2.new, acq_exp_seq2 = acq_exp_seq2)

ggplot(marginal.effect.df.rf2.new, aes(x = acq_exp_seq2, y = pred.duration.rf2.new)) +
  geom_point(shape = 21, color = "purple", size = 2, stroke = 1.2)+
  geom_smooth(method = "lm", formula = y ~ poly(x,3), se = FALSE, color = "black")+ # try with other values 
  labs(x = "Acquisition expense in $", y = "Predicted duration") +
  scale_x_continuous(breaks = seq(440,590,5))+
  theme_nice 

# Finally, compare with a linear regression that includes the squared terms
lm_regr <- lm(duration ~ profit+acq_exp+acq_exp_sq+freq+freq_sq+crossbuy+sow+industry+revenue+employees, 
   data = data3)
summary(lm_regr)
## 
## Call:
## lm(formula = duration ~ profit + acq_exp + acq_exp_sq + freq + 
##     freq_sq + crossbuy + sow + industry + revenue + employees, 
##     data = data3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -295.179  -46.914    5.709   49.073  213.777 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.590e+02  4.789e+01  11.673  < 2e-16 ***
## profit       2.981e-01  5.387e-03  55.332  < 2e-16 ***
## acq_exp     -9.973e-01  1.451e-01  -6.875 2.82e-11 ***
## acq_exp_sq   6.798e-04  1.418e-04   4.795 2.40e-06 ***
## freq        -5.528e+00  3.500e+00  -1.579  0.11515    
## freq_sq      1.252e-03  1.780e-01   0.007  0.99439    
## crossbuy    -7.244e+00  2.246e+00  -3.226  0.00137 ** 
## sow         -5.946e-01  2.168e-01  -2.742  0.00641 ** 
## industry    -4.021e+01  8.704e+00  -4.620 5.39e-06 ***
## revenue     -9.220e-01  4.529e-01  -2.036  0.04252 *  
## employees   -1.158e-01  2.168e-02  -5.340 1.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77.59 on 353 degrees of freedom
## Multiple R-squared:  0.9727, Adjusted R-squared:  0.972 
## F-statistic:  1260 on 10 and 353 DF,  p-value: < 2.2e-16