Statistical Learning Homework 4

Juan Olazaba 2024-04-07

Exercise 1

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to generate the data:

\[ y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2) \]

(a) Did the random forest model significantly use the uninformative predictors (V6–V10)?
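The code throughout assumes a setup chunk (not shown above) that attaches the packages whose functions are called; the package names below are inferred from those calls, and the data-set sources (modeldata for mlc_churn, ISLR2 for OJ) are assumptions.

library(mlbench)        # mlbench.friedman1()
library(randomForest)
library(caret)          # varImp(), nearZeroVar(), train(), findCorrelation()
library(party)          # cforest(), varimp()
library(rpart); library(rpart.plot)
library(tree)
library(gbm)
library(xgboost)
library(dplyr); library(tidyr); library(ggplot2)
library(gam)            # gam(), s()
library(pROC)           # roc(), auc()
library(e1071)          # svm(), tune()
library(modeldata)      # mlc_churn (assumed source)
library(ISLR2)          # OJ (assumed source)
library(knitr)          # kable()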

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
osim <- simulated
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 5000)
rfImp1 <- varImp(model1, scale = FALSE)

Based on our results, the random forest model did not significantly use the uninformative predictors (V6–V10): their importance scores are all near zero (some slightly negative), far below the scores of the informative predictors V1–V5.

(b) Fit another random forest model to these data after adding a predictor that is highly correlated with V1. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
set.seed(104)
simulated$duplicate1 <- simulated$V1+rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9477664
rf.mod <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 5000)
rfImp2 <- varImp(rf.mod, scale = FALSE)
print(rfImp2)
##                  Overall
## V1          6.1480103862
## V2          6.2205755875
## V3          0.6131062101
## V4          7.0224235923
## V5          2.0881708413
## V6          0.1769925679
## V7         -0.0052209694
## V8         -0.0843261310
## V9         -0.0655124279
## V10        -0.0007077308
## duplicate1  3.5619669553
simulated$duplicate2 <- simulated$V1+rnorm(200) * 0.1
rf.mod2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 5000)
rfImp3 <- varImp(rf.mod2, scale = FALSE)

print(rfImp3)
##                Overall
## V1          5.28857264
## V2          6.53889792
## V3          0.45648431
## V4          6.97686635
## V5          1.95041235
## V6          0.15736819
## V7         -0.01951386
## V8         -0.08711254
## V9         -0.05816253
## V10        -0.01089477
## duplicate1  2.20132750
## duplicate2  2.21517943
print(rfImp2)
##                  Overall
## V1          6.1480103862
## V2          6.2205755875
## V3          0.6131062101
## V4          7.0224235923
## V5          2.0881708413
## V6          0.1769925679
## V7         -0.0052209694
## V8         -0.0843261310
## V9         -0.0655124279
## V10        -0.0007077308
## duplicate1  3.5619669553
print(rfImp1)
##         Overall
## V1   8.79515634
## V2   6.56654060
## V3   0.73300417
## V4   7.64635222
## V5   2.15996803
## V6   0.13519686
## V7   0.06891977
## V8  -0.14606085
## V9  -0.08615019
## V10 -0.02833177
merged_df <- merge(rfImp1, rfImp2, by = 0, all = TRUE)  # full outer join based on row names

# Change column names
colnames(merged_df) <- c("Variable", "Original", "Added")

# Turn rfImp3 row names into a column for it to match "Variable" column in merged_df
rfImp3$Variable <- row.names(rfImp3)

# Merge resulting data frame
final_merged_df <- merge(merged_df, rfImp3, by = "Variable", all = TRUE)
print(final_merged_df)
##      Variable    Original         Added     Overall
## 1  duplicate1          NA  3.5619669553  2.20132750
## 2  duplicate2          NA            NA  2.21517943
## 3          V1  8.79515634  6.1480103862  5.28857264
## 4         V10 -0.02833177 -0.0007077308 -0.01089477
## 5          V2  6.56654060  6.2205755875  6.53889792
## 6          V3  0.73300417  0.6131062101  0.45648431
## 7          V4  7.64635222  7.0224235923  6.97686635
## 8          V5  2.15996803  2.0881708413  1.95041235
## 9          V6  0.13519686  0.1769925679  0.15736819
## 10         V7  0.06891977 -0.0052209694 -0.01951386
## 11         V8 -0.14606085 -0.0843261310 -0.08711254
## 12         V9 -0.08615019 -0.0655124279 -0.05816253

Yes: after adding duplicate1, the importance score for V1 decreased by roughly 30% (from 8.80 to 6.15). Adding a second predictor that is also highly correlated with V1 (duplicate2) reduced it further (to 5.29), with the two duplicates splitting importance between themselves. This suggests that collinearity dilutes the apparent importance of each correlated predictor rather than eliminating the underlying signal.
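A quick arithmetic check of that drop, computed from the importance objects above (the Overall column name follows the caret::varImp output printed earlier):

# Percent decrease in V1's importance after adding duplicate1 (~30% given the values above)
100 * (rfImp1["V1", "Overall"] - rfImp2["V1", "Overall"]) / rfImp1["V1", "Overall"]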

(c) Fit a random forest using conditional inference trees with the cforest function in the party package. Do these importances show the same pattern as the traditional random forest model in part (a)?

The cforest() model gives results similar to the traditional random forest: in the table below, the importance values for the informative predictors are broadly comparable, and V6–V10 remain unimportant under both approaches.

set.seed(401)
party.rf <- cforest(y ~ ., data = osim)
party.vi <- varimp(party.rf)
print(party.vi)
##          V1          V2          V3          V4          V5          V6 
##  8.46594475  6.71210451  0.03170395  8.26675512  1.98240807  0.04118253 
##          V7          V8          V9         V10 
## -0.03771838 -0.01570611 -0.01672384 -0.02870670
# Combine the two data frames by columns
combined_df <- cbind(rfImp1, party.vi)

# Change column names
colnames(combined_df) <- c("Traditional","Conditional Inference Trees")
sorted <- combined_df %>% 
          arrange(desc(Traditional))

# View the combined data frame
print(sorted)
##     Traditional Conditional Inference Trees
## V1   8.79515634                  8.46594475
## V4   7.64635222                  8.26675512
## V2   6.56654060                  6.71210451
## V5   2.15996803                  1.98240807
## V3   0.73300417                  0.03170395
## V6   0.13519686                  0.04118253
## V7   0.06891977                 -0.03771838
## V10 -0.02833177                 -0.02870670
## V9  -0.08615019                 -0.01672384
## V8  -0.14606085                 -0.01570611
combined_df$vars <- row.names(combined_df)
bars <- gather(combined_df,"Model","Value",-vars)

ggplot(bars, aes(x = vars, 
                 y = Value,
                 fill = Model,
                 color = Model)) +
  labs(x = "Variable",
       y = "Value") +
  geom_bar(stat = "identity", position="dodge") +
  ggtitle("Variable Importance by Model") +
  theme_light()
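As an additional check (not part of the original output), party::varimp also supports conditional permutation importance, which adjusts each predictor's score for correlated covariates; a minimal sketch:

# Conditional permutation importance; with the largely uncorrelated V1-V10 it should
# tell the same story as the unconditional values above (this call can be slow)
party.vi.cond <- varimp(party.rf, conditional = TRUE)
sort(party.vi.cond, decreasing = TRUE)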

(d) Does the same pattern occur with different models? Models evaluated: single decision tree, conditional inference trees, bagging (sketched after the XGBoost output below), GBM, and XGBoost.

When comparing variable importance across models, the traditional random forest and the conditional inference forest gave very similar results: the strongest predictors receive comparable scores and rankings, and V6–V10 remain unimportant in both. A single decision tree tells a somewhat different story; in particular, it places V1 fourth rather than first. Comparing XGBoost with GBM, both prioritize the first five variables in a similar fashion, with V1 and V4 being the only two that switch places. Overall, the tree-ensemble methods identify the same informative predictors and largely ignore the uninformative ones; the single decision tree was the only model whose importance ranking differed noticeably.

# Single decision tree
t.mod <- rpart(y ~ ., data = osim)
singletree <- varImp(t.mod, scale = FALSE)
rpart.plot(t.mod, box.palette = "RdBu", shadow.col = "gray", nn = TRUE)

# GBM
gbm_model <- gbm(y ~ .,
                 data = osim,
                 distribution = "gaussian",  
                 n.trees = 500,  
                 interaction.depth = 4,  
                 shrinkage = 0.01,  
                 cv.folds = 5,
                 verbose = TRUE)
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1       24.1670             nan     0.0100    0.1531
##      2       23.9133             nan     0.0100    0.2340
##      3       23.6358             nan     0.0100    0.2183
##      4       23.3923             nan     0.0100    0.1843
##      5       23.1776             nan     0.0100    0.1824
##      6       22.9465             nan     0.0100    0.1923
##      7       22.7015             nan     0.0100    0.1714
##      8       22.4553             nan     0.0100    0.1932
##      9       22.2487             nan     0.0100    0.1412
##     10       22.0367             nan     0.0100    0.1628
##     20       19.9010             nan     0.0100    0.1160
##     40       16.1896             nan     0.0100    0.1421
##     60       13.3836             nan     0.0100    0.1124
##     80       11.2214             nan     0.0100    0.0934
##    100        9.5048             nan     0.0100    0.0476
##    120        8.1565             nan     0.0100    0.0362
##    140        7.0323             nan     0.0100    0.0401
##    160        6.1552             nan     0.0100    0.0255
##    180        5.4105             nan     0.0100    0.0192
##    200        4.7505             nan     0.0100    0.0154
##    220        4.1993             nan     0.0100    0.0177
##    240        3.7038             nan     0.0100    0.0184
##    260        3.2719             nan     0.0100    0.0157
##    280        2.9332             nan     0.0100    0.0104
##    300        2.6492             nan     0.0100    0.0088
##    320        2.3950             nan     0.0100    0.0052
##    340        2.1912             nan     0.0100    0.0062
##    360        2.0174             nan     0.0100    0.0018
##    380        1.8604             nan     0.0100    0.0043
##    400        1.7232             nan     0.0100    0.0015
##    420        1.6073             nan     0.0100    0.0028
##    440        1.5022             nan     0.0100   -0.0017
##    460        1.4121             nan     0.0100    0.0023
##    480        1.3316             nan     0.0100    0.0012
##    500        1.2614             nan     0.0100   -0.0003
summary(gbm_model)

##     var    rel.inf
## V4   V4 28.8769407
## V1   V1 27.2546240
## V2   V2 22.3924783
## V5   V5 11.3868202
## V3   V3  6.5079090
## V7   V7  1.3060135
## V6   V6  0.9390911
## V9   V9  0.6559162
## V8   V8  0.3862335
## V10 V10  0.2939734
# XGBoost model
# Convert the data set to xgb.DMatrix form
dtrain <- xgb.DMatrix(data = as.matrix(osim[, -11]), label = osim$y)

# Set XGBoost parameters
params <- list(
  objective = "reg:squarederror",   # Use squared error for regression
  max_depth = 3,                    # Maximum depth of trees
  eta = 0.1,                        # Learning rate
  subsample = 0.8,                  # Subsample ratio of the training instances
  colsample_bytree = 0.8,           # Subsample ratio of columns when constructing each tree
  nrounds = 100                     # ignored inside params (hence the warning below); the nrounds argument passed to xgboost() is what is used
)

# Train XGBoost regression model
set.seed(412)
xgmodel <- xgboost(params = params, data = dtrain, nrounds = params$nrounds)
## [13:11:02] WARNING: src/learner.cc:767: 
## Parameters: { "nrounds" } are not used.
## 
## [1]  train-rmse:13.404816 
## [2]  train-rmse:12.205837 
## [3]  train-rmse:11.098226 
## [4]  train-rmse:10.110466 
## [5]  train-rmse:9.223679 
## [6]  train-rmse:8.403710 
## [7]  train-rmse:7.676675 
## [8]  train-rmse:7.016842 
## [9]  train-rmse:6.405671 
## [10] train-rmse:5.868406 
## [11] train-rmse:5.394286 
## [12] train-rmse:4.943999 
## [13] train-rmse:4.553758 
## [14] train-rmse:4.180865 
## [15] train-rmse:3.867211 
## [16] train-rmse:3.588804 
## [17] train-rmse:3.336419 
## [18] train-rmse:3.102745 
## [19] train-rmse:2.888623 
## [20] train-rmse:2.700505 
## [21] train-rmse:2.531398 
## [22] train-rmse:2.390964 
## [23] train-rmse:2.256409 
## [24] train-rmse:2.126198 
## [25] train-rmse:2.007025 
## [26] train-rmse:1.903379 
## [27] train-rmse:1.807370 
## [28] train-rmse:1.715265 
## [29] train-rmse:1.634894 
## [30] train-rmse:1.558840 
## [31] train-rmse:1.490037 
## [32] train-rmse:1.442248 
## [33] train-rmse:1.386516 
## [34] train-rmse:1.341928 
## [35] train-rmse:1.289945 
## [36] train-rmse:1.244374 
## [37] train-rmse:1.213534 
## [38] train-rmse:1.175151 
## [39] train-rmse:1.146698 
## [40] train-rmse:1.118393 
## [41] train-rmse:1.080502 
## [42] train-rmse:1.057118 
## [43] train-rmse:1.030455 
## [44] train-rmse:1.005868 
## [45] train-rmse:0.984499 
## [46] train-rmse:0.962644 
## [47] train-rmse:0.938600 
## [48] train-rmse:0.924984 
## [49] train-rmse:0.906063 
## [50] train-rmse:0.887524 
## [51] train-rmse:0.871785 
## [52] train-rmse:0.852143 
## [53] train-rmse:0.837588 
## [54] train-rmse:0.823146 
## [55] train-rmse:0.809244 
## [56] train-rmse:0.793071 
## [57] train-rmse:0.782217 
## [58] train-rmse:0.771578 
## [59] train-rmse:0.760672 
## [60] train-rmse:0.746426 
## [61] train-rmse:0.735683 
## [62] train-rmse:0.724415 
## [63] train-rmse:0.713657 
## [64] train-rmse:0.705167 
## [65] train-rmse:0.698128 
## [66] train-rmse:0.691478 
## [67] train-rmse:0.683803 
## [68] train-rmse:0.673715 
## [69] train-rmse:0.665352 
## [70] train-rmse:0.655082 
## [71] train-rmse:0.646348 
## [72] train-rmse:0.636263 
## [73] train-rmse:0.625558 
## [74] train-rmse:0.620163 
## [75] train-rmse:0.611719 
## [76] train-rmse:0.603809 
## [77] train-rmse:0.599097 
## [78] train-rmse:0.591702 
## [79] train-rmse:0.583634 
## [80] train-rmse:0.578260 
## [81] train-rmse:0.571664 
## [82] train-rmse:0.566050 
## [83] train-rmse:0.562909 
## [84] train-rmse:0.559467 
## [85] train-rmse:0.553540 
## [86] train-rmse:0.545390 
## [87] train-rmse:0.538778 
## [88] train-rmse:0.535683 
## [89] train-rmse:0.531732 
## [90] train-rmse:0.525795 
## [91] train-rmse:0.519597 
## [92] train-rmse:0.515618 
## [93] train-rmse:0.510750 
## [94] train-rmse:0.506428 
## [95] train-rmse:0.500552 
## [96] train-rmse:0.496908 
## [97] train-rmse:0.493499 
## [98] train-rmse:0.486940 
## [99] train-rmse:0.482316 
## [100]    train-rmse:0.476934
# Get variable importance scores
importance_scores <- xgb.importance(model = xgmodel)

# Plot variable importance
xgb.plot.importance(importance_matrix = importance_scores, main = "XGBoost Feature Importance")
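Bagging, listed in the question above, was not fit in the chunks shown; it can be obtained as a special case of random forest by letting every split consider all ten predictors (mtry = p). A minimal sketch (the seed and ntree values are arbitrary choices):

# Bagging: a random forest with mtry equal to the number of predictors
set.seed(314)
bag.mod <- randomForest(y ~ ., data = osim, mtry = ncol(osim) - 1,
                        importance = TRUE, ntree = 1000)
varImp(bag.mod, scale = FALSE)   # compare with the importance rankings above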

Exercise 2

data(mlc_churn)
str(mlc_churn)
## tibble [5,000 × 20] (S3: tbl_df/tbl/data.frame)
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int [1:5000] 128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int [1:5000] 25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num [1:5000] 265 162 243 299 167 ...
##  $ total_day_calls              : int [1:5000] 110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num [1:5000] 45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num [1:5000] 197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int [1:5000] 99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num [1:5000] 16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num [1:5000] 245 254 163 197 187 ...
##  $ total_night_calls            : int [1:5000] 91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num [1:5000] 11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num [1:5000] 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int [1:5000] 3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num [1:5000] 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int [1:5000] 1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
table(mlc_churn$churn)
## 
##  yes   no 
##  707 4293
(a) Explore the data by visualizing the relationship between the predictors and the outcome.

Are there important features of the predictor data themselves, such as between-predictor correlations or degenerate distributions? Can functions of more than one predictor be used to model the data more effectively?

# Class imbalance check
 ggplot(mlc_churn, aes(x = churn, fill = churn)) +
     geom_bar(alpha = 0.5) +
     labs(
         title = "Histogram of Values",
         x = "Customer Churn",
         y = "Frequency")

# Check degenerate distributions
nearZeroVar(mlc_churn)
## [1] 6
hist(mlc_churn$number_vmail_messages)

# Removed: state, area_code, international_plan, voice_mail_plan, and number_vmail_messages (the near-zero-variance column flagged above); the response churn is kept.
selected = mlc_churn[, -c(1,3,4,5,6)]
selected
## # A tibble: 5,000 × 15
##    account_length total_day_minutes total_day_calls total_day_charge
##             <int>             <dbl>           <int>            <dbl>
##  1            128              265.             110             45.1
##  2            107              162.             123             27.5
##  3            137              243.             114             41.4
##  4             84              299.              71             50.9
##  5             75              167.             113             28.3
##  6            118              223.              98             38.0
##  7            121              218.              88             37.1
##  8            147              157               79             26.7
##  9            117              184.              97             31.4
## 10            141              259.              84             44.0
## # ℹ 4,990 more rows
## # ℹ 11 more variables: total_eve_minutes <dbl>, total_eve_calls <int>,
## #   total_eve_charge <dbl>, total_night_minutes <dbl>, total_night_calls <int>,
## #   total_night_charge <dbl>, total_intl_minutes <dbl>, total_intl_calls <int>,
## #   total_intl_charge <dbl>, number_customer_service_calls <int>, churn <fct>
num <- selected

# Visualize a few pairwise relationships
pairs(selected[, c(1:6)])

# Exploring different variables
ggplot(mlc_churn, aes(x = state,
                      y = account_length,
                      color = churn)) +
  geom_point()

ggplot(mlc_churn, aes(x = international_plan, y = state, fill = churn)) +
    geom_tile()

ggplot(mlc_churn, aes(x = account_length,
                      y = as.numeric(churn))) +
        geom_point()

ggplot(mlc_churn, aes(x = account_length, y = total_day_minutes, color = international_plan)) +
  geom_point()

#  facet_grid(~ churn)
ggplot(mlc_churn, aes(x = area_code)) +
  geom_bar()

hist(mlc_churn$number_customer_service_calls)

A clear class imbalance is present: only 707 of the 5,000 customers (about 14%) churned. The only predictors with strong between-predictor correlations are the minutes and charge variables for each period of the day (day, evening, night, and international), since charge is essentially a fixed multiple of minutes. One variable with a degenerate (near-zero-variance) distribution, number_vmail_messages, was removed from the data set along with the categorical identifiers. Plotting the fitted GAM shows that several predictors have nonlinear relationships with churn, so a purely linear fit would misrepresent them. I used the first GAM to identify significant predictors and fit a second GAM with smooth terms for those predictors; comparing AIC values, the second model performs better, with an AIC of 3236.17 as opposed to 3559.85.
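The between-predictor correlation claim can be checked directly; a minimal sketch using caret::findCorrelation on the numeric columns (the 0.9 cutoff is an arbitrary choice):

# Flag highly correlated numeric predictors; the minutes/charge pairs are near-perfectly correlated
num_cols <- sapply(mlc_churn, is.numeric)
findCorrelation(cor(mlc_churn[, num_cols]), cutoff = 0.9, names = TRUE)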

gam.mod1 <- gam(churn ~ .,
                 data = selected, family = binomial(link = "logit"))
summary(gam.mod1)
## 
## Call: gam(formula = churn ~ ., family = binomial(link = "logit"), data = selected)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1164  0.2681  0.4040  0.5535  1.7554 
## 
## (Dispersion Parameter for binomial family taken to be 1)
## 
##     Null Deviance: 4074.968 on 4999 degrees of freedom
## Residual Deviance: 3529.85 on 4985 degrees of freedom
## AIC: 3559.85 
## 
## Number of Local Scoring Iterations: 5 
## 
## Anova for Parametric Effects
##                                 Df Sum Sq Mean Sq  F value    Pr(>F)    
## account_length                   1    1.8   1.839   1.7390  0.187322    
## total_day_minutes                1  152.6 152.595 144.2828 < 2.2e-16 ***
## total_day_calls                  1    0.8   0.793   0.7497  0.386608    
## total_day_charge                 1    1.6   1.594   1.5067  0.219697    
## total_eve_minutes                1   31.4  31.435  29.7228 5.225e-08 ***
## total_eve_calls                  1    0.2   0.249   0.2351  0.627792    
## total_eve_charge                 1    0.2   0.211   0.1991  0.655468    
## total_night_minutes              1    8.6   8.610   8.1412  0.004345 ** 
## total_night_calls                1    0.5   0.470   0.4441  0.505175    
## total_night_charge               1    0.0   0.010   0.0094  0.922955    
## total_intl_minutes               1   18.4  18.421  17.4174 3.052e-05 ***
## total_intl_calls                 1   10.3  10.281   9.7214  0.001832 ** 
## total_intl_charge                1    0.1   0.138   0.1307  0.717766    
## number_customer_service_calls    1  221.2 221.217 209.1676 < 2.2e-16 ***
## Residuals                     4985 5272.2   1.058                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gam.mod2 <- gam(churn ~ account_length +
                 s(total_day_minutes) +
                 s(total_eve_minutes) +
                 s(total_night_minutes) +
                 s(total_intl_minutes) +
                 s(total_intl_calls) +
                 number_customer_service_calls,
                 data = selected, family = binomial(link = "logit"))
summary(gam.mod2)
## 
## Call: gam(formula = churn ~ account_length + s(total_day_minutes) + 
##     s(total_eve_minutes) + s(total_night_minutes) + s(total_intl_minutes) + 
##     s(total_intl_calls) + number_customer_service_calls, family = binomial(link = "logit"), 
##     data = selected)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7848  0.2581  0.3626  0.4969  2.3761 
## 
## (Dispersion Parameter for binomial family taken to be 1)
## 
##     Null Deviance: 4074.968 on 4999 degrees of freedom
## Residual Deviance: 3190.168 on 4977 degrees of freedom
## AIC: 3236.169 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                                 Df Sum Sq Mean Sq  F value    Pr(>F)    
## account_length                   1    2.1   2.117   2.2268 0.1356950    
## s(total_day_minutes)             1  136.6 136.557 143.6102 < 2.2e-16 ***
## s(total_eve_minutes)             1   32.6  32.644  34.3300 4.951e-09 ***
## s(total_night_minutes)           1    8.9   8.880   9.3389 0.0022553 ** 
## s(total_intl_minutes)            1   23.4  23.422  24.6321 7.170e-07 ***
## s(total_intl_calls)              1   12.6  12.640  13.2927 0.0002692 ***
## number_customer_service_calls    1  250.5 250.508 263.4473 < 2.2e-16 ***
## Residuals                     4977 4732.6   0.951                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                               Npar Df Npar Chisq    P(Chi)    
## (Intercept)                                                   
## account_length                                                
## s(total_day_minutes)                3    276.864 < 2.2e-16 ***
## s(total_eve_minutes)                3     10.387   0.01555 *  
## s(total_night_minutes)              3      2.198   0.53233    
## s(total_intl_minutes)               3     11.053   0.01144 *  
## s(total_intl_calls)                 3     26.251 8.446e-06 ***
## number_customer_service_calls                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(gam.mod2)
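The AIC values quoted above appear in the two model summaries; they can also be pulled directly from the fitted objects (the gam objects inherit from glm, so the default AIC method should apply):

# Direct AIC comparison of the two GAMs
AIC(gam.mod1)
AIC(gam.mod2)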

(b) Apply boosting, bagging, random forests, and BART to this data set. Be sure to fit the models on a training set and to evaluate their performance on a test set. How accurate are the results compared to simple methods like linear or logistic regression? What criteria should be used to evaluate the effectiveness and performance of the models?
# Note on bartMachine: attempted BART with bartMachine but could not get it to run; an alternative sketch appears after the random forest section.

# Data preparation for analysis
set.seed(123)
ind <- sample(1:nrow(selected),0.8 * nrow(selected))
ind.n <- sample(1:nrow(num),0.8 * nrow(num))
num$churn <- ifelse(num$churn == "yes", 1, 0)

# Recode yes/no responses to 1s and 0s
selected$churn <- ifelse(selected$churn == "yes", 1, 0)
selected$churn <- as.factor(selected$churn)

# Train/test split for the factor (classification) response
train <- selected[ind, ]
test <- selected[-ind, ]

# Train/test split for the numeric 0/1 response
train.n <- num[ind.n, ]
test.n <- num[-ind.n, ]

# MULTIPLE LINEAR REGRESSION
linear_model <- lm(churn ~ ., data = train.n)
pred.n <- predict(linear_model, newdata = test.n)
mse <- mean((test.n$churn - pred.n)^2)
print(paste("Mean Squared Error (MSE):", mse))
## [1] "Mean Squared Error (MSE): 0.110322346523688"
summary(linear_model)
## 
## Call:
## lm(formula = churn ~ ., data = train.n)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55215 -0.17698 -0.10279 -0.00692  1.08442 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -4.274e-01  6.284e-02  -6.801 1.19e-11 ***
## account_length                 8.727e-05  1.310e-04   0.666 0.505423    
## total_day_minutes              2.299e-01  3.109e-01   0.739 0.459732    
## total_day_calls                1.462e-04  2.648e-04   0.552 0.580924    
## total_day_charge              -1.344e+00  1.829e+00  -0.735 0.462464    
## total_eve_minutes              1.006e-02  1.546e-01   0.065 0.948123    
## total_eve_calls               -3.192e-04  2.619e-04  -1.219 0.223027    
## total_eve_charge              -1.113e-01  1.819e+00  -0.061 0.951192    
## total_night_minutes           -4.062e-02  8.170e-02  -0.497 0.619112    
## total_night_calls             -1.852e-04  2.624e-04  -0.706 0.480386    
## total_night_charge             9.104e-01  1.816e+00   0.501 0.616065    
## total_intl_minutes             6.741e-03  4.916e-01   0.014 0.989060    
## total_intl_calls              -7.389e-03  2.137e-03  -3.457 0.000552 ***
## total_intl_charge              9.492e-03  1.821e+00   0.005 0.995841    
## number_customer_service_calls  5.722e-02  3.969e-03  14.416  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3293 on 3985 degrees of freedom
## Multiple R-squared:  0.1109, Adjusted R-squared:  0.1077 
## F-statistic: 35.49 on 14 and 3985 DF,  p-value: < 2.2e-16
# R-squared is poor (0.111), with only a few predictors deemed statistically significant.

# LOGISTIC REGRESSION
ctrl <- trainControl(method = "cv", number = 5)
# Train logistic regression model using cross-validation
model <- train(churn ~ ., data = train, method = "glm", family = "binomial", trControl = ctrl)
# View cross-validation results
print(model)
## Generalized Linear Model 
## 
## 4000 samples
##   14 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 3200, 3200, 3200, 3200, 3200 
## Resampling results:
## 
##   Accuracy  Kappa    
##   0.8565    0.1036343
# Refit on the training data and evaluate on the 20% test set
log.mod <- glm(churn ~ ., data = train, family = "binomial")
predicted_probabilities <- predict(log.mod, newdata = test, type = "response")
predicted_labels <- ifelse(predicted_probabilities >= 0.5, 1, 0)

confusion_matrix <- table(test$churn, predicted_labels)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Test Error Rate:", 1 - accuracy))
## [1] "Test Error Rate: 0.135"
# Plot ROC curve
cv_roc <- roc(test$churn, predicted_probabilities)
## Setting levels: control = 0, case = 1

## Setting direction: controls < cases
plot(cv_roc, main = "ROC Curve", 
     col = "blue", lwd = 2)
# Add diagonal reference line (random classifier)
lines(0:1, 0:1, col = "gray", lty = 2)
# Add AUC value to the plot
legend("bottomright", legend = paste("AUC =", round(auc(cv_roc), 3)), 
       col = "blue", lwd = 2)

# GRADIENT BOOSTING MACHINE: learns additively, one shallow tree at a time
boosting <- gbm(churn ~., data = train.n,
                distribution = "bernoulli",
                n.trees = 3000,
                interaction.depth = 4)

predictions <- predict(boosting, test.n, n.trees = 1000, type = "response")

# Convert predicted probabilities to class labels (0 or 1)
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
accuracy <- mean(predicted_classes == test.n$churn)   # compare against the same split used for prediction (test.n)
cat("Test Error Rate:", 1 - accuracy, "\n")
## Test Error Rate: 0.194
gbm_roc <- roc(test.n$churn, predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curve
plot(gbm_roc, main = "ROC Curve for GBM Model", col = "red")
lines(x = c(0, 1), y = c(0, 1), col = "gray", lty = 2)  
legend("bottomright", legend = paste("AUC =", round(auc(gbm_roc), 3)), 
       col = "red", lwd = 2)

# RANDOM FOREST (a bagging variant, mtry = p, is sketched after this section)
rf.mod <- randomForest(churn ~ ., data = train, importance = TRUE, ntree = 5000)
variableimp <- varImp(rf.mod, scale = FALSE)

# Make predictions on 20% test set
predictions2 <- predict(rf.mod, newdata = test, type = "class")
predictions3 <- predict(rf.mod, newdata = test, type = "prob")[, "1"]
cat("Test Error Rate:", mean(predictions2 != test$churn))
## Test Error Rate: 0.08
#ROC Curve
rf_roc <- roc(test$churn, predictions3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(rf_roc, main = "ROC Curve for Random Forest Model", col = "lightgreen")
lines(x = c(0, 1), y = c(0, 1), col = "gray", lty = 2)  
legend("bottomright", legend = paste("AUC =", round(auc(rf_roc), 3)), 
       col = "lightgreen", lwd = 2)

Based on the GBM model's variable-importance (relative influence) ranking, we should move forward with the top six predictors: total_day_minutes, total_eve_minutes, number_customer_service_calls, total_intl_minutes, total_night_minutes, and account_length.

As expected, multiple linear regression did not perform well. It models a continuous numeric response and minimizes squared error between predicted and observed values, so it is a poor match for a binary churn outcome; its predictions are not constrained to lie between 0 and 1, and its R-squared was only about 0.11.

To evaluate model performance I used test-set error rates and ROC curves (AUC). Logistic regression (test error 0.135) outperformed the GBM (0.194) even after cross-validation, and random forest proved to be the best-performing model, with the lowest test error rate (0.08) and the highest AUC.

Exercise 3

data("OJ")
str(OJ)
## 'data.frame':    1070 obs. of  18 variables:
##  $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
##  $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
##  $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
##  $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
##  $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
##  $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
##  $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
##  $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
##  $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
##  $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
##  $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
##  $ PctDiscMM     : num  0 0.151 0 0 0 ...
##  $ PctDiscCH     : num  0 0 0.0914 0 0 ...
##  $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
##  $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...
set.seed(123)
ind <- sample(1:nrow(OJ),0.75 * nrow(OJ))

tr <- OJ[ind, ]
te <- OJ[-ind, ]

tmod <- rpart(Purchase ~., data = tr)
tree.model <- tree(Purchase ~., data = tr)
summary(tree.model)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = tr)
## Variables actually used in tree construction:
## [1] "LoyalCH"   "PriceDiff"
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7637 = 606.4 / 794 
## Misclassification error rate: 0.1658 = 133 / 802
plot(tree.model)
text(tree.model, pretty = 0)
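The node-level detail cited in part (c) below comes from the text representation of the fitted tree, which was not printed above; displaying it is a one-liner:

tree.model   # lists each node's split rule, n, deviance, predicted class, and class proportions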

i.tr.pred = predict(tree.model, te, type = "class")
table(i.tr.pred, te$Purchase)
##          
## i.tr.pred  CH  MM
##        CH 149  34
##        MM  15  70
result <- (149 + 70)/268
p <- 1-result
w <- round(p, 4)
print(paste("Test Error Rate:", w))
## [1] "Test Error Rate: 0.1828"
#Cross validation
set.seed(45)
cv.b <- cv.tree(tree.model, FUN = prune.misclass)
names(cv.b)
## [1] "size"   "dev"    "k"      "method"
error_rates <- cv.b$dev
tree_sizes <- cv.b$size

plot(tree_sizes, error_rates, type = "b", 
     xlab = "Number of Terminal Nodes", 
     ylab = "Cross-Validated Error Rate",
     main = "Cross-Validated Error Rates")
lines(tree_sizes, error_rates, type = "b", col = "blue")

pruned <- prune.misclass(tree.model, best = 5)
plot(pruned)
text(pruned, pretty = 0)
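The pruned tree's training error discussed below can be read from its summary (not included in the original output):

summary(pruned)   # residual mean deviance and training misclassification rate of the pruned tree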

te.preds = predict(pruned, te, type = "class")
table(te.preds, te$Purchase)
##         
## te.preds  CH  MM
##       CH 149  34
##       MM  15  70
result2 <- (149 + 70)/268
beta <- 1-result2
s <- round(beta, 4)
print(paste("Test Error Rate:", s))
## [1] "Test Error Rate: 0.1828"

(a–b) The single decision tree used only two of the 17 predictors (LoyalCH and PriceDiff). Its training misclassification error rate is 16.58%, it has 8 terminal nodes, and its residual mean deviance is 0.7637.

(c) I picked terminal node 14, whose split criterion is PriceDiff < -0.39. The branch contains 14 observations, its overall prediction is CH, and its deviance is 19.12.

(d–g) The plotted tree shows that LoyalCH and PriceDiff are the important predictors used as split points. A customer with LoyalCH below 0.276142 is more likely to purchase Minute Maid, while a customer with LoyalCH above 0.276142 and PriceDiff above 0.05 is more likely to buy Citrus Hill.

(h) A tree size of 5 corresponds to the lowest cross-validated error rate in the plot. (j) The pruned tree's training error rate is at least as high as the unpruned tree's 16.58%, since pruning cannot lower training error. (k) The pruned tree's test error rate (18.28%) is identical to the unpruned tree's, so pruning cost nothing in test accuracy while producing a smaller, more interpretable tree.

Exercise 4

(a) Split the data into training and test sets.
data("OJ")
str(OJ)
## 'data.frame':    1070 obs. of  18 variables:
##  $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
##  $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
##  $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
##  $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
##  $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
##  $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
##  $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
##  $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
##  $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
##  $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
##  $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
##  $ PctDiscMM     : num  0 0.151 0 0 0 ...
##  $ PctDiscCH     : num  0 0 0.0914 0 0 ...
##  $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
##  $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...
set.seed(123)
ind <- sample(1:nrow(OJ),0.75 * nrow(OJ))

tr <- OJ[ind, ]
te <- OJ[-ind, ]
(b) There are 621 support vectors, 309 in one class and 312 in the other. The model uses a linear kernel with a cost parameter of 0.01.
svmfit <- svm(Purchase ~ ., data = tr, kernel = "linear",
              cost = 0.01, scale = FALSE)
summary(svmfit)
## 
## Call:
## svm(formula = Purchase ~ ., data = tr, kernel = "linear", cost = 0.01, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  621
## 
##  ( 309 312 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
(c) Training and test error rates:
ypred <- predict(svmfit, tr)
table(predict = ypred, truth = tr$Purchase)
##        truth
## predict  CH  MM
##      CH 441 133
##      MM  48 180
training_error_rate <- mean(ypred != tr$Purchase)
training_error_rate
## [1] 0.2256858
xpred <- predict(svmfit, te)
test_error_rate <- mean(xpred != te$Purchase)
test_error_rate
## [1] 0.2276119
(d) Use the tune() function to cross-validate over a range of cost values.
set.seed(427)
tunes <- tune(svm, Purchase ~ ., data = tr, kernel = "linear",
              ranges = list(cost = c(0.01, 0.02, .04, .06, .08, .10)))
summary(tunes)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  0.04
## 
## - best performance: 0.1646451 
## 
## - Detailed performance results:
##   cost     error dispersion
## 1 0.01 0.1695988 0.03650945
## 2 0.02 0.1671142 0.04504690
## 3 0.04 0.1646451 0.04493262
## 4 0.06 0.1671451 0.04736174
## 5 0.08 0.1708951 0.04546727
## 6 0.10 0.1708796 0.04268469
(e) The new optimal value for the cost parameter is 0.04, which yields a test error rate of 0.160, a clear improvement over the 0.228 test error rate of the initial support vector classifier.
best <- tunes$best.model
summary(best)
## 
## Call:
## best.tune(METHOD = svm, train.x = Purchase ~ ., data = tr, ranges = list(cost = c(0.01, 
##     0.02, 0.04, 0.06, 0.08, 0.1)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.04 
## 
## Number of Support Vectors:  376
## 
##  ( 187 189 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
xlr8 <- predict(best, te)
test_error <- mean(xlr8 != te$Purchase)
cat("Test Error Rate:", test_error, "\n")
## Test Error Rate: 0.1604478
(f) Repeat parts (b) through (e) using a support vector machine with a radial kernel. Use the default value for gamma.
svm.mod <- svm(Purchase ~ ., data = tr, kernel = "radial",
              gamma = 1)   # note: gamma = 1 here rather than the e1071 default of 1/(number of predictors)

summary(svm.mod)
## 
## Call:
## svm(formula = Purchase ~ ., data = tr, kernel = "radial", gamma = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  481
## 
##  ( 217 264 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
result.svm <- predict(svm.mod, te)
te.svm <- mean(result.svm != te$Purchase)
te.svm
## [1] 0.2313433

Tuning cost and gamma improved the test error rate by about 0.01 (from 0.231 to 0.220).

tune.out <- tune(svm, Purchase ~ ., data = tr,
  kernel = "radial",
  ranges = list(cost = c(0.1, 1, 10, 100, 1000),
  gamma = c(0.5, 1, 2, 3, 4) 
  )
)

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1   0.5
## 
## - best performance: 0.1908333 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1  1e-01   0.5 0.2791667 0.04937023
## 2  1e+00   0.5 0.1908333 0.04066163
## 3  1e+01   0.5 0.1995370 0.03744128
## 4  1e+02   0.5 0.2194444 0.03484269
## 5  1e+03   0.5 0.2193981 0.04371266
## 6  1e-01   1.0 0.3365432 0.04752757
## 7  1e+00   1.0 0.1995062 0.04367377
## 8  1e+01   1.0 0.2107253 0.04057688
## 9  1e+02   1.0 0.2144599 0.03468734
## 10 1e+03   1.0 0.2218981 0.04622219
## 11 1e-01   2.0 0.3801698 0.04695982
## 12 1e+00   2.0 0.2156944 0.04240119
## 13 1e+01   2.0 0.2281481 0.04388407
## 14 1e+02   2.0 0.2206327 0.04592495
## 15 1e+03   2.0 0.2418056 0.04424762
## 16 1e-01   3.0 0.3901389 0.04410776
## 17 1e+00   3.0 0.2256481 0.04158999
## 18 1e+01   3.0 0.2343981 0.05384623
## 19 1e+02   3.0 0.2356019 0.03966734
## 20 1e+03   3.0 0.2380864 0.04373171
## 21 1e-01   4.0 0.3901389 0.04410776
## 22 1e+00   4.0 0.2306173 0.04612888
## 23 1e+01   4.0 0.2381327 0.05104952
## 24 1e+02   4.0 0.2381173 0.03714526
## 25 1e+03   4.0 0.2418519 0.04262481
table(
  true = te$Purchase,
  pred = predict(
    tune.out$best.model, newdata = te
  )
)
##     pred
## true  CH  MM
##   CH 140  24
##   MM  35  69
pred = predict(tune.out$best.model, newdata = te)
te_error <- mean(pred != te$Purchase)
cat("Support Vector Machine Tuned Test Error Rate:", te_error, "\n")
## Support Vector Machine Tuned Test Error Rate: 0.2201493
(g) Repeat parts (b) through (e) using a support vector machine with a polynomial kernel. Set degree = 2.

Overall, the tuned polynomial-kernel SVM produced the best results, with a test error rate of 0.1567. It slightly outperformed the tuned linear kernel (0.1604) and clearly outperformed the tuned radial kernel (0.2201). Notably, the tuning step selected degree = 1 for the polynomial kernel, and the linear and polynomial kernels performed very similarly, which suggests the class boundary is largely linear; the radial kernel did not help. The table at the end compares the test error rates of the three models.

svm.poly <- svm(Purchase ~ ., data = tr, kernel = "polynomial",
              degree = 2)

summary(svm.poly)
## 
## Call:
## svm(formula = Purchase ~ ., data = tr, kernel = "polynomial", degree = 2)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  451
## 
##  ( 222 229 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
pr <- predict(svm.poly, te)
mean(pr != te$Purchase)
## [1] 0.2126866

Now with tuning.

degree_grid <- expand.grid(degree = c(1, 2, 3, 4, 5))
cv.tune <- tune(svm, Purchase ~ ., data = tr,
  kernel = "polynomial",
  ranges = degree_grid,
  tunecontrol = tune.control(sampling = "cross", cross = 5))

summary(cv.tune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 5-fold cross validation 
## 
## - best parameters:
##  degree
##       1
## 
## - best performance: 0.1646196 
## 
## - Detailed performance results:
##   degree     error dispersion
## 1      1 0.1646196 0.01883537
## 2      2 0.1833463 0.03913224
## 3      3 0.1646351 0.03652580
## 4      4 0.2219410 0.04145906
## 5      5 0.2431444 0.03524174
smod <- cv.tune$best.model
spred <- predict(smod, te)
t_error <- mean(spred != te$Purchase)
cat("Support Vector Machine Polynomial Test Error Rate:", t_error, "\n")
## Support Vector Machine Polynomial Test Error Rate: 0.1567164
# Comparison table of tuned test error rates
models <- c("Linear kernel (support vector classifier)",
            "Radial kernel SVM",
            "Polynomial kernel SVM")
error <- c(0.1604478, 0.2201493, 0.1567164)
test_errors <- data.frame(Model = models, Test_Error_Rate = error)

kable(test_errors)

Model                                        Test_Error_Rate
Linear kernel (support vector classifier)          0.1604478
Radial kernel SVM                                  0.2201493
Polynomial kernel SVM                               0.1567164