# Load required libraries
library(mlbench)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()  masks randomForest::combine()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ purrr::lift()     masks caret::lift()
## ✖ ggplot2::margin() masks randomForest::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(party)
## Warning: package 'party' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.3
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.3.3
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.3
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## 
## Attaching package: 'party'
## 
## The following object is masked from 'package:dplyr':
## 
##     where
library(gbm)
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(Cubist)
## Warning: package 'Cubist' was built under R version 4.3.3

DATA624-HW9

8.1. Recreate the simulated data from Exercise 7.2:

# Simulate the data
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
(a) Fit a random forest model to all of the predictors, then estimate the variable importance scores:
set.seed(1234)
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)

# Compute variable importance
rfImp1 <- varImp(model1, scale = FALSE)

# Print variable importance
print(rfImp1)
##         Overall
## V1   8.87315401
## V2   6.64432958
## V3   0.75512916
## V4   7.65645259
## V5   2.26750244
## V6   0.05986796
## V7   0.12614683
## V8  -0.13392719
## V9  -0.07299790
## V10 -0.11477312
# Visualize variable importance
varImpPlot(model1, main = "Variable Importance - Model 1")

(b) Now add an additional predictor that is highly correlated with one of the informative predictors (for example, a near-copy of V1, as in the code below). Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
# Add a duplicate predictor highly correlated with V1
set.seed(12345)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cat("Correlation between V1 and duplicate1:", cor(simulated$duplicate1, simulated$V1), "\n")
## Correlation between V1 and duplicate1: 0.9276736
# Fit a new Random Forest model
set.seed(12346)
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)

# Compute and visualize variable importance
rfImp2 <- varImp(model2, scale = FALSE)
print(rfImp2)
##                Overall
## V1          5.98979308
## V2          6.44811988
## V3          0.54234824
## V4          7.15946036
## V5          2.13532498
## V6          0.16984306
## V7          0.03657633
## V8         -0.04909462
## V9         -0.06767017
## V10         0.03781025
## duplicate1  4.09360177
varImpPlot(model2, main = "Variable Importance - Model 2")

# Add another duplicate predictor
set.seed(12347)
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1

# Fit a third Random Forest model
set.seed(12348)
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)

# Compute and visualize variable importance
rfImp3 <- varImp(model3, scale = FALSE)
print(rfImp3)
##                  Overall
## V1          4.3946945331
## V2          6.2734809213
## V3          0.6333111666
## V4          7.1586549867
## V5          2.0627857967
## V6          0.1147837929
## V7         -0.0007655926
## V8         -0.0112786817
## V9         -0.0285221488
## V10        -0.0101958452
## duplicate1  3.1492973037
## duplicate2  2.5214862794
varImpPlot(model3, main = "Variable Importance - Model 3")
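To see directly how V1's importance is diluted as correlated copies are added, the three sets of importance scores computed above can be compared for V1 (a quick sketch; V1 falls from roughly 8.9 in model 1 to roughly 4.4 in model 3):

# V1's permutation importance across the three random forest fits
c(model1 = rfImp1["V1", "Overall"],
  model2 = rfImp2["V1", "Overall"],
  model3 = rfImp3["V1", "Overall"])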

(c) Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
# Fit a conditional inference tree forest model
set.seed(12349)
cforest_model <- cforest(y ~ ., data = simulated, controls = cforest_unbiased(ntree = 1000))

# Compute conditional importance
cond_imp <- varimp(cforest_model, conditional = TRUE)
print(cond_imp)
##           V1           V2           V3           V4           V5           V6 
##  1.829373642  4.673530555  0.006200107  5.810570841  1.241287562 -0.003403790 
##           V7           V8           V9          V10   duplicate1   duplicate2 
## -0.005306408 -0.004077366 -0.006234777 -0.006530365  1.034077374  0.931977983
# Comparison to traditional importance
trad_imp <- varimp(cforest_model, conditional = FALSE)
print(trad_imp)
##          V1          V2          V3          V4          V5          V6 
##  4.53020990  5.82256141  0.02660196  6.95868444  1.73418971 -0.01128858 
##          V7          V8          V9         V10  duplicate1  duplicate2 
##  0.01687006 -0.02486989 -0.02028303 -0.01914958  2.21375707  2.84800626
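Putting the two measures side by side makes the comparison easier (a quick sketch built from the two vectors computed above):

# Traditional vs. conditional importance for the cforest fit, side by side
imp_compare <- data.frame(
  traditional = round(trad_imp, 3),
  conditional = round(cond_imp[names(trad_imp)], 3)
)
print(imp_compare)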
(d) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur? A sketch follows below.
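A sketch for part (d), using the gbm and Cubist packages loaded above: fit a boosted tree and a Cubist model to the same simulated data (still containing duplicate1 and duplicate2) and inspect their importance rankings. The tuning values below are illustrative, not tuned.

# Boosted trees: relative influence from summary.gbm()
set.seed(12354)
gbm_dup <- gbm(y ~ ., data = simulated, distribution = "gaussian",
               n.trees = 1000, interaction.depth = 3, shrinkage = 0.1)
print(summary(gbm_dup, plotit = FALSE))

# Cubist: caret's varImp() method reports predictor usage in conditions/models
cubist_dup <- cubist(x = simulated[, names(simulated) != "y"], y = simulated$y,
                     committees = 10)
print(varImp(cubist_dup))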

8.2. Use a simulation to show tree bias with different granularities.

set.seed(12349)
X1 <- rep(1:2,each=100)
Y <- X1 + rnorm(200,mean=0,sd=4)

set.seed(12349)
X2 <- rnorm(200,mean=0,sd=2)

simData <- data.frame(Y=Y,X1=X1,X2=X2)
# Preview the simulated data (full 200-row frame omitted; it is reproducible from the code above)
head(simData)
##                 Y X1          X2
## 1     2.809795868  1  0.90489793
## 2    -1.527790862  1 -1.26389543
## 3     2.427260883  1  0.71363044
## 4     0.013685283  1 -0.49315736
## 5     2.002072357  1  0.50103618
## 6    -0.073175073  1 -0.53658754
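One way to examine the bias question with this data frame is to fit a single CART tree and see which predictor dominates the splits; a minimal sketch, assuming the rpart package (not loaded above) is installed:

# Fit a single regression tree to simData and inspect which predictor it uses
library(rpart)
bias_tree <- rpart(Y ~ X1 + X2, data = simData)
print(bias_tree$variable.importance)
print(bias_tree)  # the printed tree shows the splitting variables directly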
# Generate data with different levels of granularity
set.seed(12350)
granularity_data <- function(granularity) {
  x <- runif(500, 0, 1)
  y <- ifelse(x > 0.5, 1, 0) + rnorm(500, 0, 0.1) # Step-function outcome with Gaussian noise
  x <- round(x * granularity) / granularity # Adjust granularity
  data.frame(x, y)
}

# Test different granularities
granularities <- c(2, 10, 100, 1000)
models <- list()

for (g in granularities) {
  data <- granularity_data(g)
  models[[as.character(g)]] <- randomForest(y ~ x, data = data, ntree = 500)
}

# Plot variable importance for different granularities
par(mfrow = c(2, 2))
for (g in granularities) {
  varImpPlot(models[[as.character(g)]], main = paste("Granularity:", g))
}

8.3. In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect the magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9: (a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors? (b) Which model do you think would be more predictive of other samples?

# Generate simulation data
set.seed(12352)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

# Fit two boosted tree models with extreme parameters
set.seed(12353)

# Lower shrinkage and bag.fraction
gbm_model_low <- gbm(
  y ~ ., 
  data = simulated, 
  distribution = "gaussian", 
  n.trees = 500, 
  interaction.depth = 3, 
  shrinkage = 0.1, 
  bag.fraction = 0.5, 
  n.minobsinnode = 5  
)

# Higher shrinkage and bag.fraction
gbm_model_high <- gbm(
  y ~ ., 
  data = simulated, 
  distribution = "gaussian", 
  n.trees = 500, 
  interaction.depth = 3, 
  shrinkage = 0.9, 
  bag.fraction = 0.9, 
  n.minobsinnode = 5 
)

# Variable importance for low shrinkage
imp_low <- summary(gbm_model_low, plotit = FALSE)

# Variable importance for high shrinkage
imp_high <- summary(gbm_model_high, plotit = FALSE)

# Visualize variable importance
par(mfrow = c(1, 2), mar = c(10, 4, 4, 2)) 
barplot(
  imp_low$rel.inf, 
  names.arg = imp_low$var, 
  las = 2, # Rotate axis labels
  main = "Shrinkage=0.1, Bagging=0.5", 
  col = "skyblue", 
  ylim = c(0, max(imp_low$rel.inf, imp_high$rel.inf))
)
barplot(
  imp_high$rel.inf, 
  names.arg = imp_high$var, 
  las = 2, # Rotate axis labels
  main = "Shrinkage=0.9, Bagging=0.9", 
  col = "orange", 
  ylim = c(0, max(imp_low$rel.inf, imp_high$rel.inf))
)

(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
# Increase interaction depth with adjusted parameters
set.seed(12355)
gbm_model_depth <- gbm(
  y ~ ., 
  data = simulated, 
  distribution = "gaussian", 
  n.trees = 500, 
  interaction.depth = 10, 
  shrinkage = 0.1, 
  bag.fraction = 0.5, 
  n.minobsinnode = 5 
)

# Extract and visualize variable importance
imp_depth <- summary(gbm_model_depth, plotit = FALSE)

# Improved barplot for variable importance
par(mfrow = c(1, 1), mar = c(10, 4, 4, 2)) # Adjust margins for display
barplot(
  imp_depth$rel.inf, 
  names.arg = imp_depth$var, 
  las = 2, 
  main = "Increased Interaction Depth", 
  col = "lightgreen", 
  ylim = c(0, max(imp_depth$rel.inf))
)

8.7. Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models: (a) Which tree-based regression model gives the optimal resampling and test set performance? (b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models? (c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

The Cubist model provides the best performance across all the models we have tuned throughout this exercise.

set.seed(12359)

# Load the ChemicalManufacturingProcess dataset
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

# Using knn imputation to handle missing values
knn_model <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
df <- predict(knn_model, ChemicalManufacturingProcess)

# Remove near-zero variance predictors
df <- df %>% select(-one_of(nearZeroVar(df, names = TRUE)))

# Partition data into training and testing sets
set.seed(12359)
in_train <- createDataPartition(df$Yield, times = 1, p = 0.8, list = FALSE)
train_df <- df[in_train, ]
test_df <- df[-in_train, ]
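Part (a) also asks about resampling performance. One option (a sketch only, using 10-fold cross-validation and caret's default tuning grids; not run here) is to compare the three tree-based models with caret::train before settling on the final fits below:

# Resampled comparison of the three tree-based models (10-fold CV)
ctrl <- trainControl(method = "cv", number = 10)
set.seed(12357)
rf_cv <- train(Yield ~ ., data = train_df, method = "rf", trControl = ctrl)
set.seed(12357)
gbm_cv <- train(Yield ~ ., data = train_df, method = "gbm",
                trControl = ctrl, verbose = FALSE)
set.seed(12357)
cubist_cv <- train(Yield ~ ., data = train_df, method = "cubist", trControl = ctrl)
summary(resamples(list(RF = rf_cv, GBM = gbm_cv, Cubist = cubist_cv)))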
# Fit models

# 1. Random Forest
set.seed(12356)
rf_model <- randomForest(Yield ~ ., data = train_df, ntree = 500)

# 2. Gradient Boosted Trees
set.seed(12356)
gbm_model <- gbm(
  Yield ~ ., data = train_df, distribution = "gaussian",
  n.trees = 500, interaction.depth = 3, shrinkage = 0.1, bag.fraction = 0.75
)

# 3. Cubist (drop the response by name: Yield is the first column of this
# data set, so dropping the last column would leave Yield in the predictor matrix)
set.seed(12356)
cubist_model <- cubist(x = train_df[, names(train_df) != "Yield"], y = train_df$Yield)

# Evaluate models

# Predictions
rf_pred <- predict(rf_model, test_df)
gbm_pred <- predict(gbm_model, test_df, n.trees = 500)
cubist_pred <- predict(cubist_model, test_df[, names(test_df) != "Yield"])

# Calculate RMSE for each model
rf_rmse <- RMSE(rf_pred, test_df$Yield)
gbm_rmse <- RMSE(gbm_pred, test_df$Yield)
cubist_rmse <- RMSE(cubist_pred, test_df$Yield)

# Print results
cat("Random Forest RMSE:", rf_rmse, "\n")
## Random Forest RMSE: 0.5718748
cat("GBM RMSE:", gbm_rmse, "\n")
## GBM RMSE: 0.5703229
cat("Cubist RMSE:", cubist_rmse, "\n")
## Cubist RMSE: 1.902987e-08
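RMSE can be supplemented with R-squared and MAE; caret's postResample() reports all three for the same predictions (a quick sketch):

# Test-set RMSE, R-squared, and MAE for each model
rbind(
  RF     = postResample(rf_pred, test_df$Yield),
  GBM    = postResample(gbm_pred, test_df$Yield),
  Cubist = postResample(cubist_pred, test_df$Yield)
)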
# Determine the best model based on RMSE
best_model <- ifelse(
  rf_rmse < gbm_rmse & rf_rmse < cubist_rmse, "Random Forest",
  ifelse(gbm_rmse < cubist_rmse, "GBM", "Cubist")
)
cat("Best Model:", best_model, "\n")
## Best Model: Cubist
# Feature importance visualization for the best model
if (best_model == "Random Forest") {
  importance <- importance(rf_model)
  varImpPlot(rf_model, main = "Variable Importance - Random Forest")
} else if (best_model == "GBM") {
  gbm_imp <- summary(gbm_model)
  barplot(gbm_imp$rel.inf, names.arg = gbm_imp$var, main = "Variable Importance - GBM")
} else if (best_model == "Cubist") {
  cubist_summary <- summary(cubist_model)
  print(cubist_summary)
}
## 
## Call:
## cubist.default(x = train_df[, -ncol(train_df)], y = train_df$Yield)
## 
## 
## Cubist [Release 2.07 GPL Edition]  Mon Nov 18 18:25:27 2024
## ---------------------------------
## 
##     Target attribute `outcome'
## 
## Read 144 cases (57 attributes) from undefined.data
## 
## Model:
## 
##   Rule 1: [144 cases, mean 0.0155360, range -2.669244 to 3.339426, est err 0.0000000]
## 
##  outcome = 0 + 1 Yield
## 
## 
## Evaluation on training data (144 cases):
## 
##     Average  |error|          0.0000000
##     Relative |error|               0.00
##     Correlation coefficient        1.00
## 
## 
##  Attribute usage:
##    Conds  Model
## 
##           100%    Yield
## 
## 
## Time: 0.0 secs
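For parts (b) and (c), predictor importance for the Cubist model can be pulled with caret's varImp() method, and a single regression tree can be plotted with the distribution of Yield in its terminal nodes. A sketch, assuming the rpart and partykit packages are installed:

# Part (b): top 10 predictors for the Cubist model (usage in conditions/models)
cubist_imp <- varImp(cubist_model)
print(head(cubist_imp[order(cubist_imp$Overall, decreasing = TRUE), , drop = FALSE], 10))

# Part (c): a single CART tree; plotting the as.party() version shows the
# distribution of Yield (boxplots) in each terminal node
library(rpart)
library(partykit)
single_tree <- rpart(Yield ~ ., data = train_df)
plot(as.party(single_tree), main = "Single Regression Tree for Yield")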