library(AppliedPredictiveModeling)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(caTools)
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.3
library(lars)
library(MASS)
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
library(stats)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tsibble 1.1.5 ✔ feasts 0.4.1
## ✔ tsibbledata 0.4.1 ✔ fable 0.4.0
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ fabletools::MAE() masks caret::MAE()
## ✖ fabletools::RMSE() masks caret::RMSE()
## ✖ dplyr::select() masks MASS::select()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(fable)
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:fabletools':
##
## interpolate
library(lattice)
library(corrplot)
## corrplot 0.95 loaded
##
## Attaching package: 'corrplot'
##
## The following object is masked from 'package:pls':
##
## corrplot
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
##
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:datasets':
##
## sleep
library(mlbench)
library(kernlab)
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(pdp)
##
## Attaching package: 'pdp'
##
## The following object is masked from 'package:purrr':
##
## partial
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(ipred)
library(class)
library(partykit)
## Loading required package: libcoin
## Loading required package: mvtnorm
## Registered S3 method overwritten by 'inum':
## method from
## format.interval tsibble
library(rpart)
library(rpart.plot)
library(readxl)
library(corrplot)
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
## Loading required package: parallel
library(writexl)
#library(RWeka)
library(Cubist)
library(party)
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
##
## The following object is masked from 'package:kernlab':
##
## prior
##
## The following object is masked from 'package:fabletools':
##
## refit
##
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
##
## Attaching package: 'party'
##
## The following objects are masked from 'package:partykit':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
## node_terminal, varimp
##
## The following object is masked from 'package:fabletools':
##
## response
##
## The following object is masked from 'package:dplyr':
##
## where
Recreate the simulated data from Exercise 7.2.
#simulated data set created.
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
Fit a random forest model to all of the predictors and estimate the variable importance scores:
#random forest model made
model1 <- randomForest(y~., data = simulated,
importance = TRUE,
ntree=1000)
rfImp1 <- varImp(model1,scale = FALSE) #display the importance of predictors
print(rfImp1)
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
The random forest model did not significantly use the uninformative predictors V6 through V10. V3 had the lowest importance out of V1 through V5.
Add an additional predictor that is highly correlated with one of the informative predictors.
#adding an additional predictor, duplicate1, then fitting another rf model (model2)
set.seed(200)
simulated$duplicate1 <- simulated$V1+rnorm(200)*0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9497025
model2 <- randomForest(y~., data = simulated,
importance = TRUE,
ntree=1000)
rfImp2 <- varImp(model2,scale = FALSE) |> arrange(desc(Overall))
print(rfImp2)
## Overall
## V4 6.86363294
## V2 6.05937899
## V1 6.00709780
## duplicate1 4.43323167
## V5 2.19939891
## V3 0.58465293
## V6 0.10898039
## V10 0.09999339
## V9 0.06123662
## V7 0.06104207
## V8 -0.04059204
With the addition of another predictor highly correlated with V1, the importance of V1 decreased from about 8.7 to 6.0; however, the additional predictor (duplicate1) had the fourth-highest importance.
set.seed(200)
simulated$duplicate2 <- simulated$V1+rnorm(200)*0.1
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9497025
model3 <- randomForest(y~., data = simulated,
importance = TRUE,
ntree=1000)
rfImp3 <- varImp(model3,scale = FALSE) |> arrange(desc(Overall))
print(rfImp3)
## Overall
## V4 7.21616819
## V2 6.50242654
## V1 4.84857115
## duplicate2 2.94318033
## duplicate1 2.84168163
## V5 2.11631632
## V3 0.51808831
## V6 0.22066363
## V7 -0.02066815
## V9 -0.02908959
## V10 -0.03105942
## V8 -0.06964769
Adding a second correlated predictor (duplicate2) further decreases the importance of V1.
Use the cforest function from the party package to fit a random forest model using conditional inference trees.
#assuming a conditional forest is wanted; the additionally generated duplicate columns are removed before fitting it
head(simulated)
## V1 V2 V3 V4 V5 V6 V7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
## V8 V9 V10 y duplicate1 duplicate2
## 1 0.4214081 0.59111440 0.5886216 18.46398 0.5422481 0.5422481
## 2 0.8446239 0.92819306 0.7584008 16.09836 0.6064111 0.6064111
## 3 0.3495908 0.01759542 0.4441185 17.76165 0.6328339 0.6328339
## 4 0.7970260 0.68986918 0.4450716 13.78730 0.7468464 0.7468464
## 5 0.9038919 0.39696995 0.5500808 18.42984 0.6733070 0.6733070
## 6 0.6489177 0.53116033 0.9066182 20.85817 0.8278296 0.8278296
simulatedc <- simulated |> select(-duplicate1, -duplicate2)
head(simulatedc)
## V1 V2 V3 V4 V5 V6 V7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
## V8 V9 V10 y
## 1 0.4214081 0.59111440 0.5886216 18.46398
## 2 0.8446239 0.92819306 0.7584008 16.09836
## 3 0.3495908 0.01759542 0.4441185 17.76165
## 4 0.7970260 0.68986918 0.4450716 13.78730
## 5 0.9038919 0.39696995 0.5500808 18.42984
## 6 0.6489177 0.53116033 0.9066182 20.85817
set.seed(200)
cforesttree <- cforest(y~., data = simulatedc)
cfImp <- varImp(cforesttree)
print(rfImp1|> arrange(desc(Overall))) #original random forest importance
## Overall
## V1 8.732235404
## V4 7.615118809
## V2 6.415369387
## V5 2.023524577
## V3 0.763591825
## V6 0.165111172
## V7 -0.005961659
## V10 -0.074944788
## V9 -0.095292651
## V8 -0.166362581
print(cfImp|> arrange(desc(Overall))) #conditional inference trees random forest model predictor importance
## Overall
## V1 8.889688266
## V4 8.374714465
## V2 6.475678898
## V5 1.911482592
## V3 0.048199904
## V7 0.003470791
## V10 -0.017820271
## V8 -0.020945307
## V6 -0.026382693
## V9 -0.040833731
Both the random forest model and the conditional inference forest rank V1 through V5 as the most important predictors, in the same order, although the importance values differ slightly between the two models. For the uninformative predictors, the ordering changes between the models.
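As a further hedged check (not part of the original output), party::varimp() also offers a conditional importance measure that adjusts for correlated predictors such as duplicate1 and duplicate2. A minimal sketch, assuming the conditional forest is refit on the full simulated data with the duplicate columns included:
set.seed(200)
cforest_dup <- party::cforest(y ~ ., data = simulated,
                              controls = party::cforest_unbiased(ntree = 100)) #ntree kept small because conditional importance is slow
cond_imp <- party::varimp(cforest_dup, conditional = TRUE) #permutation importance computed conditional on correlated predictors
sort(cond_imp, decreasing = TRUE)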
Repeat the process with boosted trees and Cubist.
head(simulatedc)
## V1 V2 V3 V4 V5 V6 V7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
## V8 V9 V10 y
## 1 0.4214081 0.59111440 0.5886216 18.46398
## 2 0.8446239 0.92819306 0.7584008 16.09836
## 3 0.3495908 0.01759542 0.4441185 17.76165
## 4 0.7970260 0.68986918 0.4450716 13.78730
## 5 0.9038919 0.39696995 0.5500808 18.42984
## 6 0.6489177 0.53116033 0.9066182 20.85817
set.seed(200)
gbmModel <- gbm(y~.,data= simulatedc, distribution = "gaussian", n.trees = 1000)
summary(gbmModel)
## var rel.inf
## V4 V4 25.140661
## V1 V1 23.668702
## V2 V2 19.670210
## V5 V5 11.854998
## V3 V3 9.024607
## V7 V7 2.906671
## V6 V6 2.443473
## V8 V8 1.911299
## V9 V9 1.748566
## V10 V10 1.630814
#varImp() does not work on a model fit directly with gbm(); caret::train() with method = "gbm" is needed for varImp(), but summary() on the gbm object reports the relative influence
The first gbm model (gbmModel <- gbm(y ~ ., data = simulatedc, distribution = "gaussian", n.trees = 1000)) ranked V4, V1, V2, V5, V3 as the most important predictors. It agrees with the random forest model that V1 through V5 are the most important, except that V4 and V1 are swapped relative to the random forest.
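As a hedged aside, the relative influence can also be pulled from the gbm() fit without plotting; a minimal sketch using functions from the gbm package:
gbm_rel_inf <- summary(gbmModel, plotit = FALSE) #data frame of var and rel.inf, same numbers as the summary above
head(gbm_rel_inf)
relative.influence(gbmModel, n.trees = 1000) #raw (unnormalized) influence values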
gbmModel <- train(simulatedc |> select(-y), simulatedc$y, method = "gbm", distribution = "gaussian", verbose = FALSE)
gbmImp <- varImp(gbmModel)
print(gbmImp) #gbm model using train
## gbm variable importance
##
## Overall
## V4 100.0000
## V1 93.9413
## V2 75.8721
## V5 43.5964
## V3 27.2088
## V6 1.5621
## V7 1.4108
## V10 0.9427
## V8 0.5725
## V9 0.0000
The gbm model fit with train (gbmModel <- train(simulatedc |> select(-y), simulatedc$y, method = "gbm", distribution = "gaussian", verbose = FALSE)) had the same top five predictors as the previous gbm model, but the importance of V6 through V10 comes out in a completely different order.
print(rfImp1|> arrange(desc(Overall))) #original random forest
## Overall
## V1 8.732235404
## V4 7.615118809
## V2 6.415369387
## V5 2.023524577
## V3 0.763591825
## V6 0.165111172
## V7 -0.005961659
## V10 -0.074944788
## V9 -0.095292651
## V8 -0.166362581
For the top five predictors, V1 and V4 are again swapped between the gbm model and the random forest.
Now fit a Cubist model.
cubistmodel <- cubist(simulated |> select(-y), simulated$y)
varcubist <- varImp(cubistmodel)
summary(cubistmodel)
##
## Call:
## cubist.default(x = select(simulated, -y), y = simulated$y)
##
##
## Cubist [Release 2.07 GPL Edition] Sun Dec 15 20:53:17 2024
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (13 attributes) from undefined.data
##
## Model:
##
## Rule 1: [200 cases, mean 14.416183, range 3.55596 to 28.38167, est err 1.944664]
##
## outcome = 0.183529 + 8.9 V4 + 7.9 V1 + 7.1 V2 + 5.3 V5
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 2.022980
## Relative |error| 0.50
## Correlation coefficient 0.87
##
##
## Attribute usage:
## Conds Model
##
## 100% V1
## 100% V2
## 100% V4
## 100% V5
##
##
## Time: 0.0 secs
print(varcubist)
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicate1 0
## duplicate2 0
For the Cubist model the important predictors again come from V1 through V5: V1, V2, V4, and V5 all receive the same importance (50), while V3 and the remaining predictors are assigned little to no importance.
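A minimal sketch (not part of the original results) of how committees change the Cubist usage statistics; caret's varImp() for Cubist appears to average the percentage of rule conditions and model terms that use each predictor, so adding committees may pull more predictors into the rules:
cubist_comm <- cubist(simulated |> select(-y), simulated$y, committees = 10) #10 boosting-like committees instead of a single rule set
varImp(cubist_comm) |> arrange(desc(Overall)) |> head(10)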
Use a simulation to show tree bias with different levels of predictor granularity.
Granularity refers to how continuous or finely subdivided a predictor's values are.
set.seed(200)
#data frame of "predictors" x1-x4 holding the same values at different levels of precision
x1 <- runif(1000, min = 0, max = 37) #1000 values between 0 and 37 standard number of decimals
x2 <- round(x1,2) #two decimals
x3 <- round(x1,1) #one decimal
x4 <- round(x1,0) #no decimals
y <- rnorm(1000, mean = 10, sd = 3) #y is random noise around a mean; it is the value to be predicted and is unrelated to the predictors
#create the data frame
datag <- data.frame(
x1,
x2,
x3,
x4,
y
)
head(datag)
## x1 x2 x3 x4 y
## 1 19.74958 19.75 19.7 20 8.938970
## 2 21.59931 21.60 21.6 22 4.210759
## 3 21.81440 21.81 21.8 22 7.718135
## 4 25.56848 25.57 25.6 26 17.012294
## 5 24.69127 24.69 24.7 25 3.799206
## 6 31.05387 31.05 31.1 31 6.879548
rfmod <- randomForest(y~., data = datag,
importance = TRUE,
ntree=1000)
rfdatagImp <- varImp(rfmod,scale = FALSE) #display the importance of predictors
cor_matrix <- cor(datag)
#correlation matrix and plot to see the correlation between the predictors and y
corrplot(cor_matrix,title = "Correlation Plot", method = "square", tl.cex = 1, cl.cex = 1, type = "lower")
print(cor_matrix)
## x1 x2 x3 x4 y
## x1 1.00000000 0.99999996 0.99999613 0.99961594 0.03924738
## x2 0.99999996 1.00000000 0.99999607 0.99961611 0.03924198
## x3 0.99999613 0.99999607 1.00000000 0.99960708 0.03922193
## x4 0.99961594 0.99961611 0.99960708 1.00000000 0.03933069
## y 0.03924738 0.03924198 0.03922193 0.03933069 1.00000000
print(rfdatagImp |> arrange(desc(Overall)))
## Overall
## x1 6.201224
## x2 5.890305
## x3 4.463129
## x4 1.731427
x1, x2, x3, and x4 are the same values recorded to different degrees of precision (different numbers of decimal places), and y is normally distributed noise with no meaningful relationship to any of them. Nevertheless, the importance of the x variables is ranked by how many decimal places they retain: x1 has the most distinct values, so it appears the most "important." Trees are biased toward predictors with more distinct values (finer granularity), because such predictors offer more candidate split points. The extra digits also change the correlation with y very slightly.
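As an additional hedged check (not in the original output), a single CART tree grown with rpart on the same data shows the selection bias directly, and a conditional inference forest, which uses statistical tests to choose split variables, should be less affected:
set.seed(200)
single_tree <- rpart(y ~ ., data = datag, control = rpart.control(cp = 0, maxdepth = 5)) #cp = 0 forces splits even on pure noise
table(single_tree$frame$var) #how often each predictor (and "<leaf>") appears in the tree
set.seed(200)
cf_gran <- party::cforest(y ~ ., data = datag, controls = party::cforest_unbiased(ntree = 200))
sort(party::varimp(cf_gran), decreasing = TRUE) #expected to be more even across x1-x4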
In the figure from the text, the right-hand model uses a bagging fraction of 0.9 and a learning rate of 0.9; the left-hand model uses 0.1 for both. The bagging fraction determines the proportion of the training data used to fit each tree. The learning rate (shrinkage) scales down each tree's contribution to the ensemble: the higher the learning rate, the faster the model converges, while a smaller learning rate requires more trees and is more computationally expensive. The right-hand model uses 90% of the data to train each tree. Both a high learning rate and a high bagging fraction can lead to overfitting and poor generalization.
The model on the right concentrates its importance on just the first few predictors because both the learning rate and the bagging fraction are very high. With a high learning rate each tree contributes a large amount and fewer trees effectively drive the fit, and because 90% of the data is used to train each tree the trees are very similar, so the model relies on, and assigns importance to, fewer predictors.
The model on the left uses more trees and draws on more predictors because of its low learning rate, so the importance is distributed more evenly. It may run into problems if the data set is small, since each tree sees only 10% of the data.
I believe the model with the lower learning rate (the left one) would likely generalize better to new samples, although this depends on how well a 10% bagging fraction represents the data as a whole.
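A minimal sketch of the comparison, reusing the simulated data from above rather than the data behind the figure in the text (an assumption); the importance profile of the high-shrinkage, high-bagging-fraction fit is expected to drop off more steeply:
set.seed(200)
gbm_low <- gbm(y ~ ., data = simulatedc, distribution = "gaussian",
               n.trees = 1000, shrinkage = 0.1, bag.fraction = 0.1)
set.seed(200)
gbm_high <- gbm(y ~ ., data = simulatedc, distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.9, bag.fraction = 0.9)
summary(gbm_low, plotit = FALSE) #importance spread across more predictors (expected)
summary(gbm_high, plotit = FALSE) #importance concentrated on a few predictors (expected)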
The interaction depth sets the maximum number of splits in each tree, and therefore how many predictors a single tree can involve. A higher interaction depth produces more complex trees. I believe increasing the interaction depth would flatten the slope of the predictor importance plot, since each tree considers more predictors and their importance is spread more evenly.
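A similar hedged sketch for interaction depth, again reusing the simulated data as a stand-in; deeper trees are expected to spread the relative influence across more predictors:
set.seed(200)
gbm_d1 <- gbm(y ~ ., data = simulatedc, distribution = "gaussian", n.trees = 1000, interaction.depth = 1)
set.seed(200)
gbm_d10 <- gbm(y ~ ., data = simulatedc, distribution = "gaussian", n.trees = 1000, interaction.depth = 10)
summary(gbm_d1, plotit = FALSE) #stumps: steeper drop-off in relative influence (expected)
summary(gbm_d10, plotit = FALSE) #deeper trees: flatter importance profile (expected)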
Setup copied from assignment 8 (Exercise 7.5).
Load the data from Exercise 6.3, then impute, split, and preprocess (copied from assignment 7).
Try tree-based models on the chemical manufacturing process data.
data("ChemicalManufacturingProcess")
#the book mentions a processPredictors object, which is not found, so processPredictors is built from ChemicalManufacturingProcess below
apropos('processPredictors')
## character(0)
#how many na values?
sum(is.na(ChemicalManufacturingProcess))
## [1] 106
Impute the predictors. The impute package is no longer available on CRAN, so VIM is used instead.
#imputed with K nearest neighbors set to 5.
imputeCMP <- kNN(ChemicalManufacturingProcess, k = 5)
# Remove the "_imp" columns
imputeCMP <- imputeCMP[, !grepl("_imp$", colnames(imputeCMP))] #remove the boolean columns which indicate which values were imputed.
#imputation removed nas
sum(is.na(imputeCMP))
## [1] 0
Imputed with the 5 nearest neighbors.
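A hedged alternative (not run here): caret's preProcess() can also do k-nearest-neighbor imputation, though it centers and scales the predictors as a side effect:
pp_knn <- preProcess(ChemicalManufacturingProcess, method = "knnImpute", k = 5) #centers and scales before imputing
cmp_knn <- predict(pp_knn, ChemicalManufacturingProcess)
sum(is.na(cmp_knn)) #should be 0 after imputation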
Split the data into training and test sets.
#created processPredictors since it does not seem to be available after loading the data
#removed the output and kept the predictors
processPredictors <- select(imputeCMP, -"Yield")
set.seed(987654321)
#splitting the chemical manufacturing data into a 75% training and a 25% test set
#select indices to use for extracting the training data
train_indice <- createDataPartition(imputeCMP$Yield, p = 0.75, list = FALSE)
# use the indices to select the data
train_cmp <- imputeCMP[train_indice, ] # use as y train_cmp$Yield
test_cmp <- imputeCMP[-train_indice, ] # use test_cmp$Yield as y for test
train_cmpp <- processPredictors[train_indice, ] #use as x training
test_cmpp <- processPredictors[-train_indice, ] #use as x for test
#random forest
set.seed(200)
rfmc <- randomForest(Yield ~ ., data = train_cmp, #train on chemical manufacturing
importance = TRUE,
ntree=1000)
imprfmc <- varImp(rfmc,scale = FALSE) #display the importance of predictors
print(imprfmc |> arrange(desc(Overall)))
## Overall
## ManufacturingProcess32 1.012860e+00
## BiologicalMaterial03 4.220825e-01
## ManufacturingProcess13 4.038907e-01
## BiologicalMaterial06 2.718733e-01
## BiologicalMaterial12 2.551816e-01
## BiologicalMaterial11 2.343858e-01
## ManufacturingProcess17 1.584023e-01
## ManufacturingProcess09 1.490706e-01
## BiologicalMaterial02 1.442930e-01
## BiologicalMaterial08 1.178468e-01
## ManufacturingProcess36 1.077791e-01
## BiologicalMaterial04 8.728793e-02
## ManufacturingProcess31 8.167892e-02
## BiologicalMaterial05 6.963948e-02
## BiologicalMaterial01 6.826301e-02
## ManufacturingProcess28 6.116692e-02
## BiologicalMaterial09 4.537180e-02
## ManufacturingProcess11 3.557205e-02
## ManufacturingProcess02 3.382591e-02
## ManufacturingProcess27 3.329441e-02
## ManufacturingProcess01 3.065405e-02
## ManufacturingProcess18 2.845910e-02
## ManufacturingProcess34 2.562197e-02
## ManufacturingProcess30 2.532765e-02
## ManufacturingProcess06 2.458841e-02
## ManufacturingProcess39 2.404227e-02
## ManufacturingProcess33 2.308155e-02
## BiologicalMaterial10 2.118960e-02
## ManufacturingProcess29 1.948688e-02
## ManufacturingProcess24 1.807309e-02
## ManufacturingProcess20 1.767054e-02
## ManufacturingProcess26 1.722736e-02
## ManufacturingProcess19 1.481479e-02
## ManufacturingProcess42 1.208187e-02
## ManufacturingProcess16 1.048674e-02
## ManufacturingProcess43 9.458261e-03
## ManufacturingProcess22 7.407845e-03
## ManufacturingProcess10 6.576829e-03
## ManufacturingProcess37 6.084634e-03
## ManufacturingProcess15 6.081916e-03
## ManufacturingProcess12 5.364139e-03
## ManufacturingProcess04 4.737409e-03
## ManufacturingProcess45 4.691435e-03
## ManufacturingProcess25 4.541978e-03
## ManufacturingProcess03 3.666551e-03
## ManufacturingProcess05 2.027105e-03
## ManufacturingProcess35 1.736913e-03
## ManufacturingProcess41 1.396440e-03
## ManufacturingProcess14 1.276757e-03
## BiologicalMaterial07 1.153417e-03
## ManufacturingProcess40 5.295053e-07
## ManufacturingProcess07 -3.904346e-04
## ManufacturingProcess21 -9.129087e-04
## ManufacturingProcess08 -9.592261e-04
## ManufacturingProcess38 -1.339809e-03
## ManufacturingProcess23 -2.360924e-03
## ManufacturingProcess44 -4.103820e-03
summary(rfmc)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 132 -none- numeric
## mse 1000 -none- numeric
## rsq 1000 -none- numeric
## oob.times 132 -none- numeric
## importance 114 -none- numeric
## importanceSD 57 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 132 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
rfmcPred <- predict(rfmc, newdata= test_cmpp) #predict yield based on test data
#the function postResample can be used to get the test set performance values
postResample(pred= rfmcPred, obs=test_cmp$Yield)
## RMSE Rsquared MAE
## 1.2310348 0.5521820 0.9732442
#conditional inference trees
set.seed(200)
cforestc <- cforest(Yield ~ ., data = train_cmp) #conditional inference tree model
cfcImp <- varImp(cforestc)
print(cfcImp |> arrange(desc(Overall)))
## Overall
## ManufacturingProcess32 4.174245e-01
## ManufacturingProcess13 3.080011e-01
## ManufacturingProcess36 2.051168e-01
## ManufacturingProcess09 2.036544e-01
## BiologicalMaterial03 1.797200e-01
## BiologicalMaterial06 1.560211e-01
## ManufacturingProcess17 1.280276e-01
## BiologicalMaterial02 1.200364e-01
## BiologicalMaterial12 1.165816e-01
## BiologicalMaterial01 6.890231e-02
## BiologicalMaterial04 6.382325e-02
## ManufacturingProcess33 5.632087e-02
## ManufacturingProcess06 5.083854e-02
## BiologicalMaterial08 4.996544e-02
## BiologicalMaterial11 4.743727e-02
## ManufacturingProcess28 3.630546e-02
## ManufacturingProcess12 3.421430e-02
## ManufacturingProcess31 3.384940e-02
## ManufacturingProcess11 2.084346e-02
## ManufacturingProcess29 1.990521e-02
## ManufacturingProcess20 1.496611e-02
## ManufacturingProcess10 1.175473e-02
## BiologicalMaterial10 1.059535e-02
## ManufacturingProcess30 9.745067e-03
## BiologicalMaterial05 8.732380e-03
## ManufacturingProcess35 8.456431e-03
## ManufacturingProcess27 7.375510e-03
## ManufacturingProcess15 7.322052e-03
## BiologicalMaterial09 6.382885e-03
## ManufacturingProcess24 5.815355e-03
## ManufacturingProcess21 5.741280e-03
## ManufacturingProcess26 5.413251e-03
## ManufacturingProcess34 4.467500e-03
## ManufacturingProcess42 4.252368e-03
## ManufacturingProcess25 3.512281e-03
## ManufacturingProcess39 3.400005e-03
## ManufacturingProcess18 3.279742e-03
## ManufacturingProcess45 2.590559e-03
## ManufacturingProcess02 2.129964e-03
## ManufacturingProcess19 1.825732e-03
## ManufacturingProcess08 1.535731e-03
## ManufacturingProcess16 1.462225e-03
## ManufacturingProcess14 1.417451e-03
## ManufacturingProcess37 9.621570e-04
## ManufacturingProcess05 9.470855e-04
## ManufacturingProcess43 5.474098e-04
## BiologicalMaterial07 0.000000e+00
## ManufacturingProcess44 -9.725993e-05
## ManufacturingProcess03 -2.214930e-04
## ManufacturingProcess23 -3.013034e-04
## ManufacturingProcess04 -6.461169e-04
## ManufacturingProcess07 -6.614243e-04
## ManufacturingProcess40 -1.112043e-03
## ManufacturingProcess38 -1.241184e-03
## ManufacturingProcess41 -1.285453e-03
## ManufacturingProcess01 -1.403051e-03
## ManufacturingProcess22 -3.325807e-03
summary(cforestc)
## Length Class Mode
## 1 RandomForest S4
cforestcPred <- predict(cforestc, newdata= test_cmpp) #predict yield based on test data
#the function postResample can be used to get the test set performance values
postResample(pred= cforestcPred, obs=test_cmp$Yield)
## RMSE Rsquared MAE
## 1.3948021 0.4153877 1.1582258
set.seed(200)
gbmModelc <- train(x = train_cmpp, y = train_cmp$Yield, method = "gbm", distribution = "gaussian", verbose = FALSE)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
gbmcImp<- varImp(gbmModelc)
print(gbmcImp) #gbm model using train
## gbm variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess13 51.332
## ManufacturingProcess09 38.347
## BiologicalMaterial03 30.048
## BiologicalMaterial12 25.757
## ManufacturingProcess36 19.485
## ManufacturingProcess17 15.612
## ManufacturingProcess31 15.056
## BiologicalMaterial09 13.661
## ManufacturingProcess37 10.152
## BiologicalMaterial08 9.834
## ManufacturingProcess05 9.599
## ManufacturingProcess06 9.222
## ManufacturingProcess03 8.645
## ManufacturingProcess27 6.848
## BiologicalMaterial06 6.173
## ManufacturingProcess30 5.731
## ManufacturingProcess24 5.469
## BiologicalMaterial11 5.368
## BiologicalMaterial05 5.174
gbmModelcPred <- predict(gbmModelc, newdata= test_cmpp) #predict yield based on test data
#the function postResample can be used to get the test set performance values
postResample(pred= gbmModelcPred, obs=test_cmp$Yield)
## RMSE Rsquared MAE
## 1.2188392 0.5772398 0.9652080
cubistmodelc <- cubist(train_cmpp, train_cmp$Yield)
varcubistc <- varImp(cubistmodelc)
summary(cubistmodelc)
##
## Call:
## cubist.default(x = train_cmpp, y = train_cmp$Yield)
##
##
## Cubist [Release 2.07 GPL Edition] Sun Dec 15 20:53:22 2024
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 132 cases (58 attributes) from undefined.data
##
## Model:
##
## Rule 1: [34 cases, mean 38.419, range 35.25 to 40.59, est err 0.510]
##
## if
## BiologicalMaterial11 <= 145.07
## ManufacturingProcess13 > 34.3
## ManufacturingProcess32 <= 159
## then
## outcome = -20.749 - 1.55 ManufacturingProcess13
## + 0.0182 ManufacturingProcess18 + 11.6 ManufacturingProcess34
## - 0.102 BiologicalMaterial03 + 0.04 ManufacturingProcess09
## + 0.008 ManufacturingProcess32 + 0.013 ManufacturingProcess01
##
## Rule 2: [22 cases, mean 39.763, range 37.42 to 43.12, est err 0.724]
##
## if
## ManufacturingProcess13 <= 34.3
## ManufacturingProcess32 <= 159
## then
## outcome = 56.356 - 1.46 ManufacturingProcess13
## + 0.114 ManufacturingProcess32 + 0.26 ManufacturingProcess01
## + 0.2 ManufacturingProcess09 + 3 ManufacturingProcess34
## - 0.23 ManufacturingProcess37 - 0.009 ManufacturingProcess35
##
## Rule 3: [24 cases, mean 39.873, range 38.35 to 42.58, est err 0.671]
##
## if
## BiologicalMaterial11 > 145.07
## ManufacturingProcess13 > 34.3
## ManufacturingProcess32 <= 159
## then
## outcome = 34.189 - 0.135 BiologicalMaterial03
## + 0.14 ManufacturingProcess09 + 0.035 ManufacturingProcess32
## + 0.071 ManufacturingProcess01 + 2.2 ManufacturingProcess34
## - 0.16 ManufacturingProcess37 - 0.006 ManufacturingProcess35
##
## Rule 4: [52 cases, mean 41.685, range 38.95 to 46.34, est err 0.864]
##
## if
## ManufacturingProcess32 > 159
## then
## outcome = -7.013 + 0.00681 ManufacturingProcess26
## - 0.57 ManufacturingProcess17 + 0.102 ManufacturingProcess32
## + 0.22 ManufacturingProcess09
##
##
## Evaluation on training data (132 cases):
##
## Average |error| 0.691
## Relative |error| 0.46
## Correlation coefficient 0.88
##
##
## Attribute usage:
## Conds Model
##
## 100% 100% ManufacturingProcess32
## 61% 42% ManufacturingProcess13
## 44% BiologicalMaterial11
## 100% ManufacturingProcess09
## 61% ManufacturingProcess01
## 61% ManufacturingProcess34
## 44% BiologicalMaterial03
## 39% ManufacturingProcess17
## 39% ManufacturingProcess26
## 35% ManufacturingProcess35
## 35% ManufacturingProcess37
## 26% ManufacturingProcess18
##
##
## Time: 0.0 secs
print(varcubistc)
## Overall
## ManufacturingProcess32 100.0
## ManufacturingProcess13 51.5
## BiologicalMaterial11 22.0
## ManufacturingProcess09 50.0
## ManufacturingProcess01 30.5
## ManufacturingProcess34 30.5
## BiologicalMaterial03 22.0
## ManufacturingProcess17 19.5
## ManufacturingProcess26 19.5
## ManufacturingProcess35 17.5
## ManufacturingProcess37 17.5
## ManufacturingProcess18 13.0
## BiologicalMaterial01 0.0
## BiologicalMaterial02 0.0
## BiologicalMaterial04 0.0
## BiologicalMaterial05 0.0
## BiologicalMaterial06 0.0
## BiologicalMaterial07 0.0
## BiologicalMaterial08 0.0
## BiologicalMaterial09 0.0
## BiologicalMaterial10 0.0
## BiologicalMaterial12 0.0
## ManufacturingProcess02 0.0
## ManufacturingProcess03 0.0
## ManufacturingProcess04 0.0
## ManufacturingProcess05 0.0
## ManufacturingProcess06 0.0
## ManufacturingProcess07 0.0
## ManufacturingProcess08 0.0
## ManufacturingProcess10 0.0
## ManufacturingProcess11 0.0
## ManufacturingProcess12 0.0
## ManufacturingProcess14 0.0
## ManufacturingProcess15 0.0
## ManufacturingProcess16 0.0
## ManufacturingProcess19 0.0
## ManufacturingProcess20 0.0
## ManufacturingProcess21 0.0
## ManufacturingProcess22 0.0
## ManufacturingProcess23 0.0
## ManufacturingProcess24 0.0
## ManufacturingProcess25 0.0
## ManufacturingProcess27 0.0
## ManufacturingProcess28 0.0
## ManufacturingProcess29 0.0
## ManufacturingProcess30 0.0
## ManufacturingProcess31 0.0
## ManufacturingProcess33 0.0
## ManufacturingProcess36 0.0
## ManufacturingProcess38 0.0
## ManufacturingProcess39 0.0
## ManufacturingProcess40 0.0
## ManufacturingProcess41 0.0
## ManufacturingProcess42 0.0
## ManufacturingProcess43 0.0
## ManufacturingProcess44 0.0
## ManufacturingProcess45 0.0
cubistmodelcPred <- predict(cubistmodelc, newdata= test_cmpp) #predict yield based on test data
#the function postResample can be used to get the test set performance values
postResample(pred= cubistmodelcPred, obs=test_cmp$Yield)
## RMSE Rsquared MAE
## 1.4618324 0.4133779 1.0583084
# the RMSE and Rsquared for the predictions of the tree models
chemical_results<- rbind(
randomforestc = postResample(pred= rfmcPred, obs= test_cmp$Yield),
conditionalforestc = postResample(pred = cforestcPred, obs = test_cmp$Yield),
gboostedc = postResample(pred = gbmModelcPred, obs = test_cmp$Yield),
cubistc = postResample(pred= cubistmodelcPred, obs = test_cmp$Yield)
)
# Convert to a data frame
chemicalresults <- as.data.frame(chemical_results)
# see RMSE and R^2 for chapter models
print(chemicalresults)
## RMSE Rsquared MAE
## randomforestc 1.231035 0.5521820 0.9732442
## conditionalforestc 1.394802 0.4153877 1.1582258
## gboostedc 1.218839 0.5772398 0.9652080
## cubistc 1.461832 0.4133779 1.0583084
The gradient boosting machine (GBM) model provides the best predictions, with the lowest RMSE (1.22) and the highest R-squared (0.58) on the test set.
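As an optional next step (a sketch, not part of the reported results), the GBM could be tuned more deliberately with repeated cross-validation and an explicit grid, which might improve the test-set performance further:
set.seed(200)
gbm_grid <- expand.grid(n.trees = c(500, 1000),
                        interaction.depth = c(1, 3, 5),
                        shrinkage = c(0.01, 0.1),
                        n.minobsinnode = 10)
gbm_tuned <- train(x = train_cmpp, y = train_cmp$Yield,
                   method = "gbm", distribution = "gaussian",
                   tuneGrid = gbm_grid,
                   trControl = trainControl(method = "repeatedcv", number = 10, repeats = 3),
                   verbose = FALSE)
postResample(pred = predict(gbm_tuned, newdata = test_cmpp), obs = test_cmp$Yield)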
Which predictors are most important for the optimal model?
print(gbmcImp)
## gbm variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess13 51.332
## ManufacturingProcess09 38.347
## BiologicalMaterial03 30.048
## BiologicalMaterial12 25.757
## ManufacturingProcess36 19.485
## ManufacturingProcess17 15.612
## ManufacturingProcess31 15.056
## BiologicalMaterial09 13.661
## ManufacturingProcess37 10.152
## BiologicalMaterial08 9.834
## ManufacturingProcess05 9.599
## ManufacturingProcess06 9.222
## ManufacturingProcess03 8.645
## ManufacturingProcess27 6.848
## BiologicalMaterial06 6.173
## ManufacturingProcess30 5.731
## ManufacturingProcess24 5.469
## BiologicalMaterial11 5.368
## BiologicalMaterial05 5.174
#top predictors for gradient boosting machine model
From assignment 7, the top ten predictors of the best linear model (the latent variable model, LVM) were:
ManufacturingProcess32 100.0000000
ManufacturingProcess13 92.4708233
ManufacturingProcess36 91.3473135
ManufacturingProcess09 84.3158605
ManufacturingProcess17 81.7846579
BiologicalMaterial02 70.6214136
BiologicalMaterial06 67.8114973
BiologicalMaterial03 67.0055935
BiologicalMaterial08 63.7959965
ManufacturingProcess06 63.2881421
The nonlinear MARS model importance (nsubsets, gcv, rss) was:
ManufacturingProcess32 15 100.0 100.0
ManufacturingProcess13 14 68.6 72.0
ManufacturingProcess21 11 37.9 45.1
ManufacturingProcess28 11 37.9 45.1
ManufacturingProcess33 9 26.3 35.3
BiologicalMaterial03 7 18.0 28.1
ManufacturingProcess39 5 16.3 23.8
ManufacturingProcess31 4 14.6 21.2
ManufacturingProcess37 2 2.5 12.6
All models agree that ManufacturingProcess32 and ManufacturingProcess13 are the top two predictors. The only biological material that appears in the top ten for all of the optimal models is BiologicalMaterial03. In the gradient boosted machine (GBM) model and the nonlinear MARS model the manufacturing processes dominate; only the linear LVM has four biological materials in its top ten. ManufacturingProcess36 appears in the top ten for the GBM and the LVM but not for MARS. ManufacturingProcess09 and ManufacturingProcess17 are in the top ten for both the LVM and the GBM, and MARS shares ManufacturingProcess31 and ManufacturingProcess37 with the GBM's top ten.
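A small sketch to line the rankings up programmatically (the LVM top ten is copied from the assignment 7 output listed above; the GBM ranking comes from gbmcImp):
top10 <- function(imp) rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10] #helper to pull the ten largest importances
gbm_top <- top10(gbmcImp$importance)
lvm_top <- c("ManufacturingProcess32", "ManufacturingProcess13", "ManufacturingProcess36",
             "ManufacturingProcess09", "ManufacturingProcess17", "BiologicalMaterial02",
             "BiologicalMaterial06", "BiologicalMaterial03", "BiologicalMaterial08",
             "ManufacturingProcess06") #copied from the assignment 7 listing above
intersect(gbm_top, lvm_top) #predictors shared by the GBM and the linear model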
#unfortunately the gradient boosted machine model is hard to visualize directly; because of how a gbm is constructed, terminal nodes are labeled with numbers rather than predictors, so their meaning is difficult to interpret
#can extract only some information about the model
# optimal number of trees
optimal_tree <- gbmModelc$finalModel$n.trees # Optimal number of trees
# Terminal node predictions for optimal number of trees.
tree <- pretty.gbm.tree(gbmModelc$finalModel, i.tree = optimal_tree - 1) #index starts with zero for i.tree in pretty.gbm.tree
print(tree) #shows the structure of the last tree in the boosted ensemble
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight
## 0 33 4.500000000 1 2 9 0.7782333 66
## 1 -1 -0.009040158 -1 -1 -1 0.0000000 31
## 2 48 0.850000000 3 4 8 1.2024464 35
## 3 -1 -0.012943422 -1 -1 -1 0.0000000 12
## 4 41 9.000000000 5 6 7 1.1008903 23
## 5 -1 0.005159182 -1 -1 -1 0.0000000 12
## 6 -1 0.048956648 -1 -1 -1 0.0000000 11
## 7 -1 0.026105796 -1 -1 -1 0.0000000 23
## 8 -1 0.012717493 -1 -1 -1 0.0000000 35
## 9 -1 0.002497990 -1 -1 -1 0.0000000 66
## Prediction
## 0 0.002497990
## 1 -0.009040158
## 2 0.012717493
## 3 -0.012943422
## 4 0.026105796
## 5 0.005159182
## 6 0.048956648
## 7 0.026105796
## 8 0.012717493
## 9 0.002497990
#can plot how certain predictors affect the yield based on the gbm model.
partial_plot <- partial(gbmModelc, pred.var = "ManufacturingProcess32", train = train_cmpp)
plotPartial(partial_plot)
#plot of the predicted values vs the correct yield values
plot(test_cmp$Yield, gbmModelcPred,
main = "GBM Predicted vs True Yield",
xlab = "True Yield",
ylab = "GBM Predicted Yield"
)
abline(a = 0, b = 1, col = "red") # The line of perfect predictions
The table above shows the structure of the last tree in the GBM model fit with the optimal number of trees, and the partial dependence plot shows how ManufacturingProcess32 affects the predicted yield in the GBM. The GBM predictions can also be plotted against the actual yields of the test data set, with a line of perfect predictions drawn on top.
If a single tree could be visualized directly, it would show the order in which the predictors are used and the split thresholds at each node, which would provide insight into how the split limits on the predictors relate to the yield.
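As an illustrative stand-in (an assumption, since the boosted ensemble itself cannot be drawn as one tree), a single CART tree fit to the same training data can be plotted with rpart.plot to give the kind of split-by-split view described above:
set.seed(200)
single_cart <- rpart(Yield ~ ., data = train_cmp) #one regression tree on the chemical training data
rpart.plot(single_cart, main = "Single regression tree for Yield (illustrative only)")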