library(AppliedPredictiveModeling)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(caTools)
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.3
library(lars)
library(MASS)
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
library(stats)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tsibble 1.1.5 ✔ feasts 0.4.1
## ✔ tsibbledata 0.4.1 ✔ fable 0.4.0
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ fabletools::MAE() masks caret::MAE()
## ✖ fabletools::RMSE() masks caret::RMSE()
## ✖ dplyr::select() masks MASS::select()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(fable)
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:fabletools':
##
## interpolate
library(lattice)
library(corrplot)
## corrplot 0.95 loaded
##
## Attaching package: 'corrplot'
##
## The following object is masked from 'package:pls':
##
## corrplot
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
##
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:datasets':
##
## sleep
library(mlbench)
library(kernlab)
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(pdp)
##
## Attaching package: 'pdp'
##
## The following object is masked from 'package:purrr':
##
## partial
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(ipred)
library(class)
library(partykit)
## Loading required package: libcoin
## Loading required package: mvtnorm
## Registered S3 method overwritten by 'inum':
## method from
## format.interval tsibble
library(rpart)
library(rpart.plot)
library(readxl)
library(corrplot)
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
## Loading required package: parallel
library(writexl)
#library(RWeka)
library(Cubist)
library(party)
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
##
## The following object is masked from 'package:kernlab':
##
## prior
##
## The following object is masked from 'package:fabletools':
##
## refit
##
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
##
## Attaching package: 'party'
##
## The following objects are masked from 'package:partykit':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
## node_terminal, varimp
##
## The following object is masked from 'package:fabletools':
##
## response
##
## The following object is masked from 'package:dplyr':
##
## where
Recreate the simulated data from Exercise 7.2.
#simulated data set created.
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
Fit a random forest model to all of the predictors and estimate the variable importance scores:
#random forest model made
model1 <- randomForest(y~., data = simulated,
importance = TRUE,
ntree=1000)
rfImp1 <- varImp(model1,scale = FALSE) #display the importance of predictors
print(rfImp1)
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
The random forest model did not significantly use the uninformative predictors V6 through V10. V3 had the lowest importance out of V1 through V5.
Add an additional predictor that is highly correlated with one of the informative predictors.
#adding an additional predictor, duplicate1, then fitting another rf model (model2)
set.seed(200)
simulated$duplicate1 <- simulated$V1+rnorm(200)*0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9497025
model2 <- randomForest(y~., data = simulated,
importance = TRUE,
ntree=1000)
rfImp2 <- varImp(model2,scale = FALSE) |> arrange(desc(Overall))
print(rfImp2)
## Overall
## V4 6.86363294
## V2 6.05937899
## V1 6.00709780
## duplicate1 4.43323167
## V5 2.19939891
## V3 0.58465293
## V6 0.10898039
## V10 0.09999339
## V9 0.06123662
## V7 0.06104207
## V8 -0.04059204
With the addition of another predictor highly correlated with V1, the importance of V1 decreased from about 8.7 to 6.0; however, the additional predictor (duplicate1) had the fourth-highest importance.
set.seed(200)
simulated$duplicate2 <- simulated$V1+rnorm(200)*0.1
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9497025
model3 <- randomForest(y~., data = simulated,
importance = TRUE,
ntree=1000)
rfImp3 <- varImp(model3,scale = FALSE) |> arrange(desc(Overall))
print(rfImp3)
## Overall
## V4 7.21616819
## V2 6.50242654
## V1 4.84857115
## duplicate2 2.94318033
## duplicate1 2.84168163
## V5 2.11631632
## V3 0.51808831
## V6 0.22066363
## V7 -0.02066815
## V9 -0.02908959
## V10 -0.03105942
## V8 -0.06964769
Adding a second correlated predictor (duplicate2) further decreases the importance of V1.
Use the cforest function from the party package to fit a random forest model using conditional inference trees.
#assuming a conditional forest is wanted; the additionally generated duplicate columns are removed before fitting it
head(simulated)
## V1 V2 V3 V4 V5 V6 V7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
## V8 V9 V10 y duplicate1 duplicate2
## 1 0.4214081 0.59111440 0.5886216 18.46398 0.5422481 0.5422481
## 2 0.8446239 0.92819306 0.7584008 16.09836 0.6064111 0.6064111
## 3 0.3495908 0.01759542 0.4441185 17.76165 0.6328339 0.6328339
## 4 0.7970260 0.68986918 0.4450716 13.78730 0.7468464 0.7468464
## 5 0.9038919 0.39696995 0.5500808 18.42984 0.6733070 0.6733070
## 6 0.6489177 0.53116033 0.9066182 20.85817 0.8278296 0.8278296
simulatedc <- simulated |> select(-duplicate1, -duplicate2)
head(simulatedc)
## V1 V2 V3 V4 V5 V6 V7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
## V8 V9 V10 y
## 1 0.4214081 0.59111440 0.5886216 18.46398
## 2 0.8446239 0.92819306 0.7584008 16.09836
## 3 0.3495908 0.01759542 0.4441185 17.76165
## 4 0.7970260 0.68986918 0.4450716 13.78730
## 5 0.9038919 0.39696995 0.5500808 18.42984
## 6 0.6489177 0.53116033 0.9066182 20.85817
set.seed(200)
cforesttree <- cforest(y~., data = simulatedc)
cfImp <- varImp(cforesttree)
print(rfImp1|> arrange(desc(Overall))) #original random forest importance
## Overall
## V1 8.732235404
## V4 7.615118809
## V2 6.415369387
## V5 2.023524577
## V3 0.763591825
## V6 0.165111172
## V7 -0.005961659
## V10 -0.074944788
## V9 -0.095292651
## V8 -0.166362581
print(cfImp|> arrange(desc(Overall))) #conditional inference trees random forest model predictor importance
## Overall
## V1 8.889688266
## V4 8.374714465
## V2 6.475678898
## V5 1.911482592
## V3 0.048199904
## V7 0.003470791
## V10 -0.017820271
## V8 -0.020945307
## V6 -0.026382693
## V9 -0.040833731
Both the random forest model and the conditional inference forest rank V1 through V5 as the most important predictors, in the same order, although the importance values differ slightly between the two models. For the uninformative predictors, the ordering changes between the models.
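As a further hedged check (not part of the original output), party::varimp() also offers a conditional importance measure that adjusts for correlated predictors such as duplicate1 and duplicate2. A minimal sketch, assuming the conditional forest is refit on the full simulated data with the duplicate columns included:
set.seed(200)
cforest_dup <- party::cforest(y ~ ., data = simulated,
                              controls = party::cforest_unbiased(ntree = 100)) #ntree kept small because conditional importance is slow
cond_imp <- party::varimp(cforest_dup, conditional = TRUE) #permutation importance computed conditional on correlated predictors
sort(cond_imp, decreasing = TRUE)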
Repeat the process with boosted trees and Cubist.
head(simulatedc)
## V1 V2 V3 V4 V5 V6 V7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
## V8 V9 V10 y
## 1 0.4214081 0.59111440 0.5886216 18.46398
## 2 0.8446239 0.92819306 0.7584008 16.09836
## 3 0.3495908 0.01759542 0.4441185 17.76165
## 4 0.7970260 0.68986918 0.4450716 13.78730
## 5 0.9038919 0.39696995 0.5500808 18.42984
## 6 0.6489177 0.53116033 0.9066182 20.85817
set.seed(200)
gbmModel <- gbm(y~.,data= simulatedc, distribution = "gaussian", n.trees = 1000)
summary(gbmModel)
## var rel.inf
## V4 V4 25.140661
## V1 V1 23.668702
## V2 V2 19.670210
## V5 V5 11.854998
## V3 V3 9.024607
## V7 V7 2.906671
## V6 V6 2.443473
## V8 V8 1.911299
## V9 V9 1.748566
## V10 V10 1.630814
#varImp() does not work on a model fit directly with gbm(); caret::train() with method = "gbm" is needed for varImp(), but summary() on the gbm object reports the relative influence
The first gbm model (gbmModel <- gbm(y ~ ., data = simulatedc, distribution = "gaussian", n.trees = 1000)) ranked V4, V1, V2, V5, V3 as the most important predictors. It agrees with the random forest model that V1 through V5 are the most important, except that V4 and V1 are swapped relative to the random forest.
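As a hedged aside, the relative influence can also be pulled from the gbm() fit without plotting; a minimal sketch using functions from the gbm package:
gbm_rel_inf <- summary(gbmModel, plotit = FALSE) #data frame of var and rel.inf, same numbers as the summary above
head(gbm_rel_inf)
relative.influence(gbmModel, n.trees = 1000) #raw (unnormalized) influence values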
gbmModel <- train(simulatedc |> select(-y), simulatedc$y, method = "gbm", distribution = "gaussian", verbose = FALSE)
gbmImp <- varImp(gbmModel)
print(gbmImp) #gbm model using train
## gbm variable importance
##
## Overall
## V4 100.0000
## V1 93.9413
## V2 75.8721
## V5 43.5964
## V3 27.2088
## V6 1.5621
## V7 1.4108
## V10 0.9427
## V8 0.5725
## V9 0.0000
The gbm model fit with train (gbmModel <- train(simulatedc |> select(-y), simulatedc$y, method = "gbm", distribution = "gaussian", verbose = FALSE)) had the same top five predictors as the previous gbm model, but the importance of V6 through V10 comes out in a completely different order.
print(rfImp1|> arrange(desc(Overall))) #original random forest
## Overall
## V1 8.732235404
## V4 7.615118809
## V2 6.415369387
## V5 2.023524577
## V3 0.763591825
## V6 0.165111172
## V7 -0.005961659
## V10 -0.074944788
## V9 -0.095292651
## V8 -0.166362581
For the top five predictors, V1 and V4 are again swapped between the gbm model and the random forest.
Now fit a Cubist model.
cubistmodel <- cubist(simulated |> select(-y), simulated$y)
varcubist <- varImp(cubistmodel)
summary(cubistmodel)
##
## Call:
## cubist.default(x = select(simulated, -y), y = simulated$y)
##
##
## Cubist [Release 2.07 GPL Edition] Sun Dec 15 20:53:17 2024
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (13 attributes) from undefined.data
##
## Model:
##
## Rule 1: [200 cases, mean 14.416183, range 3.55596 to 28.38167, est err 1.944664]
##
## outcome = 0.183529 + 8.9 V4 + 7.9 V1 + 7.1 V2 + 5.3 V5
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 2.022980
## Relative |error| 0.50
## Correlation coefficient 0.87
##
##
## Attribute usage:
## Conds Model
##
## 100% V1
## 100% V2
## 100% V4
## 100% V5
##
##
## Time: 0.0 secs
print(varcubist)
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicate1 0
## duplicate2 0
For the Cubist model the important predictors again come from V1 through V5: V1, V2, V4, and V5 all receive the same importance (50), while V3 and the remaining predictors are assigned little to no importance.
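A minimal sketch (not part of the original results) of how committees change the Cubist usage statistics; caret's varImp() for Cubist appears to average the percentage of rule conditions and model terms that use each predictor, so adding committees may pull more predictors into the rules:
cubist_comm <- cubist(simulated |> select(-y), simulated$y, committees = 10) #10 boosting-like committees instead of a single rule set
varImp(cubist_comm) |> arrange(desc(Overall)) |> head(10)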
Use a simulation to show tree bias with different levels of predictor granularity.
Granularity refers to how continuous or finely subdivided a predictor's values are.
set.seed(200)
#data frame of "predictors" x1-x4 holding the same values at different levels of precision
x1 <- runif(1000, min = 0, max = 37) #1000 values between 0 and 37 standard number of decimals
x2 <- round(x1,2) #two decimals
x3 <- round(x1,1) #one decimal
x4 <- round(x1,0) #no decimals
y <- rnorm(1000, mean = 10, sd = 3) #y is random noise around a mean; it is the value to be predicted and is unrelated to the predictors
#create the data frame
datag <- data.frame(
x1,
x2,
x3,
x4,
y
)
head(datag)
## x1 x2 x3 x4 y
## 1 19.74958 19.75 19.7 20 8.938970
## 2 21.59931 21.60 21.6 22 4.210759
## 3 21.81440 21.81 21.8 22 7.718135
## 4 25.56848 25.57 25.6 26 17.012294
## 5 24.69127 24.69 24.7 25 3.799206
## 6 31.05387 31.05 31.1 31 6.879548
rfmod <- randomForest(y~., data = datag,
importance = TRUE,
ntree=1000)
rfdatagImp <- varImp(rfmod,scale = FALSE) #display the importance of predictors
cor_matrix <- cor(datag)
#correlation matrix and plot to see the correlation between the predictors and y
corrplot(cor_matrix,title = "Correlation Plot", method = "square", tl.cex = 1, cl.cex = 1, type = "lower")
print(cor_matrix)
## x1 x2 x3 x4 y
## x1 1.00000000 0.99999996 0.99999613 0.99961594 0.03924738
## x2 0.99999996 1.00000000 0.99999607 0.99961611 0.03924198
## x3 0.99999613 0.99999607 1.00000000 0.99960708 0.03922193
## x4 0.99961594 0.99961611 0.99960708 1.00000000 0.03933069
## y 0.03924738 0.03924198 0.03922193 0.03933069 1.00000000
print(rfdatagImp |> arrange(desc(Overall)))
## Overall
## x1 6.201224
## x2 5.890305
## x3 4.463129
## x4 1.731427
x1, x2, x3, and x4 are the same values recorded to different degrees of precision (different numbers of decimal places), and y is normally distributed noise with no meaningful relationship to any of them. Nevertheless, the importance of the x variables is ranked by how many decimal places they retain: x1 has the most distinct values, so it appears the most "important." Trees are biased toward predictors with more distinct values (finer granularity), because such predictors offer more candidate split points. The extra digits also change the correlation with y very slightly.
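As an additional hedged check (not in the original output), a single CART tree grown with rpart on the same data shows the selection bias directly, and a conditional inference forest, which uses statistical tests to choose split variables, should be less affected:
set.seed(200)
single_tree <- rpart(y ~ ., data = datag, control = rpart.control(cp = 0, maxdepth = 5)) #cp = 0 forces splits even on pure noise
table(single_tree$frame$var) #how often each predictor (and "<leaf>") appears in the tree
set.seed(200)
cf_gran <- party::cforest(y ~ ., data = datag, controls = party::cforest_unbiased(ntree = 200))
sort(party::varimp(cf_gran), decreasing = TRUE) #expected to be more even across x1-x4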
In the figure from the text, the right-hand model uses a bagging fraction of 0.9 and a learning rate of 0.9; the left-hand model uses 0.1 for both. The bagging fraction determines the proportion of the training data used to fit each tree. The learning rate (shrinkage) scales down each tree's contribution to the ensemble: the higher the learning rate, the faster the model converges, while a smaller learning rate requires more trees and is more computationally expensive. The right-hand model uses 90% of the data to train each tree. Both a high learning rate and a high bagging fraction can lead to overfitting and poor generalization.
The model on the right concentrates its importance on just the first few predictors because both the learning rate and the bagging fraction are very high. With a high learning rate each tree contributes a large amount and fewer trees effectively drive the fit, and because 90% of the data is used to train each tree the trees are very similar, so the model relies on, and assigns importance to, fewer predictors.
The model on the left uses more trees and draws on more predictors because of its low learning rate, so the importance is distributed more evenly. It may run into problems if the data set is small, since each tree sees only 10% of the data.
I believe the model with the lower learning rate (the left one) would likely generalize better to new samples, although this depends on how well a 10% bagging fraction represents the data as a whole.
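A minimal sketch of the comparison, reusing the simulated data from above rather than the data behind the figure in the text (an assumption); the importance profile of the high-shrinkage, high-bagging-fraction fit is expected to drop off more steeply:
set.seed(200)
gbm_low <- gbm(y ~ ., data = simulatedc, distribution = "gaussian",
               n.trees = 1000, shrinkage = 0.1, bag.fraction = 0.1)
set.seed(200)
gbm_high <- gbm(y ~ ., data = simulatedc, distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.9, bag.fraction = 0.9)
summary(gbm_low, plotit = FALSE) #importance spread across more predictors (expected)
summary(gbm_high, plotit = FALSE) #importance concentrated on a few predictors (expected)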
The interaction depth sets the maximum number of splits in each tree, and therefore how many predictors a single tree can involve. A higher interaction depth produces more complex trees. I believe increasing the interaction depth would flatten the slope of the predictor importance plot, since each tree considers more predictors and their importance is spread more evenly.
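A similar hedged sketch for interaction depth, again reusing the simulated data as a stand-in; deeper trees are expected to spread the relative influence across more predictors:
set.seed(200)
gbm_d1 <- gbm(y ~ ., data = simulatedc, distribution = "gaussian", n.trees = 1000, interaction.depth = 1)
set.seed(200)
gbm_d10 <- gbm(y ~ ., data = simulatedc, distribution = "gaussian", n.trees = 1000, interaction.depth = 10)
summary(gbm_d1, plotit = FALSE) #stumps: steeper drop-off in relative influence (expected)
summary(gbm_d10, plotit = FALSE) #deeper trees: flatter importance profile (expected)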
Setup copied from assignment 8 (Exercise 7.5).
Load the data from Exercise 6.3, then impute, split, and preprocess (copied from assignment 7).
Try tree-based models on the chemical manufacturing process data.
data("ChemicalManufacturingProcess")
#the book mentions a processPredictors object, which is not found, so processPredictors is built from ChemicalManufacturingProcess below
apropos('processPredictors')
## character(0)
#how many na values?
sum(is.na(ChemicalManufacturingProcess))
## [1] 106
Impute the predictors. The impute package is no longer available on CRAN, so VIM is used instead.
#imputed with K nearest neighbors set to 5.
imputeCMP <- kNN(ChemicalManufacturingProcess, k = 5)
# Remove the "_imp" columns
imputeCMP <- imputeCMP[, !grepl("_imp$", colnames(imputeCMP))] #remove the boolean columns which indicate which values were imputed.
#imputation removed nas
sum(is.na(imputeCMP))
## [1] 0
Imputed with the 5 nearest neighbors.
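A hedged alternative (not run here): caret's preProcess() can also do k-nearest-neighbor imputation, though it centers and scales the predictors as a side effect:
pp_knn <- preProcess(ChemicalManufacturingProcess, method = "knnImpute", k = 5) #centers and scales before imputing
cmp_knn <- predict(pp_knn, ChemicalManufacturingProcess)
sum(is.na(cmp_knn)) #should be 0 after imputation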
Split the data into training and test sets.
#created processPredictors since it does not seem to be available after loading the data
#removed the output and kept the predictors
processPredictors <- select(imputeCMP, -"Yield")
set.seed(987654321)
#splitting the chemical manufacturing data into a 75% training and a 25% test set
#select indices to use for extracting the training data
train_indice <- createDataPartition(imputeCMP$Yield, p = 0.75, list = FALSE)
# use the indices to select the data
train_cmp <- imputeCMP[train_indice, ] # use as y train_cmp$Yield
test_cmp <- imputeCMP[-train_indice, ] # use test_cmp$Yield as y for test
train_cmpp <- processPredictors[train_indice, ] #use as x training
test_cmpp <- processPredictors[-train_indice, ] #use as x for test
#random forest
set.seed(200)
rfmc <- randomForest(Yield ~ ., data = train_cmp, #train on chemical manufacturing
importance = TRUE,
ntree=1000)
imprfmc <- varImp(rfmc,scale = FALSE) #display the importance of predictors
print(imprfmc |> arrange(desc(Overall)))
## Overall
## ManufacturingProcess32 1.012860e+00
## BiologicalMaterial03 4.220825e-01
## ManufacturingProcess13 4.038907e-01
## BiologicalMaterial06 2.718733e-01
## BiologicalMaterial12 2.551816e-01
## BiologicalMaterial11 2.343858e-01
## ManufacturingProcess17 1.584023e-01
## ManufacturingProcess09 1.490706e-01
## BiologicalMaterial02 1.442930e-01
## BiologicalMaterial08 1.178468e-01
## ManufacturingProcess36 1.077791e-01
## BiologicalMaterial04 8.728793e-02
## ManufacturingProcess31 8.167892e-02
## BiologicalMaterial05 6.963948e-02
## BiologicalMaterial01 6.826301e-02
## ManufacturingProcess28 6.116692e-02
## BiologicalMaterial09 4.537180e-02
## ManufacturingProcess11 3.557205e-02
## ManufacturingProcess02 3.382591e-02
## ManufacturingProcess27 3.329441e-02
## ManufacturingProcess01 3.065405e-02
## ManufacturingProcess18 2.845910e-02
## ManufacturingProcess34 2.562197e-02
## ManufacturingProcess30 2.532765e-02
## ManufacturingProcess06 2.458841e-02
## ManufacturingProcess39 2.404227e-02
## ManufacturingProcess33 2.308155e-02
## BiologicalMaterial10 2.118960e-02
## ManufacturingProcess29 1.948688e-02
## ManufacturingProcess24 1.807309e-02
## ManufacturingProcess20 1.767054e-02
## ManufacturingProcess26 1.722736e-02
## ManufacturingProcess19 1.481479e-02
## ManufacturingProcess42 1.208187e-02
## ManufacturingProcess16 1.048674e-02
## ManufacturingProcess43 9.458261e-03
## ManufacturingProcess22 7.407845e-03
## ManufacturingProcess10 6.576829e-03
## ManufacturingProcess37 6.084634e-03
## ManufacturingProcess15 6.081916e-03
## ManufacturingProcess12 5.364139e-03
## ManufacturingProcess04 4.737409e-03
## ManufacturingProcess45 4.691435e-03
## ManufacturingProcess25 4.541978e-03
## ManufacturingProcess03 3.666551e-03
## ManufacturingProcess05 2.027105e-03
## ManufacturingProcess35 1.736913e-03
## ManufacturingProcess41 1.396440e-03
## ManufacturingProcess14 1.276757e-03
## BiologicalMaterial07 1.153417e-03
## ManufacturingProcess40 5.295053e-07
## ManufacturingProcess07 -3.904346e-04
## ManufacturingProcess21 -9.129087e-04
## ManufacturingProcess08 -9.592261e-04
## ManufacturingProcess38 -1.339809e-03
## ManufacturingProcess23 -2.360924e-03
## ManufacturingProcess44 -4.103820e-03
summary(rfmc)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 132 -none- numeric
## mse 1000 -none- numeric
## rsq 1000 -none- numeric
## oob.times 132 -none- numeric
## importance 114 -none- numeric
## importanceSD 57 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 132 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
rfmcPred <- predict(rfmc, newdata= test_cmpp) #predict yield based on test data
#the function postResample can be used to get the test set performance values
postResample(pred= rfmcPred, obs=test_cmp$Yield)
## RMSE Rsquared MAE
## 1.2310348 0.5521820 0.9732442
#conditional inference trees
set.seed(200)
cforestc <- cforest(Yield ~ ., data = train_cmp) #conditional inference tree model
cfcImp <- varImp(cforestc)
print(cfcImp |> arrange(desc(Overall)))
## Overall
## ManufacturingProcess32 4.174245e-01
## ManufacturingProcess13 3.080011e-01
## ManufacturingProcess36 2.051168e-01
## ManufacturingProcess09 2.036544e-01
## BiologicalMaterial03 1.797200e-01
## BiologicalMaterial06 1.560211e-01
## ManufacturingProcess17 1.280276e-01
## BiologicalMaterial02 1.200364e-01
## BiologicalMaterial12 1.165816e-01
## BiologicalMaterial01 6.890231e-02
## BiologicalMaterial04 6.382325e-02
## ManufacturingProcess33 5.632087e-02
## ManufacturingProcess06 5.083854e-02
## BiologicalMaterial08 4.996544e-02
## BiologicalMaterial11 4.743727e-02
## ManufacturingProcess28 3.630546e-02
## ManufacturingProcess12 3.421430e-02
## ManufacturingProcess31 3.384940e-02
## ManufacturingProcess11 2.084346e-02
## ManufacturingProcess29 1.990521e-02
## ManufacturingProcess20 1.496611e-02
## ManufacturingProcess10 1.175473e-02
## BiologicalMaterial10 1.059535e-02
## ManufacturingProcess30 9.745067e-03
## BiologicalMaterial05 8.732380e-03
## ManufacturingProcess35 8.456431e-03
## ManufacturingProcess27 7.375510e-03
## ManufacturingProcess15 7.322052e-03
## BiologicalMaterial09 6.382885e-03
## ManufacturingProcess24 5.815355e-03
## ManufacturingProcess21 5.741280e-03
## ManufacturingProcess26 5.413251e-03
## ManufacturingProcess34 4.467500e-03
## ManufacturingProcess42 4.252368e-03
## ManufacturingProcess25 3.512281e-03
## ManufacturingProcess39 3.400005e-03
## ManufacturingProcess18 3.279742e-03
## ManufacturingProcess45 2.590559e-03
## ManufacturingProcess02 2.129964e-03
## ManufacturingProcess19 1.825732e-03
## ManufacturingProcess08 1.535731e-03
## ManufacturingProcess16 1.462225e-03
## ManufacturingProcess14 1.417451e-03
## ManufacturingProcess37 9.621570e-04
## ManufacturingProcess05 9.470855e-04
## ManufacturingProcess43 5.474098e-04
## BiologicalMaterial07 0.000000e+00
## ManufacturingProcess44 -9.725993e-05
## ManufacturingProcess03 -2.214930e-04
## ManufacturingProcess23 -3.013034e-04
## ManufacturingProcess04 -6.461169e-04
## ManufacturingProcess07 -6.614243e-04
## ManufacturingProcess40 -1.112043e-03
## ManufacturingProcess38 -1.241184e-03
## ManufacturingProcess41 -1.285453e-03
## ManufacturingProcess01 -1.403051e-03
## ManufacturingProcess22 -3.325807e-03
summary(cforestc)
## Length Class Mode
## 1 RandomForest S4
cforestcPred <- predict(cforestc, newdata= test_cmpp) #predict yield based on test data
#the function postResample can be used to get the test set performance values
postResample(pred= cforestcPred, obs=test_cmp$Yield)
## RMSE Rsquared MAE
## 1.3948021 0.4153877 1.1582258
set.seed(200)
gbmModelc <- train(x = train_cmpp, y = train_cmp$Yield, method = "gbm", distribution = "gaussian", verbose = FALSE)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
gbmcImp<- varImp(gbmModelc)
print(gbmcImp) #gbm model using train
## gbm variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess13 51.332
## ManufacturingProcess09 38.347
## BiologicalMaterial03 30.048
## BiologicalMaterial12 25.757
## ManufacturingProcess36 19.485
## ManufacturingProcess17 15.612
## ManufacturingProcess31 15.056
## BiologicalMaterial09 13.661
## ManufacturingProcess37 10.152
## BiologicalMaterial08 9.834
## ManufacturingProcess05 9.599
## ManufacturingProcess06 9.222
## ManufacturingProcess03 8.645
## ManufacturingProcess27 6.848
## BiologicalMaterial06 6.173
## ManufacturingProcess30 5.731
## ManufacturingProcess24 5.469
## BiologicalMaterial11 5.368
## BiologicalMaterial05 5.174
gbmModelcPred <- predict(gbmModelc, newdata= test_cmpp) #predict yield based on test data
#the function postResample can be used to get the test set performance values
postResample(pred= gbmModelcPred, obs=test_cmp$Yield)
## RMSE Rsquared MAE
## 1.2188392 0.5772398 0.9652080
cubistmodelc <- cubist(train_cmpp, train_cmp$Yield)
varcubistc <- varImp(cubistmodelc)
summary(cubistmodelc)
##
## Call:
## cubist.default(x = train_cmpp, y = train_cmp$Yield)
##
##
## Cubist [Release 2.07 GPL Edition] Sun Dec 15 20:53:22 2024
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 132 cases (58 attributes) from undefined.data
##
## Model:
##
## Rule 1: [34 cases, mean 38.419, range 35.25 to 40.59, est err 0.510]
##
## if
## BiologicalMaterial11 <= 145.07
## ManufacturingProcess13 > 34.3
## ManufacturingProcess32 <= 159
## then
## outcome = -20.749 - 1.55 ManufacturingProcess13
## + 0.0182 ManufacturingProcess18 + 11.6 ManufacturingProcess34
## - 0.102 BiologicalMaterial03 + 0.04 ManufacturingProcess09
## + 0.008 ManufacturingProcess32 + 0.013 ManufacturingProcess01
##
## Rule 2: [22 cases, mean 39.763, range 37.42 to 43.12, est err 0.724]
##
## if
## ManufacturingProcess13 <= 34.3
## ManufacturingProcess32 <= 159
## then
## outcome = 56.356 - 1.46 ManufacturingProcess13
## + 0.114 ManufacturingProcess32 + 0.26 ManufacturingProcess01
## + 0.2 ManufacturingProcess09 + 3 ManufacturingProcess34
## - 0.23 ManufacturingProcess37 - 0.009 ManufacturingProcess35
##
## Rule 3: [24 cases, mean 39.873, range 38.35 to 42.58, est err 0.671]
##
## if
## BiologicalMaterial11 > 145.07
## ManufacturingProcess13 > 34.3
## ManufacturingProcess32 <= 159
## then
## outcome = 34.189 - 0.135 BiologicalMaterial03
## + 0.14 ManufacturingProcess09 + 0.035 ManufacturingProcess32
## + 0.071 ManufacturingProcess01 + 2.2 ManufacturingProcess34
## - 0.16 ManufacturingProcess37 - 0.006 ManufacturingProcess35
##
## Rule 4: [52 cases, mean 41.685, range 38.95 to 46.34, est err 0.864]
##
## if
## ManufacturingProcess32 > 159
## then
## outcome = -7.013 + 0.00681 ManufacturingProcess26
## - 0.57 ManufacturingProcess17 + 0.102 ManufacturingProcess32
## + 0.22 ManufacturingProcess09
##
##
## Evaluation on training data (132 cases):
##
## Average |error| 0.691
## Relative |error| 0.46
## Correlation coefficient 0.88
##
##
## Attribute usage:
## Conds Model
##
## 100% 100% ManufacturingProcess32
## 61% 42% ManufacturingProcess13
## 44% BiologicalMaterial11
## 100% ManufacturingProcess09
## 61% ManufacturingProcess01
## 61% ManufacturingProcess34
## 44% BiologicalMaterial03
## 39% ManufacturingProcess17
## 39% ManufacturingProcess26
## 35% ManufacturingProcess35
## 35% ManufacturingProcess37
## 26% ManufacturingProcess18
##
##
## Time: 0.0 secs
print(varcubistc)
## Overall
## ManufacturingProcess32 100.0
## ManufacturingProcess13 51.5
## BiologicalMaterial11 22.0
## ManufacturingProcess09 50.0
## ManufacturingProcess01 30.5
## ManufacturingProcess34 30.5
## BiologicalMaterial03 22.0
## ManufacturingProcess17 19.5
## ManufacturingProcess26 19.5
## ManufacturingProcess35 17.5
## ManufacturingProcess37 17.5
## ManufacturingProcess18 13.0
## BiologicalMaterial01 0.0
## BiologicalMaterial02 0.0
## BiologicalMaterial04 0.0
## BiologicalMaterial05 0.0
## BiologicalMaterial06 0.0
## BiologicalMaterial07 0.0
## BiologicalMaterial08 0.0
## BiologicalMaterial09 0.0
## BiologicalMaterial10 0.0
## BiologicalMaterial12 0.0
## ManufacturingProcess02 0.0
## ManufacturingProcess03 0.0
## ManufacturingProcess04 0.0
## ManufacturingProcess05 0.0
## ManufacturingProcess06 0.0
## ManufacturingProcess07 0.0
## ManufacturingProcess08 0.0
## ManufacturingProcess10 0.0
## ManufacturingProcess11 0.0
## ManufacturingProcess12 0.0
## ManufacturingProcess14 0.0
## ManufacturingProcess15 0.0
## ManufacturingProcess16 0.0
## ManufacturingProcess19 0.0
## ManufacturingProcess20 0.0
## ManufacturingProcess21 0.0
## ManufacturingProcess22 0.0
## ManufacturingProcess23 0.0
## ManufacturingProcess24 0.0
## ManufacturingProcess25 0.0
## ManufacturingProcess27 0.0
## ManufacturingProcess28 0.0
## ManufacturingProcess29 0.0
## ManufacturingProcess30 0.0
## ManufacturingProcess31 0.0
## ManufacturingProcess33 0.0
## ManufacturingProcess36 0.0
## ManufacturingProcess38 0.0
## ManufacturingProcess39 0.0
## ManufacturingProcess40 0.0
## ManufacturingProcess41 0.0
## ManufacturingProcess42 0.0
## ManufacturingProcess43 0.0
## ManufacturingProcess44 0.0
## ManufacturingProcess45 0.0
cubistmodelcPred <- predict(cubistmodelc, newdata= test_cmpp) #predict yield based on test data
#the function postResample can be used to get the test set performance values
postResample(pred= cubistmodelcPred, obs=test_cmp$Yield)
## RMSE Rsquared MAE
## 1.4618324 0.4133779 1.0583084
# the RMSE and Rsquared for the predictions of the tree models
chemical_results<- rbind(
randomforestc = postResample(pred= rfmcPred, obs= test_cmp$Yield),
conditionalforestc = postResample(pred = cforestcPred, obs = test_cmp$Yield),
gboostedc = postResample(pred = gbmModelcPred, obs = test_cmp$Yield),
cubistc = postResample(pred= cubistmodelcPred, obs = test_cmp$Yield)
)
# Convert to a data frame
chemicalresults <- as.data.frame(chemical_results)
# see RMSE and R^2 for chapter models
print(chemicalresults)
## RMSE Rsquared MAE
## randomforestc 1.231035 0.5521820 0.9732442
## conditionalforestc 1.394802 0.4153877 1.1582258
## gboostedc 1.218839 0.5772398 0.9652080
## cubistc 1.461832 0.4133779 1.0583084
The gradient boosting machine (GBM) model provides the best predictions, with the lowest RMSE (1.22) and the highest R-squared (0.58) on the test set.
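As an optional next step (a sketch, not part of the reported results), the GBM could be tuned more deliberately with repeated cross-validation and an explicit grid, which might improve the test-set performance further:
set.seed(200)
gbm_grid <- expand.grid(n.trees = c(500, 1000),
                        interaction.depth = c(1, 3, 5),
                        shrinkage = c(0.01, 0.1),
                        n.minobsinnode = 10)
gbm_tuned <- train(x = train_cmpp, y = train_cmp$Yield,
                   method = "gbm", distribution = "gaussian",
                   tuneGrid = gbm_grid,
                   trControl = trainControl(method = "repeatedcv", number = 10, repeats = 3),
                   verbose = FALSE)
postResample(pred = predict(gbm_tuned, newdata = test_cmpp), obs = test_cmp$Yield)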
Which predictors are most important for the optimal model?
print(gbmcImp)
## gbm variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess13 51.332
## ManufacturingProcess09 38.347
## BiologicalMaterial03 30.048
## BiologicalMaterial12 25.757
## ManufacturingProcess36 19.485
## ManufacturingProcess17 15.612
## ManufacturingProcess31 15.056
## BiologicalMaterial09 13.661
## ManufacturingProcess37 10.152
## BiologicalMaterial08 9.834
## ManufacturingProcess05 9.599
## ManufacturingProcess06 9.222
## ManufacturingProcess03 8.645
## ManufacturingProcess27 6.848
## BiologicalMaterial06 6.173
## ManufacturingProcess30 5.731
## ManufacturingProcess24 5.469
## BiologicalMaterial11 5.368
## BiologicalMaterial05 5.174
#top predictors for gradient boosting machine model
From assignment 7, the top ten predictors of the best linear model (the latent variable model, LVM) were:
ManufacturingProcess32 100.0000000
ManufacturingProcess13 92.4708233
ManufacturingProcess36 91.3473135
ManufacturingProcess09 84.3158605
ManufacturingProcess17 81.7846579
BiologicalMaterial02 70.6214136
BiologicalMaterial06 67.8114973
BiologicalMaterial03 67.0055935
BiologicalMaterial08 63.7959965
ManufacturingProcess06 63.2881421
The nonlinear MARS model importance (nsubsets, gcv, rss) was:
ManufacturingProcess32 15 100.0 100.0
ManufacturingProcess13 14 68.6 72.0
ManufacturingProcess21 11 37.9 45.1
ManufacturingProcess28 11 37.9 45.1
ManufacturingProcess33 9 26.3 35.3
BiologicalMaterial03 7 18.0 28.1
ManufacturingProcess39 5 16.3 23.8
ManufacturingProcess31 4 14.6 21.2
ManufacturingProcess37 2 2.5 12.6
All models agree that ManufacturingProcess32 and ManufacturingProcess13 are the top two predictors. The only biological material that appears in the top ten for all of the optimal models is BiologicalMaterial03. In the gradient boosted machine (GBM) model and the nonlinear MARS model the manufacturing processes dominate; only the linear LVM has four biological materials in its top ten. ManufacturingProcess36 appears in the top ten for the GBM and the LVM but not for MARS. ManufacturingProcess09 and ManufacturingProcess17 are in the top ten for both the LVM and the GBM, and MARS shares ManufacturingProcess31 and ManufacturingProcess37 with the GBM's top ten.
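A small sketch to line the rankings up programmatically (the LVM top ten is copied from the assignment 7 output listed above; the GBM ranking comes from gbmcImp):
top10 <- function(imp) rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10] #helper to pull the ten largest importances
gbm_top <- top10(gbmcImp$importance)
lvm_top <- c("ManufacturingProcess32", "ManufacturingProcess13", "ManufacturingProcess36",
             "ManufacturingProcess09", "ManufacturingProcess17", "BiologicalMaterial02",
             "BiologicalMaterial06", "BiologicalMaterial03", "BiologicalMaterial08",
             "ManufacturingProcess06") #copied from the assignment 7 listing above
intersect(gbm_top, lvm_top) #predictors shared by the GBM and the linear model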
#unfortunately the gradient boosted machine model is hard to visualize directly; because of how a gbm is constructed, terminal nodes are labeled with numbers rather than predictors, so their meaning is difficult to interpret
#can extract only some information about the model
# optimal number of trees
optimal_tree <- gbmModelc$finalModel$n.trees # Optimal number of trees
# Terminal node predictions for optimal number of trees.
tree <- pretty.gbm.tree(gbmModelc$finalModel, i.tree = optimal_tree - 1) #index starts with zero for i.tree in pretty.gbm.tree
print(tree) #shows the structure of the last tree in the boosted ensemble
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight
## 0 33 4.500000000 1 2 9 0.7782333 66
## 1 -1 -0.009040158 -1 -1 -1 0.0000000 31
## 2 48 0.850000000 3 4 8 1.2024464 35
## 3 -1 -0.012943422 -1 -1 -1 0.0000000 12
## 4 41 9.000000000 5 6 7 1.1008903 23
## 5 -1 0.005159182 -1 -1 -1 0.0000000 12
## 6 -1 0.048956648 -1 -1 -1 0.0000000 11
## 7 -1 0.026105796 -1 -1 -1 0.0000000 23
## 8 -1 0.012717493 -1 -1 -1 0.0000000 35
## 9 -1 0.002497990 -1 -1 -1 0.0000000 66
## Prediction
## 0 0.002497990
## 1 -0.009040158
## 2 0.012717493
## 3 -0.012943422
## 4 0.026105796
## 5 0.005159182
## 6 0.048956648
## 7 0.026105796
## 8 0.012717493
## 9 0.002497990
#can plot how certain predictors affect the yield based on the gbm model.
partial_plot <- partial(gbmModelc, pred.var = "ManufacturingProcess32", train = train_cmpp)
plotPartial(partial_plot)
#plot of the predicted values vs the correct yield values
plot(test_cmp$Yield, gbmModelcPred,
main = "GBM Predicted vs True Yield",
xlab = "True Yield",
ylab = "GBM Predicted Yield"
)
abline(a = 0, b = 1, col = "red") # The line of perfect predictions
The table above shows the structure of the last tree in the GBM model fit with the optimal number of trees, and the partial dependence plot shows how ManufacturingProcess32 affects the predicted yield in the GBM. The GBM predictions can also be plotted against the actual yields of the test data set, with a line of perfect predictions drawn on top.
If a single tree could be visualized directly, it would show the order in which the predictors are used and the split thresholds at each node, which would provide insight into how the split limits on the predictors relate to the yield.
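As an illustrative stand-in (an assumption, since the boosted ensemble itself cannot be drawn as one tree), a single CART tree fit to the same training data can be plotted with rpart.plot to give the kind of split-by-split view described above:
set.seed(200)
single_cart <- rpart(Yield ~ ., data = train_cmp) #one regression tree on the chemical training data
rpart.plot(single_cart, main = "Single regression tree for Yield (illustrative only)")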