library(mlbench)
library(randomForest)
library(caret)
library(party)
library(gbm)
library(Cubist)
# Simulate 200 rows with mlbench.friedman1 (sd = 1); fixing the seed keeps
# the draw reproducible.
set.seed(200)
simulated <- mlbench.friedman1(n = 200, sd = 1)

# Bind x and y into a single data frame; the response ends up in the last
# column, which is renamed to "y".
simulated <- as.data.frame(cbind(simulated$x, simulated$y))
names(simulated)[ncol(simulated)] <- "y"

# Random forest of y on every other column, 1000 trees, importance enabled.
model1 <- randomForest(
  y ~ .,
  data = simulated,
  importance = TRUE,
  ntree = 1000
)

# Unscaled variable importance for model1.
rfImp <- caret::varImp(model1, scale = FALSE)
rfImp
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
# Add duplicate1: V1 plus 0.1 times standard-normal noise, then report how
# strongly the new column correlates with V1.
simulated[["duplicate1"]] <- simulated[["V1"]] + 0.1 * rnorm(200)
with(simulated, cor(duplicate1, V1))
## [1] 0.9460206
# Refit the random forest on `simulated`, which at this point also carries
# duplicate1, and recompute the unscaled importances.
model2 <- randomForest(
  y ~ .,
  data = simulated,
  importance = TRUE,
  ntree = 1000
)
rfImp2 <- caret::varImp(model2, scale = FALSE)
rfImp2
## Overall
## V1 5.69119973
## V2 6.06896061
## V3 0.62970218
## V4 7.04752238
## V5 1.87238438
## V6 0.13569065
## V7 -0.01345645
## V8 -0.04370565
## V9 0.00840438
## V10 0.02894814
## duplicate1 4.28331581
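The importance of V1 decreased from about 8.73 to about 5.69 once the highly correlated predictor duplicate1 (correlation 0.946 with V1) was added; duplicate1 itself received an importance of about 4.28, so the two predictors now share the importance that V1 previously held alone.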
Conditional Inference Forest (cforest)
# Fit a cforest of y on all other columns of `simulated`, then report its
# variable importance with conditional = FALSE.
model3 <- party::cforest(formula = y ~ ., data = simulated)
party::varimp(object = model3, conditional = FALSE)
## V1 V2 V3 V4 V5
## 4.6171158805 6.0579730772 0.0003116115 7.6223892727 1.7161194047
## V6 V7 V8 V9 V10
## -0.0289427183 0.0465374951 -0.0380965511 0.0046062409 -0.0310326410
## duplicate1
## 5.0941897280
# Same forest, this time requesting the conditional variant of the importance.
party::varimp(object = model3, conditional = TRUE)
## V1 V2 V3 V4 V5 V6
## 1.807953145 4.688980360 0.012878752 6.190578351 1.051666850 0.028174759
## V7 V8 V9 V10 duplicate1
## -0.011709437 -0.004356587 0.015118505 -0.022190587 1.926751729
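The two measures agree on the ranking of the five most important predictors (V4, V2, duplicate1, V1, V5), but the conditional measure sharply reduces the importance of the correlated pair V1 and duplicate1 (from about 4.62 and 5.09 to about 1.81 and 1.93). The order changes only among the remaining predictors, whose importances are all close to zero under both the traditional and the conditional measure.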
# Cubist model using the first ten columns of `simulated` as predictors and
# the y column as the outcome, followed by its variable importance.
model5 <- cubist(x = simulated[, 1:10], y = simulated$y)
caret::varImp(model5)
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
library(AppliedPredictiveModeling)
# Load the data set. Columns 2-58 become the predictor table X; column 1 is
# kept as a one-column data frame y.
data("ChemicalManufacturingProcess")
X <- ChemicalManufacturingProcess[, 2:58]
y <- ChemicalManufacturingProcess[, 1, drop = FALSE]

# Total number of missing cells in X.
sum(is.na(X))
## [1] 106
# Estimate the "knnImpute" pre-processing from X and print its summary.
# NOTE(review): this is estimated from every row of X, i.e. before the
# train/test split made later in the script, so the rows later held out for
# testing also contribute to it -- confirm this is intended.
prep <- preProcess(X, method = "knnImpute")
prep
## Created from 152 samples and 57 variables
##
## Pre-processing:
## - centered (57)
## - ignored (0)
## - 5 nearest neighbor imputation (57)
## - scaled (57)
# Apply the fitted pre-processing to X, then count the missing cells that
# remain afterwards.
X_imputed <- predict(prep, newdata = X)
sum(is.na(X_imputed))
## [1] 0
# Draw one TRUE/FALSE flag per row of X_imputed (TRUE with probability 0.7).
# TRUE rows form the training split, FALSE rows the test split.
sample <- sample(
  c(TRUE, FALSE),
  size = nrow(X_imputed),
  replace = TRUE,
  prob = c(0.7, 0.3)
)

# Predictors for each split.
X_imputed_train <- X_imputed[sample, ]
X_imputed_test <- X_imputed[!sample, ]

# Outcome for each split; y has a single column, so these are plain vectors.
y_train <- y[[1]][sample]
y_test <- y[[1]][!sample]
# Random forest on the training split with importance tracking enabled.
# The tree-count argument is `ntree` (as used for model1 and model2); the
# earlier spelling `ntrees` is not a randomForest argument, so the request
# for 1000 trees never took effect.
model6 <- randomForest(
  X_imputed_train,
  y_train,
  importance = TRUE,
  ntree = 1000
)
# Boosted model on the training split with the "gaussian" distribution; every
# other setting is left at whatever gbm.fit uses by default.
model7 <- gbm.fit(
  x = X_imputed_train,
  y = y_train,
  distribution = "gaussian"
)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 3.7410 nan 0.0010 0.0030
## 2 3.7382 nan 0.0010 0.0030
## 3 3.7348 nan 0.0010 0.0031
## 4 3.7319 nan 0.0010 0.0033
## 5 3.7284 nan 0.0010 0.0031
## 6 3.7255 nan 0.0010 0.0030
## 7 3.7220 nan 0.0010 0.0033
## 8 3.7189 nan 0.0010 0.0033
## 9 3.7157 nan 0.0010 0.0032
## 10 3.7129 nan 0.0010 0.0030
## 20 3.6818 nan 0.0010 0.0033
## 40 3.6176 nan 0.0010 0.0031
## 60 3.5594 nan 0.0010 0.0026
## 80 3.5030 nan 0.0010 0.0028
## 100 3.4471 nan 0.0010 0.0027
# gbmGrid <- expand.grid(.interaction.depth=seq(1,7,by=2), .n.trees=seq(100,1000,by=50), .shrinkage=c(0.01,0.1))
# gbmTune <- train(X_imputed_train, y_train,
# method="gbm",
# tuneGrid=gbmGrid,
# verbose=FALSE)
# Cubist model on the training split.
model8 <- cubist(x = X_imputed_train, y = y_train)

# Random forest predictions for the test split, scored against y_test.
rfPred <- predict(object = model6, newdata = X_imputed_test)
postResample(rfPred, y_test)
## RMSE Rsquared MAE
## 0.9729932 0.6183819 0.7772096
# Boosted-model predictions for the test split.
gbmPred <- predict(object = model7, newdata = X_imputed_test)
## Using 100 trees...
# Score the predictions against y_test.
postResample(gbmPred, y_test)
## RMSE Rsquared MAE
## 1.5208723 0.3077065 1.2511209
# Cubist predictions for the test split, scored against y_test.
cubPred <- predict(object = model8, newdata = X_imputed_test)
postResample(cubPred, y_test)
## RMSE Rsquared MAE
## 1.1389650 0.5507048 0.9143831
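Random forest has the best test-set performance: it has the highest Rsquared (0.618) and the lowest RMSE (0.973), ahead of Cubist (Rsquared 0.551, RMSE 1.139) and the boosted trees (Rsquared 0.308, RMSE 1.521).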
# Variable importance of the random forest fitted on the training split.
caret::varImp(model6)
## Overall
## BiologicalMaterial01 3.46957805
## BiologicalMaterial02 7.25587149
## BiologicalMaterial03 8.89571390
## BiologicalMaterial04 4.72496196
## BiologicalMaterial05 4.45273599
## BiologicalMaterial06 9.13110172
## BiologicalMaterial07 1.00100150
## BiologicalMaterial08 4.46730174
## BiologicalMaterial09 3.78469247
## BiologicalMaterial10 2.71424922
## BiologicalMaterial11 6.37803220
## BiologicalMaterial12 10.58740240
## ManufacturingProcess01 4.20535511
## ManufacturingProcess02 3.54644577
## ManufacturingProcess03 1.87289651
## ManufacturingProcess04 0.43030516
## ManufacturingProcess05 0.59025824
## ManufacturingProcess06 4.14659867
## ManufacturingProcess07 0.24295675
## ManufacturingProcess08 -1.12903407
## ManufacturingProcess09 4.74726040
## ManufacturingProcess10 0.07719570
## ManufacturingProcess11 3.06288939
## ManufacturingProcess12 0.51524347
## ManufacturingProcess13 8.60317183
## ManufacturingProcess14 1.26972601
## ManufacturingProcess15 1.17458602
## ManufacturingProcess16 0.95218464
## ManufacturingProcess17 10.05866043
## ManufacturingProcess18 3.75453178
## ManufacturingProcess19 1.13851737
## ManufacturingProcess20 2.45374740
## ManufacturingProcess21 3.94953824
## ManufacturingProcess22 0.14887561
## ManufacturingProcess23 4.15559362
## ManufacturingProcess24 0.23883881
## ManufacturingProcess25 3.20575592
## ManufacturingProcess26 0.88195137
## ManufacturingProcess27 4.24304390
## ManufacturingProcess28 6.31570540
## ManufacturingProcess29 4.02894794
## ManufacturingProcess30 3.30838659
## ManufacturingProcess31 3.59374328
## ManufacturingProcess32 22.97443691
## ManufacturingProcess33 3.80718271
## ManufacturingProcess34 1.23928196
## ManufacturingProcess35 -0.67764375
## ManufacturingProcess36 5.22341382
## ManufacturingProcess37 1.70434810
## ManufacturingProcess38 0.35412762
## ManufacturingProcess39 6.37174168
## ManufacturingProcess40 0.76267013
## ManufacturingProcess41 -0.03627818
## ManufacturingProcess42 2.11360416
## ManufacturingProcess43 1.60683988
## ManufacturingProcess44 -0.50105098
## ManufacturingProcess45 1.79213330