Problem 8.1
# Simulate the Friedman 1 benchmark data: 200 observations, noise sd = 1.
# The seed is fixed so the draws below are reproducible.
set.seed(200)
simulated <- mlbench.friedman1(n = 200, sd = 1)
# Bind the predictor matrix and the response into one data frame; the
# response ends up in the last column and is renamed to "y".
simulated <- as.data.frame(cbind(simulated$x, simulated$y))
names(simulated)[ncol(simulated)] <- "y"
- Random Forest Model
# Random forest of 1000 trees on every predictor in `simulated`;
# importance = TRUE requests the predictor importance assessment.
model1 <- randomForest(
  y ~ .,
  data = simulated,
  ntree = 1000,
  importance = TRUE
)
# Unscaled variable importance for the baseline fit.
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.84289608
## V2 6.74508245
## V3 0.67830653
## V4 7.75934674
## V5 2.23628276
## V6 0.11429887
## V7 0.03724747
## V8 -0.05349642
## V9 -0.04495617
## V10 0.03863205
# Add a predictor equal to V1 plus small Gaussian noise (sd 0.1), i.e. a
# near-duplicate of V1. The noise length follows the data rather than a
# hard-coded 200, so the line stays valid if the simulation size changes.
simulated$duplicate1 <- simulated$V1 + rnorm(nrow(simulated)) * 0.1
# NOTE(review): this reports the correlation with the response y; if the
# intent is to confirm collinearity with V1, compare against simulated$V1.
cor(simulated$duplicate1, simulated$y)
## [1] 0.4613314
# Refit the 1000-tree random forest now that `simulated` also carries the
# duplicate1 column, so its importance can be compared with model1.
model2 <- randomForest(
  y ~ .,
  data = simulated,
  ntree = 1000,
  importance = TRUE
)
# Unscaled variable importance for the refit.
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
## Overall
## V1 6.008319352
## V2 6.308908170
## V3 0.571604465
## V4 7.187015958
## V5 2.131040245
## V6 0.211304611
## V7 0.025100355
## V8 -0.116980037
## V9 -0.003679481
## V10 0.024878337
## duplicate1 3.618101735
# Fit cforest() on the same data (default controls) and report its
# variable importance for comparison with the random forest results.
model3 <- cforest(
  y ~ .,
  data = simulated
)
varimp(model3)
## V1 V2 V3 V4 V5 V6
## 6.66043731 6.19942369 0.04107867 7.80355110 1.86181623 -0.03307225
## V7 V8 V9 V10 duplicate1
## -0.02430149 -0.04881394 -0.01498777 -0.06177746 2.92518849
# Boosted trees via gbm() with the gaussian distribution and all other
# settings left at their defaults.
model4 <- gbm(
  y ~ .,
  data = simulated,
  distribution = "gaussian"
)
# summary() reports the relative influence of each predictor.
summary(model4)

## var rel.inf
## V4 V4 30.334001
## V2 V2 22.513861
## V1 V1 21.456642
## V5 V5 12.779456
## V3 V3 7.784866
## duplicate1 duplicate1 4.987620
## V7 V7 0.143554
## V6 V6 0.000000
## V8 V8 0.000000
## V9 V9 0.000000
## V10 V10 0.000000
# cubist() takes predictors and response separately: split `simulated`
# into every column except "y" and the "y" vector itself.
simulatedX <- simulated[, setdiff(colnames(simulated), "y")]
simulatedY <- simulated$y
# Fit the Cubist model and print its rules and attribute usage.
model5 <- cubist(simulatedX, simulatedY)
summary(model5)
##
## Call:
## cubist.default(x = simulatedX, y = simulatedY)
##
##
## Cubist [Release 2.07 GPL Edition] Sun Nov 19 17:41:52 2023
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (12 attributes) from undefined.data
##
## Model:
##
## Rule 1: [200 cases, mean 14.416183, range 3.55596 to 28.38167, est err 1.944664]
##
## outcome = 0.183529 + 8.9 V4 + 7.9 V1 + 7.1 V2 + 5.3 V5
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 2.199542
## Relative |error| 0.54
## Correlation coefficient 0.84
##
##
## Attribute usage:
## Conds Model
##
## 100% V1
## 100% V2
## 100% V4
## 100% V5
##
##
## Time: 0.0 secs
Problem 8.2
Problem 8.7
# Load the data set and replace every missing value with the median of
# its own column. Each column name is printed as it is processed.
# NOTE(review): the medians are computed on the full data, before the
# train/test split performed further down - confirm this is intended.
data("ChemicalManufacturingProcess")
columns <- names(ChemicalManufacturingProcess)
for (col in columns) {
  print(col)
  median_value <- median(ChemicalManufacturingProcess[[col]], na.rm = TRUE)
  is_missing <- is.na(ChemicalManufacturingProcess[[col]])
  # Only touch columns that actually contain missing values.
  if (any(is_missing)) {
    ChemicalManufacturingProcess[[col]][is_missing] <- median_value
  }
}
## [1] "Yield"
## [1] "BiologicalMaterial01"
## [1] "BiologicalMaterial02"
## [1] "BiologicalMaterial03"
## [1] "BiologicalMaterial04"
## [1] "BiologicalMaterial05"
## [1] "BiologicalMaterial06"
## [1] "BiologicalMaterial07"
## [1] "BiologicalMaterial08"
## [1] "BiologicalMaterial09"
## [1] "BiologicalMaterial10"
## [1] "BiologicalMaterial11"
## [1] "BiologicalMaterial12"
## [1] "ManufacturingProcess01"
## [1] "ManufacturingProcess02"
## [1] "ManufacturingProcess03"
## [1] "ManufacturingProcess04"
## [1] "ManufacturingProcess05"
## [1] "ManufacturingProcess06"
## [1] "ManufacturingProcess07"
## [1] "ManufacturingProcess08"
## [1] "ManufacturingProcess09"
## [1] "ManufacturingProcess10"
## [1] "ManufacturingProcess11"
## [1] "ManufacturingProcess12"
## [1] "ManufacturingProcess13"
## [1] "ManufacturingProcess14"
## [1] "ManufacturingProcess15"
## [1] "ManufacturingProcess16"
## [1] "ManufacturingProcess17"
## [1] "ManufacturingProcess18"
## [1] "ManufacturingProcess19"
## [1] "ManufacturingProcess20"
## [1] "ManufacturingProcess21"
## [1] "ManufacturingProcess22"
## [1] "ManufacturingProcess23"
## [1] "ManufacturingProcess24"
## [1] "ManufacturingProcess25"
## [1] "ManufacturingProcess26"
## [1] "ManufacturingProcess27"
## [1] "ManufacturingProcess28"
## [1] "ManufacturingProcess29"
## [1] "ManufacturingProcess30"
## [1] "ManufacturingProcess31"
## [1] "ManufacturingProcess32"
## [1] "ManufacturingProcess33"
## [1] "ManufacturingProcess34"
## [1] "ManufacturingProcess35"
## [1] "ManufacturingProcess36"
## [1] "ManufacturingProcess37"
## [1] "ManufacturingProcess38"
## [1] "ManufacturingProcess39"
## [1] "ManufacturingProcess40"
## [1] "ManufacturingProcess41"
## [1] "ManufacturingProcess42"
## [1] "ManufacturingProcess43"
## [1] "ManufacturingProcess44"
## [1] "ManufacturingProcess45"
# Reproducible 75/25 split: draw 75% of the row indices (without
# replacement) for training; the remaining rows form the test set.
set.seed(1234)
sample_set <- sample(
  nrow(ChemicalManufacturingProcess),
  size = round(0.75 * nrow(ChemicalManufacturingProcess)),
  replace = FALSE
)
train_set <- ChemicalManufacturingProcess[sample_set, ]
test_set <- ChemicalManufacturingProcess[-sample_set, ]

# Separate the predictors (everything except Yield) from the response.
train_setX <- train_set[, setdiff(colnames(train_set), "Yield")]
train_setY <- train_set[["Yield"]]
test_setX <- test_set[, setdiff(colnames(test_set), "Yield")]
test_setY <- test_set[["Yield"]]