DATA 624 - Homework 9

Problem 8.1

# Simulate 200 observations from the Friedman 1 benchmark (noise sd = 1).
# The seed is fixed so the draw is reproducible.
set.seed(200)
simulated <- mlbench.friedman1(n = 200, sd = 1)

# Bind the predictor matrix and the response into a single data frame;
# the response ends up in the last column, which is renamed to "y".
simulated <- as.data.frame(cbind(simulated$x, simulated$y))
names(simulated)[ncol(simulated)] <- "y"

Random Forest Model

# Baseline random forest on every column of `simulated` except the response,
# grown with 1000 trees and with importance tracking switched on.
model1 <- randomForest(
  y ~ .,
  data = simulated,
  importance = TRUE,
  ntree = 1000
)

# Per-predictor importance, requested unscaled (scale = FALSE).
rfImp1 <- varImp(model1, scale = FALSE)
print(rfImp1)

##         Overall
## V1   8.84289608
## V2   6.74508245
## V3   0.67830653
## V4   7.75934674
## V5   2.23628276
## V6   0.11429887
## V7   0.03724747
## V8  -0.05349642
## V9  -0.04495617
## V10  0.03863205

# Add a noisy copy of V1 (standard-normal jitter scaled by 0.1) as an extra
# predictor column.
simulated$duplicate1 <- simulated$V1 + 0.1 * rnorm(200)

# Correlation between the new column and the response.
# NOTE(review): this compares duplicate1 with y. If the intent was to confirm
# that duplicate1 tracks V1, the second argument should presumably be V1 --
# confirm before relying on this number.
with(simulated, cor(duplicate1, y))

## [1] 0.4613314

# Refit the random forest with the same settings as model1; `simulated` now
# also carries the duplicate1 column, so it enters the formula via `y ~ .`.
model2 <- randomForest(
  y ~ .,
  data = simulated,
  importance = TRUE,
  ntree = 1000
)

# Unscaled importance for the refit, to compare against rfImp1.
rfImp2 <- varImp(model2, scale = FALSE)
print(rfImp2)

##                 Overall
## V1          6.008319352
## V2          6.308908170
## V3          0.571604465
## V4          7.187015958
## V5          2.131040245
## V6          0.211304611
## V7          0.025100355
## V8         -0.116980037
## V9         -0.003679481
## V10         0.024878337
## duplicate1  3.618101735

# Conditional inference forest on the same data (duplicate1 included),
# using cforest's default controls.
model3 <- cforest(
  y ~ .,
  data = simulated
)

# Variable importance for the forest, with varimp()'s default settings.
print(varimp(model3))

##          V1          V2          V3          V4          V5          V6 
##  6.66043731  6.19942369  0.04107867  7.80355110  1.86181623 -0.03307225 
##          V7          V8          V9         V10  duplicate1 
## -0.02430149 -0.04881394 -0.01498777 -0.06177746  2.92518849

# Boosted trees with a Gaussian (squared-error) loss; every other gbm setting
# is left at its default.
model4 <- gbm(
  y ~ .,
  data = simulated,
  distribution = "gaussian"
)

# Summarise the fit; the table lists each predictor's relative influence.
summary(model4)

##                   var   rel.inf
## V4                 V4 30.334001
## V2                 V2 22.513861
## V1                 V1 21.456642
## V5                 V5 12.779456
## V3                 V3  7.784866
## duplicate1 duplicate1  4.987620
## V7                 V7  0.143554
## V6                 V6  0.000000
## V8                 V8  0.000000
## V9                 V9  0.000000
## V10               V10  0.000000

# Cubist takes predictors and response separately, so split `simulated` into
# every column except "y" and the "y" column itself.
simulatedX <- simulated[, setdiff(colnames(simulated), "y")]
simulatedY <- simulated$y

# Fit a Cubist model with default settings.
model5 <- cubist(x = simulatedX, y = simulatedY)

# Print the rule set, training-set error, and attribute usage.
summary(model5)

## 
## Call:
## cubist.default(x = simulatedX, y = simulatedY)
## 
## 
## Cubist [Release 2.07 GPL Edition]  Sun Nov 19 17:41:52 2023
## ---------------------------------
## 
##     Target attribute `outcome'
## 
## Read 200 cases (12 attributes) from undefined.data
## 
## Model:
## 
##   Rule 1: [200 cases, mean 14.416183, range 3.55596 to 28.38167, est err 1.944664]
## 
##  outcome = 0.183529 + 8.9 V4 + 7.9 V1 + 7.1 V2 + 5.3 V5
## 
## 
## Evaluation on training data (200 cases):
## 
##     Average  |error|           2.199542
##     Relative |error|               0.54
##     Correlation coefficient        0.84
## 
## 
##  Attribute usage:
##    Conds  Model
## 
##           100%    V1
##           100%    V2
##           100%    V4
##           100%    V5
## 
## 
## Time: 0.0 secs

Problem 8.2

Problem 8.3

Problem 8.7

# Load the chemical manufacturing data set into the workspace.
data("ChemicalManufacturingProcess")

columns <- colnames(ChemicalManufacturingProcess)

# Median-impute missing values one column at a time, printing each column name
# as it is processed.
# NOTE(review): the medians are computed over all rows, and the train/test
# split happens later in the file, so held-out rows influence the imputed
# values -- confirm this is acceptable.
for (col in columns) {
  print(col)

  missing_rows <- is.na(ChemicalManufacturingProcess[[col]])
  if (any(missing_rows)) {
    ChemicalManufacturingProcess[missing_rows, col] <-
      median(ChemicalManufacturingProcess[[col]], na.rm = TRUE)
  }
}

## [1] "Yield"
## [1] "BiologicalMaterial01"
## [1] "BiologicalMaterial02"
## [1] "BiologicalMaterial03"
## [1] "BiologicalMaterial04"
## [1] "BiologicalMaterial05"
## [1] "BiologicalMaterial06"
## [1] "BiologicalMaterial07"
## [1] "BiologicalMaterial08"
## [1] "BiologicalMaterial09"
## [1] "BiologicalMaterial10"
## [1] "BiologicalMaterial11"
## [1] "BiologicalMaterial12"
## [1] "ManufacturingProcess01"
## [1] "ManufacturingProcess02"
## [1] "ManufacturingProcess03"
## [1] "ManufacturingProcess04"
## [1] "ManufacturingProcess05"
## [1] "ManufacturingProcess06"
## [1] "ManufacturingProcess07"
## [1] "ManufacturingProcess08"
## [1] "ManufacturingProcess09"
## [1] "ManufacturingProcess10"
## [1] "ManufacturingProcess11"
## [1] "ManufacturingProcess12"
## [1] "ManufacturingProcess13"
## [1] "ManufacturingProcess14"
## [1] "ManufacturingProcess15"
## [1] "ManufacturingProcess16"
## [1] "ManufacturingProcess17"
## [1] "ManufacturingProcess18"
## [1] "ManufacturingProcess19"
## [1] "ManufacturingProcess20"
## [1] "ManufacturingProcess21"
## [1] "ManufacturingProcess22"
## [1] "ManufacturingProcess23"
## [1] "ManufacturingProcess24"
## [1] "ManufacturingProcess25"
## [1] "ManufacturingProcess26"
## [1] "ManufacturingProcess27"
## [1] "ManufacturingProcess28"
## [1] "ManufacturingProcess29"
## [1] "ManufacturingProcess30"
## [1] "ManufacturingProcess31"
## [1] "ManufacturingProcess32"
## [1] "ManufacturingProcess33"
## [1] "ManufacturingProcess34"
## [1] "ManufacturingProcess35"
## [1] "ManufacturingProcess36"
## [1] "ManufacturingProcess37"
## [1] "ManufacturingProcess38"
## [1] "ManufacturingProcess39"
## [1] "ManufacturingProcess40"
## [1] "ManufacturingProcess41"
## [1] "ManufacturingProcess42"
## [1] "ManufacturingProcess43"
## [1] "ManufacturingProcess44"
## [1] "ManufacturingProcess45"

# Reproducible 75/25 train/test split of the rows.
set.seed(1234)

# Row indices for the training partition, drawn without replacement.
sample_set <- sample(
  nrow(ChemicalManufacturingProcess),
  size = round(0.75 * nrow(ChemicalManufacturingProcess)),
  replace = FALSE
)

# Sampled rows form the training set; all remaining rows form the test set.
train_set <- ChemicalManufacturingProcess[sample_set, ]
test_set <- ChemicalManufacturingProcess[-sample_set, ]

# Separate the predictors (everything except Yield) from the response.
train_setX <- train_set[, setdiff(colnames(train_set), "Yield")]
train_setY <- train_set[["Yield"]]

test_setX <- test_set[, setdiff(colnames(test_set), "Yield")]
test_setY <- test_set[["Yield"]]

DATA 624 - Homework 9

Kory Martin

2023-11-16

Problem 8.1

Problem 8.2

Problem 8.3

Problem 8.7