library(mlbench)
library(AppliedPredictiveModeling)
library(mice)
library(glmnet)
library(rpart.plot)
library(rpart)
library(randomForest)
library(caret)
library(gbm)
library(party)
set.seed(200)
simulated <- mlbench.friedman1(200,sd=1)
simulated <- cbind(simulated$x,simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~ ., data = simulated,
importance = T,
ntree = 1000)
rfImp1 <- varImp(model1, scale = F)
rfImp1
## Overall
## V1 8.83890885
## V2 6.49023056
## V3 0.67583163
## V4 7.58822553
## V5 2.27426009
## V6 0.17436781
## V7 0.15136583
## V8 -0.03078937
## V9 -0.02989832
## V10 -0.08529218
Answer: No. The random forest did not make meaningful use of the uninformative predictors V6-V10; their importance scores are all close to zero (and some are slightly negative).
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9396216
set.seed(200)
model2 <- randomForest(y ~ .,data = simulated,
importance = T,
ntree=1000)
rfImp2 <- varImp(model2,scale = F)
rfImp2
## Overall
## V1 5.98437487
## V2 6.25489202
## V3 0.43837543
## V4 7.15358387
## V5 2.48949664
## V6 0.11440487
## V7 -0.02794148
## V8 -0.06304965
## V9 0.01960016
## V10 0.04054481
## duplicate1 3.73591550
The importance of V1 dropped (from about 8.84 to 5.98). When a highly correlated predictor is added, the random forest splits on either copy more or less interchangeably, so the importance is shared between them and the original variable's score is reduced.
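To see the effect compound, here is a minimal sketch (not run above; duplicate2 and model2b are hypothetical names) that adds a second correlated copy of V1 and refits. V1's importance should drop further as the credit is shared three ways.
# Hypothetical follow-up: a second highly correlated copy of V1
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
set.seed(200)
model2b <- randomForest(y ~ ., data = simulated, importance = T, ntree = 1000)
varImp(model2b, scale = F)    # expect V1's score to drop again
simulated$duplicate2 <- NULL  # remove it so the later models are unaffected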
## Conditional inference trees (cforest)
model3 <- cforest(y~.,data = simulated)
rfImp3 <- varImp(model3,conditional = T)
rfImp3
## Overall
## V1 3.1693280188
## V2 4.8840435653
## V3 0.0098451007
## V4 6.2260531322
## V5 1.1189835747
## V6 -0.0018071530
## V7 0.0281145471
## V8 -0.0028838278
## V9 -0.0004290913
## V10 -0.0049792526
## duplicate1 0.8346559974
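Note how the conditional importance sharply shrinks duplicate1 (0.83 here versus 3.74 in the traditional random forest), because it accounts for the correlation with V1. For contrast, a minimal sketch computing the unconditional importance on the same cforest fit, where duplicate1 should score much higher:
# Unconditional (traditional) importance on the same fit, for comparison
varImp(model3, conditional = F)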
## Boosted Trees
gbmG <- expand.grid(.interaction.depth = seq(1, 7, by = 2),
.n.trees = seq(100, 1000, by = 100),
.shrinkage = c(0.01, 0.1),
.n.minobsinnode = 8)
set.seed(100)
gbmModel <- train(y ~ ., data = simulated,
method = "gbm",
tuneGrid = gbmG,
verbose = F
)
gbmModel
## Stochastic Gradient Boosting
##
## 200 samples
## 11 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees RMSE Rsquared MAE
## 0.01 1 100 4.178012 0.5995731 3.440232
## 0.01 1 200 3.622404 0.6786334 2.988966
## 0.01 1 300 3.244692 0.7172428 2.682294
## 0.01 1 400 2.973813 0.7460880 2.453410
## 0.01 1 500 2.762667 0.7702920 2.273950
## 0.01 1 600 2.604577 0.7883159 2.136035
## 0.01 1 700 2.469156 0.8038703 2.015820
## 0.01 1 800 2.362118 0.8156714 1.922345
## 0.01 1 900 2.273324 0.8248678 1.845458
## 0.01 1 1000 2.199770 0.8322394 1.782315
## 0.01 3 100 3.634945 0.7002001 2.986401
## 0.01 3 200 2.954822 0.7600813 2.421691
## 0.01 3 300 2.568118 0.7990696 2.095650
## 0.01 3 400 2.328693 0.8235169 1.888299
## 0.01 3 500 2.180180 0.8376818 1.759354
## 0.01 3 600 2.085584 0.8464500 1.676836
## 0.01 3 700 2.026024 0.8516805 1.624534
## 0.01 3 800 1.989089 0.8548408 1.591619
## 0.01 3 900 1.962865 0.8571933 1.568312
## 0.01 3 1000 1.945740 0.8586485 1.552526
## 0.01 5 100 3.446506 0.7285909 2.831779
## 0.01 5 200 2.767486 0.7803475 2.267982
## 0.01 5 300 2.414322 0.8137915 1.963803
## 0.01 5 400 2.217119 0.8329981 1.792983
## 0.01 5 500 2.106997 0.8431495 1.696944
## 0.01 5 600 2.043564 0.8489449 1.641401
## 0.01 5 700 2.004394 0.8526798 1.607344
## 0.01 5 800 1.981411 0.8547352 1.587122
## 0.01 5 900 1.966312 0.8560527 1.574756
## 0.01 5 1000 1.954311 0.8572874 1.564144
## 0.01 7 100 3.372906 0.7370016 2.763059
## 0.01 7 200 2.705003 0.7851620 2.210439
## 0.01 7 300 2.378531 0.8150526 1.928110
## 0.01 7 400 2.205493 0.8317320 1.777105
## 0.01 7 500 2.113172 0.8404191 1.695506
## 0.01 7 600 2.060774 0.8455228 1.649824
## 0.01 7 700 2.030694 0.8484757 1.623990
## 0.01 7 800 2.013265 0.8500857 1.609038
## 0.01 7 900 1.999323 0.8515334 1.597335
## 0.01 7 1000 1.989479 0.8525312 1.589274
## 0.10 1 100 2.195764 0.8285276 1.776892
## 0.10 1 200 1.959457 0.8503871 1.582230
## 0.10 1 300 1.932871 0.8526404 1.558672
## 0.10 1 400 1.926477 0.8526955 1.551107
## 0.10 1 500 1.932369 0.8517473 1.552430
## 0.10 1 600 1.934477 0.8511304 1.554695
## 0.10 1 700 1.946886 0.8490138 1.563973
## 0.10 1 800 1.948370 0.8486842 1.560990
## 0.10 1 900 1.953661 0.8479323 1.563938
## 0.10 1 1000 1.960700 0.8468495 1.569715
## 0.10 3 100 1.988825 0.8500829 1.585613
## 0.10 3 200 1.938198 0.8555845 1.542416
## 0.10 3 300 1.933977 0.8556436 1.539291
## 0.10 3 400 1.929703 0.8558850 1.534560
## 0.10 3 500 1.929368 0.8558105 1.533742
## 0.10 3 600 1.929929 0.8556187 1.535045
## 0.10 3 700 1.931114 0.8554105 1.536152
## 0.10 3 800 1.931348 0.8553588 1.536404
## 0.10 3 900 1.931773 0.8552943 1.536791
## 0.10 3 1000 1.931845 0.8552686 1.536923
## 0.10 5 100 2.049572 0.8406216 1.652940
## 0.10 5 200 2.022033 0.8436971 1.628273
## 0.10 5 300 2.018530 0.8438815 1.625497
## 0.10 5 400 2.017191 0.8439611 1.624523
## 0.10 5 500 2.016495 0.8439874 1.623664
## 0.10 5 600 2.016455 0.8439491 1.623628
## 0.10 5 700 2.016606 0.8439161 1.623759
## 0.10 5 800 2.016657 0.8438950 1.623783
## 0.10 5 900 2.016749 0.8438773 1.623799
## 0.10 5 1000 2.016787 0.8438720 1.623811
## 0.10 7 100 2.072232 0.8366470 1.664053
## 0.10 7 200 2.048919 0.8391755 1.642860
## 0.10 7 300 2.044841 0.8397148 1.639547
## 0.10 7 400 2.044210 0.8397409 1.638574
## 0.10 7 500 2.043446 0.8398404 1.637736
## 0.10 7 600 2.043324 0.8398537 1.637507
## 0.10 7 700 2.043368 0.8398486 1.637590
## 0.10 7 800 2.043500 0.8398325 1.637673
## 0.10 7 900 2.043454 0.8398480 1.637652
## 0.10 7 1000 2.043402 0.8398584 1.637585
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 400, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 8.
varImp(gbmModel)
## gbm variable importance
##
## Overall
## V4 100.0000
## V2 84.7556
## V1 67.9084
## V5 43.4834
## V3 32.3890
## duplicate1 28.4273
## V7 3.0259
## V6 3.0160
## V8 1.4249
## V10 0.7694
## V9 0.0000
## Cubist
library(Cubist)
cubistmodel <- train(y ~ .,data = simulated,method ="cubist")
varImp(cubistmodel)## cubist variable importance
##
## Overall
## V1 100.000
## V2 75.694
## V4 68.750
## V3 58.333
## V5 56.944
## V6 13.889
## duplicate1 2.083
## V7 0.000
## V8 0.000
## V9 0.000
## V10 0.000
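As a side note, a quick sketch to inspect which tuning values caret's resampling selected for the Cubist model:
cubistmodel$bestTune  # committees and neighbors chosen by resampling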
## Cubist, removing the duplicate variable
cubistModel2 <- train(y ~ ., data = subset(simulated, select = -duplicate1), method = "cubist")
varImp(cubistModel2)
## cubist variable importance
##
## Overall
## V1 100.00
## V2 75.69
## V4 68.06
## V3 58.33
## V5 55.56
## V6 15.28
## V9 0.00
## V7 0.00
## V8 0.00
## V10 0.00
## Boosted Trees, removing the duplicate variable
gbmModel <- train(y ~ ., data = subset(simulated, select = -duplicate1),
method = "gbm",
tuneGrid = gbmG,
verbose = F
)
varImp(gbmModel)
## gbm variable importance
##
## Overall
## V4 100.0000
## V1 92.3936
## V2 78.5856
## V5 40.7876
## V3 25.7773
## V7 3.2176
## V6 2.0752
## V8 0.4303
## V9 0.4180
## V10 0.0000
x1 <- rep(1, times = 200)              # 1 distinct value (constant)
x2 <- rep(1:10, times = 20)            # 10 distinct values
x3 <- rep(6:7, each = 5, times = 20)   # 2 distinct values
x4 <- rep(4:8, each = 2, times = 20)   # 5 distinct values
y <- x1 + x2 + x3 + x4
data <- data.frame(cbind(y,x1,x2,x3,x4))
model <- rpart(y ~.,data=data)
varImp(model)
## Overall
## x2 4.1270925
## x3 0.7945946
## x4 4.0832487
## x1 0.0000000
This demonstrates the selection bias of single regression trees: predictors with more distinct values (finer granularity) are favored over predictors with fewer. x2 and x4, which take the most distinct values, receive the highest importance, while x1, which is constant, receives an importance of 0.
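To make the bias concrete, a hedged sketch (the noise column and model_b are hypothetical): a continuous pure-noise predictor with many distinct values can still attract splits in a single tree, while the constant x1 never can.
set.seed(10)
data$noise <- rnorm(200)   # uninformative, but 200 distinct values
model_b <- rpart(y ~ ., data = data)
varImp(model_b)            # noise may receive a nonzero importance
data$noise <- NULL         # clean up the illustration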
a-)
The right-hand model was tuned with a high bagging fraction and a high learning rate (shrinkage). A high learning rate makes each tree contribute a large share of the final prediction, so the model commits quickly to the strongest predictors; a high bagging fraction means every tree sees nearly the same data. Both choices make the model sharper and more selective, concentrating importance on a few predictors.
b-)
The model on the right is more selective and leans heavily on a few important variables, which can make it accurate with few predictors; the model on the left spreads importance across more variables and can also predict well, just using more of them. Because it depends less on a handful of predictors, the left-hand model is likely to generalize better to other samples.
c-)
Increasing the interaction depth lets each tree use more predictors, which spreads importance across more variables and flattens the slope of the predictor importance plot: the deeper the trees, the less the top predictors dominate.
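A minimal sketch of the comparison behind these answers, assuming the two models in the figure differ in bagging fraction and shrinkage (0.1 versus 0.9), fit here with gbm on the simulated data (n.minobsinnode is lowered so the small bags can still split):
gbmLeft <- gbm(y ~ ., data = simulated, distribution = "gaussian",
n.trees = 100, interaction.depth = 3,
shrinkage = 0.1, bag.fraction = 0.1, n.minobsinnode = 5)
gbmRight <- gbm(y ~ ., data = simulated, distribution = "gaussian",
n.trees = 100, interaction.depth = 3,
shrinkage = 0.9, bag.fraction = 0.9, n.minobsinnode = 5)
summary(gbmLeft, plotit = F)   # importance spread over more predictors
summary(gbmRight, plotit = F)  # importance concentrated on a few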
data("ChemicalManufacturingProcess")
data <- ChemicalManufacturingProcess
Amelia::missmap(data)
temp <- mice(data, m = 5, maxit = 50, method = 'pmm', seed = 500, printFlag = F)
## Warning: Number of logged events: 6750
imputed.data <- complete(temp)
imputed.data <- imputed.data[,-nearZeroVar(imputed.data)]
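A quick sanity check (a minimal sketch) that imputation and the near-zero-variance filter left a complete data set:
sum(is.na(imputed.data))  # expect 0 after mice imputation
dim(imputed.data)         # rows by remaining columns (including Yield)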
Amelia::missmap(imputed.data)
set.seed(250)
yield_data <- createDataPartition(imputed.data$Yield, p=0.8, list=F)
train_data <- imputed.data[yield_data, ]
test_data <- imputed.data[-yield_data, ]
## Model 1
model_1 <- randomForest(Yield ~ .,data = train_data,
importance = T,
ntree=1000)
varImp(model_1, scale = F)
## Overall
## BiologicalMaterial01 0.0426966936
## BiologicalMaterial02 0.1240280327
## BiologicalMaterial03 0.2670477431
## BiologicalMaterial04 0.0738353195
## BiologicalMaterial05 0.0390409516
## BiologicalMaterial06 0.3027663975
## BiologicalMaterial08 0.0729511657
## BiologicalMaterial09 0.0372654223
## BiologicalMaterial10 0.0131264695
## BiologicalMaterial11 0.1105449282
## BiologicalMaterial12 0.2926017443
## ManufacturingProcess01 0.0512100922
## ManufacturingProcess02 0.0230209516
## ManufacturingProcess03 0.0046418270
## ManufacturingProcess04 0.0175594873
## ManufacturingProcess05 0.0203529768
## ManufacturingProcess06 0.0613674512
## ManufacturingProcess07 -0.0012368999
## ManufacturingProcess08 0.0018919718
## ManufacturingProcess09 0.0969428522
## ManufacturingProcess10 0.0048648135
## ManufacturingProcess11 0.0278007086
## ManufacturingProcess12 0.0069989400
## ManufacturingProcess13 0.1662447869
## ManufacturingProcess14 0.0015902013
## ManufacturingProcess15 0.0059216136
## ManufacturingProcess16 0.0202062122
## ManufacturingProcess17 0.1087102678
## ManufacturingProcess18 0.0405865801
## ManufacturingProcess19 0.0135716556
## ManufacturingProcess20 0.0369963520
## ManufacturingProcess21 0.0112075876
## ManufacturingProcess22 0.0090416783
## ManufacturingProcess23 0.0029466812
## ManufacturingProcess24 0.0192405975
## ManufacturingProcess25 0.0408411742
## ManufacturingProcess26 0.0079980450
## ManufacturingProcess27 0.0359580098
## ManufacturingProcess28 0.1069922929
## ManufacturingProcess29 0.0242011249
## ManufacturingProcess30 0.0333595148
## ManufacturingProcess31 0.1588404515
## ManufacturingProcess32 1.1421379654
## ManufacturingProcess33 0.0627240975
## ManufacturingProcess34 0.0062974536
## ManufacturingProcess35 -0.0021251947
## ManufacturingProcess36 0.1853098443
## ManufacturingProcess37 0.0071797826
## ManufacturingProcess38 0.0040607220
## ManufacturingProcess39 0.0548080692
## ManufacturingProcess40 -0.0010118684
## ManufacturingProcess41 0.0036438697
## ManufacturingProcess42 0.0197711583
## ManufacturingProcess43 0.0180009404
## ManufacturingProcess44 0.0065751749
## ManufacturingProcess45 0.0005136061
m1Pred <- predict(model_1,newdata = test_data)
postResample(pred = m1Pred, obs = test_data$Yield)
## RMSE Rsquared MAE
## 1.0613079 0.7141271 0.7959005
## Model 2
model_2 <- cforest(Yield~.,data = train_data)
varImp(model_2, conditional = T)
## Overall
## BiologicalMaterial01 0.0030101489
## BiologicalMaterial02 0.0552226104
## BiologicalMaterial03 0.0471211332
## BiologicalMaterial04 0.0210819842
## BiologicalMaterial05 0.0084635541
## BiologicalMaterial06 0.0621491894
## BiologicalMaterial08 0.0102516839
## BiologicalMaterial09 0.0001837210
## BiologicalMaterial10 -0.0003240165
## BiologicalMaterial11 0.0102918968
## BiologicalMaterial12 0.0297864445
## ManufacturingProcess01 0.0020023506
## ManufacturingProcess02 0.0066310054
## ManufacturingProcess03 -0.0038548565
## ManufacturingProcess04 -0.0001491352
## ManufacturingProcess05 0.0029402878
## ManufacturingProcess06 0.0418093763
## ManufacturingProcess07 -0.0009710851
## ManufacturingProcess08 -0.0018998929
## ManufacturingProcess09 0.0865705386
## ManufacturingProcess10 0.0016134526
## ManufacturingProcess11 0.0160208912
## ManufacturingProcess12 0.0095589829
## ManufacturingProcess13 0.0701623767
## ManufacturingProcess14 -0.0007784407
## ManufacturingProcess15 0.0018814450
## ManufacturingProcess16 0.0021811429
## ManufacturingProcess17 0.0404237411
## ManufacturingProcess18 0.0049236461
## ManufacturingProcess19 0.0039359128
## ManufacturingProcess20 0.0109191683
## ManufacturingProcess21 -0.0015598975
## ManufacturingProcess22 0.0006088870
## ManufacturingProcess23 0.0006669005
## ManufacturingProcess24 0.0015587443
## ManufacturingProcess25 0.0048959455
## ManufacturingProcess26 0.0031421256
## ManufacturingProcess27 0.0077562831
## ManufacturingProcess28 0.0036051715
## ManufacturingProcess29 0.0243083982
## ManufacturingProcess30 0.0030013839
## ManufacturingProcess31 0.0373292819
## ManufacturingProcess32 0.2284602646
## ManufacturingProcess33 0.0371538119
## ManufacturingProcess34 0.0074333863
## ManufacturingProcess35 0.0020324927
## ManufacturingProcess36 0.0857178519
## ManufacturingProcess37 0.0071399660
## ManufacturingProcess38 -0.0019010303
## ManufacturingProcess39 0.0077861400
## ManufacturingProcess40 -0.0003153308
## ManufacturingProcess41 0.0003226814
## ManufacturingProcess42 -0.0008832979
## ManufacturingProcess43 0.0038618663
## ManufacturingProcess44 0.0062521291
## ManufacturingProcess45 0.0007253181
m2Pred <- predict(model_2,newdata = test_data)
postResample(pred = m2Pred, obs = test_data$Yield)
## RMSE Rsquared MAE
## 1.3013265 0.5610503 1.0313082
## Model 3
gbmModel <- train(Yield ~ ., data = train_data,
method = "gbm",
tuneGrid = gbmG,
verbose = F
)
varImp(gbmModel)
## gbm variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess09 18.349
## ManufacturingProcess06 15.948
## BiologicalMaterial12 15.750
## ManufacturingProcess17 11.702
## ManufacturingProcess13 11.172
## BiologicalMaterial11 8.631
## ManufacturingProcess31 8.455
## BiologicalMaterial03 8.372
## BiologicalMaterial09 7.724
## BiologicalMaterial06 7.263
## ManufacturingProcess01 7.211
## ManufacturingProcess05 7.000
## ManufacturingProcess25 6.271
## ManufacturingProcess28 5.278
## BiologicalMaterial02 5.137
## BiologicalMaterial05 5.129
## ManufacturingProcess37 4.591
## BiologicalMaterial04 4.562
## ManufacturingProcess18 4.530
gbmPred <- predict(gbmModel,newdata = test_data)
postResample(pred = gbmPred, obs = test_data$Yield)
## RMSE Rsquared MAE
## 0.9293387 0.7664653 0.7422395
## Model 4
plsModel <- train(
Yield~., data = train_data, method = "pls",
trControl = trainControl("cv", number = 10),
scale = T,
tuneLength = 20
)
varImp(plsModel)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 76.62
## ManufacturingProcess13 71.23
## ManufacturingProcess09 69.26
## ManufacturingProcess17 66.09
## ManufacturingProcess06 61.12
## ManufacturingProcess33 61.00
## BiologicalMaterial02 56.78
## BiologicalMaterial06 54.14
## BiologicalMaterial08 51.87
## BiologicalMaterial12 48.38
## BiologicalMaterial11 48.19
## BiologicalMaterial03 48.05
## ManufacturingProcess12 48.02
## BiologicalMaterial01 47.92
## ManufacturingProcess29 47.55
## BiologicalMaterial04 45.74
## ManufacturingProcess04 41.77
## ManufacturingProcess28 38.63
## ManufacturingProcess37 38.16
plsPred <- predict(plsModel,newdata = test_data)
postResample(pred = plsPred, obs = test_data$Yield)
## RMSE Rsquared MAE
## 1.6880579 0.3808667 1.1458658
data.frame(rbind(randomForest = postResample(pred = m1Pred, obs = test_data$Yield),
cforest = postResample(pred = m2Pred, obs = test_data$Yield),
Gbm = postResample(pred = gbmPred, obs = test_data$Yield),
pls = postResample(pred = plsPred, obs = test_data$Yield)))
## RMSE Rsquared MAE
## randomForest 1.0613079 0.7141271 0.7959005
## cforest 1.3013265 0.5610503 1.0313082
## Gbm 0.9293387 0.7664653 0.7422395
## pls 1.6880579 0.3808667 1.1458658
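As a visual companion, a short sketch using caret's plot method for varImp objects to display the top predictors of the best-performing GBM model:
plot(varImp(gbmModel), top = 10)  # top 10 predictors of the GBM model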
varImp(plsModel)
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 76.62
## ManufacturingProcess13 71.23
## ManufacturingProcess09 69.26
## ManufacturingProcess17 66.09
## ManufacturingProcess06 61.12
## ManufacturingProcess33 61.00
## BiologicalMaterial02 56.78
## BiologicalMaterial06 54.14
## BiologicalMaterial08 51.87
## BiologicalMaterial12 48.38
## BiologicalMaterial11 48.19
## BiologicalMaterial03 48.05
## ManufacturingProcess12 48.02
## BiologicalMaterial01 47.92
## ManufacturingProcess29 47.55
## BiologicalMaterial04 45.74
## ManufacturingProcess04 41.77
## ManufacturingProcess28 38.63
## ManufacturingProcess37 38.16
varImp(gbmModel)
## gbm variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess09 18.349
## ManufacturingProcess06 15.948
## BiologicalMaterial12 15.750
## ManufacturingProcess17 11.702
## ManufacturingProcess13 11.172
## BiologicalMaterial11 8.631
## ManufacturingProcess31 8.455
## BiologicalMaterial03 8.372
## BiologicalMaterial09 7.724
## BiologicalMaterial06 7.263
## ManufacturingProcess01 7.211
## ManufacturingProcess05 7.000
## ManufacturingProcess25 6.271
## ManufacturingProcess28 5.278
## BiologicalMaterial02 5.137
## BiologicalMaterial05 5.129
## ManufacturingProcess37 4.591
## BiologicalMaterial04 4.562
## ManufacturingProcess18 4.530
Comparing the GBM model with the linear PLS model, the "ManufacturingProcess" variables dominate both importance lists. The ordering shifts somewhat between the two, but the top ten predictors are very similar. On the test-set metrics (RMSE and R-squared), the GBM model clearly performs better.
rpartModel <- train(Yield ~ ., data = train_data,
method = "rpart2",
tuneLength = 10,
trControl = trainControl(method = "cv")
)
rpart.plot(rpartModel$finalModel)
Observing the tree, we can see more clearly that ManufacturingProcess32 has to be less than 160, ManufacturingProcess06 less than 208, and the biological material less than 20. These splits involve the three top predictors, the variables with the most influence on the model and on the Yield results.
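As a complement to the plot, a minimal sketch: rpart.plot also provides rpart.rules(), which prints the same splits as plain-text rules and makes the thresholds easier to read off.
rpart.rules(rpartModel$finalModel)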