Do problems 8.1, 8.2, 8.3, and 8.7 in Kuhn and Johnson. Please submit the Rpubs link along with the .rmd file.
#install.packages("mlbench")
library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
print(head(simulated))
## V1 V2 V3 V4 V5 V6 V7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
## V8 V9 V10 y
## 1 0.4214081 0.59111440 0.5886216 18.46398
## 2 0.8446239 0.92819306 0.7584008 16.09836
## 3 0.3495908 0.01759542 0.4441185 17.76165
## 4 0.7970260 0.68986918 0.4450716 13.78730
## 5 0.9038919 0.39696995 0.5500808 18.42984
## 6 0.6489177 0.53116033 0.9066182 20.85817
#install.packages("randomForest")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
print(model1)
##
## Call:
## randomForest(formula = y ~ ., data = simulated, importance = TRUE, ntree = 1000)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 6.754258
## % Var explained: 72.3
rfImp1 <- varImp(model1, scale = FALSE)
print(rfImp1)
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
The random forest did not make significant use of the uninformative predictors (V6–V10). Based on the varImp output, those predictors have importance values that are either very close to zero or negative, meaning they contributed essentially nothing to predicting the response, while the informative predictors (especially V1, V2, and V4) carry nearly all of the importance.
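The conditional inference forest below includes duplicate1, a predictor highly correlated with V1, as called for in part (b) of the exercise. A minimal sketch of creating it, following the textbook's suggested code (the exact correlation depends on the random-number state):
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)  # should be high, roughly 0.9 or above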
#install.packages("partykit")
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
#library(partykit)
ctrl <- cforest_control(mtry = 2, ntree = 1000)
model_cforest <- cforest(y ~ ., data = simulated, controls = ctrl)
imp_noncond <- varimp(model_cforest, conditional = FALSE)
imp_cond <- varimp(model_cforest, conditional = TRUE)
print(imp_noncond)
## V1 V2 V3 V4 V5 V6
## 3.166087285 3.557760232 0.083675313 3.929311188 1.127230728 -0.008267497
## V7 V8 V9 V10 duplicate1
## -0.004826123 -0.044799674 0.057038436 -0.018890770 3.418620783
print(imp_cond)
## V1 V2 V3 V4 V5 V6
## 1.428160582 2.634637732 0.053824081 2.913744526 0.634624879 0.001987701
## V7 V8 V9 V10 duplicate1
## -0.040865904 -0.011076562 0.013321626 -0.002146922 1.430874011
The two importance rankings differ from one another; toggling the conditional argument changes the scores. When conditional is set to TRUE, the correlation between predictor variables is taken into account, which is why duplicate1 and V1 both receive lower importance scores in the conditional version: the model is adjusting for the fact that these two nearly identical variables carry the same information.
## Splitting the predictors (X) from the response (y)
## With duplicate1
y_wd <- simulated$y
X_wd <- simulated[, !(names(simulated) %in% "y")]
## Without duplicate1
y_nd <- simulated$y
X_nd <- simulated[, !(names(simulated) %in% c("y", "duplicate1"))]
## Doing with Boosted Trees
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
gbmModel_wd <- gbm.fit(X_wd, y_wd, distribution = "gaussian")
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 24.3703 nan 0.0010 0.0113
## 2 24.3578 nan 0.0010 0.0106
## 3 24.3455 nan 0.0010 0.0110
## 4 24.3302 nan 0.0010 0.0113
## 5 24.3180 nan 0.0010 0.0088
## 6 24.3053 nan 0.0010 0.0131
## 7 24.2906 nan 0.0010 0.0132
## 8 24.2787 nan 0.0010 0.0093
## 9 24.2662 nan 0.0010 0.0103
## 10 24.2500 nan 0.0010 0.0099
## 20 24.1283 nan 0.0010 0.0091
## 40 23.8739 nan 0.0010 0.0116
## 60 23.6387 nan 0.0010 0.0087
## 80 23.3898 nan 0.0010 0.0102
## 100 23.1433 nan 0.0010 0.0113
print(summary(gbmModel_wd))
## var rel.inf
## V1 V1 30.12091
## V2 V2 25.89821
## duplicate1 duplicate1 25.88611
## V4 V4 18.09476
## V3 V3 0.00000
## V5 V5 0.00000
## V6 V6 0.00000
## V7 V7 0.00000
## V8 V8 0.00000
## V9 V9 0.00000
## V10 V10 0.00000
gbmModel_nd <- gbm.fit(X_nd, y_nd, distribution = "gaussian")
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 24.3698 nan 0.0010 0.0122
## 2 24.3601 nan 0.0010 0.0073
## 3 24.3501 nan 0.0010 0.0094
## 4 24.3349 nan 0.0010 0.0114
## 5 24.3229 nan 0.0010 0.0090
## 6 24.3102 nan 0.0010 0.0113
## 7 24.2991 nan 0.0010 0.0096
## 8 24.2862 nan 0.0010 0.0063
## 9 24.2727 nan 0.0010 0.0108
## 10 24.2591 nan 0.0010 0.0109
## 20 24.1341 nan 0.0010 0.0100
## 40 23.8937 nan 0.0010 0.0100
## 60 23.6474 nan 0.0010 0.0084
## 80 23.4178 nan 0.0010 0.0054
## 100 23.1875 nan 0.0010 0.0092
print(summary(gbmModel_nd))
## var rel.inf
## V1 V1 56.65060
## V2 V2 23.92152
## V4 V4 19.42789
## V3 V3 0.00000
## V5 V5 0.00000
## V6 V6 0.00000
## V7 V7 0.00000
## V8 V8 0.00000
## V9 V9 0.00000
## V10 V10 0.00000
Similar to the previous models, running the boosted tree with and without the duplicate column shifts the predictor importances. With duplicate1 included, V1 is still the most important variable (relative influence of about 30.1), but its credit is diluted: V2 and duplicate1 are essentially tied just behind it at about 25.9 each. When the duplicate predictor is removed, V1's relative influence jumps to about 56.7, with V2 second at about 23.9. Finally, gbm does not offer a conditional argument to toggle, so the comparison here is simply with and without the duplicate1 variable.
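The training log above shows that gbm.fit() ran with its defaults (100 iterations, step size 0.001, depth 1), which is why only a handful of predictors receive nonzero relative influence. A hedged sketch of a more thorough fit, with illustrative (not tuned) values for n.trees, shrinkage, and interaction.depth:
# Illustrative settings only; a proper analysis would tune these via caret
gbmModel_wd2 <- gbm.fit(X_wd, y_wd, distribution = "gaussian",
                        n.trees = 1000, shrinkage = 0.01,
                        interaction.depth = 2, verbose = FALSE)
# plotit = FALSE returns the relative-influence table without drawing the bar chart
print(summary(gbmModel_wd2, plotit = FALSE))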
## Doing with Cubist
library(Cubist)
cubistMod_wd <- cubist(X_wd, y_wd)
## WITH DUPE
# party's varimp() only works on cforest objects, and Cubist has no
# conditional-importance toggle, so caret's usage-based varImp() is used instead
print(varImp(cubistMod_wd))
## NO DUPE
cubistMod_nd <- cubist(X_nd, y_nd)
print(varImp(cubistMod_nd))
Cubist does not expose a conditional-importance toggle the way cforest does; its importance scores reflect how often each predictor is used in rule conditions and in the rule-level linear models. So, as with the boosted tree, the comparison is between a fit that keeps duplicate1 and one that drops it. With the duplicate included, the credit for the V1 signal should be shared between V1 and duplicate1; once duplicate1 is removed, that credit should consolidate on V1, mirroring the pattern seen in the random forest and boosted models.
library(rpart)
set.seed(123)
n <- 500
## To answer this question we build a data set with one continuous predictor and two categorical predictors that have different numbers of levels.
# Continuous predictor
X1 <- rnorm(n)
# Categorical predictor with 3 levels
X2 <- sample(letters[1:3], n, replace = TRUE)
# Categorical predictor with 10 levels
X3 <- sample(letters[1:10], n, replace = TRUE)
# Generating the target variable (pure noise, unrelated to the predictors)
y <- rnorm(n)
data <- data.frame(X1 = X1, X2 = as.factor(X2), X3 = as.factor(X3), y = y)
# Fitting a simple rpart tree
tree_model <- rpart(y ~ ., data = data)
# Variable importance
print(varImp(tree_model))
## Overall
## X1 0.062492009
## X2 0.005479156
## X3 0.064358816
In the variable importance output one can see the different levels of importance assigned to these completely random predictors. The more distinct split points a variable offers (the continuous X1 and the 10-level X3), the more importance the tree attributes to it, while the 3-level X2 receives far less. This is a selection bias inherent to how CART-style trees choose splits: there is no real relationship in the data at all, yet the more granular categorical variable and the continuous variable are ranked as more important.
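One way to check this selection bias is to fit a conditional inference tree on the same random data: because ctree() chooses splits with statistical significance tests, it would be expected to make no splits at all here. A sketch, reusing the party package loaded earlier and the random data frame built above:
# With purely random predictors, the conditional inference tree should
# typically stop at the root node
ctree_model <- ctree(y ~ ., data = data)
print(ctree_model)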
In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
Looking at the two models displayed in Figure 8.24, the left-hand and right-hand fits assign noticeably different importance scores to the variables. The left-hand model uses the lower bagging fraction and learning rate: each tree sees a smaller random subsample of the data and contributes only a small correction, so importance is spread more evenly across the predictors and reflects the data more holistically. With the larger values on the right, the model commits quickly to a few strong predictors, and the importance scores concentrate on that select handful of variables.
The model with the smaller learning rate and bagging fraction would likely be more predictive of other samples, since it builds up its fit slowly from many small random subsamples and so captures a more well-rounded picture of the patterns that are truly present. This approach may make the model slightly more sensitive to noise in the data, but overall it should be more reflective of the structure that would appear in new samples.
Increasing the interaction depth lets each tree consider more variables, so the model can capture how combinations of predictors influence the outcome, something a very shallow tree cannot do; importance therefore spreads across more variables. That said, deeper trees also open the model up to overfitting the training data.
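As a rough sketch of how the two extremes in Figure 8.24 could be reproduced on the solubility data (assuming the AppliedPredictiveModeling package and its solubility objects are available; the parameter settings, not the exact influence values, are the point):
library(AppliedPredictiveModeling)
data(solubility)  # provides solTrainXtrans and solTrainY, among others
gbm_low  <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                    n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1, verbose = FALSE)
gbm_high <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                    n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9, verbose = FALSE)
# The high-shrinkage / high-bag-fraction fit should concentrate its relative
# influence on far fewer predictors than the low-value fit
print(head(summary(gbm_low, plotit = FALSE), 10))
print(head(summary(gbm_high, plotit = FALSE), 10))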
Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
## Data from 6.3
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
## Taking code from last hw
imputed <- preProcess(ChemicalManufacturingProcess, method = "medianImpute")
chem_df_imputed <- predict(imputed, ChemicalManufacturingProcess)
## Establishing the x and y vars
X <- chem_df_imputed[, -which(names(chem_df_imputed) == "Yield")]
y <- chem_df_imputed$Yield
## Putting together for splitting
ChemData <- list(x = X, y = y)
## Splitting 70/30
splitIndex <- createDataPartition(ChemData$y, p = 0.7, list = FALSE)
chemtrainingData <- list(x = data.frame(ChemData$x[splitIndex, ]),y = ChemData$y[splitIndex])
chemtestData <- list( x = data.frame(ChemData$x[-splitIndex, ]),y = ChemData$y[-splitIndex])
### Beginning of Tree modeling
train_control <- trainControl(method = "cv", number = 5)
## RANDOM FOREST
set.seed(123)
rf_model <- train(x = chemtrainingData$x, y = chemtrainingData$y, method = "rf", trControl = train_control, importance = TRUE)
print(rf_model)
## Random Forest
##
## 124 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 99, 99, 99, 100, 99
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.283451 0.5169752 1.0650397
## 29 1.200165 0.5320328 0.9535206
## 57 1.204947 0.5203852 0.9331334
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
# Decision Tree
set.seed(123)
dt_model <- train( x = chemtrainingData$x, y = chemtrainingData$y, method = "rpart", trControl = train_control)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
print(dt_model)
## CART
##
## 124 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 99, 99, 99, 100, 99
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.06580774 1.445622 0.3298998 1.135981
## 0.08304053 1.384817 0.3739533 1.111718
## 0.35700028 1.540417 0.2856842 1.275779
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.08304053.
#Boosted tree
set.seed(123)
gbm_model <- train( x = chemtrainingData$x, y = chemtrainingData$y, method = "gbm", trControl = train_control, verbose = FALSE)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
print(gbm_model)
## Stochastic Gradient Boosting
##
## 124 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 99, 99, 99, 100, 99
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 1.256130 0.4746139 0.9867910
## 1 100 1.219050 0.5017183 0.9644389
## 1 150 1.217129 0.5041514 0.9705865
## 2 50 1.210074 0.5065611 0.9554854
## 2 100 1.198013 0.5252308 0.9304255
## 2 150 1.189514 0.5312462 0.9378008
## 3 50 1.211165 0.5069573 0.9696533
## 3 100 1.208305 0.5177520 0.9428995
## 3 150 1.192893 0.5295323 0.9304069
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 2, shrinkage = 0.1 and n.minobsinnode = 10.
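The warnings during GBM training flag BiologicalMaterial07 as having no variation within some resamples; if desired, near-zero-variance columns could be screened out before training. A small sketch using caret's nearZeroVar() (a check only, not applied to the models above; whether that column gets flagged depends on the default cutoffs):
nzv_cols <- nearZeroVar(chemtrainingData$x, names = TRUE)
print(nzv_cols)  # names of predictors with (near-)zero variance in the training split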
#### MODEL EVALUATION
# Random Forest Predictions
rf_preds <- predict(rf_model, newdata = chemtestData$x)
rf_rmse <- RMSE(rf_preds, chemtestData$y)
# Decision Tree Predictions
dt_preds <- predict(dt_model, newdata = chemtestData$x)
dt_rmse <- RMSE(dt_preds, chemtestData$y)
# Boosted Tree Predictions
gbm_preds <- predict(gbm_model, newdata = chemtestData$x)
gbm_rmse <- RMSE(gbm_preds, chemtestData$y)
# Print RMSE results
print(paste("Random Forest RMSE:", rf_rmse)) # 0.770559334916787
## [1] "Random Forest RMSE: 1.19858675635713"
print(paste("Decision Tree (CART) RMSE:", dt_rmse))#1.16107886521949
## [1] "Decision Tree (CART) RMSE: 1.60062410501889"
print(paste("Boosted Tree (GBM) RMSE:", gbm_rmse))#0.872160081853729
## [1] "Boosted Tree (GBM) RMSE: 1.1762031938977"
In this instance of the three models that I built the Random Forest provided the best results with the lowest RMSE. The RMSE for this model was 0.77.
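For a resampling-based comparison rather than a single test split, caret's resamples() can also summarize the cross-validation results of the three fits side by side; a brief sketch, assuming the model objects above are still in memory:
model_comparison <- resamples(list(RF = rf_model, CART = dt_model, GBM = gbm_model))
print(summary(model_comparison))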
Comparing the top predictors here with those from the linear and nonlinear models of the earlier exercises:
According to the random forest in this homework, ManufacturingProcess32 dominates the importance ranking, but the rest of the top ten is evenly split: five manufacturing process variables and five biological materials, with three of the top four slots (BiologicalMaterial03, 12, and 11) going to biological variables. This differs from the linear model results, where the top six variables were all manufacturing, and from the nonlinear model, where four of the top five were manufacturing. The tree-based model is the one that ranks the most biological variables highly.
rf_varimp <- varImp(rf_model)
print(rf_varimp)
## rf variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial03 48.35
## BiologicalMaterial12 39.77
## BiologicalMaterial11 38.59
## ManufacturingProcess09 38.43
## ManufacturingProcess17 37.73
## BiologicalMaterial06 36.57
## ManufacturingProcess13 32.65
## ManufacturingProcess36 30.25
## BiologicalMaterial02 27.90
## ManufacturingProcess11 27.63
## ManufacturingProcess39 26.93
## ManufacturingProcess31 25.96
## ManufacturingProcess01 25.10
## ManufacturingProcess27 24.73
## ManufacturingProcess20 23.95
## ManufacturingProcess02 22.42
## BiologicalMaterial10 22.41
## BiologicalMaterial09 21.84
## ManufacturingProcess21 21.53
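To make the biological-versus-process split at the top of the ranking explicit, the top ten can be tallied directly from the varImp object; a small sketch (imp_df and top10 are illustrative names):
imp_df <- rf_varimp$importance
top10 <- rownames(imp_df)[order(imp_df$Overall, decreasing = TRUE)][1:10]
print(top10)
print(table(ifelse(grepl("^Biological", top10), "Biological", "Manufacturing")))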
library(rpart.plot)
rpart.plot(dt_model$finalModel)
print(dt_model$finalModel)
## n= 124
##
## node), split, n, deviance, yval
##       * denotes terminal node
##
## 1) root 124 361.1538 40.15032
##   2) ManufacturingProcess32< 159.5 72 120.8465 39.28375 *
##   3) ManufacturingProcess32>=159.5 52 111.3753 41.35019 *
The single decision tree splits only on ManufacturingProcess32, reinforcing that it is the dominant predictor. The diagram shows the split point (about 159.5), the predicted yield in each node, and the percentage of the data falling on each side of the split (58% vs. 42%). In that sense it does convey additional information about how this predictor relates to yield, and it makes the model much easier to visualize.