set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
The random forest variable importance scores reveal that V1-V5 were the dominant variables, while V6-V10 (the uninformative predictors) were not used significantly.
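The later steps reference a correlated predictor, duplicate1 (column 12 of simulated), whose construction is not shown above. A minimal sketch of the standard approach for this exercise, assuming the duplicate is V1 plus a small amount of noise:
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1 # assumed construction: highly correlated with V1
cor(simulated$duplicate1, simulated$V1) # should be roughly 0.94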
set.seed(200)
ctrl <- cforest_control(mtry = ncol(simulated) - 1)
cf_fit <- party::cforest(y ~ ., data = simulated, controls = ctrl)
cfImp1 <- varImp(cf_fit)
cfImp2 <- varImp(cf_fit, conditional = TRUE)
cfImp1 <- cfImp1 %>% setNames(c("cf_ti"))
cfImp2 <- cfImp2 %>% setNames(c("cf_ci"))
names <- rownames(cfImp2)
names[11] <- "V11"
Imp_cf <- cbind(cfImp1, cfImp2)
rownames(Imp_cf) <- NULL
Imp_cf <- cbind(names, Imp_cf)
Imp_cf
## names cf_ti cf_ci
## 1 V1 3.75726531 0.600678542
## 2 V2 7.35210373 4.588727102
## 3 V3 0.02552462 0.005858870
## 4 V4 9.95219052 6.140435409
## 5 V5 2.07525107 0.726520861
## 6 V6 -0.03627375 0.003208094
## 7 V7 0.03866849 0.021965866
## 8 V8 -0.04209865 -0.004550812
## 9 V9 -0.02705505 0.003543102
## 10 V10 0.02546114 0.014950855
## 11 V11 5.51730707 0.774554525
The pattern of importance is broadly the same: V6-V10 remain unimportant. Note, however, that the conditional importance measure sharply reduces the scores of the correlated pair (V1 and the duplicate, labeled V11 here) relative to the traditional measure, which splits high importance between them.
simulated_orig <- simulated[, -12] # drop duplicate1 to recover the original ten predictors
gbm_fit <- gbm.fit(simulated_orig[, -11], simulated_orig[, 11], n.trees = 100, distribution = "gaussian")
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 24.3716 nan 0.0010 0.0101
## 2 24.3587 nan 0.0010 0.0101
## 3 24.3460 nan 0.0010 0.0104
## 4 24.3345 nan 0.0010 0.0084
## 5 24.3227 nan 0.0010 0.0084
## 6 24.3126 nan 0.0010 0.0058
## 7 24.3006 nan 0.0010 0.0109
## 8 24.2875 nan 0.0010 0.0108
## 9 24.2755 nan 0.0010 0.0100
## 10 24.2643 nan 0.0010 0.0110
## 20 24.1394 nan 0.0010 0.0086
## 40 23.8966 nan 0.0010 0.0042
## 60 23.6548 nan 0.0010 0.0112
## 80 23.4069 nan 0.0010 0.0109
## 100 23.1839 nan 0.0010 0.0083
gbmImp1 <- varImp(gbm_fit, numTrees = 100)
gbmImp1 <- gbmImp1 %>% add_row(Overall=NA) %>% setNames(c("gbm1"))
gbm_fit2 <- gbm.fit(simulated[, -11], simulated[, 11], n.trees = 100, distribution = "gaussian") # duplicate1 included
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 24.3737 nan 0.0010 0.0095
## 2 24.3610 nan 0.0010 0.0087
## 3 24.3512 nan 0.0010 0.0047
## 4 24.3388 nan 0.0010 0.0103
## 5 24.3264 nan 0.0010 0.0098
## 6 24.3141 nan 0.0010 0.0080
## 7 24.3019 nan 0.0010 0.0069
## 8 24.2916 nan 0.0010 0.0091
## 9 24.2796 nan 0.0010 0.0104
## 10 24.2669 nan 0.0010 0.0104
## 20 24.1347 nan 0.0010 0.0114
## 40 23.8809 nan 0.0010 0.0095
## 60 23.6437 nan 0.0010 0.0091
## 80 23.4089 nan 0.0010 0.0063
## 100 23.1668 nan 0.0010 0.0090
gbmImp2 <- varImp(gbm_fit2, numTrees = 100)
gbmImp2 <- gbmImp2 %>% setNames(c("gbm2"))
imp <- data.frame(gbmImp1, gbmImp2)
imp
## gbm1 gbm2
## V1 37303.00 26729.87
## V2 21991.62 19097.14
## V3 0.00 0.00
## V4 11202.37 13459.04
## V5 0.00 0.00
## V6 0.00 0.00
## V7 0.00 0.00
## V8 0.00 0.00
## V9 0.00 0.00
## V10 0.00 0.00
## duplicate1 NA 12834.88
We then fit gradient boosting models with and without the correlated predictor. The variable importance pattern is similar, except that V3 and V5 are no longer dominant (both receive zero importance after only 100 boosting iterations with the small default shrinkage), and the scores are on a very different scale than the random forest measures. With the duplicate added, duplicate1 absorbs part of V1's importance.
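Because the raw importance scales differ so much across models, a quick rescaling makes the patterns easier to compare; a convenience sketch (rescale01 is a helper defined here, not a package function):
rescale01 <- function(v) 100 * v / max(v, na.rm = TRUE) # scale each column to [0, 100]
imp_scaled <- data.frame(lapply(imp, rescale01), row.names = rownames(imp))
imp_scaled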
cubist_fit1 <- cubist(simulated_orig[, -11], simulated_orig[, 11])
cubist_fit2 <- cubist(simulated[, -11], simulated[, 11])
cbImp1 <- varImp(cubist_fit1) %>% add_row(Overall = NA) %>% setNames(c("Cubist1"))
cbImp2 <- varImp(cubist_fit2) %>% setNames(c("Cubist2"))
cImp <- data.frame(cbImp1, cbImp2)
cImp
## Cubist1 Cubist2
## V1 50 50
## V2 50 50
## V4 50 50
## V5 50 50
## duplicate1 0 50
## V3 0 0
## V6 0 0
## V7 0 0
## V8 0 0
## V9 0 0
## V10 NA 0
The Cubist model tells a similar story: V1, V2, V4, and V5 dominate, and V6-V10 are unused. With the correlated predictor added, duplicate1 receives the same importance as V1, so the two share credit. As with the boosted model, V3 is no longer considered a dominant predictor.
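To see which predictors actually enter the rules, the fitted Cubist model itself can be inspected; summary() on a cubist object prints the rules and their embedded linear models:
summary(cubist_fit2)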
Tree-based models suffer from selection bias: predictors with a higher number of distinct values are favored over more granular predictors. When the data contain noise variables with many distinct values, these can be favored over informative low-variance predictors, and as the noise level in the data increases, the chance of such an uninformative predictor being chosen at a split increases. We simulate a low-variance informative predictor and a high-granularity noise predictor to check for this tree bias.
x1 <- rep(1:2, each=100)
x2 <- rnorm(200, mean=0, sd=4)
y <- x1 + rnorm(200, mean= 0 , sd=1)
sim <- data.frame(y,x1,x2)
sim_tree2 <- rpart(y~., data=sim)
plot(as.party(sim_tree2))
varImp(sim_tree2)
## Overall
## x1 0.3302428
## x2 0.4565380
Variable x2 is uncorrelated with the response, while x1 is informative but has low variance (only two distinct values) and should be preferred. However, because of the noise in the response and x2's many distinct values, the tree favors x2 and assigns it higher importance.
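As a quick check of the noise argument, the same simulation with much less response noise should let the informative binary x1 dominate the splits (a sketch reusing x1 and x2 from above):
set.seed(10)
sim_low <- data.frame(y = x1 + rnorm(200, mean = 0, sd = 0.1), x1, x2)
varImp(rpart(y ~ ., data = sim_low)) # x1 should now carry most of the importance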
The learning rate is a remedy for the greediness of boosting: it constrains how much of each tree's contribution is added to the model at every step. Small learning rates generally work best; as the learning rate increases, the model becomes greedier and concentrates importance on a few predictors. Similarly, a larger bagging fraction means each tree is built on a larger share of the training data, reducing the randomness between trees, so the same few predictors are selected repeatedly. The model on the right, with the high learning rate and high bagging fraction, therefore concentrates its importance on just a few predictors.
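To make this concrete, a sketch comparing two boosted fits on the earlier simulated data (assuming simulated_orig is still available; gbm's summary() reports relative influence):
set.seed(200)
fit_small <- gbm(y ~ ., data = simulated_orig, distribution = "gaussian", n.trees = 1000, shrinkage = 0.01, bag.fraction = 0.25)
fit_large <- gbm(y ~ ., data = simulated_orig, distribution = "gaussian", n.trees = 1000, shrinkage = 0.9, bag.fraction = 0.9)
summary(fit_small, plotit = FALSE) # influence spread over more predictors
summary(fit_large, plotit = FALSE) # influence concentrated on a few predictors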
As the textbook notes, greedy models risk missing the optimal model and overfitting the training data. Lower learning rates are generally preferred because they make the model less sensitive to the idiosyncrasies of any single tree, allowing it to generalize better. The less greedy model will be more predictive of new samples, so the model on the left would be more predictive.
As the interaction depth increases, each tree splits on more predictors, so importance spreads across more of them and additional predictors come to be emphasized as dominant; the model on the right would benefit more from this spreading.
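A similar sketch for interaction depth on the same simulated data: deeper trees split on more predictors per tree, so the relative influence should spread out:
set.seed(200)
fit_d1 <- gbm(y ~ ., data = simulated_orig, distribution = "gaussian", n.trees = 1000, interaction.depth = 1)
fit_d10 <- gbm(y ~ ., data = simulated_orig, distribution = "gaussian", n.trees = 1000, interaction.depth = 10)
summary(fit_d1, plotit = FALSE)
summary(fit_d10, plotit = FALSE)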
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
processPredictors = as.matrix(ChemicalManufacturingProcess[,2:58])
yield = ChemicalManufacturingProcess[,1]
train_r <- createDataPartition(yield, p=0.75, list=F)
pp_train <- ChemicalManufacturingProcess[train_r,-1]
y_train <- ChemicalManufacturingProcess[train_r,1]
pp_test <- ChemicalManufacturingProcess[-train_r,-1]
y_test <- ChemicalManufacturingProcess[-train_r,1]
p_pro <- c("nzv", "center","scale", "medianImpute")
We will rerun the previous models, PLS and our best nonlinear model SVM, from the earlier exercises and compare them with our regression tree models.
t_ctrl <- trainControl(method = "repeatedcv", repeats = 5)
pls_fit <- train(pp_train, y_train, method = "pls", tuneLength = 10, preProcess = p_pro, trControl = t_ctrl)
pls_fit
## Partial Least Squares
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (56), scaled (56), median imputation (56),
## remove (1)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.561055 0.3774014 1.163414
## 2 1.934334 0.3446223 1.212661
## 3 1.787266 0.3816435 1.180361
## 4 2.050947 0.3508562 1.227950
## 5 2.389644 0.3250761 1.303656
## 6 2.517391 0.3155029 1.344945
## 7 2.634067 0.3178341 1.381436
## 8 2.673197 0.3228776 1.413350
## 9 2.823067 0.3215343 1.465793
## 10 2.991490 0.3087708 1.507498
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
pls_pred <- predict(pls_fit, pp_test)
pls_met <- postResample(pred = pls_pred, obs = y_test)
svm_fit <- train(pp_train, y_train, method = "svmRadial", preProcess = p_pro, tuneLength = 10, trControl = t_ctrl)
svm_fit
## Support Vector Machines with Radial Basis Function Kernel
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (56), scaled (56), median imputation (56),
## remove (1)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.420427 0.4334232 1.1464269
## 0.50 1.338306 0.4739383 1.0751440
## 1.00 1.272648 0.5133583 1.0191364
## 2.00 1.231911 0.5391300 0.9824667
## 4.00 1.206938 0.5537465 0.9581289
## 8.00 1.199927 0.5580732 0.9474439
## 16.00 1.199796 0.5578186 0.9465771
## 32.00 1.199796 0.5578186 0.9465771
## 64.00 1.199796 0.5578186 0.9465771
## 128.00 1.199796 0.5578186 0.9465771
##
## Tuning parameter 'sigma' was held constant at a value of 0.0134643
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0134643 and C = 16.
plot(svm_fit)
svm_pred <- predict(svm_fit, newdata=pp_test)
svm_met<- postResample(pred=svm_pred,y_test)
set.seed(100)
rpart_fit <- train(pp_train, y_train, method = "rpart2", tuneLength = 10, preProcess = p_pro, trControl = t_ctrl)
## note: only 9 possible values of the max tree depth from the initial fit.
## Truncating the grid to 9 .
rpart_fit
## CART
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (56), scaled (56), median imputation (56),
## remove (1)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 118, 118, 117, 119, 119, 119, ...
## Resampling results across tuning parameters:
##
## maxdepth RMSE Rsquared MAE
## 1 1.371111 0.4725263 1.108974
## 2 1.474377 0.4026948 1.193796
## 3 1.501114 0.3911224 1.215081
## 4 1.507949 0.3890736 1.214658
## 5 1.506176 0.3993266 1.201189
## 6 1.509971 0.3989018 1.200145
## 8 1.520948 0.4039172 1.199111
## 9 1.529030 0.4031171 1.207630
## 10 1.531485 0.4010683 1.214313
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 1.
plot(rpart_fit)
rpart_pred <- predict(rpart_fit,newdata=pp_test)
rpart_met <- postResample(pred=rpart_pred,y_test)
t_ctrl1 <- trainControl(method = "boot", number = 25)
cubist_fit <- train(pp_train, y_train, method = "cubist", preProcess = p_pro, tuneLength = 10, trControl = t_ctrl1)
cubist_fit
## Cubist
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (56), scaled (56), median imputation (56),
## remove (1)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 1.940278 0.2830995 1.4049493
## 1 5 1.929041 0.2893836 1.3878733
## 1 9 1.924183 0.2885351 1.3890951
## 10 0 1.362353 0.4781958 1.0397573
## 10 5 1.345388 0.4917660 1.0200376
## 10 9 1.351590 0.4856743 1.0260618
## 20 0 1.309823 0.5081134 1.0021506
## 20 5 1.288967 0.5239613 0.9770453
## 20 9 1.296986 0.5168396 0.9844058
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
plot(cubist_fit)
cubist_pred <- predict(cubist_fit,newdata=pp_test)
cub_met <- postResample(pred=cubist_pred,y_test)
t_ctrl1 <- trainControl(method = "boot", number = 25)
gbmGrid <- expand.grid(interaction.depth=seq(1,6,by=1), n.trees=c(25,50,100,200), shrinkage=c(0.01,0.05,0.1,0.2), n.minobsinnode=10)
gbm_fit <- suppressWarnings(train(pp_train, y_train, method = "gbm", metric = "Rsquared", tuneGrid = gbmGrid, trControl = t_ctrl1, verbose = FALSE))
gbm_fit
## Stochastic Gradient Boosting
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees RMSE Rsquared MAE
## 0.01 1 25 1.712756 0.4454776 1.385802
## 0.01 1 50 1.621988 0.4661492 1.312226
## 0.01 1 100 1.503242 0.4837555 1.214042
## 0.01 1 200 1.388204 0.4971314 1.115721
## 0.01 2 25 1.690658 0.4748192 1.368556
## 0.01 2 50 1.583752 0.4837508 1.280807
## 0.01 2 100 1.451896 0.4984267 1.171379
## 0.01 2 200 1.352892 0.5034784 1.079600
## 0.01 3 25 1.679457 0.4736221 1.359423
## 0.01 3 50 1.570638 0.4791374 1.272411
## 0.01 3 100 1.439364 0.4926281 1.160686
## 0.01 3 200 1.344050 0.5037625 1.069747
## 0.01 4 25 1.678033 0.4744426 1.358522
## 0.01 4 50 1.568640 0.4788039 1.271183
## 0.01 4 100 1.436651 0.4911090 1.158420
## 0.01 4 200 1.342080 0.5032271 1.067274
## 0.01 5 25 1.679078 0.4722191 1.359078
## 0.01 5 50 1.568681 0.4796975 1.269627
## 0.01 5 100 1.438563 0.4914669 1.159340
## 0.01 5 200 1.342580 0.5030176 1.068448
## 0.01 6 25 1.679085 0.4774034 1.358302
## 0.01 6 50 1.566659 0.4869812 1.267870
## 0.01 6 100 1.436209 0.4952178 1.155661
## 0.01 6 200 1.340005 0.5065398 1.064695
## 0.05 1 25 1.461375 0.4834037 1.177216
## 0.05 1 50 1.365967 0.4968327 1.092312
## 0.05 1 100 1.322783 0.5038166 1.046304
## 0.05 1 200 1.314258 0.5039139 1.031652
## 0.05 2 25 1.421236 0.4802892 1.141874
## 0.05 2 50 1.340784 0.4981844 1.068276
## 0.05 2 100 1.314378 0.5053502 1.036043
## 0.05 2 200 1.304759 0.5110213 1.021403
## 0.05 3 25 1.408264 0.4835587 1.131230
## 0.05 3 50 1.339575 0.4924542 1.062561
## 0.05 3 100 1.324908 0.4965663 1.046805
## 0.05 3 200 1.314981 0.5047032 1.035036
## 0.05 4 25 1.405206 0.4777145 1.124655
## 0.05 4 50 1.336463 0.4936479 1.055397
## 0.05 4 100 1.312522 0.5051812 1.032138
## 0.05 4 200 1.304388 0.5110657 1.021616
## 0.05 5 25 1.408188 0.4783661 1.130393
## 0.05 5 50 1.341027 0.4919164 1.064794
## 0.05 5 100 1.320026 0.4998635 1.039727
## 0.05 5 200 1.315285 0.5031942 1.034693
## 0.05 6 25 1.406991 0.4776274 1.122459
## 0.05 6 50 1.343760 0.4848734 1.055168
## 0.05 6 100 1.322768 0.4945521 1.031141
## 0.05 6 200 1.311970 0.5024329 1.022440
## 0.10 1 25 1.356622 0.4905137 1.085220
## 0.10 1 50 1.325515 0.4975304 1.050352
## 0.10 1 100 1.318844 0.4989554 1.032740
## 0.10 1 200 1.312747 0.5052850 1.027093
## 0.10 2 25 1.369932 0.4706975 1.093761
## 0.10 2 50 1.339275 0.4892267 1.057327
## 0.10 2 100 1.339444 0.4887403 1.050442
## 0.10 2 200 1.330924 0.4964887 1.044655
## 0.10 3 25 1.370322 0.4629538 1.082265
## 0.10 3 50 1.346881 0.4776285 1.058223
## 0.10 3 100 1.332679 0.4891845 1.048575
## 0.10 3 200 1.325857 0.4958345 1.041328
## 0.10 4 25 1.335380 0.4968940 1.053380
## 0.10 4 50 1.319177 0.5023743 1.038059
## 0.10 4 100 1.315065 0.5061081 1.031123
## 0.10 4 200 1.324114 0.5001117 1.036582
## 0.10 5 25 1.339207 0.4907860 1.059958
## 0.10 5 50 1.329592 0.4902802 1.043180
## 0.10 5 100 1.321380 0.4965297 1.035965
## 0.10 5 200 1.324372 0.4968625 1.041277
## 0.10 6 25 1.354030 0.4779726 1.069869
## 0.10 6 50 1.334962 0.4883882 1.052556
## 0.10 6 100 1.331603 0.4899311 1.046333
## 0.10 6 200 1.331162 0.4926485 1.044852
## 0.20 1 25 1.348991 0.4747716 1.072010
## 0.20 1 50 1.336140 0.4855201 1.051852
## 0.20 1 100 1.334149 0.4913595 1.050797
## 0.20 1 200 1.356129 0.4795985 1.067157
## 0.20 2 25 1.350177 0.4782256 1.062311
## 0.20 2 50 1.340871 0.4851169 1.050132
## 0.20 2 100 1.349305 0.4858283 1.058905
## 0.20 2 200 1.353554 0.4846227 1.063774
## 0.20 3 25 1.363556 0.4633107 1.073163
## 0.20 3 50 1.352829 0.4723090 1.070220
## 0.20 3 100 1.362558 0.4695725 1.074481
## 0.20 3 200 1.365353 0.4695418 1.075685
## 0.20 4 25 1.370253 0.4613634 1.083815
## 0.20 4 50 1.363473 0.4677953 1.071989
## 0.20 4 100 1.369527 0.4661172 1.076336
## 0.20 4 200 1.369108 0.4674363 1.073481
## 0.20 5 25 1.368572 0.4607400 1.067419
## 0.20 5 50 1.364387 0.4663933 1.062282
## 0.20 5 100 1.364317 0.4687609 1.064470
## 0.20 5 200 1.363963 0.4700279 1.064037
## 0.20 6 25 1.375013 0.4554775 1.075382
## 0.20 6 50 1.376221 0.4573589 1.078668
## 0.20 6 100 1.384439 0.4535269 1.083926
## 0.20 6 200 1.386157 0.4540562 1.084007
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200,
## interaction.depth = 4, shrinkage = 0.05 and n.minobsinnode = 10.
plot(gbm_fit)
gbm_pred <- predict(gbm_fit,newdata=pp_test)
gbm_met<- postResample(pred=gbm_pred,y_test)
comb_met <- data.frame(pls_met, svm_met, rpart_met, cub_met,gbm_met)
comb_met <- data.frame(t(comb_met))
model<- c("PLS", "SVM", "RPART", "CUBIST", "GBM")
rownames(comb_met) <- NULL
comb_met <- data.frame(cbind(model,comb_met))
comb_met
## model RMSE Rsquared MAE
## 1 PLS 1.430764 0.4467640 1.2118667
## 2 SVM 1.150287 0.6799685 0.8682791
## 3 RPART 1.578932 0.3276660 1.2274628
## 4 CUBIST 1.014967 0.7530604 0.7547706
## 5 GBM 1.023675 0.7482967 0.7649099
The Cubist model outperforms the previous PLS model, the nonlinear SVM, and the other tree-based models, with the lowest RMSE and highest R-squared values on the test set.
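A complementary check would compare the resampling distributions rather than a single test-set split; caret's resamples() supports this, but only for models trained with the same resampling scheme, so the sketch below is limited to the two bootstrap-trained models:
res <- resamples(list(CUBIST = cubist_fit, GBM = gbm_fit)) # both used 25 bootstrap resamples
summary(res)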
The variable importance from the Cubist model shows that the dominant predictors are largely aligned with the results from the previous linear and nonlinear models. ManufacturingProcess32 is at or near the top in all three models (for Cubist it ranks second, behind BiologicalMaterial03). In all three models the ManufacturingProcess variables were more dominant than the BiologicalMaterial variables, and the ratio of manufacturing to biological variables among the top 10 was roughly the same. The importance scores themselves, however, differed somewhat for the dominant predictors.
cub_dot <-dotPlot(varImp(cubist_fit), top=10)
pls_dot <-dotPlot(varImp(pls_fit), top=10)
svm_dot <- dotPlot(varImp(svm_fit), top=10)
#grid.arrange(cub_dot,pls_dot, ncol=2)
cub_dot
pls_dot
svm_dot
cImp <- varImp(cubist_fit)$importance %>% setNames("CUBIST") %>% rownames_to_column("var") %>% arrange(desc(CUBIST)) %>% top_n(10)
## Selecting by CUBIST
pImp <- varImp(pls_fit)$importance %>% setNames("PLS") %>% rownames_to_column("var") %>% arrange(desc(PLS)) %>% top_n(10)
## Selecting by PLS
sImp <- varImp(svm_fit)$importance %>% setNames("SVM") %>% rownames_to_column("var") %>% arrange(desc(SVM)) %>% top_n(10)
## Selecting by SVM
imp_list <- data.frame(cImp,pImp,sImp)
imp_list
## var CUBIST var.1 PLS
## 1 BiologicalMaterial03 100.00000 ManufacturingProcess32 100.00000
## 2 ManufacturingProcess32 79.26829 ManufacturingProcess13 86.07998
## 3 ManufacturingProcess17 56.09756 ManufacturingProcess36 84.94073
## 4 ManufacturingProcess09 48.78049 ManufacturingProcess09 80.34108
## 5 ManufacturingProcess33 47.56098 BiologicalMaterial06 75.75377
## 6 BiologicalMaterial02 36.58537 BiologicalMaterial03 75.21953
## 7 ManufacturingProcess04 35.36585 BiologicalMaterial02 75.18435
## 8 BiologicalMaterial12 34.14634 ManufacturingProcess33 68.07548
## 9 BiologicalMaterial06 32.92683 ManufacturingProcess12 67.27790
## 10 ManufacturingProcess21 29.26829 ManufacturingProcess17 67.20556
## var.2 SVM
## 1 ManufacturingProcess32 100.00000
## 2 ManufacturingProcess13 93.47736
## 3 BiologicalMaterial06 82.54199
## 4 BiologicalMaterial12 78.21556
## 5 ManufacturingProcess36 74.99325
## 6 ManufacturingProcess17 74.50045
## 7 BiologicalMaterial03 74.46390
## 8 ManufacturingProcess09 68.45708
## 9 ManufacturingProcess31 65.16971
## 10 ManufacturingProcess11 62.00754
ManufacturingProcess32 is a top predictor in agreement with the other models (for Cubist it ranks just behind BiologicalMaterial03). The biological predictors do not appear to dominate the relationship with the response yield, and these results are consistent with the previous models, providing no additional insight about the biological predictors.
rp_fit <- rpart(y_train ~ ., data = pp_train, control = rpart.control(maxdepth = 3))
plot(as.party(rp_fit), gp = gpar(fontsize = 8))
#https://www.statmethods.net/advstats/cart.html
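The split variables of the shallow tree can also be read off directly; rpart stores its own importance scores on the fitted object:
rp_fit$variable.importance # a quick check of which predictors drive the splits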