Answer:
$ /sigma /frac{number of timea a word appears in a doc}{total number of terms in document(D)} $
library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
# install.packages("randomForest")
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.83890885
## V2 6.49023056
## V3 0.67583163
## V4 7.58822553
## V5 2.27426009
## V6 0.17436781
## V7 0.15136583
## V8 -0.03078937
## V9 -0.02989832
## V10 -0.08529218
Answer: The V6-10 were not significantly used as per the above table.
# install.packages("party")
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
model3 <- cforest(y ~ ., data = simulated)
cfImp3_cond <- varimp(model3, conditional = TRUE)
cfImp3_uncond <- varimp(model3, conditional = FALSE)
cfImp3_cond
## V1 V2 V3 V4 V5
## 3.173621e+00 4.954327e+00 -2.487929e-03 6.122763e+00 1.157286e+00
## V6 V7 V8 V9 V10
## 6.534901e-05 -2.353746e-02 6.846242e-03 1.737579e-02 1.154302e-02
## duplicate1
## 9.159232e-01
cfImp3_uncond
## V1 V2 V3 V4 V5 V6
## 6.426216746 6.191817884 -0.007081095 7.543620062 1.904419019 -0.015717721
## V7 V8 V9 V10 duplicate1
## 0.010691722 -0.044368734 0.021920857 0.002841800 2.664973100
Answer: The conditional inference tree adjusts the importance of both V1 and duplicate.
# install.packages("Cubist")
library(Cubist)
model4 <- cubist(x = simulated[, names(simulated)[names(simulated) != 'y']],
y = simulated[,c('y')])
cfImp4_cond <- varImp(model4, conditional = TRUE)
cfImp4_uncond <- varImp(model4, conditional = FALSE)
cfImp4_cond
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicate1 0
cfImp4_uncond
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicate1 0
Answer: Whether conditional or unconditional is used makes no difference to the importance.
# install.packages("party")
# install.packages("rpart")
library(rpart)
library(party)
set.seed(123)
X1 <- rep(1:2,each=100)
X2 <- rnorm(100,mean=0,sd=4)
Y <- X1 + rnorm(100,mean=0,sd=10)
df1 <- data.frame(Y=Y, X1=X1, X2=X2)
mod <- rpart(Y ~ ., data = df1)
varImp(mod)
## Overall
## X1 0.04052473
## X2 1.19529577
Answer: incredible, even though the X2 is not part of the simulation, so not a driver of Y it ends up more important to the tree due to the large standard deviation we set.
Answer: Increasing a learning rate is intended to slow down the adaptation of the model to the training data, lower adaptation results in fewer predictors.
Answer: Lower learning rate and bagging fraction would have the result of performing better on unseen data. High learning rate models are overfitting and overfitting on training = poor predictive power on unseen data.
Answer: Increased interaction -> more dense trees -> more predictors will be taken into account so varimp will show allocation of importance across more variables.
Answer:
# install.packages("AppliedPredictiveModeling")
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
(cmpImpute <- preProcess(ChemicalManufacturingProcess[,-c(1)], method=c('bagImpute')))
## Created from 152 samples and 57 variables
##
## Pre-processing:
## - bagged tree imputation (57)
## - ignored (0)
cmp <- predict(cmpImpute, ChemicalManufacturingProcess[,-c(1)])
set.seed(100)
trainRow <- createDataPartition(ChemicalManufacturingProcess$Yield, p=0.8, list=FALSE)
X.train <- cmp[trainRow, ]
y.train <- ChemicalManufacturingProcess$Yield[trainRow]
X.test <- cmp[-trainRow, ]
y.test <- ChemicalManufacturingProcess$Yield[-trainRow]
rpartModel <- train(x = X.train,
y = y.train,
method = "rpart",
tuneLength = 10,
control = rpart.control(maxdepth=2))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
rpartModel
## CART
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.01204205 1.541620 0.3674644 1.200057
## 0.01847923 1.541620 0.3674644 1.200057
## 0.03255360 1.539348 0.3684727 1.197754
## 0.03490253 1.539658 0.3671155 1.196496
## 0.03555401 1.539658 0.3671155 1.196496
## 0.03773587 1.539658 0.3671155 1.196496
## 0.06206321 1.561557 0.3488273 1.224788
## 0.07034214 1.557795 0.3509089 1.221777
## 0.07735748 1.559110 0.3472341 1.222490
## 0.39423191 1.622214 0.3374732 1.281136
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.0325536.
rfModel <- train(x = X.train,
y = y.train,
method = 'rf',
tuneLength = 10)
rfModel
## Random Forest
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.348422 0.5770987 1.0689942
## 8 1.258421 0.6038632 0.9767273
## 14 1.244093 0.6052371 0.9597814
## 20 1.248788 0.5972043 0.9609205
## 26 1.251547 0.5903273 0.9632382
## 32 1.251553 0.5881619 0.9632356
## 38 1.258010 0.5808720 0.9663852
## 44 1.267526 0.5716490 0.9736540
## 50 1.269082 0.5693298 0.9745388
## 57 1.278411 0.5627945 0.9814042
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 14.
resamp <- resamples(list(SingleTree=rpartModel, RandomForest=rfModel))
summary(resamp)
##
## Call:
## summary.resamples(object = resamp)
##
## Models: SingleTree, RandomForest
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SingleTree 0.9708033 1.1457205 1.1940681 1.1977538 1.299353 1.351002 0
## RandomForest 0.7696501 0.8538683 0.9679689 0.9597814 1.028382 1.206217 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SingleTree 1.286055 1.461978 1.548867 1.539348 1.624625 1.832355 0
## RandomForest 1.005828 1.109358 1.245362 1.244093 1.333256 1.529391 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SingleTree 0.1751645 0.3181988 0.3596247 0.3684727 0.4264661 0.5315992 0
## RandomForest 0.5031329 0.5572926 0.6117129 0.6052371 0.6625399 0.7171261 0
Answer: Random’s forecast mean R^2 is higher than single tree.
varImp(rpartModel)
## rpart variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess13 100.00
## BiologicalMaterial12 76.69
## BiologicalMaterial06 64.10
## ManufacturingProcess09 59.76
## ManufacturingProcess32 56.41
## BiologicalMaterial03 36.68
## BiologicalMaterial11 35.94
## ManufacturingProcess06 32.93
## BiologicalMaterial08 30.35
## ManufacturingProcess17 29.50
## ManufacturingProcess38 0.00
## ManufacturingProcess05 0.00
## ManufacturingProcess18 0.00
## ManufacturingProcess33 0.00
## ManufacturingProcess03 0.00
## ManufacturingProcess01 0.00
## ManufacturingProcess35 0.00
## ManufacturingProcess28 0.00
## ManufacturingProcess31 0.00
## ManufacturingProcess14 0.00
varImp(rfModel)
## rf variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess13 39.110
## BiologicalMaterial12 32.813
## BiologicalMaterial03 32.523
## BiologicalMaterial06 31.197
## ManufacturingProcess17 18.196
## ManufacturingProcess09 17.165
## BiologicalMaterial02 15.453
## BiologicalMaterial04 15.419
## ManufacturingProcess06 15.158
## ManufacturingProcess36 14.205
## BiologicalMaterial11 13.399
## ManufacturingProcess33 11.714
## ManufacturingProcess31 11.667
## ManufacturingProcess28 10.504
## BiologicalMaterial09 9.857
## BiologicalMaterial05 9.044
## BiologicalMaterial08 8.951
## ManufacturingProcess11 7.383
## ManufacturingProcess18 6.970
ManuFacturing pmaterials are highest ranking but in trems of quantity in top 10 they are equal.
Answer:
plot(rpartModel$finalModel)
text(rpartModel$finalModel)