## [1] "random forest variable importance:"
## [1] "cubist model variable importance:"
## [1] "gradient boosted model variable importance:"
##
## Call:
## lm(formula = listd ~ ., data = tree_sim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.672 -12.162 2.632 9.224 49.376
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.4812 4.7206 -0.949 0.3449
## lista 0.6934 0.1482 4.680 9.43e-06 ***
## listb 22.6882 8.2615 2.746 0.0072 **
## listc 3.6030 2.8533 1.263 0.2097
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.64 on 96 degrees of freedom
## Multiple R-squared: 0.7125, Adjusted R-squared: 0.7035
## F-statistic: 79.29 on 3 and 96 DF, p-value: < 2.2e-16
## [1] "gradient boosted model:"
## RMSE Rsquared MAE
## 1.2814740 0.4916554 1.0386560
## [1] "random forest model:"
## RMSE Rsquared MAE
## 1.2904536 0.4993502 1.0394417
## [1] "cubist model:"
## RMSE Rsquared MAE
## 1.3735015 0.4272412 1.1034048
## [1] "bootstrap aggregated model:"
## RMSE Rsquared MAE
## 1.3086128 0.4689586 0.9986271
## [1] "conditional inference tree model model:"
## RMSE Rsquared MAE
## 1.5625866 0.2925502 1.2764040
Exercises are from: *Applied Predictive Modeling*, Max Kuhn and Kjell Johnson.
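All of the code below assumes the following packages are attached. The package sources for `knn.impute` and `bagging` are assumptions (bnstruct and ipred respectively); the remaining calls are the standard exports of the packages named.

library(mlbench)       # mlbench.friedman1
library(randomForest)  # randomForest, varImpPlot
library(caret)         # varImp, train, postResample
library(party)         # cforest, varimp, ctree
library(Cubist)        # cubist
library(gbm)           # boosting engine behind train(method = "gbm")
library(rpart)         # rpart
library(ipred)         # bagging (assumed source)
library(ggplot2)       # qplot
library(AppliedPredictiveModeling)  # ChemicalManufacturingProcess data
library(bnstruct)      # knn.impute (assumed source)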
set.seed(200)
simulated<-mlbench.friedman1(200,sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated<-as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1<- randomForest(y ~., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1<- varImp(model1, scale = FALSE)
print('random forest variable importance:')
varImpPlot(model1, scale = FALSE)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
model1<- randomForest(y ~., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1<- varImp(model1, scale = FALSE)
print('random forest variable importance with a highly correlated predictor added:')
varImpPlot(model1, scale = FALSE)
ci_random_forest<-cforest(y ~., data = simulated)
#plot(varimp(ci_random_forest), xlab = 'variable', ylab = 'importance')
#lines(varimp(ci_random_forest), type = 'h')
varimp_set <- varimp(ci_random_forest, conditional = FALSE)
plot(varimp_set, xlab = 'variable', ylab = 'importance')
lines(varimp_set, type = 'h')
x_set <- simulated[, -11]
cubist_model <- cubist(x_set, simulated[, 11])
print('cubist model variable importance:')
cubist_vars <- varImp(cubist_model)
qplot(data = cubist_vars, y = Overall, x = rownames(cubist_vars), xlab = 'variable', ylab = 'importance')
gbmGrid<- expand.grid(interaction.depth = seq (2,8, by = 2),
n.trees = seq(100,500, by = 100),
shrinkage = c(.01,.1),
n.minobsinnode = 5)
gbmTune <- train(x_set,simulated[,11],
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE)
print('gradient boosted model variable importance:')
gbmImp1 <- varImp(gbmTune)
plot(gbmImp1)
listd = runif(50)
liste = runif(50)
lista<-seq(1,99, by=2)
listb<-c(rep(0,25),rep(1,25))
listc<-c(rep(0,33),rep(1,33),rep(2,34))
listd <- listb*30 + listc*10*liste + (lista + listd)*liste
tree_sim <- as.data.frame(cbind(lista, listb, listc, listd))
tree_sim_lm<-lm(listd~.,data = tree_sim)
summary(tree_sim_lm)
tree_sim_tree_model<-rpart(listd~.,data=tree_sim)
treesimp_Imp1<- varImp(tree_sim_tree_model)
qplot(data = treesimp_Imp1, y = Overall, x = rownames(treesimp_Imp1), xlab = 'variable', ylab = 'importance')
### Question 8.7
data(ChemicalManufacturingProcess)
column_names<-names(ChemicalManufacturingProcess)
chem_set<-as.matrix(unlist(ChemicalManufacturingProcess))
chem_set<-matrix(chem_set,ncol=58)
colnames(chem_set)<-column_names
chem_set<-knn.impute(chem_set, k = 5)
chem_set<-chem_set[,-8]
set.seed(39)
test_set_indices<-sample.int(176,size=44)
test_set<-chem_set[test_set_indices,]
training_set<-chem_set[-test_set_indices,]
training_set_df<-data.frame(training_set)
test_set_df<-data.frame(test_set)
gbmGrid<- expand.grid(interaction.depth = seq (4,5, by = 1),
n.trees = seq(100,1000, by = 100),
shrinkage = c(.005,.01),
n.minobsinnode = 5)
gbmTune <- train(training_set_df[,-1],training_set_df[,1],
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE)
gbmPred<-predict(gbmTune,newdata=test_set_df)
print('gradient boosted model:')
postResample(pred = gbmPred, obs = test_set_df[, 1])
#-----------------
rf_model<-randomForest(Yield ~., data = training_set_df,
importance = TRUE,
ntree = 1000)
rfPred <- predict(rf_model, newdata = test_set_df)
print('random forest model:')
postResample(pred = rfPred, obs = test_set_df[, 1])
#-----------------
cubist_model<-cubist(training_set_df[,-1],training_set_df[,1])
cubistPred<-predict(cubist_model,newdata=test_set_df)
print('cubist model:')
postResample(pred = cubistPred, obs = test_set_df[, 1])
#-----------------
bagged_model<-bagging(Yield ~., data = training_set_df)
bagPred<-predict(bagged_model,newdata=test_set_df)
print('bootstrap aggregated model:')
postResample(pred = bagPred,obs = test_set_df[,1])
gbmImp1<- varImp(gbmTune)
plot(gbmImp1)
condTree_model<-ctree(Yield ~., data = training_set_df)
condTreePred<-predict(condTree_model,newdata=test_set_df)
print('conditional inference tree model:')
postResample(pred = condTreePred, obs = test_set_df[, 1])
plot(condTree_model)
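For reference, each RMSE/Rsquared/MAE triplet reported above comes from caret::postResample, which reports R-squared as the squared correlation between predicted and observed values. A minimal sketch of what those numbers are, using the last prediction/observation pair as an example:

pred <- condTreePred
obs <- test_set_df[, 1]
c(RMSE = sqrt(mean((pred - obs)^2)),  # root mean squared error
  Rsquared = cor(pred, obs)^2,        # squared correlation of predicted vs. observed
  MAE = mean(abs(pred - obs)))        # mean absolute error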