These are class notes and practical exercises for week 4 of Jeff Leek’s “Practical Machine Learning” course (Coursera Data Science Program): https://www.coursera.org/learn/practical-machine-learning/lecture/HYzfz/combining-predictors
Combining classifiers is a technique to enhance prediction capability, but it comes at the expense of interpretability (explainability).
Combining Classifiers:
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.2.5
# Load the Wage data and remove the logwage column so that wage is not
# predicted from logwage.
data(Wage)
# `select` must name the column bare. `Wage$logwage` would supply the column's
# values, which are then read as (truncated) negative column positions and
# drop unrelated columns while leaving logwage in place.
# NOTE: the dim() output recorded below was produced by the earlier call, so
# the column counts will differ when this is re-run.
Wage <- subset(Wage, select = -logwage)

# Hold out 30% of the rows as a validation set; the remaining 70% is the
# building set.
inBuild <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
wagevalid <- Wage[-inBuild, ]
wagebuilt <- Wage[inBuild, ]

# Split the building set 70/30 into training and testing sets.
inTrain <- createDataPartition(y = wagebuilt$wage, p = 0.7, list = FALSE)
wagetrain <- wagebuilt[inTrain, ]
wagetest <- wagebuilt[-inTrain, ]

# Report the size of each partition and of the full data set.
dim(wagevalid); dim(wagetrain); dim(wagetest); dim(Wage)
## [1] 898 9
## [1] 1474 9
## [1] 628 9
## [1] 3000 9
# Model 1: generalized linear model of wage on all other columns of the
# training set.
mod1 <- train(wage ~ ., data = wagetrain, method = "glm")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Model 2: random forest on the same predictors, resampled with 3-fold
# cross-validation. `number` is an argument of trainControl(); when given to
# train() instead, it does not set the number of folds.
mod2 <- train(
  wage ~ .,
  method = "rf",
  data = wagetrain,
  trControl = trainControl(method = "cv", number = 3)
)
Predict on the testing set
# Predictions of model 1 (glm) on the testing set.
pred1 <- predict(mod1, newdata = wagetest)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Predictions of model 2 (random forest) on the testing set.
pred2 <- predict(mod2, newdata = wagetest)

# Plot one model's predictions against the other's, coloured by the observed
# wage. ggplot() is used in place of qplot(), which ggplot2 has deprecated.
ggplot(wagetest, aes(x = pred1, y = pred2, colour = wage)) +
  geom_point()
## Building a combined prediction model
From the previous steps, a new data set is created with three fields: (a) the predictions from model 1 (glm) and model 2 (random forest), and (b) the original outcome variable (wage) from the testing set.
Using the “gam” method of caret’s train function, a combined model is fitted to predict wage from pred1 and pred2.
# Put both models' testing-set predictions next to the observed wage, then
# fit a GAM that predicts wage from the two prediction columns.
combDF <- data.frame(pred1 = pred1, pred2 = pred2, wage = wagetest$wage)
CombModFit <- train(wage ~ ., data = combDF, method = "gam")
## Loading required package: mgcv
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 3.2.5
## This is mgcv 1.8-10. For overview type 'help("mgcv-package")'.
# Combined-model predictions for the same rows the combined model was fitted on.
combPred <- predict(CombModFit, newdata = combDF)
“CombModFit” is a GAM (generalized additive model) that combines the predictions generated by both the GLM and the random forest models.
<