RFE Illustration

library(caret)

## Warning: package 'caret' was built under R version 3.1.3

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 3.1.2

## Loading required package: ggplot2

library(mlbench)
library(Hmisc)

## Warning: package 'Hmisc' was built under R version 3.1.2

## Loading required package: grid
## Loading required package: survival

## Warning: package 'survival' was built under R version 3.1.2

## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: Formula

## Warning: package 'Formula' was built under R version 3.1.2

## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

library(randomForest)

## Warning: package 'randomForest' was built under R version 3.1.1

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:Hmisc':
## 
##     combine

n <- 100
p <- 40
sigma <- 1
set.seed(1)
sim <- mlbench.friedman1(n, sd = sigma)
colnames(sim$x) <- c(paste("real", 1:5, sep = ""),
                     paste("bogus", 1:5, sep = ""))
bogus <- matrix(rnorm(n * p), nrow = n)
colnames(bogus) <- paste("bogus", 5+(1:ncol(bogus)), sep = "")
x <- cbind(sim$x, bogus)
y <- sim$y

head(x)

##          real1     real2     real3      real4     real5    bogus1
## [1,] 0.2655087 0.6547239 0.2675082 0.67371223 0.6588776 0.5541771
## [2,] 0.3721239 0.3531973 0.2186453 0.09485786 0.1850700 0.6882752
## [3,] 0.5728534 0.2702601 0.5167968 0.49259612 0.9543781 0.6580576
## [4,] 0.9082078 0.9926841 0.2689506 0.46155184 0.8978485 0.6633427
## [5,] 0.2016819 0.6334933 0.1811683 0.37521653 0.9436971 0.4722342
## [6,] 0.8983897 0.2132081 0.5185761 0.99109922 0.7236908 0.9695282
##         bogus2     bogus3     bogus4     bogus5     bogus6      bogus7
## [1,] 0.8142518 0.92974321 0.85868745 0.83189899 -0.3410670 -0.70756823
## [2,] 0.9287772 0.90093927 0.03443876 0.76684275  1.5024245  1.97157201
## [3,] 0.1474810 0.75088219 0.97099715 0.27278032  0.5283077 -0.08999868
## [4,] 0.7498217 0.67656877 0.74511014 0.18816330  0.5421914 -0.01401725
## [5,] 0.9756573 0.64801345 0.27325524 0.22576183 -0.1366734 -1.12345694
## [6,] 0.9747925 0.07324687 0.67710610 0.06197037 -1.1367339 -1.34413012
##           bogus8     bogus9     bogus10    bogus11    bogus12    bogus13
## [1,] -1.08690882 -1.5414026  1.13496509  0.2418959 -1.5570357  0.3412484
## [2,] -1.82608301  0.1943211  1.11193185 -1.1327594  1.9231637  1.3161672
## [3,]  0.99528181  0.2644225 -0.87077763  1.4899074 -1.8568296 -0.9597765
## [4,] -0.01186178 -1.1187352  0.21073159 -0.2482471 -2.1061184 -1.2055752
## [5,] -0.59962839  0.6509530  0.06939565  0.1835837  0.6976485  1.5675731
## [6,] -0.17794799 -1.0329002 -1.66264885  0.4048710  0.9074444  0.2252858
##         bogus14    bogus15     bogus16    bogus17    bogus18     bogus19
## [1,]  1.5468813  0.8500435  0.34419403  1.6212029  0.7140855 -0.57099429
## [2,]  0.1789210 -0.9253130  0.01271984 -0.3291028  0.5813846  0.28653902
## [3,] -0.2825466  0.8935812 -0.87345013 -2.3264095 -0.1467239  1.14761986
## [4,] -0.7672988 -0.9410097  0.34280028  2.1929980  1.5069818  0.13955870
## [5,] -0.5764042  0.5389521 -0.17738775 -1.0824800 -0.2795326  0.08892661
## [6,] -0.9148558 -0.1819744  0.92143325 -0.5063610  2.0277387 -2.63015932
##          bogus20     bogus21    bogus22    bogus23    bogus24    bogus25
## [1,] -0.88614959 -1.34105095  0.9169380 -0.3116892 -0.3743289 -1.8054836
## [2,] -1.92225490 -0.04570723  0.8092731  0.2057491  0.9953538 -0.6780407
## [3,]  1.61970074  2.18799112 -0.7116223 -0.6539869  0.1021435 -0.4733581
## [4,]  0.51926990  1.42209580 -2.6895852 -1.1532577  1.4829437  1.0274171
## [5,] -0.05584993  0.18324702 -0.5670470  0.5274909  0.5600487 -0.5973876
## [6,]  0.69641761 -0.65293284  1.2991988  1.3939191  0.1424510  1.1598494
##          bogus26    bogus27     bogus28     bogus29    bogus30     bogus31
## [1,]  0.94033680 -0.4053392 -2.10406017  0.78104120  0.7391149  0.31570474
## [2,]  0.78785519  1.9406715 -0.08443947 -0.04650931  0.3866087  0.16389139
## [3,]  0.08694194  0.4849653  0.75632942  0.09576593  1.2963972  0.95765836
## [4,]  0.03280097 -0.2020973 -1.58071605 -1.33525787 -0.8035584 -0.13109632
## [5,]  1.55285735 -1.1696286  0.70724595  0.54878591 -1.6026257 -0.04676214
## [6,] -2.40487804 -0.3698461 -1.04598767 -1.90524352  0.9332510  1.08256389
##         bogus32    bogus33     bogus34    bogus35    bogus36    bogus37
## [1,] -0.1131544 -0.4456043 -1.04818566  0.5205997  1.7290728  0.8681650
## [2,]  0.8564798 -0.6763940 -0.42554881  0.3775619 -0.8149923 -1.4843701
## [3,] -0.1855841  0.4116056 -0.23487313 -0.6236588 -1.6902330 -0.4008993
## [4,]  1.4280518 -0.5868514  1.19028909 -0.5726105  1.4909445 -0.6393477
## [5,]  2.0817674 -1.2743676  0.54071372  0.3125012  0.7036331  0.2163910
## [6,] -0.5674901 -0.8968956 -0.08926451 -0.7074278 -0.9626936 -1.2611376
##         bogus38   bogus39    bogus40    bogus41     bogus42     bogus43
## [1,] -0.7280986 0.2618973 -1.1346302  0.4083129 -1.47983426 -0.40774766
## [2,] -0.2467004 0.1076991  0.7645571  0.4260585  1.02834657  0.99756662
## [3,] -0.6136157 0.8309216  0.5707101 -1.1011658 -2.22105108 -0.96926774
## [4,]  0.1039478 0.8612745 -1.3516939 -0.3323497 -1.63855763  0.75862510
## [5,] -0.8005786 0.3303093 -2.0298855  0.2302076  0.35723943  0.08275072
## [6,]  1.3237039 1.0899624  0.5904787 -1.1711534  0.02628372 -0.96894852
##         bogus44    bogus45
## [1,] -0.8396835  1.5579537
## [2,]  0.4461303 -0.7292970
## [3,] -0.3654167 -1.5039509
## [4,]  0.5391799 -0.5667870
## [5,] -0.8085769 -2.1044536
## [6,] -0.4844113  0.5307319

names(x)

## NULL

dim(x)

## [1] 100  50

class(x)

## [1] "matrix"

colnames(x)

##  [1] "real1"   "real2"   "real3"   "real4"   "real5"   "bogus1"  "bogus2" 
##  [8] "bogus3"  "bogus4"  "bogus5"  "bogus6"  "bogus7"  "bogus8"  "bogus9" 
## [15] "bogus10" "bogus11" "bogus12" "bogus13" "bogus14" "bogus15" "bogus16"
## [22] "bogus17" "bogus18" "bogus19" "bogus20" "bogus21" "bogus22" "bogus23"
## [29] "bogus24" "bogus25" "bogus26" "bogus27" "bogus28" "bogus29" "bogus30"
## [36] "bogus31" "bogus32" "bogus33" "bogus34" "bogus35" "bogus36" "bogus37"
## [43] "bogus38" "bogus39" "bogus40" "bogus41" "bogus42" "bogus43" "bogus44"
## [50] "bogus45"

#Of the 50 predictors, there are 45 pure noise variables: 5 are uniform on [0, 1] and 
#40 are random univariate standard normals. 

#The predictors are centered and scaled:
normalization <- preProcess(x,
                            #method=c("center", "scale").
                            #na.remove=T, k=5,
                            verbose=T
                            )

## Calculating means for centering
## Calculating standard deviations for scaling

x <- predict(normalization, x)
x <- as.data.frame(x)
subsets <- c(1:5, 10, 15, 20, 25) #The simulation will fit models with subset sizes of 25, 20, 15, 10, 5, 4, 3, 2, 1.

#to fit linear models, the lmFuncs set of functions can be used
set.seed(10)

ctrl <- rfeControl(functions = lmFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)

lmProfile <- rfe(x, y,
                 sizes = subsets,
                 rfeControl = ctrl)

## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize

lmProfile

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared RMSESD RsquaredSD Selected
##          1 3.932   0.3812 0.7272     0.2116         
##          2 3.566   0.4894 0.5791     0.2029         
##          3 3.153   0.5953 0.6439     0.1905         
##          4 2.904   0.6562 0.7357     0.2107        *
##          5 2.993   0.6438 0.7627     0.2066         
##         10 3.221   0.5972 0.8173     0.2094         
##         15 3.394   0.5612 0.8189     0.2144         
##         20 3.517   0.5418 0.9081     0.2337         
##         25 3.719   0.5042 0.9147     0.2365         
##         50 4.088   0.4502 0.9538     0.2302         
## 
## The top 4 variables (out of 4):
##    real4, real5, real2, real1

predictors(lmProfile) #The predictors function can be used to get a text string of variable names that were picked in the final model

## [1] "real4" "real5" "real2" "real1"

lmProfile$fit

## 
## Call:
## lm(formula = y ~ ., data = tmp)
## 
## Coefficients:
## (Intercept)        real4        real5        real2        real1  
##      14.613        2.857        1.965        1.625        1.359

head(lmProfile$resample)

##    Variables     RMSE  Rsquared    Resample
## 4          4 2.940756 0.5542831 Fold01.Rep1
## 14         4 2.704529 0.6583788 Fold02.Rep1
## 24         4 4.599872 0.2181731 Fold03.Rep1
## 34         4 2.601659 0.8478823 Fold04.Rep1
## 44         4 1.979220 0.8571701 Fold05.Rep1
## 54         4 2.978896 0.8301785 Fold06.Rep1

plot(lmProfile) #produces the performance profile across different subset sizes, as shown in the figure below.

trellis.par.set(caretTheme())
plot(lmProfile, type = c("g", "o"))

############
#To use feature elimination for an arbitrary model, a set of functions must be passed to rfe for each of the steps
# caret contains a list called rfFuncs, but this document will use a more simple version rfRFE
#rfRFE.......

#@@@@@@@
    rfRFE <-  list(summary = defaultSummary,
                   fit = function(x, y, first, last, ...){
                       library(randomForest)
                       randomForest(x, y, importance = first, ...)
                   },
                   pred = function(object, x)  predict(object, x),
                   rank = function(object, x, y) {
                       vimp <- varImp(object)
                       vimp <- vimp[order(vimp$Overall,decreasing = TRUE),,drop = FALSE]
                       vimp$var <- rownames(vimp)
                       vimp
                   },
                   selectSize = pickSizeBest,
                   selectVar = pickVars)
#@@@@@@@@

rfRFE$summary #1

## function (data, lev = NULL, model = NULL) 
## {
##     if (is.character(data$obs)) 
##         data$obs <- factor(data$obs, levels = lev)
##     postResample(data[, "pred"], data[, "obs"])
## }
## <environment: namespace:caret>

rfRFE$fit #2

## function(x, y, first, last, ...){
##                        library(randomForest)
##                        randomForest(x, y, importance = first, ...)
##                    }

rfRFE$pred #3

## function(object, x)  predict(object, x)

rfRFE$rank #4

## function(object, x, y) {
##                        vimp <- varImp(object)
##                        vimp <- vimp[order(vimp$Overall,decreasing = TRUE),,drop = FALSE]
##                        vimp$var <- rownames(vimp)
##                        vimp
##                    }

rfRFE$selectVar #5

## function (y, size) 
## {
##     finalImp <- ddply(y[, c("Overall", "var")], .(var), function(x) mean(x$Overall, 
##         na.rm = TRUE))
##     names(finalImp)[2] <- "Overall"
##     finalImp <- finalImp[order(finalImp$Overall, decreasing = TRUE), 
##         ]
##     as.character(finalImp$var[1:size])
## }
## <environment: namespace:caret>

#For random forest, we fit the same series of model sizes as the linear model. 
ctrl$functions <- rfRFE
ctrl$returnResamp <- "all"
set.seed(10)
rfProfile <- rfe(x, y, sizes = subsets, rfeControl = ctrl)
rfProfile

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared RMSESD RsquaredSD Selected
##          1 4.731   0.2068 0.9741     0.1817         
##          2 3.924   0.3801 0.6817     0.2231         
##          3 3.201   0.5981 0.6212     0.1939         
##          4 2.678   0.7713 0.5115     0.1080        *
##          5 2.846   0.7585 0.5935     0.1282         
##         10 3.089   0.7009 0.5807     0.1646         
##         15 3.167   0.6917 0.5622     0.1661         
##         20 3.306   0.6815 0.5393     0.1764         
##         25 3.385   0.6668 0.5551     0.1818         
##         50 3.537   0.6464 0.5635     0.1960         
## 
## The top 4 variables (out of 4):
##    real4, real5, real2, real1

#The resampling profile can be visualized along with plots of the individual resampling results:
#1
trellis.par.set(caretTheme())
plot1 <- plot(rfProfile, type = c("g", "o"))
plot2 <- plot(rfProfile, type = c("g", "o"), metric = "Rsquared")
print(plot1, split=c(1,1,1,2), more=TRUE)
print(plot2, split=c(1,2,1,2))

#2
trellis.par.set(caretTheme())
plot1 <- plot(rfProfile, type = c("g", "o"))
plot2 <- plot(rfProfile, type = c("g", "o"), metric = "Rsquared")
print(plot1,more=TRUE)
print(plot2)

#trellis.par.set(theme1)
plot1 <- xyplot(rfProfile,
                type = c("g", "p", "smooth"),
                ylab = "RMSE CV Estimates")
plot2 <- densityplot(rfProfile,
                     subset = Variables < 5,
                     adjust = 1.25,
                     as.table = TRUE,
                     xlab = "RMSE CV Estimates",
                     pch = "|")

## Warning in `$.data.frame`(data, Variable): Name partially matched in data
## frame

## Warning in `$.data.frame`(data, Variable): Name partially matched in data
## frame

print(plot1, split=c(1,1,1,2), more=TRUE)
print(plot2, split=c(1,2,1,2))

RFE Illustration

SA