library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.1.2
## Loading required package: ggplot2
library(mlbench)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.1.2
## Loading required package: grid
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.1.2
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.1.2
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.1
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:Hmisc':
##
## combine
n <- 100
p <- 40
sigma <- 1
set.seed(1)
sim <- mlbench.friedman1(n, sd = sigma)
colnames(sim$x) <- c(paste("real", 1:5, sep = ""),
paste("bogus", 1:5, sep = ""))
bogus <- matrix(rnorm(n * p), nrow = n)
colnames(bogus) <- paste("bogus", 5+(1:ncol(bogus)), sep = "")
x <- cbind(sim$x, bogus)
y <- sim$y
head(x)
## real1 real2 real3 real4 real5 bogus1
## [1,] 0.2655087 0.6547239 0.2675082 0.67371223 0.6588776 0.5541771
## [2,] 0.3721239 0.3531973 0.2186453 0.09485786 0.1850700 0.6882752
## [3,] 0.5728534 0.2702601 0.5167968 0.49259612 0.9543781 0.6580576
## [4,] 0.9082078 0.9926841 0.2689506 0.46155184 0.8978485 0.6633427
## [5,] 0.2016819 0.6334933 0.1811683 0.37521653 0.9436971 0.4722342
## [6,] 0.8983897 0.2132081 0.5185761 0.99109922 0.7236908 0.9695282
## bogus2 bogus3 bogus4 bogus5 bogus6 bogus7
## [1,] 0.8142518 0.92974321 0.85868745 0.83189899 -0.3410670 -0.70756823
## [2,] 0.9287772 0.90093927 0.03443876 0.76684275 1.5024245 1.97157201
## [3,] 0.1474810 0.75088219 0.97099715 0.27278032 0.5283077 -0.08999868
## [4,] 0.7498217 0.67656877 0.74511014 0.18816330 0.5421914 -0.01401725
## [5,] 0.9756573 0.64801345 0.27325524 0.22576183 -0.1366734 -1.12345694
## [6,] 0.9747925 0.07324687 0.67710610 0.06197037 -1.1367339 -1.34413012
## bogus8 bogus9 bogus10 bogus11 bogus12 bogus13
## [1,] -1.08690882 -1.5414026 1.13496509 0.2418959 -1.5570357 0.3412484
## [2,] -1.82608301 0.1943211 1.11193185 -1.1327594 1.9231637 1.3161672
## [3,] 0.99528181 0.2644225 -0.87077763 1.4899074 -1.8568296 -0.9597765
## [4,] -0.01186178 -1.1187352 0.21073159 -0.2482471 -2.1061184 -1.2055752
## [5,] -0.59962839 0.6509530 0.06939565 0.1835837 0.6976485 1.5675731
## [6,] -0.17794799 -1.0329002 -1.66264885 0.4048710 0.9074444 0.2252858
## bogus14 bogus15 bogus16 bogus17 bogus18 bogus19
## [1,] 1.5468813 0.8500435 0.34419403 1.6212029 0.7140855 -0.57099429
## [2,] 0.1789210 -0.9253130 0.01271984 -0.3291028 0.5813846 0.28653902
## [3,] -0.2825466 0.8935812 -0.87345013 -2.3264095 -0.1467239 1.14761986
## [4,] -0.7672988 -0.9410097 0.34280028 2.1929980 1.5069818 0.13955870
## [5,] -0.5764042 0.5389521 -0.17738775 -1.0824800 -0.2795326 0.08892661
## [6,] -0.9148558 -0.1819744 0.92143325 -0.5063610 2.0277387 -2.63015932
## bogus20 bogus21 bogus22 bogus23 bogus24 bogus25
## [1,] -0.88614959 -1.34105095 0.9169380 -0.3116892 -0.3743289 -1.8054836
## [2,] -1.92225490 -0.04570723 0.8092731 0.2057491 0.9953538 -0.6780407
## [3,] 1.61970074 2.18799112 -0.7116223 -0.6539869 0.1021435 -0.4733581
## [4,] 0.51926990 1.42209580 -2.6895852 -1.1532577 1.4829437 1.0274171
## [5,] -0.05584993 0.18324702 -0.5670470 0.5274909 0.5600487 -0.5973876
## [6,] 0.69641761 -0.65293284 1.2991988 1.3939191 0.1424510 1.1598494
## bogus26 bogus27 bogus28 bogus29 bogus30 bogus31
## [1,] 0.94033680 -0.4053392 -2.10406017 0.78104120 0.7391149 0.31570474
## [2,] 0.78785519 1.9406715 -0.08443947 -0.04650931 0.3866087 0.16389139
## [3,] 0.08694194 0.4849653 0.75632942 0.09576593 1.2963972 0.95765836
## [4,] 0.03280097 -0.2020973 -1.58071605 -1.33525787 -0.8035584 -0.13109632
## [5,] 1.55285735 -1.1696286 0.70724595 0.54878591 -1.6026257 -0.04676214
## [6,] -2.40487804 -0.3698461 -1.04598767 -1.90524352 0.9332510 1.08256389
## bogus32 bogus33 bogus34 bogus35 bogus36 bogus37
## [1,] -0.1131544 -0.4456043 -1.04818566 0.5205997 1.7290728 0.8681650
## [2,] 0.8564798 -0.6763940 -0.42554881 0.3775619 -0.8149923 -1.4843701
## [3,] -0.1855841 0.4116056 -0.23487313 -0.6236588 -1.6902330 -0.4008993
## [4,] 1.4280518 -0.5868514 1.19028909 -0.5726105 1.4909445 -0.6393477
## [5,] 2.0817674 -1.2743676 0.54071372 0.3125012 0.7036331 0.2163910
## [6,] -0.5674901 -0.8968956 -0.08926451 -0.7074278 -0.9626936 -1.2611376
## bogus38 bogus39 bogus40 bogus41 bogus42 bogus43
## [1,] -0.7280986 0.2618973 -1.1346302 0.4083129 -1.47983426 -0.40774766
## [2,] -0.2467004 0.1076991 0.7645571 0.4260585 1.02834657 0.99756662
## [3,] -0.6136157 0.8309216 0.5707101 -1.1011658 -2.22105108 -0.96926774
## [4,] 0.1039478 0.8612745 -1.3516939 -0.3323497 -1.63855763 0.75862510
## [5,] -0.8005786 0.3303093 -2.0298855 0.2302076 0.35723943 0.08275072
## [6,] 1.3237039 1.0899624 0.5904787 -1.1711534 0.02628372 -0.96894852
## bogus44 bogus45
## [1,] -0.8396835 1.5579537
## [2,] 0.4461303 -0.7292970
## [3,] -0.3654167 -1.5039509
## [4,] 0.5391799 -0.5667870
## [5,] -0.8085769 -2.1044536
## [6,] -0.4844113 0.5307319
names(x)
## NULL
dim(x)
## [1] 100 50
class(x)
## [1] "matrix"
colnames(x)
## [1] "real1" "real2" "real3" "real4" "real5" "bogus1" "bogus2"
## [8] "bogus3" "bogus4" "bogus5" "bogus6" "bogus7" "bogus8" "bogus9"
## [15] "bogus10" "bogus11" "bogus12" "bogus13" "bogus14" "bogus15" "bogus16"
## [22] "bogus17" "bogus18" "bogus19" "bogus20" "bogus21" "bogus22" "bogus23"
## [29] "bogus24" "bogus25" "bogus26" "bogus27" "bogus28" "bogus29" "bogus30"
## [36] "bogus31" "bogus32" "bogus33" "bogus34" "bogus35" "bogus36" "bogus37"
## [43] "bogus38" "bogus39" "bogus40" "bogus41" "bogus42" "bogus43" "bogus44"
## [50] "bogus45"
#Of the 50 predictors, there are 45 pure noise variables: 5 are uniform on [0, 1] and
#40 are random univariate standard normals.
#The predictors are centered and scaled:
normalization <- preProcess(x,
#method=c("center", "scale").
#na.remove=T, k=5,
verbose=T
)
## Calculating means for centering
## Calculating standard deviations for scaling
x <- predict(normalization, x)
x <- as.data.frame(x)
subsets <- c(1:5, 10, 15, 20, 25) #The simulation will fit models with subset sizes of 25, 20, 15, 10, 5, 4, 3, 2, 1.
#to fit linear models, the lmFuncs set of functions can be used
set.seed(10)
ctrl <- rfeControl(functions = lmFuncs,
method = "repeatedcv",
repeats = 5,
verbose = FALSE)
lmProfile <- rfe(x, y,
sizes = subsets,
rfeControl = ctrl)
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
lmProfile
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared RMSESD RsquaredSD Selected
## 1 3.932 0.3812 0.7272 0.2116
## 2 3.566 0.4894 0.5791 0.2029
## 3 3.153 0.5953 0.6439 0.1905
## 4 2.904 0.6562 0.7357 0.2107 *
## 5 2.993 0.6438 0.7627 0.2066
## 10 3.221 0.5972 0.8173 0.2094
## 15 3.394 0.5612 0.8189 0.2144
## 20 3.517 0.5418 0.9081 0.2337
## 25 3.719 0.5042 0.9147 0.2365
## 50 4.088 0.4502 0.9538 0.2302
##
## The top 4 variables (out of 4):
## real4, real5, real2, real1
predictors(lmProfile) #The predictors function can be used to get a text string of variable names that were picked in the final model
## [1] "real4" "real5" "real2" "real1"
lmProfile$fit
##
## Call:
## lm(formula = y ~ ., data = tmp)
##
## Coefficients:
## (Intercept) real4 real5 real2 real1
## 14.613 2.857 1.965 1.625 1.359
head(lmProfile$resample)
## Variables RMSE Rsquared Resample
## 4 4 2.940756 0.5542831 Fold01.Rep1
## 14 4 2.704529 0.6583788 Fold02.Rep1
## 24 4 4.599872 0.2181731 Fold03.Rep1
## 34 4 2.601659 0.8478823 Fold04.Rep1
## 44 4 1.979220 0.8571701 Fold05.Rep1
## 54 4 2.978896 0.8301785 Fold06.Rep1
plot(lmProfile) #produces the performance profile across different subset sizes, as shown in the figure below.

trellis.par.set(caretTheme())
plot(lmProfile, type = c("g", "o"))

############
#To use feature elimination for an arbitrary model, a set of functions must be passed to rfe for each of the steps
# caret contains a list called rfFuncs, but this document will use a more simple version rfRFE
#rfRFE.......
#@@@@@@@
rfRFE <- list(summary = defaultSummary,
fit = function(x, y, first, last, ...){
library(randomForest)
randomForest(x, y, importance = first, ...)
},
pred = function(object, x) predict(object, x),
rank = function(object, x, y) {
vimp <- varImp(object)
vimp <- vimp[order(vimp$Overall,decreasing = TRUE),,drop = FALSE]
vimp$var <- rownames(vimp)
vimp
},
selectSize = pickSizeBest,
selectVar = pickVars)
#@@@@@@@@
rfRFE$summary #1
## function (data, lev = NULL, model = NULL)
## {
## if (is.character(data$obs))
## data$obs <- factor(data$obs, levels = lev)
## postResample(data[, "pred"], data[, "obs"])
## }
## <environment: namespace:caret>
rfRFE$fit #2
## function(x, y, first, last, ...){
## library(randomForest)
## randomForest(x, y, importance = first, ...)
## }
rfRFE$pred #3
## function(object, x) predict(object, x)
rfRFE$rank #4
## function(object, x, y) {
## vimp <- varImp(object)
## vimp <- vimp[order(vimp$Overall,decreasing = TRUE),,drop = FALSE]
## vimp$var <- rownames(vimp)
## vimp
## }
rfRFE$selectVar #5
## function (y, size)
## {
## finalImp <- ddply(y[, c("Overall", "var")], .(var), function(x) mean(x$Overall,
## na.rm = TRUE))
## names(finalImp)[2] <- "Overall"
## finalImp <- finalImp[order(finalImp$Overall, decreasing = TRUE),
## ]
## as.character(finalImp$var[1:size])
## }
## <environment: namespace:caret>
#For random forest, we fit the same series of model sizes as the linear model.
ctrl$functions <- rfRFE
ctrl$returnResamp <- "all"
set.seed(10)
rfProfile <- rfe(x, y, sizes = subsets, rfeControl = ctrl)
rfProfile
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared RMSESD RsquaredSD Selected
## 1 4.731 0.2068 0.9741 0.1817
## 2 3.924 0.3801 0.6817 0.2231
## 3 3.201 0.5981 0.6212 0.1939
## 4 2.678 0.7713 0.5115 0.1080 *
## 5 2.846 0.7585 0.5935 0.1282
## 10 3.089 0.7009 0.5807 0.1646
## 15 3.167 0.6917 0.5622 0.1661
## 20 3.306 0.6815 0.5393 0.1764
## 25 3.385 0.6668 0.5551 0.1818
## 50 3.537 0.6464 0.5635 0.1960
##
## The top 4 variables (out of 4):
## real4, real5, real2, real1
#The resampling profile can be visualized along with plots of the individual resampling results:
#1
trellis.par.set(caretTheme())
plot1 <- plot(rfProfile, type = c("g", "o"))
plot2 <- plot(rfProfile, type = c("g", "o"), metric = "Rsquared")
print(plot1, split=c(1,1,1,2), more=TRUE)
print(plot2, split=c(1,2,1,2))

#2
trellis.par.set(caretTheme())
plot1 <- plot(rfProfile, type = c("g", "o"))
plot2 <- plot(rfProfile, type = c("g", "o"), metric = "Rsquared")
print(plot1,more=TRUE)
print(plot2)

#trellis.par.set(theme1)
plot1 <- xyplot(rfProfile,
type = c("g", "p", "smooth"),
ylab = "RMSE CV Estimates")
plot2 <- densityplot(rfProfile,
subset = Variables < 5,
adjust = 1.25,
as.table = TRUE,
xlab = "RMSE CV Estimates",
pch = "|")
## Warning in `$.data.frame`(data, Variable): Name partially matched in data
## frame
## Warning in `$.data.frame`(data, Variable): Name partially matched in data
## frame
print(plot1, split=c(1,1,1,2), more=TRUE)
print(plot2, split=c(1,2,1,2))
