caret is a comprehensive toolkit for data mining and predictive modeling. Its main capabilities are:

1. Data splitting
2. Data preprocessing
3. Feature selection
4. Model training and tuning
5. Variable importance estimation

In addition, caret provides a unified interface to more than 150 machine learning models.
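To see which models the installed version registers, getModelInfo and modelLookup can be queried; a quick sketch (the exact count varies across caret versions):

library(caret)
length(getModelInfo())  # number of model definitions caret knows about
modelLookup("gbm")      # tuning parameters for a specific model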
caret's featurePlot function provides quick exploratory visualizations:
library(AppliedPredictiveModeling)
transparentTheme(trans = .4)
library(caret)
## Loading required package: ggplot2
featurePlot(x = iris[, 1:4],
            y = iris$Species,
            plot = "pairs",
            ## Add a key at the top
            auto.key = list(columns = 3))
transparentTheme(trans = .9)
featurePlot(x = iris[, 1:4],
            y = iris$Species,
            plot = "density",
            ## Pass in options to xyplot() to
            ## make it prettier
            scales = list(x = list(relation = "free"),
                          y = list(relation = "free")),
            adjust = 1.5,
            pch = "|",
            layout = c(4, 1),
            auto.key = list(columns = 3))
featurePlot(x = iris[, 1:4],
            y = iris$Species,
            plot = "box",
            ## Pass in options to bwplot()
            scales = list(y = list(relation = "free"),
                          x = list(rot = 90)),
            layout = c(4, 1),
            auto.key = list(columns = 2))
Data preprocessing covers:

1. Creating dummy variables
2. Removing correlated variables
3. Detecting linear dependencies
4. Centering and scaling
5. Distance analysis for factor (class) variables
In the etitanic data (from the earth package), pclass and sex are factor variables. The base R function model.matrix can generate dummy variables for them:
library(earth)
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
data(etitanic)
head(etitanic)
## pclass survived sex age sibsp parch
## 1 1st 1 female 29.0000 0 0
## 2 1st 1 male 0.9167 1 2
## 3 1st 0 female 2.0000 1 2
## 4 1st 0 male 30.0000 1 2
## 5 1st 0 female 25.0000 1 2
## 6 1st 1 male 48.0000 0 0
head(model.matrix(survived ~ ., data = etitanic))
## (Intercept) pclass2nd pclass3rd sexmale age sibsp parch
## 1 1 0 0 0 29.0000 0 0
## 2 1 0 0 1 0.9167 1 2
## 3 1 0 0 0 2.0000 1 2
## 4 1 0 0 1 30.0000 1 2
## 5 1 0 0 0 25.0000 1 2
## 6 1 0 0 1 48.0000 0 0
The dummyVars function in caret makes this even more convenient:
dummies <- dummyVars(survived ~ ., data = etitanic)
head(predict(dummies, newdata = etitanic))
## pclass.1st pclass.2nd pclass.3rd sex.female sex.male age sibsp parch
## 1 1 0 0 1 0 29.0000 0 0
## 2 1 0 0 0 1 0.9167 1 2
## 3 1 0 0 1 0 2.0000 1 2
## 4 1 0 0 0 1 30.0000 1 2
## 5 1 0 0 1 0 25.0000 1 2
## 6 1 0 0 0 1 48.0000 0 0
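Note that dummyVars encodes every factor level and produces no intercept, unlike model.matrix. For models that require a full-rank parameterization, the fullRank argument drops one level per factor; a brief sketch (output not shown):

dummiesFR <- dummyVars(survived ~ ., data = etitanic, fullRank = TRUE)
head(predict(dummiesFR, newdata = etitanic))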
Correlation between predictors indicates redundancy and can make a model unstable. findCorrelation flags columns whose pairwise correlations exceed a cutoff:
descrCor <- cor(iris[,-5])
highlyCorDescr <- findCorrelation(descrCor, cutoff = .75)
highlyCorDescr
## [1] 3 4
findCorrelation returns the column indices it recommends removing: here columns 3 and 4 (Petal.Length and Petal.Width), which are highly correlated with the other predictors.
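The indices refer to columns of the correlation matrix, so the flagged predictors can be dropped directly; a short follow-up sketch (not part of the original session):

filteredDescr <- iris[, 1:4][, -highlyCorDescr]
names(filteredDescr)  # Sepal.Length and Sepal.Width remain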
The findLinearCombos function uses the QR decomposition of a matrix to enumerate sets of linear combinations, if any exist:
ltfrDesign <- matrix(0, nrow=6, ncol=6)
ltfrDesign[,1] <- c(1, 1, 1, 1, 1, 1)
ltfrDesign[,2] <- c(1, 1, 1, 0, 0, 0)
ltfrDesign[,3] <- c(0, 0, 0, 1, 1, 1)
ltfrDesign[,4] <- c(1, 0, 0, 1, 0, 0)
ltfrDesign[,5] <- c(0, 1, 0, 0, 1, 0)
ltfrDesign[,6] <- c(0, 0, 1, 0, 0, 1)
comboInfo <- findLinearCombos(ltfrDesign)
comboInfo
## $linearCombos
## $linearCombos[[1]]
## [1] 3 1 2
##
## $linearCombos[[2]]
## [1] 6 1 4 5
##
##
## $remove
## [1] 3 6
ltfrDesign[, -comboInfo$remove]
## [,1] [,2] [,3] [,4]
## [1,] 1 1 1 0
## [2,] 1 1 0 1
## [3,] 1 1 0 0
## [4,] 1 0 1 0
## [5,] 1 0 0 1
## [6,] 1 0 0 0
Centering and scaling are handled by preProcess. The transformation is estimated on the training half of the mdrr data and then applied to both halves:
set.seed(96)
data(mdrr)
inTrain <- sample(seq(along = mdrrClass), length(mdrrClass)/2)
training <- mdrrDescr[inTrain,]
test <- mdrrDescr[-inTrain,]
trainMDRR <- mdrrClass[inTrain]
testMDRR <- mdrrClass[-inTrain]
preProcValues <- preProcess(training, method = c("center", "scale"))
## Warning in preProcess.default(training, method = c("center", "scale")):
## These variables have zero variances: nI, nR08, D.Dr08, T.N..I., T.O..I.,
## T.Cl..Br., T.I..I., G.N..I., G.O..I., G.Cl..Br., G.I..I.
trainTransformed <- predict(preProcValues, training)
testTransformed <- predict(preProcValues, test)
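The warning lists zero-variance descriptors, which carry no information and cannot be scaled meaningfully. One option, sketched below rather than taken from the original session, is to filter them out first with nearZeroVar:

## Identify (near-)zero-variance columns and drop them before preprocessing
nzvCols <- nearZeroVar(training)
preProcValues <- preProcess(training[, -nzvCols], method = c("center", "scale"))
trainTransformed <- predict(preProcValues, training[, -nzvCols])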
The spatial sign transformation projects the predictors onto a unit sphere, which can reduce the influence of outliers. Plotting two mdrr descriptors before and after the transformation:
library(AppliedPredictiveModeling)
transparentTheme(trans = .4)
plotSubset <- data.frame(scale(mdrrDescr[, c("nC", "X4v")]))
xyplot(nC ~ X4v,
       data = plotSubset,
       groups = mdrrClass,
       auto.key = list(columns = 2))
transformed <- spatialSign(plotSubset)
transformed <- as.data.frame(transformed)
xyplot(nC ~ X4v,
       data = transformed,
       groups = mdrrClass,
       auto.key = list(columns = 2))
classDist computes class centroids and covariance matrices from a training set; predict then returns the Mahalanobis distance of new samples to each class centroid:
trainSet <- sample(1:150, 100)
distData <- classDist(iris[trainSet, 1:4],
                      iris$Species[trainSet])
newDist <- predict(distData,
                   iris[-trainSet, 1:4])
splom(newDist, groups = iris$Species[-trainSet])
## 4. Data Splitting
A simple proportional split, stratified by the outcome:
library(caret)
set.seed(3456)
trainIndex <- createDataPartition(iris$Species, p = .8,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)
## Resample1
## [1,] 1
## [2,] 2
## [3,] 4
## [4,] 5
## [5,] 6
## [6,] 8
irisTrain <- iris[ trainIndex,]
irisTest <- iris[-trainIndex,]
head(irisTrain)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 8 5.0 3.4 1.5 0.2 setosa
head(irisTest)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 3 4.7 3.2 1.3 0.2 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 23 4.6 3.6 1.0 0.2 setosa
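Related helpers include createFolds (k-fold indices), createResample (bootstrap samples), and createTimeSlices (rolling windows for time series); a minimal sketch of the first two:

set.seed(3456)
cvFolds <- createFolds(iris$Species, k = 10)            # list of 10 held-out index sets
bootSamples <- createResample(iris$Species, times = 3)  # 3 bootstrap index sets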
The train function in caret provides a uniform interface for fitting and tuning almost any statistical or machine learning model:
library(mlbench)
data(Sonar)
str(Sonar[, 1:10])
## 'data.frame': 208 obs. of 10 variables:
## $ V1 : num 0.02 0.0453 0.0262 0.01 0.0762 0.0286 0.0317 0.0519 0.0223 0.0164 ...
## $ V2 : num 0.0371 0.0523 0.0582 0.0171 0.0666 0.0453 0.0956 0.0548 0.0375 0.0173 ...
## $ V3 : num 0.0428 0.0843 0.1099 0.0623 0.0481 ...
## $ V4 : num 0.0207 0.0689 0.1083 0.0205 0.0394 ...
## $ V5 : num 0.0954 0.1183 0.0974 0.0205 0.059 ...
## $ V6 : num 0.0986 0.2583 0.228 0.0368 0.0649 ...
## $ V7 : num 0.154 0.216 0.243 0.11 0.121 ...
## $ V8 : num 0.16 0.348 0.377 0.128 0.247 ...
## $ V9 : num 0.3109 0.3337 0.5598 0.0598 0.3564 ...
## $ V10: num 0.211 0.287 0.619 0.126 0.446 ...
library(caret)
set.seed(998)
inTraining <- createDataPartition(Sonar$Class, p = .75, list = FALSE)
training <- Sonar[ inTraining,]
testing <- Sonar[-inTraining,]
fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10)
set.seed(825)
gbmFit1 <- train(Class ~ ., data = training,
                 method = "gbm",
                 trControl = fitControl,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)
## Loading required package: gbm
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
## Loading required package: plyr
gbmFit1
## Stochastic Gradient Boosting
##
## 157 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 142, 142, 140, 142, 142, 141, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.7609191 0.5163703
## 1 100 0.7934216 0.5817734
## 1 150 0.7977230 0.5897796
## 2 50 0.7858235 0.5667749
## 2 100 0.8188897 0.6316548
## 2 150 0.8194363 0.6329037
## 3 50 0.7895686 0.5726290
## 3 100 0.8130564 0.6195719
## 3 150 0.8221348 0.6383441
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
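The grid searched above is caret's default for gbm. A custom grid can be supplied via the tuneGrid argument; a sketch reusing the parameter names from the output above (gbmFit2 is hypothetical, not fit in the original session):

gbmGrid <- expand.grid(interaction.depth = c(1, 2, 3),
                       n.trees = c(50, 100, 150),
                       shrinkage = 0.1,
                       n.minobsinnode = 10)
gbmFit2 <- train(Class ~ ., data = training,
                 method = "gbm",
                 trControl = fitControl,
                 tuneGrid = gbmGrid,
                 verbose = FALSE)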
trellis.par.set(caretTheme())
plot(gbmFit1)
plot(gbmFit1, metric = "Kappa", plotType = "level",
     scales = list(x = list(rot = 90)))
ggplot(gbmFit1)
## Warning: Ignoring unknown aesthetics: shape
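Finally, the tuned model can be applied to the held-out test set; a minimal sketch (results depend on the seed):

## Predict classes for the test set and summarize performance
gbmPred <- predict(gbmFit1, newdata = testing)
confusionMatrix(gbmPred, testing$Class)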