1.介绍

caret 是一个完备的数据挖掘工具包,功能如下: 1. 数据分割 2. 数据预处理 3. 特征选择 4. 模型训练与调优 5. 变量重要性评估

另外,caret 包含了超过150个机器学习模型。

2.可视化

用caret中的featurePlot函数进行可视化分析:

散点图

# Scatter-plot matrix of the four numeric iris columns, grouped by Species.
library(AppliedPredictiveModeling)
transparentTheme(trans = 0.4)
library(caret)
## Loading required package: ggplot2
featurePlot(
  x = iris[, -5],
  y = iris[["Species"]],
  plot = "pairs",
  # Add a key at the top, laid out in three columns
  auto.key = list(columns = 3)
)

密度图

# Density curves for each of the four numeric iris columns, grouped by Species.
transparentTheme(trans = 0.9)
featurePlot(
  x = iris[, -5],
  y = iris[["Species"]],
  plot = "density",
  # The remaining arguments are passed through to the underlying lattice
  # call to make the plot prettier: both axes get their own scale per panel.
  scales = list(
    x = list(relation = "free"),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  # Four panels side by side in a single row
  layout = c(4, 1),
  auto.key = list(columns = 3)
)

箱线图

# Box plots of each numeric iris column, split by Species.
featurePlot(
  x = iris[, -5],
  y = iris[["Species"]],
  plot = "box",
  # Options passed through to bwplot(): free y scale per panel and
  # x-axis labels rotated by 90 degrees.
  scales = list(
    y = list(relation = "free"),
    x = list(rot = 90)
  ),
  layout = c(4, 1),
  auto.key = list(columns = 2)
)

3.数据预处理

1.生成哑变量 2.去除相关性变量 3.线性依赖关系 4.中心化标准化 5.因子变量距离分析

生成哑变量

数据中,pclass和sex是因子变量,R中的基础函数model.matrix可以生成哑变量。

# Load the etitanic data shipped with the earth package and show how base R's
# model.matrix() expands the factor columns into indicator columns.
library(earth)
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
data("etitanic")
head(etitanic, n = 6)
##   pclass survived    sex     age sibsp parch
## 1    1st        1 female 29.0000     0     0
## 2    1st        1   male  0.9167     1     2
## 3    1st        0 female  2.0000     1     2
## 4    1st        0   male 30.0000     1     2
## 5    1st        0 female 25.0000     1     2
## 6    1st        1   male 48.0000     0     0
# First rows of the design matrix: an intercept column is added and one level
# of each factor is absent from the columns (see the output below).
head(model.matrix(survived ~ ., data = etitanic), n = 6)
##   (Intercept) pclass2nd pclass3rd sexmale     age sibsp parch
## 1           1         0         0       0 29.0000     0     0
## 2           1         0         0       1  0.9167     1     2
## 3           1         0         0       0  2.0000     1     2
## 4           1         0         0       1 30.0000     1     2
## 5           1         0         0       0 25.0000     1     2
## 6           1         0         0       1 48.0000     0     0

利用caret中的dummyVars函数,可以更方便地完成此项工作:

# Fit the dummy-variable encoder on etitanic, then apply it to the same data.
# In the output every factor level has its own indicator column and there is
# no intercept column.
dummies <- dummyVars(survived ~ ., data = etitanic)
head(predict(dummies, newdata = etitanic), n = 6)
##   pclass.1st pclass.2nd pclass.3rd sex.female sex.male     age sibsp parch
## 1          1          0          0          1        0 29.0000     0     0
## 2          1          0          0          0        1  0.9167     1     2
## 3          1          0          0          1        0  2.0000     1     2
## 4          1          0          0          0        1 30.0000     1     2
## 5          1          0          0          1        0 25.0000     1     2
## 6          1          0          0          0        1 48.0000     0     0

去除相关性变量

变量之间存在高度相关性,说明变量存在冗余,并且会使模型不稳定。

# Pairwise correlations of the four numeric iris columns (Species excluded).
descrCor <- cor(iris[, 1:4])
# Column indices that findCorrelation() flags at an absolute-correlation
# cutoff of 0.75.
highlyCorDescr <- findCorrelation(x = descrCor, cutoff = 0.75)
print(highlyCorDescr)
## [1] 3 4

结果说明第3个和第4个变量(Petal.Length 和 Petal.Width)与其他变量高度相关(相关系数绝对值超过0.75),findCorrelation建议将它们剔除。

线性依赖关系

函数findLinearCombos使用矩阵的QR分解来枚举一组线性组合(如果存在):

# Build a 6 x 6 design matrix with deliberately redundant columns:
# column 1 is all ones, columns 2-3 add up to column 1, and so do columns 4-6.
# matrix() fills column by column, so each row of numbers below is one column.
ltfrDesign <- matrix(
  c(
    1, 1, 1, 1, 1, 1, # column 1
    1, 1, 1, 0, 0, 0, # column 2
    0, 0, 0, 1, 1, 1, # column 3
    1, 0, 0, 1, 0, 0, # column 4
    0, 1, 0, 0, 1, 0, # column 5
    0, 0, 1, 0, 0, 1  # column 6
  ),
  nrow = 6,
  ncol = 6
)
# Detect linear dependencies among the columns of ltfrDesign.
comboInfo <- findLinearCombos(ltfrDesign)
print(comboInfo)
## $linearCombos
## $linearCombos[[1]]
## [1] 3 1 2
## 
## $linearCombos[[2]]
## [1] 6 1 4 5
## 
## 
## $remove
## [1] 3 6
# Drop the columns listed in `remove`; four columns remain.
ltfrDesign[, -comboInfo[["remove"]]]
##      [,1] [,2] [,3] [,4]
## [1,]    1    1    1    0
## [2,]    1    1    0    1
## [3,]    1    1    0    0
## [4,]    1    0    1    0
## [5,]    1    0    0    1
## [6,]    1    0    0    0

中心化标准化

# Split the mdrr data in half at random, then centre and scale the
# descriptors using statistics estimated on the training half only.
set.seed(96)
data(mdrr)
# seq_along() replaces seq(along = ...), which relied on partial matching of
# the `along.with` argument; %/% makes the sample size an explicit whole number.
inTrain <- sample(seq_along(mdrrClass), length(mdrrClass) %/% 2)

training <- mdrrDescr[inTrain, ]
test <- mdrrDescr[-inTrain, ]
trainMDRR <- mdrrClass[inTrain]
testMDRR <- mdrrClass[-inTrain]

# The centring/scaling parameters come from `training` alone ...
preProcValues <- preProcess(training, method = c("center", "scale"))
## Warning in preProcess.default(training, method = c("center", "scale")):
## These variables have zero variances: nI, nR08, D.Dr08, T.N..I., T.O..I.,
## T.Cl..Br., T.I..I., G.N..I., G.O..I., G.Cl..Br., G.I..I.
# ... and the same fitted transformation is applied to both halves.
trainTransformed <- predict(preProcValues, training)
testTransformed <- predict(preProcValues, test)

PCA、ICA:主成分分析和独立成分分析(下面的示例演示的是空间符号变换 spatialSign)

# Compare two mdrr descriptors before and after the spatial sign transform.
library(AppliedPredictiveModeling)
transparentTheme(trans = 0.4)

# Centre and scale the two descriptors, then plot them grouped by class.
plotSubset <- data.frame(scale(mdrrDescr[, c("nC", "X4v")]))
xyplot(
  nC ~ X4v,
  data = plotSubset,
  groups = mdrrClass,
  auto.key = list(columns = 2)
)

# Apply spatialSign() to the scaled data and draw the same plot again.
transformed <- as.data.frame(spatialSign(plotSubset))
xyplot(
  nC ~ X4v,
  data = transformed,
  groups = mdrrClass,
  auto.key = list(columns = 2)
)

因子变量距离分析

# Class-distance example on iris: fit classDist() on a random 100-row subset
# and compute distances for the remaining rows.
# Seed the RNG so the random subset (and therefore the plot) is reproducible,
# as the other sampling examples in this file do.
set.seed(100)
# Sample row indices from the actual row count instead of a hard-coded 1:150.
trainSet <- sample(seq_len(nrow(iris)), 100)
distData <- classDist(
  iris[trainSet, 1:4],
  iris$Species[trainSet]
)
# Distances for the held-out rows (those not in trainSet).
newDist <- predict(
  distData,
  iris[-trainSet, 1:4]
)
# Scatter-plot matrix of the distances, grouped by the held-out species.
splom(newDist, groups = iris$Species[-trainSet])

4.数据分割

简单的按比例分割

# Split iris into training and test sets with createDataPartition(),
# using p = 0.8 and the Species column as the outcome.
library(caret)
set.seed(3456)
trainIndex <- createDataPartition(
  y = iris[["Species"]],
  times = 1,
  p = 0.8,
  # Return the indices as a matrix rather than a list
  list = FALSE
)
head(trainIndex, n = 6)
##      Resample1
## [1,]         1
## [2,]         2
## [3,]         4
## [4,]         5
## [5,]         6
## [6,]         8
# Rows selected by the partition form the training set; the rest are the test set.
irisTrain <- iris[trainIndex, ]
irisTest <- iris[-trainIndex, ]

head(irisTrain, n = 6)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
## 8          5.0         3.4          1.5         0.2  setosa
head(irisTest, n = 6)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 3           4.7         3.2          1.3         0.2  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 12          4.8         3.4          1.6         0.2  setosa
## 14          4.3         3.0          1.1         0.1  setosa
## 21          5.4         3.4          1.7         0.2  setosa
## 23          4.6         3.6          1.0         0.2  setosa

5.模型训练

caret 中的train函数可以训练几乎所有的统计机器学习算法:

# Load the Sonar data from mlbench and inspect its first ten columns.
library(mlbench)
data("Sonar")
str(Sonar[, seq_len(10)])
## 'data.frame':    208 obs. of  10 variables:
##  $ V1 : num  0.02 0.0453 0.0262 0.01 0.0762 0.0286 0.0317 0.0519 0.0223 0.0164 ...
##  $ V2 : num  0.0371 0.0523 0.0582 0.0171 0.0666 0.0453 0.0956 0.0548 0.0375 0.0173 ...
##  $ V3 : num  0.0428 0.0843 0.1099 0.0623 0.0481 ...
##  $ V4 : num  0.0207 0.0689 0.1083 0.0205 0.0394 ...
##  $ V5 : num  0.0954 0.1183 0.0974 0.0205 0.059 ...
##  $ V6 : num  0.0986 0.2583 0.228 0.0368 0.0649 ...
##  $ V7 : num  0.154 0.216 0.243 0.11 0.121 ...
##  $ V8 : num  0.16 0.348 0.377 0.128 0.247 ...
##  $ V9 : num  0.3109 0.3337 0.5598 0.0598 0.3564 ...
##  $ V10: num  0.211 0.287 0.619 0.126 0.446 ...
# Partition Sonar with p = 0.75, then fit a gbm model with train() using
# repeated cross-validation.
library(caret)
set.seed(998)
inTraining <- createDataPartition(y = Sonar[["Class"]], p = 0.75, list = FALSE)
training <- Sonar[inTraining, ]
testing <- Sonar[-inTraining, ]

# Resampling scheme: 10-fold cross-validation, repeated ten times.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
)

set.seed(825)
gbmFit1 <- train(
  Class ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  # Not a train() option: this one passes through to gbm()
  verbose = FALSE
)
## Loading required package: gbm
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
## Loading required package: plyr
# Print the resampling summary of the fitted model.
print(gbmFit1)
## Stochastic Gradient Boosting 
## 
## 157 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 142, 142, 140, 142, 142, 141, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.7609191  0.5163703
##   1                  100      0.7934216  0.5817734
##   1                  150      0.7977230  0.5897796
##   2                   50      0.7858235  0.5667749
##   2                  100      0.8188897  0.6316548
##   2                  150      0.8194363  0.6329037
##   3                   50      0.7895686  0.5726290
##   3                  100      0.8130564  0.6195719
##   3                  150      0.8221348  0.6383441
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Switch lattice to caret's theme, then draw the default plot of the fit.
trellis.par.set(caretTheme())
plot(gbmFit1)

# Level plot of the Kappa metric with x-axis labels rotated by 90 degrees.
plot(
  gbmFit1,
  metric = "Kappa",
  plotType = "level",
  scales = list(x = list(rot = 90))
)

# ggplot2 version of the plot for the same fit.
ggplot(gbmFit1)
## Warning: Ignoring unknown aesthetics: shape