Установить пакет caret, выполнить команду
names(getModelInfo()), ознакомиться со списком доступных
в пакете моделей (методов обучения). Выполнить графический разведочный анализ
данных с использованием функции featurePlot() для набора
данных из справочного файла пакета caret:
# Install and load packages
# Install ggplot2 only when it is missing, and fetch it over HTTPS, so that
# re-running the script does not re-download the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cloud.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/60/zcschg850s91w2km129y03xr0000gn/T//RtmpIaTIQN/downloaded_packages
# Install caret only when it is missing, and fetch it over HTTPS, so that
# re-running the script does not re-download the package every time.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", repos = "https://cloud.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/60/zcschg850s91w2km129y03xr0000gn/T//RtmpIaTIQN/downloaded_packages
# Attach caret and ggplot2.
library(caret)
library(ggplot2)
# Print the names of the entries returned by getModelInfo().
names(getModelInfo())
## [1] "ada" "AdaBag" "AdaBoost.M1"
## [4] "adaboost" "amdai" "ANFIS"
## [7] "avNNet" "awnb" "awtan"
## [10] "bag" "bagEarth" "bagEarthGCV"
## [13] "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "binda"
## [19] "blackboost" "blasso" "blassoAveraged"
## [22] "bridge" "brnn" "BstLm"
## [25] "bstSm" "bstTree" "C5.0"
## [28] "C5.0Cost" "C5.0Rules" "C5.0Tree"
## [31] "cforest" "chaid" "CSimca"
## [34] "ctree" "ctree2" "cubist"
## [37] "dda" "deepboost" "DENFIS"
## [40] "dnn" "dwdLinear" "dwdPoly"
## [43] "dwdRadial" "earth" "elm"
## [46] "enet" "evtree" "extraTrees"
## [49] "fda" "FH.GBML" "FIR.DM"
## [52] "foba" "FRBCS.CHI" "FRBCS.W"
## [55] "FS.HGD" "gam" "gamboost"
## [58] "gamLoess" "gamSpline" "gaussprLinear"
## [61] "gaussprPoly" "gaussprRadial" "gbm_h2o"
## [64] "gbm" "gcvEarth" "GFS.FR.MOGUL"
## [67] "GFS.LT.RS" "GFS.THRIFT" "glm.nb"
## [70] "glm" "glmboost" "glmnet_h2o"
## [73] "glmnet" "glmStepAIC" "gpls"
## [76] "hda" "hdda" "hdrda"
## [79] "HYFIS" "icr" "J48"
## [82] "JRip" "kernelpls" "kknn"
## [85] "knn" "krlsPoly" "krlsRadial"
## [88] "lars" "lars2" "lasso"
## [91] "lda" "lda2" "leapBackward"
## [94] "leapForward" "leapSeq" "Linda"
## [97] "lm" "lmStepAIC" "LMT"
## [100] "loclda" "logicBag" "LogitBoost"
## [103] "logreg" "lssvmLinear" "lssvmPoly"
## [106] "lssvmRadial" "lvq" "M5"
## [109] "M5Rules" "manb" "mda"
## [112] "Mlda" "mlp" "mlpKerasDecay"
## [115] "mlpKerasDecayCost" "mlpKerasDropout" "mlpKerasDropoutCost"
## [118] "mlpML" "mlpSGD" "mlpWeightDecay"
## [121] "mlpWeightDecayML" "monmlp" "msaenet"
## [124] "multinom" "mxnet" "mxnetAdam"
## [127] "naive_bayes" "nb" "nbDiscrete"
## [130] "nbSearch" "neuralnet" "nnet"
## [133] "nnls" "nodeHarvest" "null"
## [136] "OneR" "ordinalNet" "ordinalRF"
## [139] "ORFlog" "ORFpls" "ORFridge"
## [142] "ORFsvm" "ownn" "pam"
## [145] "parRF" "PART" "partDSA"
## [148] "pcaNNet" "pcr" "pda"
## [151] "pda2" "penalized" "PenalizedLDA"
## [154] "plr" "pls" "plsRglm"
## [157] "polr" "ppr" "pre"
## [160] "PRIM" "protoclass" "qda"
## [163] "QdaCov" "qrf" "qrnn"
## [166] "randomGLM" "ranger" "rbf"
## [169] "rbfDDA" "Rborist" "rda"
## [172] "regLogistic" "relaxo" "rf"
## [175] "rFerns" "RFlda" "rfRules"
## [178] "ridge" "rlda" "rlm"
## [181] "rmda" "rocc" "rotationForest"
## [184] "rotationForestCp" "rpart" "rpart1SE"
## [187] "rpart2" "rpartCost" "rpartScore"
## [190] "rqlasso" "rqnc" "RRF"
## [193] "RRFglobal" "rrlda" "RSimca"
## [196] "rvmLinear" "rvmPoly" "rvmRadial"
## [199] "SBC" "sda" "sdwd"
## [202] "simpls" "SLAVE" "slda"
## [205] "smda" "snn" "sparseLDA"
## [208] "spikeslab" "spls" "stepLDA"
## [211] "stepQDA" "superpc" "svmBoundrangeString"
## [214] "svmExpoString" "svmLinear" "svmLinear2"
## [217] "svmLinear3" "svmLinearWeights" "svmLinearWeights2"
## [220] "svmPoly" "svmRadial" "svmRadialCost"
## [223] "svmRadialSigma" "svmRadialWeights" "svmSpectrumString"
## [226] "tan" "tanSearch" "treebag"
## [229] "vbmpRadial" "vglmAdjCat" "vglmContRatio"
## [232] "vglmCumulative" "widekernelpls" "WM"
## [235] "wsrf" "xgbDART" "xgbLinear"
## [238] "xgbTree" "xyf"
# Fix the RNG seed so the simulated data set is reproducible.
set.seed(123)
# 250 standard-normal draws arranged column-wise into 50 rows x 5 columns.
x <- matrix(rnorm(250), nrow = 50)
# Alternating two-level class labels: A, B, A, B, ... (50 in total).
y <- factor(rep(c("A", "B"), times = 25))
# Show the first six rows of the feature matrix.
head(x, n = 6)
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.56047565 0.25331851 -0.71040656 0.7877388 2.1988103
## [2,] -0.23017749 -0.02854676 0.25688371 0.7690422 1.3124130
## [3,] 1.55870831 -0.04287046 -0.24669188 0.3322026 -0.2651451
## [4,] 0.07050839 1.36860228 -0.34754260 -1.0083766 0.5431941
## [5,] 0.12928774 -0.22577099 -0.95161857 -0.1194526 -0.4143399
## [6,] 1.71506499 1.51647060 -0.04502772 -0.2803953 -0.4762469
# Show the first elements of the class-label factor together with its levels.
head(y)
## [1] A B A B A B
## Levels: A B
# Per-feature density curves, drawn separately for each class in y.
featurePlot(x = x, y = y, plot = "density")
Графики плотности
# Per-feature box plots, grouped by the class in y.
featurePlot(x = x, y = y, plot = "box")
Boxplot для каждого признака
# Per-feature strip plots, grouped by the class in y.
featurePlot(x = x, y = y, plot = "strip")
Stripplot для каждого признака
# Scatter-plot matrix of all feature pairs, with points marked by class.
featurePlot(x = x, y = y, plot = "pairs")
Диаграмма рассеяния (pairs plot)
Графики плотности показывают, что данные подчиняются нормальному распределению (так как использовалась rnorm()). Различия между классами A и B незначительны или отсутствуют, что видно по сильному перекрытию распределений всех признаков (V1–V5).
С использованием функций из пакета FSelector определить
важность признаков для классификации, используя набор
iris.
# Install FSelector only when it is missing, and fetch it over HTTPS, so that
# re-running the script does not re-download the package every time.
if (!requireNamespace("FSelector", quietly = TRUE)) {
  install.packages("FSelector", repos = "https://cloud.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/60/zcschg850s91w2km129y03xr0000gn/T//RtmpIaTIQN/downloaded_packages
# Attach FSelector and load the built-in iris data set.
library(FSelector)
data(iris)
# Score every predictor against the Species class with information.gain();
# the result has one attr_importance value per predictor.
gain <- information.gain(Species ~ ., data = iris)
print(gain)
## attr_importance
## Sepal.Length 0.4521286
## Sepal.Width 0.2672750
## Petal.Length 0.9402853
## Petal.Width 0.9554360
Признаки Petal.Length и Petal.Width являются наиболее значимыми, так как содержат наибольшее количество информации о классе.
С использованием функции discretize() из пакета
arules выполнить дискретизацию переменной
Petal.Length различными методами.
# Install arules only when it is missing, and fetch it over HTTPS, so that
# re-running the script does not re-download the package every time.
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules", repos = "https://cloud.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/60/zcschg850s91w2km129y03xr0000gn/T//RtmpIaTIQN/downloaded_packages
# Attach arules (provides discretize()). Attaching it masks base::abbreviate
# and base::write, as reported in the startup messages.
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Discretize Petal.Length with four different strategies and store each
# result as a new column of iris.

# Equal-width intervals.
iris$Petal.Length.interval <- discretize(
  iris$Petal.Length,
  method = "interval"
)

# Intervals holding roughly the same number of observations.
iris$Petal.Length.frequency <- discretize(
  iris$Petal.Length,
  method = "frequency"
)

# Interval borders derived from clustering the values.
# NOTE(review): presumably k-means based, so borders may depend on the RNG
# state — confirm against the arules documentation.
iris$Petal.Length.cluster <- discretize(
  iris$Petal.Length,
  method = "cluster"
)

# User-supplied interval borders.
iris$Petal.Length.fixed <- discretize(
  iris$Petal.Length,
  method = "fixed",
  breaks = c(0, 2, 4, 6, 8)
)

# Print the original variable next to all four discretized versions.
print(iris[, c(
  "Petal.Length",
  "Petal.Length.interval",
  "Petal.Length.frequency",
  "Petal.Length.cluster",
  "Petal.Length.fixed"
)])
## Petal.Length Petal.Length.interval Petal.Length.frequency
## 1 1.4 [1,2.97) [1,2.63)
## 2 1.4 [1,2.97) [1,2.63)
## 3 1.3 [1,2.97) [1,2.63)
## 4 1.5 [1,2.97) [1,2.63)
## 5 1.4 [1,2.97) [1,2.63)
## 6 1.7 [1,2.97) [1,2.63)
## 7 1.4 [1,2.97) [1,2.63)
## 8 1.5 [1,2.97) [1,2.63)
## 9 1.4 [1,2.97) [1,2.63)
## 10 1.5 [1,2.97) [1,2.63)
## 11 1.5 [1,2.97) [1,2.63)
## 12 1.6 [1,2.97) [1,2.63)
## 13 1.4 [1,2.97) [1,2.63)
## 14 1.1 [1,2.97) [1,2.63)
## 15 1.2 [1,2.97) [1,2.63)
## 16 1.5 [1,2.97) [1,2.63)
## 17 1.3 [1,2.97) [1,2.63)
## 18 1.4 [1,2.97) [1,2.63)
## 19 1.7 [1,2.97) [1,2.63)
## 20 1.5 [1,2.97) [1,2.63)
## 21 1.7 [1,2.97) [1,2.63)
## 22 1.5 [1,2.97) [1,2.63)
## 23 1.0 [1,2.97) [1,2.63)
## 24 1.7 [1,2.97) [1,2.63)
## 25 1.9 [1,2.97) [1,2.63)
## 26 1.6 [1,2.97) [1,2.63)
## 27 1.6 [1,2.97) [1,2.63)
## 28 1.5 [1,2.97) [1,2.63)
## 29 1.4 [1,2.97) [1,2.63)
## 30 1.6 [1,2.97) [1,2.63)
## 31 1.6 [1,2.97) [1,2.63)
## 32 1.5 [1,2.97) [1,2.63)
## 33 1.5 [1,2.97) [1,2.63)
## 34 1.4 [1,2.97) [1,2.63)
## 35 1.5 [1,2.97) [1,2.63)
## 36 1.2 [1,2.97) [1,2.63)
## 37 1.3 [1,2.97) [1,2.63)
## 38 1.4 [1,2.97) [1,2.63)
## 39 1.3 [1,2.97) [1,2.63)
## 40 1.5 [1,2.97) [1,2.63)
## 41 1.3 [1,2.97) [1,2.63)
## 42 1.3 [1,2.97) [1,2.63)
## 43 1.3 [1,2.97) [1,2.63)
## 44 1.6 [1,2.97) [1,2.63)
## 45 1.9 [1,2.97) [1,2.63)
## 46 1.4 [1,2.97) [1,2.63)
## 47 1.6 [1,2.97) [1,2.63)
## 48 1.4 [1,2.97) [1,2.63)
## 49 1.5 [1,2.97) [1,2.63)
## 50 1.4 [1,2.97) [1,2.63)
## 51 4.7 [2.97,4.93) [2.63,4.9)
## 52 4.5 [2.97,4.93) [2.63,4.9)
## 53 4.9 [2.97,4.93) [4.9,6.9]
## 54 4.0 [2.97,4.93) [2.63,4.9)
## 55 4.6 [2.97,4.93) [2.63,4.9)
## 56 4.5 [2.97,4.93) [2.63,4.9)
## 57 4.7 [2.97,4.93) [2.63,4.9)
## 58 3.3 [2.97,4.93) [2.63,4.9)
## 59 4.6 [2.97,4.93) [2.63,4.9)
## 60 3.9 [2.97,4.93) [2.63,4.9)
## 61 3.5 [2.97,4.93) [2.63,4.9)
## 62 4.2 [2.97,4.93) [2.63,4.9)
## 63 4.0 [2.97,4.93) [2.63,4.9)
## 64 4.7 [2.97,4.93) [2.63,4.9)
## 65 3.6 [2.97,4.93) [2.63,4.9)
## 66 4.4 [2.97,4.93) [2.63,4.9)
## 67 4.5 [2.97,4.93) [2.63,4.9)
## 68 4.1 [2.97,4.93) [2.63,4.9)
## 69 4.5 [2.97,4.93) [2.63,4.9)
## 70 3.9 [2.97,4.93) [2.63,4.9)
## 71 4.8 [2.97,4.93) [2.63,4.9)
## 72 4.0 [2.97,4.93) [2.63,4.9)
## 73 4.9 [2.97,4.93) [4.9,6.9]
## 74 4.7 [2.97,4.93) [2.63,4.9)
## 75 4.3 [2.97,4.93) [2.63,4.9)
## 76 4.4 [2.97,4.93) [2.63,4.9)
## 77 4.8 [2.97,4.93) [2.63,4.9)
## 78 5.0 [4.93,6.9] [4.9,6.9]
## 79 4.5 [2.97,4.93) [2.63,4.9)
## 80 3.5 [2.97,4.93) [2.63,4.9)
## 81 3.8 [2.97,4.93) [2.63,4.9)
## 82 3.7 [2.97,4.93) [2.63,4.9)
## 83 3.9 [2.97,4.93) [2.63,4.9)
## 84 5.1 [4.93,6.9] [4.9,6.9]
## 85 4.5 [2.97,4.93) [2.63,4.9)
## 86 4.5 [2.97,4.93) [2.63,4.9)
## 87 4.7 [2.97,4.93) [2.63,4.9)
## 88 4.4 [2.97,4.93) [2.63,4.9)
## 89 4.1 [2.97,4.93) [2.63,4.9)
## 90 4.0 [2.97,4.93) [2.63,4.9)
## 91 4.4 [2.97,4.93) [2.63,4.9)
## 92 4.6 [2.97,4.93) [2.63,4.9)
## 93 4.0 [2.97,4.93) [2.63,4.9)
## 94 3.3 [2.97,4.93) [2.63,4.9)
## 95 4.2 [2.97,4.93) [2.63,4.9)
## 96 4.2 [2.97,4.93) [2.63,4.9)
## 97 4.2 [2.97,4.93) [2.63,4.9)
## 98 4.3 [2.97,4.93) [2.63,4.9)
## 99 3.0 [2.97,4.93) [2.63,4.9)
## 100 4.1 [2.97,4.93) [2.63,4.9)
## 101 6.0 [4.93,6.9] [4.9,6.9]
## 102 5.1 [4.93,6.9] [4.9,6.9]
## 103 5.9 [4.93,6.9] [4.9,6.9]
## 104 5.6 [4.93,6.9] [4.9,6.9]
## 105 5.8 [4.93,6.9] [4.9,6.9]
## 106 6.6 [4.93,6.9] [4.9,6.9]
## 107 4.5 [2.97,4.93) [2.63,4.9)
## 108 6.3 [4.93,6.9] [4.9,6.9]
## 109 5.8 [4.93,6.9] [4.9,6.9]
## 110 6.1 [4.93,6.9] [4.9,6.9]
## 111 5.1 [4.93,6.9] [4.9,6.9]
## 112 5.3 [4.93,6.9] [4.9,6.9]
## 113 5.5 [4.93,6.9] [4.9,6.9]
## 114 5.0 [4.93,6.9] [4.9,6.9]
## 115 5.1 [4.93,6.9] [4.9,6.9]
## 116 5.3 [4.93,6.9] [4.9,6.9]
## 117 5.5 [4.93,6.9] [4.9,6.9]
## 118 6.7 [4.93,6.9] [4.9,6.9]
## 119 6.9 [4.93,6.9] [4.9,6.9]
## 120 5.0 [4.93,6.9] [4.9,6.9]
## 121 5.7 [4.93,6.9] [4.9,6.9]
## 122 4.9 [2.97,4.93) [4.9,6.9]
## 123 6.7 [4.93,6.9] [4.9,6.9]
## 124 4.9 [2.97,4.93) [4.9,6.9]
## 125 5.7 [4.93,6.9] [4.9,6.9]
## 126 6.0 [4.93,6.9] [4.9,6.9]
## 127 4.8 [2.97,4.93) [2.63,4.9)
## 128 4.9 [2.97,4.93) [4.9,6.9]
## 129 5.6 [4.93,6.9] [4.9,6.9]
## 130 5.8 [4.93,6.9] [4.9,6.9]
## 131 6.1 [4.93,6.9] [4.9,6.9]
## 132 6.4 [4.93,6.9] [4.9,6.9]
## 133 5.6 [4.93,6.9] [4.9,6.9]
## 134 5.1 [4.93,6.9] [4.9,6.9]
## 135 5.6 [4.93,6.9] [4.9,6.9]
## 136 6.1 [4.93,6.9] [4.9,6.9]
## 137 5.6 [4.93,6.9] [4.9,6.9]
## 138 5.5 [4.93,6.9] [4.9,6.9]
## 139 4.8 [2.97,4.93) [2.63,4.9)
## 140 5.4 [4.93,6.9] [4.9,6.9]
## 141 5.6 [4.93,6.9] [4.9,6.9]
## 142 5.1 [4.93,6.9] [4.9,6.9]
## 143 5.1 [4.93,6.9] [4.9,6.9]
## 144 5.9 [4.93,6.9] [4.9,6.9]
## 145 5.7 [4.93,6.9] [4.9,6.9]
## 146 5.2 [4.93,6.9] [4.9,6.9]
## 147 5.0 [4.93,6.9] [4.9,6.9]
## 148 5.2 [4.93,6.9] [4.9,6.9]
## 149 5.4 [4.93,6.9] [4.9,6.9]
## 150 5.1 [4.93,6.9] [4.9,6.9]
## Petal.Length.cluster Petal.Length.fixed
## 1 [1,2.88) [0,2)
## 2 [1,2.88) [0,2)
## 3 [1,2.88) [0,2)
## 4 [1,2.88) [0,2)
## 5 [1,2.88) [0,2)
## 6 [1,2.88) [0,2)
## 7 [1,2.88) [0,2)
## 8 [1,2.88) [0,2)
## 9 [1,2.88) [0,2)
## 10 [1,2.88) [0,2)
## 11 [1,2.88) [0,2)
## 12 [1,2.88) [0,2)
## 13 [1,2.88) [0,2)
## 14 [1,2.88) [0,2)
## 15 [1,2.88) [0,2)
## 16 [1,2.88) [0,2)
## 17 [1,2.88) [0,2)
## 18 [1,2.88) [0,2)
## 19 [1,2.88) [0,2)
## 20 [1,2.88) [0,2)
## 21 [1,2.88) [0,2)
## 22 [1,2.88) [0,2)
## 23 [1,2.88) [0,2)
## 24 [1,2.88) [0,2)
## 25 [1,2.88) [0,2)
## 26 [1,2.88) [0,2)
## 27 [1,2.88) [0,2)
## 28 [1,2.88) [0,2)
## 29 [1,2.88) [0,2)
## 30 [1,2.88) [0,2)
## 31 [1,2.88) [0,2)
## 32 [1,2.88) [0,2)
## 33 [1,2.88) [0,2)
## 34 [1,2.88) [0,2)
## 35 [1,2.88) [0,2)
## 36 [1,2.88) [0,2)
## 37 [1,2.88) [0,2)
## 38 [1,2.88) [0,2)
## 39 [1,2.88) [0,2)
## 40 [1,2.88) [0,2)
## 41 [1,2.88) [0,2)
## 42 [1,2.88) [0,2)
## 43 [1,2.88) [0,2)
## 44 [1,2.88) [0,2)
## 45 [1,2.88) [0,2)
## 46 [1,2.88) [0,2)
## 47 [1,2.88) [0,2)
## 48 [1,2.88) [0,2)
## 49 [1,2.88) [0,2)
## 50 [1,2.88) [0,2)
## 51 [2.88,4.96) [4,6)
## 52 [2.88,4.96) [4,6)
## 53 [2.88,4.96) [4,6)
## 54 [2.88,4.96) [4,6)
## 55 [2.88,4.96) [4,6)
## 56 [2.88,4.96) [4,6)
## 57 [2.88,4.96) [4,6)
## 58 [2.88,4.96) [2,4)
## 59 [2.88,4.96) [4,6)
## 60 [2.88,4.96) [2,4)
## 61 [2.88,4.96) [2,4)
## 62 [2.88,4.96) [4,6)
## 63 [2.88,4.96) [4,6)
## 64 [2.88,4.96) [4,6)
## 65 [2.88,4.96) [2,4)
## 66 [2.88,4.96) [4,6)
## 67 [2.88,4.96) [4,6)
## 68 [2.88,4.96) [4,6)
## 69 [2.88,4.96) [4,6)
## 70 [2.88,4.96) [2,4)
## 71 [2.88,4.96) [4,6)
## 72 [2.88,4.96) [4,6)
## 73 [2.88,4.96) [4,6)
## 74 [2.88,4.96) [4,6)
## 75 [2.88,4.96) [4,6)
## 76 [2.88,4.96) [4,6)
## 77 [2.88,4.96) [4,6)
## 78 [4.96,6.9] [4,6)
## 79 [2.88,4.96) [4,6)
## 80 [2.88,4.96) [2,4)
## 81 [2.88,4.96) [2,4)
## 82 [2.88,4.96) [2,4)
## 83 [2.88,4.96) [2,4)
## 84 [4.96,6.9] [4,6)
## 85 [2.88,4.96) [4,6)
## 86 [2.88,4.96) [4,6)
## 87 [2.88,4.96) [4,6)
## 88 [2.88,4.96) [4,6)
## 89 [2.88,4.96) [4,6)
## 90 [2.88,4.96) [4,6)
## 91 [2.88,4.96) [4,6)
## 92 [2.88,4.96) [4,6)
## 93 [2.88,4.96) [4,6)
## 94 [2.88,4.96) [2,4)
## 95 [2.88,4.96) [4,6)
## 96 [2.88,4.96) [4,6)
## 97 [2.88,4.96) [4,6)
## 98 [2.88,4.96) [4,6)
## 99 [2.88,4.96) [2,4)
## 100 [2.88,4.96) [4,6)
## 101 [4.96,6.9] [6,8]
## 102 [4.96,6.9] [4,6)
## 103 [4.96,6.9] [4,6)
## 104 [4.96,6.9] [4,6)
## 105 [4.96,6.9] [4,6)
## 106 [4.96,6.9] [6,8]
## 107 [2.88,4.96) [4,6)
## 108 [4.96,6.9] [6,8]
## 109 [4.96,6.9] [4,6)
## 110 [4.96,6.9] [6,8]
## 111 [4.96,6.9] [4,6)
## 112 [4.96,6.9] [4,6)
## 113 [4.96,6.9] [4,6)
## 114 [4.96,6.9] [4,6)
## 115 [4.96,6.9] [4,6)
## 116 [4.96,6.9] [4,6)
## 117 [4.96,6.9] [4,6)
## 118 [4.96,6.9] [6,8]
## 119 [4.96,6.9] [6,8]
## 120 [4.96,6.9] [4,6)
## 121 [4.96,6.9] [4,6)
## 122 [2.88,4.96) [4,6)
## 123 [4.96,6.9] [6,8]
## 124 [2.88,4.96) [4,6)
## 125 [4.96,6.9] [4,6)
## 126 [4.96,6.9] [6,8]
## 127 [2.88,4.96) [4,6)
## 128 [2.88,4.96) [4,6)
## 129 [4.96,6.9] [4,6)
## 130 [4.96,6.9] [4,6)
## 131 [4.96,6.9] [6,8]
## 132 [4.96,6.9] [6,8]
## 133 [4.96,6.9] [4,6)
## 134 [4.96,6.9] [4,6)
## 135 [4.96,6.9] [4,6)
## 136 [4.96,6.9] [6,8]
## 137 [4.96,6.9] [4,6)
## 138 [4.96,6.9] [4,6)
## 139 [2.88,4.96) [4,6)
## 140 [4.96,6.9] [4,6)
## 141 [4.96,6.9] [4,6)
## 142 [4.96,6.9] [4,6)
## 143 [4.96,6.9] [4,6)
## 144 [4.96,6.9] [4,6)
## 145 [4.96,6.9] [4,6)
## 146 [4.96,6.9] [4,6)
## 147 [4.96,6.9] [4,6)
## 148 [4.96,6.9] [4,6)
## 149 [4.96,6.9] [4,6)
## 150 [4.96,6.9] [4,6)
# Preview the first rows of the four discretized columns only.
head(iris[, c(
  "Petal.Length.interval",
  "Petal.Length.frequency",
  "Petal.Length.cluster",
  "Petal.Length.fixed"
)])
## Petal.Length.interval Petal.Length.frequency Petal.Length.cluster
## 1 [1,2.97) [1,2.63) [1,2.88)
## 2 [1,2.97) [1,2.63) [1,2.88)
## 3 [1,2.97) [1,2.63) [1,2.88)
## 4 [1,2.97) [1,2.63) [1,2.88)
## 5 [1,2.97) [1,2.63) [1,2.88)
## 6 [1,2.97) [1,2.63) [1,2.88)
## Petal.Length.fixed
## 1 [0,2)
## 2 [0,2)
## 3 [0,2)
## 4 [0,2)
## 5 [0,2)
## 6 [0,2)
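Каждый из этих методов позволяет разделить непрерывную переменную на отдельные участки, но делает это по-своему. Метод «interval» делит диапазон значений на интервалы равной ширины и удобен, когда данные распределены относительно равномерно. Метод «frequency» формирует интервалы с примерно одинаковым числом наблюдений и подходит, когда важно, чтобы наблюдения были распределены по интервалам равномерно. Метод «cluster» определяет границы интервалов с помощью кластеризации и полезен, когда данные имеют сложную структуру (например, несколько скоплений значений). Метод «fixed» использует границы, заданные пользователем, и необходим, когда требуется строгое разделение данных по заданным критериям.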
Использование пакета Boruta для выбора значимых
признаков в наборе данных Ozone.
# Install Boruta only when it is missing, and fetch it over HTTPS, so that
# re-running the script does not re-download the package every time.
if (!requireNamespace("Boruta", quietly = TRUE)) {
  install.packages("Boruta", repos = "https://cloud.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/60/zcschg850s91w2km129y03xr0000gn/T//RtmpIaTIQN/downloaded_packages
# Install mlbench only when it is missing, and fetch it over HTTPS, so that
# re-running the script does not re-download the package every time.
if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench", repos = "https://cloud.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/60/zcschg850s91w2km129y03xr0000gn/T//RtmpIaTIQN/downloaded_packages
# Attach Boruta (feature selection) and mlbench (source of the Ozone data).
library(Boruta)
library(mlbench)
# Load Ozone from mlbench and drop every row that contains a missing value.
data("Ozone", package = "mlbench")
Ozone <- na.omit(Ozone)
# Preview the first rows of the cleaned data.
head(Ozone)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 5 1 5 1 5 5760 3 51 54 45.32 1450 25 57.02 60
## 6 1 6 2 6 5720 4 69 35 49.64 1568 15 53.78 60
## 7 1 7 3 4 5790 6 19 45 46.40 2631 -33 54.14 100
## 8 1 8 4 4 5790 3 25 55 52.70 554 -28 64.76 250
## 9 1 9 5 6 5700 3 73 41 48.02 2083 23 52.52 120
## 12 1 12 1 6 5720 3 44 51 54.32 111 9 63.14 150
# Run Boruta with V4 as the response and all other columns as predictors;
# doTrace = 2 prints progress for each importance run and each decision.
# NOTE(review): the outcome depends on the RNG state; presumably it relies on
# the set.seed() call made earlier in the script — confirm.
boruta_result <- Boruta(V4 ~ ., data = Ozone, doTrace = 2)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## After 11 iterations, +0.58 secs:
## confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more;
## rejected 1 attribute: V3;
## still have 2 attributes left.
## 12. run of importance source...
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## After 15 iterations, +0.78 secs:
## rejected 1 attribute: V2;
## still have 1 attribute left.
## 16. run of importance source...
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## 21. run of importance source...
## After 21 iterations, +1.1 secs:
## rejected 1 attribute: V6;
## no more attributes left.
# Summarise the run: iteration count plus the confirmed and rejected attributes.
print(boruta_result)
## Boruta performed 21 iterations in 1.081006 secs.
## 9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
## 3 attributes confirmed unimportant: V2, V3, V6;
# Plot the Boruta result with slightly smaller axis annotation.
plot(boruta_result, cex.axis = 0.8)

# Keep the confirmed attributes together with any that are still tentative.
important_vars <- getSelectedAttributes(boruta_result, withTentative = TRUE)

# Box plots of the selected columns; las = 2 draws the axis labels
# perpendicular to the axes.
boxplot(
  Ozone[, important_vars],
  main = "Selected Features Boxplot",
  las = 2,
  col = "lightblue"
)
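По результатам работы алгоритма Boruta значимыми для предсказания V4 признаны признаки: V9, V8, V12, V11, V7, V10, V13, V1, V5 (Boruta оценивает важность признаков, а не их корреляцию с V4). На диаграмме размаха значения V5 значительно выше остальных, так как признаки измерены в разных масштабах. Признак V10 имеет большой разброс значений.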