Установить пакет CARET, выполнить команду names(getModelInfo()), ознакомиться со списком доступных методов выбора признаков.
# Print the names of the entries returned by caret's getModelInfo().
names(caret::getModelInfo())
## [1] "ada" "AdaBag" "AdaBoost.M1"
## [4] "adaboost" "amdai" "ANFIS"
## [7] "avNNet" "awnb" "awtan"
## [10] "bag" "bagEarth" "bagEarthGCV"
## [13] "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "binda"
## [19] "blackboost" "blasso" "blassoAveraged"
## [22] "bridge" "brnn" "BstLm"
## [25] "bstSm" "bstTree" "C5.0"
## [28] "C5.0Cost" "C5.0Rules" "C5.0Tree"
## [31] "cforest" "chaid" "CSimca"
## [34] "ctree" "ctree2" "cubist"
## [37] "dda" "deepboost" "DENFIS"
## [40] "dnn" "dwdLinear" "dwdPoly"
## [43] "dwdRadial" "earth" "elm"
## [46] "enet" "evtree" "extraTrees"
## [49] "fda" "FH.GBML" "FIR.DM"
## [52] "foba" "FRBCS.CHI" "FRBCS.W"
## [55] "FS.HGD" "gam" "gamboost"
## [58] "gamLoess" "gamSpline" "gaussprLinear"
## [61] "gaussprPoly" "gaussprRadial" "gbm_h2o"
## [64] "gbm" "gcvEarth" "GFS.FR.MOGUL"
## [67] "GFS.LT.RS" "GFS.THRIFT" "glm.nb"
## [70] "glm" "glmboost" "glmnet_h2o"
## [73] "glmnet" "glmStepAIC" "gpls"
## [76] "hda" "hdda" "hdrda"
## [79] "HYFIS" "icr" "J48"
## [82] "JRip" "kernelpls" "kknn"
## [85] "knn" "krlsPoly" "krlsRadial"
## [88] "lars" "lars2" "lasso"
## [91] "lda" "lda2" "leapBackward"
## [94] "leapForward" "leapSeq" "Linda"
## [97] "lm" "lmStepAIC" "LMT"
## [100] "loclda" "logicBag" "LogitBoost"
## [103] "logreg" "lssvmLinear" "lssvmPoly"
## [106] "lssvmRadial" "lvq" "M5"
## [109] "M5Rules" "manb" "mda"
## [112] "Mlda" "mlp" "mlpKerasDecay"
## [115] "mlpKerasDecayCost" "mlpKerasDropout" "mlpKerasDropoutCost"
## [118] "mlpML" "mlpSGD" "mlpWeightDecay"
## [121] "mlpWeightDecayML" "monmlp" "msaenet"
## [124] "multinom" "mxnet" "mxnetAdam"
## [127] "naive_bayes" "nb" "nbDiscrete"
## [130] "nbSearch" "neuralnet" "nnet"
## [133] "nnls" "nodeHarvest" "null"
## [136] "OneR" "ordinalNet" "ordinalRF"
## [139] "ORFlog" "ORFpls" "ORFridge"
## [142] "ORFsvm" "ownn" "pam"
## [145] "parRF" "PART" "partDSA"
## [148] "pcaNNet" "pcr" "pda"
## [151] "pda2" "penalized" "PenalizedLDA"
## [154] "plr" "pls" "plsRglm"
## [157] "polr" "ppr" "pre"
## [160] "PRIM" "protoclass" "qda"
## [163] "QdaCov" "qrf" "qrnn"
## [166] "randomGLM" "ranger" "rbf"
## [169] "rbfDDA" "Rborist" "rda"
## [172] "regLogistic" "relaxo" "rf"
## [175] "rFerns" "RFlda" "rfRules"
## [178] "ridge" "rlda" "rlm"
## [181] "rmda" "rocc" "rotationForest"
## [184] "rotationForestCp" "rpart" "rpart1SE"
## [187] "rpart2" "rpartCost" "rpartScore"
## [190] "rqlasso" "rqnc" "RRF"
## [193] "RRFglobal" "rrlda" "RSimca"
## [196] "rvmLinear" "rvmPoly" "rvmRadial"
## [199] "SBC" "sda" "sdwd"
## [202] "simpls" "SLAVE" "slda"
## [205] "smda" "snn" "sparseLDA"
## [208] "spikeslab" "spls" "stepLDA"
## [211] "stepQDA" "superpc" "svmBoundrangeString"
## [214] "svmExpoString" "svmLinear" "svmLinear2"
## [217] "svmLinear3" "svmLinearWeights" "svmLinearWeights2"
## [220] "svmPoly" "svmRadial" "svmRadialCost"
## [223] "svmRadialSigma" "svmRadialWeights" "svmSpectrumString"
## [226] "tan" "tanSearch" "treebag"
## [229] "vbmpRadial" "vglmAdjCat" "vglmContRatio"
## [232] "vglmCumulative" "widekernelpls" "WM"
## [235] "wsrf" "xgbDART" "xgbLinear"
## [238] "xgbTree" "xyf"
Выполните графический разведочный анализ данных с использованием функции featurePlot() для набора данных из справочного файла пакета CARET:
# Simulated example data: a 50 x 5 matrix of standard-normal draws and a
# two-level class label that alternates A, B, A, B, ...
# The seed is fixed so that the figures below can be regenerated exactly.
set.seed(777)
x <- matrix(rnorm(50 * 5), ncol = 5)
y <- factor(rep(c("A", "B"), 25))

# Draw the same data with four different featurePlot() plot types.
featurePlot(x, y, plot = "box")
featurePlot(x, y, plot = "strip")
featurePlot(x, y, plot = "density")
featurePlot(x, y, plot = "pairs")
Сохранить полученные графики в *.jpg файлы. Сделать выводы.
На основе случайно сгенерированных значений x с
нормальным распределением были построены графики. При помощи
использования фактора y с чередующимися 25 раз значениями
A и B эти значения были разделены на 2 вида. В
графиках вида box и strip эти виды отображены
в виде пар, в то время как в графиках вида density и
pairs эти виды обозначены разными цветами.
При этом стоит отметить и тот факт, что набор значений x
также разделён на 5 столбцов при помощи функции matrix с
параметром ncol=5, из-за чего для графиков box, strip и density
было показано 5 рисунков — по одному на каждый столбец, а для графика
pairs и вовсе было показано \(5*5-5=20\) рисунков.
Наиболее понятными я считаю графики box и
density, так как они наглядно показывают плотность
распределения значений. Так, density позволяет легко
опознать пики значений.
С использованием функций из пакета FSelector определить важность признаков для решения задачи классификации. Использовать набор data(iris). Сделать выводы.
# Load the built-in iris data set and preview its first six rows.
data("iris")
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Здесь мы видим характеристики чашелистников (Sepal) и лепестков (Petal) и вид ириса (Species). Очевидно, все характеристики зависят в первую очередь от вида. Попробуем узнать, насколько сильно, при помощи функции information.gain из FSelector.
# Information gain of every other iris column with respect to Species.
FSelector::information.gain(formula = Species ~ ., data = iris)
## attr_importance
## Sepal.Length 0.4521286
## Sepal.Width 0.2672750
## Petal.Length 0.9402853
## Petal.Width 0.9554360
Мы видим, что наибольший вес важности у длины и ширины лепестков.
С использованием функции discretize() из пакета arules выполните преобразование непрерывной переменной в категориальную различными методами: «interval» (равная ширина интервала), «frequency» (равная частота), «cluster» (кластеризация) и «fixed» (категории задают границы интервалов). Используйте набор данных iris. Сделайте выводы
Взглянем на ширину лепестков, так как это самый важный признак.
Ради интереса построю график
# Density plot of petal width via the formula interface.
# The former `plot = "density"` argument belongs to caret::featurePlot(),
# not to densityplot(), and has been dropped.
# NOTE(review): that stray argument appears to have been partially matched
# to `plot.points`, suppressing the raw data points; `plot.points = FALSE`
# keeps that appearance explicitly -- confirm against the saved figure.
densityplot(~ Petal.Width, data = iris, plot.points = FALSE)
Здесь явно выражено падение в районе ширины 0,8. Интересно, будет ли это отражено при дискретизации, и где будет обозначена граница между второй и третьей категорией.
# Equal-width binning of petal width (number of bins left at its default).
arules::discretize(x = iris$Petal.Width, method = "interval")
## [1] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [8] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [15] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [22] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [29] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [36] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [43] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
## [50] [0.1,0.9) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [57] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [64] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [71] [1.7,2.5] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [78] [1.7,2.5] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [85] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [92] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
## [99] [0.9,1.7) [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [106] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [113] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [120] [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [127] [1.7,2.5] [1.7,2.5] [1.7,2.5] [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [134] [0.9,1.7) [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [141] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [148] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## attr(,"discretized:breaks")
## [1] 0.1 0.9 1.7 2.5
## attr(,"discretized:method")
## [1] interval
## Levels: [0.1,0.9) [0.9,1.7) [1.7,2.5]
# Equal-frequency binning of petal width (number of bins left at its default).
arules::discretize(x = iris$Petal.Width, method = "frequency")
## [1] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
## [7] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
## [13] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
## [19] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
## [25] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
## [31] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
## [37] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
## [43] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
## [49] [0.1,0.867) [0.1,0.867) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6)
## [55] [0.867,1.6) [0.867,1.6) [1.6,2.5] [0.867,1.6) [0.867,1.6) [0.867,1.6)
## [61] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6)
## [67] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [1.6,2.5] [0.867,1.6)
## [73] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [1.6,2.5]
## [79] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [1.6,2.5]
## [85] [0.867,1.6) [1.6,2.5] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6)
## [91] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6)
## [97] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [1.6,2.5] [1.6,2.5]
## [103] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5]
## [109] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5]
## [115] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [0.867,1.6)
## [121] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5]
## [127] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5]
## [133] [1.6,2.5] [0.867,1.6) [0.867,1.6) [1.6,2.5] [1.6,2.5] [1.6,2.5]
## [139] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5]
## [145] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5] [1.6,2.5]
## attr(,"discretized:breaks")
## [1] 0.1000000 0.8666667 1.6000000 2.5000000
## attr(,"discretized:method")
## [1] frequency
## Levels: [0.1,0.867) [0.867,1.6) [1.6,2.5]
# Cluster-based binning of petal width (number of bins left at its default).
arules::discretize(x = iris$Petal.Width, method = "cluster")
## [1] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [6] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [11] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [16] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [21] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [26] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [31] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [36] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [41] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [46] [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785) [0.1,0.785)
## [51] [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69)
## [56] [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69)
## [61] [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69)
## [66] [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69)
## [71] [1.69,2.5] [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69)
## [76] [0.785,1.69) [0.785,1.69) [1.69,2.5] [0.785,1.69) [0.785,1.69)
## [81] [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69)
## [86] [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69)
## [91] [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69)
## [96] [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69) [0.785,1.69)
## [101] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5]
## [106] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5]
## [111] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5]
## [116] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5] [0.785,1.69)
## [121] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5]
## [126] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5] [0.785,1.69)
## [131] [1.69,2.5] [1.69,2.5] [1.69,2.5] [0.785,1.69) [0.785,1.69)
## [136] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5]
## [141] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5]
## [146] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5] [1.69,2.5]
## attr(,"discretized:breaks")
## [1] 0.1000000 0.7845385 1.6907051 2.5000000
## attr(,"discretized:method")
## [1] cluster
## Levels: [0.1,0.785) [0.785,1.69) [1.69,2.5]
# Binning with hand-picked inner cut points at 1 and 2; the outer limits are
# the observed minimum and maximum of petal width.
with(
  iris,
  discretize(
    Petal.Width,
    method = "fixed",
    breaks = c(min(Petal.Width), 1, 2, max(Petal.Width))
  )
)
## [1] [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1)
## [10] [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1)
## [19] [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1)
## [28] [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1)
## [37] [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1)
## [46] [0.1,1) [0.1,1) [0.1,1) [0.1,1) [0.1,1) [1,2) [1,2) [1,2) [1,2)
## [55] [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2)
## [64] [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2)
## [73] [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2)
## [82] [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2)
## [91] [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2) [1,2)
## [100] [1,2) [2,2.5] [1,2) [2,2.5] [1,2) [2,2.5] [2,2.5] [1,2) [1,2)
## [109] [1,2) [2,2.5] [2,2.5] [1,2) [2,2.5] [2,2.5] [2,2.5] [2,2.5] [1,2)
## [118] [2,2.5] [2,2.5] [1,2) [2,2.5] [2,2.5] [2,2.5] [1,2) [2,2.5] [1,2)
## [127] [1,2) [1,2) [2,2.5] [1,2) [1,2) [2,2.5] [2,2.5] [1,2) [1,2)
## [136] [2,2.5] [2,2.5] [1,2) [1,2) [2,2.5] [2,2.5] [2,2.5] [1,2) [2,2.5]
## [145] [2,2.5] [2,2.5] [1,2) [2,2.5] [2,2.5] [1,2)
## attr(,"discretized:breaks")
## [1] 0.1 1.0 2.0 2.5
## attr(,"discretized:method")
## [1] fixed
## Levels: [0.1,1) [1,2) [2,2.5]
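Мы видим, что метод interval просто разделяет диапазон значений
на равные по ширине промежутки, frequency подбирает границы так, чтобы
в каждую категорию попало примерно одинаковое число наблюдений
(первая граница при этом оказалась около провала на графике плотности,
как примерно и ожидалось), cluster делит по скоплениям значений, а fixed
использует заранее заданные границы.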
Установите пакет Boruta и проведите выбор признаков для набора данных data(“Ozone”). Построить график boxplot, сделать выводы.
# Load the Ozone data set from the mlbench package.
data(list = "Ozone", package = "mlbench")
Согласно описанию набора данных, столбцы содержат следующее:
Month: 1 = January, …, 12 = December
Day of month
Day of week: 1 = Monday, …, 7 = Sunday
Daily maximum one-hour-average ozone reading
500 millibar pressure height (m) measured at Vandenberg AFB
Wind speed (mph) at Los Angeles International Airport (LAX)
Humidity (%) at LAX
Temperature (degrees F) measured at Sandburg, CA
Temperature (degrees F) measured at El Monte, CA
Inversion base height (feet) at LAX
Pressure gradient (mm Hg) from LAX to Daggett, CA
Inversion base temperature (degrees F) at LAX
Visibility (miles) measured at LAX
Очевидно, мы будем рассматривать 4-й столбец, так как именно ради его значений и был собран этот набор данных. Однако не во всех строках 4-го столбца есть данные, поэтому строки с пропусками были удалены.
# Keep only the rows where the response V4 is present.
# NOTE(review): only V4 is checked here; other columns may still contain
# missing values -- confirm that Boruta accepts them in this setup.
has_response <- !is.na(Ozone$V4)
Ozone <- Ozone[has_response, ]
rm(has_response)

# Fixed seed so that the Boruta run can be repeated.
set.seed(777)
boruta_output <- Boruta(V4 ~ ., data = Ozone)

# Plot the Boruta result, keeping the attributes unsorted.
plot(boruta_output, sort = FALSE)
8-й столбец (температура в Сандберге) оказался наиболее определяющим для измерений озона.