# Simulate a 50 x 5 matrix of standard-normal predictors.
x <- matrix(rnorm(50 * 5), ncol = 5)
# Two-level class label alternating "A", "B" over the 50 rows.
# Plain ASCII quotes are required here: typographic quotes do not parse in R.
y <- factor(rep(c("A", "B"), 25))
Сохранить полученные графики в *.jpg файлы. Сделать выводы.
# Attach the caret package.
library(caret)
# Auto-print the names of all entries returned by getModelInfo().
names(getModelInfo())
[1] "ada" "AdaBag" "AdaBoost.M1" "adaboost" "amdai"
[6] "ANFIS" "avNNet" "awnb" "awtan" "bag"
[11] "bagEarth" "bagEarthGCV" "bagFDA" "bagFDAGCV" "bam"
[16] "bartMachine" "bayesglm" "binda" "blackboost" "blasso"
[21] "blassoAveraged" "bridge" "brnn" "BstLm" "bstSm"
[26] "bstTree" "C5.0" "C5.0Cost" "C5.0Rules" "C5.0Tree"
[31] "cforest" "chaid" "CSimca" "ctree" "ctree2"
[36] "cubist" "dda" "deepboost" "DENFIS" "dnn"
[41] "dwdLinear" "dwdPoly" "dwdRadial" "earth" "elm"
[46] "enet" "evtree" "extraTrees" "fda" "FH.GBML"
[51] "FIR.DM" "foba" "FRBCS.CHI" "FRBCS.W" "FS.HGD"
[56] "gam" "gamboost" "gamLoess" "gamSpline" "gaussprLinear"
[61] "gaussprPoly" "gaussprRadial" "gbm_h2o" "gbm" "gcvEarth"
[66] "GFS.FR.MOGUL" "GFS.LT.RS" "GFS.THRIFT" "glm.nb" "glm"
[71] "glmboost" "glmnet_h2o" "glmnet" "glmStepAIC" "gpls"
[76] "hda" "hdda" "hdrda" "HYFIS" "icr"
[81] "J48" "JRip" "kernelpls" "kknn" "knn"
[86] "krlsPoly" "krlsRadial" "lars" "lars2" "lasso"
[91] "lda" "lda2" "leapBackward" "leapForward" "leapSeq"
[96] "Linda" "lm" "lmStepAIC" "LMT" "loclda"
[101] "logicBag" "LogitBoost" "logreg" "lssvmLinear" "lssvmPoly"
[106] "lssvmRadial" "lvq" "M5" "M5Rules" "manb"
[111] "mda" "Mlda" "mlp" "mlpKerasDecay" "mlpKerasDecayCost"
[116] "mlpKerasDropout" "mlpKerasDropoutCost" "mlpML" "mlpSGD" "mlpWeightDecay"
[121] "mlpWeightDecayML" "monmlp" "msaenet" "multinom" "mxnet"
[126] "mxnetAdam" "naive_bayes" "nb" "nbDiscrete" "nbSearch"
[131] "neuralnet" "nnet" "nnls" "nodeHarvest" "null"
[136] "OneR" "ordinalNet" "ordinalRF" "ORFlog" "ORFpls"
[141] "ORFridge" "ORFsvm" "ownn" "pam" "parRF"
[146] "PART" "partDSA" "pcaNNet" "pcr" "pda"
[151] "pda2" "penalized" "PenalizedLDA" "plr" "pls"
[156] "plsRglm" "polr" "ppr" "pre" "PRIM"
[161] "protoclass" "qda" "QdaCov" "qrf" "qrnn"
[166] "randomGLM" "ranger" "rbf" "rbfDDA" "Rborist"
[171] "rda" "regLogistic" "relaxo" "rf" "rFerns"
[176] "RFlda" "rfRules" "ridge" "rlda" "rlm"
[181] "rmda" "rocc" "rotationForest" "rotationForestCp" "rpart"
[186] "rpart1SE" "rpart2" "rpartCost" "rpartScore" "rqlasso"
[191] "rqnc" "RRF" "RRFglobal" "rrlda" "RSimca"
[196] "rvmLinear" "rvmPoly" "rvmRadial" "SBC" "sda"
[201] "sdwd" "simpls" "SLAVE" "slda" "smda"
[206] "snn" "sparseLDA" "spikeslab" "spls" "stepLDA"
[211] "stepQDA" "superpc" "svmBoundrangeString" "svmExpoString" "svmLinear"
[216] "svmLinear2" "svmLinear3" "svmLinearWeights" "svmLinearWeights2" "svmPoly"
[221] "svmRadial" "svmRadialCost" "svmRadialSigma" "svmRadialWeights" "svmSpectrumString"
[226] "tan" "tanSearch" "treebag" "vbmpRadial" "vglmAdjCat"
[231] "vglmContRatio" "vglmCumulative" "widekernelpls" "WM" "wsrf"
[236] "xgbDART" "xgbLinear" "xgbTree" "xyf"
# Simulate a 50 x 5 matrix of standard-normal predictors and an alternating
# two-level class label. Each assignment needs its own line: two statements
# on one line without a separator do not parse.
x <- matrix(rnorm(50 * 5), ncol = 5)
y <- factor(rep(c("A", "B"), 25))
# Draw per-class box plots of every column of x.
featurePlot(x, y, plot = "box")
График показывает, что распределение значений переменных V1-V5 в категориях A и B очень схожее. Медианы, размах значений и средние показатели почти не отличаются, что говорит об отсутствии значительных различий между группами.
Единственное заметное отличие — наличие выбросов в некоторых переменных (особенно V1 и V2 в группе B). Это может указывать на наличие отдельных аномальных значений, но в целом группы выглядят похожими.
С использованием функций из пакета FSelector [2] определить важность признаков для решения задачи классификации. Использовать набор data(iris). Сделать выводы.
# Attach the FSelector package.
library(FSelector)
# Load the iris data set and auto-print its first rows.
data(iris)
head(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
# Score the predictors against Species with gain.ratio(); the formula
# `Species ~ .` uses every other iris column as a predictor.
importance_gain_ratio <- gain.ratio(Species ~ ., data = iris)
# Auto-print the resulting importance scores.
importance_gain_ratio
Анализ важности признаков показывает, что наиболее значимыми являются Petal.Length (0.86) и Petal.Width (0.87), в то время как Sepal.Length (0.42) и Sepal.Width (0.25) оказывают меньшее влияние.
# Install arules only when it is missing, then attach it for discretize().
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules")
}
library(arules)

# Split Sepal.Length into four bins with each discretization strategy.
iris_interval <- discretize(
  iris$Sepal.Length,
  method = "interval",
  breaks = 4
)
iris_frequency <- discretize(
  iris$Sepal.Length,
  method = "frequency",
  breaks = 4
)
iris_cluster <- discretize(
  iris$Sepal.Length,
  method = "cluster",
  breaks = 4
)
# method = "fixed" needs numeric cut points in `breaks`; the bin names belong
# in `labels`. Passing the names through the deprecated `categories` argument
# turns every value into NA.
iris_fixed <- discretize(
  iris$Sepal.Length,
  method = "fixed",
  breaks = c(0, 4, 6, 8, 10),
  labels = c("0-4", "4-6", "6-8", "8-10")
)
> print(iris_interval)
[1] [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [5.2,6.1) [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [5.2,6.1) [4.3,5.2) [4.3,5.2) [4.3,5.2) [5.2,6.1) [5.2,6.1) [5.2,6.1) [4.3,5.2) [5.2,6.1) [4.3,5.2) [21] [5.2,6.1) [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [5.2,6.1) [5.2,6.1) [4.3,5.2) [4.3,5.2) [5.2,6.1) [5.2,6.1) [5.2,6.1) [4.3,5.2) [4.3,5.2) [5.2,6.1) [4.3,5.2) [4.3,5.2) [4.3,5.2) [41] [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [4.3,5.2) [5.2,6.1) [4.3,5.2) [7,7.9] [6.1,7) [6.1,7) [5.2,6.1) [6.1,7) [5.2,6.1) [6.1,7) [4.3,5.2) [6.1,7) [5.2,6.1) [61] [4.3,5.2) [5.2,6.1) [5.2,6.1) [6.1,7) [5.2,6.1) [6.1,7) [5.2,6.1) [5.2,6.1) [6.1,7) [5.2,6.1) [5.2,6.1) [6.1,7) [6.1,7) [6.1,7) [6.1,7) [6.1,7) [6.1,7) [6.1,7) [5.2,6.1) [5.2,6.1) [81] [5.2,6.1) [5.2,6.1) [5.2,6.1) [5.2,6.1) [5.2,6.1) [5.2,6.1) [6.1,7) [6.1,7) [5.2,6.1) [5.2,6.1) [5.2,6.1) [6.1,7) [5.2,6.1) [4.3,5.2) [5.2,6.1) [5.2,6.1) [5.2,6.1) [6.1,7) [4.3,5.2) [5.2,6.1) [101] [6.1,7) [5.2,6.1) [7,7.9] [6.1,7) [6.1,7) [7,7.9] [4.3,5.2) [7,7.9] [6.1,7) [7,7.9] [6.1,7) [6.1,7) [6.1,7) [5.2,6.1) [5.2,6.1) [6.1,7) [6.1,7) [7,7.9] [7,7.9] [5.2,6.1) [121] [6.1,7) [5.2,6.1) [7,7.9] [6.1,7) [6.1,7) [7,7.9] [6.1,7) [6.1,7) [6.1,7) [7,7.9] [7,7.9] [7,7.9] [6.1,7) [6.1,7) [6.1,7) [7,7.9] [6.1,7) [6.1,7) [5.2,6.1) [6.1,7) [141] [6.1,7) [6.1,7) [5.2,6.1) [6.1,7) [6.1,7) [6.1,7) [6.1,7) [6.1,7) [6.1,7) [5.2,6.1)
attr(,"discretized:breaks") [1] 4.3 5.2 6.1 7.0 7.9
attr(,"discretized:method") [1]
interval Levels: [4.3,5.2) [5.2,6.1) [6.1,7) [7,7.9]
print(iris_frequency) [1] [5.1,5.8) [4.3,5.1) [4.3,5.1) [4.3,5.1) [4.3,5.1) [5.1,5.8) [4.3,5.1) [4.3,5.1) [4.3,5.1) [4.3,5.1) [5.1,5.8) [4.3,5.1) [4.3,5.1) [4.3,5.1) [5.8,6.4) [5.1,5.8) [5.1,5.8) [5.1,5.8) [5.1,5.8) [5.1,5.8) [21] [5.1,5.8) [5.1,5.8) [4.3,5.1) [5.1,5.8) [4.3,5.1) [4.3,5.1) [4.3,5.1) [5.1,5.8) [5.1,5.8) [4.3,5.1) [4.3,5.1) [5.1,5.8) [5.1,5.8) [5.1,5.8) [4.3,5.1) [4.3,5.1) [5.1,5.8) [4.3,5.1) [4.3,5.1) [5.1,5.8) [41] [4.3,5.1) [4.3,5.1) [4.3,5.1) [4.3,5.1) [5.1,5.8) [4.3,5.1) [5.1,5.8) [4.3,5.1) [5.1,5.8) [4.3,5.1) [6.4,7.9] [6.4,7.9] [6.4,7.9] [5.1,5.8) [6.4,7.9] [5.1,5.8) [5.8,6.4) [4.3,5.1) [6.4,7.9] [5.1,5.8) [61] [4.3,5.1) [5.8,6.4) [5.8,6.4) [5.8,6.4) [5.1,5.8) [6.4,7.9] [5.1,5.8) [5.8,6.4) [5.8,6.4) [5.1,5.8) [5.8,6.4) [5.8,6.4) [5.8,6.4) [5.8,6.4) [6.4,7.9] [6.4,7.9] [6.4,7.9] [6.4,7.9] [5.8,6.4) [5.1,5.8) [81] [5.1,5.8) [5.1,5.8) [5.8,6.4) [5.8,6.4) [5.1,5.8) [5.8,6.4) [6.4,7.9] [5.8,6.4) [5.1,5.8) [5.1,5.8) [5.1,5.8) [5.8,6.4) [5.8,6.4) [4.3,5.1) [5.1,5.8) [5.1,5.8) [5.1,5.8) [5.8,6.4) [5.1,5.8) [5.1,5.8) [101] [5.8,6.4) [5.8,6.4) [6.4,7.9] [5.8,6.4) [6.4,7.9] [6.4,7.9] [4.3,5.1) [6.4,7.9] [6.4,7.9] [6.4,7.9] [6.4,7.9] [6.4,7.9] [6.4,7.9] [5.1,5.8) [5.8,6.4) [6.4,7.9] [6.4,7.9] [6.4,7.9] [6.4,7.9] [5.8,6.4) [121] [6.4,7.9] [5.1,5.8) [6.4,7.9] [5.8,6.4) [6.4,7.9] [6.4,7.9] [5.8,6.4) [5.8,6.4) [6.4,7.9] [6.4,7.9] [6.4,7.9] [6.4,7.9] [6.4,7.9] [5.8,6.4) [5.8,6.4) [6.4,7.9] [5.8,6.4) [6.4,7.9] [5.8,6.4) [6.4,7.9] [141] [6.4,7.9] [6.4,7.9] [5.8,6.4) [6.4,7.9] [6.4,7.9] [6.4,7.9] [5.8,6.4) [6.4,7.9] [5.8,6.4) [5.8,6.4)
attr(,"discretized:breaks") [1] 4.3 5.1 5.8 6.4 7.9
attr(,"discretized:method") [1]
frequency Levels: [4.3,5.1) [5.1,5.8) [5.8,6.4) [6.4,7.9]
print(iris_cluster) [1] [4.89,5.59) [4.89,5.59) [4.3,4.89) [4.3,4.89) [4.89,5.59) [4.89,5.59) [4.3,4.89) [4.89,5.59) [4.3,4.89) [4.89,5.59) [4.89,5.59) [4.3,4.89) [4.3,4.89) [4.3,4.89) [5.59,6.49) [5.59,6.49) [17] [4.89,5.59) [4.89,5.59) [5.59,6.49) [4.89,5.59) [4.89,5.59) [4.89,5.59) [4.3,4.89) [4.89,5.59) [4.3,4.89) [4.89,5.59) [4.89,5.59) [4.89,5.59) [4.89,5.59) [4.3,4.89) [4.3,4.89) [4.89,5.59) [33] [4.89,5.59) [4.89,5.59) [4.89,5.59) [4.89,5.59) [4.89,5.59) [4.89,5.59) [4.3,4.89) [4.89,5.59) [4.89,5.59) [4.3,4.89) [4.3,4.89) [4.89,5.59) [4.89,5.59) [4.3,4.89) [4.89,5.59) [4.3,4.89) [49] [4.89,5.59) [4.89,5.59) [6.49,7.9] [5.59,6.49) [6.49,7.9] [4.89,5.59) [6.49,7.9] [5.59,6.49) [5.59,6.49) [4.89,5.59) [6.49,7.9] [4.89,5.59) [4.89,5.59) [5.59,6.49) [5.59,6.49) [5.59,6.49) [65] [5.59,6.49) [6.49,7.9] [5.59,6.49) [5.59,6.49) [5.59,6.49) [5.59,6.49) [5.59,6.49) [5.59,6.49) [5.59,6.49) [5.59,6.49) [5.59,6.49) [6.49,7.9] [6.49,7.9] [6.49,7.9] [5.59,6.49) [5.59,6.49) [81] [4.89,5.59) [4.89,5.59) [5.59,6.49) [5.59,6.49) [4.89,5.59) [5.59,6.49) [6.49,7.9] [5.59,6.49) [5.59,6.49) [4.89,5.59) [4.89,5.59) [5.59,6.49) [5.59,6.49) [4.89,5.59) [5.59,6.49) [5.59,6.49) [97] [5.59,6.49) [5.59,6.49) [4.89,5.59) [5.59,6.49) [5.59,6.49) [5.59,6.49) [6.49,7.9] [5.59,6.49) [6.49,7.9] [6.49,7.9] [4.89,5.59) [6.49,7.9] [6.49,7.9] [6.49,7.9] [6.49,7.9] [5.59,6.49) [113] [6.49,7.9] [5.59,6.49) [5.59,6.49) [5.59,6.49) [6.49,7.9] [6.49,7.9] [6.49,7.9] [5.59,6.49) [6.49,7.9] [5.59,6.49) [6.49,7.9] [5.59,6.49) [6.49,7.9] [6.49,7.9] [5.59,6.49) [5.59,6.49) [129] [5.59,6.49) [6.49,7.9] [6.49,7.9] [6.49,7.9] [5.59,6.49) [5.59,6.49) [5.59,6.49) [6.49,7.9] [5.59,6.49) [5.59,6.49) [5.59,6.49) [6.49,7.9] [6.49,7.9] [6.49,7.9] [5.59,6.49) [6.49,7.9] [145] [6.49,7.9] [6.49,7.9] [5.59,6.49) [6.49,7.9] [5.59,6.49) [5.59,6.49)
attr(,"discretized:breaks") [1] 4.300000 4.891134 5.588455 6.489286 7.900000
attr(,"discretized:method") [1]
cluster Levels: [4.3,4.89) [4.89,5.59) [5.59,6.49) [6.49,7.9]
print(iris_fixed) [1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> [41] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> [81] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> [121] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
attr(,"discretized:breaks") [1] 0-4 4-6 6-8 8-10
attr(,"discretized:method") [1]
fixed Levels: [,)
Метод ‘interval’ разбивает данные на равные по ширине интервалы, что удобно, если важно сохранить одинаковый шаг, но такой подход не всегда соответствует реальному распределению данных. ‘Frequency’ формирует интервалы так, чтобы в каждом было одинаковое количество объектов, что лучше показывает плотность данных в разных частях выборки. ‘Cluster’ использует алгоритмы кластеризации для определения границ, что делает его полезным при наличии естественных групп в данных. ‘Fixed’ позволяет заранее задать границы интервалов, что удобно, если разбиение должно соответствовать определённым критериям. Выбор метода зависит от задачи: нужна ли равномерность интервалов, одинаковая плотность объектов, учёт кластеров или строго фиксированные границы.
# Install the packages only when they are missing. Package names are
# case-sensitive: the package is "Boruta", not "boruta".
if (!requireNamespace("Boruta", quietly = TRUE)) {
  install.packages("Boruta")
}
if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench")
}
library(Boruta)

# Load the Ozone data set from mlbench and drop rows with missing values.
data("Ozone", package = "mlbench")
Ozone <- na.omit(Ozone)

# Run Boruta feature selection with V4 as the response and all other columns
# as candidate predictors; doTrace = 2 reports progress on each iteration.
boruta_result <- Boruta(V4 ~ ., data = Ozone, doTrace = 2)
1. run of importance source...
2. run of importance source...
3. run of importance source...
4. run of importance source...
5. run of importance source...
6. run of importance source...
7. run of importance source...
8. run of importance source...
9. run of importance source...
10. run of importance source...
11. run of importance source... After 11 iterations, +2 secs:
confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more;
rejected 2 attributes: V2, V3; still have 1 attribute left.
run of importance source...run of importance source...run of importance source...run of importance source...run of importance source...run of importance source...run of importance source...run of importance source...run of importance source...run of importance source... After 21 iterations, +2.7 secs: rejected 1 attribute: V6; no more attributes left.
# Ozone_sel is not defined in the visible script, so boxplot() would stop with
# "object 'Ozone_sel' not found".
# NOTE(review): assumed to be the Ozone columns that Boruta confirmed as
# important - confirm against the intended analysis.
Ozone_sel <- Ozone[, getSelectedAttributes(boruta_result), drop = FALSE]
# Compare the distributions of the selected features side by side.
boxplot(Ozone_sel, main = "Features Boxplot")
График boxplot показывает, что при классификации данных в R с помощью пакета Boruta переменные V1 и V10 имеют значительно большие значения, что может повлиять на анализ. Также в этих переменных, а также в V13, наблюдаются выбросы. Остальные признаки имеют более равномерное распределение. Для корректного анализа стоит нормализовать данные и проверить выбросы, чтобы избежать искажений в модели.