# Install caret from an explicit CRAN mirror, as the other install.packages()
# calls in this script do. Without `repos`, a non-interactive session that has
# no mirror configured cannot choose one and the installation fails.
install.packages("caret", repos = "http://cran.us.r-project.org")
# пакет ‘caret’ успешно распакован, MD5-суммы проверены
# Скачанные бинарные пакеты находятся в
#C:\Users\pkele\AppData\Local\Temp\Rtmpa4W4tr\downloaded_packages
# Attach caret to the search path.
library("caret")
# Loading required package: ggplot2
# Loading required package: lattice
# Show the names of the entries returned by getModelInfo(); the recorded
# output that follows lists them.
names(caret::getModelInfo())
[1] "ada" "AdaBag" "AdaBoost.M1" "adaboost"
[5] "amdai" "ANFIS" "avNNet" "awnb"
[13] "bagFDA" "bagFDAGCV" "bam" "bartMachine"
[17] "bayesglm" "binda" "blackboost" "blasso"
[21] "blassoAveraged" "bridge" "brnn" "BstLm"
[25] "bstSm" "bstTree" "C5.0" "C5.0Cost"
[29] "C5.0Rules" "C5.0Tree" "cforest" "chaid"
[33] "CSimca" "ctree" "ctree2" "cubist"
[37] "dda" "deepboost" "DENFIS" "dnn"
[41] "dwdLinear" "dwdPoly" "dwdRadial" "earth"
[45] "elm" "enet" "evtree" "extraTrees"
[49] "fda" "FH.GBML" "FIR.DM" "foba"
[53] "FRBCS.CHI" "FRBCS.W" "FS.HGD" "gam"
[57] "gamboost" "gamLoess" "gamSpline" "gaussprLinear"
[61] "gaussprPoly" "gaussprRadial" "gbm_h2o" "gbm"
[65] "gcvEarth" "GFS.FR.MOGUL" "GFS.LT.RS" "GFS.THRIFT"
[69] "glm.nb" "glm" "glmboost" "glmnet_h2o"
[73] "glmnet" "glmStepAIC" "gpls" "hda"
[77] "hdda" "hdrda" "HYFIS" "icr"
[81] "J48" "JRip" "kernelpls" "kknn"
[85] "knn" "krlsPoly" "krlsRadial" "lars"
[89] "lars2" "lasso" "lda" "lda2"
[93] "leapBackward" "leapForward" "leapSeq" "Linda"
[97] "lm" "lmStepAIC" "LMT" "loclda"
[101] "logicBag" "LogitBoost" "logreg" "lssvmLinear"
[105] "lssvmPoly" "lssvmRadial" "lvq" "M5"
[109] "M5Rules" "manb" "mda" "Mlda"
[113] "mlp" "mlpKerasDecay" "mlpKerasDecayCost" "mlpKerasDropout"
[117] "mlpKerasDropoutCost" "mlpML" "mlpSGD" "mlpWeightDecay"
[121] "mlpWeightDecayML" "monmlp" "msaenet" "multinom"
[125] "mxnet" "mxnetAdam" "naive_bayes" "nb"
[129] "nbDiscrete" "nbSearch" "neuralnet" "nnet"
[133] "nnls" "nodeHarvest" "null" "OneR"
[137] "ordinalNet" "ordinalRF" "ORFlog" "ORFpls"
[141] "ORFridge" "ORFsvm" "ownn" "pam"
[145] "parRF" "PART" "partDSA" "pcaNNet"
[149] "pcr" "pda" "pda2" "penalized"
[153] "PenalizedLDA" "plr" "pls" "plsRglm"
[157] "polr" "ppr" "pre" "PRIM"
[161] "protoclass" "qda" "QdaCov" "qrf"
[165] "qrnn" "randomGLM" "ranger" "rbf"
[169] "rbfDDA" "Rborist" "rda" "regLogistic"
[173] "relaxo" "rf" "rFerns" "RFlda"
[177] "rfRules" "ridge" "rlda" "rlm"
[181] "rmda" "rocc" "rotationForest" "rotationForestCp"
[185] "rpart" "rpart1SE" "rpart2" "rpartCost"
[189] "rpartScore" "rqlasso" "rqnc" "RRF"
[193] "RRFglobal" "rrlda" "RSimca" "rvmLinear"
[197] "rvmPoly" "rvmRadial" "SBC" "sda"
[201] "sdwd" "simpls" "SLAVE" "slda"
[205] "smda" "snn" "sparseLDA" "spikeslab"
[209] "spls" "stepLDA" "stepQDA" "superpc"
[213] "svmBoundrangeString" "svmExpoString" "svmLinear" "svmLinear2"
[217] "svmLinear3" "svmLinearWeights" "svmLinearWeights2" "svmPoly"
[221] "svmRadial" "svmRadialCost" "svmRadialSigma" "svmRadialWeights"
[225] "svmSpectrumString" "tan" "tanSearch" "treebag"
[229] "vbmpRadial" "vglmAdjCat" "vglmContRatio" "vglmCumulative"
[233] "widekernelpls" "WM" "wsrf" "xgbDART"
[237] "xgbLinear" "xgbTree" "xyf"
# Выполним графический разведочный анализ данных с использованием
# функции featurePlot() для набора данных x:
# `x` and `y` were never defined in this script, so the call below could not
# run on its own.
# NOTE(review): the synthetic predictors/classes here follow the example from
# caret's featurePlot() help page -- replace them with the intended data if
# the assignment specifies a different `x` and `y`.
set.seed(123)  # make the simulated predictors reproducible
x <- matrix(rnorm(50 * 5), ncol = 5)
y <- factor(rep(c("A", "B"), 25))

# Draw per-class density curves for every predictor column of `x`.
featurePlot(x, y, plot = "density")
#
# Задание 2. Установим пакет FSelector и загрузим набор данных iris:
# Install FSelector from the given CRAN mirror; the recorded console output
# below shows the additional packages that were unpacked along with it.
install.packages(
  pkgs = "FSelector",
  repos = "http://cran.us.r-project.org"
)
# package 'RWekajars' successfully unpacked and MD5 sums checked
# package 'rJava' successfully unpacked and MD5 sums checked
# package 'entropy' successfully unpacked and MD5 sums checked
# package 'randomForest' successfully unpacked and MD5 sums checked
# package 'RWeka' successfully unpacked and MD5 sums checked
# package 'FSelector' successfully unpacked and MD5 sums checked
# The downloaded binary packages are in
# C:\Users\pkele\AppData\Local\Temp\Rtmpa4W4tr\downloaded_packages

# Attach FSelector; the Java-home warning recorded below was emitted on load.
library("FSelector")
# java.home option:
# JAVA_HOME environment variable: C:\Program Files\Java\jdk-1.8
# Warning in fun(libname, pkgname): Java home setting is INVALID, it will be ignored.
# Please do NOT set it unless you want to override system settings.

# Load the iris data set into the workspace.
data("iris")
# Для определения важности признаков при решении задачи классификации воспользуемся функцией information.gain():
# Score every predictor of Species with information.gain() and print the
# resulting importance table.
gain <- information.gain(
  formula = Species ~ .,
  data = iris
)
print(gain)
# Recorded output:
# attr_importance
# Sepal.Length 0.4521286
# Sepal.Width 0.2672750
# Petal.Length 0.9402853
# Petal.Width 0.9554360
# Результат выполнения этой команды показывает, какой признак даёт наибольший информационный выигрыш при решении задачи классификации.
# Задание 3. Установим пакет arules:
# Install arules from the given CRAN mirror.
install.packages(
  pkgs = "arules",
  repos = "http://cran.us.r-project.org"
)
# Installing package into 'C:/Users/artur/AppData/Local/R/win-library/4.3'
# (as 'lib' is unspecified)
# package 'arules' successfully unpacked and MD5 sums checked
# The downloaded binary packages are in
# C:\Users\pkele\AppData\Local\Temp\Rtmpa4W4tr\downloaded_packages

# Attach arules; per the recorded messages it also loads Matrix and masks
# abbreviate() and write() from base.
library("arules")
# Loading required package: Matrix
# Attaching package: 'arules'
# The following objects are masked from 'package:base':
# abbreviate, write

# Load the iris data set into the workspace.
data("iris")
# Преобразуем непрерывную переменную в категориальную различными методами:
# Discretize Sepal.Length into five bins with three data-driven strategies:
# equal-width ("interval"), equal-frequency ("frequency") and cluster-based
# ("cluster") cut points.
iris_interval <- discretize(iris$Sepal.Length, method = "interval", breaks = 5)
iris_frequency <- discretize(iris$Sepal.Length, method = "frequency", breaks = 5)
iris_cluster <- discretize(iris$Sepal.Length, method = "cluster", breaks = 5)

# Fixed, user-chosen bins. With method = "fixed" the numeric cut points go in
# `breaks` and the bin names go in `labels`.
# The earlier version passed the bin names through the deprecated `categories`
# argument; arules then used them as breaks, as.double() coerced them to NA
# (see the warnings it raised), and every observation came out as <NA>.
# Any previously recorded all-NA output of print(iris_fixed) stems from that
# call and should be regenerated.
iris_fixed <- discretize(
  iris$Sepal.Length,
  method = "fixed",
  breaks = c(0, 4, 6, 8, 10),
  labels = c("0-4", "4-6", "6-8", "8-10")
)
# Print the "interval" discretization of Sepal.Length (factor of bin labels
# plus the break points and method stored as attributes).
print(iris_interval)
[1] [5.02,5.74) [4.3,5.02) [4.3,5.02)
[4] [4.3,5.02) [4.3,5.02) [5.02,5.74)
[7] [4.3,5.02) [4.3,5.02) [4.3,5.02)
[10] [4.3,5.02) [5.02,5.74) [4.3,5.02)
[13] [4.3,5.02) [4.3,5.02) [5.74,6.46)
[16] [5.02,5.74) [5.02,5.74) [5.02,5.74)
[19] [5.02,5.74) [5.02,5.74) [5.02,5.74)
[22] [5.02,5.74) [4.3,5.02) [5.02,5.74)
[25] [4.3,5.02) [4.3,5.02) [4.3,5.02)
[28] [5.02,5.74) [5.02,5.74) [4.3,5.02)
[31] [4.3,5.02) [5.02,5.74) [5.02,5.74)
[34] [5.02,5.74) [4.3,5.02) [4.3,5.02)
[37] [5.02,5.74) [4.3,5.02) [4.3,5.02)
[40] [5.02,5.74) [4.3,5.02) [4.3,5.02)
[43] [4.3,5.02) [4.3,5.02) [5.02,5.74)
[46] [4.3,5.02) [5.02,5.74) [4.3,5.02)
[49] [5.02,5.74) [4.3,5.02) [6.46,7.18)
[52] [5.74,6.46) [6.46,7.18) [5.02,5.74)
[55] [6.46,7.18) [5.02,5.74) [5.74,6.46)
[58] [4.3,5.02) [6.46,7.18) [5.02,5.74)
[61] [4.3,5.02) [5.74,6.46) [5.74,6.46)
[64] [5.74,6.46) [5.02,5.74) [6.46,7.18)
[67] [5.02,5.74) [5.74,6.46) [5.74,6.46)
[70] [5.02,5.74) [5.74,6.46) [5.74,6.46)
[73] [5.74,6.46) [5.74,6.46) [5.74,6.46)
[76] [6.46,7.18) [6.46,7.18) [6.46,7.18)
[79] [5.74,6.46) [5.02,5.74) [5.02,5.74)
[82] [5.02,5.74) [5.74,6.46) [5.74,6.46)
[85] [5.02,5.74) [5.74,6.46) [6.46,7.18)
[88] [5.74,6.46) [5.02,5.74) [5.02,5.74)
[91] [5.02,5.74) [5.74,6.46) [5.74,6.46)
[94] [4.3,5.02) [5.02,5.74) [5.02,5.74)
[97] [5.02,5.74) [5.74,6.46) [5.02,5.74)
[100] [5.02,5.74) [5.74,6.46) [5.74,6.46)
[103] [6.46,7.18) [5.74,6.46) [6.46,7.18)
[106] [7.18,7.9] [4.3,5.02) [7.18,7.9]
[109] [6.46,7.18) [7.18,7.9] [6.46,7.18)
[112] [5.74,6.46) [6.46,7.18) [5.02,5.74)
[115] [5.74,6.46) [5.74,6.46) [6.46,7.18)
[118] [7.18,7.9] [7.18,7.9] [5.74,6.46)
[121] [6.46,7.18) [5.02,5.74) [7.18,7.9]
[124] [5.74,6.46) [6.46,7.18) [7.18,7.9]
[127] [5.74,6.46) [5.74,6.46) [5.74,6.46)
[130] [7.18,7.9] [7.18,7.9] [7.18,7.9]
[133] [5.74,6.46) [5.74,6.46) [5.74,6.46)
[136] [7.18,7.9] [5.74,6.46) [5.74,6.46)
[139] [5.74,6.46) [6.46,7.18) [6.46,7.18)
[142] [6.46,7.18) [5.74,6.46) [6.46,7.18)
[145] [6.46,7.18) [6.46,7.18) [5.74,6.46)
[148] [6.46,7.18) [5.74,6.46) [5.74,6.46)
attr(,"discretized:breaks")
[1] 4.30 5.02 5.74 6.46 7.18 7.90
attr(,"discretized:method")
[1] interval
5 Levels: [4.3,5.02) [5.02,5.74) ... [7.18,7.9]
# Print the "frequency" discretization of Sepal.Length (factor of bin labels
# plus the break points and method stored as attributes).
print(iris_frequency)
[1] [5,5.6) [4.3,5) [4.3,5) [4.3,5)
[5] [5,5.6) [5,5.6) [4.3,5) [5,5.6)
[9] [4.3,5) [4.3,5) [5,5.6) [4.3,5)
[13] [4.3,5) [4.3,5) [5.6,6.1) [5.6,6.1)
[17] [5,5.6) [5,5.6) [5.6,6.1) [5,5.6)
[21] [5,5.6) [5,5.6) [4.3,5) [5,5.6)
[25] [4.3,5) [5,5.6) [5,5.6) [5,5.6)
[29] [5,5.6) [4.3,5) [4.3,5) [5,5.6)
[33] [5,5.6) [5,5.6) [4.3,5) [5,5.6)
[37] [5,5.6) [4.3,5) [4.3,5) [5,5.6)
[41] [5,5.6) [4.3,5) [4.3,5) [5,5.6)
[45] [5,5.6) [4.3,5) [5,5.6) [4.3,5)
[49] [5,5.6) [5,5.6) [6.52,7.9] [6.1,6.52)
[53] [6.52,7.9] [5,5.6) [6.1,6.52) [5.6,6.1)
[57] [6.1,6.52) [4.3,5) [6.52,7.9] [5,5.6)
[61] [5,5.6) [5.6,6.1) [5.6,6.1) [6.1,6.52)
[65] [5.6,6.1) [6.52,7.9] [5.6,6.1) [5.6,6.1)
[69] [6.1,6.52) [5.6,6.1) [5.6,6.1) [6.1,6.52)
[73] [6.1,6.52) [6.1,6.52) [6.1,6.52) [6.52,7.9]
[77] [6.52,7.9] [6.52,7.9] [5.6,6.1) [5.6,6.1)
[81] [5,5.6) [5,5.6) [5.6,6.1) [5.6,6.1)
[85] [5,5.6) [5.6,6.1) [6.52,7.9] [6.1,6.52)
[89] [5.6,6.1) [5,5.6) [5,5.6) [6.1,6.52)
[93] [5.6,6.1) [5,5.6) [5.6,6.1) [5.6,6.1)
[97] [5.6,6.1) [6.1,6.52) [5,5.6) [5.6,6.1)
[101] [6.1,6.52) [5.6,6.1) [6.52,7.9] [6.1,6.52)
[105] [6.1,6.52) [6.52,7.9] [4.3,5) [6.52,7.9]
[109] [6.52,7.9] [6.52,7.9] [6.1,6.52) [6.1,6.52)
[113] [6.52,7.9] [5.6,6.1) [5.6,6.1) [6.1,6.52)
[117] [6.1,6.52) [6.52,7.9] [6.52,7.9] [5.6,6.1)
[121] [6.52,7.9] [5.6,6.1) [6.52,7.9] [6.1,6.52)
[125] [6.52,7.9] [6.52,7.9] [6.1,6.52) [6.1,6.52)
[129] [6.1,6.52) [6.52,7.9] [6.52,7.9] [6.52,7.9]
[133] [6.1,6.52) [6.1,6.52) [6.1,6.52) [6.52,7.9]
[137] [6.1,6.52) [6.1,6.52) [5.6,6.1) [6.52,7.9]
[141] [6.52,7.9] [6.52,7.9] [5.6,6.1) [6.52,7.9]
[145] [6.52,7.9] [6.52,7.9] [6.1,6.52) [6.1,6.52)
[149] [6.1,6.52) [5.6,6.1)
attr(,"discretized:breaks")
[1] 4.30 5.00 5.60 6.10 6.52 7.90
attr(,"discretized:method")
[1] frequency
5 Levels: [4.3,5) [5,5.6) [5.6,6.1) ... [6.52,7.9]
# Print the "cluster" discretization of Sepal.Length (factor of bin labels
# plus the break points and method stored as attributes).
print(iris_cluster)
[1] [4.77,5.32) [4.77,5.32) [4.3,4.77)
[4] [4.3,4.77) [4.77,5.32) [5.32,5.95)
[7] [4.3,4.77) [4.77,5.32) [4.3,4.77)
[10] [4.77,5.32) [5.32,5.95) [4.77,5.32)
[13] [4.77,5.32) [4.3,4.77) [5.32,5.95)
[16] [5.32,5.95) [5.32,5.95) [4.77,5.32)
[19] [5.32,5.95) [4.77,5.32) [5.32,5.95)
[22] [4.77,5.32) [4.3,4.77) [4.77,5.32)
[25] [4.77,5.32) [4.77,5.32) [4.77,5.32)
[28] [4.77,5.32) [4.77,5.32) [4.3,4.77)
[31] [4.77,5.32) [5.32,5.95) [4.77,5.32)
[34] [5.32,5.95) [4.77,5.32) [4.77,5.32)
[37] [5.32,5.95) [4.77,5.32) [4.3,4.77)
[40] [4.77,5.32) [4.77,5.32) [4.3,4.77)
[43] [4.3,4.77) [4.77,5.32) [4.77,5.32)
[46] [4.77,5.32) [4.77,5.32) [4.3,4.77)
[49] [4.77,5.32) [4.77,5.32) [6.68,7.9]
[52] [5.95,6.68) [6.68,7.9] [5.32,5.95)
[55] [5.95,6.68) [5.32,5.95) [5.95,6.68)
[58] [4.77,5.32) [5.95,6.68) [4.77,5.32)
[61] [4.77,5.32) [5.32,5.95) [5.95,6.68)
[64] [5.95,6.68) [5.32,5.95) [6.68,7.9]
[67] [5.32,5.95) [5.32,5.95) [5.95,6.68)
[70] [5.32,5.95) [5.32,5.95) [5.95,6.68)
[73] [5.95,6.68) [5.95,6.68) [5.95,6.68)
[76] [5.95,6.68) [6.68,7.9] [6.68,7.9]
[79] [5.95,6.68) [5.32,5.95) [5.32,5.95)
[82] [5.32,5.95) [5.32,5.95) [5.95,6.68)
[85] [5.32,5.95) [5.95,6.68) [6.68,7.9]
[88] [5.95,6.68) [5.32,5.95) [5.32,5.95)
[91] [5.32,5.95) [5.95,6.68) [5.32,5.95)
[94] [4.77,5.32) [5.32,5.95) [5.32,5.95)
[97] [5.32,5.95) [5.95,6.68) [4.77,5.32)
[100] [5.32,5.95) [5.95,6.68) [5.32,5.95)
[103] [6.68,7.9] [5.95,6.68) [5.95,6.68)
[106] [6.68,7.9] [4.77,5.32) [6.68,7.9]
[109] [6.68,7.9] [6.68,7.9] [5.95,6.68)
[112] [5.95,6.68) [6.68,7.9] [5.32,5.95)
[115] [5.32,5.95) [5.95,6.68) [5.95,6.68)
[118] [6.68,7.9] [6.68,7.9] [5.95,6.68)
[121] [6.68,7.9] [5.32,5.95) [6.68,7.9]
[124] [5.95,6.68) [6.68,7.9] [6.68,7.9]
[127] [5.95,6.68) [5.95,6.68) [5.95,6.68)
[130] [6.68,7.9] [6.68,7.9] [6.68,7.9]
[133] [5.95,6.68) [5.95,6.68) [5.95,6.68)
[136] [6.68,7.9] [5.95,6.68) [5.95,6.68)
[139] [5.95,6.68) [6.68,7.9] [6.68,7.9]
[142] [6.68,7.9] [5.32,5.95) [6.68,7.9]
[145] [6.68,7.9] [6.68,7.9] [5.95,6.68)
[148] [5.95,6.68) [5.95,6.68) [5.32,5.95)
attr(,"discretized:breaks")
[1] 4.300000 4.769351 5.321931 5.952114 6.676969
[6] 7.900000
attr(,"discretized:method")
[1] cluster
5 Levels: [4.3,4.77) [4.77,5.32) ... [6.68,7.9]
# Print the "fixed" discretization of Sepal.Length.
# NOTE(review): the transcript recorded after this call contains only <NA>
# values; re-run and refresh it whenever the way iris_fixed is built changes.
print(iris_fixed)
[1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
[16] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
[31] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
[46] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
[61] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
[76] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
[91] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
[106] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
[121] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
[136] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
attr(,"discretized:breaks")
[1] 0-4 4-6 6-8 8-10
attr(,"discretized:method")
[1] fixed
Levels: [,)
# Установим пакеты Boruta и mlbench (содержит набор данных Ozone) и загрузим набор данных Ozone:
# Install Boruta from the given CRAN mirror and attach it.
install.packages(
  pkgs = "Boruta",
  repos = "http://cran.us.r-project.org"
)
# package 'RcppEigen' successfully unpacked and MD5 sums checked
# package 'ranger' successfully unpacked and MD5 sums checked
# package 'Boruta' successfully unpacked and MD5 sums checked
# The downloaded binary packages are in
# C:\Users\pkele\AppData\Local\Temp\Rtmpa4W4tr\downloaded_packages
library("Boruta")

# Install mlbench from the same mirror (it is only installed here, not
# attached).
install.packages(
  pkgs = "mlbench",
  repos = "http://cran.us.r-project.org"
)
# package 'mlbench' successfully unpacked and MD5 sums checked
# The downloaded binary packages are in
# C:\Users\pkele\AppData\Local\Temp\Rtmpa4W4tr\downloaded_packages
# Проведём отбор признаков для набора данных Ozone с помощью алгоритма Boruta:
# The script installs mlbench but never attaches it or loads its data, so
# `Ozone` would be undefined at this point. Load the data set explicitly from
# the package; this works without library(mlbench).
data("Ozone", package = "mlbench")

# Fix the RNG seed so the Boruta run is reproducible.
set.seed(123)

# Drop every row that contains a missing value before fitting.
Ozone <- na.omit(Ozone)

# Run Boruta feature selection with V4 as the response and all remaining
# columns as candidate predictors; doTrace = 2 prints per-iteration progress.
boruta_result <- Boruta(V4 ~ ., data = Ozone, doTrace = 2)
1. run of importance source...
2. run of importance source...
3. run of importance source...
4. run of importance source...
5. run of importance source...
6. run of importance source...
7. run of importance source...
8. run of importance source...
9. run of importance source...
10. run of importance source...
11. run of importance source...
After 11 iterations, +0.73 secs:
confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more;
rejected 2 attributes: V3, V6;
still have 1 attribute left.
12. run of importance source...
13. run of importance source...
14. run of importance source...
15. run of importance source...
16. run of importance source...
17. run of importance source...
18. run of importance source...
19. run of importance source...
20. run of importance source...
21. run of importance source...
22. run of importance source...
23. run of importance source...
24. run of importance source...
After 24 iterations, +1.7 secs:
rejected 1 attribute: V2;
no more attributes left.
# Print the Boruta summary: number of iterations, run time, and which
# attributes were confirmed important or unimportant.
print(boruta_result)
Boruta performed 24 iterations in 1.658221 secs.
9 attributes confirmed important: V1, V10,
V11, V12, V13 and 4 more;
3 attributes confirmed unimportant: V2, V3,
V6;
# Построим диаграмму размаха (boxplot) для выбранных признаков:
# Take the attributes Boruta selected, keep them together with the response
# column V4, and draw one box per retained column.
selected_features <- getSelectedAttributes(x = boruta_result)
Ozone_selected <- Ozone[, c(selected_features, "V4")]
boxplot(
  x = Ozone_selected,
  main = "Selected Features Boxplot"
)