Задание 1

Установить пакет CARET, выполнить команду names(getModelInfo()), ознакомиться со списком доступных методов выбора признаков. Выполнить графический разведочный анализ данных с использованием функции featurePlot() для набора данных из справочного файла пакета CARET:

# Install (only when missing) and attach the packages used in this task.
# Re-installing on every run is slow and needs network access, so packages
# that are already available are skipped; the HTTPS CRAN mirror is used
# for anything that still has to be downloaded.
for (pkg in c("ggplot2", "caret")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org")
  }
}
library(caret)
library(ggplot2)

Просмотр доступных моделей

# Print the names of all entries returned by caret's getModelInfo().
names(getModelInfo())
##   [1] "ada"                 "AdaBag"              "AdaBoost.M1"        
##   [4] "adaboost"            "amdai"               "ANFIS"              
##   [7] "avNNet"              "awnb"                "awtan"              
##  [10] "bag"                 "bagEarth"            "bagEarthGCV"        
##  [13] "bagFDA"              "bagFDAGCV"           "bam"                
##  [16] "bartMachine"         "bayesglm"            "binda"              
##  [19] "blackboost"          "blasso"              "blassoAveraged"     
##  [22] "bridge"              "brnn"                "BstLm"              
##  [25] "bstSm"               "bstTree"             "C5.0"               
##  [28] "C5.0Cost"            "C5.0Rules"           "C5.0Tree"           
##  [31] "cforest"             "chaid"               "CSimca"             
##  [34] "ctree"               "ctree2"              "cubist"             
##  [37] "dda"                 "deepboost"           "DENFIS"             
##  [40] "dnn"                 "dwdLinear"           "dwdPoly"            
##  [43] "dwdRadial"           "earth"               "elm"                
##  [46] "enet"                "evtree"              "extraTrees"         
##  [49] "fda"                 "FH.GBML"             "FIR.DM"             
##  [52] "foba"                "FRBCS.CHI"           "FRBCS.W"            
##  [55] "FS.HGD"              "gam"                 "gamboost"           
##  [58] "gamLoess"            "gamSpline"           "gaussprLinear"      
##  [61] "gaussprPoly"         "gaussprRadial"       "gbm_h2o"            
##  [64] "gbm"                 "gcvEarth"            "GFS.FR.MOGUL"       
##  [67] "GFS.LT.RS"           "GFS.THRIFT"          "glm.nb"             
##  [70] "glm"                 "glmboost"            "glmnet_h2o"         
##  [73] "glmnet"              "glmStepAIC"          "gpls"               
##  [76] "hda"                 "hdda"                "hdrda"              
##  [79] "HYFIS"               "icr"                 "J48"                
##  [82] "JRip"                "kernelpls"           "kknn"               
##  [85] "knn"                 "krlsPoly"            "krlsRadial"         
##  [88] "lars"                "lars2"               "lasso"              
##  [91] "lda"                 "lda2"                "leapBackward"       
##  [94] "leapForward"         "leapSeq"             "Linda"              
##  [97] "lm"                  "lmStepAIC"           "LMT"                
## [100] "loclda"              "logicBag"            "LogitBoost"         
## [103] "logreg"              "lssvmLinear"         "lssvmPoly"          
## [106] "lssvmRadial"         "lvq"                 "M5"                 
## [109] "M5Rules"             "manb"                "mda"                
## [112] "Mlda"                "mlp"                 "mlpKerasDecay"      
## [115] "mlpKerasDecayCost"   "mlpKerasDropout"     "mlpKerasDropoutCost"
## [118] "mlpML"               "mlpSGD"              "mlpWeightDecay"     
## [121] "mlpWeightDecayML"    "monmlp"              "msaenet"            
## [124] "multinom"            "mxnet"               "mxnetAdam"          
## [127] "naive_bayes"         "nb"                  "nbDiscrete"         
## [130] "nbSearch"            "neuralnet"           "nnet"               
## [133] "nnls"                "nodeHarvest"         "null"               
## [136] "OneR"                "ordinalNet"          "ordinalRF"          
## [139] "ORFlog"              "ORFpls"              "ORFridge"           
## [142] "ORFsvm"              "ownn"                "pam"                
## [145] "parRF"               "PART"                "partDSA"            
## [148] "pcaNNet"             "pcr"                 "pda"                
## [151] "pda2"                "penalized"           "PenalizedLDA"       
## [154] "plr"                 "pls"                 "plsRglm"            
## [157] "polr"                "ppr"                 "pre"                
## [160] "PRIM"                "protoclass"          "qda"                
## [163] "QdaCov"              "qrf"                 "qrnn"               
## [166] "randomGLM"           "ranger"              "rbf"                
## [169] "rbfDDA"              "Rborist"             "rda"                
## [172] "regLogistic"         "relaxo"              "rf"                 
## [175] "rFerns"              "RFlda"               "rfRules"            
## [178] "ridge"               "rlda"                "rlm"                
## [181] "rmda"                "rocc"                "rotationForest"     
## [184] "rotationForestCp"    "rpart"               "rpart1SE"           
## [187] "rpart2"              "rpartCost"           "rpartScore"         
## [190] "rqlasso"             "rqnc"                "RRF"                
## [193] "RRFglobal"           "rrlda"               "RSimca"             
## [196] "rvmLinear"           "rvmPoly"             "rvmRadial"          
## [199] "SBC"                 "sda"                 "sdwd"               
## [202] "simpls"              "SLAVE"               "slda"               
## [205] "smda"                "snn"                 "sparseLDA"          
## [208] "spikeslab"           "spls"                "stepLDA"            
## [211] "stepQDA"             "superpc"             "svmBoundrangeString"
## [214] "svmExpoString"       "svmLinear"           "svmLinear2"         
## [217] "svmLinear3"          "svmLinearWeights"    "svmLinearWeights2"  
## [220] "svmPoly"             "svmRadial"           "svmRadialCost"      
## [223] "svmRadialSigma"      "svmRadialWeights"    "svmSpectrumString"  
## [226] "tan"                 "tanSearch"           "treebag"            
## [229] "vbmpRadial"          "vglmAdjCat"          "vglmContRatio"      
## [232] "vglmCumulative"      "widekernelpls"       "WM"                 
## [235] "wsrf"                "xgbDART"             "xgbLinear"          
## [238] "xgbTree"             "xyf"

Генерация данных

# Fix the RNG state so the simulated sample is reproducible.
set.seed(123)
# 250 standard-normal draws arranged column-wise into 50 rows x 5 columns.
x <- matrix(rnorm(250), nrow = 50, ncol = 5)
# Alternating class labels "A", "B", ... for the 50 rows.
y <- factor(rep_len(c("A", "B"), length.out = 50))
# Preview the first rows of the feature matrix.
head(x)
##             [,1]        [,2]        [,3]       [,4]       [,5]
## [1,] -0.56047565  0.25331851 -0.71040656  0.7877388  2.1988103
## [2,] -0.23017749 -0.02854676  0.25688371  0.7690422  1.3124130
## [3,]  1.55870831 -0.04287046 -0.24669188  0.3322026 -0.2651451
## [4,]  0.07050839  1.36860228 -0.34754260 -1.0083766  0.5431941
## [5,]  0.12928774 -0.22577099 -0.95161857 -0.1194526 -0.4143399
## [6,]  1.71506499  1.51647060 -0.04502772 -0.2803953 -0.4762469
# Preview the first elements of the class-label factor.
head(y)
## [1] A B A B A B
## Levels: A B

Графики разведочного анализа

# Per-feature density curves, one curve per class.
featurePlot(x = x, y = y, plot = "density")
Графики плотности

Графики плотности

# Per-feature box plots, split by class.
featurePlot(x = x, y = y, plot = "box")
Boxplot для каждого признака

Boxplot для каждого признака

# Per-feature strip plots, split by class.
featurePlot(x = x, y = y, plot = "strip")
Stripplot для каждого признака

Stripplot для каждого признака

# Scatter-plot matrix of all feature pairs, coloured by class.
featurePlot(x = x, y = y, plot = "pairs")
Диаграмма рассеяния (pairs plot)

Диаграмма рассеяния (pairs plot)

Вывод

Графики плотности показывают, что распределения всех признаков близки к нормальному (данные сгенерированы функцией rnorm()). Различия между классами A и B незначительны или отсутствуют: распределения всех признаков (V1–V5) сильно перекрываются, что ожидаемо, поскольку метки классов назначены независимо от значений признаков.

Задание 2

С использованием функций из пакета FSelector определить важность признаков для классификации, используя набор iris.

# Install FSelector only when it is missing (avoids a download on every run
# and uses the HTTPS CRAN mirror), then attach it.
if (!requireNamespace("FSelector", quietly = TRUE)) {
  install.packages("FSelector", repos = "https://cloud.r-project.org")
}
library(FSelector)

Оценка важности признаков

# Load a fresh copy of the built-in iris data set.
data("iris")
# Information gain of every predictor with respect to Species.
gain <- information.gain(formula = Species ~ ., data = iris)
# Show the importance table.
print(x = gain)
##              attr_importance
## Sepal.Length       0.4521286
## Sepal.Width        0.2672750
## Petal.Length       0.9402853
## Petal.Width        0.9554360

Вывод

Признаки Petal.Length и Petal.Width являются наиболее значимыми, так как содержат наибольшее количество информации о классе.

Задание 3

С использованием функции discretize() из пакета arules выполнить дискретизацию переменной Petal.Length различными методами.

# Install arules only when it is missing (avoids a download on every run
# and uses the HTTPS CRAN mirror), then attach it.
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules", repos = "https://cloud.r-project.org")
}
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write

Дискретизация

# Discretize Petal.Length with four strategies from arules::discretize().
# `breaks` is always passed by name: for "interval", "frequency" and "cluster"
# it is the number of bins, for "fixed" it is the vector of cut points.
iris$Petal.Length.interval <- discretize(
  iris$Petal.Length, method = "interval", breaks = 3
)
iris$Petal.Length.frequency <- discretize(
  iris$Petal.Length, method = "frequency", breaks = 3
)
# The "cluster" method runs k-means from random starting centres, so the seed
# is fixed to make the resulting cut points reproducible.
set.seed(123)
iris$Petal.Length.cluster <- discretize(
  iris$Petal.Length, method = "cluster", breaks = 3
)
iris$Petal.Length.fixed <- discretize(
  iris$Petal.Length, method = "fixed", breaks = c(0, 2, 4, 6, 8)
)
# Print the original values next to the four discretized versions.
print(iris[, c(
  "Petal.Length", "Petal.Length.interval", "Petal.Length.frequency",
  "Petal.Length.cluster", "Petal.Length.fixed"
)])
##     Petal.Length Petal.Length.interval Petal.Length.frequency
## 1            1.4              [1,2.97)               [1,2.63)
## 2            1.4              [1,2.97)               [1,2.63)
## 3            1.3              [1,2.97)               [1,2.63)
## 4            1.5              [1,2.97)               [1,2.63)
## 5            1.4              [1,2.97)               [1,2.63)
## 6            1.7              [1,2.97)               [1,2.63)
## 7            1.4              [1,2.97)               [1,2.63)
## 8            1.5              [1,2.97)               [1,2.63)
## 9            1.4              [1,2.97)               [1,2.63)
## 10           1.5              [1,2.97)               [1,2.63)
## 11           1.5              [1,2.97)               [1,2.63)
## 12           1.6              [1,2.97)               [1,2.63)
## 13           1.4              [1,2.97)               [1,2.63)
## 14           1.1              [1,2.97)               [1,2.63)
## 15           1.2              [1,2.97)               [1,2.63)
## 16           1.5              [1,2.97)               [1,2.63)
## 17           1.3              [1,2.97)               [1,2.63)
## 18           1.4              [1,2.97)               [1,2.63)
## 19           1.7              [1,2.97)               [1,2.63)
## 20           1.5              [1,2.97)               [1,2.63)
## 21           1.7              [1,2.97)               [1,2.63)
## 22           1.5              [1,2.97)               [1,2.63)
## 23           1.0              [1,2.97)               [1,2.63)
## 24           1.7              [1,2.97)               [1,2.63)
## 25           1.9              [1,2.97)               [1,2.63)
## 26           1.6              [1,2.97)               [1,2.63)
## 27           1.6              [1,2.97)               [1,2.63)
## 28           1.5              [1,2.97)               [1,2.63)
## 29           1.4              [1,2.97)               [1,2.63)
## 30           1.6              [1,2.97)               [1,2.63)
## 31           1.6              [1,2.97)               [1,2.63)
## 32           1.5              [1,2.97)               [1,2.63)
## 33           1.5              [1,2.97)               [1,2.63)
## 34           1.4              [1,2.97)               [1,2.63)
## 35           1.5              [1,2.97)               [1,2.63)
## 36           1.2              [1,2.97)               [1,2.63)
## 37           1.3              [1,2.97)               [1,2.63)
## 38           1.4              [1,2.97)               [1,2.63)
## 39           1.3              [1,2.97)               [1,2.63)
## 40           1.5              [1,2.97)               [1,2.63)
## 41           1.3              [1,2.97)               [1,2.63)
## 42           1.3              [1,2.97)               [1,2.63)
## 43           1.3              [1,2.97)               [1,2.63)
## 44           1.6              [1,2.97)               [1,2.63)
## 45           1.9              [1,2.97)               [1,2.63)
## 46           1.4              [1,2.97)               [1,2.63)
## 47           1.6              [1,2.97)               [1,2.63)
## 48           1.4              [1,2.97)               [1,2.63)
## 49           1.5              [1,2.97)               [1,2.63)
## 50           1.4              [1,2.97)               [1,2.63)
## 51           4.7           [2.97,4.93)             [2.63,4.9)
## 52           4.5           [2.97,4.93)             [2.63,4.9)
## 53           4.9           [2.97,4.93)              [4.9,6.9]
## 54           4.0           [2.97,4.93)             [2.63,4.9)
## 55           4.6           [2.97,4.93)             [2.63,4.9)
## 56           4.5           [2.97,4.93)             [2.63,4.9)
## 57           4.7           [2.97,4.93)             [2.63,4.9)
## 58           3.3           [2.97,4.93)             [2.63,4.9)
## 59           4.6           [2.97,4.93)             [2.63,4.9)
## 60           3.9           [2.97,4.93)             [2.63,4.9)
## 61           3.5           [2.97,4.93)             [2.63,4.9)
## 62           4.2           [2.97,4.93)             [2.63,4.9)
## 63           4.0           [2.97,4.93)             [2.63,4.9)
## 64           4.7           [2.97,4.93)             [2.63,4.9)
## 65           3.6           [2.97,4.93)             [2.63,4.9)
## 66           4.4           [2.97,4.93)             [2.63,4.9)
## 67           4.5           [2.97,4.93)             [2.63,4.9)
## 68           4.1           [2.97,4.93)             [2.63,4.9)
## 69           4.5           [2.97,4.93)             [2.63,4.9)
## 70           3.9           [2.97,4.93)             [2.63,4.9)
## 71           4.8           [2.97,4.93)             [2.63,4.9)
## 72           4.0           [2.97,4.93)             [2.63,4.9)
## 73           4.9           [2.97,4.93)              [4.9,6.9]
## 74           4.7           [2.97,4.93)             [2.63,4.9)
## 75           4.3           [2.97,4.93)             [2.63,4.9)
## 76           4.4           [2.97,4.93)             [2.63,4.9)
## 77           4.8           [2.97,4.93)             [2.63,4.9)
## 78           5.0            [4.93,6.9]              [4.9,6.9]
## 79           4.5           [2.97,4.93)             [2.63,4.9)
## 80           3.5           [2.97,4.93)             [2.63,4.9)
## 81           3.8           [2.97,4.93)             [2.63,4.9)
## 82           3.7           [2.97,4.93)             [2.63,4.9)
## 83           3.9           [2.97,4.93)             [2.63,4.9)
## 84           5.1            [4.93,6.9]              [4.9,6.9]
## 85           4.5           [2.97,4.93)             [2.63,4.9)
## 86           4.5           [2.97,4.93)             [2.63,4.9)
## 87           4.7           [2.97,4.93)             [2.63,4.9)
## 88           4.4           [2.97,4.93)             [2.63,4.9)
## 89           4.1           [2.97,4.93)             [2.63,4.9)
## 90           4.0           [2.97,4.93)             [2.63,4.9)
## 91           4.4           [2.97,4.93)             [2.63,4.9)
## 92           4.6           [2.97,4.93)             [2.63,4.9)
## 93           4.0           [2.97,4.93)             [2.63,4.9)
## 94           3.3           [2.97,4.93)             [2.63,4.9)
## 95           4.2           [2.97,4.93)             [2.63,4.9)
## 96           4.2           [2.97,4.93)             [2.63,4.9)
## 97           4.2           [2.97,4.93)             [2.63,4.9)
## 98           4.3           [2.97,4.93)             [2.63,4.9)
## 99           3.0           [2.97,4.93)             [2.63,4.9)
## 100          4.1           [2.97,4.93)             [2.63,4.9)
## 101          6.0            [4.93,6.9]              [4.9,6.9]
## 102          5.1            [4.93,6.9]              [4.9,6.9]
## 103          5.9            [4.93,6.9]              [4.9,6.9]
## 104          5.6            [4.93,6.9]              [4.9,6.9]
## 105          5.8            [4.93,6.9]              [4.9,6.9]
## 106          6.6            [4.93,6.9]              [4.9,6.9]
## 107          4.5           [2.97,4.93)             [2.63,4.9)
## 108          6.3            [4.93,6.9]              [4.9,6.9]
## 109          5.8            [4.93,6.9]              [4.9,6.9]
## 110          6.1            [4.93,6.9]              [4.9,6.9]
## 111          5.1            [4.93,6.9]              [4.9,6.9]
## 112          5.3            [4.93,6.9]              [4.9,6.9]
## 113          5.5            [4.93,6.9]              [4.9,6.9]
## 114          5.0            [4.93,6.9]              [4.9,6.9]
## 115          5.1            [4.93,6.9]              [4.9,6.9]
## 116          5.3            [4.93,6.9]              [4.9,6.9]
## 117          5.5            [4.93,6.9]              [4.9,6.9]
## 118          6.7            [4.93,6.9]              [4.9,6.9]
## 119          6.9            [4.93,6.9]              [4.9,6.9]
## 120          5.0            [4.93,6.9]              [4.9,6.9]
## 121          5.7            [4.93,6.9]              [4.9,6.9]
## 122          4.9           [2.97,4.93)              [4.9,6.9]
## 123          6.7            [4.93,6.9]              [4.9,6.9]
## 124          4.9           [2.97,4.93)              [4.9,6.9]
## 125          5.7            [4.93,6.9]              [4.9,6.9]
## 126          6.0            [4.93,6.9]              [4.9,6.9]
## 127          4.8           [2.97,4.93)             [2.63,4.9)
## 128          4.9           [2.97,4.93)              [4.9,6.9]
## 129          5.6            [4.93,6.9]              [4.9,6.9]
## 130          5.8            [4.93,6.9]              [4.9,6.9]
## 131          6.1            [4.93,6.9]              [4.9,6.9]
## 132          6.4            [4.93,6.9]              [4.9,6.9]
## 133          5.6            [4.93,6.9]              [4.9,6.9]
## 134          5.1            [4.93,6.9]              [4.9,6.9]
## 135          5.6            [4.93,6.9]              [4.9,6.9]
## 136          6.1            [4.93,6.9]              [4.9,6.9]
## 137          5.6            [4.93,6.9]              [4.9,6.9]
## 138          5.5            [4.93,6.9]              [4.9,6.9]
## 139          4.8           [2.97,4.93)             [2.63,4.9)
## 140          5.4            [4.93,6.9]              [4.9,6.9]
## 141          5.6            [4.93,6.9]              [4.9,6.9]
## 142          5.1            [4.93,6.9]              [4.9,6.9]
## 143          5.1            [4.93,6.9]              [4.9,6.9]
## 144          5.9            [4.93,6.9]              [4.9,6.9]
## 145          5.7            [4.93,6.9]              [4.9,6.9]
## 146          5.2            [4.93,6.9]              [4.9,6.9]
## 147          5.0            [4.93,6.9]              [4.9,6.9]
## 148          5.2            [4.93,6.9]              [4.9,6.9]
## 149          5.4            [4.93,6.9]              [4.9,6.9]
## 150          5.1            [4.93,6.9]              [4.9,6.9]
##     Petal.Length.cluster Petal.Length.fixed
## 1               [1,2.88)              [0,2)
## 2               [1,2.88)              [0,2)
## 3               [1,2.88)              [0,2)
## 4               [1,2.88)              [0,2)
## 5               [1,2.88)              [0,2)
## 6               [1,2.88)              [0,2)
## 7               [1,2.88)              [0,2)
## 8               [1,2.88)              [0,2)
## 9               [1,2.88)              [0,2)
## 10              [1,2.88)              [0,2)
## 11              [1,2.88)              [0,2)
## 12              [1,2.88)              [0,2)
## 13              [1,2.88)              [0,2)
## 14              [1,2.88)              [0,2)
## 15              [1,2.88)              [0,2)
## 16              [1,2.88)              [0,2)
## 17              [1,2.88)              [0,2)
## 18              [1,2.88)              [0,2)
## 19              [1,2.88)              [0,2)
## 20              [1,2.88)              [0,2)
## 21              [1,2.88)              [0,2)
## 22              [1,2.88)              [0,2)
## 23              [1,2.88)              [0,2)
## 24              [1,2.88)              [0,2)
## 25              [1,2.88)              [0,2)
## 26              [1,2.88)              [0,2)
## 27              [1,2.88)              [0,2)
## 28              [1,2.88)              [0,2)
## 29              [1,2.88)              [0,2)
## 30              [1,2.88)              [0,2)
## 31              [1,2.88)              [0,2)
## 32              [1,2.88)              [0,2)
## 33              [1,2.88)              [0,2)
## 34              [1,2.88)              [0,2)
## 35              [1,2.88)              [0,2)
## 36              [1,2.88)              [0,2)
## 37              [1,2.88)              [0,2)
## 38              [1,2.88)              [0,2)
## 39              [1,2.88)              [0,2)
## 40              [1,2.88)              [0,2)
## 41              [1,2.88)              [0,2)
## 42              [1,2.88)              [0,2)
## 43              [1,2.88)              [0,2)
## 44              [1,2.88)              [0,2)
## 45              [1,2.88)              [0,2)
## 46              [1,2.88)              [0,2)
## 47              [1,2.88)              [0,2)
## 48              [1,2.88)              [0,2)
## 49              [1,2.88)              [0,2)
## 50              [1,2.88)              [0,2)
## 51           [2.88,4.96)              [4,6)
## 52           [2.88,4.96)              [4,6)
## 53           [2.88,4.96)              [4,6)
## 54           [2.88,4.96)              [4,6)
## 55           [2.88,4.96)              [4,6)
## 56           [2.88,4.96)              [4,6)
## 57           [2.88,4.96)              [4,6)
## 58           [2.88,4.96)              [2,4)
## 59           [2.88,4.96)              [4,6)
## 60           [2.88,4.96)              [2,4)
## 61           [2.88,4.96)              [2,4)
## 62           [2.88,4.96)              [4,6)
## 63           [2.88,4.96)              [4,6)
## 64           [2.88,4.96)              [4,6)
## 65           [2.88,4.96)              [2,4)
## 66           [2.88,4.96)              [4,6)
## 67           [2.88,4.96)              [4,6)
## 68           [2.88,4.96)              [4,6)
## 69           [2.88,4.96)              [4,6)
## 70           [2.88,4.96)              [2,4)
## 71           [2.88,4.96)              [4,6)
## 72           [2.88,4.96)              [4,6)
## 73           [2.88,4.96)              [4,6)
## 74           [2.88,4.96)              [4,6)
## 75           [2.88,4.96)              [4,6)
## 76           [2.88,4.96)              [4,6)
## 77           [2.88,4.96)              [4,6)
## 78            [4.96,6.9]              [4,6)
## 79           [2.88,4.96)              [4,6)
## 80           [2.88,4.96)              [2,4)
## 81           [2.88,4.96)              [2,4)
## 82           [2.88,4.96)              [2,4)
## 83           [2.88,4.96)              [2,4)
## 84            [4.96,6.9]              [4,6)
## 85           [2.88,4.96)              [4,6)
## 86           [2.88,4.96)              [4,6)
## 87           [2.88,4.96)              [4,6)
## 88           [2.88,4.96)              [4,6)
## 89           [2.88,4.96)              [4,6)
## 90           [2.88,4.96)              [4,6)
## 91           [2.88,4.96)              [4,6)
## 92           [2.88,4.96)              [4,6)
## 93           [2.88,4.96)              [4,6)
## 94           [2.88,4.96)              [2,4)
## 95           [2.88,4.96)              [4,6)
## 96           [2.88,4.96)              [4,6)
## 97           [2.88,4.96)              [4,6)
## 98           [2.88,4.96)              [4,6)
## 99           [2.88,4.96)              [2,4)
## 100          [2.88,4.96)              [4,6)
## 101           [4.96,6.9]              [6,8]
## 102           [4.96,6.9]              [4,6)
## 103           [4.96,6.9]              [4,6)
## 104           [4.96,6.9]              [4,6)
## 105           [4.96,6.9]              [4,6)
## 106           [4.96,6.9]              [6,8]
## 107          [2.88,4.96)              [4,6)
## 108           [4.96,6.9]              [6,8]
## 109           [4.96,6.9]              [4,6)
## 110           [4.96,6.9]              [6,8]
## 111           [4.96,6.9]              [4,6)
## 112           [4.96,6.9]              [4,6)
## 113           [4.96,6.9]              [4,6)
## 114           [4.96,6.9]              [4,6)
## 115           [4.96,6.9]              [4,6)
## 116           [4.96,6.9]              [4,6)
## 117           [4.96,6.9]              [4,6)
## 118           [4.96,6.9]              [6,8]
## 119           [4.96,6.9]              [6,8]
## 120           [4.96,6.9]              [4,6)
## 121           [4.96,6.9]              [4,6)
## 122          [2.88,4.96)              [4,6)
## 123           [4.96,6.9]              [6,8]
## 124          [2.88,4.96)              [4,6)
## 125           [4.96,6.9]              [4,6)
## 126           [4.96,6.9]              [6,8]
## 127          [2.88,4.96)              [4,6)
## 128          [2.88,4.96)              [4,6)
## 129           [4.96,6.9]              [4,6)
## 130           [4.96,6.9]              [4,6)
## 131           [4.96,6.9]              [6,8]
## 132           [4.96,6.9]              [6,8]
## 133           [4.96,6.9]              [4,6)
## 134           [4.96,6.9]              [4,6)
## 135           [4.96,6.9]              [4,6)
## 136           [4.96,6.9]              [6,8]
## 137           [4.96,6.9]              [4,6)
## 138           [4.96,6.9]              [4,6)
## 139          [2.88,4.96)              [4,6)
## 140           [4.96,6.9]              [4,6)
## 141           [4.96,6.9]              [4,6)
## 142           [4.96,6.9]              [4,6)
## 143           [4.96,6.9]              [4,6)
## 144           [4.96,6.9]              [4,6)
## 145           [4.96,6.9]              [4,6)
## 146           [4.96,6.9]              [4,6)
## 147           [4.96,6.9]              [4,6)
## 148           [4.96,6.9]              [4,6)
## 149           [4.96,6.9]              [4,6)
## 150           [4.96,6.9]              [4,6)

Вывод данных

# Show only the first six rows of the four discretized columns.
head(
  iris[, c(
    "Petal.Length.interval", "Petal.Length.frequency",
    "Petal.Length.cluster", "Petal.Length.fixed"
  )],
  n = 6L
)
##   Petal.Length.interval Petal.Length.frequency Petal.Length.cluster
## 1              [1,2.97)               [1,2.63)             [1,2.88)
## 2              [1,2.97)               [1,2.63)             [1,2.88)
## 3              [1,2.97)               [1,2.63)             [1,2.88)
## 4              [1,2.97)               [1,2.63)             [1,2.88)
## 5              [1,2.97)               [1,2.63)             [1,2.88)
## 6              [1,2.97)               [1,2.63)             [1,2.88)
##   Petal.Length.fixed
## 1              [0,2)
## 2              [0,2)
## 3              [0,2)
## 4              [0,2)
## 5              [0,2)
## 6              [0,2)

Общий вывод

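Каждый из этих методов разбивает непрерывную переменную на интервалы, но делает это по-своему. Метод «frequency» подбирает границы так, чтобы в каждый интервал попадало примерно одинаковое число наблюдений, поэтому он подходит для неравномерно распределённых данных. Метод «interval» делит диапазон значений на интервалы равной ширины и удобен, когда данные распределены примерно равномерно. Метод «cluster» определяет границы с помощью кластеризации (k-средних) и полезен, когда значения образуют естественные группы. Метод «fixed» использует границы, заданные пользователем, и необходим, когда интервалы должны соответствовать заранее известным порогам.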

Задание 4

Использование пакета Boruta для выбора значимых признаков в наборе данных Ozone.

# Install (only when missing) and attach the packages used in this task.
# Packages that are already available are skipped; the HTTPS CRAN mirror is
# used for anything that still has to be downloaded.
for (pkg in c("Boruta", "mlbench")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org")
  }
}
library(Boruta)
library(mlbench)

Загрузка данных и обработка пропусков

# Load the Ozone data set shipped with mlbench.
data(list = "Ozone", package = "mlbench")
# Keep only the rows that have no missing values.
Ozone <- na.omit(object = Ozone)
# Preview the first rows of the cleaned data.
head(x = Ozone)
##    V1 V2 V3 V4   V5 V6 V7 V8    V9  V10 V11   V12 V13
## 5   1  5  1  5 5760  3 51 54 45.32 1450  25 57.02  60
## 6   1  6  2  6 5720  4 69 35 49.64 1568  15 53.78  60
## 7   1  7  3  4 5790  6 19 45 46.40 2631 -33 54.14 100
## 8   1  8  4  4 5790  3 25 55 52.70  554 -28 64.76 250
## 9   1  9  5  6 5700  3 73 41 48.02 2083  23 52.52 120
## 12  1 12  1  6 5720  3 44 51 54.32  111   9 63.14 150

Выбор признаков с Boruta

# Boruta is a randomized procedure (random-forest importance compared against
# permuted copies of the attributes), so the seed is fixed to make the set of
# confirmed/rejected attributes reproducible between runs.
set.seed(123)
# Select the attributes relevant for predicting V4; doTrace = 2 prints
# progress and decisions while the algorithm runs.
boruta_result <- Boruta(V4 ~ ., data = Ozone, doTrace = 2)
##  1. run of importance source...
##  2. run of importance source...
##  3. run of importance source...
##  4. run of importance source...
##  5. run of importance source...
##  6. run of importance source...
##  7. run of importance source...
##  8. run of importance source...
##  9. run of importance source...
##  10. run of importance source...
##  11. run of importance source...
## After 11 iterations, +0.58 secs:
##  confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more;
##  rejected 1 attribute: V3;
##  still have 2 attributes left.
##  12. run of importance source...
##  13. run of importance source...
##  14. run of importance source...
##  15. run of importance source...
## After 15 iterations, +0.78 secs:
##  rejected 1 attribute: V2;
##  still have 1 attribute left.
##  16. run of importance source...
##  17. run of importance source...
##  18. run of importance source...
##  19. run of importance source...
##  20. run of importance source...
##  21. run of importance source...
## After 21 iterations, +1.1 secs:
##  rejected 1 attribute: V6;
##  no more attributes left.
# Summarise the Boruta run: iterations and confirmed/rejected attributes.
print(x = boruta_result)
## Boruta performed 21 iterations in 1.081006 secs.
##  9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
##  3 attributes confirmed unimportant: V2, V3, V6;
# Plot the attribute importance recorded in the Boruta result.
plot(x = boruta_result, cex.axis = 0.8)

# Names of the attributes Boruta kept (confirmed plus any still tentative).
important_vars <- getSelectedAttributes(
  x = boruta_result,
  withTentative = TRUE
)
# Side-by-side box plots of the raw values of the selected columns.
boxplot(
  x = Ozone[, important_vars],
  main = "Selected Features Boxplot",
  las = 2,
  col = "lightblue"
)

Вывод

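Алгоритм Boruta оценивает не корреляцию, а важность признаков для предсказания V4 (на основе случайного леса). Значимыми для предсказания V4 подтверждены признаки: V9, V8, V12, V11, V7, V10, V13, V1, V5; признаки V2, V3 и V6 отклонены как незначимые. На диаграмме размаха значения V5 значительно выше остальных, поскольку признаки измерены в разных шкалах. Признак V10 имеет большой разброс значений.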