Задание 1

Подключение библиотеки

library(caret)
## Загрузка требуемого пакета: ggplot2
## Загрузка требуемого пакета: lattice

Создание данных

# Fix the RNG seed so the synthetic data (and every plot built from it)
# is identical on each run of the report.
set.seed(42)

# 50 observations of 5 independent standard-normal features.
x <- matrix(rnorm(50 * 5), ncol = 5)

# Two balanced class labels, alternating A, B, A, B, ... (25 of each).
y <- factor(rep(c("A", "B"), times = 25))

Построение графиков

# Pairwise scatter-plot matrix of the feature columns in x,
# grouped by the class labels in y.
featurePlot(x = x, y = y, plot = "pairs")

Сохранение графика

# Open a JPEG graphics device; everything drawn until dev.off() goes to
# feature_plot.jpg in the current working directory.
jpeg("feature_plot.jpg")

# featurePlot() returns a lattice (trellis) object, which is only drawn when
# it is printed. Printing explicitly makes sure the plot reaches the file even
# when this code is run through source() or from inside a function, where
# top-level auto-printing does not happen.
print(
  featurePlot(
    x = x,
    y = y,
    plot = "pairs"
  )
)

# Close the device so the file is flushed to disk.
dev.off()
## png 
##   2

Вывод

Данные распределены случайным образом, выраженного разделения классов не наблюдается.

Задание 2

library(FSelector)
# Load the built-in iris data set.
data(iris)

# Score every predictor by its information gain with respect to Species.
weights <- information.gain(formula = Species ~ ., data = iris)

# Show the per-attribute importance table.
weights
##              attr_importance
## Sepal.Length       0.4521286
## Sepal.Width        0.2672750
## Petal.Length       0.9402853
## Petal.Width        0.9554360

Вывод

Наибольшую важность для определения вида ириса имеют длина и ширина лепестка.

Задание 3

library(arules)
## Загрузка требуемого пакета: Matrix
## 
## Присоединяю пакет: 'arules'
## Следующие объекты скрыты от 'package:base':
## 
##     abbreviate, write
data(iris)

# Equal-width discretization of Sepal.Length into 3 bins.
# `breaks` replaces the deprecated `categories` argument, so the call no
# longer raises a deprecation warning; the resulting bins are unchanged.
disc_int <- discretize(
  iris$Sepal.Length,
  method = "interval",
  breaks = 3
)
disc_int
##   [1] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
##   [8] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
##  [15] [5.5,6.7) [5.5,6.7) [4.3,5.5) [4.3,5.5) [5.5,6.7) [4.3,5.5) [4.3,5.5)
##  [22] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
##  [29] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [5.5,6.7) [4.3,5.5)
##  [36] [4.3,5.5) [5.5,6.7) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
##  [43] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
##  [50] [4.3,5.5) [6.7,7.9] [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7)
##  [57] [5.5,6.7) [4.3,5.5) [5.5,6.7) [4.3,5.5) [4.3,5.5) [5.5,6.7) [5.5,6.7)
##  [64] [5.5,6.7) [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7)
##  [71] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9]
##  [78] [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7)
##  [85] [4.3,5.5) [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7)
##  [92] [5.5,6.7) [5.5,6.7) [4.3,5.5) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7)
##  [99] [4.3,5.5) [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7)
## [106] [6.7,7.9] [4.3,5.5) [6.7,7.9] [6.7,7.9] [6.7,7.9] [5.5,6.7) [5.5,6.7)
## [113] [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9] [6.7,7.9]
## [120] [5.5,6.7) [6.7,7.9] [5.5,6.7) [6.7,7.9] [5.5,6.7) [6.7,7.9] [6.7,7.9]
## [127] [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9] [6.7,7.9] [6.7,7.9] [5.5,6.7)
## [134] [5.5,6.7) [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9]
## [141] [6.7,7.9] [6.7,7.9] [5.5,6.7) [6.7,7.9] [6.7,7.9] [6.7,7.9] [5.5,6.7)
## [148] [5.5,6.7) [5.5,6.7) [5.5,6.7)
## attr(,"discretized:breaks")
## [1] 4.3 5.5 6.7 7.9
## attr(,"discretized:method")
## [1] interval
## Levels: [4.3,5.5) [5.5,6.7) [6.7,7.9]

Вывод

Метод равных интервалов (method = "interval") делит диапазон значений (от 4.3 до 7.9) на три отрезка одинаковой длины: [4.3, 5.5), [5.5, 6.7) и [6.7, 7.9].

Задание 4

library(Boruta)
library(mlbench)

# Load the Ozone data set (provided by the mlbench package loaded above).
data("Ozone")

# Keep only the rows without missing values.
ozone_clean <- na.omit(Ozone)

# Run Boruta feature selection with V4 as the response and all remaining
# columns as candidate predictors; doTrace = 2 reports progress per iteration.
# NOTE(review): no RNG seed is set, so the decisions and iteration count may
# vary between runs.
boruta_model <- Boruta(V4 ~ ., data = ozone_clean, doTrace = 2)
##  1. run of importance source...
##  2. run of importance source...
##  3. run of importance source...
##  4. run of importance source...
##  5. run of importance source...
##  6. run of importance source...
##  7. run of importance source...
##  8. run of importance source...
##  9. run of importance source...
##  10. run of importance source...
##  11. run of importance source...
## After 11 iterations, +0.19 secs:
##  confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more;
##  rejected 1 attribute: V3;
##  still have 2 attributes left.
##  12. run of importance source...
##  13. run of importance source...
##  14. run of importance source...
##  15. run of importance source...
## After 15 iterations, +0.26 secs:
##  rejected 1 attribute: V2;
##  still have 1 attribute left.
##  16. run of importance source...
##  17. run of importance source...
##  18. run of importance source...
##  19. run of importance source...
##  20. run of importance source...
##  21. run of importance source...
##  22. run of importance source...
##  23. run of importance source...
##  24. run of importance source...
## After 24 iterations, +0.4 secs:
##  rejected 1 attribute: V6;
##  no more attributes left.
# Print the Boruta summary: iteration count and the confirmed/rejected attributes.
print(boruta_model)
## Boruta performed 24 iterations in 0.3988771 secs.
##  9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
##  3 attributes confirmed unimportant: V2, V3, V6;

Boxplot

# Draw the attribute-importance boxplot; las = 2 sets the axis labels
# perpendicular to the axis so the attribute names do not overlap.
plot(boruta_model, las = 2)

Вывод

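Зелёные прямоугольники – признаки, подтверждённые как важные. Красные прямоугольники – признаки, признанные неважными. Синие прямоугольники – «теневые признаки» (случайно перемешанные копии исходных признаков, задающие уровень случайной важности). Признак считается действительно важным, если его важность устойчиво выше, чем у лучшего теневого признака (самого высокого синего прямоугольника). В данном случае важными признаны 9 признаков, а V2, V3 и V6 отвергнуты.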