# --- Step 1: caret package ---
# featurePlot() is provided by caret; load it explicitly so this step also
# runs in a fresh R session.
library(caret)

# Fix the RNG seed so the simulated predictors are reproducible.
set.seed(123)

# Synthetic data: 50 observations x 5 standard-normal features and a balanced
# two-level class label (25 "A" followed by 25 "B").
x <- matrix(rnorm(50 * 5), ncol = 5)
y <- factor(rep(c("A", "B"), each = 25))

# featurePlot() returns a lattice object. Print it explicitly so the plot is
# drawn even when the script is executed through source(), where top-level
# values are not auto-printed.
print(featurePlot(x = x, y = y, plot = "pairs"))

# Write the same scatter-plot matrix to a JPEG file.
jpeg("feature_plot.jpg")
print(featurePlot(x = x, y = y, plot = "pairs"))
dev.off()
## png 
##   2

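# Вывод: график показывает попарные распределения признаков и их связь с классами; поскольку данные сгенерированы случайно, выраженного разделения классов ожидать не следует.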

# --- Step 2: Feature importance ---
library(FSelector)

# Start from the built-in iris data set.
data(iris)

# Information gain of every predictor with respect to Species.
weights <- information.gain(Species ~ ., data = iris)
weights
##              attr_importance
## Sepal.Length       0.4521286
## Sepal.Width        0.2672750
## Petal.Length       0.9402853
## Petal.Width        0.9554360

# Bar chart of the scores, one bar per attribute (labels are the row names).
barplot(
  height = weights$attr_importance,
  names.arg = rownames(weights),
  main = "Важность признаков в Iris",
  col = "skyblue"
)

# Вывод: Petal.Length и Petal.Width наиболее важны для классификации видов ирисов (information gain ≈ 0.94 и 0.96 против 0.45 и 0.27 у признаков чашелистика).

# --- Step 3: Feature discretization ---

library(arules)  # provides discretize()

# Discretize Petal.Length with four different methods. The number of bins (or,
# for method = "fixed", the explicit cut points) is passed through `breaks`;
# the older `categories` argument is deprecated and raises a warning on every
# call.
# NOTE: the new columns are added to the global `iris` data frame.
iris$Petal.Length.Interval <- discretize(
  iris$Petal.Length,
  method = "interval",
  breaks = 3
)
iris$Petal.Length.Frequency <- discretize(
  iris$Petal.Length,
  method = "frequency",
  breaks = 3
)

# The "cluster" method is based on k-means, whose starting centres are chosen
# at random; seed the RNG so the resulting cut points are reproducible.
set.seed(123)
iris$Petal.Length.Cluster <- discretize(
  iris$Petal.Length,
  method = "cluster",
  breaks = 3
)

# Explicit interval boundaries: [1, 2), [2, 5), [5, 7].
iris$Petal.Length.Fixed <- discretize(
  iris$Petal.Length,
  method = "fixed",
  breaks = c(1, 2, 5, 7)
)

# Show the first 10 rows of the original column next to its discretized forms.
head(
  iris[, c(
    "Petal.Length",
    "Petal.Length.Interval",
    "Petal.Length.Frequency",
    "Petal.Length.Cluster",
    "Petal.Length.Fixed"
  )],
  10
)
##    Petal.Length Petal.Length.Interval Petal.Length.Frequency
## 1           1.4              [1,2.97)               [1,2.63)
## 2           1.4              [1,2.97)               [1,2.63)
## 3           1.3              [1,2.97)               [1,2.63)
## 4           1.5              [1,2.97)               [1,2.63)
## 5           1.4              [1,2.97)               [1,2.63)
## 6           1.7              [1,2.97)               [1,2.63)
## 7           1.4              [1,2.97)               [1,2.63)
## 8           1.5              [1,2.97)               [1,2.63)
## 9           1.4              [1,2.97)               [1,2.63)
## 10          1.5              [1,2.97)               [1,2.63)
##    Petal.Length.Cluster Petal.Length.Fixed
## 1              [1,2.95)              [1,2)
## 2              [1,2.95)              [1,2)
## 3              [1,2.95)              [1,2)
## 4              [1,2.95)              [1,2)
## 5              [1,2.95)              [1,2)
## 6              [1,2.95)              [1,2)
## 7              [1,2.95)              [1,2)
## 8              [1,2.95)              [1,2)
## 9              [1,2.95)              [1,2)
## 10             [1,2.95)              [1,2)

# Вывод: разные методы дискретизации дают разные границы интервалов (категории); выбор метода зависит от задачи.

# --- Step 4: Boruta on the Ozone data ---

library(Boruta)
library(mlbench)

# Load the Ozone data set shipped with mlbench and work on a copy of it.
data("Ozone", package = "mlbench")
df <- Ozone

# Drop every row that contains a missing value.
df_clean <- na.omit(df)

# Seed the RNG, then run Boruta with V4 as the response and all remaining
# columns as candidate predictors. doTrace = 2 prints per-iteration progress.
set.seed(123)
boruta_output <- Boruta(V4 ~ ., data = df_clean, doTrace = 2)
##  1. run of importance source...
##  2. run of importance source...
##  3. run of importance source...
##  4. run of importance source...
##  5. run of importance source...
##  6. run of importance source...
##  7. run of importance source...
##  8. run of importance source...
##  9. run of importance source...
##  10. run of importance source...
##  11. run of importance source...
## After 11 iterations, +0.65 secs:
##  confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more;
##  rejected 2 attributes: V3, V6;
##  still have 1 attribute left.
##  12. run of importance source...
##  13. run of importance source...
##  14. run of importance source...
##  15. run of importance source...
##  16. run of importance source...
##  17. run of importance source...
##  18. run of importance source...
##  19. run of importance source...
##  20. run of importance source...
##  21. run of importance source...
##  22. run of importance source...
##  23. run of importance source...
##  24. run of importance source...
## After 24 iterations, +1.4 secs:
##  rejected 1 attribute: V2;
##  no more attributes left.
print(boruta_output)
## Boruta performed 24 iterations in 1.387642 secs.
##  9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
##  3 attributes confirmed unimportant: V2, V3, V6;

# Draw the Boruta box plot on the active device. las = 2 turns the axis labels
# perpendicular to the axis and cex.axis = 0.7 shrinks them.
plot(boruta_output, las = 2, cex.axis = 0.7)

# Write the same plot to a JPEG file.
jpeg(filename = "boruta_boxplot.jpg")
plot(boruta_output, las = 2, cex.axis = 0.7)
dev.off()
## png 
##   2

# Вывод: Boruta определяет, какие признаки действительно важны (9 подтверждены как важные, 3 отклонены: V2, V3, V6); наглядно это видно на boxplot.