# --- Шаг 1: Пакет CARET ---
# Step 1 demo: scatter-plot matrix of synthetic features, grouped by class.
# featurePlot() comes from caret, so the package has to be attached first.
library(caret)

# Seed the RNG so the synthetic data, and therefore both renderings of the
# plot, are reproducible between runs.
set.seed(123)
x <- matrix(rnorm(50 * 5), ncol = 5) # 50 rows x 5 standard-normal columns
y <- factor(rep(c("A", "B"), each = 25)) # rows 1-25 are "A", rows 26-50 are "B"

# featurePlot() returns a lattice object that is only drawn when printed.
# Printing explicitly keeps the script working under source(), where
# top-level values are not auto-printed.
print(featurePlot(x = x, y = y, plot = "pairs"))

# Render the same figure into a JPEG file in the working directory.
jpeg("feature_plot.jpg")
print(featurePlot(x = x, y = y, plot = "pairs"))
dev.off()
## png
## 2
# Вывод: График показывает распределение признаков и их связь с классами.
# --- Шаг 2: Важность признаков ---
# Step 2: score every iris predictor by its information gain against Species.
library(FSelector)
data("iris")
weights <- information.gain(Species ~ ., data = iris)
weights
## attr_importance
## Sepal.Length 0.4521286
## Sepal.Width 0.2672750
## Petal.Length 0.9402853
## Petal.Width 0.9554360
# Bar chart of the scores: one bar per predictor, labelled with the row names
# of the score table.
barplot(
  height = weights$attr_importance,
  names.arg = rownames(weights),
  col = "skyblue",
  main = "Важность признаков в Iris"
)
# Вывод: Petal.Length и Petal.Width наиболее важны для классификации видов ирисов.
# --- Шаг 3: Дискретизация признаков ---
library(arules) # provides discretize()

# Discretize Petal.Length with four different binning methods, storing each
# result as a new column of iris.
# The bins are specified through `breaks`; the `categories` argument is
# deprecated in arules and raises a warning on every call.
iris$Petal.Length.Interval <- discretize(
  iris$Petal.Length,
  method = "interval", breaks = 3
)
iris$Petal.Length.Frequency <- discretize(
  iris$Petal.Length,
  method = "frequency", breaks = 3
)
# Per the arules documentation, "cluster" bins via k-means, which starts from
# random centres; seed the RNG so the cut points are reproducible.
set.seed(123)
iris$Petal.Length.Cluster <- discretize(
  iris$Petal.Length,
  method = "cluster", breaks = 3
)
# For "fixed", `breaks` is the explicit vector of bin boundaries.
iris$Petal.Length.Fixed <- discretize(
  iris$Petal.Length,
  method = "fixed", breaks = c(1, 2, 5, 7)
)

# Show the first 10 rows: the raw value next to each discretized variant.
head(
  iris[, c(
    "Petal.Length", "Petal.Length.Interval", "Petal.Length.Frequency",
    "Petal.Length.Cluster", "Petal.Length.Fixed"
  )],
  10
)
## Petal.Length Petal.Length.Interval Petal.Length.Frequency
## 1 1.4 [1,2.97) [1,2.63)
## 2 1.4 [1,2.97) [1,2.63)
## 3 1.3 [1,2.97) [1,2.63)
## 4 1.5 [1,2.97) [1,2.63)
## 5 1.4 [1,2.97) [1,2.63)
## 6 1.7 [1,2.97) [1,2.63)
## 7 1.4 [1,2.97) [1,2.63)
## 8 1.5 [1,2.97) [1,2.63)
## 9 1.4 [1,2.97) [1,2.63)
## 10 1.5 [1,2.97) [1,2.63)
## Petal.Length.Cluster Petal.Length.Fixed
## 1 [1,2.95) [1,2)
## 2 [1,2.95) [1,2)
## 3 [1,2.95) [1,2)
## 4 [1,2.95) [1,2)
## 5 [1,2.95) [1,2)
## 6 [1,2.95) [1,2)
## 7 [1,2.95) [1,2)
## 8 [1,2.95) [1,2)
## 9 [1,2.95) [1,2)
## 10 [1,2.95) [1,2)
# Вывод: Разные методы дискретизации создают разные категории; выбор метода зависит от задачи.
# --- Шаг 4: Boruta и Ozone ---
# Step 4 setup: run Boruta feature selection on the mlbench Ozone data set,
# using V4 as the response and all remaining columns as candidates.
library(Boruta)
library(mlbench)
data(Ozone, package = "mlbench")
df <- Ozone

# Keep only the rows that have no missing values.
df_clean <- na.omit(df)

# Seed the RNG right before the run so the selection is repeatable.
set.seed(123)
boruta_output <- Boruta(V4 ~ ., data = df_clean, doTrace = 2)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## After 11 iterations, +0.65 secs:
## confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more;
## rejected 2 attributes: V3, V6;
## still have 1 attribute left.
## 12. run of importance source...
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## 16. run of importance source...
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## 21. run of importance source...
## 22. run of importance source...
## 23. run of importance source...
## 24. run of importance source...
## After 24 iterations, +1.4 secs:
## rejected 1 attribute: V2;
## no more attributes left.
# Print the textual summary of the Boruta run.
print(boruta_output)
## Boruta performed 24 iterations in 1.387642 secs.
## 9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
## 3 attributes confirmed unimportant: V2, V3, V6;
# Draw the importance box plot on the current device; las = 2 turns the axis
# labels perpendicular to the axis and cex.axis = 0.7 shrinks them.
plot(boruta_output, las = 2, cex.axis = 0.7)
# Render the same plot into a JPEG file in the working directory.
jpeg(filename = "boruta_boxplot.jpg")
plot(boruta_output, las = 2, cex.axis = 0.7)
dev.off()
## png
## 2
# Вывод: Boruta определяет, какие признаки действительно важны; визуально это видно на диаграмме boxplot.