Этот отчёт содержит результаты выполнения лабораторной работы по
анализу данных с использованием R. В работе рассматриваются следующие
задачи:

1. Установка и использование пакета `caret`.
2. Графический анализ данных с использованием функции `featurePlot()`.
3. Определение важности признаков с использованием пакета `FSelector`.
4. Преобразование непрерывной переменной в категориальную с
   использованием пакета `arules`.
5. Выбор признаков с использованием пакета `Boruta`.
6. Построение графиков `boxplot` для анализа данных.
# Feature plots with caret: first on the built-in iris measurements, then on
# a synthetic two-class data set (50 rows, 5 standard-normal columns).
# featurePlot() is namespace-qualified so this section runs even when caret
# has not been attached with library().
set.seed(123) # fix the RNG so the simulated matrix is reproducible
x <- matrix(rnorm(50 * 5), ncol = 5)
y <- factor(rep(c("A", "B"), 25)) # alternating labels, 25 per class
caret::featurePlot(x = iris[, 1:4], y = iris$Species, plot = "density")
caret::featurePlot(x = x, y = y, plot = "density") # density plots
caret::featurePlot(x = x, y = y, plot = "boxplot") # box plots
caret::featurePlot(x = x, y = y, plot = "pairs") # scatterplot matrix
# Score every iris predictor by its information gain with respect to Species.
# information.gain() is namespace-qualified so the call does not depend on
# FSelector having been attached earlier.
data(iris) # reload the data set so the formula sees only the original columns
weights <- FSelector::information.gain(Species ~ ., data = iris)
print(weights)
## attr_importance
## Sepal.Length 0.4521286
## Sepal.Width 0.2672750
## Petal.Length 0.9402853
## Petal.Width 0.9554360
# Turn the continuous Sepal.Length into a 3-level categorical variable using
# four different binning strategies. Each result is stored as a new column of
# iris and tabulated. discretize() is namespace-qualified so the calls do not
# depend on arules having been attached.
data(iris) # start from a fresh copy without previously added columns

# "interval": three bins; the recorded output shows equal-width cut points.
iris$Sepal.Length_interval <- arules::discretize(
  iris$Sepal.Length,
  method = "interval", breaks = 3
)
table(iris$Sepal.Length_interval)
##
## [4.3,5.5) [5.5,6.7) [6.7,7.9]
## 52 70 28

# "frequency": three bins; the recorded output shows roughly equal counts.
iris$Sepal.Length_frequency <- arules::discretize(
  iris$Sepal.Length,
  method = "frequency", breaks = 3
)
table(iris$Sepal.Length_frequency)
##
## [4.3,5.4) [5.4,6.3) [6.3,7.9]
## 46 53 51

# "cluster": three bins.
# NOTE(review): presumably clustering-based and therefore dependent on the
# RNG state; confirm, and consider seeding before this call.
iris$Sepal.Length_cluster <- arules::discretize(
  iris$Sepal.Length,
  method = "cluster", breaks = 3
)
table(iris$Sepal.Length_cluster)
##
## [4.3,5.33) [5.33,6.27) [6.27,7.9]
## 46 53 51

# "fixed": explicit, user-chosen cut points.
iris$Sepal.Length_fixed <- arules::discretize(
  iris$Sepal.Length,
  method = "fixed", breaks = c(4, 5.5, 6.5, 8)
)
table(iris$Sepal.Length_fixed)
##
## [4,5.5) [5.5,6.5) [6.5,8]
## 52 63 35
# Feature selection with Boruta on the Ozone data, using V4 as the response.
# The data set is loaded with an explicit package argument and Boruta() is
# namespace-qualified, so the section does not rely on mlbench or Boruta
# having been attached with library().
# NOTE(review): Ozone is taken to be mlbench's data set (columns V1..V13, as
# seen in the recorded output below); confirm.
data("Ozone", package = "mlbench")
ozone_data <- na.omit(Ozone) # keep complete rows only
set.seed(123) # make the Boruta run reproducible
boruta_output <- Boruta::Boruta(V4 ~ ., data = ozone_data, doTrace = 2)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## After 11 iterations, +0.6 secs:
## confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more;
## rejected 2 attributes: V3, V6;
## still have 1 attribute left.
## 12. run of importance source...
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## 16. run of importance source...
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## 21. run of importance source...
## 22. run of importance source...
## 23. run of importance source...
## 24. run of importance source...
## After 24 iterations, +1.3 secs:
## rejected 1 attribute: V2;
## no more attributes left.
print(boruta_output)
## Boruta performed 24 iterations in 1.2929 secs.
## 9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
## 3 attributes confirmed unimportant: V2, V3, V6;
# Boruta importance plot, followed by one box plot per confirmed feature.
plot(
  boruta_output,
  cex.axis = 0.7, las = 2, xlab = "",
  main = "Важность признаков (Boruta)"
)
confirmed_features <- Boruta::getSelectedAttributes(
  boruta_output,
  withTentative = FALSE
)

# Skip the panel grid when nothing was confirmed: an mfrow dimension of 0 is
# rejected by par().
if (length(confirmed_features) > 0) {
  # Squeezing every panel into a single row leaves almost no plotting area
  # once several features are confirmed; n2mfrow() picks a compact
  # rows-by-columns grid for the given number of panels instead.
  old_par <- par(
    mar = c(4, 4, 2, 1),
    mfrow = grDevices::n2mfrow(length(confirmed_features))
  )
  for (feature in confirmed_features) {
    boxplot(ozone_data[[feature]], main = feature, ylab = "Значения")
  }
  par(old_par) # restore the previous margins and layout for later plots
}