Введение

Этот отчет содержит результаты выполнения лабораторной работы по анализу данных с использованием R. В работе рассматриваются следующие задачи:

1. Установка и использование пакета caret.
2. Графический анализ данных с использованием функции featurePlot().
3. Определение важности признаков с использованием пакета FSelector.
4. Преобразование непрерывной переменной в категориальную с использованием пакета arules.
5. Выбор признаков с использованием пакета Boruta.
6. Построение графиков boxplot для анализа данных.

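Установка пакетов

Для выполнения кода ниже должны быть установлены и подключены пакеты caret, FSelector, arules и Boruta, а также пакет mlbench, содержащий набор данных Ozone.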

featurePlot()

# Reproducible synthetic example: 50 observations of 5 standard-normal
# predictors (x) and a two-level class label alternating "A", "B" (y).
set.seed(123)
x <- matrix(rnorm(250), nrow = 50)
y <- factor(rep_len(c("A", "B"), 50))
# Per-class density curves for the four iris measurements.
featurePlot(x = iris[1:4], y = iris[["Species"]], plot = "density")

# Draw the synthetic predictors x against the labels y three ways:
# density curves, box plots and a scatterplot matrix. The plot object is
# printed explicitly because auto-printing does not happen inside a function.
invisible(lapply(
  c("density", "boxplot", "pairs"),
  function(plot_type) print(featurePlot(x = x, y = y, plot = plot_type))
))

FSelector

# Load the built-in iris data set into the workspace.
data("iris")

# Information gain of every predictor with respect to Species;
# the result holds one importance value per attribute.
weights <- information.gain(formula = Species ~ ., data = iris)

print(weights)
##              attr_importance
## Sepal.Length       0.4521286
## Sepal.Width        0.2672750
## Petal.Length       0.9402853
## Petal.Width        0.9554360

arules

# Reload iris so the derived columns below are added to a clean copy.
data("iris")

# Split Sepal.Length into 3 bins with method = "interval" and tabulate
# how many observations fall into each bin.
iris[["Sepal.Length_interval"]] <- discretize(
  iris[["Sepal.Length"]],
  method = "interval",
  breaks = 3
)
table(iris[["Sepal.Length_interval"]])
## 
## [4.3,5.5) [5.5,6.7) [6.7,7.9] 
##        52        70        28
# Split Sepal.Length into 3 bins with method = "frequency" and tabulate
# the bin counts.
iris[["Sepal.Length_frequency"]] <- discretize(
  iris[["Sepal.Length"]],
  method = "frequency",
  breaks = 3
)
table(iris[["Sepal.Length_frequency"]])
## 
## [4.3,5.4) [5.4,6.3) [6.3,7.9] 
##        46        53        51
# Split Sepal.Length into 3 bins with method = "cluster" and tabulate the
# bin counts. The cluster method is k-means based and starts from randomly
# chosen centres, so the RNG is seeded first to keep the bin edges
# reproducible between runs.
set.seed(123)
iris$Sepal.Length_cluster <- discretize(iris$Sepal.Length, method = "cluster", breaks = 3)
table(iris$Sepal.Length_cluster)
## 
##  [4.3,5.33) [5.33,6.27)  [6.27,7.9] 
##          46          53          51
# Split Sepal.Length at explicitly supplied cut points (4, 5.5, 6.5, 8)
# with method = "fixed" and tabulate the bin counts.
iris[["Sepal.Length_fixed"]] <- discretize(
  iris[["Sepal.Length"]],
  method = "fixed",
  breaks = c(4, 5.5, 6.5, 8)
)
table(iris[["Sepal.Length_fixed"]])
## 
##   [4,5.5) [5.5,6.5)   [6.5,8] 
##        52        63        35

Boruta

# Load the Ozone data set explicitly from the mlbench package so that the
# call does not depend on mlbench already being attached to the search path.
data(Ozone, package = "mlbench")

# Keep complete cases only: rows with any missing value are dropped.
ozone_data <- na.omit(Ozone)

# Boruta is randomised; fix the seed so the selection is reproducible.
set.seed(123)

# Feature selection with V4 as the response and all other columns as
# candidate predictors; doTrace = 2 reports progress for every iteration.
boruta_output <- Boruta(V4 ~ ., data = ozone_data, doTrace = 2)
##  1. run of importance source...
##  2. run of importance source...
##  3. run of importance source...
##  4. run of importance source...
##  5. run of importance source...
##  6. run of importance source...
##  7. run of importance source...
##  8. run of importance source...
##  9. run of importance source...
##  10. run of importance source...
##  11. run of importance source...
## After 11 iterations, +0.6 secs:
##  confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more;
##  rejected 2 attributes: V3, V6;
##  still have 1 attribute left.
##  12. run of importance source...
##  13. run of importance source...
##  14. run of importance source...
##  15. run of importance source...
##  16. run of importance source...
##  17. run of importance source...
##  18. run of importance source...
##  19. run of importance source...
##  20. run of importance source...
##  21. run of importance source...
##  22. run of importance source...
##  23. run of importance source...
##  24. run of importance source...
## After 24 iterations, +1.3 secs:
##  rejected 1 attribute: V2;
##  no more attributes left.
# Text summary of the Boruta decisions.
print(boruta_output)
## Boruta performed 24 iterations in 1.2929 secs.
##  9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
##  3 attributes confirmed unimportant: V2, V3, V6;
# Attribute importance plot; axis labels are shrunk and drawn perpendicular
# to the axis (las = 2).
plot(boruta_output, cex.axis = 0.7, las = 2, xlab = "", main = "Важность признаков (Boruta)")

# Names of the attributes Boruta confirmed (tentative ones excluded).
confirmed_features <- getSelectedAttributes(boruta_output, withTentative = FALSE)

# A box plot is only meaningful for numeric data, so non-numeric columns
# (e.g. factors) among the confirmed attributes are skipped.
is_numeric_feature <- vapply(
  confirmed_features,
  function(feature) is.numeric(ozone_data[[feature]]),
  logical(1)
)
numeric_features <- confirmed_features[is_numeric_feature]

# Guard against an empty selection: mfrow = c(1, 0) is not a valid layout.
if (length(numeric_features) > 0) {
  # One panel per feature in a single row; remember the previous graphics
  # settings so they can be restored afterwards.
  old_par <- par(mar = c(4, 4, 2, 1), mfrow = c(1, length(numeric_features)))
  for (feature in numeric_features) {
    boxplot(ozone_data[[feature]], main = feature, ylab = "Значения")
  }
  par(old_par)
}