Лабораторная работа 2

Задание 1: Разведочный анализ данных с использованием пакета CARET

# Установка и загрузка пакета caret
# install.packages("caret", dependencies = TRUE)
library(caret)

# Получение списка доступных методов
methods <- names(getModelInfo())
# Длина списка методов
length(methods)

## [1] 239

# Создание тестовых данных
set.seed(123)
x <- matrix(rnorm(50*5), ncol=5)
y <- factor(rep(c("A", "B"), 25))

# Графики плотности распределения
jpeg("density_plot.jpg", width = 1000, height = 600)
featurePlot(x, y, plot = "density", 
            scales = list(x = list(relation = "free"), 
                          y = list(relation = "free")),
            adjust = 1.5, pch = "|", 
            layout = c(5, 1), 
            auto.key = list(columns = 2))
dev.off()

## png 
##   2

# Боксплоты
jpeg("box_plot.jpg", width = 1000, height = 600)
featurePlot(x, y, plot = "box", 
            scales = list(y = list(relation = "free"), 
                          x = list(rot = 90)),
            layout = c(5, 1), 
            auto.key = list(columns = 2))
dev.off()

## png 
##   2

# График рассеивания
jpeg("pairs_plot.jpg", width = 1000, height = 800)
featurePlot(x, y, plot = "pairs", 
            auto.key = list(columns = 2))
dev.off()

## png 
##   2

# Эллипсы
jpeg("ellipse_plot.jpg", width = 1000, height = 800)
featurePlot(x, y, plot = "ellipse", 
            auto.key = list(columns = 2))
dev.off()

## png 
##   2

Задание 2: Определение важности признаков с использованием FSelector

# Установка и загрузка пакетов
# install.packages(c("FSelector", "randomForest"))
library(FSelector)
library(randomForest)

# Загрузка данных
data(iris)

# Information Gain
weights_info_gain <- information.gain(Species ~ ., iris)
weights_info_gain_sorted <- weights_info_gain[order(-weights_info_gain$attr_importance), ]
print(weights_info_gain_sorted)

## [1] 0.9554360 0.9402853 0.4521286 0.2672750

# Gain Ratio
weights_gain_ratio <- gain.ratio(Species ~ ., iris)
weights_gain_ratio_sorted <- weights_gain_ratio[order(-weights_gain_ratio$attr_importance), ]
print(weights_gain_ratio_sorted)

## [1] 0.8713692 0.8584937 0.4196464 0.2472972

# Chi-squared
weights_chi_squared <- chi.squared(Species ~ ., iris)
weights_chi_squared_sorted <- weights_chi_squared[order(-weights_chi_squared$attr_importance), ]
print(weights_chi_squared_sorted)

## [1] 0.9432359 0.9346311 0.6288067 0.4922162

# Relief
weights_relief <- relief(Species ~ ., iris, neighbours.count = 5, sample.size = 150)
weights_relief_sorted <- weights_relief[order(-weights_relief$attr_importance), ]
print(weights_relief_sorted)

## [1] 0.3349040 0.3327500 0.1477037 0.1408889

# Random Forest
rf_model <- randomForest(Species ~ ., data = iris, importance = TRUE)
rf_importance <- importance(rf_model)
print(rf_importance)

##                 setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length  6.305187  7.0503115  8.697237            10.588170
## Sepal.Width   4.430627  0.8701779  3.867456             4.632256
## Petal.Length 22.679951 34.7088400 27.365468            34.327847
## Petal.Width  22.111779 31.0510571 29.260476            32.226306
##              MeanDecreaseGini
## Sepal.Length         9.609120
## Sepal.Width          2.246639
## Petal.Length        42.443026
## Petal.Width         44.929924

# Визуализация результатов
methods <- c("Information Gain", "Gain Ratio", "Chi-squared", "Relief")
features <- rownames(weights_info_gain)

# Создание матрицы важности
importance_matrix <- matrix(0, nrow = length(features), ncol = length(methods))
colnames(importance_matrix) <- methods
rownames(importance_matrix) <- features

# Заполнение матрицы
importance_matrix[, "Information Gain"] <- weights_info_gain$attr_importance
importance_matrix[, "Gain Ratio"] <- weights_gain_ratio$attr_importance
importance_matrix[, "Chi-squared"] <- weights_chi_squared$attr_importance
importance_matrix[, "Relief"] <- weights_relief$attr_importance

# Построение графика
jpeg("feature_importance.jpg", width = 1000, height = 600)
barplot(t(importance_matrix), beside = TRUE, col = rainbow(4),
        main = "Важность признаков по разным методам",
        xlab = "Признаки", ylab = "Важность")
legend("topright", methods, fill = rainbow(4))
dev.off()

## png 
##   2

k_best <- 2
selected_attrs_k_best <- cutoff.k.percent(weights_info_gain, k = k_best)
print(selected_attrs_k_best)

## [1] "Petal.Width"  "Petal.Length" "Sepal.Length" "Sepal.Width"

cfs_subset <- cfs(Species ~ ., iris)
print(cfs_subset)

## [1] "Petal.Length" "Petal.Width"

Задание 3: Дискретизация непрерывных переменных

# Установка и загрузка пакетов
# install.packages("arules")
library(arules)

# Метод равных интервалов
sepal_length_interval <- discretize(iris$Sepal.Length, method = "interval", breaks = 3)
table(sepal_length_interval)

## sepal_length_interval
## [4.3,5.5) [5.5,6.7) [6.7,7.9] 
##        52        70        28

jpeg("interval_discretization.jpg", width = 800, height = 600)
plot(iris$Sepal.Length, col = as.integer(sepal_length_interval),
     main = "Дискретизация методом равных интервалов",
     xlab = "Индекс наблюдения", ylab = "Sepal.Length")
legend("topright", legend = levels(sepal_length_interval),
       fill = 1:length(levels(sepal_length_interval)))
dev.off()

## png 
##   2

# Метод равной частоты
sepal_width_frequency <- discretize(iris$Sepal.Width, method = "frequency", breaks = 3)
table(sepal_width_frequency)

## sepal_width_frequency
##   [2,2.9) [2.9,3.2) [3.2,4.4] 
##        47        47        56

jpeg("frequency_discretization.jpg", width = 800, height = 600)
plot(iris$Sepal.Width, col = as.integer(sepal_width_frequency),
     main = "Дискретизация методом равной частоты",
     xlab = "Индекс наблюдения", ylab = "Sepal.Width")
legend("topright", legend = levels(sepal_width_frequency),
       fill = 1:length(levels(sepal_width_frequency)))
dev.off()

## png 
##   2

# Метод кластеризации
petal_length_cluster <- discretize(iris$Petal.Length, method = "cluster", breaks = 3)
table(petal_length_cluster)

## petal_length_cluster
##    [1,2.95) [2.95,5.13)  [5.13,6.9] 
##          50          66          34

jpeg("cluster_discretization.jpg", width = 800, height = 600)
plot(iris$Petal.Length, col = as.integer(petal_length_cluster),
     main = "Дискретизация методом кластеризации",
     xlab = "Индекс наблюдения", ylab = "Petal.Length")
legend("topright", legend = levels(petal_length_cluster),
       fill = 1:length(levels(petal_length_cluster)))
dev.off()

## png 
##   2

# Метод фиксированных границ
petal_width_fixed <- discretize(iris$Petal.Width, method = "fixed", breaks = c(0.1, 1.0, 1.8, 2.5))
table(petal_width_fixed)

## petal_width_fixed
##   [0.1,1)   [1,1.8) [1.8,2.5] 
##        50        54        46

jpeg("fixed_discretization.jpg", width = 800, height = 600)
plot(iris$Petal.Width, col = as.integer(petal_width_fixed),
     main = "Дискретизация методом фиксированных границ",
     xlab = "Индекс наблюдения", ylab = "Petal.Width")
legend("topright", legend = levels(petal_width_fixed),
       fill = 1:length(levels(petal_width_fixed)))
dev.off()

## png 
##   2

Задание 4: Выбор признаков с использованием алгоритма Boruta

# Установка и загрузка пакетов
# install.packages("Boruta")
library(Boruta)
library(mlbench)

# Загрузка и подготовка данных
data("Ozone")
Ozone_clean <- na.omit(Ozone)

# Применение алгоритма Boruta
set.seed(123)
boruta_output <- Boruta(V4 ~ ., data = Ozone_clean)
print(boruta_output)

## Boruta performed 24 iterations in 1.244062 secs.
##  9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
##  3 attributes confirmed unimportant: V2, V3, V6;

# Получение статистики по признакам
boruta_signif <- attStats(boruta_output)
boruta_signif <- boruta_signif[order(-boruta_signif$meanImp), ]
print(boruta_signif)

##        meanImp  medianImp     minImp     maxImp  normHits  decision
## V9  19.2281405 19.0627349 17.5889826 20.9190449 1.0000000 Confirmed
## V8  17.1647491 17.2255744 16.0336735 18.5525852 1.0000000 Confirmed
## V12 14.6326841 14.6095338 13.5595253 16.0775580 1.0000000 Confirmed
## V11 11.8977619 11.8484607 10.9347533 13.6520570 1.0000000 Confirmed
## V7  11.7026875 11.5169965 10.5127703 13.4896943 1.0000000 Confirmed
## V10  9.8662368  9.7266893  8.6477478 11.3131795 1.0000000 Confirmed
## V1   9.5563296  9.7071000  8.4255686 10.7247899 1.0000000 Confirmed
## V13  9.4438214  9.5489762  8.1005306 10.7881019 1.0000000 Confirmed
## V5   9.2426781  9.2313179  8.1108460 10.5140883 1.0000000 Confirmed
## V2   1.1557680  1.1576551 -0.2474598  2.7423660 0.1666667  Rejected
## V6   0.9886679  1.3615721 -1.1013954  1.9852132 0.0000000  Rejected
## V3  -0.9877372 -0.7333367 -3.4162909  0.3794342 0.0000000  Rejected

# Визуализация результатов
jpeg("boruta_plot.jpg", width = 1000, height = 800)
plot(boruta_output, xlab = "", xaxt = "n")
dev.off()

## png 
##   2

# Боксплот важности признаков
jpeg("boruta_boxplot.jpg", width = 1000, height = 800)
lz <- lapply(1:ncol(boruta_output$ImpHistory), function(i) 
  boruta_output$ImpHistory[, i])
names(lz) <- colnames(boruta_output$ImpHistory)
boxplot(lz, main = "Важность признаков (Boruta)", 
        ylab = "Значимость", las = 2, cex.axis = 0.7)
dev.off()

## png 
##   2