Лабораторная работа 2
Задание 1: Разведочный анализ данных с использованием пакета
CARET
# Установка и загрузка пакета caret
# install.packages("caret", dependencies = TRUE)
library(caret)
# Получение списка доступных методов
methods <- names(getModelInfo())
# Длина списка методов
length(methods)
## [1] 239
# Создание тестовых данных
set.seed(123)
x <- matrix(rnorm(50*5), ncol=5)
y <- factor(rep(c("A", "B"), 25))
# Графики плотности распределения
jpeg("density_plot.jpg", width = 1000, height = 600)
featurePlot(x, y, plot = "density",
scales = list(x = list(relation = "free"),
y = list(relation = "free")),
adjust = 1.5, pch = "|",
layout = c(5, 1),
auto.key = list(columns = 2))
dev.off()
## png
## 2
# Боксплоты
jpeg("box_plot.jpg", width = 1000, height = 600)
featurePlot(x, y, plot = "box",
scales = list(y = list(relation = "free"),
x = list(rot = 90)),
layout = c(5, 1),
auto.key = list(columns = 2))
dev.off()
## png
## 2
# График рассеивания
jpeg("pairs_plot.jpg", width = 1000, height = 800)
featurePlot(x, y, plot = "pairs",
auto.key = list(columns = 2))
dev.off()
## png
## 2
# Эллипсы
jpeg("ellipse_plot.jpg", width = 1000, height = 800)
featurePlot(x, y, plot = "ellipse",
auto.key = list(columns = 2))
dev.off()
## png
## 2
Задание 2: Определение важности признаков с использованием
FSelector
# Установка и загрузка пакетов
# install.packages(c("FSelector", "randomForest"))
library(FSelector)
library(randomForest)
# Загрузка данных
data(iris)
# Information Gain
weights_info_gain <- information.gain(Species ~ ., iris)
weights_info_gain_sorted <- weights_info_gain[order(-weights_info_gain$attr_importance), ]
print(weights_info_gain_sorted)
## [1] 0.9554360 0.9402853 0.4521286 0.2672750
# Gain Ratio
weights_gain_ratio <- gain.ratio(Species ~ ., iris)
weights_gain_ratio_sorted <- weights_gain_ratio[order(-weights_gain_ratio$attr_importance), ]
print(weights_gain_ratio_sorted)
## [1] 0.8713692 0.8584937 0.4196464 0.2472972
# Chi-squared
weights_chi_squared <- chi.squared(Species ~ ., iris)
weights_chi_squared_sorted <- weights_chi_squared[order(-weights_chi_squared$attr_importance), ]
print(weights_chi_squared_sorted)
## [1] 0.9432359 0.9346311 0.6288067 0.4922162
# Relief
weights_relief <- relief(Species ~ ., iris, neighbours.count = 5, sample.size = 150)
weights_relief_sorted <- weights_relief[order(-weights_relief$attr_importance), ]
print(weights_relief_sorted)
## [1] 0.3349040 0.3327500 0.1477037 0.1408889
# Random Forest
rf_model <- randomForest(Species ~ ., data = iris, importance = TRUE)
rf_importance <- importance(rf_model)
print(rf_importance)
## setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length 6.305187 7.0503115 8.697237 10.588170
## Sepal.Width 4.430627 0.8701779 3.867456 4.632256
## Petal.Length 22.679951 34.7088400 27.365468 34.327847
## Petal.Width 22.111779 31.0510571 29.260476 32.226306
## MeanDecreaseGini
## Sepal.Length 9.609120
## Sepal.Width 2.246639
## Petal.Length 42.443026
## Petal.Width 44.929924
# Визуализация результатов
methods <- c("Information Gain", "Gain Ratio", "Chi-squared", "Relief")
features <- rownames(weights_info_gain)
# Создание матрицы важности
importance_matrix <- matrix(0, nrow = length(features), ncol = length(methods))
colnames(importance_matrix) <- methods
rownames(importance_matrix) <- features
# Заполнение матрицы
importance_matrix[, "Information Gain"] <- weights_info_gain$attr_importance
importance_matrix[, "Gain Ratio"] <- weights_gain_ratio$attr_importance
importance_matrix[, "Chi-squared"] <- weights_chi_squared$attr_importance
importance_matrix[, "Relief"] <- weights_relief$attr_importance
# Построение графика
jpeg("feature_importance.jpg", width = 1000, height = 600)
barplot(t(importance_matrix), beside = TRUE, col = rainbow(4),
main = "Важность признаков по разным методам",
xlab = "Признаки", ylab = "Важность")
legend("topright", methods, fill = rainbow(4))
dev.off()
## png
## 2
k_best <- 2
selected_attrs_k_best <- cutoff.k.percent(weights_info_gain, k = k_best)
print(selected_attrs_k_best)
## [1] "Petal.Width" "Petal.Length" "Sepal.Length" "Sepal.Width"
cfs_subset <- cfs(Species ~ ., iris)
print(cfs_subset)
## [1] "Petal.Length" "Petal.Width"
Задание 3: Дискретизация непрерывных переменных
# Установка и загрузка пакетов
# install.packages("arules")
library(arules)
# Метод равных интервалов
sepal_length_interval <- discretize(iris$Sepal.Length, method = "interval", breaks = 3)
table(sepal_length_interval)
## sepal_length_interval
## [4.3,5.5) [5.5,6.7) [6.7,7.9]
## 52 70 28
jpeg("interval_discretization.jpg", width = 800, height = 600)
plot(iris$Sepal.Length, col = as.integer(sepal_length_interval),
main = "Дискретизация методом равных интервалов",
xlab = "Индекс наблюдения", ylab = "Sepal.Length")
legend("topright", legend = levels(sepal_length_interval),
fill = 1:length(levels(sepal_length_interval)))
dev.off()
## png
## 2
# Метод равной частоты
sepal_width_frequency <- discretize(iris$Sepal.Width, method = "frequency", breaks = 3)
table(sepal_width_frequency)
## sepal_width_frequency
## [2,2.9) [2.9,3.2) [3.2,4.4]
## 47 47 56
jpeg("frequency_discretization.jpg", width = 800, height = 600)
plot(iris$Sepal.Width, col = as.integer(sepal_width_frequency),
main = "Дискретизация методом равной частоты",
xlab = "Индекс наблюдения", ylab = "Sepal.Width")
legend("topright", legend = levels(sepal_width_frequency),
fill = 1:length(levels(sepal_width_frequency)))
dev.off()
## png
## 2
# Метод кластеризации
petal_length_cluster <- discretize(iris$Petal.Length, method = "cluster", breaks = 3)
table(petal_length_cluster)
## petal_length_cluster
## [1,2.95) [2.95,5.13) [5.13,6.9]
## 50 66 34
jpeg("cluster_discretization.jpg", width = 800, height = 600)
plot(iris$Petal.Length, col = as.integer(petal_length_cluster),
main = "Дискретизация методом кластеризации",
xlab = "Индекс наблюдения", ylab = "Petal.Length")
legend("topright", legend = levels(petal_length_cluster),
fill = 1:length(levels(petal_length_cluster)))
dev.off()
## png
## 2
# Метод фиксированных границ
petal_width_fixed <- discretize(iris$Petal.Width, method = "fixed", breaks = c(0.1, 1.0, 1.8, 2.5))
table(petal_width_fixed)
## petal_width_fixed
## [0.1,1) [1,1.8) [1.8,2.5]
## 50 54 46
jpeg("fixed_discretization.jpg", width = 800, height = 600)
plot(iris$Petal.Width, col = as.integer(petal_width_fixed),
main = "Дискретизация методом фиксированных границ",
xlab = "Индекс наблюдения", ylab = "Petal.Width")
legend("topright", legend = levels(petal_width_fixed),
fill = 1:length(levels(petal_width_fixed)))
dev.off()
## png
## 2
Задание 4: Выбор признаков с использованием алгоритма Boruta
# Установка и загрузка пакетов
# install.packages("Boruta")
library(Boruta)
library(mlbench)
# Загрузка и подготовка данных
data("Ozone")
Ozone_clean <- na.omit(Ozone)
# Применение алгоритма Boruta
set.seed(123)
boruta_output <- Boruta(V4 ~ ., data = Ozone_clean)
print(boruta_output)
## Boruta performed 24 iterations in 1.244062 secs.
## 9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
## 3 attributes confirmed unimportant: V2, V3, V6;
# Получение статистики по признакам
boruta_signif <- attStats(boruta_output)
boruta_signif <- boruta_signif[order(-boruta_signif$meanImp), ]
print(boruta_signif)
## meanImp medianImp minImp maxImp normHits decision
## V9 19.2281405 19.0627349 17.5889826 20.9190449 1.0000000 Confirmed
## V8 17.1647491 17.2255744 16.0336735 18.5525852 1.0000000 Confirmed
## V12 14.6326841 14.6095338 13.5595253 16.0775580 1.0000000 Confirmed
## V11 11.8977619 11.8484607 10.9347533 13.6520570 1.0000000 Confirmed
## V7 11.7026875 11.5169965 10.5127703 13.4896943 1.0000000 Confirmed
## V10 9.8662368 9.7266893 8.6477478 11.3131795 1.0000000 Confirmed
## V1 9.5563296 9.7071000 8.4255686 10.7247899 1.0000000 Confirmed
## V13 9.4438214 9.5489762 8.1005306 10.7881019 1.0000000 Confirmed
## V5 9.2426781 9.2313179 8.1108460 10.5140883 1.0000000 Confirmed
## V2 1.1557680 1.1576551 -0.2474598 2.7423660 0.1666667 Rejected
## V6 0.9886679 1.3615721 -1.1013954 1.9852132 0.0000000 Rejected
## V3 -0.9877372 -0.7333367 -3.4162909 0.3794342 0.0000000 Rejected
# Визуализация результатов
jpeg("boruta_plot.jpg", width = 1000, height = 800)
plot(boruta_output, xlab = "", xaxt = "n")
dev.off()
## png
## 2
# Боксплот важности признаков
jpeg("boruta_boxplot.jpg", width = 1000, height = 800)
lz <- lapply(1:ncol(boruta_output$ImpHistory), function(i)
boruta_output$ImpHistory[, i])
names(lz) <- colnames(boruta_output$ImpHistory)
boxplot(lz, main = "Важность признаков (Boruta)",
ylab = "Значимость", las = 2, cex.axis = 0.7)
dev.off()
## png
## 2