# Лабораторная работа по анализу данных
# ---------- TASK 1 ----------
# 1. Synthetic data: a 50 x 5 matrix of standard-normal draws plus a
#    two-level class label that alternates "A", "B" across the 50 rows.
set.seed(123)
x <- matrix(rnorm(250), nrow = 50, ncol = 5)
y <- factor(rep(c("A", "B"), times = 25))

# 2. Plots: each figure is written to its own JPEG file.

# Scatterplot matrix. The factor codes 1/2 (levels "A"/"B") select red/blue.
jpeg(filename = "graph1_pairs.jpg", width = 1000, height = 800)
pairs(
  x,
  col = c("red", "blue")[as.integer(y)],
  pch = 19,
  main = "Матрица диаграмм рассеяния"
)
dev.off()

# Kernel density estimate of every column, laid out on a 2 x 3 grid.
jpeg(filename = "graph2_density.jpg", width = 800, height = 600)
par(mfrow = c(2, 3))
for (i in seq_len(ncol(x))) {
  plot(density(x[, i]), main = paste("Плотность X", i))
}
dev.off()

# Per-class boxplots of every column, side by side in a single row.
jpeg(filename = "graph3_boxplot.jpg", width = 800, height = 600)
par(mfrow = c(1, 5))
for (i in seq_len(ncol(x))) {
  boxplot(
    x[, i] ~ y,
    main = paste("Boxplot X", i),
    col = c("red", "blue")
  )
}
dev.off()

cat("Графики сохранены\n")
# ---------- TASK 2 ----------
# Boruta feature selection on iris with Species as the response and every
# other column as a candidate predictor. The fit summary is printed and the
# importance plot is saved as a JPEG.
library("Boruta")
data("iris")

# Boruta is randomised, so the seed is fixed right before the run.
set.seed(123)
boruta_result <- Boruta(
  Species ~ .,
  data = iris,
  doTrace = 0
)
print(boruta_result)

jpeg(filename = "iris_boruta_importance.jpg", width = 800, height = 600)
plot(
  boruta_result,
  main = "Важность признаков для Iris"
)
dev.off()
# ---------- TASK 3 ----------
# Discretize iris Petal.Length with four arules::discretize() strategies and
# print how many observations fall into each resulting bin.
library(arules)
data(iris)
continuous_var <- iris$Petal.Length

# The number of bins is passed via `breaks`. The older `categories` argument
# is deprecated and emits a warning on every call.

# Equal-width bins.
cat("1. Interval:\n")
disc_interval <- discretize(continuous_var, method = "interval", breaks = 4)
print(table(disc_interval))

# Equal-frequency bins.
cat("\n2. Frequency:\n")
disc_frequency <- discretize(continuous_var, method = "frequency", breaks = 4)
print(table(disc_frequency))

# k-means based bins. k-means starts from randomly chosen centers, so the RNG
# is seeded here to make the cluster boundaries reproducible.
cat("\n3. Cluster:\n")
set.seed(123)
disc_cluster <- discretize(continuous_var, method = "cluster", breaks = 4)
print(table(disc_cluster))

# User-defined cut points with custom labels.
# NOTE(review): in the recorded run the first bin (values below 1.0) held 0
# observations while the other bins held 50, 54 and 46; revisit the first
# cut point if that bin is meant to be populated.
cat("\n4. Fixed:\n")
disc_fixed <- discretize(
  continuous_var,
  method = "fixed",
  breaks = c(-Inf, 1.0, 3.0, 5.0, Inf),
  labels = c("очень короткий", "короткий", "средний", "длинный")
)
print(table(disc_fixed))
# ---------- TASK 4 ----------
# Boruta feature selection on the mlbench Ozone data with V4 as the response
# and all remaining columns as candidate predictors. The fit summary is
# printed and the importance plot is saved as a JPEG.
library(Boruta)
library(mlbench)
data(Ozone)

# Keep only the rows that have no missing values.
ozone_clean <- na.omit(Ozone)

# Boruta is randomised. Seed immediately before the run (as the seeded iris
# run earlier in this script does) so the selected attributes do not depend
# on how much random state the preceding sections consumed.
set.seed(123)
boruta_ozone <- Boruta(V4 ~ ., data = ozone_clean, doTrace = 0)
print(boruta_ozone)

jpeg("boruta_ozone_boxplot.jpg", width = 1000, height = 600)
plot(
  boruta_ozone,
  xlab = "Признаки (V1-V13)",
  ylab = "Важность (Z-score)",
  main = "Отбор признаков для прогноза концентрации озона"
)
dev.off()