# Лабораторная работа по анализу данных

# ---------- TASK 1 ----------

# 1. Synthetic data: a 50 x 5 matrix of standard-normal draws plus a
#    two-level class factor that alternates A, B, A, B, ... over the rows.
set.seed(123)
x <- matrix(rnorm(50 * 5), ncol = 5)
y <- factor(rep(c("A", "B"), times = 25))

# 2. Plots. Every figure goes to its own JPEG device, which is closed with
#    dev.off() right after drawing. The recorded run printed "png 2" after
#    each dev.off() call.

# Scatterplot matrix; points are coloured by class (A = red, B = blue).
jpeg(filename = "graph1_pairs.jpg", width = 1000, height = 800)
pairs(
  x,
  col = c("red", "blue")[as.integer(y)],
  pch = 19,
  main = "Матрица диаграмм рассеяния"
)
dev.off()

# Kernel density estimate of each column, laid out on a 2 x 3 grid.
jpeg(filename = "graph2_density.jpg", width = 800, height = 600)
par(mfrow = c(2, 3))
for (i in seq_len(ncol(x))) {
  plot(density(x[, i]), main = paste("Плотность X", i))
}
dev.off()

# Per-class boxplots of each column, side by side in a single row.
# The loop index stays `i` and the formula stays `x[, i] ~ y` because
# boxplot() builds its default axis labels from the formula text.
jpeg(filename = "graph3_boxplot.jpg", width = 800, height = 600)
par(mfrow = c(1, 5))
for (i in seq_len(ncol(x))) {
  boxplot(
    x[, i] ~ y,
    main = paste("Boxplot X", i),
    col = c("red", "blue")
  )
}
dev.off()

cat("Графики сохранены\n")
## Графики сохранены
# ---------- TASK 2 ----------

library(Boruta)
data(iris)

# Boruta feature selection with Species as the response and every other
# iris column as a candidate predictor. The seed is fixed right before the
# run; doTrace = 0 turns off progress messages.
set.seed(123)
boruta_result <- Boruta(
  Species ~ .,
  data = iris,
  doTrace = 0
)
print(boruta_result)
# Output recorded from the original run:
## Boruta performed 9 iterations in 1.940796 secs.
##  4 attributes confirmed important: Petal.Length, Petal.Width,
## Sepal.Length, Sepal.Width;
##  No attributes deemed unimportant.

# Save the importance plot of the Boruta result to a JPEG file.
# The recorded run printed "png 2" after dev.off().
jpeg(filename = "iris_boruta_importance.jpg", width = 800, height = 600)
plot(boruta_result, main = "Важность признаков для Iris")
dev.off()
# ---------- TASK 3 ----------

# Discretise iris$Petal.Length into four bins with four different methods
# from arules::discretize() and print the bin counts for each.
#
# In the recorded run, attaching arules also loaded Matrix and masked
# base::abbreviate and base::write.
library(arules)
data(iris)

continuous_var <- iris$Petal.Length

# The number of bins is passed as `breaks`. The original calls used the
# deprecated `categories` argument, which made every call emit
# "Parameter categories is deprecated. Use breaks instead!".

# 1. Equal-width intervals.
cat("1. Interval:\n")
## 1. Interval:
disc_interval <- discretize(continuous_var, method = "interval", breaks = 4)
print(table(disc_interval))
## disc_interval
##    [1,2.48) [2.48,3.95) [3.95,5.43)  [5.43,6.9] 
##          50          11          61          28

# 2. Equal-frequency bins.
cat("\n2. Frequency:\n")
## 
## 2. Frequency:
disc_frequency <- discretize(continuous_var, method = "frequency", breaks = 4)
print(table(disc_frequency))
## disc_frequency
##    [1,1.6) [1.6,4.35) [4.35,5.1)  [5.1,6.9] 
##         37         38         33         42

# 3. Cluster-based bins.
# NOTE(review): per the arules documentation this method is k-means based,
# so its cut points may depend on the RNG state at this point of the
# script - confirm whether an explicit seed is wanted here.
cat("\n3. Cluster:\n")
## 
## 3. Cluster:
disc_cluster <- discretize(continuous_var, method = "cluster", breaks = 4)
print(table(disc_cluster))
## disc_cluster
##    [1,2.67) [2.67,4.35) [4.35,5.36)  [5.36,6.9] 
##          50          25          45          30

# 4. User-defined cut points with descriptive labels.
# The recorded table shows the first bin (below 1.0) as empty: the
# interval labels above are closed on the left and start at 1, so no
# observation falls below the first cut point.
cat("\n4. Fixed:\n")
## 
## 4. Fixed:
disc_fixed <- discretize(
  continuous_var,
  method = "fixed",
  breaks = c(-Inf, 1.0, 3.0, 5.0, Inf),
  labels = c("очень короткий", "короткий", "средний", "длинный")
)
print(table(disc_fixed))
## disc_fixed
## очень короткий       короткий        средний        длинный 
##              0             50             54             46
# ---------- TASK 4 ----------

library(Boruta)
library(mlbench)
data(Ozone)

# Boruta is run on complete cases only: rows with any NA are dropped.
ozone_clean <- na.omit(Ozone)

# Boruta is stochastic, so the seed is fixed right before the run, as in
# the iris analysis. Without it the result depends on whatever RNG state
# the earlier sections of the script left behind.
set.seed(123)

# V4 is the response; all remaining columns are candidate predictors.
boruta_ozone <- Boruta(V4 ~ ., data = ozone_clean, doTrace = 0)
print(boruta_ozone)
# Output recorded from a run made before the explicit seed was added, so
# the iteration count and attribute decisions may differ:
## Boruta performed 24 iterations in 2.083554 secs.
##  9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
##  3 attributes confirmed unimportant: V2, V3, V6;

# Save the attribute-importance plot to a JPEG file.
# The recorded run printed "png 2" after dev.off().
jpeg("boruta_ozone_boxplot.jpg", width = 1000, height = 600)
plot(
  boruta_ozone,
  xlab = "Признаки (V1-V13)",
  ylab = "Важность (Z-score)",
  main = "Отбор признаков для прогноза концентрации озона"
)
dev.off()