1. Работа с пакетом caret
# Load caret and collect the names of the entries returned by getModelInfo().
library(caret)
model_names <- names(getModelInfo())

# How many entries there are.
length(model_names)
## [1] 239

# Show the first twenty names.
head(model_names, n = 20L)
## [1] "ada" "AdaBag" "AdaBoost.M1" "adaboost" "amdai"
## [6] "ANFIS" "avNNet" "awnb" "awtan" "bag"
## [11] "bagEarth" "bagEarthGCV" "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "binda" "blackboost" "blasso"
Генерация данных
# Synthetic data set: 50 observations of 5 standard-normal predictors (X1..X5)
# plus a two-level class label that alternates A, B, A, B, ...
# The seed is fixed so that the random draws are repeatable.
set.seed(123)
x <- as.data.frame(matrix(rnorm(250), nrow = 50, ncol = 5))
names(x) <- sprintf("X%d", seq_len(ncol(x)))
y <- factor(rep(c("A", "B"), times = 25))
Графический анализ (featurePlot)
Boxplot
# Box plots of every predictor in x, split by the class label y.
# featurePlot() returns a lattice (trellis) object, and such objects are only
# drawn when they are printed. Printing explicitly makes the figure appear
# (and the JPEG non-empty) even when the code is run through source(), where
# top-level values are not auto-printed.
box_plot <- featurePlot(x = x, y = y, plot = "box")
print(box_plot)

# Write the same figure to disk.
jpeg("caret_boxplot.jpg")
print(box_plot)
dev.off()
## png
## 2
Density
# Per-class density curves of every predictor in x.
# featurePlot() returns a lattice (trellis) object, and such objects are only
# drawn when they are printed. Printing explicitly makes the figure appear
# (and the JPEG non-empty) even when the code is run through source(), where
# top-level values are not auto-printed.
density_plot <- featurePlot(x = x, y = y, plot = "density")
print(density_plot)

# Write the same figure to disk.
jpeg("caret_density.jpg")
print(density_plot)
dev.off()
## png
## 2
Pairs
# Scatter-plot matrix of the predictors in x, marked by the class label y.
# featurePlot() returns a lattice (trellis) object, and such objects are only
# drawn when they are printed. Printing explicitly makes the figure appear
# (and the JPEG non-empty) even when the code is run through source(), where
# top-level values are not auto-printed.
pairs_plot <- featurePlot(x = x, y = y, plot = "pairs")
print(pairs_plot)

# Write the same figure to disk.
jpeg("caret_pairs.jpg")
print(pairs_plot)
dev.off()
## png
## 2
2. Определение важности признаков (FSelector)
library(FSelector)
data(iris)

# Information Gain
# The comparison table and the bar chart further down read `ig_sorted`, so the
# information-gain scores have to be computed here, before the other measures.
# Scores are ordered from most to least important; drop = FALSE keeps the
# one-column data frame (and with it the feature names in the row names).
ig <- information.gain(Species ~ ., data = iris)
ig_sorted <- ig[order(-ig$attr_importance), , drop = FALSE]
ig_sorted
Gain Ratio
# Gain ratio of every predictor with respect to Species, highest score first.
# drop = FALSE keeps the result a one-column data frame, so the feature names
# stay available as row names.
gr <- gain.ratio(Species ~ ., iris)
gr_sorted <- gr[order(gr[["attr_importance"]], decreasing = TRUE), , drop = FALSE]
gr_sorted
## attr_importance
## Petal.Width 0.8713692
## Petal.Length 0.8584937
## Sepal.Length 0.4196464
## Sepal.Width 0.2472972
Chi-squared
# Chi-squared based importance of every predictor with respect to Species,
# highest score first. drop = FALSE keeps the result a one-column data frame,
# so the feature names stay available as row names.
chi <- chi.squared(Species ~ ., iris)
chi_sorted <- chi[order(chi[["attr_importance"]], decreasing = TRUE), , drop = FALSE]
chi_sorted
## attr_importance
## Petal.Width 0.9432359
## Petal.Length 0.9346311
## Sepal.Length 0.6288067
## Sepal.Width 0.4922162
Сравнение методов
# Side-by-side table of the three importance measures.
# Each *_sorted frame is ordered by its own score, so binding the columns by
# position would attach values to the wrong feature whenever two measures rank
# the features differently. Rows are therefore looked up by feature name,
# using the information-gain ordering as the row order of the table.
feature_order <- rownames(ig_sorted)
importance_compare <- data.frame(
  IG = ig_sorted[feature_order, "attr_importance"],
  GR = gr_sorted[feature_order, "attr_importance"],
  CHI = chi_sorted[feature_order, "attr_importance"],
  row.names = feature_order
)
importance_compare
## IG GR CHI
## Petal.Width 0.9554360 0.8713692 0.9432359
## Petal.Length 0.9402853 0.8584937 0.9346311
## Sepal.Length 0.4521286 0.4196464 0.6288067
## Sepal.Width 0.2672750 0.2472972 0.4922162
График
# Bar chart of the information-gain scores held in `ig_sorted` (which must
# already exist at this point). las = 2 turns the axis labels perpendicular to
# the axis so that the feature names do not overlap.
barplot(
  ig_sorted$attr_importance,
  names.arg = rownames(ig_sorted),
  las = 2,
  main = "Важность признаков (iris)"
)

# Save the chart to disk. The title is passed here as well, so that the file
# shows the same figure as the one drawn on screen.
jpeg("fselector_iris.jpg")
barplot(
  ig_sorted$attr_importance,
  names.arg = rownames(ig_sorted),
  las = 2,
  main = "Важность признаков (iris)"
)
dev.off()
## png
## 2
3. Дискретизация (arules)
library(arules)

# Discretize iris sepal length into three bins with four different methods.
x <- iris$Sepal.Length

# `breaks` is the current name of the "number of bins" argument; `categories`
# is its deprecated spelling, and the "fixed" call below already uses `breaks`.
disc_interval <- discretize(x, method = "interval", breaks = 3)
disc_frequency <- discretize(x, method = "frequency", breaks = 3)

# The cluster method is k-means based and therefore starts from random
# centres; the seed is fixed so the resulting bins are repeatable.
set.seed(123)
disc_cluster <- discretize(x, method = "cluster", breaks = 3)

# Explicit cut points with human-readable labels for the three intervals.
disc_fixed <- discretize(
  x,
  method = "fixed",
  breaks = c(4, 5.5, 6.5, 8),
  labels = c("short", "medium", "long")
)
Графики
# The four discretizations, keyed by the method name used as panel title.
disc_results <- list(
  interval = disc_interval,
  frequency = disc_frequency,
  cluster = disc_cluster,
  fixed = disc_fixed
)

# Draw one bar chart of bin counts per discretization on a 2 x 2 grid of the
# current graphics device. Every panel is titled with its method name, and the
# previous `mfrow` setting is restored on exit.
draw_disc_panels <- function(results) {
  old_par <- par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)
  for (method in names(results)) {
    barplot(table(results[[method]]), main = method)
  }
  invisible(results)
}

# On-screen figure.
draw_disc_panels(disc_results)

# Same figure written to disk; using the same helper keeps the panel titles,
# without which the four panels cannot be told apart.
jpeg("arules_discretize.jpg")
draw_disc_panels(disc_results)
dev.off()
## png
## 2
4. Отбор признаков (Boruta)
# Boruta feature selection on the Ozone data set shipped with mlbench.
library(Boruta)
library(mlbench)
data("Ozone")

# Work on complete rows only.
ozone_data <- na.omit(Ozone)

# V4 is the response and every other column is a candidate predictor.
# The seed is set right before the run so the result can be reproduced;
# doTrace = 0 is passed to keep the run quiet.
set.seed(123)
boruta_model <- Boruta(V4 ~ ., data = ozone_data, doTrace = 0)
boruta_model
## Boruta performed 24 iterations in 1.496624 secs.
## 9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
## 3 attributes confirmed unimportant: V2, V3, V6;
# Per-attribute summary of the Boruta run: importance statistics (meanImp,
# medianImp, minImp, maxImp), normHits and the final decision for each column.
attStats(boruta_model)
## meanImp medianImp minImp maxImp normHits decision
## V1 9.5563296 9.7071000 8.4255686 10.7247899 1.0000000 Confirmed
## V2 1.1557680 1.1576551 -0.2474598 2.7423660 0.1666667 Rejected
## V3 -0.9877372 -0.7333367 -3.4162909 0.3794342 0.0000000 Rejected
## V5 9.2426781 9.2313179 8.1108460 10.5140883 1.0000000 Confirmed
## V6 0.9886679 1.3615721 -1.1013954 1.9852132 0.0000000 Rejected
## V7 11.7026875 11.5169965 10.5127703 13.4896943 1.0000000 Confirmed
## V8 17.1647491 17.2255744 16.0336735 18.5525852 1.0000000 Confirmed
## V9 19.2281405 19.0627349 17.5889826 20.9190449 1.0000000 Confirmed
## V10 9.8662368 9.7266893 8.6477478 11.3131795 1.0000000 Confirmed
## V11 11.8977619 11.8484607 10.9347533 13.6520570 1.0000000 Confirmed
## V12 14.6326841 14.6095338 13.5595253 16.0775580 1.0000000 Confirmed
## V13 9.4438214 9.5489762 8.1005306 10.7881019 1.0000000 Confirmed
# Names of the attributes Boruta selected. withTentative = TRUE asks for
# attributes still marked tentative as well; in the run echoed here the list
# matches the nine attributes whose decision above is Confirmed.
getSelectedAttributes(boruta_model, withTentative = TRUE)
## [1] "V1" "V5" "V7" "V8" "V9" "V10" "V11" "V12" "V13"
График
# Plot the Boruta result on screen. las = 2 turns the axis labels
# perpendicular to the axis so the attribute names do not overlap.
plot(boruta_model, las = 2)

# Save the plot to disk with the same label orientation, so the file matches
# the figure drawn on screen.
jpeg("boruta_boxplot.jpg")
plot(boruta_model, las = 2)
dev.off()
## png
## 2