1. Работа с пакетом caret

library(caret)
# Identifiers of every model that caret knows about, followed by how many
# there are and a preview of the first twenty.
model_names <- names(x = getModelInfo())
length(x = model_names)
## [1] 239
head(x = model_names, n = 20L)
##  [1] "ada"         "AdaBag"      "AdaBoost.M1" "adaboost"    "amdai"      
##  [6] "ANFIS"       "avNNet"      "awnb"        "awtan"       "bag"        
## [11] "bagEarth"    "bagEarthGCV" "bagFDA"      "bagFDAGCV"   "bam"        
## [16] "bartMachine" "bayesglm"    "binda"       "blackboost"  "blasso"

Генерация данных

# Reproducible toy data: 50 rows of 5 standard-normal predictors named
# X1..X5, plus a two-level class label that alternates A, B, A, B, ...
set.seed(123)

x <- as.data.frame(matrix(rnorm(50 * 5), ncol = 5))
names(x) <- paste0("X", seq_len(5))

y <- factor(rep(c("A", "B"), times = 25))

Графический анализ (featurePlot)

Boxplot

# Box plots of every predictor, split by class. featurePlot() returns a
# lattice (trellis) object, and lattice figures are drawn only when the object
# is printed. Printing explicitly keeps the figure (and the saved file) from
# silently coming out empty when this code is run through source() or from
# inside a function or loop, where top-level auto-printing does not happen.
box_plot <- featurePlot(x = x, y = y, plot = "box")
print(box_plot)

# Write the same figure to disk; the file is finalised when the device closes.
jpeg("caret_boxplot.jpg")
print(box_plot)
dev.off()
## png 
##   2

Density

# Per-class density curves for every predictor. featurePlot() returns a
# lattice (trellis) object, and lattice figures are drawn only when the object
# is printed. Printing explicitly keeps the figure (and the saved file) from
# silently coming out empty when this code is run through source() or from
# inside a function or loop, where top-level auto-printing does not happen.
density_plot <- featurePlot(x = x, y = y, plot = "density")
print(density_plot)

# Write the same figure to disk; the file is finalised when the device closes.
jpeg("caret_density.jpg")
print(density_plot)
dev.off()
## png 
##   2

Pairs

# Scatter-plot matrix of the predictors, coloured by class. featurePlot()
# returns a lattice (trellis) object, and lattice figures are drawn only when
# the object is printed. Printing explicitly keeps the figure (and the saved
# file) from silently coming out empty when this code is run through source()
# or from inside a function or loop, where auto-printing does not happen.
pairs_plot <- featurePlot(x = x, y = y, plot = "pairs")
print(pairs_plot)

# Write the same figure to disk; the file is finalised when the device closes.
jpeg("caret_pairs.jpg")
print(pairs_plot)
dev.off()
## png 
##   2

2. Определение важности признаков (FSelector)

library(FSelector)
# Make the iris data set available in the workspace; the feature-importance
# examples below use Species as the response.
data(iris)

Information Gain

# Information gain of every predictor with respect to Species, listed from
# the highest score to the lowest. drop = FALSE keeps the one-column result a
# data frame so the attribute names survive as row names.
ig <- information.gain(formula = Species ~ ., data = iris)
ig_sorted <- ig[
  order(ig[["attr_importance"]], decreasing = TRUE), ,
  drop = FALSE
]
ig_sorted
##              attr_importance
## Petal.Width        0.9554360
## Petal.Length       0.9402853
## Sepal.Length       0.4521286
## Sepal.Width        0.2672750

Gain Ratio

# Gain ratio of every predictor with respect to Species, listed from the
# highest score to the lowest. drop = FALSE keeps the one-column result a
# data frame so the attribute names survive as row names.
gr <- gain.ratio(formula = Species ~ ., data = iris)
gr_sorted <- gr[
  order(gr[["attr_importance"]], decreasing = TRUE), ,
  drop = FALSE
]
gr_sorted
##              attr_importance
## Petal.Width        0.8713692
## Petal.Length       0.8584937
## Sepal.Length       0.4196464
## Sepal.Width        0.2472972

Chi-squared

# Chi-squared based importance of every predictor with respect to Species,
# listed from the highest score to the lowest. drop = FALSE keeps the
# one-column result a data frame so the attribute names survive as row names.
chi <- chi.squared(formula = Species ~ ., data = iris)
chi_sorted <- chi[
  order(chi[["attr_importance"]], decreasing = TRUE), ,
  drop = FALSE
]
chi_sorted
##              attr_importance
## Petal.Width        0.9432359
## Petal.Length       0.9346311
## Sepal.Length       0.6288067
## Sepal.Width        0.4922162

Сравнение методов

# Side-by-side comparison of the three importance scores.
# Each *_sorted table was ordered independently, so its rows are looked up by
# attribute name (using the information-gain ranking as the row order) rather
# than combined by position. Combining by position would put a score under the
# wrong attribute as soon as two methods rank the attributes differently.
feature_order <- rownames(ig_sorted)
importance_compare <- data.frame(
  IG = ig_sorted[feature_order, "attr_importance"],
  GR = gr_sorted[feature_order, "attr_importance"],
  CHI = chi_sorted[feature_order, "attr_importance"],
  row.names = feature_order
)
importance_compare
##                     IG        GR       CHI
## Petal.Width  0.9554360 0.8713692 0.9432359
## Petal.Length 0.9402853 0.8584937 0.9346311
## Sepal.Length 0.4521286 0.4196464 0.6288067
## Sepal.Width  0.2672750 0.2472972 0.4922162

График

# Bar chart of the information-gain scores, highest-scoring attribute first.
# The drawing code lives in one helper so the on-screen figure and the saved
# file are the same plot; the saved copy previously had no title.
#
# importance: data frame with an `attr_importance` column and the attribute
#   names as row names.
plot_ig_importance <- function(importance) {
  invisible(barplot(
    importance$attr_importance,
    names.arg = rownames(importance),
    las = 2, # axis labels perpendicular to the axis
    main = "Важность признаков (iris)"
  ))
}

plot_ig_importance(ig_sorted)

# Write the same figure to disk; the file is finalised when the device closes.
jpeg("fselector_iris.jpg")
plot_ig_importance(ig_sorted)
dev.off()
## png 
##   2

3. Дискретизация (arules)

library(arules)

# Discretize sepal length with four different strategies.
# NOTE: this reuses the name `x`, replacing the data frame of simulated
# predictors created earlier in the file.
x <- iris$Sepal.Length

# For the automatic methods `breaks` is the number of bins. The older
# `categories` argument is deprecated in current arules releases (it only
# raises a warning and is mapped onto `breaks`), and the "fixed" call below
# already relies on the `breaks` interface, so `breaks` is used throughout.
disc_interval <- discretize(x, method = "interval", breaks = 3)
disc_frequency <- discretize(x, method = "frequency", breaks = 3)
disc_cluster <- discretize(x, method = "cluster", breaks = 3)

# For method = "fixed", `breaks` holds the explicit bin boundaries and
# `labels` names the three resulting intervals.
disc_fixed <- discretize(
  x,
  method = "fixed",
  breaks = c(4, 5.5, 6.5, 8),
  labels = c("short", "medium", "long")
)

Графики

# Draw one bar chart of bin counts per discretization method on a 2 x 2 grid.
# A single helper produces both the on-screen figure and the saved file, so
# the saved panels carry the same method titles (they were previously
# untitled and could not be told apart).
#
# discretized: named list of discretized vectors; each name becomes the title
#   of the corresponding panel.
plot_discretizations <- function(discretized) {
  # Restore whatever layout was active before, even if a plot call fails.
  old_par <- par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)

  for (method in names(discretized)) {
    barplot(table(discretized[[method]]), main = method)
  }
  invisible(NULL)
}

discretized <- list(
  interval = disc_interval,
  frequency = disc_frequency,
  cluster = disc_cluster,
  fixed = disc_fixed
)

plot_discretizations(discretized)

# Write the same figure to disk; the file is finalised when the device closes.
jpeg("arules_discretize.jpg")
plot_discretizations(discretized)
dev.off()
## png 
##   2

4. Отбор признаков (Boruta)

library(Boruta)
library(mlbench)

# Load the Ozone data set and drop every row that contains a missing value.
data(Ozone)
ozone_data <- na.omit(Ozone)
# Seed the RNG right before fitting so the run can be repeated.
set.seed(123)
# Run Boruta with V4 as the response and all remaining columns as candidate
# predictors; doTrace = 0 requests no progress tracing.
boruta_model <- Boruta(V4 ~ ., data = ozone_data, doTrace = 0)
# Print the summary of the fitted Boruta object.
boruta_model
## Boruta performed 24 iterations in 1.496624 secs.
##  9 attributes confirmed important: V1, V10, V11, V12, V13 and 4 more;
##  3 attributes confirmed unimportant: V2, V3, V6;
# Per-attribute importance statistics (mean, median, min, max, hit rate)
# together with the final decision for each attribute.
attStats(boruta_model)
##        meanImp  medianImp     minImp     maxImp  normHits  decision
## V1   9.5563296  9.7071000  8.4255686 10.7247899 1.0000000 Confirmed
## V2   1.1557680  1.1576551 -0.2474598  2.7423660 0.1666667  Rejected
## V3  -0.9877372 -0.7333367 -3.4162909  0.3794342 0.0000000  Rejected
## V5   9.2426781  9.2313179  8.1108460 10.5140883 1.0000000 Confirmed
## V6   0.9886679  1.3615721 -1.1013954  1.9852132 0.0000000  Rejected
## V7  11.7026875 11.5169965 10.5127703 13.4896943 1.0000000 Confirmed
## V8  17.1647491 17.2255744 16.0336735 18.5525852 1.0000000 Confirmed
## V9  19.2281405 19.0627349 17.5889826 20.9190449 1.0000000 Confirmed
## V10  9.8662368  9.7266893  8.6477478 11.3131795 1.0000000 Confirmed
## V11 11.8977619 11.8484607 10.9347533 13.6520570 1.0000000 Confirmed
## V12 14.6326841 14.6095338 13.5595253 16.0775580 1.0000000 Confirmed
## V13  9.4438214  9.5489762  8.1005306 10.7881019 1.0000000 Confirmed
# Names of the attributes Boruta kept; withTentative = TRUE asks for
# attributes still marked tentative to be included as well.
getSelectedAttributes(boruta_model, withTentative = TRUE)
## [1] "V1"  "V5"  "V7"  "V8"  "V9"  "V10" "V11" "V12" "V13"

График

# Importance box plots for all attributes. las = 2 sets the axis labels
# perpendicular to the axis.
plot(boruta_model, las = 2)

# Save the figure with the same settings as the on-screen version; the saved
# copy previously omitted las = 2 and so did not match the displayed plot.
jpeg("boruta_boxplot.jpg")
plot(boruta_model, las = 2)
dev.off()
## png 
##   2