library(caret)
# Reproducible synthetic data: 50 observations of 5 standard-normal
# predictors (Var1..Var5) and a two-level class label that alternates A, B.
# The predictors are drawn independently of the labels.
set.seed(123)
x <- matrix(
  rnorm(250),
  nrow = 50,
  dimnames = list(NULL, paste0("Var", seq_len(5)))
)
y <- factor(rep(c("A", "B"), times = 25))
# Visual check of class separation: first a scatter-plot matrix of all
# predictors coloured by class, then per-predictor box plots grouped by class.
featurePlot(x, y, plot = "pairs")
featurePlot(x, y, plot = "box")
Вывод: признаки случайные, классы не разделяются.
library(FSelector)
# Score every iris predictor by its information gain with respect to Species
# and show the resulting one-column table (attr_importance).
data("iris")
weights <- information.gain(Species ~ ., iris)
print(weights)
## attr_importance
## Sepal.Length 0.4521286
## Sepal.Width 0.2672750
## Petal.Length 0.9402853
## Petal.Width 0.9554360
library(ggplot2)
# Horizontal bar chart of the information-gain scores.
# The feature names live in the row names of `weights`, so they are copied
# into a regular column that ggplot2 can map to an aesthetic.
weights_df <- as.data.frame(weights)
weights_df$Feature <- rownames(weights_df)
ggplot(
  weights_df,
  # reorder() sorts the bars by importance instead of alphabetically.
  aes(x = reorder(Feature, attr_importance), y = attr_importance)
) +
  # geom_col() draws bars whose heights are the values in the data; it is the
  # idiomatic replacement for geom_bar(stat = "identity").
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Признак", y = "Важность")
Вывод: Petal.Length и Petal.Width наиболее информативны.
library(arules)
## Загрузка требуемого пакета: Matrix
##
## Присоединяю пакет: 'arules'
## Следующие объекты скрыты от 'package:base':
##
## abbreviate, write
# Equal-width binning of Petal.Length into three intervals.
# `breaks` replaces the deprecated `categories` argument, so the call no
# longer raises the deprecation warning.
interval_disc <- discretize(
  iris$Petal.Length,
  method = "interval",
  breaks = 3
)
table(interval_disc)
## interval_disc
## [1,2.97) [2.97,4.93) [4.93,6.9]
## 50 54 46
# Equal-frequency binning of Petal.Length into three intervals.
# `breaks` replaces the deprecated `categories` argument, so the call no
# longer raises the deprecation warning.
freq_disc <- discretize(
  iris$Petal.Length,
  method = "frequency",
  breaks = 3
)
table(freq_disc)
## freq_disc
## [1,2.63) [2.63,4.9) [4.9,6.9]
## 50 49 51
# Cluster-based binning of Petal.Length into three intervals.
# `breaks` replaces the deprecated `categories` argument, so the call no
# longer raises the deprecation warning.
# NOTE(review): the cluster method is presumably k-means based and therefore
# depends on the RNG state; consider calling set.seed() first -- confirm.
cluster_disc <- discretize(
  iris$Petal.Length,
  method = "cluster",
  breaks = 3
)
table(cluster_disc)
## cluster_disc
## [1,2.85) [2.85,4.89) [4.89,6.9]
## 50 49 51
# Bin Petal.Length at hand-picked cut points (0, 2.5, 5 and 8) and count the
# observations that fall into each resulting interval.
fixed_disc <- discretize(
  iris[["Petal.Length"]],
  breaks = c(0, 2.5, 5, 8),
  method = "fixed"
)
table(fixed_disc)
## fixed_disc
## [0,2.5) [2.5,5) [5,8]
## 50 54 46
Вывод: разные методы дискретизации дают разные границы интервалов и, как следствие, разное распределение наблюдений по категориям.
library(Boruta)
library(mlbench)
# Load the Ozone data and drop every row that contains a missing value.
data("Ozone")
Ozone <- na.omit(Ozone)
# Fix the RNG state so the Boruta run can be repeated.
set.seed(123)
# Run Boruta feature selection with V1 as the response and all remaining
# columns as candidate predictors; doTrace = 2 reports progress per iteration.
# NOTE(review): confirm that V1 is the intended target variable.
boruta_result <- Boruta(
  V1 ~ .,
  data = Ozone,
  doTrace = 2
)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## After 11 iterations, +0.92 secs:
## confirmed 10 attributes: V11, V12, V13, V2, V4 and 5 more;
## rejected 1 attribute: V3;
## still have 1 attribute left.
## 12. run of importance source...
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## 16. run of importance source...
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## 21. run of importance source...
## 22. run of importance source...
## 23. run of importance source...
## 24. run of importance source...
## 25. run of importance source...
## 26. run of importance source...
## 27. run of importance source...
## 28. run of importance source...
## 29. run of importance source...
## 30. run of importance source...
## 31. run of importance source...
## 32. run of importance source...
## 33. run of importance source...
## 34. run of importance source...
## 35. run of importance source...
## 36. run of importance source...
## 37. run of importance source...
## 38. run of importance source...
## 39. run of importance source...
## 40. run of importance source...
## 41. run of importance source...
## 42. run of importance source...
## 43. run of importance source...
## 44. run of importance source...
## 45. run of importance source...
## 46. run of importance source...
## 47. run of importance source...
## 48. run of importance source...
## 49. run of importance source...
## 50. run of importance source...
## 51. run of importance source...
## 52. run of importance source...
## 53. run of importance source...
## 54. run of importance source...
## 55. run of importance source...
## 56. run of importance source...
## 57. run of importance source...
## 58. run of importance source...
## 59. run of importance source...
## 60. run of importance source...
## 61. run of importance source...
## 62. run of importance source...
## 63. run of importance source...
## 64. run of importance source...
## 65. run of importance source...
## 66. run of importance source...
## 67. run of importance source...
## 68. run of importance source...
## 69. run of importance source...
## 70. run of importance source...
## 71. run of importance source...
## 72. run of importance source...
## 73. run of importance source...
## 74. run of importance source...
## 75. run of importance source...
## 76. run of importance source...
## 77. run of importance source...
## 78. run of importance source...
## 79. run of importance source...
## 80. run of importance source...
## 81. run of importance source...
## 82. run of importance source...
## 83. run of importance source...
## 84. run of importance source...
## 85. run of importance source...
## 86. run of importance source...
## 87. run of importance source...
## 88. run of importance source...
## 89. run of importance source...
## 90. run of importance source...
## 91. run of importance source...
## 92. run of importance source...
## 93. run of importance source...
## 94. run of importance source...
## 95. run of importance source...
## 96. run of importance source...
## 97. run of importance source...
## 98. run of importance source...
## 99. run of importance source...
# Text summary of the Boruta decision for each attribute.
print(x = boruta_result)
## Boruta performed 99 iterations in 8.100563 secs.
## 10 attributes confirmed important: V11, V12, V13, V2, V4 and 5 more;
## 1 attributes confirmed unimportant: V3;
## 1 tentative attributes left: V10;
# Importance plot; las = 2 turns the axis labels perpendicular to the axis and
# cex.axis = 0.7 shrinks them so the attribute names fit.
plot(x = boruta_result, las = 2, cex.axis = 0.7)
Вывод: Boruta подтвердил значимость 10 признаков, признал V3 незначимым, а V10 оставил неопределённым (tentative).

## R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button, a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Per-column summary statistics of the built-in cars data set.
summary(object = cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.