3. С использованием функции discretize() из пакета arules выполните
преобразование непрерывной переменной в категориальную [3] различными
методами: «interval» (равная ширина интервала), «frequency» (равная
частота), «cluster» (кластеризация) и «fixed» (категории задают
границы интервалов). Используйте набор данных iris. Сделайте выводы.
# Install arules only when it is missing, so that re-running the script does
# not download and reinstall the package every time.
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules")
}
##
## The downloaded binary packages are in
## /var/folders/fs/4v998hvs7wn66j723xbq32pxv4ld55/T//Rtmpf57Zc8/downloaded_packages
# Attach arules; it provides discretize().
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Load the built-in iris data set.
data(iris)
# Equal-width binning of Sepal.Length into 3 intervals.
# `breaks` replaces the deprecated `categories` argument, which made
# discretize() emit a deprecation warning on every call.
disc_interval <- discretize(
  iris$Sepal.Length,
  method = "interval",
  breaks = 3
)
print(disc_interval)
## [1] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
## [8] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
## [15] [5.5,6.7) [5.5,6.7) [4.3,5.5) [4.3,5.5) [5.5,6.7) [4.3,5.5) [4.3,5.5)
## [22] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
## [29] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [5.5,6.7) [4.3,5.5)
## [36] [4.3,5.5) [5.5,6.7) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
## [43] [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5) [4.3,5.5)
## [50] [4.3,5.5) [6.7,7.9] [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7)
## [57] [5.5,6.7) [4.3,5.5) [5.5,6.7) [4.3,5.5) [4.3,5.5) [5.5,6.7) [5.5,6.7)
## [64] [5.5,6.7) [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7)
## [71] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9]
## [78] [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7)
## [85] [4.3,5.5) [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7)
## [92] [5.5,6.7) [5.5,6.7) [4.3,5.5) [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7)
## [99] [4.3,5.5) [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7)
## [106] [6.7,7.9] [4.3,5.5) [6.7,7.9] [6.7,7.9] [6.7,7.9] [5.5,6.7) [5.5,6.7)
## [113] [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9] [6.7,7.9]
## [120] [5.5,6.7) [6.7,7.9] [5.5,6.7) [6.7,7.9] [5.5,6.7) [6.7,7.9] [6.7,7.9]
## [127] [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9] [6.7,7.9] [6.7,7.9] [5.5,6.7)
## [134] [5.5,6.7) [5.5,6.7) [6.7,7.9] [5.5,6.7) [5.5,6.7) [5.5,6.7) [6.7,7.9]
## [141] [6.7,7.9] [6.7,7.9] [5.5,6.7) [6.7,7.9] [6.7,7.9] [6.7,7.9] [5.5,6.7)
## [148] [5.5,6.7) [5.5,6.7) [5.5,6.7)
## attr(,"discretized:breaks")
## [1] 4.3 5.5 6.7 7.9
## attr(,"discretized:method")
## [1] interval
## Levels: [4.3,5.5) [5.5,6.7) [6.7,7.9]
# Equal-frequency binning of Sepal.Length into 3 intervals that hold roughly
# the same number of observations.
# `breaks` replaces the deprecated `categories` argument, which made
# discretize() emit a deprecation warning on every call.
disc_interval <- discretize(
  iris$Sepal.Length,
  method = "frequency",
  breaks = 3
)
print(disc_interval)
## [1] [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4) [5.4,6.3) [4.3,5.4)
## [8] [4.3,5.4) [4.3,5.4) [4.3,5.4) [5.4,6.3) [4.3,5.4) [4.3,5.4) [4.3,5.4)
## [15] [5.4,6.3) [5.4,6.3) [5.4,6.3) [4.3,5.4) [5.4,6.3) [4.3,5.4) [5.4,6.3)
## [22] [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4)
## [29] [4.3,5.4) [4.3,5.4) [4.3,5.4) [5.4,6.3) [4.3,5.4) [5.4,6.3) [4.3,5.4)
## [36] [4.3,5.4) [5.4,6.3) [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4)
## [43] [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4) [4.3,5.4)
## [50] [4.3,5.4) [6.3,7.9] [6.3,7.9] [6.3,7.9] [5.4,6.3) [6.3,7.9] [5.4,6.3)
## [57] [6.3,7.9] [4.3,5.4) [6.3,7.9] [4.3,5.4) [4.3,5.4) [5.4,6.3) [5.4,6.3)
## [64] [5.4,6.3) [5.4,6.3) [6.3,7.9] [5.4,6.3) [5.4,6.3) [5.4,6.3) [5.4,6.3)
## [71] [5.4,6.3) [5.4,6.3) [6.3,7.9] [5.4,6.3) [6.3,7.9] [6.3,7.9] [6.3,7.9]
## [78] [6.3,7.9] [5.4,6.3) [5.4,6.3) [5.4,6.3) [5.4,6.3) [5.4,6.3) [5.4,6.3)
## [85] [5.4,6.3) [5.4,6.3) [6.3,7.9] [6.3,7.9] [5.4,6.3) [5.4,6.3) [5.4,6.3)
## [92] [5.4,6.3) [5.4,6.3) [4.3,5.4) [5.4,6.3) [5.4,6.3) [5.4,6.3) [5.4,6.3)
## [99] [4.3,5.4) [5.4,6.3) [6.3,7.9] [5.4,6.3) [6.3,7.9] [6.3,7.9] [6.3,7.9]
## [106] [6.3,7.9] [4.3,5.4) [6.3,7.9] [6.3,7.9] [6.3,7.9] [6.3,7.9] [6.3,7.9]
## [113] [6.3,7.9] [5.4,6.3) [5.4,6.3) [6.3,7.9] [6.3,7.9] [6.3,7.9] [6.3,7.9]
## [120] [5.4,6.3) [6.3,7.9] [5.4,6.3) [6.3,7.9] [6.3,7.9] [6.3,7.9] [6.3,7.9]
## [127] [5.4,6.3) [5.4,6.3) [6.3,7.9] [6.3,7.9] [6.3,7.9] [6.3,7.9] [6.3,7.9]
## [134] [6.3,7.9] [5.4,6.3) [6.3,7.9] [6.3,7.9] [6.3,7.9] [5.4,6.3) [6.3,7.9]
## [141] [6.3,7.9] [6.3,7.9] [5.4,6.3) [6.3,7.9] [6.3,7.9] [6.3,7.9] [6.3,7.9]
## [148] [6.3,7.9] [5.4,6.3) [5.4,6.3)
## attr(,"discretized:breaks")
## [1] 4.3 5.4 6.3 7.9
## attr(,"discretized:method")
## [1] frequency
## Levels: [4.3,5.4) [5.4,6.3) [6.3,7.9]
# Cluster-based binning of Sepal.Length into 3 intervals; the interval
# boundaries are derived from the clusters found in the data.
# `breaks` replaces the deprecated `categories` argument, which made
# discretize() emit a deprecation warning on every call.
disc_interval <- discretize(
  iris$Sepal.Length,
  method = "cluster",
  breaks = 3
)
print(disc_interval)
## [1] [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33) [5.33,6.27)
## [7] [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33) [5.33,6.27) [4.3,5.33)
## [13] [4.3,5.33) [4.3,5.33) [5.33,6.27) [5.33,6.27) [5.33,6.27) [4.3,5.33)
## [19] [5.33,6.27) [4.3,5.33) [5.33,6.27) [4.3,5.33) [4.3,5.33) [4.3,5.33)
## [25] [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33)
## [31] [4.3,5.33) [5.33,6.27) [4.3,5.33) [5.33,6.27) [4.3,5.33) [4.3,5.33)
## [37] [5.33,6.27) [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33)
## [43] [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33) [4.3,5.33)
## [49] [4.3,5.33) [4.3,5.33) [6.27,7.9] [6.27,7.9] [6.27,7.9] [5.33,6.27)
## [55] [6.27,7.9] [5.33,6.27) [6.27,7.9] [4.3,5.33) [6.27,7.9] [4.3,5.33)
## [61] [4.3,5.33) [5.33,6.27) [5.33,6.27) [5.33,6.27) [5.33,6.27) [6.27,7.9]
## [67] [5.33,6.27) [5.33,6.27) [5.33,6.27) [5.33,6.27) [5.33,6.27) [5.33,6.27)
## [73] [6.27,7.9] [5.33,6.27) [6.27,7.9] [6.27,7.9] [6.27,7.9] [6.27,7.9]
## [79] [5.33,6.27) [5.33,6.27) [5.33,6.27) [5.33,6.27) [5.33,6.27) [5.33,6.27)
## [85] [5.33,6.27) [5.33,6.27) [6.27,7.9] [6.27,7.9] [5.33,6.27) [5.33,6.27)
## [91] [5.33,6.27) [5.33,6.27) [5.33,6.27) [4.3,5.33) [5.33,6.27) [5.33,6.27)
## [97] [5.33,6.27) [5.33,6.27) [4.3,5.33) [5.33,6.27) [6.27,7.9] [5.33,6.27)
## [103] [6.27,7.9] [6.27,7.9] [6.27,7.9] [6.27,7.9] [4.3,5.33) [6.27,7.9]
## [109] [6.27,7.9] [6.27,7.9] [6.27,7.9] [6.27,7.9] [6.27,7.9] [5.33,6.27)
## [115] [5.33,6.27) [6.27,7.9] [6.27,7.9] [6.27,7.9] [6.27,7.9] [5.33,6.27)
## [121] [6.27,7.9] [5.33,6.27) [6.27,7.9] [6.27,7.9] [6.27,7.9] [6.27,7.9]
## [127] [5.33,6.27) [5.33,6.27) [6.27,7.9] [6.27,7.9] [6.27,7.9] [6.27,7.9]
## [133] [6.27,7.9] [6.27,7.9] [5.33,6.27) [6.27,7.9] [6.27,7.9] [6.27,7.9]
## [139] [5.33,6.27) [6.27,7.9] [6.27,7.9] [6.27,7.9] [5.33,6.27) [6.27,7.9]
## [145] [6.27,7.9] [6.27,7.9] [6.27,7.9] [6.27,7.9] [5.33,6.27) [5.33,6.27)
## attr(,"discretized:breaks")
## [1] 4.300000 5.332732 6.272161 7.900000
## attr(,"discretized:method")
## [1] cluster
## Levels: [4.3,5.33) [5.33,6.27) [6.27,7.9]
# Binning with user-defined interval boundaries.
# Sepal.Length ranges from 4.3 to 7.9, so the boundaries have to reach 8:
# with breaks ending at 7, every value above 7 fell outside all intervals and
# was returned as <NA>.
# NOTE(review): the printed result recorded below was produced with the old
# breaks c(4, 5, 6, 7); re-run the script to refresh it.
disc_interval <- discretize(
  iris$Sepal.Length,
  method = "fixed",
  breaks = c(4, 5, 6, 7, 8)
)
print(disc_interval)
## [1] [5,6) [4,5) [4,5) [4,5) [5,6) [5,6) [4,5) [5,6) [4,5) [4,5) [5,6) [4,5)
## [13] [4,5) [4,5) [5,6) [5,6) [5,6) [5,6) [5,6) [5,6) [5,6) [5,6) [4,5) [5,6)
## [25] [4,5) [5,6) [5,6) [5,6) [5,6) [4,5) [4,5) [5,6) [5,6) [5,6) [4,5) [5,6)
## [37] [5,6) [4,5) [4,5) [5,6) [5,6) [4,5) [4,5) [5,6) [5,6) [4,5) [5,6) [4,5)
## [49] [5,6) [5,6) [6,7] [6,7] [6,7] [5,6) [6,7] [5,6) [6,7] [4,5) [6,7] [5,6)
## [61] [5,6) [5,6) [6,7] [6,7] [5,6) [6,7] [5,6) [5,6) [6,7] [5,6) [5,6) [6,7]
## [73] [6,7] [6,7] [6,7] [6,7] [6,7] [6,7] [6,7] [5,6) [5,6) [5,6) [5,6) [6,7]
## [85] [5,6) [6,7] [6,7] [6,7] [5,6) [5,6) [5,6) [6,7] [5,6) [5,6) [5,6) [5,6)
## [97] [5,6) [6,7] [5,6) [5,6) [6,7] [5,6) <NA> [6,7] [6,7] <NA> [4,5) <NA>
## [109] [6,7] <NA> [6,7] [6,7] [6,7] [5,6) [5,6) [6,7] [6,7] <NA> <NA> [6,7]
## [121] [6,7] [5,6) <NA> [6,7] [6,7] <NA> [6,7] [6,7] [6,7] <NA> <NA> <NA>
## [133] [6,7] [6,7] [6,7] <NA> [6,7] [6,7] [6,7] [6,7] [6,7] [6,7] [5,6) [6,7]
## [145] [6,7] [6,7] [6,7] [6,7] [6,7] [5,6)
## attr(,"discretized:breaks")
## [1] 4 5 6 7
## attr(,"discretized:method")
## [1] fixed
## Levels: [4,5) [5,6) [6,7]
3. Установите пакет Boruta и проведите выбор признаков для набора
данных data("Ozone") [4, 5, 6]. Постройте график boxplot, сделайте
выводы.
# Install Boruta only when it is missing, so that re-running the script does
# not download and reinstall the package every time.
if (!requireNamespace("Boruta", quietly = TRUE)) {
  install.packages("Boruta")
}
##
## The downloaded binary packages are in
## /var/folders/fs/4v998hvs7wn66j723xbq32pxv4ld55/T//Rtmpf57Zc8/downloaded_packages
library(Boruta)
# NOTE(review): the task text names data("Ozone"), while this script works on
# the built-in airquality data set and its Ozone column. Confirm that this
# substitution is intended.
data("airquality")
# Show the first rows to inspect the columns and the missing values.
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Keep complete rows only.
airquality <- na.omit(airquality)

# Binary target: "High" when Ozone is above its median, otherwise "Low".
# No na.rm is needed because missing values were removed above.
ozone_median <- median(airquality$Ozone)
airquality$OzoneLevel <- factor(
  ifelse(airquality$Ozone > ozone_median, "High", "Low")
)

# OzoneLevel is computed directly from Ozone, so Ozone must not be offered as
# a predictor: it would trivially explain the target (target leakage) and
# distort the importance of the remaining attributes.
model_columns <- setdiff(names(airquality), "Ozone")

# Fixed seed so that the randomised Boruta run is reproducible.
set.seed(123)
# NOTE(review): the Boruta log and the selected attributes recorded below come
# from a run that still included Ozone as a predictor; re-run to refresh them.
result <- Boruta(
  OzoneLevel ~ .,
  data = airquality[, model_columns],
  doTrace = 2
)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## After 10 iterations, +0.097 secs:
## confirmed 4 attributes: Ozone, Solar.R, Temp, Wind;
## still have 2 attributes left.
## 11. run of importance source...
## 12. run of importance source...
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## 16. run of importance source...
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## 21. run of importance source...
## 22. run of importance source...
## 23. run of importance source...
## 24. run of importance source...
## 25. run of importance source...
## 26. run of importance source...
## 27. run of importance source...
## 28. run of importance source...
## 29. run of importance source...
## 30. run of importance source...
## 31. run of importance source...
## 32. run of importance source...
## 33. run of importance source...
## 34. run of importance source...
## 35. run of importance source...
## 36. run of importance source...
## 37. run of importance source...
## 38. run of importance source...
## 39. run of importance source...
## 40. run of importance source...
## 41. run of importance source...
## 42. run of importance source...
## 43. run of importance source...
## 44. run of importance source...
## 45. run of importance source...
## 46. run of importance source...
## 47. run of importance source...
## After 47 iterations, +0.51 secs:
## confirmed 1 attribute: Month;
## still have 1 attribute left.
## 48. run of importance source...
## 49. run of importance source...
## 50. run of importance source...
## 51. run of importance source...
## 52. run of importance source...
## 53. run of importance source...
## 54. run of importance source...
## 55. run of importance source...
## 56. run of importance source...
## 57. run of importance source...
## 58. run of importance source...
## 59. run of importance source...
## 60. run of importance source...
## 61. run of importance source...
## 62. run of importance source...
## After 62 iterations, +0.64 secs:
## confirmed 1 attribute: Day;
## no more attributes left.
# Summary of the Boruta run: number of iterations and the decision reached
# for the attributes.
print(result)
## Boruta performed 62 iterations in 0.640862 secs.
## 6 attributes confirmed important: Day, Month, Ozone, Solar.R, Temp and
## 1 more;
## No attributes deemed unimportant.
# Boxplot of attribute importance over the Boruta iterations. This is the
# feature-selection boxplot the task asks for, which was not drawn before.
plot(result)
# Names of the attributes confirmed as important (tentative ones excluded).
priznak <- getSelectedAttributes(result, withTentative = FALSE)
print(priznak)
## [1] "Ozone" "Solar.R" "Wind" "Temp" "Month" "Day"
# Distribution of solar radiation on days with high and low ozone levels.
boxplot(
  Solar.R ~ OzoneLevel,
  data = airquality,
  main = "Boxplot of Solar Radiation by Ozone Level",
  xlab = "Ozone Level",
  ylab = "Solar Radiation",
  col = c("lightblue", "lightgreen")
)

# Distribution of wind speed on days with high and low ozone levels.
boxplot(
  Wind ~ OzoneLevel,
  data = airquality,
  main = "Boxplot of Wind by Ozone Level",
  xlab = "Ozone Level",
  ylab = "Wind",
  col = c("lightblue", "lightgreen")
)
