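This function takes a vector “x” as input and returns its discretized version. The values of “x” are sorted and split into consecutive clusters of “step” elements each, and every value in a cluster is replaced by the median of that cluster. If “step” is even, the median is the average of the two middle elements. When the length of “x” is not a multiple of “step”, the leftover largest values are returned unchanged.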
#' Discretize a numeric vector by group medians
#'
#' Sorts `x`, splits the sorted values into consecutive groups of `step`
#' elements, and replaces every value in a group with that group's median.
#' For an even `step` the median is the mean of the two middle elements.
#' Values that do not fill a complete group (the `length(x) %% step` largest
#' ones) are returned unchanged.
#'
#' @param x A numeric vector. May be empty or shorter than `step`, in which
#'   case it is returned with no values changed.
#' @param step Group size: a single positive whole number.
#' @return An unnamed vector of the same length as `x`, in the original
#'   order of `x`, with grouped values replaced by their group median.
discretize <- function(x, step) {
  stopifnot(
    is.numeric(step), length(step) == 1L, !is.na(step),
    step >= 1, step == floor(step)
  )

  values <- unname(x)
  n_groups <- length(values) %/% step

  ## Nothing to discretize: empty input or fewer than `step` elements.
  if (n_groups == 0) {
    return(values)
  }

  ord <- order(values)
  sorted <- values[ord]

  ## Positions (in sorted order) of the lower and upper middle element of
  ## each group; they coincide when `step` is odd.
  starts <- (seq_len(n_groups) - 1) * step + 1
  lower_mid <- starts + floor((step - 1) / 2)
  upper_mid <- starts + ceiling((step - 1) / 2)
  medians <- (sorted[lower_mid] + sorted[upper_mid]) / 2

  ## Overwrite every complete group with its median; the tail is left as is.
  sorted[seq_len(n_groups * step)] <- rep(medians, each = step)

  ## Undo the sort so the result lines up with the input order.
  out <- sorted
  out[ord] <- sorted
  out
}
## Demo / smoke test for the function "discretize"
## (the two commented-out lines below are leftovers for dumping/sourcing it)
#dump("add2", file="discretize.R")
#source("discretize.R")

## Build a 30 x 3 matrix of normal draws with column means 0, 2 and 5.
## No seed is set in this snippet, so the printed values shown below come
## from one sample run and will differ on re-execution.
data <- cbind(rnorm(30, mean = 0), rnorm(30, mean = 2), rnorm(30, mean = 5))
head(data)
## [,1] [,2] [,3]
## [1,] 0.43961012 0.6509330 5.688139
## [2,] -0.02170716 3.4614587 7.039521
## [3,] -0.52733417 1.1165835 4.833965
## [4,] 0.18599627 2.7678167 5.186015
## [5,] 2.05573155 1.9584715 6.047592
## [6,] 0.89456137 0.9744364 5.127185

## Discretize each column independently using groups of 4 values.
discret_data <- apply(data, 2, discretize, step = 4)
head(discret_data)
## [,1] [,2] [,3]
## [1,] 0.45617732 0.599541 5.623603
## [2,] 0.08388144 3.050305 6.634291
## [3,] -0.52067768 1.169088 5.018349
## [4,] 0.08388144 2.514256 5.259404
## [5,] 2.05573155 1.946419 5.885117
## [6,] 0.86455971 0.599541 5.018349

## Compare the column distributions before and after discretization.
boxplot(data)
boxplot(discret_data)