Clustering

In this vignette, we explore clustering on generated data and the AUTO data set.

We will highlight a few aspects of ORE:

# Load the ORE library, hiding its start-up messages
suppressMessages(library(ORE))

# Turn off row ordering warnings
options(ore.warn.order = FALSE)

# Create an ORE Connection.
# Connection settings are read from environment variables so the vignette can
# be pointed at another database without editing credentials into the source.
# Each setting falls back to the demo value this vignette originally used, so
# running it with no variables set behaves exactly as before.
ore.connect(
  user        = Sys.getenv("ORE_USER", unset = "rquser"),
  conn_string = Sys.getenv("ORE_CONN_STRING", unset = "ORCLPDB"),
  host        = Sys.getenv("ORE_HOST", unset = "150.136.31.194"),
  password    = Sys.getenv("ORE_PASSWORD", unset = "rquser"),
  all         = TRUE
)


rm(list = ls())  # housekeeping: removes every object ls() reports in the workspace

Create demo data set

set.seed(123)          # fixed seed: every run draws the same points
options(digits = 4)    # print numbers with 4 significant digits

#-- simulate three 2-D Gaussian clusters of 500 points each

dat <- local({
  # Draw order matters for reproducibility: cluster 1, then 2, then 3.
  # Each matrix holds 1000 draws laid out as 500 rows x 2 columns.
  first  <- matrix(rnorm(1000, sd = 0.3), ncol = 2)              # mean 0 (default)
  second <- matrix(rnorm(1000, mean = 1, sd = 0.3), ncol = 2)
  third  <- matrix(rnorm(1000, mean = 2.5, sd = 0.4), ncol = 2)

  # Stack the clusters row-wise and expose the two coordinates as x and y
  pts <- rbind(first, second, third)
  data.frame(x = pts[, 1], y = pts[, 2])
})

#-- scatter plot of the generated points

plot(dat$x, dat$y)

#-- ore.push creates a temporary database table from the local data frame;
#-- the object is deleted when the db connection ends

X <- ore.push(dat)   # dat is already a data.frame, so no re-wrapping is needed

class(X)
## [1] "ore.frame"
## attr(,"package")
## [1] "OREbase"

K-Means Clustering

Build a k-means clustering model, requesting 3 clusters

# Fit the k-means model on X, requesting 3 centers (num.bins = 5)
km.mod1 <- ore.odmKMeans(
  formula     = ~.,
  data        = X,
  num.centers = 3,
  num.bins    = 5
)

# view the model summary
summary(km.mod1)
## 
## Call:
## ore.odmKMeans(formula = ~., data = X, num.centers = 3, num.bins = 5)
## 
## Settings: 
##                                                value
## clus.num.clusters                                  3
## block.growth                                       2
## conv.tolerance                                  0.01
## details                                  details.all
## distance                                   euclidean
## iterations                                         3
## min.pct.attr.support                             0.1
## num.bins                                           5
## random.seed                                        0
## split.criterion                             variance
## odms.details                             odms.enable
## odms.missing.value.treatment odms.missing.value.auto
## odms.sampling                  odms.sampling.disable
## prep.auto                                         ON
## 
## Centers: 
##        x      y
## 2 0.4979 0.5008
## 4 2.4985 2.1550
## 5 2.4140 2.8368

Use the model to assign a cluster to each row

# Score X with the model, asking for the class output and carrying the
# x and y columns along in the result
km.res1 <- predict(
  km.mod1, X,
  type              = "class",
  supplemental.cols = c("x", "y")
)

# view assignments (first 3 rows)
head(km.res1, 3)

#-- Visualize the cluster assignments and centroids

# retrieve data from database for visualization
km.res1.local <- ore.pull(km.res1)

# scatter plot of x against y, colored by the assigned CLUSTER_ID
with(km.res1.local,
     plot(data.frame(x = x, y = y), col = CLUSTER_ID))

# overlay the cluster centroids as large black asterisks (pch = 8)
points(km.mod1$centers2, col = "black", pch = 8, cex = 5)

#-- Score the data again, varying which details are returned

# view default prediction output
head(predict(km.mod1, X))

# ask for additional columns to be returned
tail(predict(km.mod1, X,
             type = c("class", "raw"),
             supplemental.cols = c("x", "y")),
     3)

# ask for only raw probabilities with supp data
tail(predict(km.mod1, X,
             type = "raw",
             supplemental.cols = c("x", "y")),
     3)