In this vignette, we explore clustering on generated data and the AUTO data set.
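We will highlight a few aspects of ORE, including pushing local data to the database with ore.push, building an in-database k-means model with ore.odmKMeans, and scoring data with predict.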
# Load the ORE library (startup messages are suppressed)
suppressMessages(library(ORE))

# Turn off row ordering warnings
options(ore.warn.order = FALSE)

# Create an ORE connection.
# Each connection detail can be overridden through an environment variable so
# that credentials do not have to be edited into the script. When a variable is
# unset, the original demo value is used, so default behaviour is unchanged.
ore.connect(
  user        = Sys.getenv("ORE_USER", unset = "rquser"),
  conn_string = Sys.getenv("ORE_CONN_STRING", unset = "ORCLPDB"),
  host        = Sys.getenv("ORE_HOST", unset = "150.136.31.194"),
  password    = Sys.getenv("ORE_PASSWORD", unset = "rquser"),
  all         = TRUE
)
# NOTE: the former `rm(list = ls())` "housekeeping" call was removed. It erased
# every object in the user's workspace, and nothing below relies on a clean
# workspace.
set.seed(123)       # enable repeatable results
options(digits = 4) # limit decimal output

#-- generate a data set with three clusters
# Each cluster is 500 points in two dimensions (1000 normal draws reshaped into
# two columns). The rnorm() calls stay in this order so that the seeded draws,
# and therefore the data, are reproducible.
dat <- rbind(
  matrix(rnorm(1000, sd = 0.3), ncol = 2),             # cluster 1
  matrix(rnorm(1000, mean = 1, sd = 0.3), ncol = 2),   # cluster 2
  matrix(rnorm(1000, mean = 2.5, sd = 0.4), ncol = 2)  # cluster 3
)
colnames(dat) <- c("x", "y")
dat <- data.frame(dat)

#-- view the clusters
plot(dat$x, dat$y)
#-- Push the local data to the database as a temporary table via ore.push;
#-- the pushed object is deleted when the db connection ends.
X <- ore.push(dat) # dat is already a data.frame, so no re-wrapping is needed
class(X)
## [1] "ore.frame"
## attr(,"package")
## [1] "OREbase"
#-- Build a k-means model on all columns of X, requesting 3 centers and 5 bins
km.mod1 <- ore.odmKMeans(
  formula     = ~.,
  data        = X,
  num.centers = 3,
  num.bins    = 5
)
summary(km.mod1) # view the model summary
##
## Call:
## ore.odmKMeans(formula = ~., data = X, num.centers = 3, num.bins = 5)
##
## Settings:
## value
## clus.num.clusters 3
## block.growth 2
## conv.tolerance 0.01
## details details.all
## distance euclidean
## iterations 3
## min.pct.attr.support 0.1
## num.bins 5
## random.seed 0
## split.criterion variance
## odms.details odms.enable
## odms.missing.value.treatment odms.missing.value.auto
## odms.sampling odms.sampling.disable
## prep.auto ON
##
## Centers:
## x y
## 2 0.4979 0.5008
## 4 2.4985 2.1550
## 5 2.4140 2.8368
#-- Score X with the model, asking for the class output and carrying x and y along
km.res1 <- predict(
  km.mod1, X,
  type = "class",
  supplemental.cols = c("x", "y")
)
head(km.res1, n = 3) # view assignments

#-- Visualize the cluster assignments and centroids
km.res1.local <- ore.pull(km.res1) # retrieve data from database for visualization
plot(
  data.frame(x = km.res1.local$x, y = km.res1.local$y),
  col = km.res1.local$CLUSTER_ID # colour each point by its assigned cluster id
)
points(km.mod1$centers2, col = "black", pch = 8, cex = 5) # plot the cluster centroids

#-- Score data retrieving different details
# view default prediction output
head(predict(km.mod1, X))
# ask for additional columns to be returned
tail(
  predict(km.mod1, X,
          type = c("class", "raw"),
          supplemental.cols = c("x", "y")),
  n = 3
)
# ask for only raw probabilities with supp data
tail(
  predict(km.mod1, X,
          type = "raw",
          supplemental.cols = c("x", "y")),
  n = 3
)