Synopsis

This document includes examples of the Data Classification capabilities of ContextBase. “Data Classification” is the use of Machine Learning techniques to organize datasets into related sub-populations that were not previously specified in the dataset. This can uncover hidden characteristics within the data, and identify the hidden categories to which new data belongs.

ContextBase Classification Case Study 1

This Case Study demonstrates the use of Kmeans Clustering for the Classification of the cars within the Motor Trend Cars 1974 dataset. The Kmeans function in the R programming language was used to divide the dataset into 3 clusters. The centers of the clusters were determined by Miles Per Gallon, Number of Cylinders, and Engine Displacement.

# Load the built-in Motor Trend (1974) road-test data set
data(mtcars)

# Preview the three variables used for clustering (mpg, cyl, disp)
kable(
  head(mtcars[, c("mpg", "cyl", "disp")]),
  caption = "Sample of Records processed for Classification"
)

Sample of Records processed for Classification
	mpg	cyl	disp
Mazda RX4	21.0	6	160
Mazda RX4 Wag	21.0	6	160
Datsun 710	22.8	4	108
Hornet 4 Drive	21.4	6	258
Hornet Sportabout	18.7	8	360
Valiant	18.1	6	225

# Variables that define the cluster space: mpg, cyl and disp
mtcarsnum <- mtcars[, 1:3]

# kmeans() picks its starting centers at random, so fix the RNG seed to make
# the cluster assignments (and their numbering) reproducible between runs.
set.seed(2016)

# Partition the cars into 3 clusters. nstart = 25 repeats the algorithm from
# 25 random starts and keeps the best solution, so a single poor start cannot
# determine the result.
# NOTE(review): the variables are not standardized, so disp (the column with
# the largest numeric range) dominates the Euclidean distances -- consider
# scale() if the three variables are meant to contribute equally.
mtcars_k3 <- kmeans(mtcarsnum, centers = 3, nstart = 25)

## The amount of cars in Cluster 1 = 12

## The amount of cars in Cluster 2 = 16

## The amount of cars in Cluster 3 = 4

# Tabulate the center (mean mpg, cyl and disp) of each k-means cluster
kable(mtcars_k3[["centers"]], caption = "Centers of the 3 clusters")

Centers of the 3 clusters
mpg	cyl	disp
16.350	7.666667	304.5333
24.500	4.625000	122.2938
13.675	8.000000	443.0000

# Ward hierarchical clustering of the same observations, as an independent
# view of the grouping found by k-means
d <- dist(mtcarsnum, method = "euclidean")
# "ward.D2" squares the dissimilarities before updating clusters, which is
# what Ward's minimum-variance criterion requires for plain Euclidean
# distances; "ward.D" only implements that criterion on squared distances.
fit <- hclust(d, method = "ward.D2")

# Dendrogram plot
plot(fit)
# Cut the tree into 3 groups and outline them on the dendrogram
groups <- cutree(fit, k = 3)
rect.hclust(fit, k = 3, border = "red")

# Bivariate cluster plot of the k-means partition: clusters are colored and
# shaded, points and clusters are labelled (labels = 2), and no lines are
# drawn between clusters (lines = 0)
clusplot(
  mtcarsnum,
  mtcars_k3[["cluster"]],
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

# Decision tree for mpg, using the remaining selected columns (cyl, disp) as
# predictors. mpg is numeric, so rpart() fits a regression tree here.
form <- mpg ~ .
tree.2 <- rpart(formula = form, data = mtcars[, 1:3])
# prp(tree.2) # fast plot
fancyRpartPlot(tree.2)

ContextBase Classification Case Study 2

This Case Study demonstrates the use of Kmeans Clustering for the Classification of Internet of Things data. The Kmeans function in the R programming language was used to divide the dataset into 3 clusters. The centers of the clusters were determined by NDT IN KMH, DISTANCE IN METERS, and RBA ID.

# Data Import
# NOTE(review): the street names appear to be UTF-8 encoded (they otherwise
# come out as "Ã." sequences); declaring the encoding keeps the Danish
# letters intact -- confirm against the source file.
IOTdata <- read.csv("trafficMetaData.csv", fileEncoding = "UTF-8")

# Keep the street name plus the three numeric fields used for clustering
# (picked by position: columns 1, 8, 23 and 25 of the raw file).
IOTdata2 <- data.frame(
  POINT_1_STREET = as.character(IOTdata[, 1]),
  NDT_IN_KMH = IOTdata[, 8],
  DISTANCE_IN_METERS = IOTdata[, 23],
  RBA_ID = IOTdata[, 25],
  stringsAsFactors = FALSE
)

# Work with the first 32 records only; head() returns fewer rows instead of
# padding with NA rows when the file is shorter than that.
IOTdata2 <- head(IOTdata2, 32L)

# The street names are used as row labels later on, so make them
# syntactically valid and unique (duplicates get a numeric suffix).
IOTdata2$POINT_1_STREET <- make.names(IOTdata2$POINT_1_STREET, unique = TRUE)

# Dataset Observations
# The caption is an argument of kable(); it used to be passed to head(),
# which silently ignored it, so the table was rendered without a title.
kable(
  head(IOTdata2),
  caption = "Sample of Records processed for Classification"
)

POINT_1_STREET	NDT_IN_KMH	DISTANCE_IN_METERS	RBA_ID
SÃ.ftenvej	70	1505	161062
GrenÃ.vej	44	797	178660
Skanderborgvej	35	823	189702
GrenÃ.vej.1	53	813	192392
HolmstrupgÃ.rdvej	40	1015	229156
Edwin.Rahrs.Vej	42	1090	229194

# Variables that define the cluster space: NDT_IN_KMH, DISTANCE_IN_METERS
# and RBA_ID (columns 2 to 4; the street name in column 1 is left out)
IOTdataNum <- IOTdata2[, 2:4]

# kmeans() picks its starting centers at random, so fix the RNG seed to make
# the cluster assignments (and their numbering) reproducible between runs.
set.seed(2016)

# Partition the records into 3 clusters. nstart = 25 repeats the algorithm
# from 25 random starts and keeps the best solution.
# NOTE(review): RBA_ID looks like an identifier and its values (~200000) are
# orders of magnitude larger than the other two columns, so it dominates the
# unscaled Euclidean distances -- confirm it is meant to drive the clusters.
IOTdataNum_k3 <- kmeans(IOTdataNum, centers = 3, nstart = 25)

## The amount of Point 1 Street Names in Cluster 1 = 14

## The amount of Point 1 Street Names in Cluster 2 = 14

## The amount of Point 1 Street Names in Cluster 3 = 4

# Tabulate the center of each k-means cluster (mean of every variable)
kable(IOTdataNum_k3[["centers"]], caption = "Centers of the 3 clusters")

Centers of the 3 clusters
NDT_IN_KMH	DISTANCE_IN_METERS	RBA_ID
42.50	977.0714	229351.3
49.50	1335.5000	183836.1
42.25	1929.0000	229556.2

# Record each record's k-means cluster next to its street name
IOTdata2[["cluster_id"]] <- IOTdataNum_k3[["cluster"]]
# Use the (unique) street names as row labels of the numeric table so that
# they show up as observation labels in the plots below
row.names(IOTdataNum) <- IOTdata2[["POINT_1_STREET"]]

# Ward hierarchical clustering of the same observations, as an independent
# view of the grouping found by k-means
d <- dist(IOTdataNum, method = "euclidean")
# "ward.D2" squares the dissimilarities before updating clusters, which is
# what Ward's minimum-variance criterion requires for plain Euclidean
# distances; "ward.D" only implements that criterion on squared distances.
fit <- hclust(d, method = "ward.D2")

# Dendrogram plot
plot(fit)
# Cut the tree into 3 groups and outline them on the dendrogram
groups <- cutree(fit, k = 3)
rect.hclust(fit, k = 3, border = "red")

# Bivariate cluster plot of the k-means partition: clusters are colored and
# shaded, points and clusters are labelled (labels = 2), and no lines are
# drawn between clusters (lines = 0)
clusplot(
  IOTdataNum,
  IOTdataNum_k3[["cluster"]],
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

# Decision tree for NDT_IN_KMH, using the other two selected columns
# (DISTANCE_IN_METERS, RBA_ID) as predictors.
# NOTE(review): NDT_IN_KMH is presumably read as numeric, in which case
# rpart() fits a regression tree rather than a classification tree.
form <- NDT_IN_KMH ~ .
tree.2 <- rpart(formula = form, data = IOTdata2[, 2:4])
# prp(tree.2) # fast plot
fancyRpartPlot(tree.2)

ContextBase Data Classification

http://contextbase.github.io

All programming by John Akwei, ECMp ERMp Data Scientist

September 21, 2016

Synopsis

ContextBase Classification Case Study 1

ContextBase Classification Case Study 2