Three RDF files was uploaded into local web application, the description of each RDF was chaptured for similarity calculation.
endpoint <- "http://localhost:8890/sparql"
query1 <- "select ?o
from <http://localhost:8890/Fsport>
where {?s <http://www.w3.org/2000/01/rdf-schema#comment> ?o }"
fs <- SPARQL(endpoint,query1)
fsS <- toString(fs$results)
query2 <- "select ?o
from <http://localhost:8890/psport>
where {?s <http://www.w3.org/2000/01/rdf-schema#comment> ?o }"
ps <- SPARQL(endpoint,query2)
psS <- toString(ps$results)
query3 <- "select ?o
from <http://localhost:8890/sf>
where {?s <http://www.w3.org/2000/01/rdf-schema#comment> ?o }"
sf <- SPARQL(endpoint,query3)
sfS <- toString(sf$results)
allDesc=list(fsS, psS, sfS)
servs <- lapply(allDesc, function(x) {
text <- gsub("[[:punct:]]", "", x) %>% tolower()
text <- gsub("\\s+", " ", text) %>% str_trim()
word <- strsplit(text, " ") %>% unlist()
return(word)})
L1=length(servs[[1]])
L2=length(servs[[2]])
L3=length(servs[[3]])
Shingling <- function(document, k) {
shingles <- character( length = length(document) - k + 1 )
for( i in 1:( length(document) - k + 1 ) ) {
shingles[i] <- paste( document[ i:(i + k - 1) ], collapse = " " )
}
return( unique(shingles) )
}
# "shingle" our example document, with k = 3
servs <- lapply(servs, function(x) {
Shingling(x, k = 1)})
list( Original = allDesc, Shingled = servs )
## $Original
## $Original[[1]]
## [1] "This is an ontology that keeps track of the favorite sport for students and faculties."
##
## $Original[[2]]
## [1] "This is an ontology to show the preferred sport of all students and faculties. It illustrates the favorite sport for students and faculties, and what they like in common."
##
## $Original[[3]]
## [1] "This is the university ontology, This is an ontology to illustrate the relations between students and faculties. What courses the student enrolled in and who teach these courses."
##
##
## $Shingled
## $Shingled[[1]]
## [1] "this" "is" "an" "ontology" "that"
## [6] "keeps" "track" "of" "the" "favorite"
## [11] "sport" "for" "students" "and" "faculties"
##
## $Shingled[[2]]
## [1] "this" "is" "an" "ontology" "to"
## [6] "show" "the" "preferred" "sport" "of"
## [11] "all" "students" "and" "faculties" "it"
## [16] "illustrates" "favorite" "for" "what" "they"
## [21] "like" "in" "common"
##
## $Shingled[[3]]
## [1] "this" "is" "the" "university" "ontology"
## [6] "an" "to" "illustrate" "relations" "between"
## [11] "students" "and" "faculties" "what" "courses"
## [16] "student" "enrolled" "in" "who" "teach"
## [21] "these"
# unique shingles sets across all services
uniq_set <- unlist(servs) %>% unique()
# "characteristic" matrix
M <- lapply(servs, function(set, dict) {
as.integer(dict %in% set)
}, dict = uniq_set) %>% data.frame()
# set the names for both rows and columns
setnames( M, paste( "Service", 1:length(servs), sep = "_" ) )
rownames(M) <- uniq_set
M
## Service_1 Service_2 Service_3
## this 1 1 1
## is 1 1 1
## an 1 1 1
## ontology 1 1 1
## that 1 0 0
## keeps 1 0 0
## track 1 0 0
## of 1 1 0
## the 1 1 1
## favorite 1 1 0
## sport 1 1 0
## for 1 1 0
## students 1 1 1
## and 1 1 1
## faculties 1 1 1
## to 0 1 1
## show 0 1 0
## preferred 0 1 0
## all 0 1 0
## it 0 1 0
## illustrates 0 1 0
## what 0 1 1
## they 0 1 0
## like 0 1 0
## in 0 1 1
## common 0 1 0
## university 0 0 1
## illustrate 0 0 1
## relations 0 0 1
## between 0 0 1
## courses 0 0 1
## student 0 0 1
## enrolled 0 0 1
## who 0 0 1
## teach 0 0 1
## these 0 0 1
One well known measure for determining the degree of similarity is the Jaccard Similarity. The math formula of this measurement will be:
Jacc(A,B)= |A ^ B|/|A U B|, Jaccard Coefficient is equal to (A Intersect B) / (A Union B) = |A ^ B|/|A| + |B|- |A ^ B| 0 <= Jacc(A,B) <= 1
Jaccard similarity is the size of the intersection divided by the union. The similarity matrix tells us that Service 1 and 2 is the most similar among the three Services.
The following section will calculate the jaccard similarities for all three Services
# how similar is two given document, jaccard similarity
JaccardSimilarity <- function(x, y) {
non_zero <- which(x | y)
set_intersect <- sum( x[non_zero] & y[non_zero] )
set_union <- length(non_zero)
return(set_intersect / set_union)
}
# create a new entry in the registry
pr_DB$set_entry( FUN = JaccardSimilarity, names = c("JaccardSimilarity") )
# jaccard similarity distance matrix
d1 <- dist( t(M), method = "JaccardSimilarity" )
# delete the new entry
pr_DB$delete_entry("JaccardSimilarity")
#we want values for diagonal and upper as well, so:
jacc <- dist(t(M), method = "binary",
diag = TRUE, upper = TRUE)
#ggplot needs a dataframe:
jacc <- as.data.frame(as.matrix(jacc))
jacc
## Service_1 Service_2 Service_3
## Service_1 0.0000000 0.5384615 0.7142857
## Service_2 0.5384615 0.0000000 0.6666667
## Service_3 0.7142857 0.6666667 0.0000000
#we want the jaccard similarity, not the distance:
jaccsim <- 1 - jacc
jaccsim
## Service_1 Service_2 Service_3
## Service_1 1.0000000 0.4615385 0.2857143
## Service_2 0.4615385 1.0000000 0.3333333
## Service_3 0.2857143 0.3333333 1.0000000
#add a row with names to melt on
jaccsim$names <- rownames(jaccsim)
#melt the data frame to make it tidy
jacc.m <- reshape2::melt(jaccsim, id.vars = "names")
#make sure the parties are in correct order in the plot
#convert to factor
jacc.m$names <- factor(jacc.m$names, rownames(jaccsim))
jacc.m$variable <- factor(jacc.m$variable, rev(rownames(jaccsim)))
#sort the data frame
jacc.m <- plyr::arrange(jacc.m, variable, plyr::desc(names))
sim <- ggplot(jacc.m, aes(names, variable)) +
geom_tile(aes(fill=value), colour = "white") +
scale_fill_gradient(low ="#fdebea" , high = "#ee3e32")
base_size <- 20
sim + theme_light(base_size = base_size) +
labs(x = "", y = "") +
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0)) +
guides(fill=guide_legend(title=NULL)) +
theme(axis.ticks = element_blank(),
axis.text.x = element_text(size = base_size * 0.8,
angle = 330, hjust = 0),
axis.text.y = element_text(size = base_size * 0.8)
)
vername <- levels(jacc.m$names)
g3 <- graph(c(vername[1],vername[2],vername[2],vername[3],vername[3],vername[1]), directed=F) # named vertices
V(g3)$size <- c(L1,L2,L3)*3
E(g3)$weight <- d1
E(g3)$width <- E(g3)$weight*10
E(g3)$label = round(E(g3)$weight, 2)
plot(g3)