Define My_triple_store endpoints

Three RDF files was uploaded into local web application, the description of each RDF was chaptured for similarity calculation.

endpoint <- "http://localhost:8890/sparql"

query1 <- "select ?o 
from <http://localhost:8890/Fsport>
where {?s <http://www.w3.org/2000/01/rdf-schema#comment> ?o }"
fs <- SPARQL(endpoint,query1)
fsS <- toString(fs$results)

query2 <- "select ?o
from <http://localhost:8890/psport>
where {?s <http://www.w3.org/2000/01/rdf-schema#comment>  ?o }"
ps <- SPARQL(endpoint,query2)
psS <- toString(ps$results)

query3 <- "select ?o
from <http://localhost:8890/sf>
where {?s <http://www.w3.org/2000/01/rdf-schema#comment>  ?o }"
sf <- SPARQL(endpoint,query3)
sfS <- toString(sf$results)

allDesc=list(fsS, psS, sfS)  

servs <- lapply(allDesc, function(x) {
  text <- gsub("[[:punct:]]", "", x) %>% tolower()
  text <- gsub("\\s+", " ", text) %>% str_trim()  
  word <- strsplit(text, " ") %>% unlist()
  return(word)})


L1=length(servs[[1]])
L2=length(servs[[2]])
L3=length(servs[[3]])

Shingling <- function(document, k) {
  shingles <- character( length = length(document) - k + 1 )
  
  for( i in 1:( length(document) - k + 1 ) ) {
    shingles[i] <- paste( document[ i:(i + k - 1) ], collapse = " " )
  }
  
  return( unique(shingles) )  
}

# "shingle" our example document, with k = 3
servs <- lapply(servs, function(x) {
  Shingling(x, k = 1)})  

list( Original = allDesc, Shingled = servs )

## $Original
## $Original[[1]]
## [1] "This is an ontology that keeps track of the favorite sport for students and faculties."
## 
## $Original[[2]]
## [1] "This is an ontology to show the preferred sport of all students and faculties. It illustrates the favorite sport for students and faculties, and what they like in common."
## 
## $Original[[3]]
## [1] "This is the university ontology, This is an ontology to illustrate the relations between students and faculties. What courses the student enrolled in and who teach these courses."
## 
## 
## $Shingled
## $Shingled[[1]]
##  [1] "this"      "is"        "an"        "ontology"  "that"     
##  [6] "keeps"     "track"     "of"        "the"       "favorite" 
## [11] "sport"     "for"       "students"  "and"       "faculties"
## 
## $Shingled[[2]]
##  [1] "this"        "is"          "an"          "ontology"    "to"         
##  [6] "show"        "the"         "preferred"   "sport"       "of"         
## [11] "all"         "students"    "and"         "faculties"   "it"         
## [16] "illustrates" "favorite"    "for"         "what"        "they"       
## [21] "like"        "in"          "common"     
## 
## $Shingled[[3]]
##  [1] "this"       "is"         "the"        "university" "ontology"  
##  [6] "an"         "to"         "illustrate" "relations"  "between"   
## [11] "students"   "and"        "faculties"  "what"       "courses"   
## [16] "student"    "enrolled"   "in"         "who"        "teach"     
## [21] "these"

# unique shingles sets across all services
uniq_set <- unlist(servs) %>% unique()

# "characteristic" matrix
M <- lapply(servs, function(set, dict) {
  as.integer(dict %in% set)
}, dict = uniq_set) %>% data.frame() 

# set the names for both rows and columns
setnames( M, paste( "Service", 1:length(servs), sep = "_" ) )

rownames(M) <- uniq_set
M

##             Service_1 Service_2 Service_3
## this                1         1         1
## is                  1         1         1
## an                  1         1         1
## ontology            1         1         1
## that                1         0         0
## keeps               1         0         0
## track               1         0         0
## of                  1         1         0
## the                 1         1         1
## favorite            1         1         0
## sport               1         1         0
## for                 1         1         0
## students            1         1         1
## and                 1         1         1
## faculties           1         1         1
## to                  0         1         1
## show                0         1         0
## preferred           0         1         0
## all                 0         1         0
## it                  0         1         0
## illustrates         0         1         0
## what                0         1         1
## they                0         1         0
## like                0         1         0
## in                  0         1         1
## common              0         1         0
## university          0         0         1
## illustrate          0         0         1
## relations           0         0         1
## between             0         0         1
## courses             0         0         1
## student             0         0         1
## enrolled            0         0         1
## who                 0         0         1
## teach               0         0         1
## these               0         0         1

One well known measure for determining the degree of similarity is the Jaccard Similarity. The math formula of this measurement will be:

Jacc(A,B)= |A ^ B|/|A U B|, Jaccard Coefficient is equal to (A Intersect B) / (A Union B) = |A ^ B|/|A| + |B|- |A ^ B| 0 <= Jacc(A,B) <= 1

Jaccard similarity is the size of the intersection divided by the union. The similarity matrix tells us that Service 1 and 2 is the most similar among the three Services.

The following section will calculate the jaccard similarities for all three Services

# how similar is two given document, jaccard similarity 
JaccardSimilarity <- function(x, y) {
  non_zero <- which(x | y)
  set_intersect <- sum( x[non_zero] & y[non_zero] )
  set_union <- length(non_zero)
  return(set_intersect / set_union)
}

# create a new entry in the registry
pr_DB$set_entry( FUN = JaccardSimilarity, names = c("JaccardSimilarity") )

# jaccard similarity distance matrix 
d1 <- dist( t(M), method = "JaccardSimilarity" )

# delete the new entry
pr_DB$delete_entry("JaccardSimilarity")


#we want values for diagonal and upper as well, so:
jacc <- dist(t(M), method = "binary",
             diag = TRUE, upper = TRUE)

#ggplot needs a dataframe:
jacc <- as.data.frame(as.matrix(jacc))
jacc

##           Service_1 Service_2 Service_3
## Service_1 0.0000000 0.5384615 0.7142857
## Service_2 0.5384615 0.0000000 0.6666667
## Service_3 0.7142857 0.6666667 0.0000000

#we want the jaccard similarity, not the distance:
jaccsim <- 1 - jacc
jaccsim

##           Service_1 Service_2 Service_3
## Service_1 1.0000000 0.4615385 0.2857143
## Service_2 0.4615385 1.0000000 0.3333333
## Service_3 0.2857143 0.3333333 1.0000000

#add a row with names to melt on
jaccsim$names <- rownames(jaccsim)
#melt the data frame to make it tidy
jacc.m <- reshape2::melt(jaccsim, id.vars = "names")

#make sure the parties are in correct order in the plot
#convert to factor
jacc.m$names <- factor(jacc.m$names, rownames(jaccsim))
jacc.m$variable <- factor(jacc.m$variable, rev(rownames(jaccsim)))
#sort the data frame
jacc.m <- plyr::arrange(jacc.m, variable, plyr::desc(names))

sim <- ggplot(jacc.m, aes(names, variable)) +
  geom_tile(aes(fill=value), colour = "white") +
  scale_fill_gradient(low ="#fdebea" , high = "#ee3e32")

base_size <- 20
sim + theme_light(base_size = base_size) +
  labs(x = "", y = "") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  guides(fill=guide_legend(title=NULL)) +
  theme(axis.ticks = element_blank(),
        axis.text.x = element_text(size = base_size * 0.8,
                                   angle = 330, hjust = 0),
        axis.text.y = element_text(size = base_size * 0.8)
  )

vername <- levels(jacc.m$names)
g3 <- graph(c(vername[1],vername[2],vername[2],vername[3],vername[3],vername[1]),  directed=F) # named vertices

V(g3)$size <- c(L1,L2,L3)*3
E(g3)$weight <- d1
E(g3)$width <- E(g3)$weight*10
E(g3)$label = round(E(g3)$weight, 2)

plot(g3)

Similarity Analysis

Define My_triple_store endpoints