Introduction

The goal of this document is to show a sample script in which Linked Data on the Web are easily integrated to perform some analyses.

Load libraries

Load the libraries for performing SPARQL queries and for invoking the Google Maps API.

library(SPARQL)
library(ggmap)

Search for corrupt countries

Let’s query the 270a SPARQL endpoint for all the country scores of the Corruption Perceptions Index in 2011.

First we define the required namespaces.

prefixT = c("sdmx-dimension","http://purl.org/linked-data/sdmx/2009/dimension#",
            "property","http://transparency.270a.info/property/",
            "indicator","http://transparency.270a.info/classification/indicator/",
            "year","http://reference.data.gov.uk/id/year/",
            "skos","http://www.w3.org/2004/02/skos/core#")

sparql_prefixT = "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sdmx-dimension: <http://purl.org/linked-data/sdmx/2009/dimension#>
PREFIX property: <http://transparency.270a.info/property/>
PREFIX indicator: <http://transparency.270a.info/classification/indicator/>
PREFIX year: <http://reference.data.gov.uk/id/year/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
"

Then we define a query to get all the country labels with their scores.

qT = paste(sparql_prefixT,"
SELECT ?countryLabel ?score 
WHERE {
  ?s sdmx-dimension:refPeriod year:2011 ;
    property:indicator indicator:corruption-perceptions-index ;
    property:score ?score ;
    sdmx-dimension:refArea ?countryURI .
  ?countryURI skos:prefLabel ?countryLabel .
}
ORDER BY ?score
")

And submit it to the SPARQL endpoint.

endpointT = "http://transparency.270a.info/sparql"
optionsT=""

countryScores = SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results

Let’s take a look at the output of the query.

Is a lower index better or worse?

head(countryScores)

##         countryLabel     score
## 1       "Somalia"@en 0.9834057
## 2 "Korea (North)"@en 1.0096260
## 3       "Myanmar"@en 1.4902960
## 4   "Afghanistan"@en 1.5170930
## 5         "Sudan"@en 1.5637580
## 6  "Turkmenistan"@en 1.6031730

tail(countryScores)

##         countryLabel    score
## 178      "Norway"@en 8.989854
## 179   "Singapore"@en 9.166839
## 180      "Sweden"@en 9.298488
## 181     "Denmark"@en 9.392080
## 182     "Finland"@en 9.403927
## 183 "New Zealand"@en 9.462681

We have a lot of countries, let’s visualize the country scores through a histogram.

hist(countryScores$score, main="Corruption Perceptions Index 2011", xlab = "CPI")

How corrupt is Spain?

If we take a look to the labels that we have retreived, we can see that we need to normalize them.

head(countryScores$countryLabel,20)

##  [1] "\"Somalia\"@en"                         
##  [2] "\"Korea (North)\"@en"                   
##  [3] "\"Myanmar\"@en"                         
##  [4] "\"Afghanistan\"@en"                     
##  [5] "\"Sudan\"@en"                           
##  [6] "\"Turkmenistan\"@en"                    
##  [7] "\"Uzbekistan\"@en"                      
##  [8] "\"Haiti\"@en"                           
##  [9] "\"Iraq\"@en"                            
## [10] "\"Venezuela\"@en"                       
## [11] "\"Equatorial Guinea\"@en"               
## [12] "\"Burundi\"@en"                         
## [13] "\"Angola\"@en"                          
## [14] "\"Libya\"@en"                           
## [15] "\"Democratic Republic of the Congo\"@en"
## [16] "\"Chad\"@en"                            
## [17] "\"Yemen\"@en"                           
## [18] "\"Guinea\"@en"                          
## [19] "\"Cambodia\"@en"                        
## [20] "\"Kyrgyzstan\"@en"

Let’s clean the country labels to remove the language tag (@en) and quotes (“).

countryScores$countryLabel = gsub("@en","",countryScores$countryLabel)
countryScores$countryLabel = gsub("\"","",countryScores$countryLabel)

head(countryScores$countryLabel,20)

##  [1] "Somalia"                          "Korea (North)"                   
##  [3] "Myanmar"                          "Afghanistan"                     
##  [5] "Sudan"                            "Turkmenistan"                    
##  [7] "Uzbekistan"                       "Haiti"                           
##  [9] "Iraq"                             "Venezuela"                       
## [11] "Equatorial Guinea"                "Burundi"                         
## [13] "Angola"                           "Libya"                           
## [15] "Democratic Republic of the Congo" "Chad"                            
## [17] "Yemen"                            "Guinea"                          
## [19] "Cambodia"                         "Kyrgyzstan"

Which was Spain’s Corruption Perceptions Index in 2011?

SpainCPI2011 = countryScores[countryScores$countryLabel=="Spain",]

points(SpainCPI2011$score,20,pch=19, col="red")
text(SpainCPI2011$score,23,labels=SpainCPI2011$countryLabel, col="red")

Now let’s plot the CPI of another country.

OtherCPI2011 = countryScores[countryScores$countryLabel=="Greece",]

points(OtherCPI2011$score,30,pch=19, col="red")
text(OtherCPI2011$score,33,labels=OtherCPI2011$countryLabel, col="red")

What is the situation in Europe?

Let’s query DBpedia for all the countries in Europe.

First we define the namespaces to be used.

prefixD = c("yago","http://dbpedia.org/class/yago/",
            "dbpedia-owl","http://dbpedia.org/ontology/")

sparql_prefixD = "
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX yago: <http://dbpedia.org/class/yago/>
PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
"

And we define the query to retrieve every country in Europe.

qD = paste(sparql_prefixD,"
SELECT ?country WHERE {
    ?place rdf:type yago:EuropeanCountries .
    ?place rdf:type dbpedia-owl:Country .
    ?place rdfs:label ?country .
FILTER(LANGMATCHES(LANG(?country), \"en\"))
}
")

We submit the query to the DBpedia SPARQL endpoint.

endpointD = "http://dbpedia.org/sparql"
optionsD=""

europeanCountries = SPARQL(endpointD,qD,ns=prefixD,extra=optionsD)$results

And take a look at the output of the query.

europeanCountries

##                  country    country.1    country.2    country.3
## 1 "Georgia (country)"@en "Albania"@en "Andorra"@en "Armenia"@en
##      country.4       country.5    country.6    country.7     country.8
## 1 "Austria"@en "Azerbaijan"@en "Belarus"@en "Belgium"@en "Bulgaria"@en
##      country.9  country.10          country.11   country.12   country.13
## 1 "Croatia"@en "Cyprus"@en "Czech Republic"@en "Denmark"@en "Estonia"@en
##     country.14  country.15   country.16  country.17   country.18
## 1 "Finland"@en "France"@en "Germany"@en "Greece"@en "Hungary"@en
##     country.19 country.20  country.21         country.22     country.23
## 1 "Iceland"@en "Italy"@en "Latvia"@en "Liechtenstein"@en "Lithuania"@en
##        country.24   country.25  country.26      country.27
## 1 "Luxembourg"@en "Moldova"@en "Monaco"@en "Montenegro"@en
##         country.28  country.29    country.30   country.31  country.32
## 1 "Netherlands"@en "Poland"@en "Portugal"@en "Romania"@en "Russia"@en
##        country.33  country.34    country.35    country.36 country.37
## 1 "San Marino"@en "Serbia"@en "Slovakia"@en "Slovenia"@en "Spain"@en
##    country.38       country.39   country.40          country.41
## 1 "Sweden"@en "Switzerland"@en "Ukraine"@en "United Kingdom"@en
##             country.42               country.43        country.44
## 1 "Northern Cyprus"@en "Republic of Ireland"@en "Vatican City"@en
##                   country.45
## 1 "Republic of Macedonia"@en

We obtained the data in a row.

dim(europeanCountries)

## [1]  1 46

Let’s transpose it to a column.

europeanCountries=as.data.frame(t(europeanCountries))
colnames(europeanCountries)[1] = "countryLabel"
head(europeanCountries,10)

##                     countryLabel
## country   "Georgia (country)"@en
## country.1           "Albania"@en
## country.2           "Andorra"@en
## country.3           "Armenia"@en
## country.4           "Austria"@en
## country.5        "Azerbaijan"@en
## country.6           "Belarus"@en
## country.7           "Belgium"@en
## country.8          "Bulgaria"@en
## country.9           "Croatia"@en

Now we clean the country label to remove the language tag (@en) and quotes (“).

europeanCountries$countryLabel = gsub("@en","",europeanCountries$countryLabel)
europeanCountries$countryLabel = gsub("\"","",europeanCountries$countryLabel)
head(europeanCountries,10)

##                countryLabel
## country   Georgia (country)
## country.1           Albania
## country.2           Andorra
## country.3           Armenia
## country.4           Austria
## country.5        Azerbaijan
## country.6           Belarus
## country.7           Belgium
## country.8          Bulgaria
## country.9           Croatia

We also need to clean the Georgia and Ireland labels.

europeanCountries$countryLabel = gsub(" \\(country\\)","",europeanCountries$countryLabel)
europeanCountries$countryLabel = gsub("Republic of ","",europeanCountries$countryLabel)
head(europeanCountries)

##           countryLabel
## country        Georgia
## country.1      Albania
## country.2      Andorra
## country.3      Armenia
## country.4      Austria
## country.5   Azerbaijan

We have two data frames to merge: the one with the country scores and the one with the European countries.

head(countryScores)

##    countryLabel     score
## 1       Somalia 0.9834057
## 2 Korea (North) 1.0096260
## 3       Myanmar 1.4902960
## 4   Afghanistan 1.5170930
## 5         Sudan 1.5637580
## 6  Turkmenistan 1.6031730

head(europeanCountries)

##           countryLabel
## country        Georgia
## country.1      Albania
## country.2      Andorra
## country.3      Armenia
## country.4      Austria
## country.5   Azerbaijan

Let’s merge the data frames using the values of their countryLabel columns.

europeanCountriesWithCPI = merge(countryScores, europeanCountries, 
                                  by.x = "countryLabel", by.y = "countryLabel")
head(europeanCountriesWithCPI,10)

##      countryLabel    score
## 1         Albania 3.052946
## 2         Armenia 2.628996
## 3         Austria 7.786903
## 4      Azerbaijan 2.375235
## 5         Belarus 2.418323
## 6         Belgium 7.487431
## 7        Bulgaria 3.328966
## 8         Croatia 4.034245
## 9          Cyprus 6.265644
## 10 Czech Republic 4.368084

Do we have CPIs for all European countries?

europeanCountriesWithCPI.allY = merge(countryScores, europeanCountries, 
                                      by.x = "countryLabel", by.y = "countryLabel", 
                                      all.y=TRUE)
europeanCountriesWithCPI.allY[is.na(europeanCountriesWithCPI.allY$score),]

##       countryLabel score
## 2          Andorra    NA
## 24   Liechtenstein    NA
## 27       Macedonia    NA
## 29          Monaco    NA
## 32 Northern Cyprus    NA
## 37      San Marino    NA
## 46    Vatican City    NA

Let’s visualize the corruption scores along Europe.

hist(europeanCountriesWithCPI$score, main="Corruption Perceptions Index 2011 in Europe", xlab = "CPI")

And now we plot Spain’s CPI and the CPI of another country.

points(SpainCPI2011$score,5,pch=19, col="red")
text(SpainCPI2011$score,5.5,labels=SpainCPI2011$countryLabel, col="red")

points(OtherCPI2011$score,5,pch=19, col="red")
text(OtherCPI2011$score,5.5,labels=OtherCPI2011$countryLabel, col="red")

Draw the two graphs (World and Europe) at once.

With par we define a new graph configuration to include two plots in a graph. At the end we restore it to include only one graph.

par(mfrow=c(1, 2))

hist(countryScores$score, main="World Corruption Perceptions Index 2011", xlab = "CPI")
points(SpainCPI2011$score,20,pch=19, col="red")
text(SpainCPI2011$score,23,labels=SpainCPI2011$countryLabel, col="red")
points(OtherCPI2011$score,30,pch=19, col="red")
text(OtherCPI2011$score,33,labels=OtherCPI2011$countryLabel, col="red")

hist(europeanCountriesWithCPI$score, main="Europe Corruption Perceptions Index 2011", xlab = "CPI")
points(SpainCPI2011$score,5,pch=19, col="red")
text(SpainCPI2011$score,5.5,labels=SpainCPI2011$countryLabel, col="red")
points(OtherCPI2011$score,5,pch=19, col="red")
text(OtherCPI2011$score,5.5,labels=OtherCPI2011$countryLabel, col="red")

par(mfrow=c(1, 1))

Integration beyond string matching

Let’s query again the 720a data set for all the country scores of the Corruption Perceptions Index in 2011, but now we also retrieve the links to DBpedia.

We can reuse the same prefixes as before.

qT2 = paste(sparql_prefixT,"
SELECT ?score ?countryLabel ?URI
WHERE {
  ?s sdmx-dimension:refPeriod year:2011 ;
    property:indicator indicator:corruption-perceptions-index ;
    property:score ?score ;
    sdmx-dimension:refArea ?countryURI .
  ?countryURI skos:prefLabel ?countryLabel ;
    owl:sameAs ?URI .
  FILTER (REGEX(STR(?URI),\"dbpedia\"))
}
ORDER BY ?score
")

Let’s run the query and see the output.

countryScoresWLinks = SPARQL(endpointT,qT2,ns=prefixT,extra=optionsT)$results

head(countryScoresWLinks,10)

##        score       countryLabel                                        URI
## 1  0.9834057       "Somalia"@en      <http://dbpedia.org/resource/Somalia>
## 2  1.0096260 "Korea (North)"@en  <http://dbpedia.org/resource/North_Korea>
## 3  1.4902960       "Myanmar"@en        <http://dbpedia.org/resource/Burma>
## 4  1.5170930   "Afghanistan"@en  <http://dbpedia.org/resource/Afghanistan>
## 5  1.5637580         "Sudan"@en        <http://dbpedia.org/resource/Sudan>
## 6  1.6031730  "Turkmenistan"@en <http://dbpedia.org/resource/Turkmenistan>
## 7  1.6245630    "Uzbekistan"@en   <http://dbpedia.org/resource/Uzbekistan>
## 8  1.7998830         "Haiti"@en        <http://dbpedia.org/resource/Haiti>
## 9  1.8041280          "Iraq"@en         <http://dbpedia.org/resource/Iraq>
## 10 1.8894970     "Venezuela"@en    <http://dbpedia.org/resource/Venezuela>

And now we query DBpedia for all the URIs of the countries in Europe.

qD2 = paste(sparql_prefixD,"
SELECT ?place  WHERE {
    ?place rdf:type yago:EuropeanCountries .
    ?place rdf:type dbpedia-owl:Country .
}
")

Let’s run the query and see the output.

europeanCountriesWURIs = SPARQL(endpointD,qD2,ns=prefixD,extra=optionsD)$results

europeanCountriesWURIs[,1:10]

##                                             place
## 1 <http://dbpedia.org/resource/Georgia_(country)>
##                                 place.1
## 1 <http://dbpedia.org/resource/Albania>
##                                 place.2
## 1 <http://dbpedia.org/resource/Andorra>
##                                 place.3
## 1 <http://dbpedia.org/resource/Armenia>
##                                 place.4
## 1 <http://dbpedia.org/resource/Austria>
##                                    place.5
## 1 <http://dbpedia.org/resource/Azerbaijan>
##                                 place.6
## 1 <http://dbpedia.org/resource/Belarus>
##                                 place.7
## 1 <http://dbpedia.org/resource/Belgium>
##                                  place.8
## 1 <http://dbpedia.org/resource/Bulgaria>
##                                 place.9
## 1 <http://dbpedia.org/resource/Croatia>

Let’s transpose it into a column.

europeanCountriesWURIs=as.data.frame(t(europeanCountriesWURIs))
colnames(europeanCountriesWURIs)[1] = "countryURI"
head(europeanCountriesWURIs)

##                                              countryURI
## place   <http://dbpedia.org/resource/Georgia_(country)>
## place.1           <http://dbpedia.org/resource/Albania>
## place.2           <http://dbpedia.org/resource/Andorra>
## place.3           <http://dbpedia.org/resource/Armenia>
## place.4           <http://dbpedia.org/resource/Austria>
## place.5        <http://dbpedia.org/resource/Azerbaijan>

Now we merge the data using the URIs instead of the labels.

On the one hand there is no need to clean the labels and search for potential problems.
On the other hand, we now that if both URIs are the same, they refer to the same entity.

europeanCountriesWithCPIv2 = merge(countryScoresWLinks, europeanCountriesWURIs, by.x = "URI", by.y = "countryURI")
head(europeanCountriesWithCPIv2,10)

##                                             URI    score
## 1         <http://dbpedia.org/resource/Albania> 3.052946
## 2         <http://dbpedia.org/resource/Armenia> 2.628996
## 3         <http://dbpedia.org/resource/Austria> 7.786903
## 4      <http://dbpedia.org/resource/Azerbaijan> 2.375235
## 5         <http://dbpedia.org/resource/Belarus> 2.418323
## 6         <http://dbpedia.org/resource/Belgium> 7.487431
## 7        <http://dbpedia.org/resource/Bulgaria> 3.328966
## 8         <http://dbpedia.org/resource/Croatia> 4.034245
## 9          <http://dbpedia.org/resource/Cyprus> 6.265644
## 10 <http://dbpedia.org/resource/Czech_Republic> 4.368084
##           countryLabel
## 1         "Albania"@en
## 2         "Armenia"@en
## 3         "Austria"@en
## 4      "Azerbaijan"@en
## 5         "Belarus"@en
## 6         "Belgium"@en
## 7        "Bulgaria"@en
## 8         "Croatia"@en
## 9          "Cyprus"@en
## 10 "Czech Republic"@en

With these data we obtain the same histogram.

hist(europeanCountriesWithCPIv2$score, main="Corruption Perceptions Index 2011 in Europe", xlab = "CPI")
points(SpainCPI2011$score,5,pch=19, col="red")
text(SpainCPI2011$score,5.5,labels=SpainCPI2011$countryLabel, col="red")
points(OtherCPI2011$score,5,pch=19, col="red")
text(OtherCPI2011$score,5.5,labels=OtherCPI2011$countryLabel, col="red")

Is there any geospatial correlation?

Let’s show the most corrupt countries in a map.

Let’s query DBpedia for all the countries in Europe with their URI and coordinates.

qD3 = paste(sparql_prefixD,"
SELECT ?place ?lat ?long WHERE {
    ?place rdf:type yago:EuropeanCountries .
    ?place rdf:type dbpedia-owl:Country .
    ?place geo:lat ?lat .           
    ?place geo:long ?long . 
}
")

Let’s run the query and see the output.

europeanCountriesWCoord = SPARQL(endpointD,qD3,ns=prefixD,extra=optionsD)$results

head(europeanCountriesWCoord,10)

##                                              place     lat     long
## 1  <http://dbpedia.org/resource/Georgia_(country)> 41.7167 44.78330
## 2            <http://dbpedia.org/resource/Albania> 41.3333 19.80000
## 3            <http://dbpedia.org/resource/Andorra> 42.5000  1.51667
## 4            <http://dbpedia.org/resource/Armenia> 40.1833 44.51670
## 5            <http://dbpedia.org/resource/Austria> 48.2000 16.35000
## 6         <http://dbpedia.org/resource/Azerbaijan> 40.4167 49.83330
## 7            <http://dbpedia.org/resource/Belarus> 53.9167 27.55000
## 8            <http://dbpedia.org/resource/Belgium> 50.8500  4.35000
## 9           <http://dbpedia.org/resource/Bulgaria> 42.6833 23.31670
## 10           <http://dbpedia.org/resource/Croatia> 45.8000 16.00000

Now we merge the results with the previous data.

europeanCountriesWithCPIandCoord = merge(europeanCountriesWithCPIv2, europeanCountriesWCoord, 
                                          by.x = "URI", by.y = "place")
head(europeanCountriesWithCPIandCoord,10)

##                                             URI    score
## 1         <http://dbpedia.org/resource/Albania> 3.052946
## 2         <http://dbpedia.org/resource/Armenia> 2.628996
## 3         <http://dbpedia.org/resource/Austria> 7.786903
## 4      <http://dbpedia.org/resource/Azerbaijan> 2.375235
## 5         <http://dbpedia.org/resource/Belarus> 2.418323
## 6         <http://dbpedia.org/resource/Belgium> 7.487431
## 7        <http://dbpedia.org/resource/Bulgaria> 3.328966
## 8         <http://dbpedia.org/resource/Croatia> 4.034245
## 9          <http://dbpedia.org/resource/Cyprus> 6.265644
## 10 <http://dbpedia.org/resource/Czech_Republic> 4.368084
##           countryLabel     lat    long
## 1         "Albania"@en 41.3333 19.8000
## 2         "Armenia"@en 40.1833 44.5167
## 3         "Austria"@en 48.2000 16.3500
## 4      "Azerbaijan"@en 40.4167 49.8333
## 5         "Belarus"@en 53.9167 27.5500
## 6         "Belgium"@en 50.8500  4.3500
## 7        "Bulgaria"@en 42.6833 23.3167
## 8         "Croatia"@en 45.8000 16.0000
## 9          "Cyprus"@en 35.1667 33.3667
## 10 "Czech Republic"@en 50.0833 14.4667

We can put all these data in a map by calling the Google Maps API.

qmap('Brussels', zoom=4) +
     geom_point(aes(x=europeanCountriesWithCPIandCoord$long, 
                    y=europeanCountriesWithCPIandCoord$lat), 
     data=europeanCountriesWithCPIandCoord, size=europeanCountriesWithCPIandCoord$score, 
     colour="red") + 
     scale_color_manual(values = rainbow(10))

Let’s show the most corrupt ones (CPI<4).

europeanCountriesWithCPIandCoordWorst = europeanCountriesWithCPIandCoord[europeanCountriesWithCPIandCoord$score<4,]
europeanCountriesWithCPIandCoordWorst

##                                                    URI    score
## 1                <http://dbpedia.org/resource/Albania> 3.052946
## 2                <http://dbpedia.org/resource/Armenia> 2.628996
## 4             <http://dbpedia.org/resource/Azerbaijan> 2.375235
## 5                <http://dbpedia.org/resource/Belarus> 2.418323
## 7               <http://dbpedia.org/resource/Bulgaria> 3.328966
## 17                <http://dbpedia.org/resource/Greece> 3.388967
## 20                 <http://dbpedia.org/resource/Italy> 3.906913
## 24               <http://dbpedia.org/resource/Moldova> 2.878946
## 25            <http://dbpedia.org/resource/Montenegro> 3.969978
## 29 <http://dbpedia.org/resource/Republic_of_Macedonia> 3.942666
## 30               <http://dbpedia.org/resource/Romania> 3.612146
## 31                <http://dbpedia.org/resource/Russia> 2.448757
## 32                <http://dbpedia.org/resource/Serbia> 3.312795
## 33              <http://dbpedia.org/resource/Slovakia> 3.971617
## 38               <http://dbpedia.org/resource/Ukraine> 2.296875
##          countryLabel     lat    long
## 1        "Albania"@en 41.3333 19.8000
## 2        "Armenia"@en 40.1833 44.5167
## 4     "Azerbaijan"@en 40.4167 49.8333
## 5        "Belarus"@en 53.9167 27.5500
## 7       "Bulgaria"@en 42.6833 23.3167
## 17        "Greece"@en 37.9667 23.7167
## 20         "Italy"@en 41.9000 12.4833
## 24       "Moldova"@en 47.0000 28.9167
## 25    "Montenegro"@en 42.7833 19.4667
## 29 "FYR Macedonia"@en 42.0000 21.4333
## 30       "Romania"@en 44.4167 26.1000
## 31        "Russia"@en 55.7500 37.6167
## 32        "Serbia"@en 44.8000 20.4667
## 33      "Slovakia"@en 48.1500 17.1167
## 38       "Ukraine"@en 50.4500 30.5000

And plot them in the map.

qmap('Brussels', zoom=4) +
  geom_point(aes(x=europeanCountriesWithCPIandCoordWorst$long, y=europeanCountriesWithCPIandCoordWorst$lat), 
             data=europeanCountriesWithCPIandCoordWorst, size=europeanCountriesWithCPIandCoordWorst$score, colour="red") + 
  scale_color_manual(values = rainbow(10))

Correlations everywhere…

Is there any correlation between the movies produced by a country and the Corruption Perceptions Index?

Let’s query the Linked Movie Data Base for all the movies from European countries (Note: the “IN” approach doesn’t work for the endpoint, so we change the approach).

Let’s first define the prefixes and the endpoint, and initialize a dataframe for storing the results.

prefixI = c("movie","http://data.linkedmdb.org/page/movie/")

sparql_prefixI = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX movie: <http://data.linkedmdb.org/resource/movie/>
"
endpointI = "http://data.linkedmdb.org/sparql"
optionsI=""

moviesPerCountry = data.frame(country= character(0), numMovies = integer(0))

Then we search for the movies in every country.

for (i in 1:length(europeanCountriesWithCPIandCoord$URI)) {
  qI = paste(sparql_prefixI,"
  SELECT ?country (COUNT(*) AS ?numMovies) WHERE
  { 
    ?countryI rdf:type movie:country .
    ?countryI owl:sameAs ?country .
    ?film movie:country ?countryI .
  FILTER (?country=", europeanCountriesWithCPIandCoord$URI[i] ,")
  }
  GROUP BY ?country
  ")
  
  numberMovies  = SPARQL(endpointI,qI,ns=prefixI,extra=optionsI)$results

  moviesPerCountry = rbind(moviesPerCountry,numberMovies)

  # To avoid being rejected by the endpoint
  Sys.sleep(2)
}

It may happen that the Linked Movie Data SPARQL endpoint is down. In that case, we can generate the number of movies randomly.

for (i in 1:length(europeanCountriesWithCPIandCoord$URI)) {
  numberMovies = data.frame(matrix(ncol = 2, nrow = 1))
  numberMovies[1,1] = europeanCountriesWithCPIandCoord$URI[i]
  numberMovies[1,2] = round(rnorm(1, mean = 500, sd = 200))
  colnames(numberMovies) = c("country","numMovies")

  moviesPerCountry = rbind(moviesPerCountry,numberMovies)
}

We obtained the number of movies per country.

head(moviesPerCountry,10)

##                                         country numMovies
## 1         <http://dbpedia.org/resource/Albania>       245
## 2         <http://dbpedia.org/resource/Armenia>       665
## 3         <http://dbpedia.org/resource/Austria>       382
## 4      <http://dbpedia.org/resource/Azerbaijan>       244
## 5         <http://dbpedia.org/resource/Belarus>       332
## 6         <http://dbpedia.org/resource/Belgium>       783
## 7        <http://dbpedia.org/resource/Bulgaria>       388
## 8         <http://dbpedia.org/resource/Croatia>       199
## 9          <http://dbpedia.org/resource/Cyprus>       466
## 10 <http://dbpedia.org/resource/Czech_Republic>       567

We merge our results with the previous data.

europeanCountriesWithCPICoordMovies = merge(europeanCountriesWithCPIandCoord, moviesPerCountry, 
                                             by.x = "URI", by.y = "country")
head(europeanCountriesWithCPICoordMovies, 10)

##                                             URI    score
## 1         <http://dbpedia.org/resource/Albania> 3.052946
## 2         <http://dbpedia.org/resource/Armenia> 2.628996
## 3         <http://dbpedia.org/resource/Austria> 7.786903
## 4      <http://dbpedia.org/resource/Azerbaijan> 2.375235
## 5         <http://dbpedia.org/resource/Belarus> 2.418323
## 6         <http://dbpedia.org/resource/Belgium> 7.487431
## 7        <http://dbpedia.org/resource/Bulgaria> 3.328966
## 8         <http://dbpedia.org/resource/Croatia> 4.034245
## 9          <http://dbpedia.org/resource/Cyprus> 6.265644
## 10 <http://dbpedia.org/resource/Czech_Republic> 4.368084
##           countryLabel     lat    long numMovies
## 1         "Albania"@en 41.3333 19.8000       245
## 2         "Armenia"@en 40.1833 44.5167       665
## 3         "Austria"@en 48.2000 16.3500       382
## 4      "Azerbaijan"@en 40.4167 49.8333       244
## 5         "Belarus"@en 53.9167 27.5500       332
## 6         "Belgium"@en 50.8500  4.3500       783
## 7        "Bulgaria"@en 42.6833 23.3167       388
## 8         "Croatia"@en 45.8000 16.0000       199
## 9          "Cyprus"@en 35.1667 33.3667       466
## 10 "Czech Republic"@en 50.0833 14.4667       567

Is there any correlation between the number of movies produced by a country in IMDB and the Corruption Perceptions Index in 2011?

plot(europeanCountriesWithCPICoordMovies$score,europeanCountriesWithCPICoordMovies$numMovies,
     main="Number of movies per CPI", xlab = "CPI", ylab="Numer of movies")

Intelligent Data Analysis. Lab 3

Raúl García-Castro (rgarcia@fi.upm.es)

Introduction

Load libraries

Search for corrupt countries

How corrupt is Spain?

What is the situation in Europe?

Integration beyond string matching

Is there any geospatial correlation?

Correlations everywhere…