The goal of this document is to show a sample script in which Linked Data on the Web are easily integrated to perform some analyses.
Load the libraries for performing SPARQL queries and for invoking the Google Maps API.
library(SPARQL)
library(ggmap)
Let’s query the 270a SPARQL endpoint for all the country scores of the Corruption Perceptions Index in 2011.
First we define the required namespaces.
prefixT = c("sdmx-dimension","http://purl.org/linked-data/sdmx/2009/dimension#",
"property","http://transparency.270a.info/property/",
"indicator","http://transparency.270a.info/classification/indicator/",
"year","http://reference.data.gov.uk/id/year/",
"skos","http://www.w3.org/2004/02/skos/core#")
sparql_prefixT = "
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sdmx-dimension: <http://purl.org/linked-data/sdmx/2009/dimension#>
PREFIX property: <http://transparency.270a.info/property/>
PREFIX indicator: <http://transparency.270a.info/classification/indicator/>
PREFIX year: <http://reference.data.gov.uk/id/year/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
"
Then we define a query to get all the country labels with their scores.
qT = paste(sparql_prefixT,"
SELECT ?countryLabel ?score
WHERE {
?s sdmx-dimension:refPeriod year:2011 ;
property:indicator indicator:corruption-perceptions-index ;
property:score ?score ;
sdmx-dimension:refArea ?countryURI .
?countryURI skos:prefLabel ?countryLabel .
}
ORDER BY ?score
")
And submit it to the SPARQL endpoint.
endpointT = "http://transparency.270a.info/sparql"
optionsT=""
countryScores = SPARQL(endpointT,qT,ns=prefixT,extra=optionsT)$results
Let’s take a look at the output of the query.
Is a lower index better or worse?
head(countryScores)
## countryLabel score
## 1 "Somalia"@en 0.9834057
## 2 "Korea (North)"@en 1.0096260
## 3 "Myanmar"@en 1.4902960
## 4 "Afghanistan"@en 1.5170930
## 5 "Sudan"@en 1.5637580
## 6 "Turkmenistan"@en 1.6031730
tail(countryScores)
## countryLabel score
## 178 "Norway"@en 8.989854
## 179 "Singapore"@en 9.166839
## 180 "Sweden"@en 9.298488
## 181 "Denmark"@en 9.392080
## 182 "Finland"@en 9.403927
## 183 "New Zealand"@en 9.462681
We have a lot of countries, let’s visualize the country scores through a histogram.
hist(countryScores$score, main="Corruption Perceptions Index 2011", xlab = "CPI")
If we take a look to the labels that we have retreived, we can see that we need to normalize them.
head(countryScores$countryLabel,20)
## [1] "\"Somalia\"@en"
## [2] "\"Korea (North)\"@en"
## [3] "\"Myanmar\"@en"
## [4] "\"Afghanistan\"@en"
## [5] "\"Sudan\"@en"
## [6] "\"Turkmenistan\"@en"
## [7] "\"Uzbekistan\"@en"
## [8] "\"Haiti\"@en"
## [9] "\"Iraq\"@en"
## [10] "\"Venezuela\"@en"
## [11] "\"Equatorial Guinea\"@en"
## [12] "\"Burundi\"@en"
## [13] "\"Angola\"@en"
## [14] "\"Libya\"@en"
## [15] "\"Democratic Republic of the Congo\"@en"
## [16] "\"Chad\"@en"
## [17] "\"Yemen\"@en"
## [18] "\"Guinea\"@en"
## [19] "\"Cambodia\"@en"
## [20] "\"Kyrgyzstan\"@en"
Let’s clean the country labels to remove the language tag (@en) and quotes (“).
countryScores$countryLabel = gsub("@en","",countryScores$countryLabel)
countryScores$countryLabel = gsub("\"","",countryScores$countryLabel)
head(countryScores$countryLabel,20)
## [1] "Somalia" "Korea (North)"
## [3] "Myanmar" "Afghanistan"
## [5] "Sudan" "Turkmenistan"
## [7] "Uzbekistan" "Haiti"
## [9] "Iraq" "Venezuela"
## [11] "Equatorial Guinea" "Burundi"
## [13] "Angola" "Libya"
## [15] "Democratic Republic of the Congo" "Chad"
## [17] "Yemen" "Guinea"
## [19] "Cambodia" "Kyrgyzstan"
Which was Spain’s Corruption Perceptions Index in 2011?
SpainCPI2011 = countryScores[countryScores$countryLabel=="Spain",]
points(SpainCPI2011$score,20,pch=19, col="red")
text(SpainCPI2011$score,23,labels=SpainCPI2011$countryLabel, col="red")
Now let’s plot the CPI of another country.
OtherCPI2011 = countryScores[countryScores$countryLabel=="Greece",]
points(OtherCPI2011$score,30,pch=19, col="red")
text(OtherCPI2011$score,33,labels=OtherCPI2011$countryLabel, col="red")
Let’s query DBpedia for all the countries in Europe.
First we define the namespaces to be used.
prefixD = c("yago","http://dbpedia.org/class/yago/",
"dbpedia-owl","http://dbpedia.org/ontology/")
sparql_prefixD = "
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX yago: <http://dbpedia.org/class/yago/>
PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
"
And we define the query to retrieve every country in Europe.
qD = paste(sparql_prefixD,"
SELECT ?country WHERE {
?place rdf:type yago:EuropeanCountries .
?place rdf:type dbpedia-owl:Country .
?place rdfs:label ?country .
FILTER(LANGMATCHES(LANG(?country), \"en\"))
}
")
We submit the query to the DBpedia SPARQL endpoint.
endpointD = "http://dbpedia.org/sparql"
optionsD=""
europeanCountries = SPARQL(endpointD,qD,ns=prefixD,extra=optionsD)$results
And take a look at the output of the query.
europeanCountries
## country country.1 country.2 country.3
## 1 "Georgia (country)"@en "Albania"@en "Andorra"@en "Armenia"@en
## country.4 country.5 country.6 country.7 country.8
## 1 "Austria"@en "Azerbaijan"@en "Belarus"@en "Belgium"@en "Bulgaria"@en
## country.9 country.10 country.11 country.12 country.13
## 1 "Croatia"@en "Cyprus"@en "Czech Republic"@en "Denmark"@en "Estonia"@en
## country.14 country.15 country.16 country.17 country.18
## 1 "Finland"@en "France"@en "Germany"@en "Greece"@en "Hungary"@en
## country.19 country.20 country.21 country.22 country.23
## 1 "Iceland"@en "Italy"@en "Latvia"@en "Liechtenstein"@en "Lithuania"@en
## country.24 country.25 country.26 country.27
## 1 "Luxembourg"@en "Moldova"@en "Monaco"@en "Montenegro"@en
## country.28 country.29 country.30 country.31 country.32
## 1 "Netherlands"@en "Poland"@en "Portugal"@en "Romania"@en "Russia"@en
## country.33 country.34 country.35 country.36 country.37
## 1 "San Marino"@en "Serbia"@en "Slovakia"@en "Slovenia"@en "Spain"@en
## country.38 country.39 country.40 country.41
## 1 "Sweden"@en "Switzerland"@en "Ukraine"@en "United Kingdom"@en
## country.42 country.43 country.44
## 1 "Northern Cyprus"@en "Republic of Ireland"@en "Vatican City"@en
## country.45
## 1 "Republic of Macedonia"@en
We obtained the data in a row.
dim(europeanCountries)
## [1] 1 46
Let’s transpose it to a column.
europeanCountries=as.data.frame(t(europeanCountries))
colnames(europeanCountries)[1] = "countryLabel"
head(europeanCountries,10)
## countryLabel
## country "Georgia (country)"@en
## country.1 "Albania"@en
## country.2 "Andorra"@en
## country.3 "Armenia"@en
## country.4 "Austria"@en
## country.5 "Azerbaijan"@en
## country.6 "Belarus"@en
## country.7 "Belgium"@en
## country.8 "Bulgaria"@en
## country.9 "Croatia"@en
Now we clean the country label to remove the language tag (@en) and quotes (“).
europeanCountries$countryLabel = gsub("@en","",europeanCountries$countryLabel)
europeanCountries$countryLabel = gsub("\"","",europeanCountries$countryLabel)
head(europeanCountries,10)
## countryLabel
## country Georgia (country)
## country.1 Albania
## country.2 Andorra
## country.3 Armenia
## country.4 Austria
## country.5 Azerbaijan
## country.6 Belarus
## country.7 Belgium
## country.8 Bulgaria
## country.9 Croatia
We also need to clean the Georgia and Ireland labels.
europeanCountries$countryLabel = gsub(" \\(country\\)","",europeanCountries$countryLabel)
europeanCountries$countryLabel = gsub("Republic of ","",europeanCountries$countryLabel)
head(europeanCountries)
## countryLabel
## country Georgia
## country.1 Albania
## country.2 Andorra
## country.3 Armenia
## country.4 Austria
## country.5 Azerbaijan
We have two data frames to merge: the one with the country scores and the one with the European countries.
head(countryScores)
## countryLabel score
## 1 Somalia 0.9834057
## 2 Korea (North) 1.0096260
## 3 Myanmar 1.4902960
## 4 Afghanistan 1.5170930
## 5 Sudan 1.5637580
## 6 Turkmenistan 1.6031730
head(europeanCountries)
## countryLabel
## country Georgia
## country.1 Albania
## country.2 Andorra
## country.3 Armenia
## country.4 Austria
## country.5 Azerbaijan
Let’s merge the data frames using the values of their countryLabel columns.
europeanCountriesWithCPI = merge(countryScores, europeanCountries,
by.x = "countryLabel", by.y = "countryLabel")
head(europeanCountriesWithCPI,10)
## countryLabel score
## 1 Albania 3.052946
## 2 Armenia 2.628996
## 3 Austria 7.786903
## 4 Azerbaijan 2.375235
## 5 Belarus 2.418323
## 6 Belgium 7.487431
## 7 Bulgaria 3.328966
## 8 Croatia 4.034245
## 9 Cyprus 6.265644
## 10 Czech Republic 4.368084
Do we have CPIs for all European countries?
europeanCountriesWithCPI.allY = merge(countryScores, europeanCountries,
by.x = "countryLabel", by.y = "countryLabel",
all.y=TRUE)
europeanCountriesWithCPI.allY[is.na(europeanCountriesWithCPI.allY$score),]
## countryLabel score
## 2 Andorra NA
## 24 Liechtenstein NA
## 27 Macedonia NA
## 29 Monaco NA
## 32 Northern Cyprus NA
## 37 San Marino NA
## 46 Vatican City NA
Let’s visualize the corruption scores along Europe.
hist(europeanCountriesWithCPI$score, main="Corruption Perceptions Index 2011 in Europe", xlab = "CPI")
And now we plot Spain’s CPI and the CPI of another country.
points(SpainCPI2011$score,5,pch=19, col="red")
text(SpainCPI2011$score,5.5,labels=SpainCPI2011$countryLabel, col="red")
points(OtherCPI2011$score,5,pch=19, col="red")
text(OtherCPI2011$score,5.5,labels=OtherCPI2011$countryLabel, col="red")
Draw the two graphs (World and Europe) at once.
With par we define a new graph configuration to include two plots in a graph. At the end we restore it to include only one graph.
par(mfrow=c(1, 2))
hist(countryScores$score, main="World Corruption Perceptions Index 2011", xlab = "CPI")
points(SpainCPI2011$score,20,pch=19, col="red")
text(SpainCPI2011$score,23,labels=SpainCPI2011$countryLabel, col="red")
points(OtherCPI2011$score,30,pch=19, col="red")
text(OtherCPI2011$score,33,labels=OtherCPI2011$countryLabel, col="red")
hist(europeanCountriesWithCPI$score, main="Europe Corruption Perceptions Index 2011", xlab = "CPI")
points(SpainCPI2011$score,5,pch=19, col="red")
text(SpainCPI2011$score,5.5,labels=SpainCPI2011$countryLabel, col="red")
points(OtherCPI2011$score,5,pch=19, col="red")
text(OtherCPI2011$score,5.5,labels=OtherCPI2011$countryLabel, col="red")
par(mfrow=c(1, 1))
Let’s query again the 720a data set for all the country scores of the Corruption Perceptions Index in 2011, but now we also retrieve the links to DBpedia.
We can reuse the same prefixes as before.
qT2 = paste(sparql_prefixT,"
SELECT ?score ?countryLabel ?URI
WHERE {
?s sdmx-dimension:refPeriod year:2011 ;
property:indicator indicator:corruption-perceptions-index ;
property:score ?score ;
sdmx-dimension:refArea ?countryURI .
?countryURI skos:prefLabel ?countryLabel ;
owl:sameAs ?URI .
FILTER (REGEX(STR(?URI),\"dbpedia\"))
}
ORDER BY ?score
")
Let’s run the query and see the output.
countryScoresWLinks = SPARQL(endpointT,qT2,ns=prefixT,extra=optionsT)$results
head(countryScoresWLinks,10)
## score countryLabel URI
## 1 0.9834057 "Somalia"@en <http://dbpedia.org/resource/Somalia>
## 2 1.0096260 "Korea (North)"@en <http://dbpedia.org/resource/North_Korea>
## 3 1.4902960 "Myanmar"@en <http://dbpedia.org/resource/Burma>
## 4 1.5170930 "Afghanistan"@en <http://dbpedia.org/resource/Afghanistan>
## 5 1.5637580 "Sudan"@en <http://dbpedia.org/resource/Sudan>
## 6 1.6031730 "Turkmenistan"@en <http://dbpedia.org/resource/Turkmenistan>
## 7 1.6245630 "Uzbekistan"@en <http://dbpedia.org/resource/Uzbekistan>
## 8 1.7998830 "Haiti"@en <http://dbpedia.org/resource/Haiti>
## 9 1.8041280 "Iraq"@en <http://dbpedia.org/resource/Iraq>
## 10 1.8894970 "Venezuela"@en <http://dbpedia.org/resource/Venezuela>
And now we query DBpedia for all the URIs of the countries in Europe.
qD2 = paste(sparql_prefixD,"
SELECT ?place WHERE {
?place rdf:type yago:EuropeanCountries .
?place rdf:type dbpedia-owl:Country .
}
")
Let’s run the query and see the output.
europeanCountriesWURIs = SPARQL(endpointD,qD2,ns=prefixD,extra=optionsD)$results
europeanCountriesWURIs[,1:10]
## place
## 1 <http://dbpedia.org/resource/Georgia_(country)>
## place.1
## 1 <http://dbpedia.org/resource/Albania>
## place.2
## 1 <http://dbpedia.org/resource/Andorra>
## place.3
## 1 <http://dbpedia.org/resource/Armenia>
## place.4
## 1 <http://dbpedia.org/resource/Austria>
## place.5
## 1 <http://dbpedia.org/resource/Azerbaijan>
## place.6
## 1 <http://dbpedia.org/resource/Belarus>
## place.7
## 1 <http://dbpedia.org/resource/Belgium>
## place.8
## 1 <http://dbpedia.org/resource/Bulgaria>
## place.9
## 1 <http://dbpedia.org/resource/Croatia>
Let’s transpose it into a column.
europeanCountriesWURIs=as.data.frame(t(europeanCountriesWURIs))
colnames(europeanCountriesWURIs)[1] = "countryURI"
head(europeanCountriesWURIs)
## countryURI
## place <http://dbpedia.org/resource/Georgia_(country)>
## place.1 <http://dbpedia.org/resource/Albania>
## place.2 <http://dbpedia.org/resource/Andorra>
## place.3 <http://dbpedia.org/resource/Armenia>
## place.4 <http://dbpedia.org/resource/Austria>
## place.5 <http://dbpedia.org/resource/Azerbaijan>
Now we merge the data using the URIs instead of the labels.
europeanCountriesWithCPIv2 = merge(countryScoresWLinks, europeanCountriesWURIs, by.x = "URI", by.y = "countryURI")
head(europeanCountriesWithCPIv2,10)
## URI score
## 1 <http://dbpedia.org/resource/Albania> 3.052946
## 2 <http://dbpedia.org/resource/Armenia> 2.628996
## 3 <http://dbpedia.org/resource/Austria> 7.786903
## 4 <http://dbpedia.org/resource/Azerbaijan> 2.375235
## 5 <http://dbpedia.org/resource/Belarus> 2.418323
## 6 <http://dbpedia.org/resource/Belgium> 7.487431
## 7 <http://dbpedia.org/resource/Bulgaria> 3.328966
## 8 <http://dbpedia.org/resource/Croatia> 4.034245
## 9 <http://dbpedia.org/resource/Cyprus> 6.265644
## 10 <http://dbpedia.org/resource/Czech_Republic> 4.368084
## countryLabel
## 1 "Albania"@en
## 2 "Armenia"@en
## 3 "Austria"@en
## 4 "Azerbaijan"@en
## 5 "Belarus"@en
## 6 "Belgium"@en
## 7 "Bulgaria"@en
## 8 "Croatia"@en
## 9 "Cyprus"@en
## 10 "Czech Republic"@en
With these data we obtain the same histogram.
hist(europeanCountriesWithCPIv2$score, main="Corruption Perceptions Index 2011 in Europe", xlab = "CPI")
points(SpainCPI2011$score,5,pch=19, col="red")
text(SpainCPI2011$score,5.5,labels=SpainCPI2011$countryLabel, col="red")
points(OtherCPI2011$score,5,pch=19, col="red")
text(OtherCPI2011$score,5.5,labels=OtherCPI2011$countryLabel, col="red")
Let’s show the most corrupt countries in a map.
Let’s query DBpedia for all the countries in Europe with their URI and coordinates.
qD3 = paste(sparql_prefixD,"
SELECT ?place ?lat ?long WHERE {
?place rdf:type yago:EuropeanCountries .
?place rdf:type dbpedia-owl:Country .
?place geo:lat ?lat .
?place geo:long ?long .
}
")
Let’s run the query and see the output.
europeanCountriesWCoord = SPARQL(endpointD,qD3,ns=prefixD,extra=optionsD)$results
head(europeanCountriesWCoord,10)
## place lat long
## 1 <http://dbpedia.org/resource/Georgia_(country)> 41.7167 44.78330
## 2 <http://dbpedia.org/resource/Albania> 41.3333 19.80000
## 3 <http://dbpedia.org/resource/Andorra> 42.5000 1.51667
## 4 <http://dbpedia.org/resource/Armenia> 40.1833 44.51670
## 5 <http://dbpedia.org/resource/Austria> 48.2000 16.35000
## 6 <http://dbpedia.org/resource/Azerbaijan> 40.4167 49.83330
## 7 <http://dbpedia.org/resource/Belarus> 53.9167 27.55000
## 8 <http://dbpedia.org/resource/Belgium> 50.8500 4.35000
## 9 <http://dbpedia.org/resource/Bulgaria> 42.6833 23.31670
## 10 <http://dbpedia.org/resource/Croatia> 45.8000 16.00000
Now we merge the results with the previous data.
europeanCountriesWithCPIandCoord = merge(europeanCountriesWithCPIv2, europeanCountriesWCoord,
by.x = "URI", by.y = "place")
head(europeanCountriesWithCPIandCoord,10)
## URI score
## 1 <http://dbpedia.org/resource/Albania> 3.052946
## 2 <http://dbpedia.org/resource/Armenia> 2.628996
## 3 <http://dbpedia.org/resource/Austria> 7.786903
## 4 <http://dbpedia.org/resource/Azerbaijan> 2.375235
## 5 <http://dbpedia.org/resource/Belarus> 2.418323
## 6 <http://dbpedia.org/resource/Belgium> 7.487431
## 7 <http://dbpedia.org/resource/Bulgaria> 3.328966
## 8 <http://dbpedia.org/resource/Croatia> 4.034245
## 9 <http://dbpedia.org/resource/Cyprus> 6.265644
## 10 <http://dbpedia.org/resource/Czech_Republic> 4.368084
## countryLabel lat long
## 1 "Albania"@en 41.3333 19.8000
## 2 "Armenia"@en 40.1833 44.5167
## 3 "Austria"@en 48.2000 16.3500
## 4 "Azerbaijan"@en 40.4167 49.8333
## 5 "Belarus"@en 53.9167 27.5500
## 6 "Belgium"@en 50.8500 4.3500
## 7 "Bulgaria"@en 42.6833 23.3167
## 8 "Croatia"@en 45.8000 16.0000
## 9 "Cyprus"@en 35.1667 33.3667
## 10 "Czech Republic"@en 50.0833 14.4667
We can put all these data in a map by calling the Google Maps API.
qmap('Brussels', zoom=4) +
geom_point(aes(x=europeanCountriesWithCPIandCoord$long,
y=europeanCountriesWithCPIandCoord$lat),
data=europeanCountriesWithCPIandCoord, size=europeanCountriesWithCPIandCoord$score,
colour="red") +
scale_color_manual(values = rainbow(10))
Let’s show the most corrupt ones (CPI<4).
europeanCountriesWithCPIandCoordWorst = europeanCountriesWithCPIandCoord[europeanCountriesWithCPIandCoord$score<4,]
europeanCountriesWithCPIandCoordWorst
## URI score
## 1 <http://dbpedia.org/resource/Albania> 3.052946
## 2 <http://dbpedia.org/resource/Armenia> 2.628996
## 4 <http://dbpedia.org/resource/Azerbaijan> 2.375235
## 5 <http://dbpedia.org/resource/Belarus> 2.418323
## 7 <http://dbpedia.org/resource/Bulgaria> 3.328966
## 17 <http://dbpedia.org/resource/Greece> 3.388967
## 20 <http://dbpedia.org/resource/Italy> 3.906913
## 24 <http://dbpedia.org/resource/Moldova> 2.878946
## 25 <http://dbpedia.org/resource/Montenegro> 3.969978
## 29 <http://dbpedia.org/resource/Republic_of_Macedonia> 3.942666
## 30 <http://dbpedia.org/resource/Romania> 3.612146
## 31 <http://dbpedia.org/resource/Russia> 2.448757
## 32 <http://dbpedia.org/resource/Serbia> 3.312795
## 33 <http://dbpedia.org/resource/Slovakia> 3.971617
## 38 <http://dbpedia.org/resource/Ukraine> 2.296875
## countryLabel lat long
## 1 "Albania"@en 41.3333 19.8000
## 2 "Armenia"@en 40.1833 44.5167
## 4 "Azerbaijan"@en 40.4167 49.8333
## 5 "Belarus"@en 53.9167 27.5500
## 7 "Bulgaria"@en 42.6833 23.3167
## 17 "Greece"@en 37.9667 23.7167
## 20 "Italy"@en 41.9000 12.4833
## 24 "Moldova"@en 47.0000 28.9167
## 25 "Montenegro"@en 42.7833 19.4667
## 29 "FYR Macedonia"@en 42.0000 21.4333
## 30 "Romania"@en 44.4167 26.1000
## 31 "Russia"@en 55.7500 37.6167
## 32 "Serbia"@en 44.8000 20.4667
## 33 "Slovakia"@en 48.1500 17.1167
## 38 "Ukraine"@en 50.4500 30.5000
And plot them in the map.
qmap('Brussels', zoom=4) +
geom_point(aes(x=europeanCountriesWithCPIandCoordWorst$long, y=europeanCountriesWithCPIandCoordWorst$lat),
data=europeanCountriesWithCPIandCoordWorst, size=europeanCountriesWithCPIandCoordWorst$score, colour="red") +
scale_color_manual(values = rainbow(10))
Is there any correlation between the movies produced by a country and the Corruption Perceptions Index?
Let’s query the Linked Movie Data Base for all the movies from European countries (Note: the “IN” approach doesn’t work for the endpoint, so we change the approach).
Let’s first define the prefixes and the endpoint, and initialize a dataframe for storing the results.
prefixI = c("movie","http://data.linkedmdb.org/page/movie/")
sparql_prefixI = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX movie: <http://data.linkedmdb.org/resource/movie/>
"
endpointI = "http://data.linkedmdb.org/sparql"
optionsI=""
moviesPerCountry = data.frame(country= character(0), numMovies = integer(0))
Then we search for the movies in every country.
for (i in 1:length(europeanCountriesWithCPIandCoord$URI)) {
qI = paste(sparql_prefixI,"
SELECT ?country (COUNT(*) AS ?numMovies) WHERE
{
?countryI rdf:type movie:country .
?countryI owl:sameAs ?country .
?film movie:country ?countryI .
FILTER (?country=", europeanCountriesWithCPIandCoord$URI[i] ,")
}
GROUP BY ?country
")
numberMovies = SPARQL(endpointI,qI,ns=prefixI,extra=optionsI)$results
moviesPerCountry = rbind(moviesPerCountry,numberMovies)
# To avoid being rejected by the endpoint
Sys.sleep(2)
}
It may happen that the Linked Movie Data SPARQL endpoint is down. In that case, we can generate the number of movies randomly.
for (i in 1:length(europeanCountriesWithCPIandCoord$URI)) {
numberMovies = data.frame(matrix(ncol = 2, nrow = 1))
numberMovies[1,1] = europeanCountriesWithCPIandCoord$URI[i]
numberMovies[1,2] = round(rnorm(1, mean = 500, sd = 200))
colnames(numberMovies) = c("country","numMovies")
moviesPerCountry = rbind(moviesPerCountry,numberMovies)
}
We obtained the number of movies per country.
head(moviesPerCountry,10)
## country numMovies
## 1 <http://dbpedia.org/resource/Albania> 245
## 2 <http://dbpedia.org/resource/Armenia> 665
## 3 <http://dbpedia.org/resource/Austria> 382
## 4 <http://dbpedia.org/resource/Azerbaijan> 244
## 5 <http://dbpedia.org/resource/Belarus> 332
## 6 <http://dbpedia.org/resource/Belgium> 783
## 7 <http://dbpedia.org/resource/Bulgaria> 388
## 8 <http://dbpedia.org/resource/Croatia> 199
## 9 <http://dbpedia.org/resource/Cyprus> 466
## 10 <http://dbpedia.org/resource/Czech_Republic> 567
We merge our results with the previous data.
europeanCountriesWithCPICoordMovies = merge(europeanCountriesWithCPIandCoord, moviesPerCountry,
by.x = "URI", by.y = "country")
head(europeanCountriesWithCPICoordMovies, 10)
## URI score
## 1 <http://dbpedia.org/resource/Albania> 3.052946
## 2 <http://dbpedia.org/resource/Armenia> 2.628996
## 3 <http://dbpedia.org/resource/Austria> 7.786903
## 4 <http://dbpedia.org/resource/Azerbaijan> 2.375235
## 5 <http://dbpedia.org/resource/Belarus> 2.418323
## 6 <http://dbpedia.org/resource/Belgium> 7.487431
## 7 <http://dbpedia.org/resource/Bulgaria> 3.328966
## 8 <http://dbpedia.org/resource/Croatia> 4.034245
## 9 <http://dbpedia.org/resource/Cyprus> 6.265644
## 10 <http://dbpedia.org/resource/Czech_Republic> 4.368084
## countryLabel lat long numMovies
## 1 "Albania"@en 41.3333 19.8000 245
## 2 "Armenia"@en 40.1833 44.5167 665
## 3 "Austria"@en 48.2000 16.3500 382
## 4 "Azerbaijan"@en 40.4167 49.8333 244
## 5 "Belarus"@en 53.9167 27.5500 332
## 6 "Belgium"@en 50.8500 4.3500 783
## 7 "Bulgaria"@en 42.6833 23.3167 388
## 8 "Croatia"@en 45.8000 16.0000 199
## 9 "Cyprus"@en 35.1667 33.3667 466
## 10 "Czech Republic"@en 50.0833 14.4667 567
Is there any correlation between the number of movies produced by a country in IMDB and the Corruption Perceptions Index in 2011?
plot(europeanCountriesWithCPICoordMovies$score,europeanCountriesWithCPICoordMovies$numMovies,
main="Number of movies per CPI", xlab = "CPI", ylab="Numer of movies")