Textbook: AUTOMATED DATA COLLECTION WITH R

1.1 Case-Study: UNESCO WORLD HERITAGE SITES IN DANGER

Required Packages

library(stringr)
library(XML)
library(RCurl)
## Loading required package: bitops
library(maps)

Parsing from Wikipedia web site

# Download the raw HTML of the Wikipedia list, then parse it into a document
# tree, declaring the text as UTF-8.
wiki_url <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_in_Danger"
htmlData <- getURL(wiki_url)
parsedData <- htmlParse(htmlData, encoding = "UTF-8")

Extract the desired table

# Read the HTML tables of the parsed page (text kept as character rather than
# factor) and select the second one.
tables <- readHTMLTable(parsedData, stringsAsFactors = FALSE)
danger_table <- tables[[2L]]
# Preview the first three rows.
danger_table[seq_len(3), ]
##                                V1    V2
## 1                            Name Image
## 2                        Abu Mena      
## 3 Air and Ténéré Natural Reserves      
##                                                                                                                V3
## 1                                                                                                        Location
## 2                   EgyAbusir, Egypt30°50′30″N 29°39′50″E / 30.84167°N 29.66389°E / 30.84167; 29.66389 (Abu Mena)
## 3 Niger1Arlit Department, Niger18°17′N 8°0′E / 18.283°N 8.000°E / 18.283; 8.000 (Air and Ténéré Natural Reserves)
##                         V4                                         V5
## 1                 Criteria                              Areaha (acre)
## 2            Cultural:(iv)              7006182000000000000♠182 (450)
## 3 Natural:(vii), (ix), (x) 7010773600000000000♠7,736,000 (19,120,000)
##           V6         V7
## 1 Year (WHS) Endangered
## 2       1979      2001–
## 3       1991      1992–
##                                                                                                                                            V8
## 1                                                                                                                                      Reason
## 2                              Cave-ins in the area caused by the clay at the surface, which becomes semi-liquid when met with "excess water"
## 3 Military conflict and civil disturbance in the region as well as a reduction of wildlife population and degradation of the vegetation cover
##             V9
## 1         Refs
## 2 [17][18][19]
## 3     [20][21]

After extracting the HTML table with the information about the endangered sites, its columns are named V1, V2, …, V9, and the first row actually contains what should be the column names: “Name”, “Image”, “Location”, “Criteria”, “Areaha (acre)”, “Year (WHS)”, “Endangered”, “Reason”, “Refs”.

Since we are interested in the columns holding the name, location, criteria, year of inclusion in the list, and year the site was considered ‘endangered’, we subset the table to skip the first row, keep the desired columns, and rename them:

# Skip the header row (row 1), keep rows 2 to 55 and columns 1, 3, 4, 6 and 7,
# then give the remaining columns short names.
danger_table <- danger_table[seq(2, 55), c(1, 3, 4, 6, 7)]
names(danger_table) <- c("name", "locn", "crit", "yins", "yend")
colnames(danger_table)
## [1] "name" "locn" "crit" "yins" "yend"
danger_table[["name"]][seq_len(4)]
## [1] "Abu Mena"                        "Air and Ténéré Natural Reserves"
## [3] "Ancient City of Aleppo"          "Ancient City of Bosra"

Cleanse criteria

# Recode the criteria text into a short label: "nat" when it mentions
# "Natural", otherwise "cult". str_detect() already returns a logical vector,
# so it is used directly as the ifelse() condition; comparing against `T` is
# redundant, and `T` is an ordinary variable that can be reassigned.
danger_table$crit <- ifelse(str_detect(danger_table$crit, "Natural"), "nat", "cult")
danger_table$crit[1:4]
## [1] "cult" "nat"  "cult" "cult"

Cleanse year ins

# Convert the inscription year from text to numeric and show the first four.
danger_table[["yins"]] <- as.numeric(danger_table[["yins"]])
danger_table[["yins"]][seq_len(4)]
## [1] 1979 1991 1986 1980

Cleanse year end

danger_table$yend[18]
## [1] "1993–2007, 2010–"
# Keep only the last listed year: first grab the four digits plus the single
# trailing character at the end of the string, then keep just the digits.
# str_extract() is vectorised and returns exactly one element per row (NA when
# nothing matches), so the result stays aligned with the rows of danger_table.
# unlist(str_extract_all(...)) would silently drop non-matching rows and make
# the vector shorter than the data frame.
yend_clean <- str_extract(danger_table$yend, "[[:digit:]]{4}[^-]$")
yend_clean <- str_extract(yend_clean, "^[[:digit:]]{4}") #or use: yend_clean <- str_sub(yend_clean, 1, -2)
danger_table$yend <- as.numeric(yend_clean)
danger_table$yend[18]
## [1] 2010

The locn variable is a bit of a mess as the example below shows:

# Show the raw location strings of rows 1, 3 and 5.
danger_table[c(1, 3, 5), "locn"]
## [1] "EgyAbusir, Egypt30°50′30″N 29°39′50″E / 30.84167°N 29.66389°E / 30.84167; 29.66389 (Abu Mena)"                            
## [2] "Aleppo Governorate,  Syria36°14′N 37°10′E / 36.233°N 37.167°E / 36.233; 37.167 (Ancient City of Aleppo)"                  
## [3] "Damascus Governorate,  Syria33°30′41″N 36°18′23″E / 33.51139°N 36.30639°E / 33.51139; 36.30639 (Ancient City of Damascus)"

get coordinates

# Patterns for the decimal coordinate pair in each location string, which looks
# like "/ <y>; <x>": reg_y matches from the slash up to and including the
# semicolon, reg_x from the semicolon through the number that follows it.
reg_y <- "[/][ -]*[[:digit:]]*[.]*[[:digit:]]*[;]"
reg_x <- "[;][ -]*[[:digit:]]*[.]*[[:digit:]]*"

# y coordinate: take the match, drop its first two characters and the trailing
# semicolon, and convert what is left to numeric.
y_coords <- as.numeric(str_sub(str_extract(danger_table$locn, reg_y), start = 3, end = -2))
print(y_coords)
##  [1]  30.84167  18.28300  36.23300  32.51806  33.51139  36.33417  32.82500
##  [8]  32.63833  32.80528  35.45667  -8.11111  31.70444 -19.58361  11.41700
## [15]  34.78167  34.83194 -11.68306  25.31700   9.55389   4.00000  35.58806
## [22]  31.52417  39.05000  48.20000  14.20000 -20.20833  -2.50000   3.05222
## [29]  53.40667   9.00000  34.39667  42.66111   7.60000   6.83972  13.00000
## [36]   2.00000  31.77667  15.35556  30.13333  13.90639  31.71972  15.92694
## [43] -14.46700  15.74444  24.83300  -2.00000  34.20000  -9.00000  34.55417
## [50]  16.77333  16.28972   0.32917  -2.50000   0.91700
danger_table[["y_coords"]] <- y_coords

# x coordinate: same approach, dropping only the first two characters.
x_coords <- as.numeric(str_sub(str_extract(danger_table$locn, reg_x), start = 3, end = -1))
print(x_coords)
##  [1]  29.66389   8.00000  37.16700  36.48167  36.30639  36.84417  21.85833
##  [8]  14.29306  12.48500  43.26250 -79.07500  35.20750 -65.75306 -69.66700
## [15]  36.26306  67.82667 160.18306 -80.93300 -79.65583  29.25000  42.71833
## [22]  35.10889  66.83333  16.36700  43.31700 -69.79444  28.75000  36.50361
## [29]  -2.84444  21.50000  64.51611  20.26556  -8.38300 158.33083 -12.66700
## [36]  28.50000  35.23417  44.20806   9.50000  -4.55500  35.13056  48.62667
## [43]  49.70000 -84.67500  10.33300  21.00000  43.86700  37.40000  38.26667
## [50]  -2.99944  -0.04444  32.55333 101.50000  29.16700
danger_table[["x_coords"]] <- x_coords
colnames(danger_table)
## [1] "name"     "locn"     "crit"     "yins"     "yend"     "y_coords"
## [7] "x_coords"
# The raw location text is no longer needed once the coordinates are stored.
danger_table[["locn"]] <- NULL

The data in the table are now completely clean

# Report the number of rows and columns, then preview the first six rows.
c(nrow(danger_table), ncol(danger_table))
## [1] 54  6
head(danger_table, n = 6L)
##                                 name crit yins yend y_coords x_coords
## 2                           Abu Mena cult 1979 2001 30.84167 29.66389
## 3    Air and Ténéré Natural Reserves  nat 1991 1992 18.28300  8.00000
## 4             Ancient City of Aleppo cult 1986 2013 36.23300 37.16700
## 5              Ancient City of Bosra cult 1980 2013 32.51806 36.48167
## 6           Ancient City of Damascus cult 1979 2013 33.51139 36.30639
## 7 Ancient Villages of Northern Syria cult 2011 2013 36.33417 36.84417

Plot endangered heritage sites

# Draw a world map and overlay the sites, using plotting symbol 19 for sites
# coded "nat" and symbol 2 for all others.
par(oma = rep(0, 4), mar = rep(0, 4))
pch <- ifelse(danger_table$crit == "nat", 19, 2)
map(database = "world", col = "darkgrey", lwd = 0.5, mar = rep(0.1, 4))
title(main = "Location of UNESCO World Heritage Sites \n")
points(x = danger_table$x_coords, y = danger_table$y_coords, pch = pch)
box()

Plot year of endangerment

# Count the sites per criteria label.
table(danger_table[["crit"]])
## 
## cult  nat 
##   38   16
#pdf(file="heritage-hist1.pdf", height=3.3, width=7, family="URWTimes")
# Histogram (counts) of the year each site was put on the endangered list.
par(oma = rep(0, 4), mar = c(4, 4, 1, 0.5))
hist(
  x = danger_table$yend,
  freq = TRUE,
  xlab = "Year when site was put on the list of endangered sites",
  main = ""
)
box()

Plot time between inscription and endangerment

# Years elapsed between a site's inscription (yins) and the year it was put on
# the endangered list (yend).
duration <- danger_table$yend - danger_table$yins
#pdf(file="heritage-hist2.pdf", height=3.3, width=7, family="URWTimes")
par(oma=c(0,0,0,0))
par(mar=c(4,4,1,.5))
# Axis label corrected: the original read "Number of ears".
hist(duration, freq=TRUE, xlab="Number of years to become an endangered site", main="")
box()