Textbook: AUTOMATED DATA COLLECTION WITH R
1.1 Case-Study: UNESCO WORLD HERITAGE SITES IN DANGER
library(stringr)
library(XML)
library(RCurl)
## Loading required package: bitops
library(maps)
# Download the Wikipedia page listing World Heritage sites in danger and
# parse it into an HTML document tree.
htmlData <- getURL(
  url = "https://en.wikipedia.org/wiki/List_of_World_Heritage_in_Danger"
)
parsedData <- htmlParse(file = htmlData, encoding = "UTF-8")
# Read every <table> on the page into a list of data frames, keeping text
# columns as character; the table used below is the second one in that list.
tables <- readHTMLTable(doc = parsedData, stringsAsFactors = FALSE)
danger_table <- tables[[2]]
# Preview the first three rows of the raw table.
head(danger_table, n = 3)
## V1 V2
## 1 Name Image
## 2 Abu Mena
## 3 Air and Ténéré Natural Reserves
## V3
## 1 Location
## 2 EgyAbusir, Egypt30°50′30″N 29°39′50″E / 30.84167°N 29.66389°E / 30.84167; 29.66389 (Abu Mena)
## 3 Niger1Arlit Department, Niger18°17′N 8°0′E / 18.283°N 8.000°E / 18.283; 8.000 (Air and Ténéré Natural Reserves)
## V4 V5
## 1 Criteria Areaha (acre)
## 2 Cultural:(iv) 7006182000000000000♠182 (450)
## 3 Natural:(vii), (ix), (x) 7010773600000000000♠7,736,000 (19,120,000)
## V6 V7
## 1 Year (WHS) Endangered
## 2 1979 2001–
## 3 1991 1992–
## V8
## 1 Reason
## 2 Cave-ins in the area caused by the clay at the surface, which becomes semi-liquid when met with "excess water"
## 3 Military conflict and civil disturbance in the region as well as a reduction of wildlife population and degradation of the vegetation cover
## V9
## 1 Refs
## 2 [17][18][19]
## 3 [20][21]
After extracting the HTML table with the information about the endangered sites, its columns are named V1, V2, …, V9, and the first row actually contains what should be the column names: “Name”, “Image”, “Location”, “Criteria”, “Areaha (acre)”, “Year (WHS)”, “Endangered”, “Reason”, “Refs”.
Since we are only interested in the name, location, criteria, year of inclusion in the list and year the site was declared ‘endangered’, we subset the table to skip the first (header) row, keep the desired columns and rename them:
# Keep only the columns needed for the analysis and give them short names:
#   1 = name, 3 = location, 4 = criteria,
#   6 = year inscribed (WHS), 7 = year(s) listed as endangered.
# Row 1 holds the scraped header ("Name", "Image", ...), so it is dropped with
# a negative index instead of a hard-coded `2:55` range; a fixed range
# truncates the table, or pads it with all-NA rows, as soon as the list on
# the page has a different number of entries.
danger_table <- danger_table[-1, c(1, 3, 4, 6, 7)]
colnames(danger_table) <- c("name", "locn", "crit", "yins", "yend")
names(danger_table)
## [1] "name" "locn" "crit" "yins" "yend"
danger_table$name[1:4]
## [1] "Abu Mena" "Air and Ténéré Natural Reserves"
## [3] "Ancient City of Aleppo" "Ancient City of Bosra"
# Collapse the criteria text to a two-level code: "nat" when it mentions
# "Natural", "cult" otherwise. str_detect() already returns a logical vector,
# so it is used directly as the condition; comparing it against `T` was
# redundant, and `T` (unlike `TRUE`) is an ordinary variable that can be
# reassigned.
danger_table$crit <- ifelse(
  str_detect(danger_table$crit, "Natural"),
  "nat",
  "cult"
)
danger_table$crit[1:4]
## [1] "cult" "nat" "cult" "cult"
# The year of inscription is scraped as text; convert it to numeric.
danger_table$yins <- as.numeric(danger_table$yins)
danger_table$yins[1:4]
## [1] 1979 1991 1986 1980
# Some sites were listed as endangered more than once, e.g.:
danger_table$yend[18]
## [1] "1993–2007, 2010–"
# Keep the most recent listing year: the last run of four digits that is
# followed only by non-digit characters up to the end of the string.
# str_extract() returns exactly one value per row (NA when nothing matches),
# so the result always lines up with the data frame. The previous
# unlist(str_extract_all(...)) approach silently dropped non-matching rows,
# which shifts or breaks the assignment below, and its `[^-]` class tested
# for an ASCII hyphen although the data uses an en dash.
yend_clean <- str_extract(danger_table$yend, "\\d{4}(?=\\D*$)")
danger_table$yend <- as.numeric(yend_clean)
danger_table$yend[18]
## [1] 2010
The locn variable is a bit of a mess as the example below shows:
# Show the raw location strings of the 1st, 3rd and 5th site.
danger_table[c(1, 3, 5), "locn"]
## [1] "EgyAbusir, Egypt30°50′30″N 29°39′50″E / 30.84167°N 29.66389°E / 30.84167; 29.66389 (Abu Mena)"
## [2] "Aleppo Governorate, Syria36°14′N 37°10′E / 36.233°N 37.167°E / 36.233; 37.167 (Ancient City of Aleppo)"
## [3] "Damascus Governorate, Syria33°30′41″N 36°18′23″E / 33.51139°N 36.30639°E / 33.51139; 36.30639 (Ancient City of Damascus)"
# Patterns for the decimal coordinate pair near the end of each location
# string, e.g. "... / 30.84167; 29.66389 (Abu Mena)":
#   reg_y matches "/ <first number>;"  (the latitude part),
#   reg_x matches "; <second number>"  (the longitude part).
reg_y <- "[/][ -]*[[:digit:]]*[.]*[[:digit:]]*[;]"
reg_x <- "[;][ -]*[[:digit:]]*[.]*[[:digit:]]*"
# Extract the latitude fragment, strip the two leading characters ("/ ") and
# the trailing ";", then convert to numeric.
y_coords <- as.numeric(
  str_sub(str_extract(danger_table$locn, reg_y), start = 3, end = -2)
)
# Print the extracted latitudes.
y_coords
## [1] 30.84167 18.28300 36.23300 32.51806 33.51139 36.33417 32.82500
## [8] 32.63833 32.80528 35.45667 -8.11111 31.70444 -19.58361 11.41700
## [15] 34.78167 34.83194 -11.68306 25.31700 9.55389 4.00000 35.58806
## [22] 31.52417 39.05000 48.20000 14.20000 -20.20833 -2.50000 3.05222
## [29] 53.40667 9.00000 34.39667 42.66111 7.60000 6.83972 13.00000
## [36] 2.00000 31.77667 15.35556 30.13333 13.90639 31.71972 15.92694
## [43] -14.46700 15.74444 24.83300 -2.00000 34.20000 -9.00000 34.55417
## [50] 16.77333 16.28972 0.32917 -2.50000 0.91700
# Store the latitudes as a new column.
danger_table[["y_coords"]] <- y_coords
# Extract the longitude fragment, strip the two leading characters ("; ")
# and convert to numeric.
x_coords <- as.numeric(
  str_sub(str_extract(danger_table$locn, reg_x), start = 3, end = -1)
)
# Print the extracted longitudes.
x_coords
## [1] 29.66389 8.00000 37.16700 36.48167 36.30639 36.84417 21.85833
## [8] 14.29306 12.48500 43.26250 -79.07500 35.20750 -65.75306 -69.66700
## [15] 36.26306 67.82667 160.18306 -80.93300 -79.65583 29.25000 42.71833
## [22] 35.10889 66.83333 16.36700 43.31700 -69.79444 28.75000 36.50361
## [29] -2.84444 21.50000 64.51611 20.26556 -8.38300 158.33083 -12.66700
## [36] 28.50000 35.23417 44.20806 9.50000 -4.55500 35.13056 48.62667
## [43] 49.70000 -84.67500 10.33300 21.00000 43.86700 37.40000 38.26667
## [50] -2.99944 -0.04444 32.55333 101.50000 29.16700
# Store the longitudes as a new column and list the resulting column names.
danger_table[["x_coords"]] <- x_coords
colnames(danger_table)
## [1] "name" "locn" "crit" "yins" "yend" "y_coords"
## [7] "x_coords"
# The raw location text is no longer needed once the coordinates are parsed.
danger_table[["locn"]] <- NULL
The data in the table is now completely clean:
# Number of rows and columns of the cleaned table.
c(nrow(danger_table), ncol(danger_table))
## [1] 54 6
# First six rows of the cleaned table.
head(danger_table, n = 6L)
## name crit yins yend y_coords x_coords
## 2 Abu Mena cult 1979 2001 30.84167 29.66389
## 3 Air and Ténéré Natural Reserves nat 1991 1992 18.28300 8.00000
## 4 Ancient City of Aleppo cult 1986 2013 36.23300 37.16700
## 5 Ancient City of Bosra cult 1980 2013 32.51806 36.48167
## 6 Ancient City of Damascus cult 1979 2013 33.51139 36.30639
## 7 Ancient Villages of Northern Syria cult 2011 2013 36.33417 36.84417
# Remove outer and inner plot margins before drawing the map.
par(oma = rep(0, 4), mar = rep(0, 4))
# Plotting symbol per site: 19 for natural sites, 2 for cultural sites.
pch <- ifelse(danger_table$crit == "nat", 19, 2)
# Draw the world outline, then overlay one point per endangered site.
map(database = "world", col = "darkgrey", lwd = 0.5, mar = rep(0.1, 4))
title(main = "Location of UNESCO World Heritage Sites \n")
points(x = danger_table$x_coords, y = danger_table$y_coords, pch = pch)
box()
# Count of sites per category.
table(danger_table$crit)
##
## cult nat
## 38 16
# Optional: send the plot to a PDF device instead of the screen.
# pdf(file = "heritage-hist1.pdf", height = 3.3, width = 7, family = "URWTimes")
par(oma = rep(0, 4), mar = c(4, 4, 1, 0.5))
# Frequency histogram of the year each site was listed as endangered.
hist(
  x = danger_table$yend,
  freq = TRUE,
  xlab = "Year when site was put on the list of endangered sites",
  main = ""
)
box()
# Years between inscription as a World Heritage Site and the (most recent)
# listing as endangered.
duration <- danger_table$yend - danger_table$yins
# Optional: send the plot to a PDF device instead of the screen.
# pdf(file = "heritage-hist2.pdf", height = 3.3, width = 7, family = "URWTimes")
par(oma = c(0, 0, 0, 0))
par(mar = c(4, 4, 1, .5))
# Axis label fixed: it previously read "Number of ears ...".
hist(
  duration,
  freq = TRUE,
  xlab = "Number of years to become an endangered site",
  main = ""
)
box()