Compiled on 2020-09-28 by Maroun Bou Sleiman.

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(sf))
suppressPackageStartupMessages(library(leaflet))
suppressPackageStartupMessages(library(rhdx))
suppressPackageStartupMessages(library(parallel))
# suppressPackageStartupMessages(library(ggmap)) # for later
# suppressPackageStartupMessages(library(osmdata)) # for later

## some helper functions
is_arabic <- function(x){grepl("[\u0600-\u06ff]|[\u0750-\u077f]|[\ufb50-\ufc3f]|[\ufe70-\ufefc]", x, perl = FALSE)}

1 Introduction

One major issue we’re dealing with is the alternate names (and different languages) that Lebanese places have. Different databases and data sources use different names, which renders data integration very time-consuming. Here, I will try to simplify this through integration of different existing databases.

2 HDX maps + geonames integration

2.1 Obtaining the data

First I will import the HDX map with the administrative divisions of Lebanon (levels 1-3). The level 3 map, which is at the town/village level is of most interest here. The shapefile has the following information:

leb_administrative_hdx <- rhdx::search_datasets("Lebanon - Subnational Administrative Boundaries") %>% nth(1) %>% get_resource(2) %>% download_resource(folder = "../data/")
unzip(leb_administrative_hdx, exdir = "../data/hdx")

leb_administrative_hdx_3 <- sf::read_sf("../data/hdx/lbn_admbnda_adm3_cdr_20200810.shp")

# remove conflict and litige tiles
leb_administrative_hdx_3 <- leb_administrative_hdx_3 %>% filter(admin3Name != "Conflict") %>% filter(admin3Name != "Litige")

DT::datatable(leb_administrative_hdx_3 %>% as_tibble()  %>% select(-c("geometry")) %>% head(n = 20), caption = "First 20 records of the HDX/OCHA Level 3 Administrative Division Map")

Now let’s get the geonames data, which has more extensive alternate names. Specifically, we need two files about Lebanon:

  • The main geoname file: link
  • A file with alternate names only. This is redundant with the main geoname file, but we’ll download it anyway. link

The geonames are points with names basically. Note: I will add the gn_ prefix to the geonames data columns. Here is how the geonames data look like:

## geonames data dumps for lebanon
dir.create("../data/geonames")

### LB.zip file
download.file("https://download.geonames.org/export/dump/LB.zip", destfile = "../data/geonames/LB.zip")
unzip("../data/geonames/LB.zip", exdir = "../data/geonames/LB")
geonames_lb <- read.table("../data/geonames/LB/LB.txt", sep = "\t", header = F, fill = T, quote = "") %>% as_tibble()

geonames_lb_metadata_rl <- readLines("../data/geonames/LB/readme.txt")
whichLine <- grepl("^The main \\'geoname\\'", geonames_lb_metadata_rl) %>% which
geonames_lb_metadata <- geonames_lb_metadata_rl[(whichLine + 2):(whichLine+2+18)]
geonames_lb_metadata <- do.call(rbind, lapply(strsplit(geonames_lb_metadata, "\\:"), function(x){
  x <- gsub("( )+$", "", x) # remove trailing spaces
  x <- gsub("^( )+", "", x) # remove leading spaces
  l <- length(x)
  c(x[1], paste(x[2:l], collapse = ":"))
})) %>% as_tibble()
colnames(geonames_lb_metadata) <- c("feature", "description")
geonames_lb_metadata$feature <- paste0("gn_", geonames_lb_metadata$feature)

colnames(geonames_lb) <- geonames_lb_metadata$feature
geonames_lb <- geonames_lb %>% mutate(latitude = as.numeric(gn_latitude), longitude = as.numeric(gn_longitude))

geonames_lb_sf <- sf::st_as_sf(geonames_lb, coords=c("gn_longitude", "gn_latitude"))
DT::datatable(geonames_lb_sf %>% as_tibble() %>%  select(-c("geometry")) %>% head(n = 20), caption = "First 20 records in the geonames database")

Let’s filter geonames by class first: see link. I will keep the following:

  • A country, state, region,…:
  • L parks,area, …
  • P city, village,…
gn_keep_feature_classes <- c("A","L", "P")
geonames_lb_sf <- geonames_lb_sf %>% filter(`gn_feature class` %in% gn_keep_feature_classes)

Out of the 24597 records, 5535 records are kept for downstream matching.

2.2 Intersecting Admin3 level data with geonames

Now let’s intersect both sources! Here’s a sample of what we get:

st_crs(geonames_lb_sf) <- st_crs(leb_administrative_hdx_3) # a hack, not sure about it

hdx_geonames_intersect <- sf::st_intersection(leb_administrative_hdx_3, geonames_lb_sf)
DT::datatable(hdx_geonames_intersect %>% as_tibble() %>%  select(-c("geometry")) %>% head(n=20) ,  caption = "First 20 records of intersection table, with all columns")

The geonames data are more granular, and they (almost?) always have a place that intersects an HDX polygon with the same or very similar name. Let’s find that geonames point(s) that has the most similar name. The new isMin column encodes this information. A value of TRUE means that it is the closest in terms of edit distance to the HDX admin3Name.

isMin_func <- function(dist, n1, n2){
  first_condition <- dist == min(dist) # record has the lowest distance
  second_condition <- dist < min(nchar(c(n1,n2))) # the distance is less than the shortest string in the pair
  third_condition <- dist <=10 # at most 10 differences
  first_condition & second_condition & third_condition
}

hdx_geonames_intersect <- hdx_geonames_intersect  %>% rowwise() %>% mutate(edit_dist = adist(admin3Name, gn_name)) %>% 
  group_by(admin3Name) %>% 
  mutate(admin3Name_geoname_minimized = admin3Name[which.min(edit_dist)],
         isMin = isMin_func(edit_dist, admin3Name, gn_name)  )  %>% ungroup()

DT::datatable(head(hdx_geonames_intersect, n = 100) %>% select("admin3Name", "gn_name", "isMin"), caption = "First 100 records of intersection table, with selected columns")

2.3 Adding new alternate names to geonames

2.3.1 The HDX new names

Now let’s create a new alternate names column start appending to the existing geobanes alternate names. However, first check if the name already exists.

First let’s do the admin3Name column

#append to alternate names function

append_alternate <- function(x=admin3Name,y=alternatenames,z=isMin) {
  if(is.na(y) & !is.na(x)){return(x)}
  if(is.na(x)){return(y)}
  
  an <- strsplit(y, ",")[[1]]
  if((!tolower(x) %in% tolower(an) ) & z){
    paste(c(an,x), collapse = ",")
  }else{
    paste(an, collapse = ",")
  }
}
#append alternate "admin3Name"
hdx_geonames_intersect <- hdx_geonames_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=admin3Name,y=gn_alternatenames,z=isMin))

First let’s do the admin3Na_1 Arabic column (from HDX)

#append alternate "admin3Na_1"
hdx_geonames_intersect <- hdx_geonames_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=admin3Na_1,y=alternatenames_appended,z=isMin))

2.3.2 append arabic transliteration (Rich Nielson system)

Let’s use the arabicStemR from Rich Nielson to transliterate the admin3Na_1 and add this version.

suppressPackageStartupMessages(library(arabicStemR))
hdx_geonames_intersect <- hdx_geonames_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=arabicStemR::transliterate(admin3Na_1),y=alternatenames_appended,z=isMin))
hdx_geonames_intersect %>% select(c("admin3Name", "gn_name", "admin3Name_geoname_minimized", "isMin", "alternatenames_appended")) %>% filter( isMin) %>% head(n=100) %>%
  DT::datatable(caption = "Sample of 100 records that where the admin3Name is a likely source of a new alternate name (isMin = TRUE)")

2.3.3 append transliteration using the arabic transliterator (ALA-LC standard)

Great Arabic Transliteration tool that can be found here. I have modified the DOCKERFILE to download and install it. I also added pandas to the requirements.txt file. Then, I prepared a python script (/src/transliterate.py) that accepts a csv file with one column (Arabic) and outputs csv files with two columns (arabic, transliteration result). I parallelized this step to make it faster, according to the number of cores (parallel package). Here is a sample of the result after running the script on 5499 Arabic names:

ncores <- detectCores()
chunks <- split(hdx_geonames_intersect$admin3Na_1, cut_number(1:length(hdx_geonames_intersect$admin3Na_1), ncores))

transliterated <- mclapply(1:length(chunks), function(x){
  tmpfile_in <- paste0("../data/tmp_totransliterate.",x,".csv")
  write.table(chunks[[x]], file = tmpfile_in, row.names = F, quote = F, col.names = T, sep = ",")
  system(paste0 ("python3 ../src/transliterate.py ", tmpfile_in))
  out <- read.csv(paste0(tmpfile_in, ".transliterated"))
  out
}, mc.cores = ncores)
transliterated <- do.call(rbind, transliterated)

hdx_geonames_intersect$admin3Na_1_ala_lc <- transliterated$transliterations

DT::datatable(head(transliterated, n=100), caption = "First 100 ALA-LC transliteration results")

Append transliterations:

# append alternate ALA_LC transliterations
hdx_geonames_intersect <- hdx_geonames_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=admin3Na_1_ala_lc,y=alternatenames_appended,z=isMin))
hdx_geonames_intersect %>% select(c("admin3Name", "gn_name", "isMin", "edit_dist", "alternatenames_appended")) %>% 
  filter(isMin) %>% 
  head(n = 100) %>% 
  DT::datatable(caption = "Sample of 100 records that where the admin3Name is a likely source of a new alternate name (isMin = TRUE)")

2.4 HDX settlement dataset integration

Now let’s see the HDX settlements points relate the the administrative level 3 data. Note: I will add the set_ prefix to the settlement data columns.

leb_settlements_file <- rhdx::search_datasets("Lebanon - Settlements (villages, towns, cities)") %>% nth(1) %>% get_resource(1) %>% download_resource(folder = "../data/")
unzip(leb_settlements_file, exdir = "../data/hdx")

leb_settlements <- sf::read_sf("../data/hdx/Lebanon_Villages.shp")
colnames(leb_settlements)[1:(ncol(leb_settlements)-1)] <- paste0("set_", colnames(leb_settlements)[1:(ncol(leb_settlements)-1)])
DT::datatable(head(leb_settlements, n = 20), caption = "First 20 records in the settlements file")

How well does this agree/overlap with the UN admin level 3 file?

  • In terms of size,: there are 2713 settlements and 1545 entities in the UN admin level 3 file.
  • In terms of intersections of names, 520
  • In terms of intersections of Pcodes, 167

Let’s do spatial intersection now

settlements_hdx_intersect <- sf::st_intersection(leb_settlements, leb_administrative_hdx_3)

settlements_hdx_intersect %>% select(c("set_Pcode", "set_Nam_Eng", "set_Nam_Ar", "admin3Name", "admin3Name", "admin3Na_1")) %>% head(n = 20) %>% as_tibble() %>%  select(-c("geometry")) %>% DT::datatable(caption = "First 20 records of the intersection between the HDX admin 3 map and the settlements map")

2698 features can be intersected spatially. English names do not seem to be harmonized here, need to identify the likely set_Nam_Eng that corresponds to the best match. In this case, matching of Arabic names, which are less ambiguous, may do the job. Also, all the admin3Na_1 names are included in the set_Nam_Ar, and there is a 1 to 1 correspondence between the two:

  • There are 1545 of the leb_administrative_hdx_3$admin3Na_1 in leb_administrative_hdx_3$set_Nam_Ar
  • There are at most 1 occurences of each of the leb_administrative_hdx_3$admin3Na_1 in leb_administrative_hdx_3$set_Nam_Ar.

Since there are multple settlements per admin3 entity, I created a new column called isPrimary to flag the settlement that should have the same english name as its admin3 counterpart.

settlements_hdx_intersect$isPrimary <- settlements_hdx_intersect$set_Nam_Ar == settlements_hdx_intersect$admin1Na_1

Now let’s relate the settlements to the geonames using geographic proximity and overlap with the admin3 as criteria. The end result is adding a gn_geonameid to each settlement. Initially I wanted to do this distance calculation within admin3 levels so as to avoid making links that cross the admin3 borders. However, it seems that some settlements are assigned to nearby administrative regions. My solution is to expand the search to include the nearby admin3 positions.

all_admin3Name <- sort(unique(settlements_hdx_intersect$admin3Name))
settlements_hdx_intersect_geonameid <- mclapply(all_admin3Name, function(x){
  
  settlements_hdx_intersect_sub <- settlements_hdx_intersect %>% filter(admin3Name == x)
  if(nrow(settlements_hdx_intersect_sub) == 0){
    return(NULL)
  }
  # expand search to neghbors
  neighbors <- leb_administrative_hdx_3$admin3Name[st_relate(leb_administrative_hdx_3 %>% filter(admin3Name == x), leb_administrative_hdx_3, pattern = "F***T****")[[1]]]
  
  hdx_geonames_sub <- hdx_geonames_intersect %>% filter(admin3Name %in% c(x, neighbors))
  
  # which_nearest <- sf::st_nearest_feature(settlements_hdx_intersect_sub, hdx_geonames_sub$geometry)
  
  # cbind(settlements_hdx_intersect_sub[,c("set_Nam_Eng")],hdx_geonames_sub[which_nearest, c("gn_name")])
  # perform a geographic distance check
  distances_to_all <- sf::st_distance(settlements_hdx_intersect_sub, hdx_geonames_sub$geometry)
  distances_to_all_percentile <- t(apply(distances_to_all, MAR=1, function(z){percent_rank(z)}))
  distances_to_all_percentile_ok <- distances_to_all_percentile < 0.05
  distances_to_all_percentile_which_ok <- lapply(1:nrow(distances_to_all_percentile_ok), function(y){which(distances_to_all_percentile_ok[y,])})
  
  distances_to_all_min <- as.list(apply(distances_to_all, MAR = 1, which.min))
  
  #perform a string distance check: is the nearest object the closest in semantic terms? If not, pick a closer one
  edit_dist_matrix <- adist(settlements_hdx_intersect_sub$set_Nam_Eng, hdx_geonames_sub$gn_name)
  edit_dist_matrix_percentile <- t(apply(edit_dist_matrix, MAR = 1, percent_rank))
  edit_dist_matrix_percentile_ok <- edit_dist_matrix_percentile < 0.05
  edit_dist_matrix_percentile_which_ok <- lapply(1:nrow(edit_dist_matrix_percentile_ok), function(y){which(edit_dist_matrix_percentile_ok[y,])}) 
  
  edit_dist_matrix_min <- as.list(apply(edit_dist_matrix, MAR = 1, which.min))
  
  which_nearest <- mapply(a=distances_to_all_percentile_which_ok, b = edit_dist_matrix_percentile_which_ok, c = distances_to_all_min, d = edit_dist_matrix_min,
         FUN = function(a,b,c,d){
           if(all(c %in% d)){
             return(c[1])
           }
           candidates_percentiles <- intersect(a,b)
           
           if(length(candidates_percentiles) == 1){return(candidates_percentiles)}
           
           # narrow down based on proximity first, then based on edit_dist
           prox_filt <- intersect(candidates_percentiles, c)
           if(length(prox_filt ) == 1){return(prox_filt)}
           edit_filt <- intersect(candidates_percentiles, d)
           if(length(edit_filt ) == 1){return(edit_filt)}
           # if everything fails, return NA
           return(NA)
           
           # if( (c%in%candidates_percentiles) | (d%in%candidates_percentiles) ){
           #   candidates_level2 <- intersect(union(c,d), candidates_percentiles)
           # }else{
           #   
           # }
         }, SIMPLIFY = F)
  which_nearest <- unlist(which_nearest)
  # which_nearest <- which_nearest[dist_mins]
  
  # settlements_hdx_intersect_sub$gn_geonameid <- hdx_geonames_sub$gn_geonameid[which_nearest]
  # settlements_hdx_intersect_sub$gn_name <- hdx_geonames_sub$gn_name[which_nearest]
  if(all(is.na(which_nearest))){
    return(NULL)
  }
  
  return(data.frame(set_Pcode = settlements_hdx_intersect_sub$set_Pcode,
                    gn_geonameid = hdx_geonames_sub$gn_geonameid[which_nearest]))
}, mc.cores = ncores)


settlements_hdx_intersect_geonameid <- do.call(rbind, settlements_hdx_intersect_geonameid)

settlements_hdx_intersect$gn_geonameid <- settlements_hdx_intersect_geonameid$gn_geonameid[match(settlements_hdx_intersect$set_Pcode, settlements_hdx_intersect_geonameid$set_Pcode, incomparables = NA )]

ljoin <- left_join(settlements_hdx_intersect, hdx_geonames_intersect %>% select(-c("geometry")), by = "gn_geonameid")

hdx_geonames_settlement_intersect <- left_join(hdx_geonames_intersect, 
                   
                   settlements_hdx_intersect %>% select("set_Nam_Eng", "set_Nam_Ar", "set_Pcode", "gn_geonameid") %>% select(-c("geometry")), by = "gn_geonameid")

hdx_geonames_settlement_intersect %>% select(c("admin3Name", "admin3Na_1", "gn_name","set_Nam_Eng","set_Nam_Ar","alternatenames_appended")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of combined table")

Perform a check of edit distance between the set_Nam_Eng and all the alternatenames_appended. Get the minimum value and decide, based on that, to remove those settlements before adding the alternate names.

# perform a check of edit distance between the set_Nam_Eng and all the alternatenames_appended
min_dist_check <- function(x,y){
  m <- min(adist(x, strsplit(y, ",")[[1]]))
  check <- m < nchar(x) 
  if(is.na(check)){return(FALSE)}else{
    return(check)
  }
}
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(min_dist_ok = min_dist_check(set_Nam_Eng, alternatenames_appended))

hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(set_Nam_Eng = ifelse(min_dist_ok, set_Nam_Eng, NA),
                                                           set_Nam_Ar = ifelse(min_dist_ok, set_Nam_Ar, NA),
                                                           set_Pcode = ifelse(min_dist_ok, set_Pcode, NA))

hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(set_Nam_Eng, alternatenames_appended, min_dist_ok))
# now add the arabic names
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(set_Nam_Ar, alternatenames_appended, min_dist_ok))
hdx_geonames_settlement_intersect %>% filter(min_dist_ok) %>% select(c("admin3Name", "admin3Na_1", "gn_name","set_Nam_Eng","set_Nam_Ar","gn_alternatenames","alternatenames_appended")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of combined table where min_dist_ok is TRUE (new settlement terms that can be appended if the name doesn't exist)")

Let’s transliterate the settlement arabic terms and add them if they don’t already exist. Rich Neilson method first, then the ALA-LC

# Rich Neilson
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=arabicStemR::transliterate(set_Nam_Ar),y=alternatenames_appended,z=min_dist_ok))
# ALA-LC
set_Nam_Ar_noNA <- hdx_geonames_settlement_intersect$set_Nam_Ar[!is.na(hdx_geonames_settlement_intersect$set_Nam_Ar)]

chunks <- split(set_Nam_Ar_noNA, cut_number(1:length(set_Nam_Ar_noNA), ncores))

transliterated <- mclapply(1:length(chunks), function(x){
  tmpfile_in <- paste0("../data/tmp_totransliterate.",x,".csv")
  write.table(chunks[[x]], file = tmpfile_in, row.names = F, quote = F, col.names = T, sep = ",")
  system(paste0 ("python3 ../src/transliterate.py ", tmpfile_in))
  out <- read.csv(paste0(tmpfile_in, ".transliterated"))
  out
}, mc.cores = ncores)
transliterated <- do.call(rbind, transliterated)

hdx_geonames_settlement_intersect$set_Nam_Ar_noNA_ala_lc <- transliterated$transliterations[match(hdx_geonames_settlement_intersect$set_Nam_Ar, transliterated$x, incomparables = NA )]
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=set_Nam_Ar_noNA_ala_lc,y=alternatenames_appended,z=min_dist_ok))

hdx_geonames_settlement_intersect %>% filter(min_dist_ok) %>% select(c("admin3Name", "admin3Na_1", "gn_name","set_Nam_Eng","set_Nam_Ar","gn_alternatenames","alternatenames_appended")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of combined table where min_dist_ok is TRUE (new settlement terms that can be appended if the name doesn't exist)")

3 Open Street Map / Geofrabrik

Let’s now add the Open Street Map data that is made available by Geofabrik. This is updated daily. I also added some metadata that is available in this pdf file.

if(!dir.exists("../data/osm/")) dir.create("../data/osm/")
download.file("https://download.geofabrik.de/asia/lebanon-latest-free.shp.zip", destfile = "../data/osm/lebanon-latest-free.shp.zip")
unzip("../data/osm/lebanon-latest-free.shp.zip", exdir = "../data/osm/")

osm_layers_data <- data.frame(filepath = list.files("../data/osm", pattern = "gis_(.)+_free_1.shp", full.names = T))
osm_layers_data$filename <- gsub("(.)+\\/", "", osm_layers_data$filepath)
osm_layers_data$layer_content <- gsub("gis_osm_", "", gsub("_free(.)+", "", osm_layers_data$filename))

#Enhance metadata based on the following pdf: http://download.geofabrik.de/osm-data-in-gis-formats-free.pdf

osm_layers_data$layer_content_description <- plyr::mapvalues(gsub("_a", "", osm_layers_data$layer_content),
                                                                 from = c("buildings",
                                                                          "landuse",
                                                                          "water",
                                                                          "waterways",
                                                                          "roads",
                                                                          "railways",
                                                                          "traffic",
                                                                          "transport",
                                                                          "places",
                                                                          "pois",
                                                                          "pofw",
                                                                          "natural"),
                                                                 to = c("Building outlines - Code: 15xx",
                                                                        "Forests, residential areas, industrial areas,... - Code: 72xx",
                                                                        "Lakes, ... - Code: 82xx",
                                                                        "Rivers, canals, streams, ... - Code: 81xx",
                                                                        "Roads, tracks, paths, ... - Code: 51xx",
                                                                        "Railway, subways, light rail, trams, ... - Code: 61xx",
                                                                        "Traffic related - Code: 52xx",
                                                                        "Parking lots, petrol (gas) stations, ... - Code: 50xx",
                                                                        "Cities, towns, suburbs, villages,... - Code: 10xx",
                                                                        "Points of interest, many categories - Code: 2xxx",
                                                                        "Places of worship such as churches, mosques, ... - Code: 3xxx",
                                                                        "Natural features - Code: 41xx"))
osm_layers_data$layer_content_code <- gsub("(.)+Code: ", "", osm_layers_data$layer_content_description)
osm_layers_data$layer_content_description <- gsub(" - Code(.)+", "", osm_layers_data$layer_content_description)
osm_layers_data$layer_is_area <- grepl("_a_", osm_layers_data$filename)
osm_layers_data <- osm_layers_data %>% as_tibble()
osm_layers_data$shp <- lapply(osm_layers_data$filepath, sf::read_sf)

osm_layers_data <- osm_layers_data %>% rowwise() %>% mutate(nfeatures = nrow(shp),
                                                                    fclasses = paste(sort(unique(shp$fclass)), collapse = ", "),
                                                                    flcasses_count = paste(paste0(names(sort(table(shp$fclass), decreasing = T) ), " (", as.numeric(sort(table(shp$fclass), decreasing = T) ),")"), collapse = ", ") )

osm_layers_data %>% select(-c("filepath", "shp", "fclasses")) %>% DT::datatable(caption = "Metadata of OSM shapefiles made available by Geofabrik.de")

The places map is of immediate interest here.

osm_layers_data %>% filter(layer_content == "places") %>% select("shp") %>% nth(1) %>% nth(1) %>% as_tibble() %>% select(-c("geometry")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of OSM places")

Let’s overlap and apply a similar pipeline for intersection. First admin level 3 layer, then with settlements of UN/OCHA

osm_places <- osm_layers_data %>% filter(layer_content == "places") %>% select("shp") %>% nth(1) %>% nth(1)
colnames(osm_places)[-length(colnames(osm_places))] <- paste0("osm_", colnames(osm_places)[-length(colnames(osm_places))])
osm_places_intersect_admin3 <- sf::st_intersection(osm_places, leb_administrative_hdx_3)

osm_places_intersect_admin3_Pcode <- mclapply(unique(osm_places_intersect_admin3$admin3Name), function(x){
  
  query_sub <- osm_places_intersect_admin3 %>% filter(admin3Name == x)
  if(nrow(query_sub) == 0){
    return(NULL)
  }
  # expand search to neghbors
  neighbors <- leb_administrative_hdx_3$admin3Name[st_relate(leb_administrative_hdx_3 %>% filter(admin3Name == x), leb_administrative_hdx_3, pattern = "F***T****")[[1]]]
  target_sub <- settlements_hdx_intersect %>% filter(admin3Name %in% c(x, neighbors))
  # perform a geographic distance check
  distances_to_all <- sf::st_distance(query_sub, target_sub$geometry)
  distances_to_all_percentile <- t(apply(distances_to_all, MAR=1, function(z){percent_rank(z)}))
  distances_to_all_percentile_ok <- distances_to_all_percentile < 0.05
  distances_to_all_percentile_which_ok <- lapply(1:nrow(distances_to_all_percentile_ok), function(y){which(distances_to_all_percentile_ok[y,])})
  distances_to_all_min <- as.list(apply(distances_to_all, MAR = 1, which.min))
  
  #perform a string distance check: is the nearest object the closest in semantic terms? If not, pick a closer one
  query_isArabic <- ifelse(is_arabic(query_sub$osm_name), "arabic", "english")
  target_cols <- c("arabic" = "set_Nam_Ar", "english" = "set_Nam_Eng")[query_isArabic]
  edit_dist_matrix <- t(mapply(a = query_sub$osm_name, b = target_cols, function(a,b){adist(a, target_sub[,b] %>% as_tibble() %>% pull(b) )}))
  edit_dist_matrix_percentile <- t(apply(edit_dist_matrix, MAR = 1, percent_rank))
  edit_dist_matrix_percentile_ok <- edit_dist_matrix_percentile < 0.05
  edit_dist_matrix_percentile_which_ok <- lapply(1:nrow(edit_dist_matrix_percentile_ok), function(y){which(edit_dist_matrix_percentile_ok[y,])}) 
  edit_dist_matrix_min <- as.list(apply(edit_dist_matrix, MAR = 1, which.min))
  
  which_nearest <- mapply(a=distances_to_all_percentile_which_ok, b = edit_dist_matrix_percentile_which_ok, c = distances_to_all_min, d = edit_dist_matrix_min,
         FUN = function(a,b,c,d){
           if(all(c %in% d)){
             return(c[1])
           }
           candidates_percentiles <- intersect(a,b)
           if(length(candidates_percentiles) == 1){return(candidates_percentiles)}
           # narrow down based on proximity first, then based on edit_dist
           prox_filt <- intersect(candidates_percentiles, c)
           if(length(prox_filt ) == 1){return(prox_filt)}
           edit_filt <- intersect(candidates_percentiles, d)
           if(length(edit_filt ) == 1){return(edit_filt)}
           # if everything fails, return NA
           return(NA)
           # if( (c%in%candidates_percentiles) | (d%in%candidates_percentiles) ){
           #   candidates_level2 <- intersect(union(c,d), candidates_percentiles)
           # }else{
           #   
           # }
         }, SIMPLIFY = F)
  which_nearest <- unlist(which_nearest)
  # which_nearest <- which_nearest[dist_mins]
  
  # settlements_hdx_intersect_sub$gn_geonameid <- hdx_geonames_sub$gn_geonameid[which_nearest]
  # settlements_hdx_intersect_sub$gn_name <- hdx_geonames_sub$gn_name[which_nearest]
  if(all(is.na(which_nearest))){
    return(NULL)
  }
  
  return(data.frame(osm_osm_id = query_sub$osm_osm_id,
                    set_Pcode = target_sub$set_Pcode[which_nearest]))
}, mc.cores = ncores)

osm_places_intersect_admin3_Pcode <- do.call(rbind, osm_places_intersect_admin3_Pcode)
osm_places_intersect_admin3$set_Pcode <- osm_places_intersect_admin3_Pcode$set_Pcode[match(osm_places_intersect_admin3$osm_osm_id, osm_places_intersect_admin3_Pcode$osm_osm_id, incomparables = NA)]

osm_places_intersect_admin3 %>% as_tibble() %>% left_join(settlements_hdx_intersect %>% as_tibble(), by = "set_Pcode") %>% select(c("osm_osm_id","set_Pcode", "osm_name", "set_Nam_Ar", "set_Nam_Eng")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of intersection between OSM places and UN Settlements by osm_id")

So far, 1384 OSM places out of 1782 are mapped to a UN settlement. How about linking them with geonames?

osm_places_intersect_admin3_geonames <- mclapply(unique(osm_places_intersect_admin3$admin3Name), function(x){
  
  query_sub <- osm_places_intersect_admin3 %>% filter(admin3Name == x)
  if(nrow(query_sub) == 0){
    return(NULL)
  }
  # expand search to neghbors
  neighbors <- leb_administrative_hdx_3$admin3Name[st_relate(leb_administrative_hdx_3 %>% filter(admin3Name == x), leb_administrative_hdx_3, pattern = "F***T****")[[1]]]
  target_sub <- hdx_geonames_intersect %>% filter(admin3Name %in% c(x, neighbors))
  # perform a geographic distance check
  distances_to_all <- sf::st_distance(query_sub, target_sub$geometry)
  distances_to_all_percentile <- t(apply(distances_to_all, MAR=1, function(z){percent_rank(z)}))
  distances_to_all_percentile_ok <- distances_to_all_percentile < 0.05
  distances_to_all_percentile_which_ok <- lapply(1:nrow(distances_to_all_percentile_ok), function(y){which(distances_to_all_percentile_ok[y,])})
  distances_to_all_min <- as.list(apply(distances_to_all, MAR = 1, which.min))
  
  #perform a string distance check: is the nearest object the closest in semantic terms? If not, pick a closer one
  query_isArabic <- ifelse(is_arabic(query_sub$osm_name), "arabic", "english")
  
  # take the gn_name as the english name and the first occurence of an arabic name in alternate names as a second name
  target_sub$arabic <- strsplit(target_sub$alternatenames_appended, ",") %>% sapply(function(y){
    ia <- is_arabic(y)
    if(length(y) == 0 ){
      return(NA)
      }else if(!any(ia)){
        return(NA)
      }else{
        return(y[min(which(ia))])
      }
  })
  target_sub$english <- target_sub$gn_name
  
  
  target_cols <- c("arabic" = "arabic", "english" = "english")[query_isArabic]
  edit_dist_matrix <- t(mapply(a = query_sub$osm_name, b = target_cols, function(a,b){adist(a, target_sub[,b] %>% as_tibble() %>% pull(b) )}))
  edit_dist_matrix_percentile <- t(apply(edit_dist_matrix, MAR = 1, percent_rank))
  edit_dist_matrix_percentile_ok <- edit_dist_matrix_percentile < 0.05
  edit_dist_matrix_percentile_which_ok <- lapply(1:nrow(edit_dist_matrix_percentile_ok), function(y){which(edit_dist_matrix_percentile_ok[y,])}) 
  edit_dist_matrix_min <- as.list(apply(edit_dist_matrix, MAR = 1, which.min))
  
  which_nearest <- mapply(a=distances_to_all_percentile_which_ok, b = edit_dist_matrix_percentile_which_ok, c = distances_to_all_min, d = edit_dist_matrix_min,
         FUN = function(a,b,c,d){
           if(all(c %in% d)){
             return(c[1])
           }
           candidates_percentiles <- intersect(a,b)
           if(length(candidates_percentiles) == 0){return(NA)} ##### think of adding everywhere
           if(length(candidates_percentiles) == 1){return(candidates_percentiles)}
           # narrow down based on proximity first, then based on edit_dist
           prox_filt <- intersect(candidates_percentiles, c)
           if(length(prox_filt ) == 1){return(prox_filt)}
           edit_filt <- intersect(candidates_percentiles, d)
           if(length(edit_filt ) == 1){return(edit_filt)}
           # if everything fails, return NA
           return(NA)
         }, SIMPLIFY = F)
  which_nearest <- unlist(which_nearest)
  # which_nearest <- which_nearest[dist_mins]
  
  # settlements_hdx_intersect_sub$gn_geonameid <- hdx_geonames_sub$gn_geonameid[which_nearest]
  # settlements_hdx_intersect_sub$gn_name <- hdx_geonames_sub$gn_name[which_nearest]
  if(all(is.na(which_nearest))){
    return(NULL)
  }
  
  return(data.frame(osm_osm_id = query_sub$osm_osm_id,
                    gn_geonameid = target_sub$gn_geonameid[which_nearest]))
}, mc.cores = ncores)

osm_places_intersect_admin3_geonames <- do.call(rbind, osm_places_intersect_admin3_geonames)
osm_places_intersect_admin3_geonames$gn_geonameid <- as.integer(osm_places_intersect_admin3_geonames$gn_geonameid)

osm_places_intersect_admin3$gn_geonameid <- osm_places_intersect_admin3_geonames$gn_geonameid[match(osm_places_intersect_admin3$osm_osm_id, osm_places_intersect_admin3_geonames$osm_osm_id, incomparables = NA)]
osm_places_intersect_admin3 %>% as_tibble() %>% left_join(hdx_geonames_intersect %>% as_tibble(), by = "gn_geonameid") %>% select(c("osm_osm_id","set_Pcode", "osm_name", "gn_geonameid", "gn_name", "alternatenames_appended")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of intersection between OSM places and geonames by geonameid")

Finally add OSM names to alternate names list if they are not already there

hdx_geonames_settlement_intersect$osm_osm_id_by_geonameid <- osm_places_intersect_admin3$osm_osm_id[match(hdx_geonames_settlement_intersect$gn_geonameid, osm_places_intersect_admin3$gn_geonameid, incomparables = NA)]
hdx_geonames_settlement_intersect$osm_osm_id_by_set_Pcode <- osm_places_intersect_admin3$osm_osm_id[match(hdx_geonames_settlement_intersect$set_Pcode, osm_places_intersect_admin3$set_Pcode, incomparables = NA)]

hdx_geonames_settlement_intersect$osm_name_by_geonameid <- osm_places_intersect_admin3$osm_name[match(hdx_geonames_settlement_intersect$gn_geonameid, osm_places_intersect_admin3$gn_geonameid, incomparables = NA)]
hdx_geonames_settlement_intersect$osm_name_by_set_Pcode <- osm_places_intersect_admin3$osm_name[match(hdx_geonames_settlement_intersect$set_Pcode, osm_places_intersect_admin3$set_Pcode,incomparables = NA)]
hdx_geonames_settlement_intersect$osm_name_toappend <- hdx_geonames_settlement_intersect$osm_name_by_geonameid == hdx_geonames_settlement_intersect$osm_name_by_set_Pcode # only add the name when the two osm names agree
hdx_geonames_settlement_intersect$osm_name_toappend[is.na(hdx_geonames_settlement_intersect$osm_name_toappend )] <- F
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=osm_name_by_set_Pcode,y=alternatenames_appended,z=osm_name_toappend))

4 TODO: add UN LOCODE

5 Output and Stats

Let’s compare the starting points with the endpoint.

Some results:

  • geonames data: Started with 5535 entries.
  • HDX/OCHA levels 3 administrative data: Started with 1545 entries, of which 1446 are mapped to at least one geonames object.
  • HDX/OCHA settlement data: Started with 2713 settlements of which 2054 are mapped to at least one geonames object, 1353 assigned to an OSM place. 500 features remain to be assigned and are shown in red below.
  • OSM data: 1163 OSM places out of 1782 are mapped to a geonames id. 1011 OSM places are linked to both UN settlements and geonames. 246 are not assigned to either geonames or UN Pcode and are shown in orange below.
  • Alternate names database: 2121 records have at least one new alternate name.

Here’s how the result looks like:

In red dots are all settlements that are still not “integrated” using this pipeline. Orange dots are non-integrated OSM places. Circles with blue fill are UN settlements or OSM places that have been successfully linked to geonames. Circles with green fill are points where OSM, Settlements and geonames are linked.

# forplot <- sf::st_coordinates(hdx_geonames_settlement_intersect$geometry.x) %>% as_tibble() %>% rename(lat = Y, lon = X) %>% cbind(hdx_geonames_settlement_intersect)
forplot <- hdx_geonames_settlement_intersect
forplot <- forplot %>% rowwise() %>% mutate(lab =paste(
  c(paste0("admin3Name: ",admin3Name),
    paste0("gn_name: ",gn_name),
    paste0("gn_geonameid: ", gn_geonameid),
    paste0("Settlement english name:", set_Nam_Eng),
    paste0("Settlement Pcode:", set_Pcode),
    paste0("OSM Name(s):", paste(unique(c(osm_name_by_geonameid, osm_name_by_set_Pcode)), collapse = ",") ),
    paste0("OSM IDs(s):", paste(unique(c(osm_osm_id_by_geonameid, osm_osm_id_by_set_Pcode)), collapse = ",") ),
    paste0("Alternate Names (old): ", gn_alternatenames),
    paste0("Added names: <b>", gsub(gn_alternatenames, "", alternatenames_appended), "</b>") ),
  collapse = "<br>"))

forplot$lab <- lapply(forplot$lab, htmltools::HTML)

# admin2Col <- colorFactor(topo.colors(length(unique(forplot$admin2Name))), unique(forplot$admin2Name))

forplot$isSettlement <- !is.na(forplot$set_Nam_Eng)
forplot$isOSM <- !is.na(forplot$osm_osm_id_by_geonameid) | !is.na(forplot$osm_osm_id_by_set_Pcode)

forplot$point_fill <- c("#FFFFFF00", "blue", "green")[forplot$isSettlement + forplot$isOSM + 1]

settlements_not_assigned <- leb_settlements %>% filter(!leb_settlements$set_Pcode %in% hdx_geonames_settlement_intersect$set_Pcode, !leb_settlements$set_Pcode %in% osm_places_intersect_admin3$set_Pcode)
osm_not_assigned <- osm_places_intersect_admin3 %>% filter(is.na(set_Pcode),is.na(gn_geonameid)) 

forplot %>% leaflet() %>% addTiles() %>%
  addCircleMarkers(label = ~lab, fillColor= ~point_fill, fillOpacity = 0.5, clusterOptions = markerClusterOptions(),
    clusterId = "whatever") %>%
  addCircles(data = settlements_not_assigned, label = ~set_Nam_Eng,color = "red") %>%
  addCircles(data = osm_not_assigned, label = ~osm_name,color = "orange")

Write out the final file in:

  • csv format: data/lebanon_geo_names_fill.csv (also a zipped version)
hdx_geonames_settlement_intersect %>% as_tibble() %>% readr::write_csv(path = "../data/lebanon_geo_names_full.csv")
zip(zipfile = "../data/lebanon_geo_names_full.zip", files = "../data/lebanon_geo_names_full.csv")