Compiled on 2020-09-28 by Maroun Bou Sleiman.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(sf))
suppressPackageStartupMessages(library(leaflet))
suppressPackageStartupMessages(library(rhdx))
suppressPackageStartupMessages(library(parallel))
# suppressPackageStartupMessages(library(ggmap)) # for later
# suppressPackageStartupMessages(library(osmdata)) # for later
## some helper functions
is_arabic <- function(x){grepl("[\u0600-\u06ff]|[\u0750-\u077f]|[\ufb50-\ufc3f]|[\ufe70-\ufefc]", x, perl = FALSE)}
One major issue we’re dealing with is the alternate names (and different languages) that Lebanese places have. Different databases and data sources use different names, which renders data integration very time-consuming. Here, I will try to simplify this through integration of different existing databases.
First I will import the HDX map with the administrative divisions of Lebanon (levels 1-3). The level 3 map, which is at the town/village level is of most interest here. The shapefile has the following information:
leb_administrative_hdx <- rhdx::search_datasets("Lebanon - Subnational Administrative Boundaries") %>% nth(1) %>% get_resource(2) %>% download_resource(folder = "../data/")
unzip(leb_administrative_hdx, exdir = "../data/hdx")
leb_administrative_hdx_3 <- sf::read_sf("../data/hdx/lbn_admbnda_adm3_cdr_20200810.shp")
# remove conflict and litige tiles
leb_administrative_hdx_3 <- leb_administrative_hdx_3 %>% filter(admin3Name != "Conflict") %>% filter(admin3Name != "Litige")
DT::datatable(leb_administrative_hdx_3 %>% as_tibble() %>% select(-c("geometry")) %>% head(n = 20), caption = "First 20 records of the HDX/OCHA Level 3 Administrative Division Map")
Now let’s get the geonames data, which has more extensive alternate names. Specifically, we need two files about Lebanon:
The geonames are points with names basically. Note: I will add the gn_ prefix to the geonames data columns. Here is how the geonames data look like:
## geonames data dumps for lebanon
dir.create("../data/geonames")
### LB.zip file
download.file("https://download.geonames.org/export/dump/LB.zip", destfile = "../data/geonames/LB.zip")
unzip("../data/geonames/LB.zip", exdir = "../data/geonames/LB")
geonames_lb <- read.table("../data/geonames/LB/LB.txt", sep = "\t", header = F, fill = T, quote = "") %>% as_tibble()
geonames_lb_metadata_rl <- readLines("../data/geonames/LB/readme.txt")
whichLine <- grepl("^The main \\'geoname\\'", geonames_lb_metadata_rl) %>% which
geonames_lb_metadata <- geonames_lb_metadata_rl[(whichLine + 2):(whichLine+2+18)]
geonames_lb_metadata <- do.call(rbind, lapply(strsplit(geonames_lb_metadata, "\\:"), function(x){
x <- gsub("( )+$", "", x) # remove trailing spaces
x <- gsub("^( )+", "", x) # remove leading spaces
l <- length(x)
c(x[1], paste(x[2:l], collapse = ":"))
})) %>% as_tibble()
colnames(geonames_lb_metadata) <- c("feature", "description")
geonames_lb_metadata$feature <- paste0("gn_", geonames_lb_metadata$feature)
colnames(geonames_lb) <- geonames_lb_metadata$feature
geonames_lb <- geonames_lb %>% mutate(latitude = as.numeric(gn_latitude), longitude = as.numeric(gn_longitude))
geonames_lb_sf <- sf::st_as_sf(geonames_lb, coords=c("gn_longitude", "gn_latitude"))
DT::datatable(geonames_lb_sf %>% as_tibble() %>% select(-c("geometry")) %>% head(n = 20), caption = "First 20 records in the geonames database")
Let’s filter geonames by class first: see link. I will keep the following:
gn_keep_feature_classes <- c("A","L", "P")
geonames_lb_sf <- geonames_lb_sf %>% filter(`gn_feature class` %in% gn_keep_feature_classes)
Out of the 24597 records, 5535 records are kept for downstream matching.
Now let’s intersect both sources! Here’s a sample of what we get:
st_crs(geonames_lb_sf) <- st_crs(leb_administrative_hdx_3) # a hack, not sure about it
hdx_geonames_intersect <- sf::st_intersection(leb_administrative_hdx_3, geonames_lb_sf)
DT::datatable(hdx_geonames_intersect %>% as_tibble() %>% select(-c("geometry")) %>% head(n=20) , caption = "First 20 records of intersection table, with all columns")
The geonames data are more granular, and they (almost?) always have a place that intersects an HDX polygon with the same or very similar name. Let’s find that geonames point(s) that has the most similar name. The new isMin column encodes this information. A value of TRUE means that it is the closest in terms of edit distance to the HDX admin3Name.
isMin_func <- function(dist, n1, n2){
first_condition <- dist == min(dist) # record has the lowest distance
second_condition <- dist < min(nchar(c(n1,n2))) # the distance is less than the shortest string in the pair
third_condition <- dist <=10 # at most 10 differences
first_condition & second_condition & third_condition
}
hdx_geonames_intersect <- hdx_geonames_intersect %>% rowwise() %>% mutate(edit_dist = adist(admin3Name, gn_name)) %>%
group_by(admin3Name) %>%
mutate(admin3Name_geoname_minimized = admin3Name[which.min(edit_dist)],
isMin = isMin_func(edit_dist, admin3Name, gn_name) ) %>% ungroup()
DT::datatable(head(hdx_geonames_intersect, n = 100) %>% select("admin3Name", "gn_name", "isMin"), caption = "First 100 records of intersection table, with selected columns")
Now let’s create a new alternate names column start appending to the existing geobanes alternate names. However, first check if the name already exists.
First let’s do the admin3Name column
#append to alternate names function
append_alternate <- function(x=admin3Name,y=alternatenames,z=isMin) {
if(is.na(y) & !is.na(x)){return(x)}
if(is.na(x)){return(y)}
an <- strsplit(y, ",")[[1]]
if((!tolower(x) %in% tolower(an) ) & z){
paste(c(an,x), collapse = ",")
}else{
paste(an, collapse = ",")
}
}
#append alternate "admin3Name"
hdx_geonames_intersect <- hdx_geonames_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=admin3Name,y=gn_alternatenames,z=isMin))
First let’s do the admin3Na_1 Arabic column (from HDX)
#append alternate "admin3Na_1"
hdx_geonames_intersect <- hdx_geonames_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=admin3Na_1,y=alternatenames_appended,z=isMin))
Let’s use the arabicStemR from Rich Nielson to transliterate the admin3Na_1 and add this version.
suppressPackageStartupMessages(library(arabicStemR))
hdx_geonames_intersect <- hdx_geonames_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=arabicStemR::transliterate(admin3Na_1),y=alternatenames_appended,z=isMin))
hdx_geonames_intersect %>% select(c("admin3Name", "gn_name", "admin3Name_geoname_minimized", "isMin", "alternatenames_appended")) %>% filter( isMin) %>% head(n=100) %>%
DT::datatable(caption = "Sample of 100 records that where the admin3Name is a likely source of a new alternate name (isMin = TRUE)")
Great Arabic Transliteration tool that can be found here. I have modified the DOCKERFILE to download and install it. I also added pandas to the requirements.txt file. Then, I prepared a python script (/src/transliterate.py) that accepts a csv file with one column (Arabic) and outputs csv files with two columns (arabic, transliteration result). I parallelized this step to make it faster, according to the number of cores (parallel package). Here is a sample of the result after running the script on 5499 Arabic names:
ncores <- detectCores()
chunks <- split(hdx_geonames_intersect$admin3Na_1, cut_number(1:length(hdx_geonames_intersect$admin3Na_1), ncores))
transliterated <- mclapply(1:length(chunks), function(x){
tmpfile_in <- paste0("../data/tmp_totransliterate.",x,".csv")
write.table(chunks[[x]], file = tmpfile_in, row.names = F, quote = F, col.names = T, sep = ",")
system(paste0 ("python3 ../src/transliterate.py ", tmpfile_in))
out <- read.csv(paste0(tmpfile_in, ".transliterated"))
out
}, mc.cores = ncores)
transliterated <- do.call(rbind, transliterated)
hdx_geonames_intersect$admin3Na_1_ala_lc <- transliterated$transliterations
DT::datatable(head(transliterated, n=100), caption = "First 100 ALA-LC transliteration results")
Append transliterations:
# append alternate ALA_LC transliterations
hdx_geonames_intersect <- hdx_geonames_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=admin3Na_1_ala_lc,y=alternatenames_appended,z=isMin))
hdx_geonames_intersect %>% select(c("admin3Name", "gn_name", "isMin", "edit_dist", "alternatenames_appended")) %>%
filter(isMin) %>%
head(n = 100) %>%
DT::datatable(caption = "Sample of 100 records that where the admin3Name is a likely source of a new alternate name (isMin = TRUE)")
Now let’s see the HDX settlements points relate the the administrative level 3 data. Note: I will add the set_ prefix to the settlement data columns.
leb_settlements_file <- rhdx::search_datasets("Lebanon - Settlements (villages, towns, cities)") %>% nth(1) %>% get_resource(1) %>% download_resource(folder = "../data/")
unzip(leb_settlements_file, exdir = "../data/hdx")
leb_settlements <- sf::read_sf("../data/hdx/Lebanon_Villages.shp")
colnames(leb_settlements)[1:(ncol(leb_settlements)-1)] <- paste0("set_", colnames(leb_settlements)[1:(ncol(leb_settlements)-1)])
DT::datatable(head(leb_settlements, n = 20), caption = "First 20 records in the settlements file")
How well does this agree/overlap with the UN admin level 3 file?
Let’s do spatial intersection now
settlements_hdx_intersect <- sf::st_intersection(leb_settlements, leb_administrative_hdx_3)
settlements_hdx_intersect %>% select(c("set_Pcode", "set_Nam_Eng", "set_Nam_Ar", "admin3Name", "admin3Name", "admin3Na_1")) %>% head(n = 20) %>% as_tibble() %>% select(-c("geometry")) %>% DT::datatable(caption = "First 20 records of the intersection between the HDX admin 3 map and the settlements map")
2698 features can be intersected spatially. English names do not seem to be harmonized here, need to identify the likely set_Nam_Eng that corresponds to the best match. In this case, matching of Arabic names, which are less ambiguous, may do the job. Also, all the admin3Na_1 names are included in the set_Nam_Ar, and there is a 1 to 1 correspondence between the two:
leb_administrative_hdx_3$admin3Na_1 in leb_administrative_hdx_3$set_Nam_Arleb_administrative_hdx_3$admin3Na_1 in leb_administrative_hdx_3$set_Nam_Ar.Since there are multple settlements per admin3 entity, I created a new column called isPrimary to flag the settlement that should have the same english name as its admin3 counterpart.
settlements_hdx_intersect$isPrimary <- settlements_hdx_intersect$set_Nam_Ar == settlements_hdx_intersect$admin1Na_1
Now let’s relate the settlements to the geonames using geographic proximity and overlap with the admin3 as criteria. The end result is adding a gn_geonameid to each settlement. Initially I wanted to do this distance calculation within admin3 levels so as to avoid making links that cross the admin3 borders. However, it seems that some settlements are assigned to nearby administrative regions. My solution is to expand the search to include the nearby admin3 positions.
all_admin3Name <- sort(unique(settlements_hdx_intersect$admin3Name))
settlements_hdx_intersect_geonameid <- mclapply(all_admin3Name, function(x){
settlements_hdx_intersect_sub <- settlements_hdx_intersect %>% filter(admin3Name == x)
if(nrow(settlements_hdx_intersect_sub) == 0){
return(NULL)
}
# expand search to neghbors
neighbors <- leb_administrative_hdx_3$admin3Name[st_relate(leb_administrative_hdx_3 %>% filter(admin3Name == x), leb_administrative_hdx_3, pattern = "F***T****")[[1]]]
hdx_geonames_sub <- hdx_geonames_intersect %>% filter(admin3Name %in% c(x, neighbors))
# which_nearest <- sf::st_nearest_feature(settlements_hdx_intersect_sub, hdx_geonames_sub$geometry)
# cbind(settlements_hdx_intersect_sub[,c("set_Nam_Eng")],hdx_geonames_sub[which_nearest, c("gn_name")])
# perform a geographic distance check
distances_to_all <- sf::st_distance(settlements_hdx_intersect_sub, hdx_geonames_sub$geometry)
distances_to_all_percentile <- t(apply(distances_to_all, MAR=1, function(z){percent_rank(z)}))
distances_to_all_percentile_ok <- distances_to_all_percentile < 0.05
distances_to_all_percentile_which_ok <- lapply(1:nrow(distances_to_all_percentile_ok), function(y){which(distances_to_all_percentile_ok[y,])})
distances_to_all_min <- as.list(apply(distances_to_all, MAR = 1, which.min))
#perform a string distance check: is the nearest object the closest in semantic terms? If not, pick a closer one
edit_dist_matrix <- adist(settlements_hdx_intersect_sub$set_Nam_Eng, hdx_geonames_sub$gn_name)
edit_dist_matrix_percentile <- t(apply(edit_dist_matrix, MAR = 1, percent_rank))
edit_dist_matrix_percentile_ok <- edit_dist_matrix_percentile < 0.05
edit_dist_matrix_percentile_which_ok <- lapply(1:nrow(edit_dist_matrix_percentile_ok), function(y){which(edit_dist_matrix_percentile_ok[y,])})
edit_dist_matrix_min <- as.list(apply(edit_dist_matrix, MAR = 1, which.min))
which_nearest <- mapply(a=distances_to_all_percentile_which_ok, b = edit_dist_matrix_percentile_which_ok, c = distances_to_all_min, d = edit_dist_matrix_min,
FUN = function(a,b,c,d){
if(all(c %in% d)){
return(c[1])
}
candidates_percentiles <- intersect(a,b)
if(length(candidates_percentiles) == 1){return(candidates_percentiles)}
# narrow down based on proximity first, then based on edit_dist
prox_filt <- intersect(candidates_percentiles, c)
if(length(prox_filt ) == 1){return(prox_filt)}
edit_filt <- intersect(candidates_percentiles, d)
if(length(edit_filt ) == 1){return(edit_filt)}
# if everything fails, return NA
return(NA)
# if( (c%in%candidates_percentiles) | (d%in%candidates_percentiles) ){
# candidates_level2 <- intersect(union(c,d), candidates_percentiles)
# }else{
#
# }
}, SIMPLIFY = F)
which_nearest <- unlist(which_nearest)
# which_nearest <- which_nearest[dist_mins]
# settlements_hdx_intersect_sub$gn_geonameid <- hdx_geonames_sub$gn_geonameid[which_nearest]
# settlements_hdx_intersect_sub$gn_name <- hdx_geonames_sub$gn_name[which_nearest]
if(all(is.na(which_nearest))){
return(NULL)
}
return(data.frame(set_Pcode = settlements_hdx_intersect_sub$set_Pcode,
gn_geonameid = hdx_geonames_sub$gn_geonameid[which_nearest]))
}, mc.cores = ncores)
settlements_hdx_intersect_geonameid <- do.call(rbind, settlements_hdx_intersect_geonameid)
settlements_hdx_intersect$gn_geonameid <- settlements_hdx_intersect_geonameid$gn_geonameid[match(settlements_hdx_intersect$set_Pcode, settlements_hdx_intersect_geonameid$set_Pcode, incomparables = NA )]
ljoin <- left_join(settlements_hdx_intersect, hdx_geonames_intersect %>% select(-c("geometry")), by = "gn_geonameid")
hdx_geonames_settlement_intersect <- left_join(hdx_geonames_intersect,
settlements_hdx_intersect %>% select("set_Nam_Eng", "set_Nam_Ar", "set_Pcode", "gn_geonameid") %>% select(-c("geometry")), by = "gn_geonameid")
hdx_geonames_settlement_intersect %>% select(c("admin3Name", "admin3Na_1", "gn_name","set_Nam_Eng","set_Nam_Ar","alternatenames_appended")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of combined table")
Perform a check of edit distance between the set_Nam_Eng and all the alternatenames_appended. Get the minimum value and decide, based on that, to remove those settlements before adding the alternate names.
# perform a check of edit distance between the set_Nam_Eng and all the alternatenames_appended
min_dist_check <- function(x,y){
m <- min(adist(x, strsplit(y, ",")[[1]]))
check <- m < nchar(x)
if(is.na(check)){return(FALSE)}else{
return(check)
}
}
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(min_dist_ok = min_dist_check(set_Nam_Eng, alternatenames_appended))
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(set_Nam_Eng = ifelse(min_dist_ok, set_Nam_Eng, NA),
set_Nam_Ar = ifelse(min_dist_ok, set_Nam_Ar, NA),
set_Pcode = ifelse(min_dist_ok, set_Pcode, NA))
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(set_Nam_Eng, alternatenames_appended, min_dist_ok))
# now add the arabic names
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(set_Nam_Ar, alternatenames_appended, min_dist_ok))
hdx_geonames_settlement_intersect %>% filter(min_dist_ok) %>% select(c("admin3Name", "admin3Na_1", "gn_name","set_Nam_Eng","set_Nam_Ar","gn_alternatenames","alternatenames_appended")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of combined table where min_dist_ok is TRUE (new settlement terms that can be appended if the name doesn't exist)")
Let’s transliterate the settlement arabic terms and add them if they don’t already exist. Rich Neilson method first, then the ALA-LC
# Rich Neilson
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=arabicStemR::transliterate(set_Nam_Ar),y=alternatenames_appended,z=min_dist_ok))
# ALA-LC
set_Nam_Ar_noNA <- hdx_geonames_settlement_intersect$set_Nam_Ar[!is.na(hdx_geonames_settlement_intersect$set_Nam_Ar)]
chunks <- split(set_Nam_Ar_noNA, cut_number(1:length(set_Nam_Ar_noNA), ncores))
transliterated <- mclapply(1:length(chunks), function(x){
tmpfile_in <- paste0("../data/tmp_totransliterate.",x,".csv")
write.table(chunks[[x]], file = tmpfile_in, row.names = F, quote = F, col.names = T, sep = ",")
system(paste0 ("python3 ../src/transliterate.py ", tmpfile_in))
out <- read.csv(paste0(tmpfile_in, ".transliterated"))
out
}, mc.cores = ncores)
transliterated <- do.call(rbind, transliterated)
hdx_geonames_settlement_intersect$set_Nam_Ar_noNA_ala_lc <- transliterated$transliterations[match(hdx_geonames_settlement_intersect$set_Nam_Ar, transliterated$x, incomparables = NA )]
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=set_Nam_Ar_noNA_ala_lc,y=alternatenames_appended,z=min_dist_ok))
hdx_geonames_settlement_intersect %>% filter(min_dist_ok) %>% select(c("admin3Name", "admin3Na_1", "gn_name","set_Nam_Eng","set_Nam_Ar","gn_alternatenames","alternatenames_appended")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of combined table where min_dist_ok is TRUE (new settlement terms that can be appended if the name doesn't exist)")
Let’s now add the Open Street Map data that is made available by Geofabrik. This is updated daily. I also added some metadata that is available in this pdf file.
if(!dir.exists("../data/osm/")) dir.create("../data/osm/")
download.file("https://download.geofabrik.de/asia/lebanon-latest-free.shp.zip", destfile = "../data/osm/lebanon-latest-free.shp.zip")
unzip("../data/osm/lebanon-latest-free.shp.zip", exdir = "../data/osm/")
osm_layers_data <- data.frame(filepath = list.files("../data/osm", pattern = "gis_(.)+_free_1.shp", full.names = T))
osm_layers_data$filename <- gsub("(.)+\\/", "", osm_layers_data$filepath)
osm_layers_data$layer_content <- gsub("gis_osm_", "", gsub("_free(.)+", "", osm_layers_data$filename))
#Enhance metadata based on the following pdf: http://download.geofabrik.de/osm-data-in-gis-formats-free.pdf
osm_layers_data$layer_content_description <- plyr::mapvalues(gsub("_a", "", osm_layers_data$layer_content),
from = c("buildings",
"landuse",
"water",
"waterways",
"roads",
"railways",
"traffic",
"transport",
"places",
"pois",
"pofw",
"natural"),
to = c("Building outlines - Code: 15xx",
"Forests, residential areas, industrial areas,... - Code: 72xx",
"Lakes, ... - Code: 82xx",
"Rivers, canals, streams, ... - Code: 81xx",
"Roads, tracks, paths, ... - Code: 51xx",
"Railway, subways, light rail, trams, ... - Code: 61xx",
"Traffic related - Code: 52xx",
"Parking lots, petrol (gas) stations, ... - Code: 50xx",
"Cities, towns, suburbs, villages,... - Code: 10xx",
"Points of interest, many categories - Code: 2xxx",
"Places of worship such as churches, mosques, ... - Code: 3xxx",
"Natural features - Code: 41xx"))
osm_layers_data$layer_content_code <- gsub("(.)+Code: ", "", osm_layers_data$layer_content_description)
osm_layers_data$layer_content_description <- gsub(" - Code(.)+", "", osm_layers_data$layer_content_description)
osm_layers_data$layer_is_area <- grepl("_a_", osm_layers_data$filename)
osm_layers_data <- osm_layers_data %>% as_tibble()
osm_layers_data$shp <- lapply(osm_layers_data$filepath, sf::read_sf)
osm_layers_data <- osm_layers_data %>% rowwise() %>% mutate(nfeatures = nrow(shp),
fclasses = paste(sort(unique(shp$fclass)), collapse = ", "),
flcasses_count = paste(paste0(names(sort(table(shp$fclass), decreasing = T) ), " (", as.numeric(sort(table(shp$fclass), decreasing = T) ),")"), collapse = ", ") )
osm_layers_data %>% select(-c("filepath", "shp", "fclasses")) %>% DT::datatable(caption = "Metadata of OSM shapefiles made available by Geofabrik.de")
The places map is of immediate interest here.
osm_layers_data %>% filter(layer_content == "places") %>% select("shp") %>% nth(1) %>% nth(1) %>% as_tibble() %>% select(-c("geometry")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of OSM places")
Let’s overlap and apply a similar pipeline for intersection. First admin level 3 layer, then with settlements of UN/OCHA
osm_places <- osm_layers_data %>% filter(layer_content == "places") %>% select("shp") %>% nth(1) %>% nth(1)
colnames(osm_places)[-length(colnames(osm_places))] <- paste0("osm_", colnames(osm_places)[-length(colnames(osm_places))])
osm_places_intersect_admin3 <- sf::st_intersection(osm_places, leb_administrative_hdx_3)
osm_places_intersect_admin3_Pcode <- mclapply(unique(osm_places_intersect_admin3$admin3Name), function(x){
query_sub <- osm_places_intersect_admin3 %>% filter(admin3Name == x)
if(nrow(query_sub) == 0){
return(NULL)
}
# expand search to neghbors
neighbors <- leb_administrative_hdx_3$admin3Name[st_relate(leb_administrative_hdx_3 %>% filter(admin3Name == x), leb_administrative_hdx_3, pattern = "F***T****")[[1]]]
target_sub <- settlements_hdx_intersect %>% filter(admin3Name %in% c(x, neighbors))
# perform a geographic distance check
distances_to_all <- sf::st_distance(query_sub, target_sub$geometry)
distances_to_all_percentile <- t(apply(distances_to_all, MAR=1, function(z){percent_rank(z)}))
distances_to_all_percentile_ok <- distances_to_all_percentile < 0.05
distances_to_all_percentile_which_ok <- lapply(1:nrow(distances_to_all_percentile_ok), function(y){which(distances_to_all_percentile_ok[y,])})
distances_to_all_min <- as.list(apply(distances_to_all, MAR = 1, which.min))
#perform a string distance check: is the nearest object the closest in semantic terms? If not, pick a closer one
query_isArabic <- ifelse(is_arabic(query_sub$osm_name), "arabic", "english")
target_cols <- c("arabic" = "set_Nam_Ar", "english" = "set_Nam_Eng")[query_isArabic]
edit_dist_matrix <- t(mapply(a = query_sub$osm_name, b = target_cols, function(a,b){adist(a, target_sub[,b] %>% as_tibble() %>% pull(b) )}))
edit_dist_matrix_percentile <- t(apply(edit_dist_matrix, MAR = 1, percent_rank))
edit_dist_matrix_percentile_ok <- edit_dist_matrix_percentile < 0.05
edit_dist_matrix_percentile_which_ok <- lapply(1:nrow(edit_dist_matrix_percentile_ok), function(y){which(edit_dist_matrix_percentile_ok[y,])})
edit_dist_matrix_min <- as.list(apply(edit_dist_matrix, MAR = 1, which.min))
which_nearest <- mapply(a=distances_to_all_percentile_which_ok, b = edit_dist_matrix_percentile_which_ok, c = distances_to_all_min, d = edit_dist_matrix_min,
FUN = function(a,b,c,d){
if(all(c %in% d)){
return(c[1])
}
candidates_percentiles <- intersect(a,b)
if(length(candidates_percentiles) == 1){return(candidates_percentiles)}
# narrow down based on proximity first, then based on edit_dist
prox_filt <- intersect(candidates_percentiles, c)
if(length(prox_filt ) == 1){return(prox_filt)}
edit_filt <- intersect(candidates_percentiles, d)
if(length(edit_filt ) == 1){return(edit_filt)}
# if everything fails, return NA
return(NA)
# if( (c%in%candidates_percentiles) | (d%in%candidates_percentiles) ){
# candidates_level2 <- intersect(union(c,d), candidates_percentiles)
# }else{
#
# }
}, SIMPLIFY = F)
which_nearest <- unlist(which_nearest)
# which_nearest <- which_nearest[dist_mins]
# settlements_hdx_intersect_sub$gn_geonameid <- hdx_geonames_sub$gn_geonameid[which_nearest]
# settlements_hdx_intersect_sub$gn_name <- hdx_geonames_sub$gn_name[which_nearest]
if(all(is.na(which_nearest))){
return(NULL)
}
return(data.frame(osm_osm_id = query_sub$osm_osm_id,
set_Pcode = target_sub$set_Pcode[which_nearest]))
}, mc.cores = ncores)
osm_places_intersect_admin3_Pcode <- do.call(rbind, osm_places_intersect_admin3_Pcode)
osm_places_intersect_admin3$set_Pcode <- osm_places_intersect_admin3_Pcode$set_Pcode[match(osm_places_intersect_admin3$osm_osm_id, osm_places_intersect_admin3_Pcode$osm_osm_id, incomparables = NA)]
osm_places_intersect_admin3 %>% as_tibble() %>% left_join(settlements_hdx_intersect %>% as_tibble(), by = "set_Pcode") %>% select(c("osm_osm_id","set_Pcode", "osm_name", "set_Nam_Ar", "set_Nam_Eng")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of intersection between OSM places and UN Settlements by osm_id")
So far, 1384 OSM places out of 1782 are mapped to a UN settlement. How about linking them with geonames?
osm_places_intersect_admin3_geonames <- mclapply(unique(osm_places_intersect_admin3$admin3Name), function(x){
query_sub <- osm_places_intersect_admin3 %>% filter(admin3Name == x)
if(nrow(query_sub) == 0){
return(NULL)
}
# expand search to neghbors
neighbors <- leb_administrative_hdx_3$admin3Name[st_relate(leb_administrative_hdx_3 %>% filter(admin3Name == x), leb_administrative_hdx_3, pattern = "F***T****")[[1]]]
target_sub <- hdx_geonames_intersect %>% filter(admin3Name %in% c(x, neighbors))
# perform a geographic distance check
distances_to_all <- sf::st_distance(query_sub, target_sub$geometry)
distances_to_all_percentile <- t(apply(distances_to_all, MAR=1, function(z){percent_rank(z)}))
distances_to_all_percentile_ok <- distances_to_all_percentile < 0.05
distances_to_all_percentile_which_ok <- lapply(1:nrow(distances_to_all_percentile_ok), function(y){which(distances_to_all_percentile_ok[y,])})
distances_to_all_min <- as.list(apply(distances_to_all, MAR = 1, which.min))
#perform a string distance check: is the nearest object the closest in semantic terms? If not, pick a closer one
query_isArabic <- ifelse(is_arabic(query_sub$osm_name), "arabic", "english")
# take the gn_name as the english name and the first occurence of an arabic name in alternate names as a second name
target_sub$arabic <- strsplit(target_sub$alternatenames_appended, ",") %>% sapply(function(y){
ia <- is_arabic(y)
if(length(y) == 0 ){
return(NA)
}else if(!any(ia)){
return(NA)
}else{
return(y[min(which(ia))])
}
})
target_sub$english <- target_sub$gn_name
target_cols <- c("arabic" = "arabic", "english" = "english")[query_isArabic]
edit_dist_matrix <- t(mapply(a = query_sub$osm_name, b = target_cols, function(a,b){adist(a, target_sub[,b] %>% as_tibble() %>% pull(b) )}))
edit_dist_matrix_percentile <- t(apply(edit_dist_matrix, MAR = 1, percent_rank))
edit_dist_matrix_percentile_ok <- edit_dist_matrix_percentile < 0.05
edit_dist_matrix_percentile_which_ok <- lapply(1:nrow(edit_dist_matrix_percentile_ok), function(y){which(edit_dist_matrix_percentile_ok[y,])})
edit_dist_matrix_min <- as.list(apply(edit_dist_matrix, MAR = 1, which.min))
which_nearest <- mapply(a=distances_to_all_percentile_which_ok, b = edit_dist_matrix_percentile_which_ok, c = distances_to_all_min, d = edit_dist_matrix_min,
FUN = function(a,b,c,d){
if(all(c %in% d)){
return(c[1])
}
candidates_percentiles <- intersect(a,b)
if(length(candidates_percentiles) == 0){return(NA)} ##### think of adding everywhere
if(length(candidates_percentiles) == 1){return(candidates_percentiles)}
# narrow down based on proximity first, then based on edit_dist
prox_filt <- intersect(candidates_percentiles, c)
if(length(prox_filt ) == 1){return(prox_filt)}
edit_filt <- intersect(candidates_percentiles, d)
if(length(edit_filt ) == 1){return(edit_filt)}
# if everything fails, return NA
return(NA)
}, SIMPLIFY = F)
which_nearest <- unlist(which_nearest)
# which_nearest <- which_nearest[dist_mins]
# settlements_hdx_intersect_sub$gn_geonameid <- hdx_geonames_sub$gn_geonameid[which_nearest]
# settlements_hdx_intersect_sub$gn_name <- hdx_geonames_sub$gn_name[which_nearest]
if(all(is.na(which_nearest))){
return(NULL)
}
return(data.frame(osm_osm_id = query_sub$osm_osm_id,
gn_geonameid = target_sub$gn_geonameid[which_nearest]))
}, mc.cores = ncores)
osm_places_intersect_admin3_geonames <- do.call(rbind, osm_places_intersect_admin3_geonames)
osm_places_intersect_admin3_geonames$gn_geonameid <- as.integer(osm_places_intersect_admin3_geonames$gn_geonameid)
osm_places_intersect_admin3$gn_geonameid <- osm_places_intersect_admin3_geonames$gn_geonameid[match(osm_places_intersect_admin3$osm_osm_id, osm_places_intersect_admin3_geonames$osm_osm_id, incomparables = NA)]
osm_places_intersect_admin3 %>% as_tibble() %>% left_join(hdx_geonames_intersect %>% as_tibble(), by = "gn_geonameid") %>% select(c("osm_osm_id","set_Pcode", "osm_name", "gn_geonameid", "gn_name", "alternatenames_appended")) %>% head(n = 100) %>% DT::datatable(caption = "First 100 records of intersection between OSM places and geonames by geonameid")
Finally add OSM names to alternate names list if they are not already there
hdx_geonames_settlement_intersect$osm_osm_id_by_geonameid <- osm_places_intersect_admin3$osm_osm_id[match(hdx_geonames_settlement_intersect$gn_geonameid, osm_places_intersect_admin3$gn_geonameid, incomparables = NA)]
hdx_geonames_settlement_intersect$osm_osm_id_by_set_Pcode <- osm_places_intersect_admin3$osm_osm_id[match(hdx_geonames_settlement_intersect$set_Pcode, osm_places_intersect_admin3$set_Pcode, incomparables = NA)]
hdx_geonames_settlement_intersect$osm_name_by_geonameid <- osm_places_intersect_admin3$osm_name[match(hdx_geonames_settlement_intersect$gn_geonameid, osm_places_intersect_admin3$gn_geonameid, incomparables = NA)]
hdx_geonames_settlement_intersect$osm_name_by_set_Pcode <- osm_places_intersect_admin3$osm_name[match(hdx_geonames_settlement_intersect$set_Pcode, osm_places_intersect_admin3$set_Pcode,incomparables = NA)]
hdx_geonames_settlement_intersect$osm_name_toappend <- hdx_geonames_settlement_intersect$osm_name_by_geonameid == hdx_geonames_settlement_intersect$osm_name_by_set_Pcode # only add the name when the two osm names agree
hdx_geonames_settlement_intersect$osm_name_toappend[is.na(hdx_geonames_settlement_intersect$osm_name_toappend )] <- F
hdx_geonames_settlement_intersect <- hdx_geonames_settlement_intersect %>% rowwise() %>% mutate(alternatenames_appended = append_alternate(x=osm_name_by_set_Pcode,y=alternatenames_appended,z=osm_name_toappend))
Let’s compare the starting points with the endpoint.
Some results:
Here’s how the result looks like:
In red dots are all settlements that are still not “integrated” using this pipeline. Orange dots are non-integrated OSM places. Circles with blue fill are UN settlements or OSM places that have been successfully linked to geonames. Circles with green fill are points where OSM, Settlements and geonames are linked.
# forplot <- sf::st_coordinates(hdx_geonames_settlement_intersect$geometry.x) %>% as_tibble() %>% rename(lat = Y, lon = X) %>% cbind(hdx_geonames_settlement_intersect)
forplot <- hdx_geonames_settlement_intersect
forplot <- forplot %>% rowwise() %>% mutate(lab =paste(
c(paste0("admin3Name: ",admin3Name),
paste0("gn_name: ",gn_name),
paste0("gn_geonameid: ", gn_geonameid),
paste0("Settlement english name:", set_Nam_Eng),
paste0("Settlement Pcode:", set_Pcode),
paste0("OSM Name(s):", paste(unique(c(osm_name_by_geonameid, osm_name_by_set_Pcode)), collapse = ",") ),
paste0("OSM IDs(s):", paste(unique(c(osm_osm_id_by_geonameid, osm_osm_id_by_set_Pcode)), collapse = ",") ),
paste0("Alternate Names (old): ", gn_alternatenames),
paste0("Added names: <b>", gsub(gn_alternatenames, "", alternatenames_appended), "</b>") ),
collapse = "<br>"))
forplot$lab <- lapply(forplot$lab, htmltools::HTML)
# admin2Col <- colorFactor(topo.colors(length(unique(forplot$admin2Name))), unique(forplot$admin2Name))
forplot$isSettlement <- !is.na(forplot$set_Nam_Eng)
forplot$isOSM <- !is.na(forplot$osm_osm_id_by_geonameid) | !is.na(forplot$osm_osm_id_by_set_Pcode)
forplot$point_fill <- c("#FFFFFF00", "blue", "green")[forplot$isSettlement + forplot$isOSM + 1]
settlements_not_assigned <- leb_settlements %>% filter(!leb_settlements$set_Pcode %in% hdx_geonames_settlement_intersect$set_Pcode, !leb_settlements$set_Pcode %in% osm_places_intersect_admin3$set_Pcode)
osm_not_assigned <- osm_places_intersect_admin3 %>% filter(is.na(set_Pcode),is.na(gn_geonameid))
forplot %>% leaflet() %>% addTiles() %>%
addCircleMarkers(label = ~lab, fillColor= ~point_fill, fillOpacity = 0.5, clusterOptions = markerClusterOptions(),
clusterId = "whatever") %>%
addCircles(data = settlements_not_assigned, label = ~set_Nam_Eng,color = "red") %>%
addCircles(data = osm_not_assigned, label = ~osm_name,color = "orange")
Write out the final file in:
data/lebanon_geo_names_fill.csv (also a zipped version)hdx_geonames_settlement_intersect %>% as_tibble() %>% readr::write_csv(path = "../data/lebanon_geo_names_full.csv")
zip(zipfile = "../data/lebanon_geo_names_full.zip", files = "../data/lebanon_geo_names_full.csv")