Compiled on 2020-10-28 by Maroun Bou Sleiman. Contact me at maroun@openmaplebanon.org
Comparing the population estimates from different sources. The goal is to sharpen and calibrate the Covid-19 response.
Already prepared in the mobility analysis notebook. We only need to read the file in. Here are the first few records:
knitr::opts_chunk$set(dpi = 150, fig.path='../data/Figs/population_validation/',
echo=TRUE, warning=FALSE, message=FALSE,
out.width='100%',out.height = "500px")
suppressPackageStartupMessages(library(rhdx))
suppressPackageStartupMessages(library(leaflet))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(sf))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(plotly))
options(scipen=999)
mun_map <- readRDS("../leb_covid_movement/data/mun_map.RDS")
mun_map$st_area <- st_area(mun_map)
mun_map_stripped <- mun_map %>% select(code, nm_resi,adm3_name,kada_na, geometry, st_area)
DT::datatable(head(mun_map, n = 100), caption = "First 100 records of municipality map")
There are 1359, covering an area of 10236.9518034261 square km, with a population total of 10.631564 million people. Seems to be too high, which may decrease the risk scores. Remember: the risk score is calculated as the number of new cases in the last 14 days divided by the number of residents (actually by 100,000 residents). Therefore, overestimating the population has a more severe effect than underestimating.
I will get the data using the rhdx package and searching for Lebanon High Resolution Population Density Maps.
fb_pop <- rhdx::search_datasets(query="Lebanon High Resolution Population Density Maps", rows = 1) %>% nth(1) %>%
get_resources() %>% nth(2) %>% download_resource(folder = "../data/")
fb_pop <- unzip(fb_pop, exdir = "../data/")
fb_pop <- read.csv(fb_pop)
DT::datatable(head(fb_pop))
The facebook dataset has 647048 values, which account for a population estimate of 5891480.7655107. These values form a grid of 1 arc second, which is 30 * 30 meters at the equator [note to self, verify this]. Each value is a statistical estimate of the number of people living in that area. Note that these values can be decimals and smaller than 1. Now we need to intersect these points with the municipality polygons, and summarize the data by municipality.
fb_pop_sfc <- mclapply(1:nrow(fb_pop),FUN = function(x){
st_point(c(fb_pop[x,2],fb_pop[x,1]))
}, mc.cores = detectCores())
fb_pop_sfc <- st_sfc(fb_pop_sfc, crs= st_crs(mun_map))
fb_pop_sf <- st_sf(data.frame(Population=fb_pop[,"Population"], geom=fb_pop_sfc))
fb_pop_sf_intersect <- mclapply(split(fb_pop_sf, ggplot2::cut_number(1:nrow(fb_pop_sf), 50)), function(x){st_intersection(x, mun_map_stripped)},
mc.cores = detectCores())
fb_pop_sf_intersect <- do.call(rbind,fb_pop_sf_intersect)
fb_pop_sf_intersect_summary <- fb_pop_sf_intersect %>% group_by(kada_na, adm3_name, code) %>%
summarize(Population_mean_dens = mean(Population), fb_nm_resi = sum(Population), nm_resi = unique(nm_resi), area = unique(st_area) )
fb_pop_sf_intersect_summary <- fb_pop_sf_intersect_summary %>% rowwise() %>%
mutate(diff = fb_nm_resi - nm_resi, rel_diff = abs((nm_resi-fb_nm_resi))/nm_resi )
p <- fb_pop_sf_intersect_summary %>% ggplot(aes(x = nm_resi, y = fb_nm_resi, adm3_name = adm3_name)) +
geom_point() +
# coord_fixed(ratio = 1) +
geom_abline(slope = 1) +
geom_text(data = fb_pop_sf_intersect_summary %>% filter(abs(diff) > 100000), aes(label = adm3_name), nudge_x =50000) +
theme_cowplot() +
xlab("Current estimate") +
ylab("Facebook estimate")
ggplotly(p)
(#fig:prepare_compare_fb)x-axis is the current estimate (as used in the risk calculations), the y-axis is the Facebook estimate
Now compare by Caza:
fb_pop_sf_intersect_summary_bycaza <- fb_pop_sf_intersect_summary %>% group_by(kada_na) %>% summarize(total_pop_gov = sum(nm_resi), total_pop_fb = sum(fb_nm_resi)) %>%
mutate(diff = abs(total_pop_fb - total_pop_gov))
g <- fb_pop_sf_intersect_summary_bycaza %>%
ggplot(aes(x = total_pop_gov, y = total_pop_fb, label = kada_na)) +
geom_point() +
# coord_fixed(ratio = 1) +
geom_abline(slope = 1) +
geom_text(data = fb_pop_sf_intersect_summary_bycaza %>% filter(abs(diff) > 100000), aes(label = kada_na), nudge_x =100000) +
theme_cowplot() +
xlab("Current estimate") +
ylab("Facebook estimate")
ggplotly(g)
Figure 2.1: x-axis is the current estimate (as used in the risk calculations), the y-axis is the Facebook estimate
So what if the Facebook data were true? What’s the impact on risk?
mun_map$fb_pop <- fb_pop_sf_intersect_summary$fb_nm_resi[match(mun_map$code, fb_pop_sf_intersect_summary$code)]
mun_map <- mun_map %>% arrange(-risk_lev) %>%
rowwise() %>%
mutate(rate_gov = num_14_1/(nm_resi/100000), rate_fb = num_14_1/(fb_pop/100000))
g <- mun_map %>% filter(num_14_1 >=5) %>%
ggplot(aes(x = rate_gov,
y = rate_fb,
fill = risk_level_clean,
adm3_name = adm3_name,
color = lockdown_october13,
nm_resi = nm_resi,
nm_resi_fb = round(fb_pop),
num_14_1=num_14_1)) +
geom_point(size = 3,stroke = 1.5, alpha = 0.5, pch = 21)+
# coord_fixed(ratio = 1) +
theme_cowplot() +
geom_vline(xintercept = 110) +
geom_hline(yintercept = 110) +
ylim(0,1500) +
xlim(0,1500) +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black"))
g +
annotate("text", x= 1000, y = 1000, label = "High risk using both metrics", angle = 45, size = 6) +
annotate("text", x= 0, y = 1000, label = "High risk using facebook estimate only", angle = 90, size = 6) +
annotate("text", x= 1000, y = 0, label = "High risk using current estimate only", angle = 0, size = 6) +
annotate("rect", xmin=10, xmax = 10, ymin = 0, ymax = 10)
Interactive version of the same plot:
ggplotly(g)
So this means that if we were to take a decision using the different population estimates, here are the two scenarios:
The intersection of both is 186, of which 132 are currently under lockdown.
Here’s a graphical view.
p <- mun_map %>% filter(num_14_1 >=5) %>% mutate(high_risk_gov = ifelse(rate_gov >= 110, "Yes", "No"),
high_risk_fb = ifelse(rate_fb >= 110, "Yes", "No"),
currently_in_lockdown = ifelse(lockdown_october13, "In lockdown", "Not in lockdown")) %>%
ggplot(aes(x = high_risk_gov, fill = high_risk_fb)) + stat_count(position = "dodge") +
theme_cowplot() +
ylab("Number of municipalities") +
xlab("High risk in official calculation") +
facet_grid(~currently_in_lockdown) +
scale_fill_manual(values = c("Yes" = "Red", "No" = "grey"), name = "High risk in Facebook-based calculation") +
theme(legend.position = "bottom") +
geom_text(stat='count', aes(label=..count..), vjust=-0.5, position = position_dodge(width = 1))
ggplotly(p)
The Central Administration of Statistics has published a report titled: “Labour Force and Household Living Conditions Survey 2018-2019”. Let’s call it LFHLCS from now. here
Table 1.1 of page 18 is the most relevant here and contains the number of residents per caza. I manually prepared a file with the information in tab-delimited format.
LFHLCS <- read.table("../data/LFHLCS_2018_2019_table1.1.tsv", sep = "\t", header = T)
DT::datatable(LFHLCS, caption = "LFHLCS table 1.1")
The total in this table is 4842400 Note that this does not account for all people in Lebanon. I quote them directly:
The scope of the LFHLCS covered the population of Lebanon living in residential dwellings in the time period from April 2018 to March 2019, divided into four rounds. Like any other household survey, it excluded the population living in non-residential units, such as construction and agriculture sites, shops, stores, factories, unfinished buildings, army barracks, refugee camps and adjacent gatherings, and informal settlements, etc.
We will still need some UNHCR data for Syrian refugees from here.
unhcr_governorate <- jsonlite::fromJSON("https://data2.unhcr.org/population/get/sublocation?widget_id=161367&geo_id=71&sv_id=4&population_collection=24&forcesublocation=0&fromDate=1900-01-01")$data
unhcr_governorate$individuals <- as.numeric(unhcr_governorate$individuals)
DT::datatable(unhcr_governorate)
The total Syrian population as per the UNRWA is currently 879529
As for palestinian refugees, UNRWA estimates that there are 180,000 living in the country. I must say there are so many conflicting estimates here too. So if we add up the CAS data + the UNHCR Syrian Refugees + the UNRWA Palestinian Refugees, we get 5901929, which is close to the Facebook estimate. Does the sub-national distribution make sense? Let’s compare this data to the current municipal map data and the facebook data at the governorate level. First need to combine the LFHLCS and the UNHCR data. I will not include the Palestinian refugees for now, as I do not have a reliable source of sub-national data yet.
unhcr_governorate$governorate <- gsub(" Lebanon", "", unhcr_governorate$geomaster_name)
LFHLCS_UNHCR <- left_join(LFHLCS %>% group_by(governorate) %>% summarize(number_individuals = sum(number_individuals, na.rm = T)), unhcr_governorate, by = "governorate") %>% rowwise() %>% mutate(number_residents_LFHLCS_UNHCR = sum(c(number_individuals, individuals), na.rm = T) ) %>% mutate(number_residents_LFHLCS = number_individuals, number_residents_UNHCR = individuals) %>% select(governorate, number_residents_LFHLCS, number_residents_UNHCR, number_residents_LFHLCS_UNHCR)
comparison_governorate <- mun_map %>% mutate(governorate = moh_new) %>% group_by(governorate) %>% summarize(number_residents_gov = sum(nm_resi, na.rm = T),
number_residents_fb = sum(fb_pop, na.rm = T)) %>% left_join(LFHLCS_UNHCR, by = "governorate")
barplot(colSums(comparison_governorate %>% select(-governorate) %>% select(number_residents_gov, number_residents_fb, number_residents_LFHLCS_UNHCR), na.rm = T),
names.arg = c("Current", "Facebook", "LFHLCS+UNHCR"), ylab = "Number of individuals")
The Facebook data is awefully close to the LFHLCS+UNHCR data, could it be because they used (some of) the same sources as input or calibration? or is their population density estimate that good? In any case, the Current data used by the Government to calculate risks by municipality is likely very inaccurate and should be revised.
a <- comparison_governorate %>% ggplot(aes(x = number_residents_LFHLCS_UNHCR/1e6, y = number_residents_gov/1e6, label = governorate)) +
geom_label() +
geom_abline(slope = 1) +
theme_cowplot() +
xlab("LFHLCS+UNHCR estimate (millions)") +
ylab("Current Government estimate (millions)")
b <- comparison_governorate %>% ggplot(aes(x = number_residents_LFHLCS_UNHCR/1e6, y = number_residents_fb/1e6, label = governorate)) +
geom_label() +
geom_abline(slope = 1) +
theme_cowplot()+
xlab("LFHLCS+UNHCR estimate (millions)")+
ylab("Facebook estimate (millions)")
plot_grid(a,b, nrow = 1)
mun_map %>% as_tibble() %>% select(-geometry) %>% select(code, adm3_name,kada_na, nm_resi, fb_pop, rate_gov, rate_fb, risk_level, lockdown_october13) %>% DT::datatable(options = list(buttons = c('copy', 'csv', 'excel')))
write.table(mun_map %>% select(code, adm3_name, kada_na, moh_new, nm_resi, fb_pop), file = "../data/cnrs_gis/cnrs_gis_population_new.csv", sep = ",")