# install and load packages as needed. 
pacman::p_load(sp, raster, leaflet, rgdal, 
               tidyverse, geosphere, rgeos,
               viridis, stats, ggplot2, 
               kableExtra )

Malaria test positive prevalence, table and map:

# Data Aquisition:
ETH_Adm_2 <- raster::getData("GADM", 
                             country="ETH", 
                             level =2)
ETH_malaria_data <- read.csv("https://raw.githubusercontent.com/HughSt/HughSt.github.io/master/course_materials/week1/Lab_files/Data/mal_data_eth_2009_no_dups.csv",header=T)

ETH_land_use <- raster("https://github.com/HughSt/HughSt.github.io/blob/master/course_materials/week2/Lab_files/ETH_land_use.tif?raw=true")
# Spatial Formatting:
ETH_malaria_data_SPDF <- SpatialPointsDataFrame(
  coords = ETH_malaria_data[,c('longitude','latitude')],  data = ETH_malaria_data[,c("examined", 
                    "pf_pos",
                    "pf_pr")],
        proj4string = CRS("+init=epsg:4326"))

# matching CRS.
ETH_malaria_data_SPDF <-
  spTransform(ETH_malaria_data_SPDF, crs(ETH_Adm_2))

# Define points over Adm_2 polygons.
ETH_Adm_2_per_point <- 
  over(ETH_malaria_data_SPDF, ETH_Adm_2)

# sum() examined values at points over each polygon shape.
num_exams_per_Adm2 <- tapply(ETH_malaria_data_SPDF$examined, 
       ETH_Adm_2_per_point$NAME_2, sum)

# sum() disease positive values at point over each polygon shape.
num_pos_per_Adm2 <- 
  tapply(ETH_malaria_data_SPDF$pf_pos,
         ETH_Adm_2_per_point$NAME_2, sum)

# Divide summed positive values over examination values.
prev_per_Adm2 <- num_pos_per_Adm2 / num_exams_per_Adm2

# create dataframe with prevalence values. 
prev_per_Adm2_df <- data.frame(NAME_2 = names(prev_per_Adm2), prevalence = prev_per_Adm2, 
             row.names = NULL)

# merge prevalence dataframe with ETH_Adm_2 dataframe. 
ETH_Adm_2 <- merge(ETH_Adm_2, prev_per_Adm2_df, by = "NAME_2")

# structure data for adm2 level table. 
Adm_2_data <- ETH_Adm_2@data
Adm_2_tab <- Adm_2_data %>% 
  drop_na(prevalence) %>% 
  mutate(NAME_2 = as.factor(NAME_2)) %>% 
  group_by(NAME_2) %>% 
  summarise(prevalence)

# create table
kbl(Adm_2_tab,
    col.names = c("Location", "Prevalence")) %>%
   kable_styling(bootstrap_options = c("striped"), font_size = 12) %>% 
  column_spec(2, color = "white",
              background = spec_color(Adm_2_tab$prevalence))
Location Prevalence
Agnuak 0.0000000
Arsi 0.0000000
Asosa 0.0091743
Bale 0.0010619
Borena 0.0088224
Debub Mirab Shewa 0.0093458
Guji 0.0014144
Horo Guduru 0.0000000
Ilubabor 0.0000000
Jimma 0.0196347
Kelem Wellega 0.0000000
Mirab Arsi 0.0000000
Mirab Hararghe 0.0004114
Mirab Shewa 0.0046512
Mirab Welega 0.0004866
Misraq Harerge 0.0000000
Misraq Shewa 0.0000000
Misraq Wellega 0.0000000
# define palette and create map. 
colorPal <- colorNumeric("viridis", 
              domain = ETH_Adm_2$prevalence)

Adm_2_prevs_map <- leaflet() %>% 
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
  data= ETH_Adm_2, col=colorPal(ETH_Adm_2$prevalence), 
  fillOpacity = .3, weight = .7,
  highlightOptions = highlightOptions(color = "black", weight = 3,
      bringToFront = TRUE), popup = ETH_Adm_2$NAME_2) %>% 
  addLegend(pal = colorPal, 
  values = ETH_Adm_2$prevalence,
  title = "Prevalence")
Adm_2_prevs_map

Land class type and infection prevalence:

# extract() land use values present at points in malaria_data_SPDF.
ETH_malaria_data_SPDF$land_use <- 
  raster::extract(ETH_land_use, ETH_malaria_data_SPDF)

# rename values as per GLOBcover2009 convention. 
land_cat_prev <- ETH_malaria_data_SPDF@data %>% 
  mutate(land_use = case_when(
    land_use == 20 | land_use == 14  ~ "Cropland",
    land_use == 60 | land_use == 40 ~ "Forest",
    land_use == 110 ~ "Grassland",
    land_use == 130 ~ "Shrubland",
    land_use == 30 ~ "Vegetation"
  )) %>% 
  drop_na(land_use) %>% 
  group_by(land_use) %>% 
  # summarise to table
  summarise(mean_prev = mean(pf_pr, na.rm = FALSE)) %>% 
  mutate(land_use = fct_reorder(land_use, mean_prev)) 


# plot mean prevalence by land use type
ggplot(land_cat_prev) + 
  geom_col(aes(land_use, mean_prev, fill = land_use), 
           alpha = .6) +
   labs(x= "Land Use Types") +
  ylab("Test Positive Location, Prevalence > 0") +
  ggtitle("Mean Prevalence, Malaria Positive by Land Use Type: 2009") +
  theme(legend.position = "none") +
  scale_fill_viridis_d()

Test positive locations and climate data:

# worldclim dataset. 
climate <- getData("worldclim",var="bio",res=10)

# extract values matching malaria test site locations.
ETH_local_climate <- 
  raster::extract(climate, ETH_malaria_data_SPDF)

# transform to dataframe. 
ETH_local_climate_df <- 
  as.data.frame(ETH_local_climate)

# select and name variables of interest.
ETH_local_climate_df <-
  ETH_local_climate_df %>% 
  select(bio1, bio12) %>% 
  rename(annual_mean_temp = bio1, 
         annual_precip = bio12)   
# bind climate data to SPDF object.
land_cat_w_clim <-  cbind(ETH_malaria_data_SPDF@data, ETH_local_climate_df)

# structure for plotting.
ETH_climate_prevs <- 
  land_cat_w_clim %>% 
   mutate(land_use = factor(land_use)) %>% 
mutate(land_use = case_when(
    land_use == 20 | land_use == 14  ~ "Cropland",
    land_use == 60 | land_use == 40 ~ "Forest",
    land_use == 110 ~ "Grassland",
    land_use == 130 ~ "Shrubland",
    land_use == 30 ~ "Vegetation"
  )) %>%  
  drop_na(land_use) %>% 
  mutate(land_use = fct_reorder(land_use, pf_pr)) %>% 
  mutate(annual_precip= as.numeric(annual_precip / 1000)) %>%
  mutate(annual_mean_temp = as.numeric(annual_mean_temp / 10)) %>%
  filter(pf_pr > 0)
# exploratory pearson cor-coef.
cor_temp <- cor(ETH_climate_prevs$annual_mean_temp,
                  ETH_climate_prevs$pf_pr)

# scatter plot with smooth line and pasted cor-coef value.
ggplot(ETH_climate_prevs) + 
   geom_smooth(aes(annual_mean_temp, pf_pr), color = "light blue", size = 1.5, alpha = .5) +
    geom_point(aes(annual_mean_temp, pf_pr, color = land_use), alpha = .5, size = 2) +
  ylim(-.05, .15) +
   scale_color_viridis(discrete=TRUE, "Land Use Type") +
  xlab("Average Annual Temperature (Celsius)") +
  ylab("Test Positive Location, Prevalence > 0") +
  ggtitle(label = "Malaria Prevalence and Average Annual Temperature", 
    subtitle = paste0("Pearson's Cor = ", round(cor_temp, 3))) 

### Malaria test sites and precipitation:  ###

# exploratory pearson cor-coef.
cor_precip <- cor(ETH_climate_prevs$annual_precip,
                  ETH_climate_prevs$pf_pr)

# scatter plot with smooth line and pasted cor-coef value.
ggplot(ETH_climate_prevs) + 
  geom_smooth(aes(annual_precip, pf_pr), color = "light blue", size = 1.5, alpha = .5) +
    geom_point(aes(annual_precip, pf_pr, color = land_use), alpha = .5, size = 2) +
   ylim(-.05, .15) +
    xlab("Average Annual Precipitation (meters)") +
  ylab("Test Positive Location, Prevalence > 0") +
   scale_color_viridis(discrete=TRUE, "Land Use") +
  ggtitle(label = "Malaria Prevalence and Average Annual Precipitation", 
    subtitle = paste0("Pearson's Cor = ", round(cor_precip, 3))) 

Test positive sites and proximity to health facility:

# load African health facility locations data from local source. 
af_health_cent <- readxl::read_xlsx("who-cds-gmp-2019-01-eng.xlsx")

# subset health facilities in Ethiopia 
ETH_health_cent <- af_health_cent %>% 
  filter(Country == "Ethiopia") %>% 
  filter(Lat != "NA" & Long != "NA") %>% 
  select("Facility type", "Lat", "Long") %>% 
  rename(fac_type = "Facility type")

# transform to spatial object. 
ETH_health_cent_SPDF <- SpatialPointsDataFrame(
  coords = ETH_health_cent[,c('Long','Lat')],  
  data = ETH_health_cent[,'fac_type'],
        proj4string = CRS("+init=epsg:4326"))

plot(ETH_health_cent_SPDF)

# generate distance matrix. 
dist_matrix <- geosphere::distm(ETH_malaria_data_SPDF,
                                ETH_health_cent_SPDF)

# apply a min function to each row of malaria site data
ETH_malaria_data_SPDF$dist_to_health_cent <- 
  apply(dist_matrix, 1, min)

# structure malaria site data to plot. 
plot_health_cents_dist <- ETH_malaria_data_SPDF@data %>% 
  select(pf_pr, dist_to_health_cent, land_use) %>% 
  mutate(dist_to_health_cent = as.numeric(round(dist_to_health_cent / 1000, 1))) %>% 
  mutate(land_use = factor(land_use)) %>% 
mutate(land_use = case_when(
    land_use == 20 | land_use == 14  ~ "Cropland",
    land_use == 60 | land_use == 40 ~ "Forest",
    land_use == 110 ~ "Grassland",
    land_use == 130 ~ "Shrubland",
    land_use == 30 ~ "Vegetation"
  )) %>%  
  drop_na(land_use) %>% 
  mutate(land_use = fct_reorder(land_use, pf_pr)) %>%
  filter(pf_pr > 0) %>% 
  # limit outlier value.
  filter(dist_to_health_cent < 15)
# exploratory pearson cor-coef.
cor_dist <- cor(plot_health_cents_dist$dist_to_health_cent,
                  plot_health_cents_dist$pf_pr)

# scatter plot with smooth line and pasted cor-coef value.
ggplot(plot_health_cents_dist) + 
  geom_smooth(aes(dist_to_health_cent, pf_pr), color = "light blue", size = 1.5, alpha = .5) +
    geom_point(aes(dist_to_health_cent, pf_pr, color = land_use), alpha = .5, size = 2) +
   ylim(-.05, .15) +
    xlab("Minimum Distance to Health Centre, Clinic, or Hosptial (Km)") +
  ylab("Test Positive Location, Prevalence > 0") +
   scale_color_viridis(discrete=TRUE, "Land Use Type") +
  ggtitle(label = "Malaria Prevalence and Proximity to Health Services", 
    subtitle = paste0("Pearson's Cor = ", round(cor_dist, 3))) 

~fin