Explore data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
water_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv")
## Rows: 473293 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): report_date, status_id, water_source, water_tech, facility_type, co...
## dbl (4): row_id, lat_deg, lon_deg, install_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
water_raw %>%
  filter(
    country_name == "Sierra Leone",
    lat_deg > 0, lat_deg < 15, lon_deg < 0,
    status_id %in% c("y", "n")
  ) %>%
  ggplot(aes(lon_deg, lat_deg, color = status_id)) +
  geom_point(alpha = 0.1) +
  coord_fixed() +
  guides(color = guide_legend(override.aes = list(alpha = 1)))

water <- water_raw %>%
  filter(
    country_name == "Sierra Leone",
    lat_deg > 0, lat_deg < 15, lon_deg < 0,
    status_id %in% c("y", "n")
  ) %>%
  mutate(pay = case_when(
    str_detect(pay, "^No") ~ "no",
    str_detect(pay, "^Yes") ~ "yes",
    is.na(pay) ~ pay,
    TRUE ~ "it's complicated"
  )) %>%
  select(-country_name, -status, -report_date) %>%
  mutate_if(is.character, as.factor)
water %>%
  ggplot(aes(install_year, y = ..density.., fill = status_id)) +
  geom_histogram(position = "identity", alpha = 0.5) +
  labs(fill = "Water available?")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7074 rows containing non-finite values (`stat_bin()`).

water %>%
  ggplot(aes(y = pay, fill = status_id)) +
  geom_bar(position = "fill") +
  labs(fill = "Water available?")

Building a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
set.seed(123)
water_split <- initial_split(water, strata = status_id)
water_train <- training(water_split)
water_test <- testing(water_split)

set.seed(234)
water_folds <- vfold_cv(water_train, strata = status_id)
water_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [36984/4110]> Fold01
##  2 <split [36984/4110]> Fold02
##  3 <split [36984/4110]> Fold03
##  4 <split [36984/4110]> Fold04
##  5 <split [36984/4110]> Fold05
##  6 <split [36985/4109]> Fold06
##  7 <split [36985/4109]> Fold07
##  8 <split [36985/4109]> Fold08
##  9 <split [36985/4109]> Fold09
## 10 <split [36986/4108]> Fold10
usemodels::use_ranger(status_id ~ ., data = water_train)
## ranger_recipe <- 
##   recipe(formula = status_id ~ ., data = water_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("classification") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(74403)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(themis)
ranger_recipe <-
  recipe(formula = status_id ~ ., data = water_train) %>%
  update_role(row_id, new_role = "id") %>%
  step_unknown(all_nominal_predictors()) %>%
  step_other(all_nominal_predictors(), threshold = 0.03) %>%
  step_impute_linear(install_year) %>%
  step_downsample(status_id)

ranger_spec <-
  rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")

ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)

doParallel::registerDoParallel()
set.seed(74403)
ranger_rs <-
  fit_resamples(ranger_workflow,
    resamples = water_folds,
    control = control_resamples(save_pred = TRUE)
  )

Explore the results

collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.893    10 0.00138 Preprocessor1_Model1
## 2 roc_auc  binary     0.951    10 0.00102 Preprocessor1_Model1
collect_predictions(ranger_rs) %>%
  group_by(id) %>%
  roc_curve(status_id, .pred_n) %>%
  autoplot()

conf_mat_resampled(ranger_rs, tidy = FALSE) %>%
  autoplot()

final_fitted <- last_fit(ranger_workflow, water_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.898 Preprocessor1_Model1
## 2 roc_auc  binary         0.953 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
  conf_mat(status_id, .pred_class) %>%
  autoplot()

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_data <- ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  select(-row_id)

ranger_spec %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(status_id ~ ., data = imp_data) %>%
  vip(geom = "point")

imp_data %>%
  select(status_id, pay, water_tech, installer) %>%
  pivot_longer(pay:installer, names_to = "feature", values_to = "value") %>%
  ggplot(aes(y = value, fill = status_id)) +
  geom_bar(position = "fill") +
  facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
  theme(legend.position = "top") +
  scale_fill_brewer(type = "qual", palette = 7) +
  scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent) +
  labs(
    x = "% of water sources", y = NULL, fill = "Water available?",
    title = "Water availability by source characteristic in Sierra Leone",
    subtitle = "Water sources with no payment information are likely to have no water available"
  )

  1. Can we predict the availability status of water sources in Sierra Leone based on various features like payment, water technology, installation year, installer, and other characteristics?

The dataset pertains to water sources in Sierra Leone and includes geographical coordinates, water availability status, payment requirements, installation year, technology type, and installer details. Initial steps involve data filtering specific to Sierra Leone and categorical variable transformation. Visualizations like scatter plots and histograms are used for insights. The data then drives a Random Forest model, designed to predict water availability, with its performance and influential variables subsequently assessed.

The dataset’s key variables include geographical coordinates (latitude and longitude) pinpointing water sources in Sierra Leone. The status_id indicates water availability, categorized as ‘available’ or ‘not available’. The Pay variable highlights if accessing the water incurs a fee, simplified to “yes”, “no”, or “it’s complicated”. The Install_year offers a timeline of when sources were established, while Water Tech and Installer provide categorical insights into the technology used and the installing entity, respectively. Together, these variables aim to predict and understand water availability nuances.

  1. The original dataset offers a broad scope of water sources, potentially spanning various regions. For modeling, it’s refined to focus on Sierra Leone. The Pay column is standardized to a few categories for clarity. Non-essential columns are discarded, and character variables are converted to factors for effective modeling. Missing values, notably in install_year, are linearly imputed. This preprocessing, by reducing noise and emphasizing key attributes, ensures a dataset that’s optimized for accurate predictive modeling. The transformations are essential to enhance model performance and interpretability.

  2. The mentioned data preparation steps include filtering to focus on Sierra Leone, standardizing the Pay column, discarding non-essential columns, converting character variables to factors, and linearly imputing missing values in install_year. These steps aim to optimize the dataset for predictive modeling.

The machine learning model used in the analysis is the “Random Forest” model, as indicated by the use of the rand_forest function and the “ranger” engine for its implementation.

  1. The model evaluation relies on metrics from the collect_metrics function, notably the ROC-AUC and confusion matrices. The ROC-AUC gauges the model’s discriminative power for water availability, while the confusion matrix offers insights into prediction accuracy. These metrics are essential for assessing the model’s effectiveness in aligning predictions with actual water statuses in Sierra Leone.

  2. The analysis of the Sierra Leone water sources dataset reveals spatial patterns in water availability, with certain regions possibly having more reliable sources. Temporal insights suggest variations in water availability based on installation years. A significant correlation exists between water availability and payment methods, hinting that non-paying sources might often lack water. The Random Forest model effectively predicts water availability, with features like payment method, water technology, and installer being highly influential in its predictions. These findings provide a comprehensive understanding of factors affecting water availability in the region.