knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.3.2

library(correlationfunnel)

## Warning: package 'correlationfunnel' was built under R version 4.3.2

## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)

library(textrecipes)

## Warning: package 'textrecipes' was built under R version 4.3.2

## Loading required package: recipes
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.3.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.6     ✔ workflows    1.1.3
## ✔ modeldata    1.3.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.3.0

## Warning: package 'dials' was built under R version 4.3.2

## Warning: package 'scales' was built under R version 4.3.2

## Warning: package 'infer' was built under R version 4.3.2

## Warning: package 'modeldata' was built under R version 4.3.2

## Warning: package 'parsnip' was built under R version 4.3.2

## Warning: package 'tune' was built under R version 4.3.2

## Warning: package 'workflows' was built under R version 4.3.2

## Warning: package 'workflowsets' was built under R version 4.3.2

## Warning: package 'yardstick' was built under R version 4.3.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

library(xgboost)

## Warning: package 'xgboost' was built under R version 4.3.2

## 
## Attaching package: 'xgboost'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice

library(ggplot2)

water_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv")

## Rows: 473293 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): report_date, status_id, water_source, water_tech, facility_type, co...
## dbl (4): row_id, lat_deg, lon_deg, install_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

water_raw

## # A tibble: 473,293 × 13
##    row_id lat_deg lon_deg report_date status_id water_source          water_tech
##     <dbl>   <dbl>   <dbl> <chr>       <chr>     <chr>                 <chr>     
##  1   3957   8.07     38.6 04/06/2017  y         <NA>                  <NA>      
##  2  33512   7.37     40.5 08/04/2020  y         Protected Spring      <NA>      
##  3  35125   0.773    34.9 03/18/2015  y         Protected Shallow We… <NA>      
##  4  37760   0.781    35.0 03/18/2015  y         Borehole              <NA>      
##  5  38118   0.779    35.0 03/18/2015  y         Protected Shallow We… <NA>      
##  6  38501   0.308    34.1 03/19/2015  y         Borehole              <NA>      
##  7  46357   0.419    34.3 05/19/2015  y         Unprotected Spring    <NA>      
##  8  46535   0.444    34.3 05/19/2015  y         Protected Shallow We… <NA>      
##  9  46560   0.456    34.3 05/19/2015  y         Protected Shallow We… <NA>      
## 10  46782   0.467    34.3 05/20/2015  y         Protected Shallow We… <NA>      
## # ℹ 473,283 more rows
## # ℹ 6 more variables: facility_type <chr>, country_name <chr>,
## #   install_year <dbl>, installer <chr>, pay <chr>, status <chr>

Question and Data

Research Question

The research question is about whether a water source has water available at it based on the characteristics of the water source observed during a visit.

Describe Data

The data describes different water sources, whether you need to pay for them or not, whether they have a source installed, if the water is actually available, and shows data for different locations.

Key Variables

status: Tells whether the water source is functional/available or not country_name: Different countries pay: whether the water is paid for or not water_source: Where the water is sourced from. water_tech: who installed the water source

water_raw %>%
  filter(
    country_name == "Sierra Leone",
    lat_deg > 0, lat_deg < 15, lon_deg < 0,
    status_id %in% c("y", "n")
  ) %>%
  ggplot(aes(lon_deg, lat_deg, color = status_id)) +
  geom_point(alpha = 0.1) +
  coord_fixed() +
  guides(color = guide_legend(override.aes = list(alpha = 1)))

water <- water_raw %>%
  filter(
    country_name == "Sierra Leone",
    lat_deg > 0, lat_deg < 15, lon_deg < 0,
    status_id %in% c("y", "n")
  ) %>%
  mutate(pay = case_when(
    str_detect(pay, "^No") ~ "no",
    str_detect(pay, "^Yes") ~ "yes",
    is.na(pay) ~ pay,
    TRUE ~ "it's complicated"
  )) %>%
  select(-country_name, -status, -report_date) %>%
  mutate_if(is.character, as.factor)

Data Exploration and Transformation

Differences between original and transformed data

The transformed data shows no report date, keeps only that status_id (instead of status[functional]), and is only data in the country of Sierra Leone within specific coordinates. It also gets rid of data where “pay” is N/A.

water %>%
  ggplot(aes(install_year, y = ..density.., fill = status_id)) +
  geom_histogram(position = "identity", alpha = 0.5) +
  labs(fill = "Water available?")

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 7074 rows containing non-finite outside the scale range
## (`stat_bin()`).

water %>%
  ggplot(aes(y = pay, fill = status_id)) +
  geom_bar(position = "fill") +
  labs(fill = "Water available?")

library(tidymodels)

set.seed(123)
water_split <- initial_split(water, strata = status_id)
water_train <- training(water_split)
water_test <- testing(water_split)

set.seed(234)
water_folds <- vfold_cv(water_train, strata = status_id)
water_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [36984/4110]> Fold01
##  2 <split [36984/4110]> Fold02
##  3 <split [36984/4110]> Fold03
##  4 <split [36984/4110]> Fold04
##  5 <split [36984/4110]> Fold05
##  6 <split [36985/4109]> Fold06
##  7 <split [36985/4109]> Fold07
##  8 <split [36985/4109]> Fold08
##  9 <split [36985/4109]> Fold09
## 10 <split [36986/4108]> Fold10

Data Prep and Modeling

Names of data prep steps from video

Splitting the water, training the water, then testing the water. Fold the water dataset using water_train.

Name of machine learning models in analysis:

ranger, random forest models

usemodels::use_ranger(status_id ~ ., data = water_train)

## ranger_recipe <- 
##   recipe(formula = status_id ~ ., data = water_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("classification") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(74403)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

library(themis)

## Warning: package 'themis' was built under R version 4.3.3

ranger_recipe <-
  recipe(formula = status_id ~ ., data = water_train) %>%
  update_role(row_id, new_role = "id") %>%
  step_unknown(all_nominal_predictors()) %>%
  step_other(all_nominal_predictors(), threshold = 0.03) %>%
  step_impute_linear(install_year) %>%
  step_downsample(status_id)

ranger_spec <-
  rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")

ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)

doParallel::registerDoParallel()
set.seed(74403)
ranger_rs <-
  fit_resamples(ranger_workflow,
    resamples = water_folds,
    control = control_resamples(save_pred = TRUE)
  )

collect_metrics(ranger_rs)

## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.893    10 0.00139 Preprocessor1_Model1
## 2 roc_auc  binary     0.951    10 0.00101 Preprocessor1_Model1

Model Evaluation

Metrics used in model evaluation

Accuracy and roc_auc were used as metrics in this model. A good accuracy is generally above .7. Accuracy measures the number of correct predictions made by the model in relation to the total number of predictions made. Roc_auc is good if its 0.8 to 0.9. It stands for the area under the Roc curve.

collect_predictions(ranger_rs) %>%
  group_by(id) %>%
  roc_curve(status_id, .pred_n) %>%
  autoplot()

conf_mat_resampled(ranger_rs, tidy = FALSE) %>%
  autoplot()

final_fitted <- last_fit(ranger_workflow, water_split)

## Warning: package 'ranger' was built under R version 4.3.2

collect_metrics(final_fitted) ## metrics on the *testing* set

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.898 Preprocessor1_Model1
## 2 roc_auc  binary         0.953 Preprocessor1_Model1

collect_predictions(final_fitted) %>%
  conf_mat(status_id, .pred_class) %>%
  autoplot()

library(vip)

## Warning: package 'vip' was built under R version 4.3.3

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

imp_data <- ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  select(-row_id)

ranger_spec %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(status_id ~ ., data = imp_data) %>%
  vip(geom = "point")

imp_data %>%
  select(status_id, pay, water_tech, installer) %>%
  pivot_longer(pay:installer, names_to = "feature", values_to = "value") %>%
  ggplot(aes(y = value, fill = status_id)) +
  geom_bar(position = "fill") +
  facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
  theme(legend.position = "top") +
  scale_fill_brewer(type = "qual", palette = 7) +
  scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent) +
  labs(
    x = "% of water sources", y = NULL, fill = "Water available?",
    title = "Water availability by source characteristic in Sierra Leone",
    subtitle = "Water sources with no payment information are likely to have no water available"
  )

Conclusion

For Sierra Leone, water sources with no payment information are likely to have no water available. “Pay” is the most important predictor, and the next most important predictors were the technology used at the water source, as well as who installed the water sources.

Code Along 5

Joe Stanley

2024-03-04