knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.2
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 4.3.2
## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)
library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.3.2
## Loading required package: recipes
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.6 ✔ workflows 1.1.3
## ✔ modeldata 1.3.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.3.0
## Warning: package 'dials' was built under R version 4.3.2
## Warning: package 'scales' was built under R version 4.3.2
## Warning: package 'infer' was built under R version 4.3.2
## Warning: package 'modeldata' was built under R version 4.3.2
## Warning: package 'parsnip' was built under R version 4.3.2
## Warning: package 'tune' was built under R version 4.3.2
## Warning: package 'workflows' was built under R version 4.3.2
## Warning: package 'workflowsets' was built under R version 4.3.2
## Warning: package 'yardstick' was built under R version 4.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.2
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(ggplot2)
water_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv")
## Rows: 473293 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): report_date, status_id, water_source, water_tech, facility_type, co...
## dbl (4): row_id, lat_deg, lon_deg, install_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
water_raw
## # A tibble: 473,293 × 13
## row_id lat_deg lon_deg report_date status_id water_source water_tech
## <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 3957 8.07 38.6 04/06/2017 y <NA> <NA>
## 2 33512 7.37 40.5 08/04/2020 y Protected Spring <NA>
## 3 35125 0.773 34.9 03/18/2015 y Protected Shallow We… <NA>
## 4 37760 0.781 35.0 03/18/2015 y Borehole <NA>
## 5 38118 0.779 35.0 03/18/2015 y Protected Shallow We… <NA>
## 6 38501 0.308 34.1 03/19/2015 y Borehole <NA>
## 7 46357 0.419 34.3 05/19/2015 y Unprotected Spring <NA>
## 8 46535 0.444 34.3 05/19/2015 y Protected Shallow We… <NA>
## 9 46560 0.456 34.3 05/19/2015 y Protected Shallow We… <NA>
## 10 46782 0.467 34.3 05/20/2015 y Protected Shallow We… <NA>
## # ℹ 473,283 more rows
## # ℹ 6 more variables: facility_type <chr>, country_name <chr>,
## # install_year <dbl>, installer <chr>, pay <chr>, status <chr>
The research question is about whether a water source has water available at it based on the characteristics of the water source observed during a visit.
The data describes different water sources, whether you need to pay for them or not, whether they have a source installed, if the water is actually available, and shows data for different locations.
status: Tells whether the water source is functional/available or not country_name: Different countries pay: whether the water is paid for or not water_source: Where the water is sourced from. water_tech: who installed the water source
water_raw %>%
filter(
country_name == "Sierra Leone",
lat_deg > 0, lat_deg < 15, lon_deg < 0,
status_id %in% c("y", "n")
) %>%
ggplot(aes(lon_deg, lat_deg, color = status_id)) +
geom_point(alpha = 0.1) +
coord_fixed() +
guides(color = guide_legend(override.aes = list(alpha = 1)))
water <- water_raw %>%
filter(
country_name == "Sierra Leone",
lat_deg > 0, lat_deg < 15, lon_deg < 0,
status_id %in% c("y", "n")
) %>%
mutate(pay = case_when(
str_detect(pay, "^No") ~ "no",
str_detect(pay, "^Yes") ~ "yes",
is.na(pay) ~ pay,
TRUE ~ "it's complicated"
)) %>%
select(-country_name, -status, -report_date) %>%
mutate_if(is.character, as.factor)
The transformed data shows no report date, keeps only that status_id (instead of status[functional]), and is only data in the country of Sierra Leone within specific coordinates. It also gets rid of data where “pay” is N/A.
water %>%
ggplot(aes(install_year, y = ..density.., fill = status_id)) +
geom_histogram(position = "identity", alpha = 0.5) +
labs(fill = "Water available?")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7074 rows containing non-finite outside the scale range
## (`stat_bin()`).
water %>%
ggplot(aes(y = pay, fill = status_id)) +
geom_bar(position = "fill") +
labs(fill = "Water available?")
library(tidymodels)
set.seed(123)
water_split <- initial_split(water, strata = status_id)
water_train <- training(water_split)
water_test <- testing(water_split)
set.seed(234)
water_folds <- vfold_cv(water_train, strata = status_id)
water_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [36984/4110]> Fold01
## 2 <split [36984/4110]> Fold02
## 3 <split [36984/4110]> Fold03
## 4 <split [36984/4110]> Fold04
## 5 <split [36984/4110]> Fold05
## 6 <split [36985/4109]> Fold06
## 7 <split [36985/4109]> Fold07
## 8 <split [36985/4109]> Fold08
## 9 <split [36985/4109]> Fold09
## 10 <split [36986/4108]> Fold10
Splitting the water, training the water, then testing the water. Fold the water dataset using water_train.
ranger, random forest models
usemodels::use_ranger(status_id ~ ., data = water_train)
## ranger_recipe <-
## recipe(formula = status_id ~ ., data = water_train)
##
## ranger_spec <-
## rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
## set_mode("classification") %>%
## set_engine("ranger")
##
## ranger_workflow <-
## workflow() %>%
## add_recipe(ranger_recipe) %>%
## add_model(ranger_spec)
##
## set.seed(74403)
## ranger_tune <-
## tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(themis)
## Warning: package 'themis' was built under R version 4.3.3
ranger_recipe <-
recipe(formula = status_id ~ ., data = water_train) %>%
update_role(row_id, new_role = "id") %>%
step_unknown(all_nominal_predictors()) %>%
step_other(all_nominal_predictors(), threshold = 0.03) %>%
step_impute_linear(install_year) %>%
step_downsample(status_id)
ranger_spec <-
rand_forest(trees = 1000) %>%
set_mode("classification") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
doParallel::registerDoParallel()
set.seed(74403)
ranger_rs <-
fit_resamples(ranger_workflow,
resamples = water_folds,
control = control_resamples(save_pred = TRUE)
)
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.893 10 0.00139 Preprocessor1_Model1
## 2 roc_auc binary 0.951 10 0.00101 Preprocessor1_Model1
Accuracy and roc_auc were used as metrics in this model. A good accuracy is generally above .7. Accuracy measures the number of correct predictions made by the model in relation to the total number of predictions made. Roc_auc is good if its 0.8 to 0.9. It stands for the area under the Roc curve.
collect_predictions(ranger_rs) %>%
group_by(id) %>%
roc_curve(status_id, .pred_n) %>%
autoplot()
conf_mat_resampled(ranger_rs, tidy = FALSE) %>%
autoplot()
final_fitted <- last_fit(ranger_workflow, water_split)
## Warning: package 'ranger' was built under R version 4.3.2
collect_metrics(final_fitted) ## metrics on the *testing* set
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.898 Preprocessor1_Model1
## 2 roc_auc binary 0.953 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
conf_mat(status_id, .pred_class) %>%
autoplot()
library(vip)
## Warning: package 'vip' was built under R version 4.3.3
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
imp_data <- ranger_recipe %>%
prep() %>%
bake(new_data = NULL) %>%
select(-row_id)
ranger_spec %>%
set_engine("ranger", importance = "permutation") %>%
fit(status_id ~ ., data = imp_data) %>%
vip(geom = "point")
imp_data %>%
select(status_id, pay, water_tech, installer) %>%
pivot_longer(pay:installer, names_to = "feature", values_to = "value") %>%
ggplot(aes(y = value, fill = status_id)) +
geom_bar(position = "fill") +
facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
theme(legend.position = "top") +
scale_fill_brewer(type = "qual", palette = 7) +
scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent) +
labs(
x = "% of water sources", y = NULL, fill = "Water available?",
title = "Water availability by source characteristic in Sierra Leone",
subtitle = "Water sources with no payment information are likely to have no water available"
)
For Sierra Leone, water sources with no payment information are likely to have no water available. “Pay” is the most important predictor, and the next most important predictors were the technology used at the water source, as well as who installed the water sources.