##Explore data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
water_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv")
## Rows: 473293 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): report_date, status_id, water_source, water_tech, facility_type, co...
## dbl (4): row_id, lat_deg, lon_deg, install_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
water_raw %>%
filter(country_name == "Sierra Leone",
lat_deg > 0, lat_deg < 15, lon_deg < 0,
status_id %in% c("y", "n")) %>%
ggplot(aes(lon_deg, lat_deg, color = status_id)) +
geom_point(alpha = 0.2) +
coord_fixed() +
guides(color = guide_legend(override.aes = list(alpha = 1)))
water <- water_raw %>%
filter(country_name == "Sierra Leone",
lat_deg > 0, lat_deg < 15, lon_deg < 0,
status_id %in% c("y", "n")) %>%
mutate(pay = case_when(str_detect(pay, "^No") ~ "no",
str_detect(pay, "^Yes") ~ "yes",
is.na(pay) ~ pay,
TRUE ~ "it's complicated")) %>%
select(-country_name, -status, -report_date) %>%
mutate_if(is.character, as.factor)
water %>%
ggplot(aes(install_year, y = ..density.., fill = status_id)) +
geom_histogram(position = "identity", alpha = 0.5) +
labs(fill = "water available?")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7074 rows containing non-finite values (`stat_bin()`).
water %>%
ggplot(aes(y = pay, fill = status_id)) +
geom_bar(position = "fill") +
labs(fill = "Water available?")
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
set.seed(123)
water_split <- initial_split(water, strata = status_id)
water_train <- training(water_split)
water_test <- testing(water_split)
set.seed(234)
water_folds <- vfold_cv(water_train, strata = status_id)
water_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [36984/4110]> Fold01
## 2 <split [36984/4110]> Fold02
## 3 <split [36984/4110]> Fold03
## 4 <split [36984/4110]> Fold04
## 5 <split [36984/4110]> Fold05
## 6 <split [36985/4109]> Fold06
## 7 <split [36985/4109]> Fold07
## 8 <split [36985/4109]> Fold08
## 9 <split [36985/4109]> Fold09
## 10 <split [36986/4108]> Fold10
library(themis)
ranger_recipe <-
recipe(formula = status_id ~ ., data = water_train) %>%
update_role(row_id, new_role = "id") %>%
step_unknown(all_nominal_predictors()) %>%
step_other(all_nominal_predictors(), threshold = 0.03) %>%
step_impute_linear(install_year) %>%
step_downsample(status_id)
ranger_spec <-
rand_forest(trees = 1000) %>%
set_mode("classification") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
doParallel::registerDoParallel()
set.seed(74403)
ranger_rs <-
fit_resamples(ranger_workflow,
resamples = water_folds,
control = control_resamples(save_pred = TRUE)
)
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.893 10 0.00138 Preprocessor1_Model1
## 2 roc_auc binary 0.951 10 0.00102 Preprocessor1_Model1
collect_predictions(ranger_rs) %>%
group_by(id) %>%
roc_curve(status_id, .pred_n) %>%
autoplot()
conf_mat_resampled(ranger_rs, tidy = FALSE) %>%
autoplot()
final_fitted <- last_fit(ranger_workflow, water_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.898 Preprocessor1_Model1
## 2 roc_auc binary 0.953 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
conf_mat(status_id, .pred_class) %>%
autoplot()
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
imp_data <- ranger_recipe %>%
prep() %>%
bake(new_data = NULL) %>%
select(-row_id)
ranger_spec %>%
set_engine("ranger", importance = "permutation") %>%
fit(status_id ~ ., data = imp_data) %>%
vip(geom = "point")
imp_data %>%
select(status_id, pay, water_tech, installer) %>%
pivot_longer(pay:installer, names_to = "feature", values_to = "value") %>%
ggplot(aes(y = value, fill = status_id)) +
geom_bar(position = "fill") +
facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
theme(legend.position = "top") +
scale_fill_brewer(type = "qual", palette = 7) +
scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent) +
labs(
x = "% of water sources", y = NULL, fill = "Water available?",
title = "Water availability by source characteristic in Sierra Leone",
subtitle = "Water sources with no payment information are likely to have no water available"
)
#2. Data Exploration and Transformation: - The newly transformed data has dummed down the specifications in the “pay” category to just yes no and it’s complicated as well as turned some characters to factors.
- There were a few steps made in this data prep and modeling section that include: update_role which turns the id into an identifier instead of data used in the modeling, step_unknown which will assign a missing value in a factor level to unknown, step_other which will pool uncommon values together, step_impute_linear which will create linear regression models to impute for missing data, and step_downsample which will remove rows of a data set to make occurence levels in a factor equal.
#4. Model Evaluation: - Using ROC curves and confusion matrices we can see that the model performed very well.