##Explore data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
water_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv")
## Rows: 473293 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): report_date, status_id, water_source, water_tech, facility_type, co...
## dbl (4): row_id, lat_deg, lon_deg, install_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
water_raw %>%
    filter(country_name == "Sierra Leone",
           lat_deg > 0, lat_deg < 15, lon_deg < 0,
           status_id %in% c("y", "n")) %>%
    ggplot(aes(lon_deg, lat_deg, color = status_id)) +
    geom_point(alpha = 0.2) +
    coord_fixed() +
    guides(color = guide_legend(override.aes = list(alpha = 1)))

water <- water_raw %>%
    filter(country_name == "Sierra Leone",
           lat_deg > 0, lat_deg < 15, lon_deg < 0,
           status_id %in% c("y", "n")) %>%
    mutate(pay = case_when(str_detect(pay, "^No") ~ "no",
                           str_detect(pay, "^Yes") ~ "yes",
                           is.na(pay) ~ pay,
                           TRUE ~ "it's complicated")) %>% 
    select(-country_name, -status, -report_date) %>%
    mutate_if(is.character, as.factor)
water %>%
    ggplot(aes(install_year, y = ..density.., fill = status_id)) +
    geom_histogram(position = "identity", alpha = 0.5) +
    labs(fill = "water available?")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7074 rows containing non-finite values (`stat_bin()`).

water %>%
  ggplot(aes(y = pay, fill = status_id)) +
  geom_bar(position = "fill") +
  labs(fill = "Water available?")

Build Model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
set.seed(123)
water_split <- initial_split(water, strata = status_id)
water_train <- training(water_split)
water_test <- testing(water_split)

set.seed(234)
water_folds <- vfold_cv(water_train, strata = status_id)
water_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [36984/4110]> Fold01
##  2 <split [36984/4110]> Fold02
##  3 <split [36984/4110]> Fold03
##  4 <split [36984/4110]> Fold04
##  5 <split [36984/4110]> Fold05
##  6 <split [36985/4109]> Fold06
##  7 <split [36985/4109]> Fold07
##  8 <split [36985/4109]> Fold08
##  9 <split [36985/4109]> Fold09
## 10 <split [36986/4108]> Fold10
library(themis)
ranger_recipe <-
  recipe(formula = status_id ~ ., data = water_train) %>%
  update_role(row_id, new_role = "id") %>%
  step_unknown(all_nominal_predictors()) %>%
  step_other(all_nominal_predictors(), threshold = 0.03) %>%
  step_impute_linear(install_year) %>%
  step_downsample(status_id)

ranger_spec <-
  rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")

ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)

doParallel::registerDoParallel()
set.seed(74403)
ranger_rs <-
  fit_resamples(ranger_workflow,
    resamples = water_folds,
    control = control_resamples(save_pred = TRUE)
  )
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.893    10 0.00138 Preprocessor1_Model1
## 2 roc_auc  binary     0.951    10 0.00102 Preprocessor1_Model1
collect_predictions(ranger_rs) %>%
    group_by(id) %>%
    roc_curve(status_id, .pred_n) %>% 
    autoplot()

conf_mat_resampled(ranger_rs, tidy = FALSE) %>%
    autoplot()

final_fitted <- last_fit(ranger_workflow, water_split) 
collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.898 Preprocessor1_Model1
## 2 roc_auc  binary         0.953 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
    conf_mat(status_id, .pred_class) %>% 
    autoplot()

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_data <- ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  select(-row_id)

ranger_spec %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(status_id ~ ., data = imp_data) %>%
  vip(geom = "point")

    imp_data %>%
  select(status_id, pay, water_tech, installer) %>%
  pivot_longer(pay:installer, names_to = "feature", values_to = "value") %>%
  ggplot(aes(y = value, fill = status_id)) +
  geom_bar(position = "fill") +
  facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
  theme(legend.position = "top") +
  scale_fill_brewer(type = "qual", palette = 7) +
  scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent) +
  labs(
    x = "% of water sources", y = NULL, fill = "Water available?",
    title = "Water availability by source characteristic in Sierra Leone",
    subtitle = "Water sources with no payment information are likely to have no water available"
  )

1. What is the research question?

#2. Data Exploration and Transformation: - The newly transformed data has dummed down the specifications in the “pay” category to just yes no and it’s complicated as well as turned some characters to factors.

3. Data Preparation and Modeling:

- There were a few steps made in this data prep and modeling section that include: update_role which turns the id into an identifier instead of data used in the modeling, step_unknown which will assign a missing value in a factor level to unknown, step_other which will pool uncommon values together, step_impute_linear which will create linear regression models to impute for missing data, and step_downsample which will remove rows of a data set to make occurence levels in a factor equal.   

#4. Model Evaluation: - Using ROC curves and confusion matrices we can see that the model performed very well.

5. Conclusion: