##Explore data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

water_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv")

## Rows: 473293 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): report_date, status_id, water_source, water_tech, facility_type, co...
## dbl (4): row_id, lat_deg, lon_deg, install_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

water_raw %>%
    filter(country_name == "Sierra Leone",
           lat_deg > 0, lat_deg < 15, lon_deg < 0,
           status_id %in% c("y", "n")) %>%
    ggplot(aes(lon_deg, lat_deg, color = status_id)) +
    geom_point(alpha = 0.2) +
    coord_fixed() +
    guides(color = guide_legend(override.aes = list(alpha = 1)))

water <- water_raw %>%
    filter(country_name == "Sierra Leone",
           lat_deg > 0, lat_deg < 15, lon_deg < 0,
           status_id %in% c("y", "n")) %>%
    mutate(pay = case_when(str_detect(pay, "^No") ~ "no",
                           str_detect(pay, "^Yes") ~ "yes",
                           is.na(pay) ~ pay,
                           TRUE ~ "it's complicated")) %>% 
    select(-country_name, -status, -report_date) %>%
    mutate_if(is.character, as.factor)

water %>%
    ggplot(aes(install_year, y = ..density.., fill = status_id)) +
    geom_histogram(position = "identity", alpha = 0.5) +
    labs(fill = "water available?")

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 7074 rows containing non-finite values (`stat_bin()`).

water %>%
  ggplot(aes(y = pay, fill = status_id)) +
  geom_bar(position = "fill") +
  labs(fill = "Water available?")

Build Model

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org

set.seed(123)
water_split <- initial_split(water, strata = status_id)
water_train <- training(water_split)
water_test <- testing(water_split)

set.seed(234)
water_folds <- vfold_cv(water_train, strata = status_id)
water_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [36984/4110]> Fold01
##  2 <split [36984/4110]> Fold02
##  3 <split [36984/4110]> Fold03
##  4 <split [36984/4110]> Fold04
##  5 <split [36984/4110]> Fold05
##  6 <split [36985/4109]> Fold06
##  7 <split [36985/4109]> Fold07
##  8 <split [36985/4109]> Fold08
##  9 <split [36985/4109]> Fold09
## 10 <split [36986/4108]> Fold10

library(themis)
ranger_recipe <-
  recipe(formula = status_id ~ ., data = water_train) %>%
  update_role(row_id, new_role = "id") %>%
  step_unknown(all_nominal_predictors()) %>%
  step_other(all_nominal_predictors(), threshold = 0.03) %>%
  step_impute_linear(install_year) %>%
  step_downsample(status_id)

ranger_spec <-
  rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")

ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)

doParallel::registerDoParallel()
set.seed(74403)
ranger_rs <-
  fit_resamples(ranger_workflow,
    resamples = water_folds,
    control = control_resamples(save_pred = TRUE)
  )

collect_metrics(ranger_rs)

## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.893    10 0.00138 Preprocessor1_Model1
## 2 roc_auc  binary     0.951    10 0.00102 Preprocessor1_Model1

collect_predictions(ranger_rs) %>%
    group_by(id) %>%
    roc_curve(status_id, .pred_n) %>% 
    autoplot()

conf_mat_resampled(ranger_rs, tidy = FALSE) %>%
    autoplot()

final_fitted <- last_fit(ranger_workflow, water_split) 
collect_metrics(final_fitted)

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.898 Preprocessor1_Model1
## 2 roc_auc  binary         0.953 Preprocessor1_Model1

collect_predictions(final_fitted) %>%
    conf_mat(status_id, .pred_class) %>% 
    autoplot()

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

imp_data <- ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  select(-row_id)

ranger_spec %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(status_id ~ ., data = imp_data) %>%
  vip(geom = "point")

    imp_data %>%
  select(status_id, pay, water_tech, installer) %>%
  pivot_longer(pay:installer, names_to = "feature", values_to = "value") %>%
  ggplot(aes(y = value, fill = status_id)) +
  geom_bar(position = "fill") +
  facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
  theme(legend.position = "top") +
  scale_fill_brewer(type = "qual", palette = 7) +
  scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent) +
  labs(
    x = "% of water sources", y = NULL, fill = "Water available?",
    title = "Water availability by source characteristic in Sierra Leone",
    subtitle = "Water sources with no payment information are likely to have no water available"
  )

1. What is the research question?

The question we are trying to answer is whether or not based on the data, we can predict if there is water available at that location. # Describe the data briefly
The data consists of latitude and longitudes of locations, report date, where the water is from both source and country and the tech used to measure, whether it is paid for or not, etc. # What are the characteristics of the key variables used in the analysis?
The primary variables we are using in this analysis consist of whether there is water available, who was the installer, is it paif for or not, and what water tech is used.

#2. Data Exploration and Transformation: - The newly transformed data has dummed down the specifications in the “pay” category to just yes no and it’s complicated as well as turned some characters to factors.

3. Data Preparation and Modeling:

- There were a few steps made in this data prep and modeling section that include: update_role which turns the id into an identifier instead of data used in the modeling, step_unknown which will assign a missing value in a factor level to unknown, step_other which will pool uncommon values together, step_impute_linear which will create linear regression models to impute for missing data, and step_downsample which will remove rows of a data set to make occurence levels in a factor equal.

#4. Model Evaluation: - Using ROC curves and confusion matrices we can see that the model performed very well.

5. Conclusion:

The key findings of this model are that pay is the most important variable and after that are what tech was used at a source and who installed it.

Code Along 5

Colin Tracy

2023-10-08

Build Model

1. What is the research question?

3. Data Preparation and Modeling:

5. Conclusion: