library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
water_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv")
## Rows: 473293 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): report_date, status_id, water_source, water_tech, facility_type, co...
## dbl (4): row_id, lat_deg, lon_deg, install_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore data

water_raw %>%
  filter(country_name == "Sierra Leone",
    lat_deg > 0, lat_deg < 15, lon_deg < 0,
    status_id %in% c("y", "n")
  ) %>%
  ggplot(aes(lon_deg, lat_deg, color = status_id)) +
  geom_point(alpha = 0.1) +
  coord_fixed() +
  guides(color = guide_legend(override.aes = list(alpha = 1)))

water <- water_raw %>%
  filter(
    country_name == "Sierra Leone",
    lat_deg > 0, lat_deg < 15, lon_deg < 0,
    status_id %in% c("y", "n")
  ) %>%
  mutate(pay = case_when(
    str_detect(pay, "^No") ~ "no",
    str_detect(pay, "^Yes") ~ "yes",
    is.na(pay) ~ pay,
    TRUE ~ "it's complicated"
  )) %>%
  select(-country_name, -status, -report_date) %>%
  mutate_if(is.character, as.factor)
water %>%
  ggplot(aes(install_year, y = ..density.., fill = status_id)) +
  geom_histogram(position = "identity", alpha = 0.5) +
  labs(fill = "Water available?")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7074 rows containing non-finite values (`stat_bin()`).

water %>%
    ggplot(aes(y = pay, fill = status_id)) + 
    geom_bar(position = "fill") +
    labs(fill = "Water available?")

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
water_split <- initial_split(water, strata = status_id)
water_train <- training(water_split)
water_test <- testing(water_split)

set.seed(234)
water_folds <- vfold_cv(water_train, strata = status_id)
water_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [36984/4110]> Fold01
##  2 <split [36984/4110]> Fold02
##  3 <split [36984/4110]> Fold03
##  4 <split [36984/4110]> Fold04
##  5 <split [36984/4110]> Fold05
##  6 <split [36985/4109]> Fold06
##  7 <split [36985/4109]> Fold07
##  8 <split [36985/4109]> Fold08
##  9 <split [36985/4109]> Fold09
## 10 <split [36986/4108]> Fold10
library(themis)
ranger_recipe <-
  recipe(formula = status_id ~ ., data = water_train) %>%
  update_role(row_id, new_role = "id") %>%
  step_unknown(all_nominal_predictors()) %>%
  step_other(all_nominal_predictors(), threshold = 0.03) %>%
  step_impute_linear(install_year) %>%
  step_downsample(status_id)

ranger_spec <-
  rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")

ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)

doParallel::registerDoParallel()
set.seed(74403)
ranger_rs <-
  fit_resamples(ranger_workflow,
    resamples = water_folds,
    control = control_resamples(save_pred = TRUE)
  )

Explore results

collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.893    10 0.00138 Preprocessor1_Model1
## 2 roc_auc  binary     0.951    10 0.00102 Preprocessor1_Model1
collect_predictions(ranger_rs) %>%
    group_by(id) %>%
    roc_curve(status_id, .pred_n) %>%
    autoplot()

conf_mat_resampled(ranger_rs, tidy = FALSE) %>%
    autoplot()

final_fitted <- last_fit(ranger_workflow, water_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.898 Preprocessor1_Model1
## 2 roc_auc  binary         0.953 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
    conf_mat(status_id, .pred_class) %>%
    autoplot()

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_data <- ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  select(-row_id)

ranger_spec %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(status_id ~ ., data = imp_data) %>%
  vip(geom = "point")

imp_data %>%
  select(status_id, pay, water_tech, installer) %>%
  pivot_longer(pay:installer, names_to = "feature", values_to = "value") %>%
  ggplot(aes(y = value, fill = status_id)) +
  geom_bar(position = "fill") +
  facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
  theme(legend.position = "top") +
  scale_fill_brewer(type = "qual", palette = 7) +
  scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent) +
  labs(
    x = "% of water sources", y = NULL, fill = "Water available?",
    title = "Water availability by source characteristic in Sierra Leone",
    subtitle = "Water sources with no payment information are likely to have no water available"
  )

Questions

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. The research question is “Can we predict when water is available or is not available at the water sources in Sierra Leone using characteristics such as water_tech, pay, and installer?”
    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The original raw data data contains 473,293 obs. of 13 variables. The variables consist of row_id, water_source, status_id, facility_type, install_year, latitude, longitude, report_date, country_name, installer, water-tech, status and pay. There are 4 variables that are numerical data and 9 that are character data. The data is about water sources from many different countries in the world.
    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The primary data of interest is the status_id variable. This is the variable that we are trying to predict with our machine learning model. The status_id variable says whether or not that water source has any water in it. This variable is character data, and is either y for yes, n for no, or u for unknown. Water_tech is also a primary variable which describes the technology or method used for sourcing the water. Installer is another primary variable which explains who installed the water source. Pay is also a primary variable which indicates whether or not you have to pay to get water from the water source.
  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The original data set had 3 more variables and many more observations. The transformed data has only 54,793 observations. The original data set had data for many different countries, whereas the transformed data was filtered to focus on just Sierra Leone.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. The pay variable was transformed into categorical data with three levels, either yes, no, or it’s complicated. This was done using the mutate and case_when function. We also removed 3 columns that were not as relevant to the machine learning model. We used the select function and got rid of country_name, status, and report_date. Mutate_if was used to convert character columns into factors. We used step_impute_linear to get rid of missing values in install_year. We also used step_downsample to reduce the number of observations in the status_id column.
    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model used in this code along was random forest specifically ranger. Random forest was the best choice for this task because it can handle numerical and categorical predictor variables. Since this data set was a mix of both types, random forest was a good option to choose. Random forest was used for classification with the target variable being status id. Random forest provides insight about which predictors are the most important when it comes to determining water source availability.
  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. The metrics used in the model evaluation was ROC curve. ROC was used to distinguish between yes or no. The ROC curve helps understand the model’s discriminatory power. We also used the confusion matrix to calculate the performance metrics, specifically in terms of its abiliity to correctly predict positive and negative cases. This was insightful on understanding the accuracy of the models predictions on water source availability.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. The key findings in this case analysis show a few different things. The first is that the pay column has a huge impact on whether water is available or not. According to the machine learning model graph, water is much more likely to be unavailable in the pay column. In the water_tech section, the rope and bucket water sources seems to be more likely to have water available. Also in water_tech the hand pump and unknown methods are more likely to have water be unavailable. In the installer section the private and private person water sources are more likely to be a yes for water available. Water aid is also in installer and is also more likely to be a no for water availability.