library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
water_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv")
## Rows: 473293 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): report_date, status_id, water_source, water_tech, facility_type, co...
## dbl (4): row_id, lat_deg, lon_deg, install_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Explore data
water_raw %>%
filter(country_name == "Sierra Leone",
lat_deg > 0, lat_deg < 15, lon_deg < 0,
status_id %in% c("y", "n")
) %>%
ggplot(aes(lon_deg, lat_deg, color = status_id)) +
geom_point(alpha = 0.1) +
coord_fixed() +
guides(color = guide_legend(override.aes = list(alpha = 1)))

water <- water_raw %>%
filter(
country_name == "Sierra Leone",
lat_deg > 0, lat_deg < 15, lon_deg < 0,
status_id %in% c("y", "n")
) %>%
mutate(pay = case_when(
str_detect(pay, "^No") ~ "no",
str_detect(pay, "^Yes") ~ "yes",
is.na(pay) ~ pay,
TRUE ~ "it's complicated"
)) %>%
select(-country_name, -status, -report_date) %>%
mutate_if(is.character, as.factor)
water %>%
ggplot(aes(install_year, y = ..density.., fill = status_id)) +
geom_histogram(position = "identity", alpha = 0.5) +
labs(fill = "Water available?")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7074 rows containing non-finite values (`stat_bin()`).

water %>%
ggplot(aes(y = pay, fill = status_id)) +
geom_bar(position = "fill") +
labs(fill = "Water available?")

Build a model
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
water_split <- initial_split(water, strata = status_id)
water_train <- training(water_split)
water_test <- testing(water_split)
set.seed(234)
water_folds <- vfold_cv(water_train, strata = status_id)
water_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [36984/4110]> Fold01
## 2 <split [36984/4110]> Fold02
## 3 <split [36984/4110]> Fold03
## 4 <split [36984/4110]> Fold04
## 5 <split [36984/4110]> Fold05
## 6 <split [36985/4109]> Fold06
## 7 <split [36985/4109]> Fold07
## 8 <split [36985/4109]> Fold08
## 9 <split [36985/4109]> Fold09
## 10 <split [36986/4108]> Fold10
library(themis)
ranger_recipe <-
recipe(formula = status_id ~ ., data = water_train) %>%
update_role(row_id, new_role = "id") %>%
step_unknown(all_nominal_predictors()) %>%
step_other(all_nominal_predictors(), threshold = 0.03) %>%
step_impute_linear(install_year) %>%
step_downsample(status_id)
ranger_spec <-
rand_forest(trees = 1000) %>%
set_mode("classification") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
doParallel::registerDoParallel()
set.seed(74403)
ranger_rs <-
fit_resamples(ranger_workflow,
resamples = water_folds,
control = control_resamples(save_pred = TRUE)
)
Explore results
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.893 10 0.00138 Preprocessor1_Model1
## 2 roc_auc binary 0.951 10 0.00102 Preprocessor1_Model1
collect_predictions(ranger_rs) %>%
group_by(id) %>%
roc_curve(status_id, .pred_n) %>%
autoplot()

conf_mat_resampled(ranger_rs, tidy = FALSE) %>%
autoplot()

final_fitted <- last_fit(ranger_workflow, water_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.898 Preprocessor1_Model1
## 2 roc_auc binary 0.953 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
conf_mat(status_id, .pred_class) %>%
autoplot()

library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
imp_data <- ranger_recipe %>%
prep() %>%
bake(new_data = NULL) %>%
select(-row_id)
ranger_spec %>%
set_engine("ranger", importance = "permutation") %>%
fit(status_id ~ ., data = imp_data) %>%
vip(geom = "point")

imp_data %>%
select(status_id, pay, water_tech, installer) %>%
pivot_longer(pay:installer, names_to = "feature", values_to = "value") %>%
ggplot(aes(y = value, fill = status_id)) +
geom_bar(position = "fill") +
facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
theme(legend.position = "top") +
scale_fill_brewer(type = "qual", palette = 7) +
scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent) +
labs(
x = "% of water sources", y = NULL, fill = "Water available?",
title = "Water availability by source characteristic in Sierra Leone",
subtitle = "Water sources with no payment information are likely to have no water available"
)

Questions
- Question and Data:
- What is the research question? Clearly state the research question
you aim to address using the new dataset. The research question is “Can
we predict when water is available or is not available at the water
sources in Sierra Leone using characteristics such as water_tech, pay,
and installer?”
- Describe the data briefly: Provide an overview of the new dataset,
highlighting its key characteristics and dimensions. The original raw
data data contains 473,293 obs. of 13 variables. The variables consist
of row_id, water_source, status_id, facility_type, install_year,
latitude, longitude, report_date, country_name, installer, water-tech,
status and pay. There are 4 variables that are numerical data and 9 that
are character data. The data is about water sources from many different
countries in the world.
- What are the characteristics of the key variables used in the
analysis? Describe the primary variables of interest in the dataset and
their characteristics. The primary data of interest is the status_id
variable. This is the variable that we are trying to predict with our
machine learning model. The status_id variable says whether or not that
water source has any water in it. This variable is character data, and
is either y for yes, n for no, or u for unknown. Water_tech is also a
primary variable which describes the technology or method used for
sourcing the water. Installer is another primary variable which explains
who installed the water source. Pay is also a primary variable which
indicates whether or not you have to pay to get water from the water
source.
- Data Exploration and Transformation:
- Describe the differences between the original data and the data
transformed for modeling. Why? Explain any preprocessing or
transformations performed on the new dataset compared to the original
data. Discuss why these changes were necessary or beneficial. The
original data set had 3 more variables and many more observations. The
transformed data has only 54,793 observations. The original data set had
data for many different countries, whereas the transformed data was
filtered to focus on just Sierra Leone.
- Data Preparation and Modeling:
- What are the names of data preparation steps mentioned in the video?
List and describe any data preparation steps or techniques mentioned in
the CA video that you applied to the new dataset. The pay variable was
transformed into categorical data with three levels, either yes, no, or
it’s complicated. This was done using the mutate and case_when function.
We also removed 3 columns that were not as relevant to the machine
learning model. We used the select function and got rid of country_name,
status, and report_date. Mutate_if was used to convert character columns
into factors. We used step_impute_linear to get rid of missing values in
install_year. We also used step_downsample to reduce the number of
observations in the status_id column.
- What is the name of the machine learning model(s) used in the
analysis? Specify the machine learning model(s) you employed for your
analysis and briefly explain their relevance to the research question.
The machine learning model used in this code along was random forest
specifically ranger. Random forest was the best choice for this task
because it can handle numerical and categorical predictor variables.
Since this data set was a mix of both types, random forest was a good
option to choose. Random forest was used for classification with the
target variable being status id. Random forest provides insight about
which predictors are the most important when it comes to determining
water source availability.
- Model Evaluation:
- What metrics are used in the model evaluation? Detail the evaluation
metrics you used to assess the performance of your machine learning
model(s) on the new dataset. Discuss the significance of these metrics
in the context of your research question. The metrics used in the model
evaluation was ROC curve. ROC was used to distinguish between yes or no.
The ROC curve helps understand the model’s discriminatory power. We also
used the confusion matrix to calculate the performance metrics,
specifically in terms of its abiliity to correctly predict positive and
negative cases. This was insightful on understanding the accuracy of the
models predictions on water source availability.
- Conclusion:
- What are the major findings? Summarize the key findings and insights
obtained from your analysis of the new dataset. The key findings in this
case analysis show a few different things. The first is that the pay
column has a huge impact on whether water is available or not. According
to the machine learning model graph, water is much more likely to be
unavailable in the pay column. In the water_tech section, the rope and
bucket water sources seems to be more likely to have water available.
Also in water_tech the hand pump and unknown methods are more likely to
have water be unavailable. In the installer section the private and
private person water sources are more likely to be a yes for water
available. Water aid is also in installer and is also more likely to be
a no for water availability.