Setup

Packages

Let’s load the required packages for this analysis.

library(here)
library(tidyverse)
library(skimr)
library(corrplot)
library(tidytable)
library(janitor)
library(knitr)

Collect the data

Let’s import the data and preview it.

recycling <- 
  read_csv(here("data/cln/data.csv"))

glimpse(recycling)

## Rows: 309
## Columns: 31
## $ lad21cd                     <chr> "E06000001", "E06000002", "E06000003", "E0…
## $ local_authority             <chr> "Hartlepool", "Middlesbrough", "Redcar and…
## $ in_lad                      <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y…
## $ wa21cd                      <chr> "E06000001", "E06000002", "E06000003", "E0…
## $ population                  <dbl> 92571, 143734, 136616, 197030, 108222, 128…
## $ collection_type             <chr> "Co-Mingled", "Co-Mingled", "Two Stream", …
## $ collection_type_sort        <dbl> 1, 1, 2, 3, 3, 1, 1, 2, 2, 1, 1, 3, 3, 3, …
## $ in_foi                      <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y…
## $ waste_authority             <chr> "Hartlepool Borough Council", "Middlesbrou…
## $ waste_authority_type        <chr> "Unitary", "Unitary", "Unitary", "Unitary"…
## $ total_collected             <dbl> 45400, 67649, 67283, 105430, 52638, 63344,…
## $ total_recycled              <dbl> 13264, 20557, 23107, 32635, 17080, 24560, …
## $ total_not_recycled          <dbl> 32136, 47092, 44177, 72794, 35559, 38783, …
## $ total_rejects               <dbl> 510, 3754, 1848, -458, 1561, 3055, 4431, 1…
## $ household_collected         <dbl> 39552, 64433, 57866, 88050, 46203, 59958, …
## $ household_recycled          <dbl> 12893, 19169, 22077, 22592, 14968, 22137, …
## $ household_not_recycled      <dbl> 26659, 45264, 35789, 65458, 31235, 37821, …
## $ household_rejects           <dbl> 510, 3754, 1848, -458, 1561, 3055, 4431, 1…
## $ non_household_collected     <dbl> 5848, 3216, 9417, 17380, 6435, 3385, 5331,…
## $ non_household_recycled      <dbl> 371, 1388, 1030, 10044, 2112, 2423, 2668, …
## $ non_household_not_recycled  <dbl> 5477, 1828, 8388, 7337, 4324, 962, 2664, 5…
## $ non_household_rejects       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ in_rec                      <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y…
## $ household_recycle_rate      <dbl> 0.3259759, 0.2975028, 0.3815194, 0.2565815…
## $ rgn21cd                     <chr> "E12000001", "E12000001", "E12000001", "E1…
## $ region                      <chr> "North East", "North East", "North East", …
## $ bng_e                       <dbl> 447160, 451141, 464361, 444940, 428029, 35…
## $ bng_n                       <dbl> 531474, 516887, 519597, 518183, 515648, 38…
## $ long                        <dbl> -1.27018, -1.21099, -1.00608, -1.30664, -1…
## $ lat                         <dbl> 54.6761, 54.5447, 54.5675, 54.5569, 54.535…
## $ household_collected_per_cap <dbl> 0.4272612, 0.4482795, 0.4235668, 0.4468863…

The dataset contains a large number of character columns. Let’s select only the columns we are interested in for this analysis.

recycling <- 
  recycling |> 
  select(
    lad21cd,
    population,
    collection_type,
    household_collected,
    household_recycled,
    household_not_recycled,
    household_rejects
  )

head(recycling)

## # A tidytable: 6 × 7
##   lad21cd   population collection_type household_collected household_recycled
##   <chr>          <dbl> <chr>                         <dbl>              <dbl>
## 1 E06000001      92571 Co-Mingled                    39552              12893
## 2 E06000002     143734 Co-Mingled                    64433              19169
## 3 E06000003     136616 Two Stream                    57866              22077
## 4 E06000004     197030 Multi-Stream                  88050              22592
## 5 E06000005     108222 Multi-Stream                  46203              14968
## 6 E06000006     128577 Co-Mingled                    59958              22137
## # ℹ 2 more variables: household_not_recycled <dbl>, household_rejects <dbl>

Now we can check the summary statistics for each column and see if we have any missing values.

skim(recycling)

Data summary
Name	recycling
Number of rows	309
Number of columns	7
Key	NULL
_______________________
Column type frequency:
character	2
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
lad21cd	0	1	9	9	0	309	0
collection_type	0	1	10	12	0	3	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
population	1	182965.76	123691.45	2271	103617	142951	224826	1142494	▇▂▁▁▁
household_collected	1	73568.17	54141.93	1373	40697	55102	89832	401225	▇▂▁▁▁
household_recycled	1	30829.63	24491.44	553	17101	22933	34976	149880	▇▃▁▁▁
household_not_recycled	1	42738.56	33479.94	821	22191	31235	55777	309555	▇▂▁▁▁
household_rejects	1	1955.74	2390.31	-866	544	1230	2479	19852	▇▁▁▁▁

We can see that there are no missing values in any of the columns. We notice that household_rejects has a some negative values, which doesn’t make sense. I propose to ignore that column for my analysis.

Exploratory Analysis

Correlations

Let’s run a correlation matrix to understand relationships between the variables.

recycling |> 
  select(where(is.numeric)) |> 
  cor(use = "pairwise.complete.obs") |> 
  round(2) |> 
  corrplot(method = "color", type = "upper")

There is a high correlation between population and household_collected meaning that as the population increases, so does the total amount of household waste that is collected. Same goes for population vs household_not_recycled. Both of these seem fairly obvious, so what if we look at the proportion of waste that is recycled instead of the amount? To do this we need to engineer a new feature into the dataset.

recycling <- 
  recycling |> 
  mutate(
    household_recycled_pct = round(household_recycled / household_collected, 3)
  )

recycling |> 
  select(where(is.numeric)) |> 
  cor(use = "pairwise.complete.obs") |> 
  round(2) |> 
  corrplot(method = "color", type = "upper")

Now we see a negative correlation between population and household_recycled_pct, meaning that as population increases, the proportion of waste that is recycled decreases. Larger populations are associated with lower recycling rates.

Let’s explore this relationship in a scatterplot.

recycling |> 
  ggplot(
    aes(
      x = household_recycled_pct,
      y = population
    )
  ) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Clustering

Normalization

scaled_recycling <- 
  recycling |> 
  select(where(is.numeric)) |> 
  scale()

Fit the model

# Get the 2 columns of interest
scaled_recycling_2col <- 
  scaled_recycling[, c("population", "household_recycled_pct")]

set.seed(123)
km.out <- kmeans(scaled_recycling_2col, centers = 3, nstart = 20)
km.out

## K-means clustering with 3 clusters of sizes 57, 115, 137
## 
## Cluster means:
##   population household_recycled_pct
## 1  1.5274893             -0.7403150
## 2 -0.2386262              1.0287800
## 3 -0.4352181             -0.5555602
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 2 2 3 2 3 1 1 2 1 3 2 1 2 1 2 2 1 3 3 3 3 3 3 1 2 2 2 3
##  [38] 2 2 2 1 3 1 2 1 2 2 2 1 3 1 3 1 1 1 2 1 1 1 2 2 3 2 2 3 3 3 3 3 2 3 3 3 2
##  [75] 3 2 2 2 2 3 2 2 2 2 2 2 3 3 3 2 2 2 2 3 2 3 2 2 3 2 2 3 2 2 2 2 3 2 2 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2 2 2 3 3 3 2 3 2 3 3 2 2 3 2 3 3 3 3 3 3 3 2
## [149] 2 2 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 2 3 3 3 3 3 3 3 3 2
## [186] 2 2 2 2 2 2 2 2 3 3 2 2 2 2 2 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 3 3 3 2 2 3 3
## [223] 2 3 2 3 3 3 3 3 3 3 3 2 2 2 3 3 3 2 2 2 1 2 2 2 2 2 2 2 3 1 3 1 1 2 1 2 1
## [260] 1 3 3 1 1 1 1 1 3 1 1 1 2 1 1 1 3 3 3 1 3 1 1 3 1 1 1 1 1 3 1 1 1 1 1 3 3
## [297] 2 1 1 3 1 1 3 1 2 1 1 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 114.98964  70.55520  69.72411
##  (between_SS / total_SS =  58.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Optimize the model

# Decide how many clusters to look at
n_clusters <- 10

# Initialize total within sum of squares error: wss
wss <- numeric(n_clusters)

set.seed(123)

# Look over 1 to n possible clusters
for (i in 1:n_clusters) {
  # Fit the model
  km.out <- kmeans(scaled_recycling_2col, centers = i, nstart = 20)
  # Save the within cluster sum of squares
  wss[i] <- km.out$tot.withinss
}

# Produce a scree plot
wss_df <- tibble(clusters = 1:n_clusters, wss = wss)

scree_plot <- ggplot(wss_df, aes(x = clusters, y = wss, group = 1)) +
  geom_point(size = 4) +
  geom_line() +
  scale_x_continuous(breaks = c(2, 4, 6, 8, 10)) +
  xlab("Number of clusters")

scree_plot

Rebuild the model with optimized k

k <- 6

set.seed(123)

km.out <- kmeans(scaled_recycling_2col, centers = k, nstart = 20)

recycling$cluster_id <- factor(km.out$cluster)

ggplot(
  recycling, 
  aes(
    x = population,
    y = household_recycled_pct,
    colour = cluster_id
  )
) +
  geom_point() +
  xlab("Population") +
  ylab("Recycling Rate")

Interpret the results

First, we dummy code the only categorical feature, collection_type to allow us to get descriptive statistics for the values of this feature.

# Get dummies
recycling <- 
  recycling |> 
  get_dummies(cols = collection_type) |> 
  select(!collection_type) |> 
  clean_names()

# Summarise descriptive stats of full data
recycling_stats <- 
  recycling |> 
  summarise(
    global_comingled = mean(collection_type_co_mingled),
    global_two_stream = mean(collection_type_two_stream),
    global_multi_stream = mean(collection_type_multi_stream),
    global_population = median(population),
    global_household_collected = median(household_collected),
    global_household_recycled = median(household_recycled),
    global_household_rate = median(household_recycled_pct)
  ) |> 
  round(2)

recycling_stats |> 
  kable()

global_comingled	global_two_stream	global_multi_stream	global_population	global_household_collected	global_household_recycled	global_household_rate
0.51	0.36	0.13	142951	55102	22933	0.42

Next get the same descriptive stats but grouped by cluster.

recycling_stats_by_cluster <- 
  recycling |> 
  group_by(cluster_id) |> 
  summarise(
    cluster_comingled = round(mean(collection_type_co_mingled), 2),
    cluster_two_stream = round(mean(collection_type_two_stream), 2),
    cluster_multi_stream = round(mean(collection_type_multi_stream), 2),
    cluster_population = median(population),
    cluster_household_collected = median(household_collected),
    cluster_household_recycled = median(household_recycled),
    cluster_household_rate = median(household_recycled_pct)
  )

recycling_stats_by_cluster |> 
  kable()

cluster_id	cluster_comingled	cluster_two_stream	cluster_multi_stream	cluster_population	cluster_household_collected	cluster_household_recycled	cluster_household_rate
1	0.56	0.36	0.08	123383.0	45807	14587	0.3240
2	0.22	0.67	0.11	554401.0	239199	92163	0.3610
3	0.69	0.24	0.07	289254.0	107383	34901	0.3200
4	0.47	0.39	0.15	113047.0	45328	19781	0.4270
5	0.52	0.30	0.18	135062.5	51970	28780	0.5425
6	0.33	0.52	0.15	308705.0	127334	62984	0.4870

Let’s plot the differences from the population for each cluster.

recycling_cluster_profiles <- 
  recycling_stats_by_cluster |> 
  bind_cols(recycling_stats) |> 
  # Calculate the difference from global for each cluster
  mutate(
    type_comingled = (cluster_comingled - global_comingled) / global_comingled,
    type_two_stream = (cluster_two_stream - global_two_stream) / global_two_stream,
    type_multi_stream = (cluster_multi_stream - global_multi_stream) / global_multi_stream,
    population = (cluster_population - global_population) / global_population,
    household_collected = (cluster_household_collected - global_household_collected) / global_household_collected,
    household_recycled = (cluster_household_recycled - global_household_recycled) / global_household_recycled,
    household_rate = (cluster_household_rate - global_household_rate) / global_household_rate
  ) |> 
  select(cluster_id, type_comingled, type_two_stream, type_multi_stream, population,
         household_collected, household_recycled, household_rate) |> 
  pivot_longer(cols = !cluster_id, names_to = "feature", values_to = "percent_difference") |> 
  mutate(value_type = if_else(percent_difference >= 0, "positive", "negative"))

recycling_cluster_profiles |> 
  ggplot(aes(x = percent_difference, y = feature, fill = value_type)) +
  geom_col() +
  facet_wrap(~cluster_id, scales = "fixed")

A cluster analysis of recycling systems in England

Clare Gibson