Setup

Packages

Let’s load the required packages for this analysis.

library(here)
library(tidyverse)
library(skimr)
library(corrplot)
library(tidytable)
library(janitor)
library(knitr)

Collect the data

Let’s import the data and preview it.

recycling <- 
  read_csv(here("data/cln/data.csv"))

glimpse(recycling)
## Rows: 309
## Columns: 31
## $ lad21cd                     <chr> "E06000001", "E06000002", "E06000003", "E0…
## $ local_authority             <chr> "Hartlepool", "Middlesbrough", "Redcar and…
## $ in_lad                      <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y…
## $ wa21cd                      <chr> "E06000001", "E06000002", "E06000003", "E0…
## $ population                  <dbl> 92571, 143734, 136616, 197030, 108222, 128…
## $ collection_type             <chr> "Co-Mingled", "Co-Mingled", "Two Stream", …
## $ collection_type_sort        <dbl> 1, 1, 2, 3, 3, 1, 1, 2, 2, 1, 1, 3, 3, 3, …
## $ in_foi                      <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y…
## $ waste_authority             <chr> "Hartlepool Borough Council", "Middlesbrou…
## $ waste_authority_type        <chr> "Unitary", "Unitary", "Unitary", "Unitary"…
## $ total_collected             <dbl> 45400, 67649, 67283, 105430, 52638, 63344,…
## $ total_recycled              <dbl> 13264, 20557, 23107, 32635, 17080, 24560, …
## $ total_not_recycled          <dbl> 32136, 47092, 44177, 72794, 35559, 38783, …
## $ total_rejects               <dbl> 510, 3754, 1848, -458, 1561, 3055, 4431, 1…
## $ household_collected         <dbl> 39552, 64433, 57866, 88050, 46203, 59958, …
## $ household_recycled          <dbl> 12893, 19169, 22077, 22592, 14968, 22137, …
## $ household_not_recycled      <dbl> 26659, 45264, 35789, 65458, 31235, 37821, …
## $ household_rejects           <dbl> 510, 3754, 1848, -458, 1561, 3055, 4431, 1…
## $ non_household_collected     <dbl> 5848, 3216, 9417, 17380, 6435, 3385, 5331,…
## $ non_household_recycled      <dbl> 371, 1388, 1030, 10044, 2112, 2423, 2668, …
## $ non_household_not_recycled  <dbl> 5477, 1828, 8388, 7337, 4324, 962, 2664, 5…
## $ non_household_rejects       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ in_rec                      <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y…
## $ household_recycle_rate      <dbl> 0.3259759, 0.2975028, 0.3815194, 0.2565815…
## $ rgn21cd                     <chr> "E12000001", "E12000001", "E12000001", "E1…
## $ region                      <chr> "North East", "North East", "North East", …
## $ bng_e                       <dbl> 447160, 451141, 464361, 444940, 428029, 35…
## $ bng_n                       <dbl> 531474, 516887, 519597, 518183, 515648, 38…
## $ long                        <dbl> -1.27018, -1.21099, -1.00608, -1.30664, -1…
## $ lat                         <dbl> 54.6761, 54.5447, 54.5675, 54.5569, 54.535…
## $ household_collected_per_cap <dbl> 0.4272612, 0.4482795, 0.4235668, 0.4468863…

The dataset contains a large number of character columns. Let’s select only the columns we are interested in for this analysis.

recycling <- 
  recycling |> 
  select(
    lad21cd,
    population,
    collection_type,
    household_collected,
    household_recycled,
    household_not_recycled,
    household_rejects
  )

head(recycling)
## # A tidytable: 6 × 7
##   lad21cd   population collection_type household_collected household_recycled
##   <chr>          <dbl> <chr>                         <dbl>              <dbl>
## 1 E06000001      92571 Co-Mingled                    39552              12893
## 2 E06000002     143734 Co-Mingled                    64433              19169
## 3 E06000003     136616 Two Stream                    57866              22077
## 4 E06000004     197030 Multi-Stream                  88050              22592
## 5 E06000005     108222 Multi-Stream                  46203              14968
## 6 E06000006     128577 Co-Mingled                    59958              22137
## # ℹ 2 more variables: household_not_recycled <dbl>, household_rejects <dbl>

Now we can check the summary statistics for each column and see if we have any missing values.

skim(recycling)
Data summary
Name recycling
Number of rows 309
Number of columns 7
Key NULL
_______________________
Column type frequency:
character 2
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
lad21cd 0 1 9 9 0 309 0
collection_type 0 1 10 12 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
population 0 1 182965.76 123691.45 2271 103617 142951 224826 1142494 ▇▂▁▁▁
household_collected 0 1 73568.17 54141.93 1373 40697 55102 89832 401225 ▇▂▁▁▁
household_recycled 0 1 30829.63 24491.44 553 17101 22933 34976 149880 ▇▃▁▁▁
household_not_recycled 0 1 42738.56 33479.94 821 22191 31235 55777 309555 ▇▂▁▁▁
household_rejects 0 1 1955.74 2390.31 -866 544 1230 2479 19852 ▇▁▁▁▁

We can see that there are no missing values in any of the columns. We notice that household_rejects has a some negative values, which doesn’t make sense. I propose to ignore that column for my analysis.

Exploratory Analysis

Correlations

Let’s run a correlation matrix to understand relationships between the variables.

recycling |> 
  select(where(is.numeric)) |> 
  cor(use = "pairwise.complete.obs") |> 
  round(2) |> 
  corrplot(method = "color", type = "upper")

There is a high correlation between population and household_collected meaning that as the population increases, so does the total amount of household waste that is collected. Same goes for population vs household_not_recycled. Both of these seem fairly obvious, so what if we look at the proportion of waste that is recycled instead of the amount? To do this we need to engineer a new feature into the dataset.

recycling <- 
  recycling |> 
  mutate(
    household_recycled_pct = round(household_recycled / household_collected, 3)
  )

recycling |> 
  select(where(is.numeric)) |> 
  cor(use = "pairwise.complete.obs") |> 
  round(2) |> 
  corrplot(method = "color", type = "upper")

Now we see a negative correlation between population and household_recycled_pct, meaning that as population increases, the proportion of waste that is recycled decreases. Larger populations are associated with lower recycling rates.

Let’s explore this relationship in a scatterplot.

recycling |> 
  ggplot(
    aes(
      x = household_recycled_pct,
      y = population
    )
  ) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Clustering

Normalization

scaled_recycling <- 
  recycling |> 
  select(where(is.numeric)) |> 
  scale()

Fit the model

# Get the 2 columns of interest
scaled_recycling_2col <- 
  scaled_recycling[, c("population", "household_recycled_pct")]

set.seed(123)
km.out <- kmeans(scaled_recycling_2col, centers = 3, nstart = 20)
km.out
## K-means clustering with 3 clusters of sizes 57, 115, 137
## 
## Cluster means:
##   population household_recycled_pct
## 1  1.5274893             -0.7403150
## 2 -0.2386262              1.0287800
## 3 -0.4352181             -0.5555602
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 2 2 3 2 3 1 1 2 1 3 2 1 2 1 2 2 1 3 3 3 3 3 3 1 2 2 2 3
##  [38] 2 2 2 1 3 1 2 1 2 2 2 1 3 1 3 1 1 1 2 1 1 1 2 2 3 2 2 3 3 3 3 3 2 3 3 3 2
##  [75] 3 2 2 2 2 3 2 2 2 2 2 2 3 3 3 2 2 2 2 3 2 3 2 2 3 2 2 3 2 2 2 2 3 2 2 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2 2 2 3 3 3 2 3 2 3 3 2 2 3 2 3 3 3 3 3 3 3 2
## [149] 2 2 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 2 3 3 3 3 3 3 3 3 2
## [186] 2 2 2 2 2 2 2 2 3 3 2 2 2 2 2 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 3 3 3 2 2 3 3
## [223] 2 3 2 3 3 3 3 3 3 3 3 2 2 2 3 3 3 2 2 2 1 2 2 2 2 2 2 2 3 1 3 1 1 2 1 2 1
## [260] 1 3 3 1 1 1 1 1 3 1 1 1 2 1 1 1 3 3 3 1 3 1 1 3 1 1 1 1 1 3 1 1 1 1 1 3 3
## [297] 2 1 1 3 1 1 3 1 2 1 1 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 114.98964  70.55520  69.72411
##  (between_SS / total_SS =  58.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Optimize the model

# Decide how many clusters to look at
n_clusters <- 10

# Initialize total within sum of squares error: wss
wss <- numeric(n_clusters)

set.seed(123)

# Look over 1 to n possible clusters
for (i in 1:n_clusters) {
  # Fit the model
  km.out <- kmeans(scaled_recycling_2col, centers = i, nstart = 20)
  # Save the within cluster sum of squares
  wss[i] <- km.out$tot.withinss
}

# Produce a scree plot
wss_df <- tibble(clusters = 1:n_clusters, wss = wss)

scree_plot <- ggplot(wss_df, aes(x = clusters, y = wss, group = 1)) +
  geom_point(size = 4) +
  geom_line() +
  scale_x_continuous(breaks = c(2, 4, 6, 8, 10)) +
  xlab("Number of clusters")

scree_plot

Rebuild the model with optimized k

k <- 6

set.seed(123)

km.out <- kmeans(scaled_recycling_2col, centers = k, nstart = 20)
recycling$cluster_id <- factor(km.out$cluster)

ggplot(
  recycling, 
  aes(
    x = population,
    y = household_recycled_pct,
    colour = cluster_id
  )
) +
  geom_point() +
  xlab("Population") +
  ylab("Recycling Rate")

Interpret the results

First, we dummy code the only categorical feature, collection_type to allow us to get descriptive statistics for the values of this feature.

# Get dummies
recycling <- 
  recycling |> 
  get_dummies(cols = collection_type) |> 
  select(!collection_type) |> 
  clean_names()

# Summarise descriptive stats of full data
recycling_stats <- 
  recycling |> 
  summarise(
    global_comingled = mean(collection_type_co_mingled),
    global_two_stream = mean(collection_type_two_stream),
    global_multi_stream = mean(collection_type_multi_stream),
    global_population = median(population),
    global_household_collected = median(household_collected),
    global_household_recycled = median(household_recycled),
    global_household_rate = median(household_recycled_pct)
  ) |> 
  round(2)

recycling_stats |> 
  kable()
global_comingled global_two_stream global_multi_stream global_population global_household_collected global_household_recycled global_household_rate
0.51 0.36 0.13 142951 55102 22933 0.42

Next get the same descriptive stats but grouped by cluster.

recycling_stats_by_cluster <- 
  recycling |> 
  group_by(cluster_id) |> 
  summarise(
    cluster_comingled = round(mean(collection_type_co_mingled), 2),
    cluster_two_stream = round(mean(collection_type_two_stream), 2),
    cluster_multi_stream = round(mean(collection_type_multi_stream), 2),
    cluster_population = median(population),
    cluster_household_collected = median(household_collected),
    cluster_household_recycled = median(household_recycled),
    cluster_household_rate = median(household_recycled_pct)
  )

recycling_stats_by_cluster |> 
  kable()
cluster_id cluster_comingled cluster_two_stream cluster_multi_stream cluster_population cluster_household_collected cluster_household_recycled cluster_household_rate
1 0.56 0.36 0.08 123383.0 45807 14587 0.3240
2 0.22 0.67 0.11 554401.0 239199 92163 0.3610
3 0.69 0.24 0.07 289254.0 107383 34901 0.3200
4 0.47 0.39 0.15 113047.0 45328 19781 0.4270
5 0.52 0.30 0.18 135062.5 51970 28780 0.5425
6 0.33 0.52 0.15 308705.0 127334 62984 0.4870

Let’s plot the differences from the population for each cluster.

recycling_cluster_profiles <- 
  recycling_stats_by_cluster |> 
  bind_cols(recycling_stats) |> 
  # Calculate the difference from global for each cluster
  mutate(
    type_comingled = (cluster_comingled - global_comingled) / global_comingled,
    type_two_stream = (cluster_two_stream - global_two_stream) / global_two_stream,
    type_multi_stream = (cluster_multi_stream - global_multi_stream) / global_multi_stream,
    population = (cluster_population - global_population) / global_population,
    household_collected = (cluster_household_collected - global_household_collected) / global_household_collected,
    household_recycled = (cluster_household_recycled - global_household_recycled) / global_household_recycled,
    household_rate = (cluster_household_rate - global_household_rate) / global_household_rate
  ) |> 
  select(cluster_id, type_comingled, type_two_stream, type_multi_stream, population,
         household_collected, household_recycled, household_rate) |> 
  pivot_longer(cols = !cluster_id, names_to = "feature", values_to = "percent_difference") |> 
  mutate(value_type = if_else(percent_difference >= 0, "positive", "negative"))

recycling_cluster_profiles |> 
  ggplot(aes(x = percent_difference, y = feature, fill = value_type)) +
  geom_col() +
  facet_wrap(~cluster_id, scales = "fixed")