library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
openaq_data <- read.csv('openaq.csv', sep = ';')
str(openaq_data)
## 'data.frame': 61177 obs. of 10 variables:
## $ Country.Code : chr "CN" "CN" "CN" "CN" ...
## $ City : chr "" "" "" "" ...
## $ Location : chr "市八十六中" "市农科院" "市发改委" "市委" ...
## $ Coordinates : chr "23.1047, 113.43319999999999" "21.9508, 108.6553" "29.8454, 114.3107" "30.457600000000003, 106.63030000000002" ...
## $ Pollutant : chr "O3" "SO2" "PM2.5" "O3" ...
## $ Source.Name : chr "ChinaAQIData" "ChinaAQIData" "ChinaAQIData" "ChinaAQIData" ...
## $ Unit : chr "µg/m³" "µg/m³" "µg/m³" "µg/m³" ...
## $ Value : num 36 7 26 91 19 113 600 300 98 14 ...
## $ Last.Updated : chr "2021-08-09T13:00:00+02:00" "2020-12-31T17:00:00+01:00" "2021-08-09T13:00:00+02:00" "2021-08-09T13:00:00+02:00" ...
## $ Country.Label: chr "China" "China" "China" "China" ...
I will first cluster the data at the country level, so I will group the data by country.
openaq_data$Country.Code <- factor(openaq_data$Country.Code)
str(openaq_data)
## 'data.frame': 61177 obs. of 10 variables:
## $ Country.Code : Factor w/ 137 levels "-99","AD","AE",..: 30 30 30 30 30 30 30 30 30 30 ...
## $ City : chr "" "" "" "" ...
## $ Location : chr "市八十六中" "市农科院" "市发改委" "市委" ...
## $ Coordinates : chr "23.1047, 113.43319999999999" "21.9508, 108.6553" "29.8454, 114.3107" "30.457600000000003, 106.63030000000002" ...
## $ Pollutant : chr "O3" "SO2" "PM2.5" "O3" ...
## $ Source.Name : chr "ChinaAQIData" "ChinaAQIData" "ChinaAQIData" "ChinaAQIData" ...
## $ Unit : chr "µg/m³" "µg/m³" "µg/m³" "µg/m³" ...
## $ Value : num 36 7 26 91 19 113 600 300 98 14 ...
## $ Last.Updated : chr "2021-08-09T13:00:00+02:00" "2020-12-31T17:00:00+01:00" "2021-08-09T13:00:00+02:00" "2021-08-09T13:00:00+02:00" ...
## $ Country.Label: chr "China" "China" "China" "China" ...
There is an unexpected factor level ‘-99’ in Country.Code, so I am going to explore what it is:
openaq_data[openaq_data$Country.Code == '-99',]
## Country.Code City Location
## 2306 -99 Ορμήδια - Βιομηχανικός Σταθμός Ormidia - Industrial Station
## 10362 -99 Ορμήδια - Βιομηχανικός Σταθμός Ormidia - Industrial Station
## 10890 -99 Ορμήδια - Βιομηχανικός Σταθμός Ormidia - Industrial Station
## 23473 -99 Ορμήδια - Βιομηχανικός Σταθμός Ormidia - Industrial Station
## 34321 -99 Ορμήδια - Βιομηχανικός Σταθμός Ormidia - Industrial Station
## 57862 -99 Ορμήδια - Βιομηχανικός Σταθμός Ormidia - Industrial Station
## 57863 -99 Ορμήδια - Βιομηχανικός Σταθμός Ormidia - Industrial Station
## Coordinates Pollutant
## 2306 34.9923, 33.779 PM10
## 10362 34.9923, 33.779 PM2.5
## 10890 34.9923, 33.779 SO2
## 23473 34.9923, 33.779 NOX
## 34321 34.9923, 33.779 O3
## 57862 34.9923, 33.779 NO
## 57863 34.9923, 33.779 NO2
## Source.Name Unit Value
## 2306 Republic of Cyprus Department of Labor Inspection µg/m³ 22.80000
## 10362 Republic of Cyprus Department of Labor Inspection µg/m³ 11.21000
## 10890 Republic of Cyprus Department of Labor Inspection µg/m³ 6.84391
## 23473 Republic of Cyprus Department of Labor Inspection µg/m³ 13.62056
## 34321 Republic of Cyprus Department of Labor Inspection µg/m³ 99.55051
## 57862 Republic of Cyprus Department of Labor Inspection µg/m³ 0.83549
## 57863 Republic of Cyprus Department of Labor Inspection µg/m³ 12.35798
## Last.Updated Country.Label
## 2306 2025-01-31T10:00:00+01:00
## 10362 2025-01-31T10:00:00+01:00
## 10890 2025-01-31T10:00:00+01:00
## 23473 2025-01-31T10:00:00+01:00
## 34321 2025-01-31T10:00:00+01:00
## 57862 2025-01-31T10:00:00+01:00
## 57863 2025-01-31T10:00:00+01:00
All of the entries with Country.Code -99 have the City ‘Ορμήδια - Βιομηχανικός Σταθμός’ (Ormidia - Industrial Station), which is in Cyprus. Let’s check if Cyprus is already in the data:
head(openaq_data[openaq_data$Country.Code == 'CY',])
## Country.Code City
## 123 CY Αγία Μαρίνα Ξυλιάτου - Σταθμός Υποβάθρου
## 2440 CY Λεμεσός - Κυκλοφοριακός Σταθμός
## 2441 CY Καλαβασός - Βιομηχανικός Σταθμός
## 2442 CY Πάφος - Κυκλοφοριακός Σταθμός
## 4978 CY Παραλίμνι - Κυκλοφοριακός Σταθμός
## 4979 CY Πάφος - Κυκλοφοριακός Σταθμός
## Location
## 123 Ayia Marina Xyliatou - Background Station
## 2440 Limassol - Traffic Station
## 2441 Kalavasos - Industrial Station
## 2442 Paphos - Traffic Station
## 4978 Paralimni - Traffic Station
## 4979 Paphos - Traffic Station
## Coordinates Pollutant
## 123 35.0380556, 33.05777779999994 O3
## 2440 34.6861111, 33.03555559999995 SO2
## 2441 34.77152999999999, 33.29623 NO2
## 2442 34.7752778, 32.42194440000003 NO
## 4978 35.04569276191925, 33.977734884655774 NO
## 4979 34.7752778, 32.42194440000003 PM2.5
## Source.Name Unit Value
## 123 Republic of Cyprus Department of Labor Inspection µg/m³ 114.8657000
## 2440 Republic of Cyprus Department of Labor Inspection µg/m³ 0.6449241
## 2441 Republic of Cyprus Department of Labor Inspection µg/m³ 7.5563500
## 2442 Republic of Cyprus Department of Labor Inspection µg/m³ 1.9617910
## 4978 Republic of Cyprus Department of Labor Inspection µg/m³ 11.5399600
## 4979 Republic of Cyprus Department of Labor Inspection µg/m³ 14.7685100
## Last.Updated Country.Label
## 123 2025-01-31T10:00:00+01:00 Cyprus
## 2440 2025-01-31T10:00:00+01:00 Cyprus
## 2441 2025-01-31T10:00:00+01:00 Cyprus
## 2442 2025-01-31T10:00:00+01:00 Cyprus
## 4978 2025-01-31T10:00:00+01:00 Cyprus
## 4979 2025-01-31T10:00:00+01:00 Cyprus
Yes, Cyprus is already in the data, so I will just recode -99 as CY:
openaq_data[openaq_data$Country.Code == '-99',]$Country.Code <- 'CY'
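Side note: after this recode, ‘-99’ lingers as an unused factor level (the str() output further down still shows 137 levels). Dropping it would keep later summaries clean; a one-line sketch, not applied here:

openaq_data$Country.Code <- droplevels(openaq_data$Country.Code) # drop the now-empty '-99' level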
I want to reshape the data so that the pollutants become separate columns holding their values in each row. Let’s first check which pollutants appear in the data and whether they all have enough observations, and handle that.
unique(openaq_data$Pollutant)
## [1] "O3" "SO2" "PM2.5" "NO2"
## [5] "CO" "PM10" "NO" "PM1"
## [9] "RELATIVEHUMIDITY" "TEMPERATURE" "NOX" "UM003"
## [13] "BC"
So overall we have the following pollutants: BC, CO, NO, NO2, NOX, O3, PM1, PM10, PM2.5, RELATIVEHUMIDITY, SO2, TEMPERATURE, UM003. Let’s check how many observations each of them has in the data:
print(paste('BC:',count(openaq_data[openaq_data$Pollutant == 'BC',])))
## [1] "BC: 137"
print(paste('CO:',count(openaq_data[openaq_data$Pollutant == 'CO',])))
## [1] "CO: 5968"
print(paste('NO:',count(openaq_data[openaq_data$Pollutant == 'NO',])))
## [1] "NO: 4030"
print(paste('NO2:',count(openaq_data[openaq_data$Pollutant == 'NO2',])))
## [1] "NO2: 11129"
print(paste('NOX:',count(openaq_data[openaq_data$Pollutant == 'NOX',])))
## [1] "NOX: 2094"
print(paste('O3:',count(openaq_data[openaq_data$Pollutant == 'O3',])))
## [1] "O3: 9377"
print(paste('PM1:',count(openaq_data[openaq_data$Pollutant == 'PM1',])))
## [1] "PM1: 159"
print(paste('PM10:',count(openaq_data[openaq_data$Pollutant == 'PM10',])))
## [1] "PM10: 9264"
print(paste('PM2.5:',count(openaq_data[openaq_data$Pollutant == 'PM2.5',])))
## [1] "PM2.5: 10662"
print(paste('RELATIVEHUMIDITY:',count(openaq_data[openaq_data$Pollutant == 'RELATIVEHUMIDITY',])))
## [1] "RELATIVEHUMIDITY: 89"
print(paste('SO2:',count(openaq_data[openaq_data$Pollutant == 'SO2',])))
## [1] "SO2: 8061"
print(paste('TEMPERATURE:',count(openaq_data[openaq_data$Pollutant == 'TEMPERATURE',])))
## [1] "TEMPERATURE: 120"
print(paste('UM003:',count(openaq_data[openaq_data$Pollutant == 'UM003',])))
## [1] "UM003: 87"
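As an aside, the repetitive block above can be replaced by a single base-R call that produces the same counts (a sketch; ordering here is by frequency):

# Count observations per pollutant in one call, sorted for readability
sort(table(openaq_data$Pollutant), decreasing = TRUE)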
I will drop those with too few observations, say fewer than 1,000.
openaq_data$Pollutant <- factor(openaq_data$Pollutant)
openaq_data <- openaq_data[openaq_data$Pollutant %in% names(which(table(openaq_data$Pollutant) >= 1000)), ]
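A quick sanity check that the filter behaved as intended (a sketch; droplevels removes the now-empty factor levels so only the kept pollutants are tabulated):

table(droplevels(openaq_data$Pollutant))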
duplicates <- openaq_data %>%
group_by(Country.Code, City, Location, Coordinates, Source.Name, Unit, Last.Updated, Country.Label, Pollutant) %>%
filter(n() > 1) %>%
arrange(Pollutant, Location)
duplicates
## # A tibble: 128 × 10
## # Groups: Country.Code, City, Location, Coordinates, Source.Name, Unit,
## # Last.Updated, Country.Label, Pollutant [61]
## Country.Code City Location Coordinates Pollutant Source.Name Unit Value
## <fct> <chr> <chr> <chr> <fct> <chr> <chr> <dbl>
## 1 ZA Sekhuk… Dilokong -24.615222… CO South Afri… ppm 0.789
## 2 ZA Sekhuk… Dilokong -24.615222… CO South Afri… ppm 0.856
## 3 ZA Sekhuk… Dilokong -24.615222… CO South Afri… ppm 1.04
## 4 ZA Sekhuk… Dilokong -24.615222… CO South Afri… ppm 0.824
## 5 ZA Sekhuk… Dilokong -24.615222… NO South Afri… ppm 0.0139
## 6 ZA Sekhuk… Dilokong -24.615222… NO South Afri… ppm 0.0152
## 7 ZA Sekhuk… Dilokong -24.615222… NO South Afri… ppm 0.0154
## 8 ZA Sekhuk… Dilokong -24.615222… NO South Afri… ppm 0.0137
## 9 ZA Sekhuk… Dilokong -24.615222… NO2 South Afri… ppm 0.00156
## 10 ZA Sekhuk… Dilokong -24.615222… NO2 South Afri… ppm 0.00139
## # ℹ 118 more rows
## # ℹ 2 more variables: Last.Updated <chr>, Country.Label <chr>
There are some duplicates, since some air quality measurements were reported several times across different timestamps at the same location, so I will take the median of those cases and keep one row.
As I want to cluster based on the pollutant types, I need to reshape the data:
openaq_data <- openaq_data %>%
pivot_wider(
id_cols = c(Country.Code, City, Location, Coordinates, Source.Name, Unit, Last.Updated, Country.Label),
names_from = Pollutant,
values_from = Value,
values_fn = list(Value = median) # take the median when duplicate rows collapse into one cell
)
head(openaq_data)
## # A tibble: 6 × 16
## Country.Code City Location Coordinates Source.Name Unit Last.Updated
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 CN "" 市八十六中 23.1047, 113.433… ChinaAQIDa… µg/m³ 2021-08-09T…
## 2 CN "" 市农科院 21.9508, 108.6553 ChinaAQIDa… µg/m³ 2020-12-31T…
## 3 CN "" 市发改委 29.8454, 114.3107 ChinaAQIDa… µg/m³ 2021-08-09T…
## 4 CN "" 市委 30.4576000000000… ChinaAQIDa… µg/m³ 2021-08-09T…
## 5 CN "" 市委党校 27.7314000000000… ChinaAQIDa… µg/m³ 2021-08-09T…
## 6 CN "" 市审计局 25.8070999999999… ChinaAQIDa… µg/m³ 2021-08-09T…
## # ℹ 9 more variables: Country.Label <chr>, O3 <dbl>, SO2 <dbl>, PM2.5 <dbl>,
## # NO2 <dbl>, CO <dbl>, PM10 <dbl>, NO <dbl>, NOX <dbl>
class(openaq_data$SO2)
## [1] "numeric"
Let us check for outliers; this is important to do before scaling or imputing missing data.
cols <- c("O3", "PM2.5", "NO2", "CO", "NO", "NOX", "PM10", "SO2")
boxplot(openaq_data[, cols], main = "Pollutants")
CO has a lot of large outliers; I need to handle them.
summary(openaq_data$CO)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -49051.4 0.4 336.6 5591.4 665.0 3198676.0 19513
It is strange that the minimum value is -49051. I need to check all the pollutants, since none of them should have negative values.
for (i in c("O3", "PM2.5", "NO2", "CO", "NO", "NOX", "PM10", "SO2")) {
n <- nrow(openaq_data[openaq_data[[i]] < 0 & !is.na(openaq_data[[i]]), ])
print(head(openaq_data[openaq_data[[i]] < 0 & !is.na(openaq_data[[i]]), ]))
print(paste(i, ": ", n, "negative values"))
}
## # A tibble: 6 × 16
## Country.Code City Location Coordinates Source.Name Unit Last.Updated
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 LT Karolini ki … "\"Viln… 54.6861114… eea µg/m³ 2025-01-31T…
## 2 LT Petra in sen… "\"Kaun… 54.8951280… eea µg/m³ 2025-01-31T…
## 3 LV Rezekne "Rezekn… 56.5110139… eea µg/m³ 2025-01-31T…
## 4 ZA Gert Sibande "Camden" -26.6226, … South Afri… ppm 2023-05-29T…
## 5 RO ALBA IULIA "AB-1" 46.0642749… eea µg/m³ 2024-09-08T…
## 6 RO RAMNICU VALC… "VL-2" 45.0388309… eea µg/m³ 2024-09-08T…
## # ℹ 9 more variables: Country.Label <chr>, O3 <dbl>, SO2 <dbl>, PM2.5 <dbl>,
## # NO2 <dbl>, CO <dbl>, PM10 <dbl>, NO <dbl>, NOX <dbl>
## [1] "O3 : 121 negative values"
## # A tibble: 6 × 16
## Country.Code City Location Coordinates Source.Name Unit Last.Updated
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 LT Petra in sen… "\"Kaun… 54.8951280… eea µg/m³ 2025-01-31T…
## 2 NO Oslo "Vahl s… 59.9169440… eea µg/m³ 2025-01-31T…
## 3 PL Biała "Biała,… 52.6025339… eea µg/m³ 2024-12-19T…
## 4 IT Via Amiternu… "AQ - A… 42.3641700… eea µg/m³ 2025-01-11T…
## 5 IT C.so Umberto… "MONTES… 42.5175000… eea µg/m³ 2025-01-11T…
## 6 US Poughkeepsie… "Newbur… 41.4994, -… AirNow µg/m³ 2023-05-31T…
## # ℹ 9 more variables: Country.Label <chr>, O3 <dbl>, SO2 <dbl>, PM2.5 <dbl>,
## # NO2 <dbl>, CO <dbl>, PM10 <dbl>, NO <dbl>, NOX <dbl>
## [1] "PM2.5 : 217 negative values"
## # A tibble: 6 × 16
## Country.Code City Location Coordinates Source.Name Unit Last.Updated
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 LT Karolini ki … "\"Viln… 54.6861114… eea µg/m³ 2025-01-31T…
## 2 LT Petra in sen… "\"Kaun… 54.8951280… eea µg/m³ 2025-01-31T…
## 3 NL Breda "Breda-… 51.6031, 4… Netherlands µg/m³ 2025-01-31T…
## 4 NL Heerlen "Heerle… 50.888, 5.… Netherlands µg/m³ 2025-01-31T…
## 5 NL Veldhoven "Veldho… 51.4074, 5… Netherlands µg/m³ 2025-01-31T…
## 6 HU Budapest "\"Buda… 47.4919440… eea µg/m³ 2025-01-31T…
## # ℹ 9 more variables: Country.Label <chr>, O3 <dbl>, SO2 <dbl>, PM2.5 <dbl>,
## # NO2 <dbl>, CO <dbl>, PM10 <dbl>, NO <dbl>, NOX <dbl>
## [1] "NO2 : 216 negative values"
## # A tibble: 6 × 16
## Country.Code City Location Coordinates Source.Name Unit Last.Updated
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 LT Petra in sen… "\"Kaun… 54.8951280… eea µg/m³ 2025-01-31T…
## 2 PT Rede de Qual… "Franci… 41.1644440… eea µg/m³ 2024-11-29T…
## 3 AT 5400 Hallein "Hallei… 47.6825900… eea µg/m³ 2024-12-21T…
## 4 TR Trabzon "Trabzo… 41.014341,… Turkiye µg/m³ 2022-02-18T…
## 5 PE Ilo "Ilo - … -17.634021… peru-oefa µg/m³ 2025-01-07T…
## 6 PE Tupac Amaru … "Pisco … -13.713381… peru-oefa µg/m³ 2025-01-08T…
## # ℹ 9 more variables: Country.Label <chr>, O3 <dbl>, SO2 <dbl>, PM2.5 <dbl>,
## # NO2 <dbl>, CO <dbl>, PM10 <dbl>, NO <dbl>, NOX <dbl>
## [1] "CO : 44 negative values"
## # A tibble: 6 × 16
## Country.Code City Location Coordinates Source.Name Unit Last.Updated
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NL Wijk aan Zee Wijk aan… 52.4939920… eea µg/m³ 2024-03-25T…
## 2 NL Amsterdam Amsterda… 52.3813310… eea µg/m³ 2024-03-25T…
## 3 NL Breda Breda-Ba… 51.6031, 4… Netherlands µg/m³ 2025-01-31T…
## 4 NL Heerlen Heerlen-… 50.888, 5.… Netherlands µg/m³ 2025-01-31T…
## 5 NL Veldhoven Veldhove… 51.4074, 5… Netherlands µg/m³ 2025-01-31T…
## 6 US Bakersfield Edison 35.345607,… AirNow ppm 2023-05-31T…
## # ℹ 9 more variables: Country.Label <chr>, O3 <dbl>, SO2 <dbl>, PM2.5 <dbl>,
## # NO2 <dbl>, CO <dbl>, PM10 <dbl>, NO <dbl>, NOX <dbl>
## [1] "NO : 110 negative values"
## # A tibble: 5 × 16
## Country.Code City Location Coordinates Source.Name Unit Last.Updated
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 US Ogden-Clearf… Harrisv… 41.302799,… AirNow ppm 2023-05-31T…
## 2 ZA Gert Sibande Elandsf… -26.245481… South Afri… ppm 2023-05-29T…
## 3 ZA Gert Sibande Camden -26.6226, … South Afri… ppm 2023-05-29T…
## 4 ZA Nkangala Masakha… -25.97255,… South Afri… ppm 2023-05-29T…
## 5 ZA Gert Sibande Grootvl… -26.801033… South Afri… ppm 2023-05-07T…
## # ℹ 9 more variables: Country.Label <chr>, O3 <dbl>, SO2 <dbl>, PM2.5 <dbl>,
## # NO2 <dbl>, CO <dbl>, PM10 <dbl>, NO <dbl>, NOX <dbl>
## [1] "NOX : 5 negative values"
## # A tibble: 6 × 16
## Country.Code City Location Coordinates Source.Name Unit Last.Updated
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 LT Karolini ki … "\"Viln… 54.6861114… eea µg/m³ 2025-01-31T…
## 2 LT Petra in sen… "\"Kaun… 54.8951280… eea µg/m³ 2025-01-31T…
## 3 NO Oslo "Vahl s… 59.9169440… eea µg/m³ 2025-01-31T…
## 4 PL Biała "Biała,… 52.6025339… eea µg/m³ 2024-12-19T…
## 5 IT Via Amiternu… "AQ - A… 42.3641700… eea µg/m³ 2025-01-11T…
## 6 IT C.so Umberto… "MONTES… 42.5175000… eea µg/m³ 2025-01-11T…
## # ℹ 9 more variables: Country.Label <chr>, O3 <dbl>, SO2 <dbl>, PM2.5 <dbl>,
## # NO2 <dbl>, CO <dbl>, PM10 <dbl>, NO <dbl>, NOX <dbl>
## [1] "PM10 : 202 negative values"
## # A tibble: 6 × 16
## Country.Code City Location Coordinates Source.Name Unit Last.Updated
## <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 FR Nord "FR1002… 50.9959079… EEA France µg/m³ 2024-01-29T…
## 2 LT Karolini ki … "\"Viln… 54.6861114… eea µg/m³ 2025-01-31T…
## 3 LT Petra in sen… "\"Kaun… 54.8951280… eea µg/m³ 2025-01-31T…
## 4 HU Budapest "\"Buda… 47.4919440… eea µg/m³ 2025-01-31T…
## 5 US Houston-Suga… "Housto… 29.6239, -… AirNow ppm 2016-03-06T…
## 6 US ST. LOUIS "Rider … 38.75264, … AirNow ppm 2023-05-31T…
## # ℹ 9 more variables: Country.Label <chr>, O3 <dbl>, SO2 <dbl>, PM2.5 <dbl>,
## # NO2 <dbl>, CO <dbl>, PM10 <dbl>, NO <dbl>, NOX <dbl>
## [1] "SO2 : 200 negative values"
So around 1% of the data is negative, which is not significant. My guess is that this comes either from malfunctioning sensors or from mistakes in the reports. Since the negative values come from completely different cities (and hence different sensors), I suspect a reporting issue rather than sensor malfunction. I can do one of three things: remove those rows, set them to 0, or fill them with the median. I was torn between dropping the values and setting them to 0, but since this is not a significant amount of data I will set them to NA, so they can be filled with the median later.
for (col in cols) {
openaq_data[[col]][openaq_data[[col]] < 0] <- NA
}
I will continue with the outliers and zoom in to see where the positive outliers lie.
boxplot(openaq_data[, cols], main = "Pollutants", ylim = c(0, 1000))
Not too many positive outliers lie above this range. However, I realise that the Unit differs between measurements, even for the same pollutant.
for (col in cols){
print(paste(col, ":", nrow(openaq_data[openaq_data[[col]] > 1000,])))
}
## [1] "O3 : 16234"
## [1] "PM2.5 : 15110"
## [1] "NO2 : 14581"
## [1] "CO : 20125"
## [1] "NO : 21561"
## [1] "NOX : 23392"
## [1] "PM10 : 16458"
## [1] "SO2 : 17646"
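A caveat about these counts: subsetting with openaq_data[[col]] > 1000 keeps rows where the comparison evaluates to NA, so every missing value is counted as well and the numbers are heavily inflated (NOX, for instance, only had 2,094 raw observations). A sketch of a count that excludes NAs explicitly:

# Count only rows that actually exceed 1000; NAs are dropped from the comparison
for (col in cols) {
  print(paste(col, ":", sum(openaq_data[[col]] > 1000, na.rm = TRUE)))
}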
Let’s check whether the units are all the same; otherwise I will need to convert the data:
unique(openaq_data$Unit)
## [1] "µg/m³" "ppm" "ppb"
nrow(openaq_data[openaq_data$Unit == 'ppb',])
## [1] 6
nrow(openaq_data[openaq_data$Unit == 'ppm',])
## [1] 6335
nrow(openaq_data[openaq_data$Unit == 'µg/m³',])
## [1] 19138
So µg/m³ covers most of the data. To get started with clustering I will simply set the non-µg/m³ values to NA (to be filled with the median later); if I have time I will come back and convert all of the values to µg/m³.
for(p in cols) {
openaq_data[[p]][openaq_data$Unit != "µg/m³"] <- NA
}
openaq_data$Unit <- "µg/m³"
boxplot(openaq_data[, cols], main = "Pollutants", ylim = c(0, 1000))
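If I do get back to the unit conversion: for gases, ppm can be converted to µg/m³ with the ideal-gas relation µg/m³ = ppm × 1000 × MW / 24.45, where 24.45 L/mol is the molar volume at 25 °C and 1 atm. A sketch under those assumptions (standard molar masses; NOX is assumed to be reported as NO2-equivalent; PM values are already mass concentrations, so the formula does not apply to them):

# Molar masses in g/mol (NO2-equivalent assumed for NOX)
mw <- c(O3 = 48.00, NO2 = 46.01, NO = 30.01, SO2 = 64.07, CO = 28.01, NOX = 46.01)

ppm_to_ugm3 <- function(value_ppm, pollutant) {
  # µg/m³ = ppm * 1000 * MW / 24.45 (ideal gas at 25 °C, 1 atm)
  value_ppm * 1000 * mw[[pollutant]] / 24.45
}

ppm_to_ugm3(1, "CO") # 1 ppm of CO is roughly 1146 µg/m³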
There is a lot of missing data for some of the pollutants. I will fill it in with the median rather than the mean, since the median is robust to outliers.
cols <- c("O3", "PM2.5", "NO2", "CO", "NO", "NOX", "PM10", "SO2")
for (col in cols) {
openaq_data[[col]][is.na(openaq_data[[col]])] <- median(openaq_data[[col]], na.rm = TRUE)
}
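An alternative worth noting (a sketch, not applied here; the global-median fill above is what the rest of the analysis uses): fill each country’s gaps with that country’s own median first, so one region’s typical level is not imputed into another, and only fall back to the global median for countries with no readings of a pollutant at all. The name openaq_data_imp is hypothetical:

openaq_data_imp <- openaq_data %>%
  group_by(Country.Code) %>%
  # first pass: per-country medians (leaves NA where a country has no readings)
  mutate(across(all_of(cols), \(x) ifelse(is.na(x), median(x, na.rm = TRUE), x))) %>%
  ungroup() %>%
  # second pass: fall back to the global median for the remaining NAs
  mutate(across(all_of(cols), \(x) ifelse(is.na(x), median(x, na.rm = TRUE), x)))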
I need to scale the data, so that pollutants with large absolute ranges do not dominate the distance calculations.
openaq_data[paste0(cols, "_scaled")] <- scale(openaq_data[cols])
There are a lot of columns that give us granularity on location; however, for this particular clustering part I will not need such deep granularity, so I will keep only the country code and the necessary features.
openaq_country <- openaq_data[,c('Country.Code', "O3_scaled", "PM2.5_scaled", "NO2_scaled", "CO_scaled", "NO_scaled", "NOX_scaled", "PM10_scaled", "SO2_scaled") ]
Now I need to group the countries together, since we have many rows for the same country. I will aggregate with the median, since there are cases where a country is generally clean but the air quality in its capital is terrible, and we still want to assign such a country to the clean group. I will look at individual cities (or rather capitals) in another analysis.
openaq_country <- aggregate(. ~ Country.Code, data = openaq_country, FUN = median)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
set.seed(123)
hopkins_stat <- get_clust_tendency(openaq_country[,-1], n = nrow(openaq_country) -1, graph = TRUE)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
hopkins_stat
## $hopkins_stat
## [1] 0.9224853
##
## $plot
A Hopkins statistic above 0.75 indicates a strong clustering tendency; mine is 0.92, which clearly indicates that I can proceed with clustering the data.
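(For reference, in the convention used by get_clust_tendency the statistic compares nearest-neighbour distances \(u_i\) of points generated uniformly over the data range with nearest-neighbour distances \(w_i\) of sampled real points, \(H = \sum_i u_i / (\sum_i u_i + \sum_i w_i)\); strongly clustered data has small \(w_i\), pushing \(H\) towards 1.)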
Now I need to find a suitable number of clusters for my clustering analysis of countries. I will use the Gap statistic, the elbow method and the silhouette method.
library(factoextra)
fviz_nbclust(openaq_country[,-1] , FUN = kmeans, method = "gap_stat")
From this plot I would choose 4 clusters, because there is a big jump from 3 to 4 in the Gap statistic, and after 4 the curve still goes up but not as fast, so stopping at 4 is a good trade-off between quality and computation speed. The formal suggestion here is only 1 cluster, which does not make sense to me, as that just puts all the data into a single cluster.
Let’s also confirm this with elbow method:
fviz_nbclust(openaq_country[,-1] , FUN = kmeans, method = "wss")
The WSS (elbow) method suggests picking 5 clusters.
Let’s also check with the silhouette method:
fviz_nbclust(openaq_country[, -1] , FUN = kmeans, method = "silhouette")
So the 3 methods suggest different numbers of clusters. I want to pick 5, because the drop in the elbow plot is quite significant, and this is also supported by the silhouette method: even though it suggests 2 with the biggest average width, 5 also shows a relatively large average width. I would also like to explore what 5 clusters could give me, because picking just 2 clusters might only separate “clean” and “polluted” countries, and I want to explore more than that.
Now let’s use k-means clustering to cluster our data.
rownames(openaq_country) <- openaq_country$Country.Code # label rows with country codes
kmean_country_k2 <- eclust(openaq_country[,-1], FUNcluster = "kmeans", k = 2, graph = TRUE)
as.data.frame(kmean_country_k2$cluster)
## kmean_country_k2$cluster
## AD 2
## AE 1
## AF 2
## AG 2
## AJ 2
## AM 2
## AQ 2
## AR 2
## AT 2
## AU 2
## AZ 2
## BA 2
## BD 2
## BE 2
## BF 2
## BG 2
## BH 2
## BK 2
## BM 2
## BR 2
## BZ 2
## CA 2
## CD 2
## CE 2
## CF 2
## CH 2
## CI 2
## CL 2
## CN 1
## CO 2
## CR 2
## CS 2
## CW 2
## CY 1
## CZ 2
## DE 2
## DK 2
## DZ 2
## EC 2
## EE 2
## EG 2
## ES 2
## ET 2
## FI 2
## FR 2
## GA 2
## GB 2
## GH 2
## GI 2
## GN 2
## GR 2
## GT 2
## HK 1
## HR 2
## HU 2
## ID 2
## IE 2
## IL 2
## IN 2
## IQ 2
## IS 2
## IT 2
## IZ 2
## JO 2
## JP 2
## KE 2
## KG 2
## KR 2
## KU 2
## KV 2
## KW 2
## KZ 2
## LA 2
## LK 2
## LT 2
## LU 2
## LV 2
## MA 2
## MD 2
## ME 2
## MF 2
## MG 2
## MK 2
## ML 2
## MM 2
## MN 2
## MT 2
## MX 2
## MY 2
## MZ 2
## NG 2
## NI 2
## NL 2
## NO 2
## NP 2
## NZ 2
## OM 1
## PE 2
## PH 2
## PK 2
## PL 2
## PR 2
## PS 2
## PT 2
## PY 2
## QA 2
## RO 2
## RS 2
## RU 2
## RW 2
## SA 2
## SD 2
## SE 2
## SG 2
## SI 2
## SK 2
## SM 1
## SN 2
## SU 2
## TD 2
## TH 2
## TI 2
## TJ 2
## TM 2
## TR 2
## TT 2
## TW 2
## TX 2
## UC 2
## UG 2
## US 2
## UZ 2
## VM 2
## VN 2
## XK 2
## ZA 2
The 2 clusters are very clear and interpretable. In the small cluster (cluster 1 in the output above) we have the most industrial countries with the heaviest urban traffic, and hence pollution, such as China and Hong Kong, as well as countries with geographically high dust levels like the UAE and Oman.
The countries in cluster 2 are those less affected by urbanisation and geographical location.
I also want to have a look at 9 clusters, as the WSS plot clearly shows its second-biggest drop at 9, and the Gap statistic its biggest increase at 9.
rownames(openaq_country) <- openaq_country$Country.Code
kmean_country_k9 <- eclust(openaq_country[,-1], FUNcluster = "kmeans", k = 9, graph = TRUE)
as.data.frame(kmean_country_k9$cluster)
## kmean_country_k9$cluster
## AD 9
## AE 7
## AF 6
## AG 9
## AJ 9
## AM 9
## AQ 9
## AR 9
## AT 9
## AU 9
## AZ 9
## BA 9
## BD 6
## BE 9
## BF 6
## BG 9
## BH 9
## BK 9
## BM 9
## BR 9
## BZ 9
## CA 9
## CD 6
## CE 9
## CF 9
## CH 8
## CI 9
## CL 9
## CN 7
## CO 9
## CR 9
## CS 9
## CW 9
## CY 5
## CZ 9
## DE 9
## DK 9
## DZ 9
## EC 9
## EE 1
## EG 6
## ES 9
## ET 9
## FI 9
## FR 9
## GA 9
## GB 9
## GH 9
## GI 9
## GN 9
## GR 9
## GT 9
## HK 3
## HR 9
## HU 9
## ID 9
## IE 9
## IL 9
## IN 2
## IQ 9
## IS 9
## IT 9
## IZ 9
## JO 9
## JP 9
## KE 9
## KG 6
## KR 9
## KU 9
## KV 9
## KW 9
## KZ 6
## LA 9
## LK 9
## LT 9
## LU 9
## LV 6
## MA 9
## MD 9
## ME 9
## MF 4
## MG 9
## MK 9
## ML 9
## MM 9
## MN 6
## MT 9
## MX 9
## MY 9
## MZ 9
## NG 9
## NI 9
## NL 9
## NO 9
## NP 9
## NZ 9
## OM 3
## PE 9
## PH 9
## PK 9
## PL 9
## PR 9
## PS 9
## PT 8
## PY 6
## QA 9
## RO 9
## RS 9
## RU 9
## RW 9
## SA 9
## SD 9
## SE 9
## SG 9
## SI 9
## SK 9
## SM 5
## SN 9
## SU 9
## TD 9
## TH 9
## TI 9
## TJ 9
## TM 9
## TR 9
## TT 9
## TW 9
## TX 9
## UC 9
## UG 9
## US 9
## UZ 9
## VM 9
## VN 9
## XK 8
## ZA 9
This one is more difficult to interpret given the many pollutant dimensions. Perhaps the India cluster that stands separately at the bottom represents pollution dominated by a particular pollutant; the same goes for Estonia and Cyprus. The urban countries still fall into the same cluster, just like in the k=2 clustering, which classifies them as highly polluted.
To interpret the clusters better, I will check the mean of each pollutant within each cluster to identify what the cluster means.
openaq_country$kmean_cluster <- as.factor(kmean_country_k9$cluster)
cluster_profiles <- openaq_country %>%
group_by(kmean_cluster) %>%
summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
plot_data <- cluster_profiles %>%
pivot_longer(cols = -kmean_cluster, names_to = "Pollutant", values_to = "Average_Value")
ggplot(plot_data, aes(x = Pollutant, y = Average_Value, fill = kmean_cluster)) +
geom_bar(stat = "identity") +
coord_flip() +
facet_wrap(~kmean_cluster) +
theme_minimal() +
theme(legend.position = "none") +
labs(title = "Pollutant Signatures by Cluster",
x = "Pollutant Type",
y = "Mean Concentration (Scaled/Standardized)")
Now it is much easier to tell what those clusters mean. As I am not good at chemistry, I asked an AI about the meaning of combinations like “high NO2”, “high O3 and high NO2”, “high O3 and low NO2”.
Results of GPT:
| Cluster | Key Chemistry Signature | Likely Environmental Source |
|---|---|---|
| Cluster 1 | Negative/Low across all pollutants | Clean Baseline: Pristine air or very low emissions (e.g., Estonia). |
| Cluster 2 | Extreme Low \(O_3\) (Ozone) | Urban Scavenging: High local \(NO\) reacting with and “eating” \(O_3\) at the surface. |
| Cluster 3 | High \(O_3\) & Moderate \(NO_2\) | Photochemical Smog: Mix of city traffic and strong sunlight (“Smog cooking”). |
| Cluster 4 | Extreme High \(NO\) & \(NO_2\) | Heavy Traffic/Combustion: Intense vehicle density or diesel-heavy transport hubs. |
| Cluster 5 | High \(O_3\) & Low \(NO_2\) | Sunny Coastal Smog: Regions with high solar radiation but lower local traffic (e.g., Cyprus). |
| Cluster 6 | Near Zero / Balanced | Global Average: Representative of standard urban levels without a specific dominant pollutant. |
| Cluster 7 | Extreme High \(O_3\) & \(PM_{10}\) | Industrial/Arid Region: High sunlight mixed with desert dust or heavy industry (e.g., China, UAE). |
| Cluster 8 | Low \(O_3\) & Moderate \(PM_{10}\) | Winter/Heating Profile: Particulates from coal or wood burning in weak sunlight. |
| Cluster 9 | Slightly Negative/Stable | Rural/Suburban: Lower than average pollution, generally stable air quality. |
Now I will explore the same clustering task with hierarchical clustering.
For k=2 I will use Ward.D2 linkage, which minimizes the variance within clusters, keeping countries with similar pollution profiles together.
str(openaq_country)
## 'data.frame': 136 obs. of 10 variables:
## $ Country.Code : Factor w/ 137 levels "-99","AD","AE",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ O3_scaled : num -0.0991 2.1526 -0.0991 -0.0991 -0.0991 ...
## $ PM2.5_scaled : num -0.0543 -0.0543 0.0798 -0.0528 -0.0337 ...
## $ NO2_scaled : num -0.12 -0.158 -0.12 -0.12 -0.12 ...
## $ CO_scaled : num -0.0286 -0.0286 -0.0286 -0.0286 -0.0286 ...
## $ NO_scaled : num -0.0855 -0.0855 -0.0855 -0.0855 -0.0855 ...
## $ NOX_scaled : num -0.0267 -0.0267 -0.0267 -0.0267 -0.0267 ...
## $ PM10_scaled : num -0.0452 0.2358 -0.0452 -0.0452 -0.0452 ...
## $ SO2_scaled : num -0.0362 0.3214 -0.0362 -0.0362 -0.0362 ...
## $ kmean_cluster: Factor w/ 9 levels "1","2","3","4",..: 9 7 6 9 9 9 9 9 9 9 ...
library(factoextra)
fviz_nbclust(openaq_country[,-c(1,10)] , FUN = hcut, method = "gap_stat")
Just like with k-means, there are very steep jumps at k=2 and k=9.
fviz_nbclust(openaq_country[,-c(1,10)], FUN = hcut, method = "wss")
fviz_nbclust(openaq_country[,-c(1,10)], FUN = hcut, method = "silhouette")
All 3 methods again point towards k = 2. As an additional check, I will run the Duda index from the NbClust package:
# install.packages("NbClust")
library(NbClust)
duda_test <- NbClust(openaq_country[, -c(1,10)],
distance = "euclidean",
method = "ward.D2",
index = "duda",
min.nc = 2, max.nc = 2)
# See the result (Look for the "Best number of clusters")
duda_test$Best.nc
## Number_clusters Value_Index
## 2.0000 0.4822
All the methods above suggest k = 2, just like for k-means. For the sake of the analysis I also want to try 9 (to compare it with k-means later, and also because the gap and silhouette statistics strongly hint at it).
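For intuition, eclust() with FUNcluster = "hclust" and hc_method = "ward.D2" (used next) roughly wraps this base-R pipeline; a sketch:

# Ward.D2 agglomeration on Euclidean distances, plus a dendrogram
d <- dist(openaq_country[, -c(1, 10)], method = "euclidean")
hc <- hclust(d, method = "ward.D2")
plot(hc, cex = 0.5, main = "Ward.D2 dendrogram of countries")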
rownames(openaq_country) <- openaq_country$Country.Code # label rows with country codes so the cluster assignments are readable
hc_clusters_k2 <- eclust(openaq_country[,-c(1,10)], FUNcluster = "hclust", k = 2, hc_method = "ward.D2", graph = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fviz_cluster(hc_clusters_k2,
geom = "text",
repel = TRUE,
show.clust.cent = FALSE,
main = "Hierarchical Clustering with 2 clusters")
The results are very similar to the k-means clustering; the split appears to be between the very industrial countries and the less affected countries.
Now let’s run this method with 9 clusters.
rownames(openaq_country) <- openaq_country$Country.Code # label rows with country codes so the cluster assignments are readable
hc_clusters_k9 <- eclust(openaq_country[,-c(1,10)], FUNcluster = "hclust", k = 9, hc_method = "ward.D2", graph = TRUE)
fviz_cluster(hc_clusters_k9,
geom = "text",
# repel = TRUE,
show.clust.cent = FALSE,
main = "Hierarchical Clustering with 9 clusters")
I will not do a deep dive into the countries within each cluster; however, from the graph above we can clearly see that the results are very similar to those of the k-means clustering.
For DBSCAN I first need to find a suitable value for epsilon, the radius that determines whether individual points are close enough to be assigned to the same cluster.
library(dbscan)
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
kNNdistplot(openaq_country[,-c(1,10)], k = 2)
In this graph the line curves up at around 0.15, but I will increase my eps parameter to 0.20 to make the algorithm less strict when forming clusters.
It is also important to choose the minimum number of points required to form a cluster. In the context of this project one could argue for a single point, since there could be exactly one country in the world that is exceptionally clean or dirty; however, with minPts = 1 every point becomes its own cluster, which is not what I am trying to achieve.
Following the DBSCAN Wikipedia article (https://en.wikipedia.org/wiki/DBSCAN), the minimum I would choose is 3. However, the article also notes that the choice depends on the number of dimensions, and as a rule of thumb we can use minPts = 2·dim; in my case 2 × 8 dimensions = 16.
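One caveat: the k in kNNdistplot() is the number of nearest neighbours considered, and a common pairing is k = minPts - 1. With minPts = 16, the 15-NN curve may therefore point to a different eps than the 2-NN curve above; a sketch to compare, with my eps = 0.20 choice drawn for visual reference:

kNNdistplot(openaq_country[, -c(1, 10)], k = 15) # k = minPts - 1
abline(h = 0.20, lty = 2) # candidate eps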
dbscan_country <- dbscan(openaq_country[,-c(1,10)], eps = 0.20, minPts = 16)
dbscan_country$cluster
## [1] 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1
## [38] 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
rownames(openaq_country) <- openaq_country$Country.Code
fviz_cluster(dbscan_country, data = openaq_country[,-c(1,10)],
geom = "text", repel = TRUE, main = "DBSCAN Clustering")
Interesting results. My interpretation is that most countries in the world are very similar to each other when it comes to air quality, so they were assigned to one “average air quality” cluster. However, some countries have either much better air quality (like EE, XK, CH, PT) or much worse air quality (like IN, CN, AE, OM), which is why they are not part of the “average air quality” cluster.
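To read the noise points off programmatically rather than from the plot (a sketch; the dbscan package labels noise as cluster 0):

rownames(openaq_country)[dbscan_country$cluster == 0]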
Repeating the procedure with k = 9 in the kNN distance plot (note that this k is the number of nearest neighbours considered, not a number of clusters), the elbow suggests eps ≈ 0.40:
library(dbscan)
kNNdistplot(openaq_country[,-c(1,10)], k = 9)
dbscan_country <- dbscan(openaq_country[,-c(1,10)], eps = 0.40, minPts = 16)
dbscan_country$cluster
## [1] 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1
## [38] 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
rownames(openaq_country) <- openaq_country$Country.Code
fviz_cluster(dbscan_country, data = openaq_country[,-c(1,10)],
geom = "text", repel = TRUE, main = "DBSCAN Clustering")
So with eps = 0.40 the results are very similar to eps = 0.20.
Now that I have results from the 3 main methods we covered during this course (k-means, DBSCAN, hierarchical clustering), I can compare them and see how much they agree with each other. If all of them agree, the clustering result is reliable; the comparison also shows how much a given result is an artifact of the particular algorithm used.
To do this I will use 2 approaches we went through during the course: ARI for comparing the clusterings and Jaccard similarity for checking the robustness of each.
library(mclust)
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
library(fpc)
##
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
##
## dbscan
print(adjustedRandIndex(kmean_country_k2$cluster, hc_clusters_k2$cluster))
## [1] 1
The score is 1, which indicates that both algorithms fully agree: the partitions are identical regardless of the algorithm used, exactly what I observed above from the cluster assignment plots. (ARI is 1 for identical partitions and near 0 for random agreement.)
# install.packages("fossil")
library(fossil)
## Loading required package: sp
## Loading required package: maps
##
## Attaching package: 'maps'
## The following object is masked from 'package:mclust':
##
## map
## Loading required package: shapefiles
## Loading required package: foreign
##
## Attaching package: 'shapefiles'
## The following objects are masked from 'package:foreign':
##
## read.dbf, write.dbf
print(
data.frame(
Pair = c("KM vs HC", "KM vs DBSCAN", "HC vs DBSCAN"),
ARI = c(adjustedRandIndex(kmean_country_k2$cluster, hc_clusters_k2$cluster), adjustedRandIndex(kmean_country_k2$cluster, dbscan_country$cluster), adjustedRandIndex(hc_clusters_k2$cluster, dbscan_country$cluster)),
Jaccard = c(jaccard(kmean_country_k2$cluster, hc_clusters_k2$cluster), jaccard(kmean_country_k2$cluster, dbscan_country$cluster), jaccard(hc_clusters_k2$cluster, dbscan_country$cluster))
)
)
## Pair ARI Jaccard
## 1 KM vs HC 1.0000000 1.0000000
## 2 KM vs DBSCAN 0.7691994 0.9338235
## 3 HC vs DBSCAN 0.7691994 0.9338235
The clusterings are very close to each other in both ARI and Jaccard, which again supports the split of the countries for k = 2. That said, the jaccard() function from the fossil library is arguably not the most reliable here, and I would rather write my own function; in this case, though, the supporting ARI metric clearly shows that the clusters are indeed similar.
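Such a function could look like the following: a sketch of a pair-counting Jaccard index, which, unlike a direct comparison of labels, is invariant to how the cluster IDs are numbered:

# Pair-counting Jaccard between two partitions: J = a / (a + b + c), where
# over all unordered point pairs, a = pairs grouped together in both
# partitions, b = together only in the first, c = together only in the second
pair_jaccard <- function(labels1, labels2) {
  same1 <- outer(labels1, labels1, "==")
  same2 <- outer(labels2, labels2, "==")
  up <- upper.tri(same1) # each unordered pair counted once
  a <- sum(same1[up] & same2[up])
  b <- sum(same1[up] & !same2[up])
  cc <- sum(!same1[up] & same2[up])
  a / (a + b + cc)
}

pair_jaccard(kmean_country_k2$cluster, hc_clusters_k2$cluster)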
print(
data.frame(
Pair = c("KM vs HC", "KM vs DBSCAN", "HC vs DBSCAN"),
ARI = c(adjustedRandIndex(kmean_country_k9$cluster, hc_clusters_k9$cluster), adjustedRandIndex(kmean_country_k9$cluster, dbscan_country$cluster), adjustedRandIndex(hc_clusters_k9$cluster, dbscan_country$cluster)),
Jaccard = c(jaccard(kmean_country_k9$cluster, hc_clusters_k9$cluster), jaccard(kmean_country_k9$cluster, dbscan_country$cluster), jaccard(hc_clusters_k9$cluster, dbscan_country$cluster))
)
)
## Pair ARI Jaccard
## 1 KM vs HC 0.6573443 1.0000000
## 2 KM vs DBSCAN 0.5113288 0.9338235
## 3 HC vs DBSCAN 0.8239581 0.9338235
The low scores for DBSCAN are as expected, since DBSCAN ended up creating only 1 cluster plus the points that belong to no cluster. However, I am surprised by the ARI for KM vs HC; it is a bit lower than I expected, but 0.66 is still a good result showing that the clusters are fairly similar.
The Jaccard index in this part is not reliable at all, so I would not use it in my next project, at least not from the fossil library.