Group 8 Assessment

Let’s start by loading the libraries and setting the working directory for the analysis.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(ggpubr)
library(texreg)
## Version:  1.39.3
## Date:     2023-11-09
## Author:   Philip Leifeld (University of Essex)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
## 
## Attaching package: 'texreg'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(leaflet)
library(RColorBrewer)

Now, let’s read the data.

expected <- read.csv(file = "Data/expected_counts.csv")
respiratory <- read.csv(file = "Data/respiratory_admissions.csv")
shape <- read_sf("SG_IntermediateZoneBdry_2001/")

# Rename the area ID variables so they match across the datasets
expected <- expected %>% 
  rename(IZ_CODE = code)

respiratory <- respiratory %>% 
  rename(IZ_CODE = IG)

# Merge datasets 
total <- left_join(expected, respiratory, by = "IZ_CODE")
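
A quick sanity check on the merge (a sketch, not in the original run): any areas in the expected counts that failed to match the respiratory data would be listed here.

# Expected-count areas with no matching respiratory record
anti_join(expected, respiratory, by = "IZ_CODE")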

Task 1

A) Create the combined shape and data file (the count datasets were merged above; the merge with the shapefile follows in part C),

B) calculate the Standardised Mortality Ratio (SMR), i.e. the ratio of observed to expected counts, for each year separately,

total$SMR2009 <- total$Y2009 / total$E2009
total$SMR2010 <- total$Y2010 / total$E2010
head(total)
##     IZ_CODE     E2009     E2010 Y2009 Y2010   SMR2009   SMR2010
## 1 S02000260  99.51566 106.31486   111   107 1.1154023 1.0064445
## 2 S02000261  47.77799  50.86723    26    23 0.5441836 0.4521575
## 3 S02000262  95.01387 104.46578    46    53 0.4841399 0.5073431
## 4 S02000263  81.93079  90.20686    32    40 0.3905736 0.4434253
## 5 S02000264 128.91529 139.87310    52    60 0.4033657 0.4289603
## 6 S02000265  58.21802  63.97093    29    25 0.4981276 0.3908025

C) calculate summary statistics and map the risks for the two years in separate maps.

library(tibble)
library(dplyr)

result_table <- total %>%
  summarise(
    meanSMR2009 = mean(SMR2009),
    meanSMR2010 = mean(SMR2010),
    medianSMR2009 = median(SMR2009), 
    medianSMR2010 = median(SMR2010),
    correlation = cor(SMR2009, SMR2010), 
    CV2009 = sd(SMR2009) / mean(SMR2009), 
    CV2010 = sd(SMR2010) / mean(SMR2010), 
    change = mean((SMR2010 - SMR2009) / SMR2009), 
    max09 = max(SMR2009), 
    max10 = max(SMR2010),
    min09 = min(SMR2009), 
    min10 = min(SMR2010)
  )

# Print the result table
print(as_tibble(result_table))
## # A tibble: 1 × 12
##   meanSMR2009 meanSMR2010 medianSMR2009 medianSMR2010 correlation CV2009 CV2010
##         <dbl>       <dbl>         <dbl>          <dbl>       <dbl>  <dbl>  <dbl>
## 1       0.881       0.828         0.866          0.791       0.853  0.376  0.370
## # ℹ 5 more variables: change <dbl>, max09 <dbl>, max10 <dbl>, min09 <dbl>,
## #   min10 <dbl>
# Brief visualisation of the SMR densities and correlations
total_long <- pivot_longer(total,
                           cols = c(SMR2009, SMR2010),
                           names_to = "Year",
                           values_to = "Value")

# Create a density plot using facet_wrap
ggplot(total_long,
       aes(x = Value, fill = Year)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~ Year, scales = "free")

# The two SMR densities are similar in shape
# and the SMRs for the two years are highly correlated
cor(total$SMR2009, total$SMR2010)
## [1] 0.8534824
# Merge spatial data
sf.data <- merge(shape, total, all.x = FALSE, by = "IZ_CODE")
head(sf.data, 3)
## Simple feature collection with 3 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 260667 ymin: 669235 xmax: 263842 ymax: 671065
## Projected CRS: OSGB36 / British National Grid
##     IZ_CODE       IZ_NAME STDAREA_HA Shape_Leng Shape_Area    E2009     E2010
## 1 S02000260    Auchinairn   112.2925   7751.798    1107841 99.51566 106.31486
## 2 S02000261 Woodhill East   111.5763   6464.223    1104265 47.77799  50.86723
## 3 S02000262 Woodhill West   107.2337   7316.999    1065622 95.01387 104.46578
##   Y2009 Y2010   SMR2009   SMR2010                       geometry
## 1   111   107 1.1154023 1.0064445 MULTIPOLYGON (((260718 6698...
## 2    26    23 0.5441836 0.4521575 MULTIPOLYGON (((262047 6699...
## 3    46    53 0.4841399 0.5073431 MULTIPOLYGON (((261158.7 67...
plot(sf.data$geometry)

D) Comment on the summaries and any spatial patterns of interest you see in the maps.

Conclusions from the summary table:

  • The mean and median SMR decreased between 2009 and 2010.

  • The variability, as measured by the coefficient of variation, is similar between the two years.

  • As with the means and medians, the minimum and maximum values decreased.

E) Check for spatial autocorrelation in each of the years.

# Calculate Spatial Correlation
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
W.nb <- poly2nb(sf.data, row.names = rownames(sf.data))
W <- nb2mat(W.nb, style = "B")
#Look at the summary of the neighbourhood connections
summary(W.nb)
## Neighbour list object:
## Number of regions: 271 
## Number of nonzero links: 1424 
## Percentage nonzero weights: 1.938971 
## Average number of links: 5.254613 
## 2 disjoint connected subgraphs
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 11 14 15 20 
##  4 15 30 51 62 50 32 12  5  5  2  1  1  1 
## 4 least connected regions:
## 202 227 239 265 with 1 link
## 1 most connected region:
## 234 with 20 links
W.list <- nb2listw(W.nb, style = "B")

# Moran Test for Spatial Correlation
# Null Hypothesis in the test: Observations are spatially independent 
moran.mc(x = sf.data$SMR2009, listw = W.list, nsim = 10000)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  sf.data$SMR2009 
## weights: W.list  
## number of simulations + 1: 10001 
## 
## statistic = 0.39019, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
# statistic = 0.39019
# p-value = 9.999e-05
moran.mc(x = sf.data$SMR2010, listw = W.list, nsim = 10000)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  sf.data$SMR2010 
## weights: W.list  
## number of simulations + 1: 10001 
## 
## statistic = 0.43235, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
# statistic = 0.43235
# p-value = 9.999e-05

# Conclusion: we reject the null hypothesis of spatial independence,
# so a spatial model is justified

# Visualize data 
sp.dat <- as_Spatial(sf.data)
library(broom)
sp.dat@data$id <- rownames(sp.dat@data)
temp1 <- tidy(sp.dat)
## Warning: `tidy.SpatialPolygonsDataFrame()` was deprecated in broom 1.0.4.
## ℹ Please use functions from the sf package, namely `sf::st_as_sf()`, in favor
##   of sp tidiers.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Regions defined for each Polygons
sp.dat2 <- merge(temp1, sp.dat@data, by = "id")
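
As an aside, tidy() on sp objects is deprecated (see the warning above); a minimal modern alternative, sketched here rather than used in the analysis, maps straight from the sf object with geom_sf() and needs no sp conversion:

# Sketch: plot the 2009 SMR directly from the sf object
ggplot(sf.data) +
  geom_sf(aes(fill = SMR2009)) +
  scale_fill_gradientn(colors = brewer.pal(n = 9, name = "YlOrRd")) +
  labs(fill = "SMR", title = "2009")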

# Visualisation for 2009 
visu2009 <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = SMR2009)) +
  geom_polygon() +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "SMR", title = "2009") +
  scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))

# Visualisation for 2010 
visu2010 <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = SMR2010)) +
  geom_polygon() +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "SMR", title = "2010") +
  scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))



combined_smr <- ggarrange(visu2009, visu2010,
                          ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom") 

annotate_figure(combined_smr, top = text_grob("SMR for respiratory hospitalisation", 
                                              color = "black", face = "bold", size = 16))

We observe that:

  • The SMR takes larger values around the river, most likely in the city centre
  • Between 2009 and 2010, the SMR increased outside the centre, for instance in the north-western part of the city

Task 2

A) Run a separate Poisson Leroux CAR model on each year for the respiratory counts using only an intercept term (no covariates) and expected count offset.

  • We use a Poisson model because respiratory admissions are counts and the events are relatively rare.

  • Unlike the ICAR model, the Leroux CAR model includes a parameter that measures the strength of spatial dependence implied by the neighbourhood matrix (the model is sketched below).
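
For reference, a sketch of the model fitted by S.CARleroux, with $Y_i$ and $E_i$ the observed and expected counts in area $i$ and $w_{ij}$ the entries of the neighbourhood matrix $W$:

$$Y_i \sim \text{Poisson}(E_i \theta_i), \qquad \log \theta_i = \beta_0 + \phi_i,$$

$$\phi_i \mid \boldsymbol{\phi}_{-i} \sim \text{N}\!\left(\frac{\rho \sum_j w_{ij} \phi_j}{\rho \sum_j w_{ij} + 1 - \rho},\; \frac{\tau^2}{\rho \sum_j w_{ij} + 1 - \rho}\right),$$

so $\rho = 0$ corresponds to spatially independent random effects and $\rho = 1$ recovers the ICAR model.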

library(CARBayes)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: Rcpp
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated
formula2009 <- Y2009 ~  offset(log(E2009))
formula2010 <- Y2010 ~  offset(log(E2010))

model1_09 <- glm(formula = formula2009, family = "poisson", data = sp.dat@data)
summary(model1_09)
## 
## Call:
## glm(formula = formula2009, family = "poisson", data = sp.dat@data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.134207   0.006874  -19.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2933.1  on 270  degrees of freedom
## Residual deviance: 2933.1  on 270  degrees of freedom
## AIC: 4588
## 
## Number of Fisher Scoring iterations: 4
# CAR MODEL 
model09 <- S.CARleroux(formula = formula2009, family = "poisson", data = sf.data, W = W,
                       burnin = 10000, n.sample = 100000, thin = 10, verbose=FALSE)
print(model09)
## 
## #################
## #### Model fitted
## #################
## Likelihood model - Poisson (log link function) 
## Random effects model - Leroux CAR
## Regression equation - Y2009 ~ offset(log(E2009))
## 
## 
## #################
## #### MCMC details
## #################
## Total number of post burnin and thinned MCMC samples generated - 9000
## Number of MCMC chains used - 1
## Length of the burnin period used for each chain - 10000
## Amount of thinning used - 10
## 
## ############
## #### Results
## ############
## Posterior quantities and DIC
## 
##                Mean    2.5%   97.5% n.effective Geweke.diag
## (Intercept) -0.1936 -0.2100 -0.1773      9000.0        -0.8
## tau2         0.3120  0.2339  0.4032      7119.5         1.8
## rho          0.6459  0.4152  0.8734      5945.6         1.8
## 
## DIC =  2160.271       p.d =  232.3898       LMPL =  -1166.66
summary(model09$samples)
##        Length  Class Mode   
## beta      9000 mcmc  numeric
## phi    2439000 mcmc  numeric
## rho       9000 mcmc  numeric
## tau2      9000 mcmc  numeric
## fitted 2439000 mcmc  numeric
## Y            1 mcmc  logical

B) Produce some summaries examining convergence of the chain and comment on them.

# Before switching to inference, we assess convergence of the MCMC chain
plot(model09$samples$rho)

  • Traceplot: we see no trend, so we assume convergence has been achieved (numeric checks are sketched below)

  • Density: the posterior distribution of the parameter is approximately normal, centred on the posterior mean

  • rho: 0.6459 - moderate-to-strong spatial dependence
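
Beyond the traceplot, numeric diagnostics can back up the visual check; a sketch using the coda package (the CARBayes samples are already mcmc objects):

# Effective sample size: how many independent draws the chain is worth
coda::effectiveSize(model09$samples$rho)
# Geweke diagnostic: compares early vs late parts of the chain;
# |z| < 2 is consistent with convergence
coda::geweke.diag(model09$samples$rho)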

moran.mc(x = residuals(model09, type="pearson"), listw = W.list, nsim = 10000)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  residuals(model09, type = "pearson") 
## weights: W.list  
## number of simulations + 1: 10001 
## 
## statistic = -0.071755, observed rank = 270, p-value = 0.973
## alternative hypothesis: greater
  • The Moran test on the Pearson residuals of the CAR model shows that no spatial correlation is left after fitting, suggesting a good fit
model10 <- S.CARleroux(formula=formula2010, family="poisson", data=sf.data, W=W,
                       burnin=10000, n.sample=100000, thin=10, verbose=FALSE)
print(model10)
## 
## #################
## #### Model fitted
## #################
## Likelihood model - Poisson (log link function) 
## Random effects model - Leroux CAR
## Regression equation - Y2010 ~ offset(log(E2010))
## 
## 
## #################
## #### MCMC details
## #################
## Total number of post burnin and thinned MCMC samples generated - 9000
## Number of MCMC chains used - 1
## Length of the burnin period used for each chain - 10000
## Amount of thinning used - 10
## 
## ############
## #### Results
## ############
## Posterior quantities and DIC
## 
##                Mean    2.5%   97.5% n.effective Geweke.diag
## (Intercept) -0.2539 -0.2703 -0.2371      8963.8         1.0
## tau2         0.2963  0.2295  0.3745      8091.6         0.2
## rho          0.7346  0.5115  0.9286      6626.0         0.1
## 
## DIC =  2159.947       p.d =  228.1328       LMPL =  -1152.61
summary(model10$samples)
##        Length  Class Mode   
## beta      9000 mcmc  numeric
## phi    2439000 mcmc  numeric
## rho       9000 mcmc  numeric
## tau2      9000 mcmc  numeric
## fitted 2439000 mcmc  numeric
## Y            1 mcmc  logical
plot(model10$samples$rho) 

  • Traceplot: we see no trend, so we assume convergence has been achieved (the same coda checks as for 2009 apply)

  • Density: slightly skewed, yet approximately normal

  • rho: 0.7346 - strong spatial dependence

moran.mc(x = residuals(model10, type="pearson"), listw = W.list, nsim = 10000)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  residuals(model10, type = "pearson") 
## weights: W.list  
## number of simulations + 1: 10001 
## 
## statistic = -0.036341, observed rank = 1916, p-value = 0.8084
## alternative hypothesis: greater
  • The Moran test on the Pearson residuals again shows no remaining spatial correlation after fitting, suggesting a good fit

  • As our CAR models did not involve covariates, we do not interpret regression parameters

  • However, we observe strong spatial correlation, captured by the random effects

C) Plot the fitted values on two maps and comment on them.

# Plot the fitted values
sp.dat@data$fitted2009 <- model09$fitted.values 
sp.dat@data$fitted2010 <- model10$fitted.values 

sp.dat@data$id <- rownames(sp.dat@data)
temp1<-tidy(sp.dat)
## Regions defined for each Polygons
sp.dat2 <- merge(temp1, sp.dat@data, by = "id")

visu2009_fitted <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = fitted2009)) +
 geom_polygon() +
 coord_equal() +
 xlab("Easting (m)") +
 ylab("Northing (m)") +
 labs(fill = "Fitted number", title = "2009")+
 scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))

visu2009_real <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = Y2009)) +
  geom_polygon() +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "Observed number", title = "2009")+
  scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))

visu2010_fitted <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = fitted2010)) +
  geom_polygon() +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "Fitted number", title = "2010")+
  scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))

visu2010_real <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = Y2010)) +
  geom_polygon() +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "Observed number", title = "2010")+
  scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))

combined_fitted <- ggarrange(visu2009_fitted, visu2010_fitted,
                          ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom") 
annotate_figure(combined_fitted, top = text_grob("Predicted number of respiratory cases", 
                                              color = "black", face = "bold", size = 16))

  • The maps reveal similar patterns across the Intermediate Zones

  • In 2010, the predicted respiratory cases have increased outside of the centre

  • Based on the Glasgow map, this corresponds to the north-western part of the city

  • At the same time, they have slightly decreased in the southern part of the city

# Comparing the fitted values to the observed data
compare_2009 <- ggarrange(visu2009_fitted, visu2009_real,
                          ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")
compare_2009

compare_2010 <- ggarrange(visu2010_fitted, visu2010_real,
                          ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")
compare_2010

  • To aid interpretation, we also plot the PEP

    • PEP - posterior exceedance probability

    • the posterior probability that an area’s risk exceeds the national average (risk > 1)

# Number of retained (post burn-in, thinned) MCMC samples
m1 <- nrow(model09$samples$fitted)
m2 <- nrow(model10$samples$fitted)


# Risk is calculated as the ratio of the fitted value to the expected
# number of respiratory cases
# Interpretation: the larger the risk, the more the modelled number of
# cases exceeds the expected number
#### Risk 2009 and 2010
sp.dat@data$risk2009 <- model09$fitted.values / sp.dat@data$E2009
sp.dat@data$risk2010 <- model10$fitted.values / sp.dat@data$E2010

risk09 <- model09$samples$fitted / matrix(rep(sp.dat@data$E2009, m1), nrow = m1, byrow = TRUE)
risk10 <- model10$samples$fitted / matrix(rep(sp.dat@data$E2010, m2), nrow = m2, byrow = TRUE)

# For each area, the proportion of posterior samples whose risk exceeds
# the given threshold (here 1, the national average)
summarise.exceedances <- function(samples, exceedances)
{
  exceed.vec <- apply((samples > exceedances), 2, sum) / nrow(samples)
  return(exceed.vec)
}

sp.dat@data$PEP09 <- as.numeric(summarise.exceedances(risk09, 1))
sp.dat@data$PEP10 <- as.numeric(summarise.exceedances(risk10, 1))
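
As a quick sanity check (a sketch, not in the original run), the PEPs should lie between 0 and 1, with values near 1 flagging areas whose risk almost certainly exceeds the national average:

summary(sp.dat@data$PEP09)
summary(sp.dat@data$PEP10)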

sp.dat@data$id <- rownames(sp.dat@data)
temp1 <- tidy(sp.dat)
## Regions defined for each Polygons
sp.dat2 <- merge(temp1, sp.dat@data, by = "id")

# Visualisation for 2009 

visu2009_PEP <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = PEP09)) +
  geom_polygon() +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "PEP", title = "2009")+
  scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))



# Visualisation for 2010 
visu2010_PEP <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = PEP10)) +
  geom_polygon() +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "PEP", title = "2010") +
  scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))
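
The two PEP maps can then be displayed side by side in the same way as the earlier SMR maps; a sketch reusing the ggarrange pattern from above:

combined_PEP <- ggarrange(visu2009_PEP, visu2010_PEP,
                          ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")
annotate_figure(combined_PEP, top = text_grob("Posterior exceedance probability (risk > 1)",
                                              color = "black", face = "bold", size = 16))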

Task 3

A) Extract the mean fitted values (after burn-in) for the areas’ risks in each year.

risk_matrix <- cbind(sp.dat@data$risk2009, sp.dat@data$risk2010)
colnames(risk_matrix) <- c("risk2009", "risk2010")
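
Equivalently (a sketch, not in the original run), the posterior mean risks can be recovered directly from the post burn-in samples; the column means of the risk matrices defined earlier should match risk2009 and risk2010:

# Posterior mean risk per area from the MCMC samples
mean_risk09 <- colMeans(risk09)
mean_risk10 <- colMeans(risk10)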

B) Run a bivariate mixture model clustering method on the fitted values for both years (i.e. cluster both years’ fitted values at the same time, don’t run a separate clustering on each year).

# Fit a Gaussian Mixture Model
library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
## 
##     map
gmm_model <- Mclust(risk_matrix, G = 1:9)  # G = candidate numbers of clusters; the best model is chosen by BIC
gmm_model
## 'Mclust' model object: (VEE,2) 
## 
## Available components: 
##  [1] "call"           "data"           "modelName"      "n"             
##  [5] "d"              "G"              "BIC"            "loglik"        
##  [9] "df"             "bic"            "icl"            "hypvol"        
## [13] "parameters"     "z"              "classification" "uncertainty"

BIC

plot(gmm_model, what = "BIC")

classification

plot(gmm_model, what = "classification")

uncertainty

plot(gmm_model, what = "uncertainty")

density

plot(gmm_model, what = "density")

table(gmm_model$class)
## 
##   1   2 
## 180  91

C) Comment on your result in terms of number of clusters chosen, the component means.

Cluster 1 contains 180 observations and cluster 2 contains 91 observations.

1. BIC: We compare the BIC across the covariance parameterisations (“EII”, “VII”, “EEI”, “VEI”, “EVI”, “VVI”, and so on). Based on these, 2 clusters appears to be the optimal number, which agrees with the model selected by the algorithm (VEE, 2).

2. Classification: Based on the classification plot alone, the number of clusters is not obvious. The cluster with lower risk in both years (up to about 0.8) is easier to distinguish than cluster two, which includes a few outliers. Had we only this plot, it would be hard to decide on the number of clusters.

3. Uncertainty: Uncertainty is highest for observations near the boundary between the two clusters, where the components overlap.

4. Density: The low-risk cluster has more densely packed observations, which supports it being a robust cluster. Clustering aims to minimise distances within clusters while maximising the separation between them, so the outliers assigned to the higher-risk cluster could affect the result. Nevertheless, the overall “line shape” of the observations remains straightforward to interpret: the risks are clearly correlated across years, and the outliers consistently show proportionally higher risk in both years.

In conclusion, the BIC score is the most helpful for selecting the number of clusters. We end up with two clusters: one with lower mean risk across 2009 and 2010, and one with higher mean risk in both years.

Component means

component_means <- gmm_model$parameters$mean
colnames(component_means) <- c("Cluster1", "Cluster2")
component_means
##           Cluster1  Cluster2
## risk2009 1.0160637 0.5822874
## risk2010 0.9462084 0.5648059

The ‘component_means’ matrix shows the mean risk values of the two clusters (Cluster1 and Cluster2) for each year. Cluster1 exhibits substantially higher average risks than Cluster2, pointing to spatial differences in respiratory disease prevalence.

Extract cluster membership and add it to our data.

sp.dat@data$Cluster <- gmm_model$classification

temp2 <- data.frame(id = sp.dat@data$id, Cluster = sp.dat@data$Cluster)
sp.dat2 <- merge(sp.dat2, temp2, by="id")

D) Map the cluster membership (i.e. one colour for each cluster)

clusters_vis <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = as.factor(Cluster))) +
  geom_polygon() +
  scale_fill_manual(values = c("#000033","#0099FF"),
                    name = "Risk Levels",  
                    labels = c("Higher Mean Risk Across Years", "Lower Mean Risk Across Years")) +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "Cluster", title = "Clusters for risk of respiratory disease across Glasgow for 2009 and 2010") +
  theme(legend.title = element_text(size = 8),
        plot.title = element_text(size = 13),
        legend.text = element_text(size = 7),
        legend.key.width = unit(0.3, "cm"),  
        legend.key.height = unit(0.3, "cm"))
clusters_vis

Task 4

Run another clustering algorithm on the data and use an appropriate measure to compare the two clustering results.

library(ClusterR)

Since this dataset does not suffer from severe outliers, we can use k-means.

Optimal number of clusters according to Silhouette Score:

opt_n_of_clust <- Optimal_Clusters_KMeans(risk_matrix, max_clusters=10, 
                                   plot_clusters=TRUE, criterion="silhouette")

It seems that two clusters are optimal.

Optimal number of clusters according to elbow method:

opt_n_of_clust<-Optimal_Clusters_KMeans(risk_matrix, max_clusters=10, 
                                   plot_clusters = TRUE)

Here, too, two clusters perform well.

kmeans_result <- kmeans(risk_matrix, centers = 2, nstart = 50)
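
The k-means centres can be compared directly with the GMM component means reported above; a quick sketch:

# Cluster centres: mean risk per year within each k-means cluster
kmeans_result$centers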

Extract the cluster membership and add it to our data:

sp.dat@data$Cluster_kmeans <- kmeans_result$cluster
temp3 <- data.frame(id = sp.dat@data$id, Cluster_kmeans = sp.dat@data$Cluster_kmeans)
sp.dat2 <- merge(sp.dat2, temp3, by="id")
clusters_kmeans_vis <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = as.factor(Cluster_kmeans))) +
  geom_polygon() +
  scale_fill_manual(values = c("#0099FF","#000033"),
                    name = "Risk Levels",  
                    labels = c("Higher Mean Risk Across Years", "Lower Mean Risk Across Years")) +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "Cluster kmeans", title = "Clustering method using kmeans") +
  theme(legend.title = element_text(size = 8),
        plot.title = element_text(size = 13),
        legend.text = element_text(size = 7),
        legend.key.width = unit(0.3, "cm"),  
        legend.key.height = unit(0.3, "cm"))
clusters_kmeans_vis

Note that the colours in this plot are reversed: k-means numbered its clusters in the opposite order to gmm_model. We fix that below:

diff_analysis <- as.data.frame(kmeans_result$cluster)
colnames(diff_analysis) <- c("kmeans_result")

# Swap labels 1 and 2 so the k-means numbering matches the GMM numbering
diff_analysis$kmeans_result <- ifelse(
  diff_analysis$kmeans_result == 1, 2, 1
)

Differences analysis

diff_analysis[,2] <- gmm_model$classification
colnames(diff_analysis) <- c("kmeans_result","gmm_result")

# Agreement coding: areas where the methods agree keep their cluster
# number (1 or 2); disagreements become -1 (kmeans HR, gmm LR) or
# -2 (kmeans LR, gmm HR)
diff_analysis$diff <- ifelse(
  diff_analysis$kmeans_result != diff_analysis$gmm_result, 
  ifelse(diff_analysis$kmeans_result - diff_analysis$gmm_result == -1, -1, -2),
  diff_analysis$kmeans_result
)
unique(diff_analysis$diff)
## [1]  1  2 -2

Abbreviations: LR = low risk (cluster 2), HR = high risk (cluster 1).

There is no area that gmm_model classified as LR (2) while kmeans classified it as HR (1): the value -1 never occurs. This means the gmm_model approach is the more conservative one, and when assessing risk the conservative side is usually preferred.
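
For a single numeric agreement measure (a sketch, not in the original analysis), the Adjusted Rand Index from mclust compares the two partitions directly; it is invariant to label swapping, so the relabelling above does not change it:

# ARI: 1 = identical partitions, values near 0 = agreement no better than chance
adjustedRandIndex(diff_analysis$kmeans_result, diff_analysis$gmm_result)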

sp.dat@data$diff_cluster <- diff_analysis$diff
temp4 <- data.frame(id = sp.dat@data$id, diff_cluster = sp.dat@data$diff_cluster)
sp.dat2 <- merge(sp.dat2, temp4, by="id")
clusters_diff_vis <- ggplot(data = sp.dat2, aes(x=long, y=lat, group=group, fill = as.factor(diff_cluster))) +
  geom_polygon() +
  scale_fill_manual(values = c("#FF6966","#000033", "#0099FF"),
                    name = "Risk Levels",  
                    labels = c("LR for kmeans, HR for gmm", "HR cluster both", "LR cluster both")) +
  coord_equal() +
  xlab("Easting (m)") +
  ylab("Northing (m)") +
  labs(fill = "Clusters", title = "Comparison of two clustering methods") +
  theme(legend.title = element_text(size = 8),
        plot.title = element_text(size = 13),
        legend.text = element_text(size = 7),
        legend.key.width = unit(0.3, "cm"),  
        legend.key.height = unit(0.3, "cm"))
clusters_diff_vis