Spatial Clustering

load the necessary packages

#set the directory
setwd("/Users/zihualai/Desktop/DSBA2/2nd intensive/Group4 Assessment")
library(lattice)
library(sf)

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

library(spData)

## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`

library(spdep)
library(MASS)
library(Rcpp)
library(CARBayes)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(coda)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tmap) # for mapping

## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')

library(sp)
library(cluster)
library(dplyr)
library(ggplot2)
library(tidyr)
library(mclust)

## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.

library(stats)

I combine files and calculate the SMR, autocorrelation. Examine the expected counts and observed counts in the 2years with these 2csv files

# Read the CSV files
expected_counts <- read.csv("Data/expected_counts.csv")
observed_counts <- read.csv("Data/respiratory_admissions.csv")
df1 <- rename(observed_counts, IZ_CODE = IG)
df2 <- rename(expected_counts, IZ_CODE = code)
# Merge the CSV data first
dat<- merge(df1, df2, by = "IZ_CODE", all = TRUE)
# Calculate the SMR for each year
dat$SMR_2005 <- dat$Y2005 / dat$E2005
dat$SMR_2006 <- dat$Y2006 / dat$E2006
# Summary statistics of the SMR for 2005 and 2006
summary(dat$SMR_2005)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3111  0.6196  0.8366  0.8640  1.0631  1.9863

summary(dat$SMR_2006)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2645  0.5591  0.7800  0.8118  1.0045  1.6044

# construct a spatial object
# Read the shapefile
shape<- st_read("SG_IntermediateZoneBdry_2001")

## Reading layer `SG_IntermediateZone_Bdry_2001' from data source 
##   `/Users/zihualai/Desktop/DSBA2/2nd intensive/Group4 Assessment/SG_IntermediateZoneBdry_2001' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1235 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 54455.17 ymin: 530297 xmax: 469467.7 ymax: 1218558
## Projected CRS: OSGB36 / British National Grid

sf.data <- merge(shape, dat, all.x=FALSE, by.y="IZ_CODE")


#Transforming to sp object
sp.dat<-as_Spatial(sf.data)

#Getting neighbourhood matrix W
W.nb <- poly2nb(sf.data$geometry, row.names = rownames(sp.dat))
W <- nb2mat(W.nb, style = "B")
W.list <- nb2listw(W.nb, style = "B")

Run a separate Poisson Leroux CAR model on each year for the respiratory counts using only an intercept term (no covariates) and expected count offset.Produce some summaries examining convergence of the chain and comment on them. Plot the fitted values on two maps and comment on them.

#Setting the regression formula
formula_2005 <- Y2005 ~ offset(log(E2005)) + STDAREA_HA
formula_2006 <- Y2006 ~ offset(log(E2006)) + STDAREA_HA

#Fit using glm
model_2005 <- glm(formula=formula_2005, family="poisson", data=sp.dat@data)
model_2006 <- glm(formula=formula_2006, family="poisson", data=sp.dat@data)

summary(model_2005)

## 
## Call:
## glm(formula = formula_2005, family = "poisson", data = sp.dat@data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.320e-01  7.785e-03 -16.963  < 2e-16 ***
## STDAREA_HA  -5.985e-05  7.327e-06  -8.168 3.13e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2543.8  on 270  degrees of freedom
## Residual deviance: 2466.1  on 269  degrees of freedom
## AIC: 4089.5
## 
## Number of Fisher Scoring iterations: 4

summary(model_2006)

## 
## Call:
## glm(formula = formula_2006, family = "poisson", data = sp.dat@data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.861e-01  7.820e-03 -23.803   <2e-16 ***
## STDAREA_HA  -7.088e-05  7.609e-06  -9.316   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2531.4  on 270  degrees of freedom
## Residual deviance: 2427.6  on 269  degrees of freedom
## AIC: 4047.8
## 
## Number of Fisher Scoring iterations: 4

#Check the relative risks
exp(cbind(model_2005$coefficients, confint(model_2005)))

## Waiting for profiling to be done...

##                           2.5 %    97.5 %
## (Intercept) 0.8762987 0.8630023 0.8897429
## STDAREA_HA  0.9999402 0.9999255 0.9999543

exp(cbind(model_2006$coefficients, confint(model_2006)))

## Waiting for profiling to be done...

##                           2.5 %    97.5 %
## (Intercept) 0.8301529 0.8174995 0.8429485
## STDAREA_HA  0.9999291 0.9999139 0.9999438

# Residual diagnostics - normality and constant variance
par(mfrow=c(1,2))
qqnorm(residuals(model_2005, type="pearson"), main="", pch=19)
qqline(residuals(model_2005, type="pearson"), col="red")
plot(model_2005$fitted.values, residuals(model_2005, type="pearson"), pch=19, 
     xlab="Fitted values", ylab="Residuals")

par(mfrow=c(1,2))
qqnorm(residuals(model_2006, type="pearson"), main="", pch=19)
qqline(residuals(model_2006, type="pearson"), col="red")
plot(model_2006$fitted.values, residuals(model_2006, type="pearson"), pch=19, 
     xlab="Fitted values", ylab="Residuals")

# Residual diagnostics - correlation
moran.mc(x = residuals(model_2005, type="pearson"), listw = W.list, nsim = 10000)

## 
##  Monte-Carlo simulation of Moran I
## 
## data:  residuals(model_2005, type = "pearson") 
## weights: W.list  
## number of simulations + 1: 10001 
## 
## statistic = 0.41237, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater

moran.mc(x = residuals(model_2006, type="pearson"), listw = W.list, nsim = 10000)

## 
##  Monte-Carlo simulation of Moran I
## 
## data:  residuals(model_2006, type = "pearson") 
## weights: W.list  
## number of simulations + 1: 10001 
## 
## statistic = 0.39194, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater

# Fit a CARBayes model
model2_2005 <- S.CARleroux(formula = formula_2005, family = "poisson", data = sf.data, W = W, 
                          burnin = 10000, n.sample = 100000, thin = 10, verbose = FALSE)

model2_2006 <- S.CARleroux(formula = formula_2006, family = "poisson", data = sf.data, W = W, 
                          burnin = 10000, n.sample = 100000, thin = 10, verbose = FALSE)

#Look at summary objects
summary(model2_2005)

##                     Length Class      Mode     
## summary.results      28    -none-     numeric  
## samples               6    -none-     list     
## fitted.values       271    -none-     numeric  
## residuals             2    data.frame list     
## modelfit              6    -none-     numeric  
## accept                4    -none-     numeric  
## localised.structure   0    -none-     NULL     
## formula               3    formula    call     
## model                 2    -none-     character
## mcmc.info             5    -none-     numeric  
## X                   542    -none-     numeric

summary(model2_2005$samples)

##        Length  Class Mode   
## beta     18000 mcmc  numeric
## phi    2439000 mcmc  numeric
## rho       9000 mcmc  numeric
## tau2      9000 mcmc  numeric
## fitted 2439000 mcmc  numeric
## Y            1 mcmc  logical

summary(model2_2006)

##                     Length Class      Mode     
## summary.results      28    -none-     numeric  
## samples               6    -none-     list     
## fitted.values       271    -none-     numeric  
## residuals             2    data.frame list     
## modelfit              6    -none-     numeric  
## accept                4    -none-     numeric  
## localised.structure   0    -none-     NULL     
## formula               3    formula    call     
## model                 2    -none-     character
## mcmc.info             5    -none-     numeric  
## X                   542    -none-     numeric

summary(model2_2006$samples)

##        Length  Class Mode   
## beta     18000 mcmc  numeric
## phi    2439000 mcmc  numeric
## rho       9000 mcmc  numeric
## tau2      9000 mcmc  numeric
## fitted 2439000 mcmc  numeric
## Y            1 mcmc  logical

#Plot the MCMC samples for rho - check convergence
plot(model2_2005$samples$rho)

plot(model2_2005$samples$beta)

plot(model2_2005$samples$tau2)

plot(model2_2005$samples$phi[ ,c(1, 123, 214)])

plot(model2_2006$samples$rho)

plot(model2_2006$samples$beta)

plot(model2_2006$samples$tau2)

plot(model2_2006$samples$phi[ ,c(1, 123, 214)])

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot. ## the behavior of the chains for both sets appears consistent and stable.

Extract the mean fitted values (after burn-in) for the areas’ risks in each year.Run a bivariate mixture model clustering method on the fitted values for both years Comment on your result in terms of number of clusters chosen, the component means.

# Risk
sp.dat@data$risk <- model2_2005$fitted.values / sp.dat@data$E2005
sp.dat@data$risk <- model2_2006$fitted.values / sp.dat@data$E2006
# PEP
m_2005 <- nrow(model2_2005$samples$fitted)
m_2006 <- nrow(model2_2006$samples$fitted)

risk_2005 <- model2_2005$samples$fitted / matrix(rep(sp.dat@data$E2005,m_2005), nrow=m_2005, byrow=T)
risk_2006 <- model2_2006$samples$fitted / matrix(rep(sp.dat@data$E2006,m_2005), nrow=m_2006, byrow=T)

summarise.exceedences<-function(samples,exceedences)
{
  exceed.vec<-apply((samples>1),2,sum)/nrow(samples)
  return(exceed.vec)
}
sp.dat@data$PEP <- as.numeric(summarise.exceedences(risk_2005,1))
sp.dat@data$PEP <- as.numeric(summarise.exceedences(risk_2006,1))

mean_2005 <- mean(risk_2005)
mean_2006 <- mean(risk_2006)

# Bivariate Mixture Model
clustering_data <- data.frame(
  Fitted_2005 = model2_2005$fitted.values,
  Fitted_2006 = model2_2006$fitted.values
)

clustering_result <- Mclust(clustering_data)

summary(clustering_result)

## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 2 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -2298.605 271  9 -4647.629 -4733.939
## 
## Clustering table:
##   1   2 
##  80 191

plot(clustering_result)

sp.dat@data$Cluster <- clustering_result$classification

# Visualize it on the map
map <- tm_shape(sp.dat) +
  tm_fill(col = "Cluster", palette = "Set3", title = "Cluster") +
  tm_borders() +
  tm_layout(frame = FALSE,
            title = "Cluster Analysis of Respiratory Counts",
            title.position = c("center", "top"))

tmap_mode("view")

## tmap mode set to interactive viewing

print(map)

The distribution of clusters appears to be concentrated around the central part of the mapped area, with the darkest shades (which usually represent higher values or risk levels) in the middle.This could imply a higher incidence or risk in the urban or densely populated central areas.

IV change the algorithms to K_Means and the data is raw SMRs. Perform K-means clustering for year 2005

smr_data <- data.frame(SMR_2005 = dat$SMR_2005, SMR_2006 = dat$SMR_2006)
set.seed(123) 
k <- 3  
kmeans_result <- kmeans(smr_data, centers = k)
print(kmeans_result)

## K-means clustering with 3 clusters of sizes 52, 112, 107
## 
## Cluster means:
##    SMR_2005  SMR_2006
## 1 1.3493266 1.2754141
## 2 0.5724077 0.5311341
## 3 0.9332827 0.8802605
## 
## Clustering vector:
##   [1] 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 3 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 3 2 2 2 2 2 3 2 3 3 3 3 3 3 3 2 3 2 3 2 2 1 2 2 2 3 3 2 2 2 3 3 3 3 3
##  [75] 3 2 3 3 3 3 1 3 1 1 3 1 2 2 3 3 3 3 3 1 2 1 1 3 1 2 3 1 3 3 3 1 1 3 2 3 1
## [112] 1 1 2 3 1 1 3 3 1 1 3 3 1 2 1 1 2 2 1 3 3 1 1 3 2 1 1 3 1 2 1 2 3 2 2 2 1
## [149] 1 2 1 3 2 2 1 1 3 3 3 3 1 2 3 1 1 3 3 1 3 3 1 1 1 3 2 1 3 1 1 1 2 2 2 2 3
## [186] 3 2 3 2 3 3 3 1 2 2 2 2 2 3 3 2 2 2 3 3 3 3 2 2 2 3 2 1 3 2 3 3 2 3 1 2 1
## [223] 2 3 3 3 2 2 3 3 2 2 1 2 1 2 2 3 2 3 3 3 2 3 2 2 2 3 2 2 3 3 3 3 3 3 3 3 2
## [260] 2 3 3 2 2 3 3 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 4.269925 3.729706 3.750749
##  (between_SS / total_SS =  78.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

dat$Cluster <- kmeans_result$cluster
aggregate(smr_data, by = list(dat$Cluster), FUN = mean)

##   Group.1  SMR_2005  SMR_2006
## 1       1 1.3493266 1.2754141
## 2       2 0.5724077 0.5311341
## 3       3 0.9332827 0.8802605

plot(smr_data$SMR_2005, smr_data$SMR_2006, col = dat$Cluster, xlab = "SMR 2005", ylab = "SMR 2006")
points(kmeans_result$centers[,1], kmeans_result$centers[,2], col = 1:k, pch = 8, cex = 2)