#set the directory
setwd("/Users/zihualai/Desktop/DSBA2/2nd intensive/Group4 Assessment")
library(lattice)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(spData)
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(spdep)
library(MASS)
library(Rcpp)
library(CARBayes)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(coda)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tmap) # for mapping
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(sp)
library(cluster)
library(dplyr)
library(ggplot2)
library(tidyr)
library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
library(stats)
# Read the CSV files
expected_counts <- read.csv("Data/expected_counts.csv")
observed_counts <- read.csv("Data/respiratory_admissions.csv")
df1 <- rename(observed_counts, IZ_CODE = IG)
df2 <- rename(expected_counts, IZ_CODE = code)
# Merge the CSV data first
dat<- merge(df1, df2, by = "IZ_CODE", all = TRUE)
# Calculate the SMR for each year
dat$SMR_2005 <- dat$Y2005 / dat$E2005
dat$SMR_2006 <- dat$Y2006 / dat$E2006
# Summary statistics of the SMR for 2005 and 2006
summary(dat$SMR_2005)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3111 0.6196 0.8366 0.8640 1.0631 1.9863
summary(dat$SMR_2006)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2645 0.5591 0.7800 0.8118 1.0045 1.6044
# construct a spatial object
# Read the shapefile
shape<- st_read("SG_IntermediateZoneBdry_2001")
## Reading layer `SG_IntermediateZone_Bdry_2001' from data source
## `/Users/zihualai/Desktop/DSBA2/2nd intensive/Group4 Assessment/SG_IntermediateZoneBdry_2001'
## using driver `ESRI Shapefile'
## Simple feature collection with 1235 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 54455.17 ymin: 530297 xmax: 469467.7 ymax: 1218558
## Projected CRS: OSGB36 / British National Grid
sf.data <- merge(shape, dat, all.x=FALSE, by.y="IZ_CODE")
#Transforming to sp object
sp.dat<-as_Spatial(sf.data)
#Getting neighbourhood matrix W
W.nb <- poly2nb(sf.data$geometry, row.names = rownames(sp.dat))
W <- nb2mat(W.nb, style = "B")
W.list <- nb2listw(W.nb, style = "B")
#Setting the regression formula
formula_2005 <- Y2005 ~ offset(log(E2005)) + STDAREA_HA
formula_2006 <- Y2006 ~ offset(log(E2006)) + STDAREA_HA
#Fit using glm
model_2005 <- glm(formula=formula_2005, family="poisson", data=sp.dat@data)
model_2006 <- glm(formula=formula_2006, family="poisson", data=sp.dat@data)
summary(model_2005)
##
## Call:
## glm(formula = formula_2005, family = "poisson", data = sp.dat@data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.320e-01 7.785e-03 -16.963 < 2e-16 ***
## STDAREA_HA -5.985e-05 7.327e-06 -8.168 3.13e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2543.8 on 270 degrees of freedom
## Residual deviance: 2466.1 on 269 degrees of freedom
## AIC: 4089.5
##
## Number of Fisher Scoring iterations: 4
summary(model_2006)
##
## Call:
## glm(formula = formula_2006, family = "poisson", data = sp.dat@data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.861e-01 7.820e-03 -23.803 <2e-16 ***
## STDAREA_HA -7.088e-05 7.609e-06 -9.316 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2531.4 on 270 degrees of freedom
## Residual deviance: 2427.6 on 269 degrees of freedom
## AIC: 4047.8
##
## Number of Fisher Scoring iterations: 4
#Check the relative risks
exp(cbind(model_2005$coefficients, confint(model_2005)))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.8762987 0.8630023 0.8897429
## STDAREA_HA 0.9999402 0.9999255 0.9999543
exp(cbind(model_2006$coefficients, confint(model_2006)))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.8301529 0.8174995 0.8429485
## STDAREA_HA 0.9999291 0.9999139 0.9999438
# Residual diagnostics - normality and constant variance
par(mfrow=c(1,2))
qqnorm(residuals(model_2005, type="pearson"), main="", pch=19)
qqline(residuals(model_2005, type="pearson"), col="red")
plot(model_2005$fitted.values, residuals(model_2005, type="pearson"), pch=19,
xlab="Fitted values", ylab="Residuals")
par(mfrow=c(1,2))
qqnorm(residuals(model_2006, type="pearson"), main="", pch=19)
qqline(residuals(model_2006, type="pearson"), col="red")
plot(model_2006$fitted.values, residuals(model_2006, type="pearson"), pch=19,
xlab="Fitted values", ylab="Residuals")
# Residual diagnostics - correlation
moran.mc(x = residuals(model_2005, type="pearson"), listw = W.list, nsim = 10000)
##
## Monte-Carlo simulation of Moran I
##
## data: residuals(model_2005, type = "pearson")
## weights: W.list
## number of simulations + 1: 10001
##
## statistic = 0.41237, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
moran.mc(x = residuals(model_2006, type="pearson"), listw = W.list, nsim = 10000)
##
## Monte-Carlo simulation of Moran I
##
## data: residuals(model_2006, type = "pearson")
## weights: W.list
## number of simulations + 1: 10001
##
## statistic = 0.39194, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
# Fit a CARBayes model
model2_2005 <- S.CARleroux(formula = formula_2005, family = "poisson", data = sf.data, W = W,
burnin = 10000, n.sample = 100000, thin = 10, verbose = FALSE)
model2_2006 <- S.CARleroux(formula = formula_2006, family = "poisson", data = sf.data, W = W,
burnin = 10000, n.sample = 100000, thin = 10, verbose = FALSE)
#Look at summary objects
summary(model2_2005)
## Length Class Mode
## summary.results 28 -none- numeric
## samples 6 -none- list
## fitted.values 271 -none- numeric
## residuals 2 data.frame list
## modelfit 6 -none- numeric
## accept 4 -none- numeric
## localised.structure 0 -none- NULL
## formula 3 formula call
## model 2 -none- character
## mcmc.info 5 -none- numeric
## X 542 -none- numeric
summary(model2_2005$samples)
## Length Class Mode
## beta 18000 mcmc numeric
## phi 2439000 mcmc numeric
## rho 9000 mcmc numeric
## tau2 9000 mcmc numeric
## fitted 2439000 mcmc numeric
## Y 1 mcmc logical
summary(model2_2006)
## Length Class Mode
## summary.results 28 -none- numeric
## samples 6 -none- list
## fitted.values 271 -none- numeric
## residuals 2 data.frame list
## modelfit 6 -none- numeric
## accept 4 -none- numeric
## localised.structure 0 -none- NULL
## formula 3 formula call
## model 2 -none- character
## mcmc.info 5 -none- numeric
## X 542 -none- numeric
summary(model2_2006$samples)
## Length Class Mode
## beta 18000 mcmc numeric
## phi 2439000 mcmc numeric
## rho 9000 mcmc numeric
## tau2 9000 mcmc numeric
## fitted 2439000 mcmc numeric
## Y 1 mcmc logical
#Plot the MCMC samples for rho - check convergence
plot(model2_2005$samples$rho)
plot(model2_2005$samples$beta)
plot(model2_2005$samples$tau2)
plot(model2_2005$samples$phi[ ,c(1, 123, 214)])
plot(model2_2006$samples$rho)
plot(model2_2006$samples$beta)
plot(model2_2006$samples$tau2)
plot(model2_2006$samples$phi[ ,c(1, 123, 214)])
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the plot. ##
the behavior of the chains for both sets appears consistent and
stable.
# Risk
sp.dat@data$risk <- model2_2005$fitted.values / sp.dat@data$E2005
sp.dat@data$risk <- model2_2006$fitted.values / sp.dat@data$E2006
# PEP
m_2005 <- nrow(model2_2005$samples$fitted)
m_2006 <- nrow(model2_2006$samples$fitted)
risk_2005 <- model2_2005$samples$fitted / matrix(rep(sp.dat@data$E2005,m_2005), nrow=m_2005, byrow=T)
risk_2006 <- model2_2006$samples$fitted / matrix(rep(sp.dat@data$E2006,m_2005), nrow=m_2006, byrow=T)
summarise.exceedences<-function(samples,exceedences)
{
exceed.vec<-apply((samples>1),2,sum)/nrow(samples)
return(exceed.vec)
}
sp.dat@data$PEP <- as.numeric(summarise.exceedences(risk_2005,1))
sp.dat@data$PEP <- as.numeric(summarise.exceedences(risk_2006,1))
mean_2005 <- mean(risk_2005)
mean_2006 <- mean(risk_2006)
# Bivariate Mixture Model
clustering_data <- data.frame(
Fitted_2005 = model2_2005$fitted.values,
Fitted_2006 = model2_2006$fitted.values
)
clustering_result <- Mclust(clustering_data)
summary(clustering_result)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 2 components:
##
## log-likelihood n df BIC ICL
## -2298.605 271 9 -4647.629 -4733.939
##
## Clustering table:
## 1 2
## 80 191
plot(clustering_result)
sp.dat@data$Cluster <- clustering_result$classification
# Visualize it on the map
map <- tm_shape(sp.dat) +
tm_fill(col = "Cluster", palette = "Set3", title = "Cluster") +
tm_borders() +
tm_layout(frame = FALSE,
title = "Cluster Analysis of Respiratory Counts",
title.position = c("center", "top"))
tmap_mode("view")
## tmap mode set to interactive viewing
print(map)
smr_data <- data.frame(SMR_2005 = dat$SMR_2005, SMR_2006 = dat$SMR_2006)
set.seed(123)
k <- 3
kmeans_result <- kmeans(smr_data, centers = k)
print(kmeans_result)
## K-means clustering with 3 clusters of sizes 52, 112, 107
##
## Cluster means:
## SMR_2005 SMR_2006
## 1 1.3493266 1.2754141
## 2 0.5724077 0.5311341
## 3 0.9332827 0.8802605
##
## Clustering vector:
## [1] 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 3 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 3 2 2 2 2 2 3 2 3 3 3 3 3 3 3 2 3 2 3 2 2 1 2 2 2 3 3 2 2 2 3 3 3 3 3
## [75] 3 2 3 3 3 3 1 3 1 1 3 1 2 2 3 3 3 3 3 1 2 1 1 3 1 2 3 1 3 3 3 1 1 3 2 3 1
## [112] 1 1 2 3 1 1 3 3 1 1 3 3 1 2 1 1 2 2 1 3 3 1 1 3 2 1 1 3 1 2 1 2 3 2 2 2 1
## [149] 1 2 1 3 2 2 1 1 3 3 3 3 1 2 3 1 1 3 3 1 3 3 1 1 1 3 2 1 3 1 1 1 2 2 2 2 3
## [186] 3 2 3 2 3 3 3 1 2 2 2 2 2 3 3 2 2 2 3 3 3 3 2 2 2 3 2 1 3 2 3 3 2 3 1 2 1
## [223] 2 3 3 3 2 2 3 3 2 2 1 2 1 2 2 3 2 3 3 3 2 3 2 2 2 3 2 2 3 3 3 3 3 3 3 3 2
## [260] 2 3 3 2 2 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 4.269925 3.729706 3.750749
## (between_SS / total_SS = 78.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
dat$Cluster <- kmeans_result$cluster
aggregate(smr_data, by = list(dat$Cluster), FUN = mean)
## Group.1 SMR_2005 SMR_2006
## 1 1 1.3493266 1.2754141
## 2 2 0.5724077 0.5311341
## 3 3 0.9332827 0.8802605
plot(smr_data$SMR_2005, smr_data$SMR_2006, col = dat$Cluster, xlab = "SMR 2005", ylab = "SMR 2006")
points(kmeans_result$centers[,1], kmeans_result$centers[,2], col = 1:k, pch = 8, cex = 2)