Let’s start by loading the libraries needed for the analysis.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(ggpubr)
library(texreg)
## Version: 1.39.3
## Date: 2023-11-09
## Author: Philip Leifeld (University of Essex)
##
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
##
## Attaching package: 'texreg'
##
## The following object is masked from 'package:tidyr':
##
## extract
library(leaflet)
library(RColorBrewer)
Now, let’s read the data.
expected <- read.csv(file = "Data/expected_counts.csv")
respiratory <- read.csv(file = "Data/respiratory_admissions.csv")
shape<-read_sf("SG_IntermediateZoneBdry_2001/")
# Change the name of the variables to match the datasets
expected <- expected %>%
rename(IZ_CODE = code)
respiratory <- respiratory %>%
rename(IZ_CODE = IG)
# Merge datasets
total <- left_join(expected, respiratory, by = "IZ_CODE")
total$SMR2009 <- total$Y2009 / total$E2009
total$SMR2010 <- total$Y2010 / total$E2010
head(total)
## IZ_CODE E2009 E2010 Y2009 Y2010 SMR2009 SMR2010
## 1 S02000260 99.51566 106.31486 111 107 1.1154023 1.0064445
## 2 S02000261 47.77799 50.86723 26 23 0.5441836 0.4521575
## 3 S02000262 95.01387 104.46578 46 53 0.4841399 0.5073431
## 4 S02000263 81.93079 90.20686 32 40 0.3905736 0.4434253
## 5 S02000264 128.91529 139.87310 52 60 0.4033657 0.4289603
## 6 S02000265 58.21802 63.97093 29 25 0.4981276 0.3908025
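Before moving on, a quick sanity check on the join may be worthwhile (a minimal sketch, assuming each IZ_CODE should match exactly once):
# The left join should neither drop nor duplicate areas, nor introduce NAs
stopifnot(nrow(total) == nrow(expected), !anyNA(total))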
# tibble and dplyr are already attached via the tidyverse; loaded explicitly here for clarity
library(tibble)
library(dplyr)
result_table <- total %>%
summarise(
meanSMR2009 = mean(SMR2009),
meanSMR2010 = mean(SMR2010),
medianSMR2009 = median(SMR2009),
medianSMR2010 = median(SMR2010),
correlation = cor(SMR2009, SMR2010),
CV2009 = sd(SMR2009) / mean(SMR2009),
CV2010 = sd(SMR2010) / mean(SMR2010),
change = mean((SMR2010 - SMR2009) / SMR2009),
max09 = max(SMR2009),
max10 = max(SMR2010),
min09 = min(SMR2009),
min10 = min(SMR2010)
)
# Print the result table
print(as_tibble(result_table))
## # A tibble: 1 × 12
## meanSMR2009 meanSMR2010 medianSMR2009 medianSMR2010 correlation CV2009 CV2010
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.881 0.828 0.866 0.791 0.853 0.376 0.370
## # ℹ 5 more variables: change <dbl>, max09 <dbl>, max10 <dbl>, min09 <dbl>,
## # min10 <dbl>
# Brief visualisation of the SMR densities and correlations
total_long <- pivot_longer(total,
cols = c(SMR2009, SMR2010),
names_to = "Year",
values_to = "Value")
# Create a density plot using facet_wrap
ggplot(total_long,
aes(x = Value, fill = Year)) +
geom_density(alpha = 0.5) +
facet_wrap(~ Year, scales = "free")
# The densities are similar in shape
# and the SMRs are highly correlated across the two years
cor(total$SMR2009, total$SMR2010)
## [1] 0.8534824
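An optional scatter plot (not part of the original flow) makes this correlation visible:
# SMRs for the two years, with the line of equality for reference
ggplot(total, aes(x = SMR2009, y = SMR2010)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "SMR 2009 vs SMR 2010")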
# Merge spatial data
sf.data <- merge(shape, total, all.x = FALSE, by = "IZ_CODE")
head(sf.data, 3)
## Simple feature collection with 3 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 260667 ymin: 669235 xmax: 263842 ymax: 671065
## Projected CRS: OSGB36 / British National Grid
## IZ_CODE IZ_NAME STDAREA_HA Shape_Leng Shape_Area E2009 E2010
## 1 S02000260 Auchinairn 112.2925 7751.798 1107841 99.51566 106.31486
## 2 S02000261 Woodhill East 111.5763 6464.223 1104265 47.77799 50.86723
## 3 S02000262 Woodhill West 107.2337 7316.999 1065622 95.01387 104.46578
## Y2009 Y2010 SMR2009 SMR2010 geometry
## 1 111 107 1.1154023 1.0064445 MULTIPOLYGON (((260718 6698...
## 2 26 23 0.5441836 0.4521575 MULTIPOLYGON (((262047 6699...
## 3 46 53 0.4841399 0.5073431 MULTIPOLYGON (((261158.7 67...
plot(sf.data$geometry)
Conclusions from the summary table:
The mean and median SMR decreased between 2009 and 2010.
The variability, measured by the coefficient of variation, is similar across the two years.
Besides the means and medians, the minimum and maximum values also decreased.
# Calculate Spatial Correlation
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
W.nb <- poly2nb(sf.data, row.names = rownames(sf.data))
W <- nb2mat(W.nb, style = "B")
#Look at the summary of the neighbourhood connections
summary(W.nb)
## Neighbour list object:
## Number of regions: 271
## Number of nonzero links: 1424
## Percentage nonzero weights: 1.938971
## Average number of links: 5.254613
## 2 disjoint connected subgraphs
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9 10 11 14 15 20
## 4 15 30 51 62 50 32 12 5 5 2 1 1 1
## 4 least connected regions:
## 202 227 239 265 with 1 link
## 1 most connected region:
## 234 with 20 links
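The summary reports 2 disjoint connected subgraphs; since disconnected islands receive no smoothing from the rest of the map, it is worth identifying them (a quick check using spdep):
# Connected components of the neighbourhood graph and their sizes
comp <- n.comp.nb(W.nb)
table(comp$comp.id)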
W.list <- nb2listw(W.nb, style = "B")
# Moran Test for Spatial Correlation
# Null Hypothesis in the test: Observations are spatially independent
moran.mc(x = sf.data$SMR2009, listw = W.list, nsim = 10000)
##
## Monte-Carlo simulation of Moran I
##
## data: sf.data$SMR2009
## weights: W.list
## number of simulations + 1: 10001
##
## statistic = 0.39019, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
# statistic = 0.39019
# p-value = 9.999e-05
moran.mc(x = sf.data$SMR2010, listw = W.list, nsim = 10000)
##
## Monte-Carlo simulation of Moran I
##
## data: sf.data$SMR2010
## weights: W.list
## number of simulations + 1: 10001
##
## statistic = 0.43235, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
# statistic = 0.43235
# p-value = 9.999e-05
# Conclusion: we reject the null hypothesis of spatial independence
# A spatial analysis is therefore justified
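For intuition, the observed statistic can be reproduced directly from the definition of Moran's I, I = (n / S0) * (z' W z) / (z' z), where z holds the mean-centred values and S0 is the sum of all weights; a minimal sketch using the binary W built above:
# Manual Moran's I for SMR2009; should match the observed statistic (~0.39)
z  <- sf.data$SMR2009 - mean(sf.data$SMR2009)
S0 <- sum(W)
n  <- length(z)
(n / S0) * as.numeric(t(z) %*% W %*% z) / sum(z^2)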
# Visualize data
sp.dat<-as_Spatial(sf.data)
library(broom)
sp.dat@data$id <- rownames(sp.dat@data)
temp1<-tidy(sp.dat)
## Warning: `tidy.SpatialPolygonsDataFrame()` was deprecated in broom 1.0.4.
## ℹ Please use functions from the sf package, namely `sf::st_as_sf()`, in favor
## of sp tidiers.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Regions defined for each Polygons
sp.dat2 <- merge(temp1, sp.dat@data, by = "id")
# Visualisation for 2009
visu2009 <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = SMR2009)) +
geom_polygon() +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "SMR", title = "2009") +
scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))
# Visualisation for 2010
visu2010 <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = SMR2010)) +
geom_polygon() +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "SMR", title = "2010") +
scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))
combined_smr <- ggarrange(visu2009, visu2010,
ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")
annotate_figure(combined_smr, top = text_grob("SMR for respiratory hospitalisation",
color = "black", face = "bold", size = 16))
We use a Poisson model because respiratory hospitalisation is a rare event relative to the population at risk.
Unlike the intrinsic CAR (ICAR) model, the Poisson Leroux CAR model includes a parameter, rho, that measures the strength of the spatial dependence induced by the neighbourhood matrix.
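In outline, the model fitted below follows the standard Leroux CAR specification (as described in the CARBayes documentation):
Y_i ~ Poisson(E_i * theta_i), with log(theta_i) = beta_0 + phi_i, and
phi_i | phi_(-i) ~ N( rho * sum_j w_ij * phi_j / (rho * sum_j w_ij + 1 - rho), tau^2 / (rho * sum_j w_ij + 1 - rho) ),
so that rho = 0 gives independent random effects and rho = 1 recovers the ICAR model.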
library(CARBayes)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: Rcpp
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated
formula2009 <- Y2009 ~ offset(log(E2009))
formula2010 <- Y2010 ~ offset(log(E2010))
model1_09 <- glm(formula = formula2009, family = "poisson", data = sp.dat@data)
summary(model1_09)
##
## Call:
## glm(formula = formula2009, family = "poisson", data = sp.dat@data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.134207 0.006874 -19.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2933.1 on 270 degrees of freedom
## Residual deviance: 2933.1 on 270 degrees of freedom
## AIC: 4588
##
## Number of Fisher Scoring iterations: 4
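For an intercept-only Poisson model with a log(E) offset, the fitted intercept is exactly the log of the overall SMR, which gives a quick sanity check:
# exp(intercept) equals the overall SMR, sum(Y) / sum(E)
exp(coef(model1_09))                     # about 0.874
sum(total$Y2009) / sum(total$E2009)      # the same quantity computed directly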
# CAR MODEL
model09 <- S.CARleroux(formula = formula2009, family = "poisson", data = sf.data, W = W,
burnin = 10000, n.sample = 100000, thin = 10, verbose=FALSE)
print(model09)
##
## #################
## #### Model fitted
## #################
## Likelihood model - Poisson (log link function)
## Random effects model - Leroux CAR
## Regression equation - Y2009 ~ offset(log(E2009))
##
##
## #################
## #### MCMC details
## #################
## Total number of post burnin and thinned MCMC samples generated - 9000
## Number of MCMC chains used - 1
## Length of the burnin period used for each chain - 10000
## Amount of thinning used - 10
##
## ############
## #### Results
## ############
## Posterior quantities and DIC
##
## Mean 2.5% 97.5% n.effective Geweke.diag
## (Intercept) -0.1936 -0.2100 -0.1773 9000.0 -0.8
## tau2 0.3120 0.2339 0.4032 7119.5 1.8
## rho 0.6459 0.4152 0.8734 5945.6 1.8
##
## DIC = 2160.271 p.d = 232.3898 LMPL = -1166.66
summary(model09$samples)
## Length Class Mode
## beta 9000 mcmc numeric
## phi 2439000 mcmc numeric
## rho 9000 mcmc numeric
## tau2 9000 mcmc numeric
## fitted 2439000 mcmc numeric
## Y 1 mcmc logical
# Before switching to inference, we assess the model convergence
plot(model09$samples$rho)
Traceplot: no trend is visible, so we assume convergence has been achieved.
Density: the distribution of the parameter is approximately normal, centred around the posterior mean.
rho: 0.646 - moderate-to-strong spatial dependence.
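Beyond eyeballing the traceplot, numerical diagnostics from the coda package (which CARBayes uses for its mcmc objects) can back this up; a minimal sketch:
library(coda)
effectiveSize(model09$samples$rho)  # effective number of independent samples
geweke.diag(model09$samples$rho)    # z-score comparing early vs late parts of the chain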
moran.mc(x = residuals(model09, type="pearson"), listw = W.list, nsim = 10000)
##
## Monte-Carlo simulation of Moran I
##
## data: residuals(model09, type = "pearson")
## weights: W.list
## number of simulations + 1: 10001
##
## statistic = -0.071755, observed rank = 270, p-value = 0.973
## alternative hypothesis: greater
model10 <- S.CARleroux(formula=formula2010, family="poisson", data=sf.data, W=W,
burnin=10000, n.sample=100000, thin=10, verbose=FALSE)
print(model10)
##
## #################
## #### Model fitted
## #################
## Likelihood model - Poisson (log link function)
## Random effects model - Leroux CAR
## Regression equation - Y2010 ~ offset(log(E2010))
##
##
## #################
## #### MCMC details
## #################
## Total number of post burnin and thinned MCMC samples generated - 9000
## Number of MCMC chains used - 1
## Length of the burnin period used for each chain - 10000
## Amount of thinning used - 10
##
## ############
## #### Results
## ############
## Posterior quantities and DIC
##
## Mean 2.5% 97.5% n.effective Geweke.diag
## (Intercept) -0.2539 -0.2703 -0.2371 8963.8 1.0
## tau2 0.2963 0.2295 0.3745 8091.6 0.2
## rho 0.7346 0.5115 0.9286 6626.0 0.1
##
## DIC = 2159.947 p.d = 228.1328 LMPL = -1152.61
summary(model10$samples)
## Length Class Mode
## beta 9000 mcmc numeric
## phi 2439000 mcmc numeric
## rho 9000 mcmc numeric
## tau2 9000 mcmc numeric
## fitted 2439000 mcmc numeric
## Y 1 mcmc logical
plot(model10$samples$rho)
Traceplot: no trend is visible, so we assume convergence has been achieved.
Density: slightly skewed, yet approximately normal.
rho: 0.735 - strong spatial dependence.
moran.mc(x = residuals(model10, type="pearson"), listw = W.list, nsim = 10000)
##
## Monte-Carlo simulation of Moran I
##
## data: residuals(model10, type = "pearson")
## weights: W.list
## number of simulations + 1: 10001
##
## statistic = -0.036341, observed rank = 1916, p-value = 0.8084
## alternative hypothesis: greater
The Moran tests on the Pearson residuals show that, after fitting the CAR models, no spatial correlation remains, suggesting a good fit.
As our CAR models include no covariates, there are no regression parameters to interpret.
However, the estimates of rho indicate strong spatial correlation in the data.
# Plot the fitted values
sp.dat@data$fitted2009 <- model09$fitted.values
sp.dat@data$fitted2010 <- model10$fitted.values
sp.dat@data$id <- rownames(sp.dat@data)
temp1<-tidy(sp.dat)
## Regions defined for each Polygons
sp.dat2 <- merge(temp1, sp.dat@data, by = "id")
visu2009_fitted <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = fitted2009)) +
geom_polygon() +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "Fitted number", title = "2009")+
scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))
visu2009_real <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = Y2009)) +
geom_polygon() +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "Observed number", title = "2009") +
scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))
visu2010_fitted <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = fitted2010)) +
geom_polygon() +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "Fitted number", title = "2010")+
scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))
visu2010_real <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = Y2010)) +
geom_polygon() +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "Observed number", title = "2010") +
scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))
combined_fitted <- ggarrange(visu2009_fitted, visu2010_fitted,
ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")
annotate_figure(combined_fitted, top = text_grob("Predicted number of respiratory cases",
color = "black", face = "bold", size = 16))
In 2010, we can see that the fitted respiratory cases increased outside of the centre.
Based on the Glasgow map, this corresponds to the north-western part of Glasgow.
At the same time, cases decreased slightly in the southern part of the city.
# Comparing to the real data
compare_2009 <- ggarrange(visu2009_fitted, visu2009_real,
ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")
compare_2009
compare_2010 <- ggarrange(visu2010_fitted, visu2010_real,
ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom")
compare_2010
To provide additional insight, we plot the PEP.
PEP - posterior exceedance probability: the probability that an area's risk exceeds the national average.
Formally, PEP_i = Pr(risk_i > 1 | data), estimated as the proportion of posterior samples of risk_i that exceed 1.
m1 <- nrow(model09$samples$fitted)  # number of posterior samples, 2009 model
m2 <- nrow(model10$samples$fitted)  # number of posterior samples, 2010 model
# Risk is calculated as the ratio of the fitted value to the expected number of respiratory cases
# How to interpret it? The larger the risk, the larger the modelled number of cases relative to the expected number
#### Risk 2009 and 2010
sp.dat@data$risk2009 <- model09$fitted.values / sp.dat@data$E2009
sp.dat@data$risk2010 <- model10$fitted.values / sp.dat@data$E2010
# Posterior samples of risk: each row is one MCMC draw, each column one area
risk09 <- model09$samples$fitted / matrix(rep(sp.dat@data$E2009, m1), nrow = m1, byrow = TRUE)
risk10 <- model10$samples$fitted / matrix(rep(sp.dat@data$E2010, m2), nrow = m2, byrow = TRUE)
summarise.exceedances <- function(samples, exceedances) {
  # For each area (column), the proportion of posterior samples above the threshold
  apply(samples > exceedances, 2, sum) / nrow(samples)
}
sp.dat@data$PEP09 <- as.numeric(summarise.exceedances(risk09, 1))
sp.dat@data$PEP10 <- as.numeric(summarise.exceedances(risk10, 1))
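The same helper can be reused for stricter thresholds; for instance, the probability that an area's risk is at least 50% above the national average (an illustrative extension, not used in the maps below):
# Probability that risk exceeds 1.5 in 2009
PEP09_high <- as.numeric(summarise.exceedances(risk09, 1.5))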
sp.dat@data$id <- rownames(sp.dat@data)
temp1<-tidy(sp.dat)
## Regions defined for each Polygons
sp.dat2 <- merge(temp1, sp.dat@data, by = "id")
# Visualisation for 2009
visu2009_PEP <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = PEP09)) +
geom_polygon() +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "PEP", title = "2009")+
scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))
# Visualisation for 2010
visu2010_PEP <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = PEP10)) +
geom_polygon() +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "PEP", title = "2010") +
scale_fill_gradientn(colors=brewer.pal(n=9, name="YlOrRd"))
risk_matrix <- cbind(sp.dat@data$risk2009, sp.dat@data$risk2010)
colnames(risk_matrix) <- c("risk2009", "risk2010")
# Fit a Gaussian Mixture Model
library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
##
## map
gmm_model <- Mclust(risk_matrix, G = 1:9) # G = candidate numbers of clusters
gmm_model
## 'Mclust' model object: (VEE,2)
##
## Available components:
## [1] "call" "data" "modelName" "n"
## [5] "d" "G" "BIC" "loglik"
## [9] "df" "bic" "icl" "hypvol"
## [13] "parameters" "z" "classification" "uncertainty"
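summary() gives a more readable view of the selected model than the raw component list:
# Covariance parameterisation, number of components, BIC and component means
summary(gmm_model, parameters = TRUE)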
BIC
plot(gmm_model, what = "BIC")
Classification
plot(gmm_model, what = "classification")
Uncertainty
plot(gmm_model, what = "uncertainty")
Density
plot(gmm_model, what = "density")
table(gmm_model$class)
##
## 1 2
## 180 91
Cluster 1 contains 180 observations and cluster 2 contains 91 observations.
1. BIC. We focus on the BIC across the variance parameterisations ("EII", "VII", "EEI", "VEI", "EVI", "VVI"). Based on these, 2 clusters appears to be the optimal number, which matches the algorithm's own selection.
2. Classification. Based on the classification plot alone, the number of clusters is not obvious. The cluster with lower risk in both years (up to about 0.8) is easier to distinguish than the other cluster, which includes a few outliers. With only this plot, it would be hard to decide on the number of clusters.
3. Uncertainty. Uncertainty is higher for observations lying close to the boundary between the two clusters.
4. Density. The low-risk cluster has more densely distributed observations, which supports treating it as a robust cluster. In clustering, the objective is to minimise distances within clusters while maximising distances between clusters, so the outliers assigned to the higher-risk cluster could affect the effectiveness of the algorithm. Nevertheless, the overall "line shape" pattern of the observations remains straightforward to interpret: the observations are clearly correlated across years, and the outliers consistently show proportionally higher risk in both years.
In conclusion, the BIC score is the most helpful for selecting the number of clusters. We end up with 2 clusters: one with lower mean risk across 2009 and 2010, and one with higher risk in both years.
Component means
component_means <- gmm_model$parameters$mean
colnames(component_means) <- c("Cluster1", "Cluster2")
component_means
## Cluster1 Cluster2
## risk2009 1.0160637 0.5822874
## risk2010 0.9462084 0.5648059
The ‘component_means’ matrix holds the mean risk values for the two clusters (Cluster1 and Cluster2) at the two time points (risk in 2009 and risk in 2010) in the context of respiratory disease. Cluster1 exhibits higher average risks than Cluster2, indicating potential spatial patterns or differences in disease prevalence.
Extract cluster membership and add it to our data.
sp.dat@data$Cluster <- gmm_model$classification
temp2 <- data.frame(id = sp.dat@data$id, Cluster = sp.dat@data$Cluster)
sp.dat2 <- merge(sp.dat2, temp2, by="id")
clusters_vis <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = as.factor(Cluster))) +
geom_polygon() +
scale_fill_manual(values = c("#000033","#0099FF"),
name = "Risk Levels",
labels = c("Higher Mean Risk Across Years", "Lower Mean Risk Across Years")) +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "Cluster", title = "Clusters for risk of respiratory disease across Glasgow for 2009 and 2010") +
theme(legend.title = element_text(size = 8),
plot.title = element_text(size = 13),
legend.text = element_text(size = 7),
legend.key.width = unit(0.3, "cm"),
legend.key.height = unit(0.3, "cm"))
clusters_vis
library(ClusterR)
Since we are not struggling with outliers in this dataset, we can also use k-means.
Optimal number of clusters according to Silhouette Score:
opt_n_of_clust <- Optimal_Clusters_KMeans(risk_matrix, max_clusters=10,
plot_clusters=TRUE, criterion="silhouette")
It seems that two clusters are optimal.
Optimal number of clusters according to elbow method:
opt_n_of_clust<-Optimal_Clusters_KMeans(risk_matrix, max_clusters=10,
plot_clusters = TRUE)
Here, too, two clusters perform well.
kmeans_result <- kmeans(risk_matrix, centers = 2, nstart = 50)
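Before mapping, it can help to compare the k-means centres with the GMM component means; note that the cluster numbering may differ between the two methods (we deal with this below):
kmeans_result$centers   # k-means cluster centres, clusters in rows
t(component_means)      # GMM component means, transposed to match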
Extract the cluster membership and add it to our data:
sp.dat@data$Cluster_kmeans <- kmeans_result$cluster
temp3 <- data.frame(id = sp.dat@data$id, Cluster_kmeans = sp.dat@data$Cluster_kmeans)
sp.dat2 <- merge(sp.dat2, temp3, by="id")
clusters_kmeans_vis <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = as.factor(Cluster_kmeans))) +
geom_polygon() +
scale_fill_manual(values = c("#0099FF","#000033"),
name = "Risk Levels",
labels = c("Higher Mean Risk Across Years", "Lower Mean Risk Across Years")) +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "Cluster kmeans", title = "Clustering method using kmeans") +
theme(legend.title = element_text(size = 8),
plot.title = element_text(size = 13),
legend.text = element_text(size = 7),
legend.key.width = unit(0.3, "cm"),
legend.key.height = unit(0.3, "cm"))
clusters_kmeans_vis
Note that we reversed the colours in the plot, meaning the cluster numbering from k-means is reversed relative to the numbering from gmm_model. We will fix that:
diff_analysis <- as.data.frame(kmeans_result$cluster)
colnames(diff_analysis) <- c("kmeans_result")
# Swap the k-means labels so that 1 = higher risk and 2 = lower risk, matching gmm_model
diff_analysis$kmeans_result <- ifelse(
diff_analysis$kmeans_result == 1, 2, 1
)
Differences analysis
diff_analysis[,2] <- gmm_model$classification
colnames(diff_analysis) <- c("kmeans_result","gmm_result")
# Encode the comparison: 1 = HR in both, 2 = LR in both,
# -1 = k-means HR / gmm LR, -2 = k-means LR / gmm HR
diff_analysis$diff <- ifelse(
diff_analysis$kmeans_result != diff_analysis$gmm_result,
ifelse(diff_analysis$kmeans_result - diff_analysis$gmm_result == -1, -1, -2),
diff_analysis$kmeans_result
)
unique(diff_analysis$diff)
## [1] 1 2 -2
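A cross-tabulation makes the (dis)agreement between the two methods explicit:
# Off-diagonal cells count areas where the two methods disagree
table(kmeans = diff_analysis$kmeans_result, gmm = diff_analysis$gmm_result)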
Assignment key:
LR - low risk (cluster 2)
HR - high risk (cluster 1)
There is no case where gmm_model classified an area as 2 (LR) while k-means classified it as 1 (HR): the value -1 never occurs. This means the gmm_model approach is the more conservative one, and in risk assessment the conservative side is usually preferred.
sp.dat@data$diff_cluster <- diff_analysis$diff
temp4 <- data.frame(id = sp.dat@data$id, diff_cluster = sp.dat@data$diff_cluster)
sp.dat2 <- merge(sp.dat2, temp4, by="id")
clusters_diff_vis <- ggplot(data = sp.dat2, aes(x = long, y = lat, group = group, fill = as.factor(diff_cluster))) +
geom_polygon() +
scale_fill_manual(values = c("#FF6966","#000033", "#0099FF"),
name = "Risk Levels",
labels = c("LR for kmeans HR for gmm","HR cluster both", "LR cluser both")) +
coord_equal() +
xlab("Easting (m)") +
ylab("Northing (m)") +
labs(fill = "Clusters", title = "Comparison of two clustering methods") +
theme(legend.title = element_text(size = 8),
plot.title = element_text(size = 13),
legend.text = element_text(size = 7),
legend.key.width = unit(0.3, "cm"),
legend.key.height = unit(0.3, "cm"))
clusters_diff_vis