Burcin Yazgi Walsh, Chris Brunsdon, Martin Charlton
NCG, Maynooth University
The Ireland clusters are obtained using the 2016 Census Data at Small Areas level, following the instructions prepared by Chris Brunsdon, Jan Rigby and Martin Charlton for the 2011 Ireland Census Data.
The previous study, based on the 2011 Ireland Census Data, can be found at this link (https://rpubs.com/chrisbrunsdon/14998).
The original census data can be found through this link (http://www.cso.ie/en/census/census2016reports/census2016smallareapopulationstatistics/).
# Compute derived (mostly percentage) variables from the raw SAPS 2016 counts;
# if na.rm is TRUE, any NAs (e.g. from zero denominators) are replaced with 0
CreateVariables <- function(SA2016, na.rm=TRUE) {
attach(SA2016)
Age0_4 <- 100 * ( T1_1AGE0T + T1_1AGE1T + T1_1AGE2T + T1_1AGE3T + T1_1AGE4T ) / T1_1AGETT
Age5_14 <- 100 * ( T1_1AGE5T + T1_1AGE6T + T1_1AGE7T + T1_1AGE8T + T1_1AGE9T +
T1_1AGE10T + T1_1AGE11T + T1_1AGE12T + T1_1AGE13T + T1_1AGE14T) / T1_1AGETT
Age25_44 <- 100 * ( T1_1AGE25_29T + T1_1AGE30_34T + T1_1AGE35_39T + T1_1AGE40_44T ) / T1_1AGETT
Age45_64 <- 100 * ( T1_1AGE45_49T + T1_1AGE50_54T + T1_1AGE55_59T + T1_1AGE60_64T ) / T1_1AGETT
Age65over <- 100 * ( T1_1AGE65_69T + T1_1AGE70_74T + T1_1AGE75_79T + T1_1AGE80_84T + T1_1AGEGE_85T ) / T1_1AGETT
EU_National <- 100 * (T2_1UKN + T2_1PLN + T2_1LTN + T2_1EUN) / T2_1TN
ROW_National <- 100 * (T2_1RWN) / T2_1TN
Born_outside_Ireland <- 100 * (T2_1TBP - T2_1IEBP) / T2_1TBP
Separated <- 100 * (T1_2SEPT + T1_2DIVT) / T1_2T
SinglePerson <- 100 * (T5_2_1PP - T4_5RP ) / T5_2_TP
Pensioner <- 100 * T4_5RP / T4_5TP
LoneParent <- 100 * (T4_3FOPFCT + T4_3FOPMCT) / T4_5TF
DINK <- 100 * T4_5PFF / T4_5TF
NonDependentKids <- 100 * T4_4AGE_GE20F / T4_4TF
RentPublic <- 100 * T6_3_RLAH / T6_3_TH
RentPrivate <- 100 * T6_3_RPLH / T6_3_TH
Flats <- 100 * T6_1_FA_H / T6_1_TH
NoCenHeat <- 100 * T6_5_NCH / T6_5_T
RoomsHH <- (T6_4_1RH + 2*T6_4_2RH + 3*T6_4_3RH + 4*T6_4_4RH + 5*T6_4_5RH + 6*T6_4_6RH + 7*T6_4_7RH + 8*T6_4_GE8RH) / T6_4_TH
PeopleRoom <- T1_1AGETT / (T6_4_1RH + 2*T6_4_2RH + 3*T6_4_3RH + 4*T6_4_4RH + 5*T6_4_5RH + 6*T6_4_6RH + 7*T6_4_7RH + 8*T6_4_GE8RH)
SepticTank <- 100 * T6_7_IST / T6_7_T
HEQual <- 100 * ((T10_4_ODNDT + T10_4_HDPQT + T10_4_PDT + T10_4_DT) / T10_4_TT)
Employed <- 100 * T8_1_WT / T8_1_TT
TwoCars <- 100 * (T15_1_2C + T15_1_3C + T15_1_GE4C) / (T15_1_NC + T15_1_1C + T15_1_2C + T15_1_3C + T15_1_GE4C + T15_1_NSC)
JTWPublic <- 100 * (T11_1_BUW + T11_1_TDLW) / T11_1_TW
HomeWork <- 100 * T9_2_PH / T9_2_PT
LLTI <- 100 * (T12_3_BT + T12_3_VBT) / T12_3_TT
UnpaidCare <- 100 * T12_2_T / T1_1AGETT
Students <- 100 * T8_1_ST / T8_1_TT
Unemployed <- 100 * T8_1_ULGUPJT / T8_1_TT
EconInactFam <- 100 * T8_1_LAHFT / T8_1_TT
Agric <- 100 * T14_1_AFFT / T14_1_TT
Construction <- 100 * T14_1_BCT / T14_1_TT
Manufacturing <- 100 * T14_1_MIT / T14_1_TT
Commerce <- 100 * T14_1_CTT / T14_1_TT
Transport <- 100 * T14_1_TCT / T14_1_TT
Public <- 100 * T14_1_PAT / T14_1_TT
Professional <- 100 * T14_1_PST / T14_1_TT
#MISC
Broadband <- 100 * T15_3_B / (T15_3_B + T15_3_OTH) # Internet connected HH with Broadband
Internet <- 100 * (T15_3_B + T15_3_OTH) / T15_3_T # Households with Internet
Place <- data.frame(substring(GEOGID, 8), stringsAsFactors=FALSE)
detach(SA2016)
### Bringing it all together
colnames(Place)[1] <- 'GEOGID'
Demographic <- data.frame(Age0_4, Age5_14, Age25_44, Age45_64, Age65over, EU_National, ROW_National, Born_outside_Ireland)
HouseholdComposition <- data.frame(Separated, SinglePerson, Pensioner, LoneParent, DINK, NonDependentKids)
Housing <- data.frame(RentPublic, RentPrivate, Flats, NoCenHeat, RoomsHH, PeopleRoom, SepticTank)
SocioEconomic <- data.frame(HEQual, Employed, TwoCars, JTWPublic, HomeWork, LLTI, UnpaidCare)
Employment <- data.frame(Students, Unemployed, EconInactFam, Agric, Construction, Manufacturing, Commerce, Transport, Public, Professional)
Misc <- data.frame(Internet, Broadband)
DerivedData <- data.frame(Place, Demographic, HouseholdComposition, Housing, SocioEconomic, Employment, Misc)
if (na.rm) DerivedData[which(is.na(DerivedData),arr.ind=T)] <- 0
DerivedData
}
After organising the data, we read it in and compute the derived variables as explained above. At the end of these steps we have a working data set.
SAData <- read.csv(file="SAPS2016_SA2017.csv", header = TRUE, sep=",", stringsAsFactors=FALSE)
SAVars <- CreateVariables(SAData)
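A quick sanity check of the result is useful before clustering; the column count below follows from the variable groups defined in CreateVariables (a small sketch, not part of the original workflow).
dim(SAVars)                 # one row per Small Area; GEOGID plus 40 derived variables
summary(SAVars$Unemployed)  # e.g. inspect one of the derived percentages
anyNA(SAVars)               # should be FALSE when na.rm = TRUE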
Cluster Analysis
We will be using k-means cluster analysis (1) and, as a first step, we will be running a principal component analysis. After examining the proportion of variance explained by each component, we will look at the cumulative proportion.
PCA <- princomp(SAVars[, -1], cor = T, scores = T)
PCA$sdev^2/sum(PCA$sdev^2)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## 0.2484693804 0.1421954394 0.0891606551 0.0758493378 0.0400537948
## Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## 0.0332321155 0.0308810401 0.0267886445 0.0245470766 0.0236552256
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## 0.0215870004 0.0199187591 0.0191543176 0.0181670433 0.0170903608
## Comp.16 Comp.17 Comp.18 Comp.19 Comp.20
## 0.0164860334 0.0148776432 0.0137297064 0.0127655558 0.0114793411
## Comp.21 Comp.22 Comp.23 Comp.24 Comp.25
## 0.0105019385 0.0104688446 0.0091711753 0.0088711172 0.0079172581
## Comp.26 Comp.27 Comp.28 Comp.29 Comp.30
## 0.0070287236 0.0064817850 0.0063391058 0.0054675320 0.0048938867
## Comp.31 Comp.32 Comp.33 Comp.34 Comp.35
## 0.0042540572 0.0039313291 0.0028685743 0.0025762038 0.0024743384
## Comp.36 Comp.37 Comp.38 Comp.39 Comp.40
## 0.0023213324 0.0016691996 0.0014561298 0.0009091458 0.0003098518
cumsum(PCA$sdev^2/sum(PCA$sdev^2))
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## 0.2484694 0.3906648 0.4798255 0.5556748 0.5957286 0.6289607 0.6598418
## Comp.8 Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14
## 0.6866304 0.7111775 0.7348327 0.7564197 0.7763385 0.7954928 0.8136598
## Comp.15 Comp.16 Comp.17 Comp.18 Comp.19 Comp.20 Comp.21
## 0.8307502 0.8472362 0.8621139 0.8758436 0.8886091 0.9000885 0.9105904
## Comp.22 Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28
## 0.9210593 0.9302304 0.9391015 0.9470188 0.9540475 0.9605293 0.9668684
## Comp.29 Comp.30 Comp.31 Comp.32 Comp.33 Comp.34 Comp.35
## 0.9723360 0.9772298 0.9814839 0.9854152 0.9882838 0.9908600 0.9933343
## Comp.36 Comp.37 Comp.38 Comp.39 Comp.40
## 0.9956557 0.9973249 0.9987810 0.9996901 1.0000000
As the cumulative proportions of explained variance show, the first 14 components together explain about 81.4% of the total variance. We will use this number of components for the k-means cluster analysis. Note that the number of principal components retained should be updated based on your own analysis.
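The same cut-off can also be read off programmatically from the PCA object created above; for example, the first component at which the cumulative proportion passes 80% (a small sketch):
prop_var <- PCA$sdev^2 / sum(PCA$sdev^2)
which(cumsum(prop_var) >= 0.8)[1]  # returns 14 for this data set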
# For each candidate number of clusters (1 to 100), record the total
# within-cluster sum of squares and the size of the smallest cluster
smallest.clus <- wss <- rep(0, 100)
for (i in 1:100) {
clus <- kmeans(PCA$scores[, 1:14], i, iter.max = 100, nstart = 20)
wss[i] <- clus$tot.withinss
smallest.clus[i] <- min(clus$size)
}
plot(1:100, wss[1:100], type = "h", main = "Cluster Scree Plot", xlab = "Number of Clusters",
ylab = "Within Cluster Sum of Squares")
“A further issue is the size of the clusters. One problem frequently occurring with \( k \)-means cluster analysis is that very unusual cases tend to break off into small clusters - largely because they aim to minimise the within-cluster sum of squares, and small groups of similar but outlying observations contribute highly to this quantity unless they are assigned their own cluster. This sometimes means that increasing the number of clusters tends to create these very small ‘outlier’ clusters. A helpful diagnostic is to plot the smallest cluster size against the total number of clusters” (Brunsdon et al., 2011).
plot(1:100, smallest.clus[1:100], type = "h", main = "Smallest Cluster Plot",
xlab = "Number of Clusters", ylab = "Smallest Cluster Size")
When the number of clusters is more than 8, very small clusters become noticeably more common. Based on this, we can recalculate the classification with 8 clusters. Note that the number of clusters should be updated based on your own analysis.
SAclus <- kmeans(PCA$scores[, 1:14], 8, iter.max = 100, nstart = 20)
SAclusters <- SAclus$cluster
Diagnostics
“Since the above computation applied the analysis to principal components it is helpful to characterise the clusters in terms of the original variables used in the PCA. To do this, firstly the \( z \)-scores of each variable for the cluster centroids are computed. If the initial variable is \( x \), then the \( z \)-score is given by
\[ z = \frac{x - \bar{x}}{s} \]
where \( \bar{x} \) is the arithmetic mean of the \( x \) values in the data set, and \( s \) is the sample standard deviation defined by
\[ s = + \sqrt{\frac{\sum{(x - \bar{x})^2}}{n - 1}} \]
this standardisation allows values of each variable to be compared on a consistent scale - the \( z \)-score in each case has a mean of zero and a standard deviation of one. Here, the \( x \) values are actually cluster-wise mean values, whereas \( \bar{x} \) and \( s \) are computed for values for all Small Areas. This is useful for identifying which clusters have relatively high or low values of particular variables. The following code creates a set of \( z \)-scores for each cluster” (Brunsdon et al., 2011).
# We need this for the 'ddply' function
library(plyr)
# Compute a data frame (one row per cluster) containing the means of each variable in that cluster
mean_by_cluster <- ddply(SAVars, .(SAclusters), numcolwise(mean))[, -1]
# Compute the column-wise means for *all* observations
mean_by_col <- apply(SAVars[, -1], 2, mean)
# Compute the column-wise *sample* sd's for *all* observations
sd_by_col <- apply(SAVars[, -1], 2, sd)
# Create the z-scores via the 'scale' function
z_scores <- scale(mean_by_cluster, center = mean_by_col, scale = sd_by_col)
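As a quick check of the formula above (a sketch, not part of the original workflow; v is just an illustrative choice of variable), the scale() call should reproduce the hand-computed z-scores for the cluster-wise means of any single variable:
v <- colnames(mean_by_cluster)[1]  # pick one derived variable
all.equal(as.numeric(z_scores[, v]),
          (mean_by_cluster[[v]] - mean_by_col[v]) / sd_by_col[v])  # should be TRUE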
These values can be visualised for further exploration; the heatmap function is one option.
library(RColorBrewer)
heatmap(t(z_scores),
scale = 'none',
col=brewer.pal(6,'BrBG'),
breaks=c(-1e10,-2,-1,0,1,2,+1e10),
xlab='Cluster Number',
add.expr=abline(h=(0:40)+0.5,v=(0:8)+0.5,col='white'))
We can also check the distribution of the cluster sizes.
barplot(sort(table(SAclusters)), las = 3)
Using PAM Instead of \( k \)-Means
As Brunsdon et al. (2011) mention, an alternative clustering technique is Kaufman and Rousseeuw’s Partitioning Around Medoids (PAM) algorithm (2). Note that the numbers may differ depending on your own analysis results. After this stage we will be repeating the previous steps: computing the \( z \)-scores, presenting the values in a heatmap and producing a barplot.
library(cluster)
PAMclus <- pam(x = PCA$scores[, 1:14], k = 8, pamonce = 2)
PAMclusters <- PAMclus$clustering
mean_by_cluster2 <- ddply(SAVars, .(PAMclusters), numcolwise(mean))[, -1]
z_scores2 <- scale(mean_by_cluster2, center = mean_by_col, scale = sd_by_col)
heatmap(t(z_scores2),
scale = 'none',
col=brewer.pal(7,'BrBG'),
breaks=c(-1e10,-2,-1,-0.5,0.5,1,2,+1e10),
xlab='Cluster Number',
add.expr=abline(h=(0:40)+0.5,v=(0:8)+0.5,col='white'))
barplot(sort(table(PAMclusters)), las = 3)
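To see how far the two partitions agree, the k-means and PAM labels can be cross-tabulated (a quick sketch; the cluster numbers themselves are arbitrary, so agreement shows up as each row being dominated by a single column rather than as a neat diagonal).
table(kmeans = SAclusters, pam = PAMclusters)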
Mapping Diagnostics
There are many possible ways to map your clusters. Some of the alternatives will be covered in this section.
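The tmap code below assumes a spatial object m, the Small Area boundaries with the PAM cluster labels attached, saved earlier with save(m, file = "m"). One way such an object might be built is sketched here; the boundary file name, the GEOGID field name and the use of sf are assumptions rather than part of the original workflow.
library(sf)
# Read the Small Area boundary layer (file name is illustrative)
boundaries <- st_read("small_area_boundaries.shp")
# Attach the PAM cluster labels, matching on the Small Area identifier,
# assumed here to be called GEOGID in both objects
lookup <- data.frame(GEOGID = SAVars$GEOGID, PAMclusters = factor(PAMclusters))
m <- merge(boundaries, lookup, by = "GEOGID")
save(m, file = "m")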
#tmap option
library(tmap)
load("m")
tm_shape(m)+
tm_fill("PAMclusters", title="clusters", legend.show=FALSE)
#if you want to add facets as well
tm_shape(m)+
tm_fill("PAMclusters", title="clusters", legend.show=FALSE)+
tm_facets("PAMclusters")
Another alternative display style is Leaflet-Shiny integration. This is a useful tool when you want some interactivity for your map and data. Here are some images from the output.
You can see the live application through this link
Leaflet-Shiny example
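The application itself is not reproduced here, but a minimal leaflet sketch of the same idea follows, assuming m is an sf object in WGS84 (longitude/latitude) with a PAMclusters column; reproject first if your boundaries use the Irish grid.
library(leaflet)
pal <- colorFactor("Set2", domain = m$PAMclusters)  # one colour per cluster
leaflet(m) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(fillColor = ~pal(PAMclusters), fillOpacity = 0.7,
              weight = 0.2, color = "white",
              label = ~paste("Cluster", PAMclusters)) %>%
  addLegend(pal = pal, values = ~PAMclusters, title = "Cluster")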
You might also want to try the flexdashboard option for your map output. This alternative also has interactive display tools. For the live application you can visit the link
Flexdashboard example
References
(1): Hartigan, J. A. and Wong, M. A., 1979. A K-means clustering algorithm. Applied Statistics, 28, 100-108.
(2): Kaufman, L. and Rousseeuw, P. J., 1990. Finding Groups in Data: An Introduction to Cluster Analysis. John Wiley and Sons, New York.