rpub link: https://rpubs.com/sakiyasuda/687678

Objective

Installing Packages

packages = c('rgdal', 'spdep', 'tmap', 'sf', 'ggpubr', 'cluster', 'factoextra', 'NbClust', 'heatmaply', 'corrplot', 'psych', 'tidyverse','geobr','mstknnclust' )
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p,character.only = T)
}

Data Wrangling

Import Data into R enviroment

brazil = read.csv("data/aspatial/BRAZIL_CITIES.csv",  sep = ";", encoding = "UTF-8") 
data_dict = read.csv("data/aspatial/Data_Dictionary.csv",  sep = ";", encoding = "UTF-8") 

Filter and Extract data

I have extracted Brazil state called “SP” and city called “Santa Bárbara D’Oeste”

brazil_filtered <- brazil[brazil$STATE == 'SP',]
brazil_filtered$CITY[brazil_filtered$CITY == "Santa Bárbara D'Oeste"] <- "Santa Bárbara D'oeste"

Extract 2016 municipality level

Filter municipality by state “SP”

muni_fil <- muni[muni$abbrev_state  == 'SP',]

Sao Paulo Metropolitian level

Join Macro Metro and Brazil Filtered by city name

macro_metro_join <- left_join(metro_fill, brazil_filtered, 
                              by = c("name_muni" = "CITY"))

Extract Municipalities from Braganca Paulista

I have retrieve the city names from below link https://en.wikipedia.org/wiki/List_of_municipalities_in_S%C3%A3o_Paulo

bp_names <- c("Atibaia",
              "Bom Jesus dos Perdões",
              "Bragança Paulista",
              "Itatiba",
              "Jarinu",
              "Joanópolis",
              "Morungaba",
              "Nazaré Paulista",
              "Piracaia",
              "Tuiuti",
              "Vargem"
              )

Filter out the Sao Paulo Municipality data

muni_fill_sao <- muni_fil %>% filter(tolower(name_muni) %in% tolower(bp_names))

Merge Metropolitian and Municipalities

area <- bind_rows(metro_fill, muni_fill_sao)

remove if there is any duplicated value

area <- area[!duplicated(area$name_muni),]
plot(area$geom)

## Join Geospatial and Aspatial

area_join <- area %>% inner_join(brazil_filtered, by = c("abbrev_state" = "STATE", "name_muni" = "CITY"))

Data Filter and Selection

choose the column that is used for the analysis later

data_fil <- c("name_muni",
              "IBGE_RES_POP", 
              "AREA",
              "GVA_AGROPEC",
              "GVA_INDUSTRY",
              "GVA_SERVICES",
              "GVA_PUBLIC",
              "COMP_A",
              "COMP_B",
              "COMP_C",
              "COMP_D",
              "COMP_E",
              "COMP_F",
              "COMP_G",
              "COMP_H",
              "COMP_I",
              "COMP_J",
              "COMP_K",
              "COMP_L",
              "COMP_M",
              "COMP_N",
              "COMP_O",
              "COMP_P",
              "COMP_Q",
              "COMP_R",
              "COMP_S",
              "COMP_T",
              "COMP_U",
              "geom"
              )

data_fil <- area_join[,data_fil]

According to the data_dic, we can understand each Comp data type. Hence, for better understanding we can rename the column

data_fil_new <- c("NAME_MUNI",
              "POPULATION", 
              "AREA",
              "GVA_AGROPEC",
              "GVA_INDUSTRY",
              "GVA_SERVICES",
              "GVA_PUBLIC",
              "AGRO",
              "EXTRACTIVE",
              "TRANSFORMATION",
              "GAS_ELECT",
              "WATER_SEWAGE",
              "CONSTRUCTION",
              "VEHICLE_REPAIR",
              "TRANSPORT_MAIL",
              "ACCOM_FOOD", 
              "INFOCOMM",
              "FINANCE",
              "REALESTATE",
              "SCIENCE_TECHNICAL",
              "ADMIN",
              "DEFENSE_SS",
              "EDUCATION",
              "HEALTH_SOCIAL_SERVICE",
              "ART_SPORT_RECRE",
              "OTHER_SERVICES",
              "DOMESTIC_SERVICES",
              "INTERNATIONAL_INSTI",
              "geom"
              )

colnames(data_fil) <- data_fil_new

Aggregate the data

we can combine companes based on similar cateogies

data_agg <- data_fil %>% 
  mutate(`AGROPEC` = `AGRO` + `EXTRACTIVE`) %>%
  mutate(`Industry` = `GAS_ELECT` + `CONSTRUCTION` + `TRANSFORMATION`) %>%
  mutate(`Services` =  + `WATER_SEWAGE` + `VEHICLE_REPAIR` + `TRANSPORT_MAIL` + `INFOCOMM` + `FINANCE` + `REALESTATE` + `SCIENCE_TECHNICAL` + `ADMIN`+`ACCOM_FOOD` +`OTHER_SERVICES` + `DOMESTIC_SERVICES` + `INTERNATIONAL_INSTI` + `DEFENSE_SS`+ `ART_SPORT_RECRE`) %>%
  mutate(`Public` =  `EDUCATION` + `HEALTH_SOCIAL_SERVICE` ) %>%
  select(NAME_MUNI, POPULATION, AREA, GVA_AGROPEC, GVA_INDUSTRY, GVA_SERVICES, GVA_PUBLIC, AGROPEC, Industry, Services, Public, geom)

EDA

GVA_AGROPEC_gg <- ggplot(data=data_agg, 
             aes(x= `GVA_AGROPEC`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

GVA_INDUSTRY_gg <- ggplot(data=data_agg, 
             aes(x= `GVA_INDUSTRY`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

GVA_SERVICES_gg <- ggplot(data=data_agg, 
             aes(x= `GVA_SERVICES`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

GVA_PUBLIC_gg <- ggplot(data=data_agg, 
             aes(x= `GVA_PUBLIC`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

AGROPEC_gg <- ggplot(data=data_agg, 
             aes(x= `AGROPEC`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

Industry_gg <- ggplot(data=data_agg, 
             aes(x= `Industry`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

Services_gg <- ggplot(data=data_agg, 
             aes(x= `Services`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

Public_gg <- ggplot(data=data_agg, 
             aes(x= `Public`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")


ggarrange(GVA_AGROPEC_gg, GVA_INDUSTRY_gg, GVA_SERVICES_gg, GVA_PUBLIC_gg, AGROPEC_gg, Industry_gg, Services_gg,Public_gg,
          ncol = 2,
          nrow = 4)

## Chlorpleth Mapping by visualizing we can understand on how different types of industry are concentrated in Brazil

Based on the choropleth maps, GVA_AGROPEC is concentrated in east. GVA_INDUSTRY & GVA_SERVICES & GVA_PUBLIC are mostly in south area of the state. lastly, for AGROPEC, we can see that it is evenly distributed.

tm_shape(data_agg)+
  tm_fill("GVA_AGROPEC",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("GVA_INDUSTRY",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("GVA_SERVICES",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("GVA_PUBLIC",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("AGROPEC",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("Industry",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("Services",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("Public",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

## Correlation Analaysis

According to the correlation analysis, GVA_SERVICES anr GVA_PUBLIC are highly corrleate to industry, services and public

corr_analy <- data_agg %>% select(c(`GVA_AGROPEC`, `GVA_INDUSTRY`, `GVA_SERVICES`, `GVA_PUBLIC`, `AGROPEC`, `Industry`, `Services`,`Public`)) %>%st_set_geometry(NULL)
corr_analy.cor = cor(corr_analy)

corrplot.mixed(corr_analy.cor,
               lower = "ellipse", 
               upper = "number",
               tl.pos = "lt",
               diag = "l",
               tl.col = "black",width=10, height=4,number.cex = .55)

#Hierarchial Clustering Filter the clustering variables

cluster <- data_agg %>%
  st_set_geometry(NULL) %>%
  select("NAME_MUNI","GVA_AGROPEC", "GVA_INDUSTRY", "GVA_SERVICES","GVA_PUBLIC", "AGROPEC", "Industry", "Services","Public")

rename the row by the “NAME_MUNI”

row.names(cluster) <- cluster$"NAME_MUNI"

we can drop the “NAME_MUNI”

cluster <- select(cluster, c(2:9))

Standarise dataset

cluster.std <- normalize(cluster)

calcualte euclidean distance

proxmat <- dist(cluster.std, method = "euclidean")

computing hierarachial clustering

hclust_ward <- hclust(proxmat, method = 'ward.D')

we can plot the decision tree

plot(hclust_ward, cex = 0.4)

## Selecting the optimal clustering algorithm

m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

ac <- function(x) {
  agnes(cluster, method = x)$ac
}

map_dbl(m, ac)
##   average    single  complete      ward 
## 0.9925490 0.9925826 0.9924811 0.9929544

Determining Optimal Clusters

There are three methods are can find optimal clustering number I will be using gap statistic to find the optimal number for cluster

set.seed(12345)
gap_stat <- clusGap(cluster.std, FUN = hcut, nstart = 25, K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = cluster.std, FUNcluster = hcut, K.max = 10, B = 50,     nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 1
##           logW   E.logW      gap     SE.sim
##  [1,] 2.142592 3.887897 1.745305 0.02394589
##  [2,] 1.989448 3.591081 1.601632 0.02303804
##  [3,] 1.709898 3.465527 1.755630 0.02448668
##  [4,] 1.566754 3.376340 1.809586 0.02148310
##  [5,] 1.464510 3.298227 1.833717 0.01955427
##  [6,] 1.298997 3.228299 1.929303 0.01881859
##  [7,] 1.250977 3.163612 1.912634 0.01959523
##  [8,] 1.145949 3.104285 1.958336 0.01875626
##  [9,] 1.071225 3.051907 1.980682 0.01908589
## [10,] 1.033361 3.004548 1.971188 0.01850501

Visualise th eplot by uising :“fviz_gap_stat”

fviz_gap_stat(gap_stat)

Looking at the gap statistic graph, the 8-cluster gives the larger gap statistic. Furthermore, it appears to be flattening and stable therefore 8 clusters should be the best cluster to pick.l.

Interpret The Dendrograms

plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k = 8, border = 2:5)

From the cluster dendrogram, we can see the 8 clusters of municipality grouped together. ## Visually deiven hierachical clustering analysis by using heatmaply, we can plot the interactive cluster heatmap

Transforming the data frame into a matrix

cluster_mat <- data.matrix(cluster)
heatmaply(normalize(cluster_mat),
          Colv=NA,
          dist_method = "euclidean",
          hclust_method = "ward.D",
          seriate = "OLO",
          colors = Blues,
          k_row = 6,
          margins = c(NA,200,60,NA),
          fontsize_row = 4,
          fontsize_col = 5,
          main="Geographic Segmentation of  Sao Paulo State",
          xlab = "ICT Indicators",
          ylab = "Townships of Shan State"
          )

## Mapping the clusters formed

groups <- as.factor(cutree(hclust_ward, k=8))
data_agg_cluster <- cbind(data_agg, as.matrix(groups)) %>%
  rename(`CLUSTER`=`as.matrix.groups.`)

Use qtm() from tmap to plot the choropleth map of how clusters are formed

qtm(data_agg_cluster, "CLUSTER")

# Spatially Constrained Clustering - SKATER approach ## Conver spatialpolygons dataframe

data_sp <- as_Spatial(data_agg)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO"): Discarded datum Unknown based on GRS80 ellipsoid in CRS definition,
##  but +towgs84= values preserved

Compute Neighbour List

data.nb <- poly2nb(data_sp,)
summary(data.nb)
## Neighbour list object:
## Number of regions: 172 
## Number of nonzero links: 872 
## Percentage nonzero weights: 2.947539 
## Average number of links: 5.069767 
## 1 region with no links:
## 100
## Link number distribution:
## 
##  0  1  2  3  4  5  6  7  8  9 10 22 
##  1  4 14 21 35 31 29 17  9  5  5  1 
## 4 least connected regions:
## 8 9 29 36 with 1 link
## 1 most connected region:
## 161 with 22 links

We can understand the neighbours list by ploting data.nb.

plot(data_sp, border=grey(.5))
plot(data.nb, coordinates(data_sp), col="blue", add=TRUE)

From the plot, we can see the most of neighbors is concentrate in south region.

Computing minimum spanning tree

Artificially add in neighbour for the island municipality

cnt <- card(data.nb)
island_ind <- match(0, cnt)
print(island_ind)
## [1] 100
nearest <- knearneigh(coordinates(data_sp))$nn
data.nb[[island_ind]] <- nearest[island_ind]

# We need to add this island to the neighbours of its connected municipality as well
data.nb[[nearest[island_ind]]] <- c(as.integer(100), as.integer(data.nb[[nearest[island_ind]]]))

calculating edges costs

lcosts <- nbcosts(data.nb, cluster.std)
data.w <- nb2listw(data.nb, lcosts, style="B")
summary(data.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 172 
## Number of nonzero links: 874 
## Percentage nonzero weights: 2.9543 
## Average number of links: 5.081395 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 22 
##  5 14 21 35 30 30 17  9  5  5  1 
## 5 least connected regions:
## 8 9 29 36 100 with 1 link
## 1 most connected region:
## 161 with 22 links
## 
## Weights style: B 
## Weights constants summary:
##     n    nn       S0       S1       S2
## B 172 29584 267.8447 688.0612 14258.25

Computing minimum spanning tree

data.mst <- mstree(data.w)
plot(data_sp, border=gray(.5))
plot.mst(data.mst, coordinates(data_sp), 
     col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)

## Computing spatially constrained clusters using SKATER method

clust8 <- skater(data.mst[,1:2], cluster.std, method = "euclidean", 8)
ccs8 <- clust8$groups
ccs8
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 9 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 8 1 1 3 1 1 1 3
##  [75] 3 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
plot(data_sp, border=gray(.5))
plot(clust8, coordinates(data_sp), cex.lab=.3,
     groups.colors=c("blue","green3","red", "brown", "pink","yellow","gray","cyan"), cex.circles=0.005, add=TRUE)
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Visualise Clusters using Chloropleth Map

groups_mat <- as.matrix(clust8$groups)
data_sp_spatialclust <- cbind(data_agg_cluster, as.factor(groups_mat)) %>%
  rename(`SP_CLUSTER`=`as.factor.groups_mat.`)
qtm(data_sp_spatialclust, "SP_CLUSTER")

hclust.map <- qtm(data_agg_cluster,
                  "CLUSTER", legend.outside = T) + 
  tm_borders(alpha = 0.5) 

spclust.map <- qtm(data_sp_spatialclust,
                   "SP_CLUSTER", legend.outside = T) + 
  tm_borders(alpha = 0.5) 

tmap_arrange(hclust.map, spclust.map,
             asp=NA, nrow=2)

From the two choropleth graphs shown above, he hierarchical method generates a graph that is very messy, the municipality of the clusters are close to each other. it is more separated apart.

However, SKATER method graph shows clear cluster and the municipalities are clustered together beside each other.