rpub link: https://rpubs.com/sakiyasuda/687678

Objective

Installing Packages

packages = c('rgdal', 'spdep', 'tmap', 'sf', 'ggpubr', 'cluster', 'factoextra', 'NbClust', 'heatmaply', 'corrplot', 'psych', 'tidyverse','geobr','mstknnclust' )
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p,character.only = T)
}

Data Wrangling

Import Data into R enviroment

brazil = read.csv("data/aspatial/BRAZIL_CITIES.csv",  sep = ";", encoding = "UTF-8") 
data_dict = read.csv("data/aspatial/Data_Dictionary.csv",  sep = ";", encoding = "UTF-8") 

Filter and Extract data

I have extracted Brazil state called “SP”

brazil_filtered <- brazil[brazil$STATE == 'SP',]
brazil_filtered$CITY[brazil_filtered$CITY == "Santa Bárbara D'Oeste"] <- "Santa Bárbara D'oeste"

Extract 2016 municipality level

Filter by state “SP”

muni_fil <- muni[muni$abbrev_state  == 'SP',]

Sao Paulo Metropolitian level

Join Macro Metro and Brazil Filtered by city name

macro_metro_join <- left_join(metro_fill, brazil_filtered, 
                              by = c("name_muni" = "CITY"))

Extract Municipalities from Braganca Paulista

I have retrieve the city names from below link https://en.wikipedia.org/wiki/List_of_municipalities_in_S%C3%A3o_Paulo

bp_names <- c("Atibaia",
              "Bom Jesus dos Perdões",
              "Bragança Paulista",
              "Itatiba",
              "Jarinu",
              "Joanópolis",
              "Morungaba",
              "Nazaré Paulista",
              "Piracaia",
              "Tuiuti",
              "Vargem"
              )

Filter out the Sao Paulo Municipality data

muni_fill_sao <- muni_fil %>% filter(tolower(name_muni) %in% tolower(bp_names))

Merge Metropolitian and Municipalities

area <- bind_rows(metro_fill, muni_fill_sao)

remove if there is any euplicated value

area <- area[!duplicated(area$name_muni),]
plot(area$geom)

## Join Geospatial and Aspatial

area_join <- area %>% inner_join(brazil_filtered, by = c("abbrev_state" = "STATE", "name_muni" = "CITY"))

Data Filter and Selection

choose the column that is used for the analaysis later

data_fil <- c("name_muni",
              "IBGE_RES_POP", 
              "AREA",
              "GVA_AGROPEC",
              "GVA_INDUSTRY",
              "GVA_SERVICES",
              "GVA_PUBLIC",
              "COMP_A",
              "COMP_B",
              "COMP_C",
              "COMP_D",
              "COMP_E",
              "COMP_F",
              "COMP_G",
              "COMP_H",
              "COMP_I",
              "COMP_J",
              "COMP_K",
              "COMP_L",
              "COMP_M",
              "COMP_N",
              "COMP_O",
              "COMP_P",
              "COMP_Q",
              "COMP_R",
              "COMP_S",
              "COMP_T",
              "COMP_U",
              "geom"
              )

data_fil <- area_join[,data_fil]

according to the data_dic, we can understand each Comp data type. Hence, for better understanding we can rename the column

data_fil_new <- c("NAME_MUNI",
              "POPULATION", 
              "AREA",
              "GVA_AGROPEC",
              "GVA_INDUSTRY",
              "GVA_SERVICES",
              "GVA_PUBLIC",
              "AGRO",
              "EXTRACTIVE",
              "TRANSFORMATION",
              "GAS_ELECT",
              "WATER_SEWAGE",
              "CONSTRUCTION",
              "VEHICLE_REPAIR",
              "TRANSPORT_MAIL",
              "ACCOM_FOOD", 
              "INFOCOMM",
              "FINANCE",
              "REALESTATE",
              "SCIENCE_TECHNICAL",
              "ADMIN",
              "DEFENSE_SS",
              "EDUCATION",
              "HEALTH_SOCIAL_SERVICE",
              "ART_SPORT_RECRE",
              "OTHER_SERVICES",
              "DOMESTIC_SERVICES",
              "INTERNATIONAL_INSTI",
              "geom"
              )

colnames(data_fil) <- data_fil_new

Aggregate the data

we can combine companes based on similar cateogies

data_agg <- data_fil %>% 
  mutate(`AGROPEC` = `AGRO` + `EXTRACTIVE`) %>%
  mutate(`Industry` = `GAS_ELECT` + `CONSTRUCTION` + `TRANSFORMATION`) %>%
  mutate(`Services` =  + `WATER_SEWAGE` + `VEHICLE_REPAIR` + `TRANSPORT_MAIL` + `INFOCOMM` + `FINANCE` + `REALESTATE` + `SCIENCE_TECHNICAL` + `ADMIN`+`ACCOM_FOOD` +`OTHER_SERVICES` + `DOMESTIC_SERVICES` + `INTERNATIONAL_INSTI` + `DEFENSE_SS`+ `ART_SPORT_RECRE`) %>%
  mutate(`Public` =  `EDUCATION` + `HEALTH_SOCIAL_SERVICE` ) %>%
  select(NAME_MUNI, POPULATION, AREA, GVA_AGROPEC, GVA_INDUSTRY, GVA_SERVICES, GVA_PUBLIC, AGROPEC, Industry, Services, Public, geom)

EDA

GVA_AGROPEC_gg <- ggplot(data=data_agg, 
             aes(x= `GVA_AGROPEC`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

GVA_INDUSTRY_gg <- ggplot(data=data_agg, 
             aes(x= `GVA_INDUSTRY`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

GVA_SERVICES_gg <- ggplot(data=data_agg, 
             aes(x= `GVA_SERVICES`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

GVA_PUBLIC_gg <- ggplot(data=data_agg, 
             aes(x= `GVA_PUBLIC`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

AGROPEC_gg <- ggplot(data=data_agg, 
             aes(x= `AGROPEC`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

Industry_gg <- ggplot(data=data_agg, 
             aes(x= `Industry`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

Services_gg <- ggplot(data=data_agg, 
             aes(x= `Services`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

Public_gg <- ggplot(data=data_agg, 
             aes(x= `Public`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")


ggarrange(GVA_AGROPEC_gg, GVA_INDUSTRY_gg, GVA_SERVICES_gg, GVA_PUBLIC_gg, AGROPEC_gg, Industry_gg, Services_gg,Public_gg,
          ncol = 2,
          nrow = 4)

## Chlorpleth Mapping by visualizing we can understand on how different types of industry are concentrated in Brazil

Based on the choropleth maps, GVA_AGROPEC is concentrated in east. GVA_INDUSTRY & GVA_SERVICES & GVA_PUBLIC are mostly in south area of the state.

tm_shape(data_agg)+
  tm_fill("GVA_AGROPEC",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("GVA_INDUSTRY",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("GVA_SERVICES",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("GVA_PUBLIC",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("AGROPEC",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("Industry",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("Services",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

tm_shape(data_agg)+
  tm_fill("Public",
          n = 5,
          style = "fisher") +
  tm_layout(scale=0.6)

## Correlation Analaysis

According to the correlation analysis, GVA_SERVICES anr GVA_PUBLIC are highly corrleate to industry, services and public

corr_analy <- data_agg %>% select(c(`GVA_AGROPEC`, `GVA_INDUSTRY`, `GVA_SERVICES`, `GVA_PUBLIC`, `AGROPEC`, `Industry`, `Services`,`Public`)) %>%st_set_geometry(NULL)
corr_analy.cor = cor(corr_analy)

corrplot.mixed(corr_analy.cor,
               lower = "ellipse", 
               upper = "number",
               tl.pos = "lt",
               diag = "l",
               tl.col = "black",width=10, height=4,number.cex = .55)

#Hierarchial Clustering Filter the clustering variables

cluster <- data_agg %>%
  st_set_geometry(NULL) %>%
  select("NAME_MUNI","GVA_AGROPEC", "GVA_INDUSTRY", "GVA_SERVICES","GVA_PUBLIC", "AGROPEC", "Industry", "Services","Public")

rename the row by the “name_muni”

row.names(cluster) <- cluster$"NAME_MUNI"

we can drop the “name_muni”

cluster <- select(cluster, c(2:9))

Standarise dataset

cluster.std <- normalize(cluster)

calcualte euclidean distance

proxmat <- dist(cluster.std, method = "euclidean")

computing hierarachial clustering

hclust_ward <- hclust(proxmat, method = 'ward.D')

we can plot the decision tree

plot(hclust_ward, cex = 0.4)

## Selecting the optimal clustering algorithm

m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

ac <- function(x) {
  agnes(cluster, method = x)$ac
}

map_dbl(m, ac)
##   average    single  complete      ward 
## 0.9925490 0.9925826 0.9924811 0.9929544

Determining Optimal Clusters

There are three methods are can find optimal clustering number I will be using gap statistic to find the optimal number for cluster

set.seed(12345)
gap_stat <- clusGap(cluster.std, FUN = hcut, nstart = 25, K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = cluster.std, FUNcluster = hcut, K.max = 10, B = 50,     nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 1
##           logW   E.logW      gap     SE.sim
##  [1,] 2.142592 3.887897 1.745305 0.02394589
##  [2,] 1.989448 3.591081 1.601632 0.02303804
##  [3,] 1.709898 3.465527 1.755630 0.02448668
##  [4,] 1.566754 3.376340 1.809586 0.02148310
##  [5,] 1.464510 3.298227 1.833717 0.01955427
##  [6,] 1.298997 3.228299 1.929303 0.01881859
##  [7,] 1.250977 3.163612 1.912634 0.01959523
##  [8,] 1.145949 3.104285 1.958336 0.01875626
##  [9,] 1.071225 3.051907 1.980682 0.01908589
## [10,] 1.033361 3.004548 1.971188 0.01850501

Visualise th eplot by uising :“fviz_gap_stat”

fviz_gap_stat(gap_stat)

From the plot, I can infer that 8 is the best number.The gradient from cluster i-1 whichs is 9-1 equal to 8 ## Interpret The Dendrograms

plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k = 8, border = 2:5)

## Visually deiven hierachical clustering analysis by using heatmaply, we can plot the interactive cluster heatmap

Transforming the data frame into a matrix

cluster_mat <- data.matrix(cluster)
heatmaply(normalize(cluster_mat),
          Colv=NA,
          dist_method = "euclidean",
          hclust_method = "ward.D",
          seriate = "OLO",
          colors = Blues,
          k_row = 6,
          margins = c(NA,200,60,NA),
          fontsize_row = 4,
          fontsize_col = 5,
          main="Geographic Segmentation of  Sao Paulo State",
          xlab = "ICT Indicators",
          ylab = "Townships of Shan State"
          )

## Mapping the clusters formed

groups <- as.factor(cutree(hclust_ward, k=8))
data_agg_cluster <- cbind(data_agg, as.matrix(groups)) %>%
  rename(`CLUSTER`=`as.matrix.groups.`)

Use qtm() from tmap to plot the choropleth map of how clusters are formed

qtm(data_agg_cluster, "CLUSTER")

# Spatially Constrained Clustering - SKATER approach ## Conver spatialpolygons dataframe

data_sp <- as_Spatial(data_agg)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO"): Discarded datum Unknown based on GRS80 ellipsoid in CRS definition,
##  but +towgs84= values preserved

Compute Neighbour List

data.nb <- poly2nb(data_sp,)
summary(data.nb)
## Neighbour list object:
## Number of regions: 172 
## Number of nonzero links: 872 
## Percentage nonzero weights: 2.947539 
## Average number of links: 5.069767 
## 1 region with no links:
## 100
## Link number distribution:
## 
##  0  1  2  3  4  5  6  7  8  9 10 22 
##  1  4 14 21 35 31 29 17  9  5  5  1 
## 4 least connected regions:
## 8 9 29 36 with 1 link
## 1 most connected region:
## 161 with 22 links

We can understand the neighbours list by ploting data.nb.

plot(data_sp, border=grey(.5))
plot(data.nb, coordinates(data_sp), col="blue", add=TRUE)

From the plot, we can see the most of neighbors is concentrate in south region.

Computing minimum spanning tree

Artificially add in neighbour for the island municipality

cnt <- card(data.nb)
island_ind <- match(0, cnt)
print(island_ind)
## [1] 100
nearest <- knearneigh(coordinates(data_sp))$nn
data.nb[[island_ind]] <- nearest[island_ind]

# We need to add this island to the neighbours of its connected municipality as well
data.nb[[nearest[island_ind]]] <- c(as.integer(100), as.integer(data.nb[[nearest[island_ind]]]))

calculating edges costs

lcosts <- nbcosts(data.nb, cluster.std)
data.w <- nb2listw(data.nb, lcosts, style="B")
summary(data.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 172 
## Number of nonzero links: 874 
## Percentage nonzero weights: 2.9543 
## Average number of links: 5.081395 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 22 
##  5 14 21 35 30 30 17  9  5  5  1 
## 5 least connected regions:
## 8 9 29 36 100 with 1 link
## 1 most connected region:
## 161 with 22 links
## 
## Weights style: B 
## Weights constants summary:
##     n    nn       S0       S1       S2
## B 172 29584 267.8447 688.0612 14258.25

Computing minimum spanning tree

data.mst <- mstree(data.w)
plot(data_sp, border=gray(.5))
plot.mst(data.mst, coordinates(data_sp), 
     col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)

## Computing spatially constrained clusters using SKATER method

clust8 <- skater(data.mst[,1:2], cluster.std, method = "euclidean", 8)
ccs8 <- clust8$groups
ccs8
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 9 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 8 1 1 3 1 1 1 3
##  [75] 3 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
plot(data_sp, border=gray(.5))
plot(clust8, coordinates(data_sp), cex.lab=.3,
     groups.colors=c("blue","green3","red", "brown", "pink","yellow","gray","cyan"), cex.circles=0.005, add=TRUE)
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Visualise Clusters using Chloropleth Map

groups_mat <- as.matrix(clust8$groups)
data_sp_spatialclust <- cbind(data_agg_cluster, as.factor(groups_mat)) %>%
  rename(`SP_CLUSTER`=`as.factor.groups_mat.`)
qtm(data_sp_spatialclust, "SP_CLUSTER")

hclust.map <- qtm(data_agg_cluster,
                  "CLUSTER", legend.outside = T) + 
  tm_borders(alpha = 0.5) 

spclust.map <- qtm(data_sp_spatialclust,
                   "SP_CLUSTER", legend.outside = T) + 
  tm_borders(alpha = 0.5) 

tmap_arrange(hclust.map, spclust.map,
             asp=NA, nrow=2)