rpub link: https://rpubs.com/sakiyasuda/687678
packages = c('rgdal', 'spdep', 'tmap', 'sf', 'ggpubr', 'cluster', 'factoextra', 'NbClust', 'heatmaply', 'corrplot', 'psych', 'tidyverse','geobr','mstknnclust' )
for (p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
brazil = read.csv("data/aspatial/BRAZIL_CITIES.csv", sep = ";", encoding = "UTF-8")
data_dict = read.csv("data/aspatial/Data_Dictionary.csv", sep = ";", encoding = "UTF-8")
I have extracted Brazil state called “SP”
brazil_filtered <- brazil[brazil$STATE == 'SP',]
brazil_filtered$CITY[brazil_filtered$CITY == "Santa Bárbara D'Oeste"] <- "Santa Bárbara D'oeste"
Filter by state “SP”
muni_fil <- muni[muni$abbrev_state == 'SP',]
macro_metro_join <- left_join(metro_fill, brazil_filtered,
by = c("name_muni" = "CITY"))
I have retrieve the city names from below link https://en.wikipedia.org/wiki/List_of_municipalities_in_S%C3%A3o_Paulo
bp_names <- c("Atibaia",
"Bom Jesus dos Perdões",
"Bragança Paulista",
"Itatiba",
"Jarinu",
"Joanópolis",
"Morungaba",
"Nazaré Paulista",
"Piracaia",
"Tuiuti",
"Vargem"
)
Filter out the Sao Paulo Municipality data
muni_fill_sao <- muni_fil %>% filter(tolower(name_muni) %in% tolower(bp_names))
area <- bind_rows(metro_fill, muni_fill_sao)
remove if there is any euplicated value
area <- area[!duplicated(area$name_muni),]
plot(area$geom)
## Join Geospatial and Aspatial
area_join <- area %>% inner_join(brazil_filtered, by = c("abbrev_state" = "STATE", "name_muni" = "CITY"))
choose the column that is used for the analaysis later
data_fil <- c("name_muni",
"IBGE_RES_POP",
"AREA",
"GVA_AGROPEC",
"GVA_INDUSTRY",
"GVA_SERVICES",
"GVA_PUBLIC",
"COMP_A",
"COMP_B",
"COMP_C",
"COMP_D",
"COMP_E",
"COMP_F",
"COMP_G",
"COMP_H",
"COMP_I",
"COMP_J",
"COMP_K",
"COMP_L",
"COMP_M",
"COMP_N",
"COMP_O",
"COMP_P",
"COMP_Q",
"COMP_R",
"COMP_S",
"COMP_T",
"COMP_U",
"geom"
)
data_fil <- area_join[,data_fil]
according to the data_dic, we can understand each Comp data type. Hence, for better understanding we can rename the column
data_fil_new <- c("NAME_MUNI",
"POPULATION",
"AREA",
"GVA_AGROPEC",
"GVA_INDUSTRY",
"GVA_SERVICES",
"GVA_PUBLIC",
"AGRO",
"EXTRACTIVE",
"TRANSFORMATION",
"GAS_ELECT",
"WATER_SEWAGE",
"CONSTRUCTION",
"VEHICLE_REPAIR",
"TRANSPORT_MAIL",
"ACCOM_FOOD",
"INFOCOMM",
"FINANCE",
"REALESTATE",
"SCIENCE_TECHNICAL",
"ADMIN",
"DEFENSE_SS",
"EDUCATION",
"HEALTH_SOCIAL_SERVICE",
"ART_SPORT_RECRE",
"OTHER_SERVICES",
"DOMESTIC_SERVICES",
"INTERNATIONAL_INSTI",
"geom"
)
colnames(data_fil) <- data_fil_new
we can combine companes based on similar cateogies
data_agg <- data_fil %>%
mutate(`AGROPEC` = `AGRO` + `EXTRACTIVE`) %>%
mutate(`Industry` = `GAS_ELECT` + `CONSTRUCTION` + `TRANSFORMATION`) %>%
mutate(`Services` = + `WATER_SEWAGE` + `VEHICLE_REPAIR` + `TRANSPORT_MAIL` + `INFOCOMM` + `FINANCE` + `REALESTATE` + `SCIENCE_TECHNICAL` + `ADMIN`+`ACCOM_FOOD` +`OTHER_SERVICES` + `DOMESTIC_SERVICES` + `INTERNATIONAL_INSTI` + `DEFENSE_SS`+ `ART_SPORT_RECRE`) %>%
mutate(`Public` = `EDUCATION` + `HEALTH_SOCIAL_SERVICE` ) %>%
select(NAME_MUNI, POPULATION, AREA, GVA_AGROPEC, GVA_INDUSTRY, GVA_SERVICES, GVA_PUBLIC, AGROPEC, Industry, Services, Public, geom)
GVA_AGROPEC_gg <- ggplot(data=data_agg,
aes(x= `GVA_AGROPEC`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
GVA_INDUSTRY_gg <- ggplot(data=data_agg,
aes(x= `GVA_INDUSTRY`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
GVA_SERVICES_gg <- ggplot(data=data_agg,
aes(x= `GVA_SERVICES`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
GVA_PUBLIC_gg <- ggplot(data=data_agg,
aes(x= `GVA_PUBLIC`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
AGROPEC_gg <- ggplot(data=data_agg,
aes(x= `AGROPEC`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
Industry_gg <- ggplot(data=data_agg,
aes(x= `Industry`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
Services_gg <- ggplot(data=data_agg,
aes(x= `Services`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
Public_gg <- ggplot(data=data_agg,
aes(x= `Public`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
ggarrange(GVA_AGROPEC_gg, GVA_INDUSTRY_gg, GVA_SERVICES_gg, GVA_PUBLIC_gg, AGROPEC_gg, Industry_gg, Services_gg,Public_gg,
ncol = 2,
nrow = 4)
## Chlorpleth Mapping by visualizing we can understand on how different types of industry are concentrated in Brazil
Based on the choropleth maps, GVA_AGROPEC is concentrated in east. GVA_INDUSTRY & GVA_SERVICES & GVA_PUBLIC are mostly in south area of the state.
tm_shape(data_agg)+
tm_fill("GVA_AGROPEC",
n = 5,
style = "fisher") +
tm_layout(scale=0.6)
tm_shape(data_agg)+
tm_fill("GVA_INDUSTRY",
n = 5,
style = "fisher") +
tm_layout(scale=0.6)
tm_shape(data_agg)+
tm_fill("GVA_SERVICES",
n = 5,
style = "fisher") +
tm_layout(scale=0.6)
tm_shape(data_agg)+
tm_fill("GVA_PUBLIC",
n = 5,
style = "fisher") +
tm_layout(scale=0.6)
tm_shape(data_agg)+
tm_fill("AGROPEC",
n = 5,
style = "fisher") +
tm_layout(scale=0.6)
tm_shape(data_agg)+
tm_fill("Industry",
n = 5,
style = "fisher") +
tm_layout(scale=0.6)
tm_shape(data_agg)+
tm_fill("Services",
n = 5,
style = "fisher") +
tm_layout(scale=0.6)
tm_shape(data_agg)+
tm_fill("Public",
n = 5,
style = "fisher") +
tm_layout(scale=0.6)
## Correlation Analaysis
According to the correlation analysis, GVA_SERVICES anr GVA_PUBLIC are highly corrleate to industry, services and public
corr_analy <- data_agg %>% select(c(`GVA_AGROPEC`, `GVA_INDUSTRY`, `GVA_SERVICES`, `GVA_PUBLIC`, `AGROPEC`, `Industry`, `Services`,`Public`)) %>%st_set_geometry(NULL)
corr_analy.cor = cor(corr_analy)
corrplot.mixed(corr_analy.cor,
lower = "ellipse",
upper = "number",
tl.pos = "lt",
diag = "l",
tl.col = "black",width=10, height=4,number.cex = .55)
#Hierarchial Clustering Filter the clustering variables
cluster <- data_agg %>%
st_set_geometry(NULL) %>%
select("NAME_MUNI","GVA_AGROPEC", "GVA_INDUSTRY", "GVA_SERVICES","GVA_PUBLIC", "AGROPEC", "Industry", "Services","Public")
rename the row by the “name_muni”
row.names(cluster) <- cluster$"NAME_MUNI"
we can drop the “name_muni”
cluster <- select(cluster, c(2:9))
cluster.std <- normalize(cluster)
proxmat <- dist(cluster.std, method = "euclidean")
hclust_ward <- hclust(proxmat, method = 'ward.D')
we can plot the decision tree
plot(hclust_ward, cex = 0.4)
## Selecting the optimal clustering algorithm
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
ac <- function(x) {
agnes(cluster, method = x)$ac
}
map_dbl(m, ac)
## average single complete ward
## 0.9925490 0.9925826 0.9924811 0.9929544
There are three methods are can find optimal clustering number I will be using gap statistic to find the optimal number for cluster
set.seed(12345)
gap_stat <- clusGap(cluster.std, FUN = hcut, nstart = 25, K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = cluster.std, FUNcluster = hcut, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 1
## logW E.logW gap SE.sim
## [1,] 2.142592 3.887897 1.745305 0.02394589
## [2,] 1.989448 3.591081 1.601632 0.02303804
## [3,] 1.709898 3.465527 1.755630 0.02448668
## [4,] 1.566754 3.376340 1.809586 0.02148310
## [5,] 1.464510 3.298227 1.833717 0.01955427
## [6,] 1.298997 3.228299 1.929303 0.01881859
## [7,] 1.250977 3.163612 1.912634 0.01959523
## [8,] 1.145949 3.104285 1.958336 0.01875626
## [9,] 1.071225 3.051907 1.980682 0.01908589
## [10,] 1.033361 3.004548 1.971188 0.01850501
Visualise th eplot by uising :“fviz_gap_stat”
fviz_gap_stat(gap_stat)
From the plot, I can infer that 8 is the best number.The gradient from cluster i-1 whichs is 9-1 equal to 8 ## Interpret The Dendrograms
plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k = 8, border = 2:5)
## Visually deiven hierachical clustering analysis by using heatmaply, we can plot the interactive cluster heatmap
cluster_mat <- data.matrix(cluster)
heatmaply(normalize(cluster_mat),
Colv=NA,
dist_method = "euclidean",
hclust_method = "ward.D",
seriate = "OLO",
colors = Blues,
k_row = 6,
margins = c(NA,200,60,NA),
fontsize_row = 4,
fontsize_col = 5,
main="Geographic Segmentation of Sao Paulo State",
xlab = "ICT Indicators",
ylab = "Townships of Shan State"
)
## Mapping the clusters formed
groups <- as.factor(cutree(hclust_ward, k=8))
data_agg_cluster <- cbind(data_agg, as.matrix(groups)) %>%
rename(`CLUSTER`=`as.matrix.groups.`)
Use qtm() from tmap to plot the choropleth map of how clusters are formed
qtm(data_agg_cluster, "CLUSTER")
# Spatially Constrained Clustering - SKATER approach ## Conver spatialpolygons dataframe
data_sp <- as_Spatial(data_agg)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO"): Discarded datum Unknown based on GRS80 ellipsoid in CRS definition,
## but +towgs84= values preserved
data.nb <- poly2nb(data_sp,)
summary(data.nb)
## Neighbour list object:
## Number of regions: 172
## Number of nonzero links: 872
## Percentage nonzero weights: 2.947539
## Average number of links: 5.069767
## 1 region with no links:
## 100
## Link number distribution:
##
## 0 1 2 3 4 5 6 7 8 9 10 22
## 1 4 14 21 35 31 29 17 9 5 5 1
## 4 least connected regions:
## 8 9 29 36 with 1 link
## 1 most connected region:
## 161 with 22 links
We can understand the neighbours list by ploting data.nb.
plot(data_sp, border=grey(.5))
plot(data.nb, coordinates(data_sp), col="blue", add=TRUE)
From the plot, we can see the most of neighbors is concentrate in south region.
cnt <- card(data.nb)
island_ind <- match(0, cnt)
print(island_ind)
## [1] 100
nearest <- knearneigh(coordinates(data_sp))$nn
data.nb[[island_ind]] <- nearest[island_ind]
# We need to add this island to the neighbours of its connected municipality as well
data.nb[[nearest[island_ind]]] <- c(as.integer(100), as.integer(data.nb[[nearest[island_ind]]]))
lcosts <- nbcosts(data.nb, cluster.std)
data.w <- nb2listw(data.nb, lcosts, style="B")
summary(data.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 172
## Number of nonzero links: 874
## Percentage nonzero weights: 2.9543
## Average number of links: 5.081395
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9 10 22
## 5 14 21 35 30 30 17 9 5 5 1
## 5 least connected regions:
## 8 9 29 36 100 with 1 link
## 1 most connected region:
## 161 with 22 links
##
## Weights style: B
## Weights constants summary:
## n nn S0 S1 S2
## B 172 29584 267.8447 688.0612 14258.25
data.mst <- mstree(data.w)
plot(data_sp, border=gray(.5))
plot.mst(data.mst, coordinates(data_sp),
col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)
## Computing spatially constrained clusters using SKATER method
clust8 <- skater(data.mst[,1:2], cluster.std, method = "euclidean", 8)
ccs8 <- clust8$groups
ccs8
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 9 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 8 1 1 3 1 1 1 3
## [75] 3 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
plot(data_sp, border=gray(.5))
plot(clust8, coordinates(data_sp), cex.lab=.3,
groups.colors=c("blue","green3","red", "brown", "pink","yellow","gray","cyan"), cex.circles=0.005, add=TRUE)
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter
## Visualise Clusters using Chloropleth Map
groups_mat <- as.matrix(clust8$groups)
data_sp_spatialclust <- cbind(data_agg_cluster, as.factor(groups_mat)) %>%
rename(`SP_CLUSTER`=`as.factor.groups_mat.`)
qtm(data_sp_spatialclust, "SP_CLUSTER")
hclust.map <- qtm(data_agg_cluster,
"CLUSTER", legend.outside = T) +
tm_borders(alpha = 0.5)
spclust.map <- qtm(data_sp_spatialclust,
"SP_CLUSTER", legend.outside = T) +
tm_borders(alpha = 0.5)
tmap_arrange(hclust.map, spclust.map,
asp=NA, nrow=2)