Load Packages

packages = c('rgdal', 'spdep', 'ClustGeo',  'tmap', 'sf', 'ggpubr', 'cluster', 'heatmaply', 'corrplot', 'psych', 'tidyverse','units')
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
    }
  library(p,character.only = T)
}
## Loading required package: rgdal
## Loading required package: sp
## rgdal: version: 1.4-8, (SVN revision 845)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
##  GDAL binary built with GEOS: FALSE 
##  Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
##  Linking to sp version: 1.3-2
## Loading required package: spdep
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
## Loading required package: ClustGeo
## Loading required package: tmap
## Loading required package: ggpubr
## Loading required package: ggplot2
## Loading required package: cluster
## Loading required package: heatmaply
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: viridis
## Loading required package: viridisLite
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
## 
## ======================
## Welcome to heatmaply version 1.1.0
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
## ======================
## Loading required package: corrplot
## corrplot 0.84 loaded
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## Loading required package: tidyverse
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ✓ purrr   0.3.3
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%()    masks ggplot2::%+%()
## x psych::alpha()  masks ggplot2::alpha()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: units
## udunits system database from /Library/Frameworks/R.framework/Versions/3.6/Resources/library/units/share/udunits

Data Import and Prepatation

business_sf <- st_read(dsn = "data/geospatial", layer = "Business")
## Reading layer `Business' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 6550 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6147 ymin: 1.24605 xmax: 104.0044 ymax: 1.4698
## CRS:            4326
financial_sf <- st_read(dsn = "data/geospatial", layer = "Financial")
## Reading layer `Financial' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3320 features and 29 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6256 ymin: 1.24392 xmax: 103.9998 ymax: 1.46247
## CRS:            4326
govt_embassy_sf <- st_read(dsn = "data/geospatial", layer = "Govt_Embassy")
## Reading layer `Govt_Embassy' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 443 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6282 ymin: 1.24911 xmax: 103.9884 ymax: 1.45765
## CRS:            4326
private_residential_sf <- st_read(dsn = "data/geospatial", layer = "Private residential")
## Reading layer `Private residential' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3604 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6295 ymin: 1.23943 xmax: 103.9749 ymax: 1.45379
## CRS:            4326
shoppings_sf <- st_read(dsn = "data/geospatial", layer = "Shopping")
## Reading layer `Shopping' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 511 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.679 ymin: 1.24779 xmax: 103.9644 ymax: 1.4535
## CRS:            4326
subzone <- st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL")
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
costaloutline <- st_read(dsn = "data/geospatial", layer = "CostalOutline")
## Reading layer `CostalOutline' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 60 features and 4 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 2663.926 ymin: 16357.98 xmax: 56047.79 ymax: 50244.03
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
subzone <- st_transform(subzone, crs=4326)

Find area of each subzone

subzone$area <- set_units(st_area(subzone), km^2)

Load aspatial data

pop_sz <- read_csv ("data/aspatial/planning-area-subzone-age-group-sex-and-type-of-dwelling-june-2011-2019.csv")
## Parsed with column specification:
## cols(
##   planning_area = col_character(),
##   subzone = col_character(),
##   age_group = col_character(),
##   sex = col_character(),
##   type_of_dwelling = col_character(),
##   resident_count = col_double(),
##   year = col_double()
## )

Filter for 2019

# to upper for joining
pop_sz <- pop_sz %>% 
                mutate(subzone = toupper(subzone)) %>% 
                filter(year == 2019)
summary(pop_sz)
##  planning_area        subzone           age_group             sex           
##  Length:98192       Length:98192       Length:98192       Length:98192      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  type_of_dwelling   resident_count         year     
##  Length:98192       Min.   :   0.00   Min.   :2019  
##  Class :character   1st Qu.:   0.00   1st Qu.:2019  
##  Mode  :character   Median :   0.00   Median :2019  
##                     Mean   :  41.08   Mean   :2019  
##                     3rd Qu.:  20.00   3rd Qu.:2019  
##                     Max.   :2440.00   Max.   :2019

Filtering for segments of population

Total Population

total_pop_sz <- pop_sz %>% 
                      group_by(subzone) %>%
                      summarise(total_pop = sum(resident_count))

Filter Economy Active Population

econ_active_pop <- pop_sz %>% filter(age_group == "25_to_29" | 
                                     age_group == "30_to_34"| 
                                     age_group == "35_to_39" |
                                      age_group == "40_to_44" |
                                      age_group == "45_to_49" |
                                      age_group == "50_to_54" |
                                      age_group == "50_to_54" |
                                      age_group == "55_to_60" |
                                      age_group == "60_to_64"
                                     )
econ_active_pop <- econ_active_pop %>% group_by(subzone) %>%
                      summarise(econ_active_pop = sum(resident_count))

Young Population

young_pop <- pop_sz %>% filter(age_group == "0_to_4" | 
                                     age_group == "5_to_9"| 
                                     age_group == "10_to_14" |
                                      age_group == "15_to_19" |
                                      age_group == "20_to_24") %>% 
                      group_by(subzone) %>%
                      summarise(young_pop = sum(resident_count))

Aged Group

aged_pop <- pop_sz %>% filter(age_group == "65_to_69" | 
                                     age_group == "70_to_74"| 
                                     age_group == "75_to_79" |
                                      age_group == "80_to_84" |
                                      age_group == "85_to_89" |
                                     age_group == "90_and_over") %>% 
                      group_by(subzone) %>%
                      summarise(aged_pop = sum(resident_count))

HDB 1-2Rm

hdb1_2_pop <- pop_sz %>% filter(type_of_dwelling == "HDB 1- and 2-Room Flats") %>% 
                      group_by(subzone) %>%
                      summarise(hdb1_2_pop = sum(resident_count))

HDB 3-4Rm

hdb3_4_pop <- pop_sz %>% filter(type_of_dwelling == "HDB 3-Room Flats" |type_of_dwelling == "HDB 4-Room Flats") %>% 
                      group_by(subzone) %>%
                      summarise(hdb3_4_pop = sum(resident_count))

HDB 5RM-EC

hdb5_ec_pop <- pop_sz %>% filter(type_of_dwelling == "HDB 5-Room and Executive Flats") %>% 
                      group_by(subzone) %>%
                      summarise(hdb5_ec_pop = sum(resident_count))

Condo

condo_pop <- pop_sz %>% filter(type_of_dwelling == "Condominiums and Other Apartments") %>% 
                      group_by(subzone) %>%
                      summarise(condo_pop = sum(resident_count))

Landed

landed_pop <- pop_sz %>% filter(type_of_dwelling == "Landed Properties") %>% 
                      group_by(subzone) %>%
                      summarise(landed_pop = sum(resident_count))

Population Density

pop_density <- subzone %>%select('SUBZONE_N', 'area')
pop_density <- left_join(pop_density, total_pop_sz, by = c("SUBZONE_N" = "subzone"))
## Warning: Column `SUBZONE_N`/`subzone` joining factor and character vector,
## coercing into character vector
pop_density <- mutate(pop_density, `pop_density` = `total_pop`/`area`)
sz_pop_main <- left_join(pop_density, econ_active_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, young_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, aged_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, hdb1_2_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, hdb3_4_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, hdb5_ec_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, condo_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, landed_pop, by = c("SUBZONE_N" = "subzone"))

Create Main table of all population data

Using measurements directly will cause biases as total population in each subzone is differnt. In general, the townships with relatively higher total number of population will also have higher number of consititutent population (eg young, aged and various population dewellers)

pop_derived <- sz_pop_main %>% 
  mutate(`econActivePR` = `econ_active_pop`/`total_pop`) %>% 
  mutate(`youngPR` = `young_pop`/`total_pop`) %>% 
  mutate(`agedPR` = `aged_pop`/`total_pop`) %>% 
  mutate(`hdb12PR` = `hdb1_2_pop`/`total_pop`) %>% 
  mutate(`hdb34PR` = `hdb3_4_pop`/`total_pop`) %>% 
  mutate(`hdb5ecPR` = `hdb5_ec_pop`/`total_pop`) %>% 
  mutate(`condoPR` = `condo_pop`/`total_pop`) %>% 
  mutate(`landedPR` = `landed_pop`/`total_pop`)

View Geospaital Data

tm_shape(subzone)+
  tm_polygons() +
tm_shape(business_sf)+
  tm_dots(col="orange")+
tm_shape(financial_sf)+
  tm_dots(col="yellow")+
tm_shape(govt_embassy_sf)+
  tm_dots(col="green")+
tm_shape(private_residential_sf)+
  tm_dots(col="blue")+
tm_shape(shoppings_sf)+
  tm_dots(col="red")
## Warning: The shape subzone is invalid. See sf::st_is_valid

## Wrangle Geospatial Data and join with main table (pop_derived) Business

summary(business_sf)
##      POI_ID             SEQ_NUM         FAC_TYPE   
##  Min.   :3.644e+07   Min.   :1.000   Min.   :5000  
##  1st Qu.:9.967e+08   1st Qu.:1.000   1st Qu.:5000  
##  Median :1.097e+09   Median :1.000   Median :5000  
##  Mean   :9.933e+08   Mean   :1.021   Mean   :5084  
##  3rd Qu.:1.108e+09   3rd Qu.:1.000   3rd Qu.:5000  
##  Max.   :1.204e+09   Max.   :3.000   Max.   :9991  
##                                                    
##                        POI_NAME                  ST_NAME    
##  CAMBRIDGE INDUSTRIAL TRUST:   8   TAGORE LN         :  83  
##  DHL                       :   6   JOO KOON CIR      :  80  
##  NATIONAL OILWELL VARCO    :   6   GUL CIR           :  62  
##  ST MICROELECTRONICS       :   6   KAKI BUKIT PL     :  53  
##  CWT                       :   5   KAKI BUKIT IND TER:  52  
##  HALLIBURTON               :   5   (Other)           :5941  
##  (Other)                   :6514   NA's              : 279  
##           geometry   
##  POINT        :6550  
##  epsg:4326    :   0  
##  +proj=long...:   0  
##                      
##                      
##                      
## 

Differentiate business and industrial points using FAC_TYPE col in business_sf. Business: FAC_TYPE: 5000 Industrial: FAC_TYPE: 9001

table(business_sf$FAC_TYPE)
## 
## 5000 9991 
## 6440  110
business_sel_sf <- business_sf %>% filter(FAC_TYPE == 5000)
industry_sel_sef <- business_sf %>% filter(FAC_TYPE == 9991)
business_sz <- st_join(business_sel_sf, subzone) %>% 
                select(SUBZONE_N) %>%
                arrange(SUBZONE_N) %>%
                add_count(SUBZONE_N, name="business") %>%
                distinct(SUBZONE_N, .keep_all = TRUE) %>%
                select(SUBZONE_N, business)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar

Industrial

industrial_sz <- st_join(industry_sel_sef, subzone) %>% 
                select(SUBZONE_N) %>%
                arrange(SUBZONE_N) %>%
                add_count(SUBZONE_N, name="industrial") %>%
                distinct(SUBZONE_N, .keep_all = TRUE) %>%
                select(SUBZONE_N, industrial)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar

Govt Including Embassy

summary(govt_embassy_sf)
##      POI_ID             SEQ_NUM         FAC_TYPE   
##  Min.   :3.644e+07   Min.   :1.000   Min.   :9525  
##  1st Qu.:1.010e+09   1st Qu.:1.000   1st Qu.:9525  
##  Median :1.058e+09   Median :1.000   Median :9525  
##  Mean   :1.006e+09   Mean   :1.111   Mean   :9651  
##  3rd Qu.:1.113e+09   3rd Qu.:1.000   3rd Qu.:9993  
##  Max.   :1.203e+09   Max.   :2.000   Max.   :9993  
##                                                    
##                             POI_NAME         ST_NAME             geometry  
##  ANG MO KIO TOWN COUNCIL        :  5   MAXWELL RD: 16   POINT        :443  
##  SEMBAWANG-NEE SOON TOWN COUNCIL:  3   THOMSON RD: 12   epsg:4326    :  0  
##  ALJUNIED HOUGANG TOWN COUNCIL  :  2   ORCHARD RD: 11   +proj=long...:  0  
##  ALJUNIED TOWN COUNCIL          :  2   COLLEGE RD: 10                      
##  BISHAN-TOA PAYOH TOWN COUNCIL  :  2   SCOTTS RD :  8                      
##  CENTRAL PROVIDENT FUND BOARD   :  2   (Other)   :358                      
##  (Other)                        :427   NA's      : 28
govtemb_sz <- st_join(govt_embassy_sf, subzone) %>% 
                select(SUBZONE_N) %>%
                arrange(SUBZONE_N) %>%
                add_count(SUBZONE_N, name="govtemb") %>%
                distinct(SUBZONE_N, .keep_all = TRUE) %>%
                select(SUBZONE_N, govtemb)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar

Shopping

summary(shoppings_sf)
##      POI_ID             SEQ_NUM         FAC_TYPE   
##  Min.   :3.644e+07   Min.   :1.000   Min.   :6512  
##  1st Qu.:9.656e+08   1st Qu.:1.000   1st Qu.:6512  
##  Median :1.070e+09   Median :1.000   Median :6512  
##  Mean   :8.934e+08   Mean   :1.108   Mean   :6512  
##  3rd Qu.:1.104e+09   3rd Qu.:1.000   3rd Qu.:6512  
##  Max.   :1.204e+09   Max.   :3.000   Max.   :6512  
##                                                    
##                              POI_NAME             ST_NAME   
##  BUKIT BATOK WEST SHOPPING CENTRE:  2   ORCHARD RD    : 37  
##  CHANGE ALLEY                    :  2   BUKIT TIMAH RD:  7  
##  FARMART CENTRE                  :  2   SCOTTS RD     :  7  
##  FORTUNE CENTRE                  :  2   BEACH RD      :  6  
##  HARBOUR FRONT CENTRE ENTRANCE   :  2   BENCOOLEN ST  :  5  
##  NEW WORLD CENTRE                :  2   (Other)       :347  
##  (Other)                         :499   NA's          :102  
##           geometry  
##  POINT        :511  
##  epsg:4326    :  0  
##  +proj=long...:  0  
##                     
##                     
##                     
## 
shoppings_sz <- st_join(shoppings_sf, subzone) %>% 
                select(SUBZONE_N) %>%
                arrange(SUBZONE_N) %>%
                add_count(SUBZONE_N, name="shooping") %>%
                distinct(SUBZONE_N, .keep_all = TRUE) %>%
                select(SUBZONE_N, shooping)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar

Financial

summary(financial_sf)
##     LINK_ID              POI_ID             SEQ_NUM         FAC_TYPE   
##  Min.   :1.161e+08   Min.   :3.644e+07   Min.   :1.000   Min.   :3578  
##  1st Qu.:8.594e+08   1st Qu.:1.097e+09   1st Qu.:1.000   1st Qu.:3578  
##  Median :9.140e+08   Median :1.113e+09   Median :1.000   Median :3578  
##  Mean   :9.092e+08   Mean   :1.088e+09   Mean   :1.008   Mean   :4397  
##  3rd Qu.:1.046e+09   3rd Qu.:1.132e+09   3rd Qu.:1.000   3rd Qu.:6000  
##  Max.   :1.224e+09   Max.   :1.204e+09   Max.   :2.000   Max.   :6000  
##                                                                        
##      POI_NAME   POI_LANGCD POI_NMTYPE   POI_ST_NUM   ST_NUM_FUL  ST_NFUL_LC 
##  OCBC    :788   ENG:3320   B:3293     1      : 212   29A :   1   ENG :   5  
##  UOB     :577              J:  27     10     :  76   333A:   1   NA's:3315  
##  POSB    :564                         2      :  53   77B :   1              
##  DBS     :282                         11     :  50   7A  :   1              
##  CITIBANK:153                         304    :  50   8A  :   1              
##  MAYBANK : 51                         (Other):2004   NA's:3315              
##  (Other) :905                         NA's   : 875                          
##               ST_NAME     ST_LANGCD   POI_ST_SD ACC_TYPE          PH_NUMBER   
##  ORCHARD RD       : 156   ENG :2926   L:1652    NA's:3320   63396666   :  52  
##  BEACH RD         :  44   NA's: 394   N:  24                63272265   :  16  
##  NORTH BRIDGE RD  :  39               R:1644                18002222121:  15  
##  COLLYER QUAY     :  38                                     18004383333:  13  
##  NEW UPP CHANGI RD:  35                                     18001111111:  11  
##  (Other)          :2614                                     (Other)    : 539  
##  NA's             : 394                                     NA's       :2674  
##     CHAIN_ID     NAT_IMPORT PRIVATE  IN_VICIN   NUM_PARENT    
##  Min.   :    0   N:3320     N:3320   N:3320   Min.   :0.0000  
##  1st Qu.: 2526                                1st Qu.:0.0000  
##  Median : 6918                                Median :0.0000  
##  Mean   : 5121                                Mean   :0.3807  
##  3rd Qu.: 6920                                3rd Qu.:1.0000  
##  Max.   :24982                                Max.   :2.0000  
##                                                               
##    NUM_CHILD           PERCFRREF       VANCITY_ID
##  Min.   :0.0000000   Min.   : 1.00   Min.   :0   
##  1st Qu.:0.0000000   1st Qu.:30.00   1st Qu.:0   
##  Median :0.0000000   Median :50.00   Median :0   
##  Mean   :0.0003012   Mean   :46.87   Mean   :0   
##  3rd Qu.:0.0000000   3rd Qu.:60.00   3rd Qu.:0   
##  Max.   :1.0000000   Max.   :99.00   Max.   :0   
##                      NA's   :1339                
##                                                                ACT_ADDR   
##  1 KIM SENG PROMENADE                              SINGAPORE 237994:   7  
##  3 TEMASEK BOULEVARD                               SINGAPORE 038983:   7  
##  530 LORONG 6 TOA PAYOH                            SINGAPORE 310530:   7  
##  2 JURONG EAST ST 21                               SINGAPORE 609601:   6  
##  3D RIVER VALLEY ROAD                              SINGAPORE 179023:   6  
##  (Other)                                                           : 243  
##  NA's                                                              :3044  
##  ACT_LANGCD               ACT_ST_NAM     ACT_ST_NUM       ACT_ADMIN   
##  ENG : 276   DUNEARN ROAD      :   7   1      :  20   INGAPORE :   1  
##  NA's:3044   KIM SENG PROMENADE:   7   3      :  11   SINGAPORE: 275  
##              LORONG 6 TOA PAYOH:   7   2      :  10   NA's     :3044  
##              PAYA LEBAR ROAD   :   7   50     :   8                   
##              TEMASEK BOULEVARD :   7   530    :   7                   
##              (Other)           : 241   (Other): 220                   
##              NA's              :3044   NA's   :3044                   
##    ACT_POSTAL            geometry   
##  038983 :   7   POINT        :3320  
##  237994 :   7   epsg:4326    :   0  
##  310530 :   7   +proj=long...:   0  
##  609601 :   7                       
##  179023 :   6                       
##  (Other): 242                       
##  NA's   :3044
financial_sz <- st_join(financial_sf, subzone) %>% 
                select(SUBZONE_N) %>%
                arrange(SUBZONE_N) %>%
                add_count(SUBZONE_N, name="financial") %>%
                distinct(SUBZONE_N, .keep_all = TRUE) %>%
                select(SUBZONE_N, financial)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar

Upmarket residential Area

summary(private_residential_sf)
##      POI_ID             SEQ_NUM         FAC_TYPE   
##  Min.   :3.644e+07   Min.   :1.000   Min.   :9590  
##  1st Qu.:9.968e+08   1st Qu.:1.000   1st Qu.:9590  
##  Median :1.070e+09   Median :1.000   Median :9590  
##  Mean   :1.052e+09   Mean   :1.007   Mean   :9590  
##  3rd Qu.:1.105e+09   3rd Qu.:1.000   3rd Qu.:9590  
##  Max.   :1.204e+09   Max.   :2.000   Max.   :9590  
##                                                    
##                     POI_NAME                 ST_NAME              geometry   
##  BLISSFUL VIEW          :   3   PASIR PANJANG RD :  45   POINT        :3604  
##  CLEMENTI PARK          :   3   UPP EAST COAST RD:  29   epsg:4326    :   0  
##  COMPASSVALE VIEW       :   3   LOR K TELOK KURAU:  26   +proj=long...:   0  
##  KING'S MANSION         :   3   BUKIT TIMAH RD   :  24                       
##  MIDPOINT PROPERTIES    :   3   EAST COAST RD    :  23                       
##  NEE SOON CENTRAL ESTATE:   3   (Other)          :3412                       
##  (Other)                :3586   NA's             :  45
private_residential_sz <- st_join(private_residential_sf, subzone) %>% 
                select(SUBZONE_N) %>%
                arrange(SUBZONE_N) %>%
                add_count(SUBZONE_N, name="privateres") %>%
                distinct(SUBZONE_N, .keep_all = TRUE) %>%
                select(SUBZONE_N, privateres)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar

Join geospatial data with main table

pop_derived <- left_join(pop_derived, business_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, financial_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, govtemb_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, industrial_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, private_residential_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, shoppings_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector

Mutate to get PR for urban functions

pop_derived <- pop_derived %>% 
  mutate(`businessPR` = `business`/`total_pop`) %>% 
  mutate(`financialPR` = `financial`/`total_pop`) %>% 
  mutate(`privateresPR` = `privateres`/`total_pop`) %>% 
  mutate(`govtembPR` = `govtemb`/`total_pop`) %>% 
  mutate(`industrialPR` = `industrial`/`total_pop`) %>% 
  mutate(`shoppingsPR` = `shooping`/`total_pop`)

Spatial EDA

showMap <- function(df, segment,chartTitle) {
  tm_shape(df)+ 
    tm_polygons(col=segment,legend.hist = TRUE, palette="seq", title=chartTitle)+
    tm_layout(legend.outside = TRUE, legend.hist.width = 10, aes.palette = list(seq = "-PiYG"))
}

Population segments at subzone level

population_map <- showMap(pop_derived, "total_pop", "Total Popuation")
econ_active_pop <- showMap(pop_derived, "econ_active_pop", "Economy Active Population")
young_map <- showMap(pop_derived, "young_pop", "Young Population")
aged_pop <- showMap(pop_derived, "aged_pop", "Aged Population")
hdb1_2_pop <- showMap(pop_derived, "hdb1_2_pop", "HDB 1- 2Rm Dwellers")
hdb3_4_pop <- showMap(pop_derived, "hdb3_4_pop", "HDB 3- 4Rm Dwellers")
hdb5_ec_pop <- showMap(pop_derived, "hdb5_ec_pop", "HDB 5Rm EC Dwellers")
condo_pop <- showMap(pop_derived, "condo_pop", "Condominium and Apt Dwellers")
landed_pop <- showMap(pop_derived, "landed_pop", "Landed Property Dwellers")

Plot 1/2 - Population Segment

tmap_arrange(population_map, econ_active_pop, young_map, aged_pop, hdb1_2_pop, hdb3_4_pop, ncol = 2)
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.50, 0.46, 0.46. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid

## Warning: The shape df is invalid. See sf::st_is_valid

## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.55, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.60, 0.60, 0.60, 0.60. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.51, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.

Plot 2/2 - Population Segment

tmap_arrange(hdb5_ec_pop, condo_pop, landed_pop, ncol = 2)
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.51, 0.51, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.55, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.55, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.

Urban Functions at subzone level

Population segments at subzone level

business_map <- showMap(pop_derived, "business", "No Of Business Functions")
financial_map <- showMap(pop_derived, "financial", "No Of Financial Functions")
govtemb_map <- showMap(pop_derived, "govtemb", "No Of Govt/Emb Functions")
industrial_map <- showMap(pop_derived, "industrial", "No Of industrial Functions")
privateres_map <- showMap(pop_derived, "privateres", "No Of Upmarket Residential Area")
shopping_map <- showMap(pop_derived, "shooping", "No Of Shopping Functions")

Plot 2/2 - Population Urban Functions

tmap_arrange(business_map, financial_map, govtemb_map, industrial_map, privateres_map, shopping_map, ncol = 2)
## Warning: The shape df is invalid. See sf::st_is_valid

## Warning: The shape df is invalid. See sf::st_is_valid

## Warning: The shape df is invalid. See sf::st_is_valid

## Warning: The shape df is invalid. See sf::st_is_valid

## Warning: The shape df is invalid. See sf::st_is_valid

## Warning: The shape df is invalid. See sf::st_is_valid

EDA Via Graphs

pop_seg <- pop_derived %>%
  st_set_geometry(NULL) 

## Urban functions
business_hist <- ggplot(data=pop_seg, aes(x=business)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "Business")
financial_hist <- ggplot(data=pop_seg, aes(x=financial)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "Financial")
govtemb_hist <- ggplot(data=pop_seg, aes(x=govtemb)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "Govt Functions")
industrial_hist <- ggplot(data=pop_seg, aes(x=industrial)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "Industrial")
privateres_hist <- ggplot(data=pop_seg, aes(x=privateres)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "Upmarket Residential Areas")
shopping_hist <- ggplot(data=pop_seg, aes(x=shooping)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "Shopping")

## Urban functions PR
businessPR_hist <- ggplot(data=pop_seg, aes(x=businessPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Business")
financialPR_hist <- ggplot(data=pop_seg, aes(x=financialPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Financial")
govtembPR_hist <- ggplot(data=pop_seg, aes(x=govtembPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Govt Functions")
industrialPR_hist <- ggplot(data=pop_seg, aes(x=industrialPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Industrial")
privateresPR_hist <- ggplot(data=pop_seg, aes(x=privateresPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Upmarket Residential Areas")
shoppingPR_hist <- ggplot(data=pop_seg, aes(x=shoppingsPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Shopping")

## Population PR
econActivePR_hist <- ggplot(data=pop_seg, aes(x=econActivePR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Active Population")
youngPR_hist <- ggplot(data=pop_seg, aes(x=youngPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Young Population")
agedPR_hist <- ggplot(data=pop_seg, aes(x=agedPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Aged Population")
hdb12PR_hist <- ggplot(data=pop_seg, aes(x=hdb12PR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR HDB 1-2 Rm Dwellers")
hdb34PR_hist <- ggplot(data=pop_seg, aes(x=hdb34PR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR HDB 3-4 Rm Dwellers")
hdb5ecPR_hist <- ggplot(data=pop_seg, aes(x=hdb5ecPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR HDB 5Rm/EC Dwellers")
condoPR_hist <- ggplot(data=pop_seg, aes(x=condoPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Condo")
landedPR_hist <- ggplot(data=pop_seg, aes(x=landedPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
          scale_x_continuous(name = "PR Landed")

Population Segments

ggarrange(econActivePR_hist,youngPR_hist,agedPR_hist,hdb12PR_hist,hdb34PR_hist,hdb5ecPR_hist, condoPR_hist,landedPR_hist,
          nrow = 2,ncol=4)
## Warning: Removed 89 rows containing non-finite values (stat_bin).

## Warning: Removed 89 rows containing non-finite values (stat_bin).

## Warning: Removed 89 rows containing non-finite values (stat_bin).

## Warning: Removed 89 rows containing non-finite values (stat_bin).

## Warning: Removed 89 rows containing non-finite values (stat_bin).

## Warning: Removed 89 rows containing non-finite values (stat_bin).

## Warning: Removed 89 rows containing non-finite values (stat_bin).

## Warning: Removed 89 rows containing non-finite values (stat_bin).

Urban functions

ggarrange(business_hist,financial_hist,govtemb_hist,industrial_hist,privateres_hist,shopping_hist,
          nrow = 2,ncol=3)
## Warning: Removed 107 rows containing non-finite values (stat_bin).
## Warning: Removed 73 rows containing non-finite values (stat_bin).
## Warning: Removed 190 rows containing non-finite values (stat_bin).
## Warning: Removed 274 rows containing non-finite values (stat_bin).
## Warning: Removed 84 rows containing non-finite values (stat_bin).
## Warning: Removed 176 rows containing non-finite values (stat_bin).

## Urban functions PR

ggarrange(businessPR_hist,financialPR_hist,govtembPR_hist,industrialPR_hist,privateresPR_hist,shoppingPR_hist,nrow = 2,ncol=3)
## Warning: Removed 168 rows containing non-finite values (stat_bin).
## Warning: Removed 112 rows containing non-finite values (stat_bin).
## Warning: Removed 213 rows containing non-finite values (stat_bin).
## Warning: Removed 297 rows containing non-finite values (stat_bin).
## Warning: Removed 98 rows containing non-finite values (stat_bin).
## Warning: Removed 189 rows containing non-finite values (stat_bin).

Correlation Analysis

# replace NaN to 0
pop_derived_corr <- pop_derived
pop_derived_corr <- st_set_geometry(pop_derived_corr, NULL)
pop_derived_corr <- pop_derived_corr %>% mutate_if(is.numeric, ~replace_na(., 0))
row.names(pop_derived_corr) <- pop_derived_corr$"SUBZONE_N"
pop_derived_corr <- pop_derived_corr %>% select(13:20,21:26)
head(pop_derived_corr,10)
##                 econActivePR   youngPR    agedPR   hdb12PR   hdb34PR   hdb5ecPR
## MARINA SOUTH       0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000
## PEARL'S HILL       0.4223228 0.1659125 0.3182504 0.6817496 0.2111614 0.00000000
## BOAT QUAY          0.5714286 0.0000000 0.2857143 0.0000000 0.0000000 0.00000000
## HENDERSON HILL     0.4790732 0.1958146 0.2436472 0.2937220 0.5971599 0.09491779
## REDHILL            0.5135135 0.2646785 0.1519105 0.1835974 0.3280522 0.30195713
## ALEXANDRA HILL     0.4644412 0.2111756 0.2402032 0.2837446 0.4586357 0.22641509
## BUKIT HO SWEE      0.4844384 0.1928281 0.2428958 0.2503383 0.5825440 0.12855210
## CLARKE QUAY        0.6666667 0.0000000 0.1666667 0.0000000 0.0000000 0.00000000
## PASIR PANJANG 1    0.5530474 0.2528217 0.1264108 0.0000000 0.0000000 0.00000000
## QUEENSWAY          0.6206897 0.1034483 0.1724138 0.0000000 0.0000000 0.00000000
##                    condoPR  landedPR business financial govtemb industrial
## MARINA SOUTH    0.00000000 0.0000000        0         3       0          0
## PEARL'S HILL    0.06334842 0.0000000        6        25       1          0
## BOAT QUAY       0.71428571 0.0000000       40         2       2          0
## HENDERSON HILL  0.01420030 0.0000000        0         4       0          0
## REDHILL         0.17986952 0.0000000        2        12       0          0
## ALEXANDRA HILL  0.00000000 0.0000000       39        15       7          1
## BUKIT HO SWEE   0.03653586 0.0000000        6         6       4          0
## CLARKE QUAY     0.16666667 0.0000000       12        19       4          0
## PASIR PANJANG 1 0.67042889 0.3227991       16         4       0          0
## QUEENSWAY       1.00000000 0.0000000        0         2       0          0
##                 privateres shooping
## MARINA SOUTH             0        0
## PEARL'S HILL             6        3
## BOAT QUAY                1        1
## HENDERSON HILL           5        0
## REDHILL                  6        1
## ALEXANDRA HILL           4        1
## BUKIT HO SWEE           11        1
## CLARKE QUAY              6       10
## PASIR PANJANG 1         56        0
## QUEENSWAY                1        1
# pop_derived_corr[is.na(pop_derived_corr)] <- 0
# pop_derived_corr <- st_set_geometry(pop_derived_corr, NULL)
# pop_derived_corr <- pop_derived_corr %>%  select_if(is.numeric) %>% select(matches('PR|SUBZONE_N', ignore.case = FALSE))

Plot Corr

pop_derived_corr.cor = cor(pop_derived_corr)
corrplot.mixed(pop_derived_corr.cor,
         lower = "ellipse",
                tl.col = "black",
               upper = "number",
               tl.pos = "lt",
               diag = "l",
                number.cex = 0.5)

Remove young PR

pop_derived_corr <- pop_derived_corr %>% select(!econActivePR)
pop_derived_corr.cor = cor(pop_derived_corr)
corrplot.mixed(pop_derived_corr.cor,
         lower = "ellipse",
                tl.col = "black",
               upper = "number",
               tl.pos = "lt",
               diag = "l",
                number.cex = 0.5)

Hierarchy Cluster Analysis

Extrating clustering variables

The code chunk below will be used to extract the clustering variables from the simple feature object into data.frame.

cluster_vars <- pop_derived %>%
   mutate_if(is.numeric, ~replace_na(., 0)) %>% 
  st_set_geometry(NULL) 
# Set row names as Subzone name
row.names(cluster_vars) <- cluster_vars$"SUBZONE_N"
# delete subzone col
cluster_vars <- select(cluster_vars, !1)
head(cluster_vars,10)
##                             area total_pop         pop_density econ_active_pop
## MARINA SOUTH    1.6303787 [km^2]         0     0.0000 [1/km^2]               0
## PEARL'S HILL    0.5598162 [km^2]      6630 11843.1720 [1/km^2]            2800
## BOAT QUAY       0.1608075 [km^2]        70   435.3031 [1/km^2]              40
## HENDERSON HILL  0.5954289 [km^2]     13380 22471.1982 [1/km^2]            6410
## REDHILL         0.3874294 [km^2]     10730 27695.3679 [1/km^2]            5510
## ALEXANDRA HILL  1.0303786 [km^2]     13780 13373.7249 [1/km^2]            6400
## BUKIT HO SWEE   0.5517320 [km^2]     14780 26788.3660 [1/km^2]            7160
## CLARKE QUAY     0.2901846 [km^2]        60   206.7649 [1/km^2]              40
## PASIR PANJANG 1 1.0847914 [km^2]      4430  4083.7346 [1/km^2]            2450
## QUEENSWAY       0.6316441 [km^2]       290   459.1193 [1/km^2]             180
##                 young_pop aged_pop hdb1_2_pop hdb3_4_pop hdb5_ec_pop condo_pop
## MARINA SOUTH            0        0          0          0           0         0
## PEARL'S HILL         1100     2110       4520       1400           0       420
## BOAT QUAY               0       20          0          0           0        50
## HENDERSON HILL       2620     3260       3930       7990        1270       190
## REDHILL              2840     1630       1970       3520        3240      1930
## ALEXANDRA HILL       2910     3310       3910       6320        3120         0
## BUKIT HO SWEE        2850     3590       3700       8610        1900       540
## CLARKE QUAY             0       10          0          0           0        10
## PASIR PANJANG 1      1120      560          0          0           0      2970
## QUEENSWAY              30       50          0          0           0       290
##                 landed_pop econActivePR   youngPR    agedPR   hdb12PR   hdb34PR
## MARINA SOUTH             0    0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## PEARL'S HILL             0    0.4223228 0.1659125 0.3182504 0.6817496 0.2111614
## BOAT QUAY                0    0.5714286 0.0000000 0.2857143 0.0000000 0.0000000
## HENDERSON HILL           0    0.4790732 0.1958146 0.2436472 0.2937220 0.5971599
## REDHILL                  0    0.5135135 0.2646785 0.1519105 0.1835974 0.3280522
## ALEXANDRA HILL           0    0.4644412 0.2111756 0.2402032 0.2837446 0.4586357
## BUKIT HO SWEE            0    0.4844384 0.1928281 0.2428958 0.2503383 0.5825440
## CLARKE QUAY              0    0.6666667 0.0000000 0.1666667 0.0000000 0.0000000
## PASIR PANJANG 1       1430    0.5530474 0.2528217 0.1264108 0.0000000 0.0000000
## QUEENSWAY                0    0.6206897 0.1034483 0.1724138 0.0000000 0.0000000
##                   hdb5ecPR    condoPR  landedPR business financial govtemb
## MARINA SOUTH    0.00000000 0.00000000 0.0000000        0         3       0
## PEARL'S HILL    0.00000000 0.06334842 0.0000000        6        25       1
## BOAT QUAY       0.00000000 0.71428571 0.0000000       40         2       2
## HENDERSON HILL  0.09491779 0.01420030 0.0000000        0         4       0
## REDHILL         0.30195713 0.17986952 0.0000000        2        12       0
## ALEXANDRA HILL  0.22641509 0.00000000 0.0000000       39        15       7
## BUKIT HO SWEE   0.12855210 0.03653586 0.0000000        6         6       4
## CLARKE QUAY     0.00000000 0.16666667 0.0000000       12        19       4
## PASIR PANJANG 1 0.00000000 0.67042889 0.3227991       16         4       0
## QUEENSWAY       0.00000000 1.00000000 0.0000000        0         2       0
##                 industrial privateres shooping   businessPR  financialPR
## MARINA SOUTH             0          0        0 0.0000000000          Inf
## PEARL'S HILL             0          6        3 0.0009049774 0.0037707391
## BOAT QUAY                0          1        1 0.5714285714 0.0285714286
## HENDERSON HILL           0          5        0 0.0000000000 0.0002989537
## REDHILL                  0          6        1 0.0001863933 0.0011183597
## ALEXANDRA HILL           1          4        1 0.0028301887 0.0010885341
## BUKIT HO SWEE            0         11        1 0.0004059540 0.0004059540
## CLARKE QUAY              0          6       10 0.2000000000 0.3166666667
## PASIR PANJANG 1          0         56        0 0.0036117381 0.0009029345
## QUEENSWAY                0          1        1 0.0000000000 0.0068965517
##                 privateresPR    govtembPR industrialPR  shoppingsPR
## MARINA SOUTH    0.0000000000 0.0000000000 0.000000e+00 0.000000e+00
## PEARL'S HILL    0.0009049774 0.0001508296 0.000000e+00 4.524887e-04
## BOAT QUAY       0.0142857143 0.0285714286 0.000000e+00 1.428571e-02
## HENDERSON HILL  0.0003736921 0.0000000000 0.000000e+00 0.000000e+00
## REDHILL         0.0005591799 0.0000000000 0.000000e+00 9.319664e-05
## ALEXANDRA HILL  0.0002902758 0.0005079826 7.256894e-05 7.256894e-05
## BUKIT HO SWEE   0.0007442490 0.0002706360 0.000000e+00 6.765900e-05
## CLARKE QUAY     0.1000000000 0.0666666667 0.000000e+00 1.666667e-01
## PASIR PANJANG 1 0.0126410835 0.0000000000 0.000000e+00 0.000000e+00
## QUEENSWAY       0.0034482759 0.0000000000 0.000000e+00 3.448276e-03

Data Standardisation

In general, multiple variables will be used in cluster analysis. It is not unusual their values range are different. In order to avoid the cluster analysis result is baised to clustering variables with large values, it is useful to standardise the input variables before performing cluster analysis.

Min-Max standardisation

In the code chunk below, normalize() of heatmaply package is used to stadardisation the clustering variables by using Min-Max method. The summary() is then used to display the summary statistics of the standardised clustering variables.

pop_derived.std <- normalize(cluster_vars)
pop_derived.std[is.na(pop_derived.std)] <- 0
pop_derived_df <- as.data.frame(pop_derived.std)

EDA on Clustering Variables

names(pop_derived_df)
##  [1] "area"            "total_pop"       "pop_density"     "econ_active_pop"
##  [5] "young_pop"       "aged_pop"        "hdb1_2_pop"      "hdb3_4_pop"     
##  [9] "hdb5_ec_pop"     "condo_pop"       "landed_pop"      "econActivePR"   
## [13] "youngPR"         "agedPR"          "hdb12PR"         "hdb34PR"        
## [17] "hdb5ecPR"        "condoPR"         "landedPR"        "business"       
## [21] "financial"       "govtemb"         "industrial"      "privateres"     
## [25] "shooping"        "businessPR"      "financialPR"     "privateresPR"   
## [29] "govtembPR"       "industrialPR"    "shoppingsPR"
business_hist <- ggplot(data=pop_derived_df, aes(x=business)) +
  geom_histogram(bins=20, color="black", fill="light blue")
industrial_hist <- ggplot(data=pop_derived_df, aes(x=industrial)) +
  geom_histogram(bins=20, color="black", fill="light blue")
financial_hist <- ggplot(data=pop_derived_df, aes(x=financial)) +
  geom_histogram(bins=20, color="black", fill="light blue")
govtemb_hist <- ggplot(data=pop_derived_df, aes(x=govtemb)) +
  geom_histogram(bins=20, color="black", fill="light blue")
privateres_hist <- ggplot(data=pop_derived_df, aes(x=privateres)) +
  geom_histogram(bins=20, color="black", fill="light blue")
shopping_hist <- ggplot(data=pop_derived_df, aes(x=shooping)) +
  geom_histogram(bins=20, color="black", fill="light blue")

# PR
youngPR_hist <- ggplot(data=pop_derived_df, aes(x=youngPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")
agedPR_hist <- ggplot(data=pop_derived_df, aes(x=agedPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")
hdb12PR_hist <- ggplot(data=pop_derived_df, aes(x=hdb12PR)) +
  geom_histogram(bins=20, color="black", fill="light blue")
hdb34PR_hist <- ggplot(data=pop_derived_df, aes(x=hdb34PR)) +
  geom_histogram(bins=20, color="black", fill="light blue")
hdb5ecPR_hist <- ggplot(data=pop_derived_df, aes(x=hdb5ecPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")
condoPR_hist <- ggplot(data=pop_derived_df, aes(x=condoPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")
landedPR_hist <- ggplot(data=pop_derived_df, aes(x=landedPR)) +
  geom_histogram(bins=20, color="black", fill="light blue")
ggarrange(business_hist,industrial_hist,financial_hist,govtemb_hist,privateres_hist,shopping_hist,
          ncol = 3,nrow = 3)

ggarrange(youngPR_hist,agedPR_hist,hdb12PR_hist,hdb34PR_hist,hdb5ecPR_hist,condoPR_hist,landedPR_hist, ncol = 3,nrow = 3)

K-means

wssplot <- function(data, nc=15, seed=999){
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}

wssplot(pop_derived_df, nc=10)

There is a distinct drop above 4 clusters. After three clusters, this decrease drops off, suggesting that a 3-cluster solution may be a good fit to the data.

k <- 4

Computing proximity matrix

In R, many packages provide functions to calculate distance matrix. We will compute the proximity matrix by using dist() of R.

dist() supports six distance proximity calculations, they are: euclidean, maximum, manhattan, canberra, binary and minkowski. The default is euclidean proximity matrix.

The code chunk below is used to compute the proximity matrix using euclidean method.

proxmat <- dist(pop_derived_df, method = 'euclidean')
hclust_ward <- hclust(proxmat, method = 'ward.D')
plot(hclust_ward, cex = 0.6)

Selecting the optimal clustering algorithm

m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

ac <- function(x) {
  agnes(pop_derived_df, method = x)$ac
}

map_dbl(m, ac)
##   average    single  complete      ward 
## 0.8735622 0.7759751 0.9110540 0.9771934

Interpreting the dendrograms

In the dendrogram displayed above, each leaf corresponds to one observation. As we move up the tree, observations that are similar to each other are combined into branches, which are themselves fused at a higher height.

The height of the fusion, provided on the vertical axis, indicates the (dis)similarity between two observations. The higher the height of the fusion, the less similar the observations are. Note that, conclusions about the proximity of two observations can be drawn only based on the height where branches containing those two observations first are fused. We cannot use the proximity of two observations along the horizontal axis as a criteria of their similarity.

It’s also possible to draw the dendrogram with a border around the selected clusters by using rect.hclust() of R stats. The argument border is used to specify the border colors for the rectangles.

plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k = k, border = 2:5)

Visually-driven hierarchical clustering analysis

In this section, we will perform visually-driven hiearchical clustering analysis by using heatmaply package.

Transforming the data frame into a matrix

pop_derived_df_mat <- data.matrix(pop_derived_df)
heatmaply(pop_derived_df_mat,
          Colv=NA,
          dist_method = "euclidean",
          hclust_method = "ward.D",
          seriate = "OLO",
          colors = Blues,
          k_row = k,
          margins = c(NA,200,60,NA),
          fontsize_row = 4,
          fontsize_col = 5,
          main="Geographic Segmentation of Subzones by Urban Functions and Population",
          xlab = "Urban Functions and Population",
          ylab = "Subzones"
          )

Mapping the clusters formed

With closed examination of the dendragram above, we have decided to retain four clusters.

cutree() of R Base will be used in the code chunk below to derive a 4-cluster model.

pop_derived_c <- pop_derived
pop_derived_c[is.na(pop_derived_c)] <- 0
groups <- as.factor(cutree(hclust_ward, k=k))
pop_derived_c_cluster <- cbind(pop_derived_c, as.matrix(groups)) %>% 
    rename(`CLUSTER`=`as.matrix.groups.`)

Plot

hierarchical_map <- qtm(pop_derived_c_cluster, "CLUSTER")
## Warning: The shape pop_derived_c_cluster is invalid. See sf::st_is_valid
hierarchical_map

Spatially Constrained Clustering - SKATER approach

pop_derived_sp <- as_Spatial(pop_derived_c)
class(pop_derived_sp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"

Computing Neighbour List

pop_derived.nb <- poly2nb(pop_derived_sp)
summary(pop_derived.nb)
## Neighbour list object:
## Number of regions: 323 
## Number of nonzero links: 1938 
## Percentage nonzero weights: 1.857585 
## Average number of links: 6 
## 5 regions with no links:
## 17 18 19 295 302
## Link number distribution:
## 
##  0  1  2  3  4  5  6  7  8  9 10 11 12 14 17 
##  5  2  6 10 26 77 85 52 34 17  3  3  1  1  1 
## 2 least connected regions:
## 16 234 with 1 link
## 1 most connected region:
## 313 with 17 links

We note that there are 5 regions with no links. Remove them

plot(pop_derived_sp, border=grey(.5))
plot(pop_derived.nb, coordinates(pop_derived_sp), col="blue", add=TRUE)

Visualised 5 regions with no links.

`%notin%` <- Negate(`%in%`)
# pop_derived_sp_filtered <- pop_derived_sp %>% filter(data$SUBZONE_N %notin% c("NORTH-EASTERN ISLANDS", "PULAU SELETAR", "SEMAKAU", "SOUTHERN GROUP", "SUDONG"))
# pop_derived_filterd_sp <- as_Spatial(pop_derived_c_filtered)

pop_derived_filtered_sf <- pop_derived_c %>% filter(SUBZONE_N %notin% c("NORTH-EASTERN ISLANDS", "PULAU SELETAR", "SEMAKAU", "SOUTHERN GROUP", "SUDONG"))
pop_derived_filtered_sp <- as_Spatial(pop_derived_filtered_sf)
# plot(pop_derived_filtered_sp)


nrow(pop_derived_sp)
## [1] 323
nrow(pop_derived_filtered_sp)
## [1] 318
class(pop_derived_sp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
class(pop_derived_filtered_sp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"

Recalc Neighbour list

pop_derived_filtered.nb <- poly2nb(pop_derived_filtered_sp)
summary(pop_derived_filtered.nb)
## Neighbour list object:
## Number of regions: 318 
## Number of nonzero links: 1938 
## Percentage nonzero weights: 1.916459 
## Average number of links: 6.09434 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 14 17 
##  2  6 10 26 77 85 52 34 17  3  3  1  1  1 
## 2 least connected regions:
## 16 231 with 1 link
## 1 most connected region:
## 308 with 17 links
plot(pop_derived_filtered_sp, border=grey(.5))
plot(pop_derived_filtered.nb, coordinates(pop_derived_filtered_sp), col="blue", add=TRUE)

lcosts <- nbcosts(pop_derived_filtered.nb, pop_derived_df)
pop_derived_filtered.w <- nb2listw(pop_derived_filtered.nb, lcosts, style="B")
summary(pop_derived_filtered.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 318 
## Number of nonzero links: 1938 
## Percentage nonzero weights: 1.916459 
## Average number of links: 6.09434 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 14 17 
##  2  6 10 26 77 85 52 34 17  3  3  1  1  1 
## 2 least connected regions:
## 16 231 with 1 link
## 1 most connected region:
## 308 with 17 links
## 
## Weights style: B 
## Weights constants summary:
##     n     nn       S0       S1       S2
## B 318 101124 2246.255 6013.154 71179.68
pop_derived_filtered.mst <- mstree(pop_derived_filtered.w)
class(pop_derived_filtered.mst)
## [1] "mst"    "matrix"
dim(pop_derived_filtered.mst)
## [1] 317   3
head(pop_derived_filtered.mst)
##      [,1] [,2]       [,3]
## [1,]   56   96 1.06070658
## [2,]   96  123 0.81471313
## [3,]  123  128 0.06917768
## [4,]  123   79 0.15059524
## [5,]  128  162 0.18875858
## [6,]  162  124 0.14914557
plot(pop_derived_filtered_sp, border=gray(.5))
plot.mst(pop_derived_filtered.mst, coordinates(pop_derived_filtered_sp), 
      col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)

Computing spatially constrained clusters using SKATER method

clust4 <- skater(pop_derived_filtered.mst[,1:2], pop_derived_df, 4)
str(clust4)
## List of 8
##  $ groups      : num [1:318] 4 1 1 1 1 1 1 1 1 4 ...
##  $ edges.groups:List of 5
##   ..$ :List of 3
##   .. ..$ node: num [1:229] 33 69 92 161 147 271 308 192 8 25 ...
##   .. ..$ edge: num [1:228, 1:3] 147 271 144 249 206 25 131 159 29 171 ...
##   .. ..$ ssw : num 194
##   ..$ :List of 3
##   .. ..$ node: num [1:31] 304 298 313 221 292 303 265 203 280 301 ...
##   .. ..$ edge: num [1:30, 1:3] 301 279 292 287 289 302 280 317 315 318 ...
##   .. ..$ ssw : num 16.3
##   ..$ :List of 3
##   .. ..$ node: num [1:36] 283 273 236 276 240 250 284 239 248 267 ...
##   .. ..$ edge: num [1:35, 1:3] 236 222 250 248 276 239 240 284 239 300 ...
##   .. ..$ ssw : num 25.3
##   ..$ :List of 3
##   .. ..$ node: num [1:16] 19 18 17 21 20 1 22 11 30 26 ...
##   .. ..$ edge: num [1:15, 1:3] 11 30 1 21 11 22 17 26 22 18 ...
##   .. ..$ ssw : num 9.35
##   ..$ :List of 3
##   .. ..$ node: num [1:6] 311 309 277 274 314 258
##   .. ..$ edge: num [1:5, 1:3] 311 311 274 309 277 258 309 314 277 274 ...
##   .. ..$ ssw : num 3.79
##  $ not.prune   : NULL
##  $ candidates  : int [1:5] 1 2 3 4 5
##  $ ssto        : num 277
##  $ ssw         : num [1:5] 277 267 259 254 249
##  $ crit        : num [1:2] 1 Inf
##  $ vec.crit    : num [1:318] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "class")= chr "skater"
ccs4 <- clust4$groups
ccs4
##   [1] 4 1 1 1 1 1 1 1 1 4 4 1 4 1 1 1 4 4 4 4 4 4 1 1 1 4 1 1 1 4 1 4 1 1 1 1 1
##  [38] 4 1 1 4 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 2 1 3 1 1 1 1 1 1 1 1 3 3 1 1 1 1 1 2 3
## [223] 3 3 1 1 1 1 1 1 1 1 1 2 1 3 3 3 3 3 1 1 1 1 3 3 3 3 1 3 1 2 1 1 1 2 1 5 3
## [260] 3 2 3 3 3 2 1 3 1 1 1 1 3 3 5 3 3 5 1 2 2 3 1 3 3 2 1 2 2 2 3 2 2 1 2 2 2
## [297] 2 2 2 3 2 2 2 2 3 1 3 1 5 1 5 2 2 5 2 2 2 2
table(ccs4)
## ccs4
##   1   2   3   4   5 
## 229  31  36  16   6
plot(pop_derived_filtered_sp, border=gray(.5))
plot(clust4, coordinates(pop_derived_filtered_sp), cex.lab=.7,
     groups.colors=c("red","green","blue", "brown", "pink"), cex.circles=0.005, add=TRUE)
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

Visualising clusters

groups_mat <- as.matrix(clust4$groups)
pop_derived_filtered_sp_Scluster <- cbind(pop_derived_filtered_sf, as.factor(groups_mat)) %>% rename(`SP_CLUSTER`=`as.factor.groups_mat.`)
spatial_map <- qtm(pop_derived_filtered_sp_Scluster, "SP_CLUSTER")
## Warning: The shape pop_derived_filtered_sp_Scluster is invalid. See
## sf::st_is_valid
spatial_map