Import packages

packages = c('rgdal', 'spdep', 'ClustGeo',  'tmap', 'sf', 'ggpubr', 'cluster', 'heatmaply', 'corrplot', 'psych', 'tidyverse', 'ggplot2')
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
    }
  library(p,character.only = T)
}

Import data

1. Planning Area Subzone

pasz <- read_csv ("data/aspatial/planning-area-subzone-age-group-sex-and-type-of-dwelling-june-2011-2019.csv")
## Parsed with column specification:
## cols(
##   planning_area = col_character(),
##   subzone = col_character(),
##   age_group = col_character(),
##   sex = col_character(),
##   type_of_dwelling = col_character(),
##   resident_count = col_double(),
##   year = col_double()
## )
any(is.na(pasz))
## [1] FALSE

2. Planning Subzone boundary

mpsz <- st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL") %>%
  select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `geometry`)
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## projected CRS:  SVY21
mpsz_sf <- st_as_sf(mpsz, 3414)
  
mpsz_sf3414 <- st_transform(mpsz_sf, 3414)
 
st_crs(mpsz_sf3414)
## Coordinate Reference System:
##   User input: EPSG:3414 
##   wkt:
## PROJCRS["SVY21 / Singapore TM",
##     BASEGEOGCRS["SVY21",
##         DATUM["SVY21",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4757]],
##     CONVERSION["Singapore Transverse Mercator",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["northing (N)",north,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["easting (E)",east,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["Singapore"],
##         BBOX[1.13,103.59,1.47,104.07]],
##     ID["EPSG",3414]]

3. Business

business <- st_read(dsn = "data/geospatial", layer = "Business") %>%
  filter(FAC_TYPE == 5000) %>%
  select(`POI_ID`, `geometry`) %>%
  rename(business_id = POI_ID)
## Reading layer `Business' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 6550 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6147 ymin: 1.24605 xmax: 104.0044 ymax: 1.4698
## geographic CRS: WGS 84
business_sf <- st_as_sf(business, 4326)
business_sf3414 <- st_transform(business_sf, 3414)
st_crs(business_sf3414)
## Coordinate Reference System:
##   User input: EPSG:3414 
##   wkt:
## PROJCRS["SVY21 / Singapore TM",
##     BASEGEOGCRS["SVY21",
##         DATUM["SVY21",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4757]],
##     CONVERSION["Singapore Transverse Mercator",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["northing (N)",north,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["easting (E)",east,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["Singapore"],
##         BBOX[1.13,103.59,1.47,104.07]],
##     ID["EPSG",3414]]
mpsz_b <- st_join(mpsz_sf3414, business_sf3414) %>%
  group_by(SUBZONE_N, REGION_N, SHAPE_Area) %>%
  summarise(business_count = sum(!is.na(business_id))) %>%
  select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `geometry`)

4. Industry

industry <- st_read(dsn = "data/geospatial", layer = "Business") %>%
  filter(FAC_TYPE == 9991) %>%
  select(`POI_ID`, `geometry`) %>%
  rename(industry_id = POI_ID)
## Reading layer `Business' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 6550 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6147 ymin: 1.24605 xmax: 104.0044 ymax: 1.4698
## geographic CRS: WGS 84
industry_sf <- st_as_sf(industry, 4326)
industry_sf3414 <- st_transform(industry_sf, 3414)
st_crs(industry_sf3414)
## Coordinate Reference System:
##   User input: EPSG:3414 
##   wkt:
## PROJCRS["SVY21 / Singapore TM",
##     BASEGEOGCRS["SVY21",
##         DATUM["SVY21",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4757]],
##     CONVERSION["Singapore Transverse Mercator",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["northing (N)",north,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["easting (E)",east,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["Singapore"],
##         BBOX[1.13,103.59,1.47,104.07]],
##     ID["EPSG",3414]]
mpsz_bi <- st_join(mpsz_b, industry_sf3414) %>%
  group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count) %>%
  summarise(industry_count = sum(!is.na(industry_id))) %>%
  select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `geometry`)

5. Financial

financial <- st_read(dsn = "data/geospatial", layer = "Financial") %>%
  select(`POI_ID`, `geometry`) %>%
  rename(financial_id = POI_ID)
## Reading layer `Financial' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3320 features and 29 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6256 ymin: 1.24392 xmax: 103.9998 ymax: 1.46247
## geographic CRS: WGS 84
financial_sf <- st_as_sf(financial, 4326)
financial_sf3414 <- st_transform(financial_sf, 3414)
st_crs(financial_sf3414)
## Coordinate Reference System:
##   User input: EPSG:3414 
##   wkt:
## PROJCRS["SVY21 / Singapore TM",
##     BASEGEOGCRS["SVY21",
##         DATUM["SVY21",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4757]],
##     CONVERSION["Singapore Transverse Mercator",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["northing (N)",north,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["easting (E)",east,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["Singapore"],
##         BBOX[1.13,103.59,1.47,104.07]],
##     ID["EPSG",3414]]
mpsz_bif <- st_join(mpsz_bi, financial_sf3414) %>%
  group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count, industry_count) %>%
  summarise(financial_count = sum(!is.na(financial_id))) %>%
  select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `financial_count`, `geometry`)

6. Government including embassy

govt <- st_read(dsn = "data/geospatial", layer = "Govt_Embassy") %>%
  select(`POI_ID`, `geometry`) %>%
  rename(govt_id = POI_ID)
## Reading layer `Govt_Embassy' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 443 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6282 ymin: 1.24911 xmax: 103.9884 ymax: 1.45765
## geographic CRS: WGS 84
govt_sf <- st_as_sf(govt, 4326)
govt_sf3414 <- st_transform(govt_sf, 3414)
st_crs(govt_sf3414)
## Coordinate Reference System:
##   User input: EPSG:3414 
##   wkt:
## PROJCRS["SVY21 / Singapore TM",
##     BASEGEOGCRS["SVY21",
##         DATUM["SVY21",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4757]],
##     CONVERSION["Singapore Transverse Mercator",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["northing (N)",north,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["easting (E)",east,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["Singapore"],
##         BBOX[1.13,103.59,1.47,104.07]],
##     ID["EPSG",3414]]
mpsz_bifg <- st_join(mpsz_bif, govt_sf3414) %>%
  group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count, industry_count, financial_count) %>%
  summarise(govt_count = sum(!is.na(govt_id))) %>%
  select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `financial_count`, `govt_count`, `geometry`)

7. Private Residential

pr <- st_read(dsn = "data/geospatial", layer = "Private residential") %>%
  select(`POI_ID`, `geometry`) %>%
  rename(pr_id = POI_ID)
## Reading layer `Private residential' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3604 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6295 ymin: 1.23943 xmax: 103.9749 ymax: 1.45379
## geographic CRS: WGS 84
pr_sf <- st_as_sf(pr, 4326)
pr_sf3414 <- st_transform(pr_sf, 3414)
st_crs(pr_sf3414)
## Coordinate Reference System:
##   User input: EPSG:3414 
##   wkt:
## PROJCRS["SVY21 / Singapore TM",
##     BASEGEOGCRS["SVY21",
##         DATUM["SVY21",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4757]],
##     CONVERSION["Singapore Transverse Mercator",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["northing (N)",north,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["easting (E)",east,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["Singapore"],
##         BBOX[1.13,103.59,1.47,104.07]],
##     ID["EPSG",3414]]
mpsz_bifgp <- st_join(mpsz_bifg, pr_sf3414) %>%
  group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count, industry_count, financial_count, govt_count) %>%
  summarise(pr_count = sum(!is.na(pr_id))) %>%
  select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `financial_count`, `govt_count`, `pr_count`, `geometry`)

8. Shopping

shopping <- st_read(dsn = "data/geospatial", layer = "Shopping") %>%
  select(`POI_ID`, `geometry`) %>%
  rename(shopping_id = POI_ID)
## Reading layer `Shopping' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 511 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.679 ymin: 1.24779 xmax: 103.9644 ymax: 1.4535
## geographic CRS: WGS 84
shopping_sf <- st_as_sf(shopping, 4326)
shopping_sf3414 <- st_transform(shopping_sf, 3414)
st_crs(shopping_sf3414)
## Coordinate Reference System:
##   User input: EPSG:3414 
##   wkt:
## PROJCRS["SVY21 / Singapore TM",
##     BASEGEOGCRS["SVY21",
##         DATUM["SVY21",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4757]],
##     CONVERSION["Singapore Transverse Mercator",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["northing (N)",north,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["easting (E)",east,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["Singapore"],
##         BBOX[1.13,103.59,1.47,104.07]],
##     ID["EPSG",3414]]
mpsz_bifgps <- st_join(mpsz_bifgp, shopping_sf3414) %>%
  group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count, industry_count, financial_count, govt_count, pr_count) %>%
  summarise(shopping_count = sum(!is.na(shopping_id))) %>%
  select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `financial_count`, `govt_count`, `pr_count`, `shopping_count`, `geometry`)

Preparing the data - Extracting indicators

1. Age group

# Economy active population (i.e. age 25-64)
# Young group (i.e. age 0-24)
# Aged group (i.e 65 and above)

pasz_age_group <- pasz %>%
  spread(age_group, resident_count) %>%
  group_by(subzone) %>%
  mutate(young = `0_to_4`+`5_to_9`+`10_to_14`+ `15_to_19`+`20_to_24`) %>%
  mutate(economy_active = `25_to_29`+`30_to_34`+`35_to_39`+`40_to_44`+`45_to_49`+`50_to_54`+`55_to_59`+`60_to_64`) %>%
  mutate(aged = `65_to_69`+`70_to_74`+`75_to_79`+ `80_to_84`+`85_to_89`+`90_and_over`) %>%
  summarise(young = sum(young), economy_active = sum(economy_active), aged = sum(aged)) %>%
  select(`subzone`, `young`, `economy_active`, `aged`)

2. Dwelling type

# HDB1-2RM dwellers
# HDB3-4RM dwellers
# HDB5RM-Ec dweller
# Condominium and apartment dwellers
# Landed property dwellers 

pasz_dwelling <- pasz %>%
  spread(type_of_dwelling, resident_count) %>%
  group_by(subzone) %>%
  mutate(HDB12rm = `HDB 1- and 2-Room Flats`) %>%
  mutate(HDB34rm = `HDB 3-Room Flats`+`HDB 4-Room Flats`) %>%
  mutate(HDB5rmec = `HDB 5-Room and Executive Flats`) %>%
  mutate(condo = `Condominiums and Other Apartments`) %>%
  mutate(landed = `Landed Properties`) %>%
  summarise(HDB12rm = sum(HDB12rm), HDB34rm = sum(HDB34rm), HDB5rmec = sum(HDB5rmec), condo = sum(condo), landed = sum(landed)) %>%
  select(`subzone`, `HDB12rm`, `HDB34rm`, `HDB5rmec`, `condo`, `landed`)

3. Total residents

total_residents <- pasz %>%
  group_by(subzone) %>%
  summarise(resident_count = sum(resident_count))

Joining df by subzone

residents_sz <- left_join(pasz_age_group, pasz_dwelling, by=c("subzone" = "subzone"))
total_residents_sz <- left_join(residents_sz, total_residents, by=c("subzone" = "subzone")) %>%
  mutate_at(.vars = vars(subzone), .funs = funs(toupper))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
residents_mpsz <- left_join(mpsz_bifgps, total_residents_sz, by=c("SUBZONE_N" = "subzone")) %>%
  mutate(pop_density = `resident_count`/`SHAPE_Area`)
summary(residents_mpsz)
##   SUBZONE_N                      REGION_N     SHAPE_Area       business_count  
##  Length:323         CENTRAL REGION   :134   Min.   :   39438   Min.   :  0.00  
##  Class :character   EAST REGION      : 30   1st Qu.:  628261   1st Qu.:  0.00  
##  Mode  :character   NORTH-EAST REGION: 48   Median : 1229894   Median :  2.00  
##                     NORTH REGION     : 41   Mean   : 2420882   Mean   : 19.94  
##                     WEST REGION      : 70   3rd Qu.: 2106483   3rd Qu.: 14.00  
##                                             Max.   :69748299   Max.   :308.00  
##  industry_count   financial_count    govt_count        pr_count     
##  Min.   :0.0000   Min.   :  0.00   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:  1.00   1st Qu.: 0.000   1st Qu.:  0.00  
##  Median :0.0000   Median :  5.00   Median : 0.000   Median :  4.00  
##  Mean   :0.3406   Mean   : 10.28   Mean   : 1.372   Mean   : 11.16  
##  3rd Qu.:0.0000   3rd Qu.: 13.00   3rd Qu.: 1.000   3rd Qu.: 11.00  
##  Max.   :8.0000   Max.   :134.00   Max.   :19.000   Max.   :217.00  
##  shopping_count            geometry       young        economy_active  
##  Min.   : 0.000   MULTIPOLYGON :323   Min.   :     0   Min.   :     0  
##  1st Qu.: 0.000   epsg:3414    :  0   1st Qu.:     0   1st Qu.:     0  
##  Median : 0.000   +proj=tmer...:  0   Median : 10740   Median : 22420  
##  Mean   : 1.582                       Mean   : 30969   Mean   : 65096  
##  3rd Qu.: 1.000                       3rd Qu.: 40065   3rd Qu.: 91505  
##  Max.   :31.000                       Max.   :360610   Max.   :741760  
##       aged           HDB12rm         HDB34rm          HDB5rmec     
##  Min.   :     0   Min.   :    0   Min.   :     0   Min.   :     0  
##  1st Qu.:     0   1st Qu.:    0   1st Qu.:     0   1st Qu.:     0  
##  Median :  4440   Median :    0   Median :     0   Median :     0  
##  Mean   : 12916   Mean   : 4323   Mean   : 53600   Mean   : 29951  
##  3rd Qu.: 20880   3rd Qu.: 3385   3rd Qu.: 86575   3rd Qu.: 31230  
##  Max.   :129850   Max.   :48330   Max.   :709850   Max.   :448060  
##      condo            landed       resident_count     pop_density     
##  Min.   :     0   Min.   :     0   Min.   :      0   Min.   :0.00000  
##  1st Qu.:     0   1st Qu.:     0   1st Qu.:      0   1st Qu.:0.00000  
##  Median :  1510   Median :     0   Median :  36430   Median :0.04142  
##  Mean   : 13123   Mean   :  6989   Mean   : 108981   Mean   :0.09494  
##  3rd Qu.: 18440   3rd Qu.:  3745   3rd Qu.: 150475   3rd Qu.:0.17612  
##  Max.   :144470   Max.   :172520   Max.   :1232220   Max.   :0.43540

Derive new variables using dplyr

Residents data -> The unit of measurement of the values are number of residents. Using these values directly will be bias by the underlying total number of residents. In general, the subzones with relatively higher total number of residents will also have a higher number of young residents.

Geospatial data -> The unit of measurement of the values are number of businesses. Using these values directly will be bias by the area of the subzone. In general, the subzone with relatively bigger area will also have higher number of business.

residents_mpsz_derived <- residents_mpsz %>%
  mutate(`young_pr` = `young`/`resident_count`*1000) %>%
  mutate(`economy_active_pr` = `economy_active`/`resident_count`*1000) %>%
  mutate(`aged_pr` = `aged`/`resident_count`*1000) %>%
  
  mutate(`HDB12rm_pr` = `HDB12rm`/`resident_count`*1000) %>%
  mutate(`HDB34rm_pr` = `HDB34rm`/`resident_count`*1000) %>%
  mutate(`HDB5rmec_pr` = `HDB5rmec`/`resident_count`*1000) %>%
  mutate(`condo_pr` = `condo`/`resident_count`*1000) %>%
  mutate(`landed_pr` = `landed`/`resident_count`*1000) %>%
  
  mutate(`business_pr` = `business_count`/`SHAPE_Area`*100000) %>%
  mutate(`industry_pr` = `industry_count`/`SHAPE_Area`*100000) %>%
  mutate(`financial_pr` = `financial_count`/`SHAPE_Area`*100000) %>%
  mutate(`govt_pr` = `govt_count`/`SHAPE_Area`*100000) %>%
  mutate(`pr_pr` = `pr_count`/`SHAPE_Area`*100000) %>%
  mutate(`shopping_pr` = `shopping_count`/`SHAPE_Area`*100000) %>%
  
  select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `resident_count`, `young`, `economy_active`, `aged`, `HDB12rm`, `HDB34rm`, `HDB5rmec`, `condo`, `landed`, `business_count`, `industry_count`, `financial_count`, `govt_count`, `pr_count`, `shopping_count`, `young_pr`, `economy_active_pr`, `aged_pr`, `HDB12rm_pr`, `HDB34rm_pr`, `HDB5rmec_pr`, `condo_pr`, `landed_pr`, `pop_density`, `business_pr`, `industry_pr`, `financial_pr`, `govt_pr`, `pr_pr`, `shopping_pr`)

Replacing NA values with 0

residents_mpsz_derived$young_pr[is.na(residents_mpsz_derived$young_pr)] <- 0
residents_mpsz_derived$economy_active_pr[is.na(residents_mpsz_derived$economy_active_pr)] <- 0
residents_mpsz_derived$aged_pr[is.na(residents_mpsz_derived$aged_pr)] <- 0
residents_mpsz_derived$HDB12rm_pr[is.na(residents_mpsz_derived$HDB12rm_pr)] <- 0
residents_mpsz_derived$HDB34rm_pr[is.na(residents_mpsz_derived$HDB34rm_pr)] <- 0
residents_mpsz_derived$HDB5rmec_pr[is.na(residents_mpsz_derived$HDB5rmec_pr)] <- 0
residents_mpsz_derived$condo_pr[is.na(residents_mpsz_derived$condo_pr)] <- 0
residents_mpsz_derived$landed_pr[is.na(residents_mpsz_derived$landed_pr)] <- 0
summary(residents_mpsz_derived)
##   SUBZONE_N                      REGION_N     SHAPE_Area      
##  Length:323         CENTRAL REGION   :134   Min.   :   39438  
##  Class :character   EAST REGION      : 30   1st Qu.:  628261  
##  Mode  :character   NORTH-EAST REGION: 48   Median : 1229894  
##                     NORTH REGION     : 41   Mean   : 2420882  
##                     WEST REGION      : 70   3rd Qu.: 2106483  
##                                             Max.   :69748299  
##  resident_count        young        economy_active        aged       
##  Min.   :      0   Min.   :     0   Min.   :     0   Min.   :     0  
##  1st Qu.:      0   1st Qu.:     0   1st Qu.:     0   1st Qu.:     0  
##  Median :  36430   Median : 10740   Median : 22420   Median :  4440  
##  Mean   : 108981   Mean   : 30969   Mean   : 65096   Mean   : 12916  
##  3rd Qu.: 150475   3rd Qu.: 40065   3rd Qu.: 91505   3rd Qu.: 20880  
##  Max.   :1232220   Max.   :360610   Max.   :741760   Max.   :129850  
##     HDB12rm         HDB34rm          HDB5rmec          condo       
##  Min.   :    0   Min.   :     0   Min.   :     0   Min.   :     0  
##  1st Qu.:    0   1st Qu.:     0   1st Qu.:     0   1st Qu.:     0  
##  Median :    0   Median :     0   Median :     0   Median :  1510  
##  Mean   : 4323   Mean   : 53600   Mean   : 29951   Mean   : 13123  
##  3rd Qu.: 3385   3rd Qu.: 86575   3rd Qu.: 31230   3rd Qu.: 18440  
##  Max.   :48330   Max.   :709850   Max.   :448060   Max.   :144470  
##      landed       business_count   industry_count   financial_count 
##  Min.   :     0   Min.   :  0.00   Min.   :0.0000   Min.   :  0.00  
##  1st Qu.:     0   1st Qu.:  0.00   1st Qu.:0.0000   1st Qu.:  1.00  
##  Median :     0   Median :  2.00   Median :0.0000   Median :  5.00  
##  Mean   :  6989   Mean   : 19.94   Mean   :0.3406   Mean   : 10.28  
##  3rd Qu.:  3745   3rd Qu.: 14.00   3rd Qu.:0.0000   3rd Qu.: 13.00  
##  Max.   :172520   Max.   :308.00   Max.   :8.0000   Max.   :134.00  
##    govt_count        pr_count      shopping_count      young_pr    
##  Min.   : 0.000   Min.   :  0.00   Min.   : 0.000   Min.   :  0.0  
##  1st Qu.: 0.000   1st Qu.:  0.00   1st Qu.: 0.000   1st Qu.:  0.0  
##  Median : 0.000   Median :  4.00   Median : 0.000   Median :246.8  
##  Mean   : 1.372   Mean   : 11.16   Mean   : 1.582   Mean   :193.1  
##  3rd Qu.: 1.000   3rd Qu.: 11.00   3rd Qu.: 1.000   3rd Qu.:290.7  
##  Max.   :19.000   Max.   :217.00   Max.   :31.000   Max.   :573.3  
##  economy_active_pr    aged_pr         HDB12rm_pr       HDB34rm_pr   
##  Min.   :  0.0     Min.   :  0.00   Min.   :  0.00   Min.   :  0.0  
##  1st Qu.:  0.0     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.0  
##  Median :588.9     Median : 98.47   Median :  0.00   Median :  0.0  
##  Mean   :441.7     Mean   : 95.76   Mean   : 27.17   Mean   :254.9  
##  3rd Qu.:609.4     3rd Qu.:150.03   3rd Qu.: 15.50   3rd Qu.:515.8  
##  Max.   :934.4     Max.   :915.25   Max.   :650.32   Max.   :961.7  
##   HDB5rmec_pr       condo_pr         landed_pr        pop_density     
##  Min.   :  0.0   Min.   :   0.00   Min.   :   0.00   Min.   :0.00000  
##  1st Qu.:  0.0   1st Qu.:   0.00   1st Qu.:   0.00   1st Qu.:0.00000  
##  Median :  0.0   Median :  27.25   Median :   0.00   Median :0.04142  
##  Mean   :119.6   Mean   : 199.54   Mean   :  92.42   Mean   :0.09494  
##  3rd Qu.:201.5   3rd Qu.: 247.14   3rd Qu.:  48.31   3rd Qu.:0.17612  
##  Max.   :857.0   Max.   :1000.00   Max.   :1000.00   Max.   :0.43540  
##   business_pr      industry_pr       financial_pr         govt_pr       
##  Min.   : 0.000   Min.   :0.00000   Min.   : 0.00000   Min.   : 0.0000  
##  1st Qu.: 0.000   1st Qu.:0.00000   1st Qu.: 0.04325   1st Qu.: 0.0000  
##  Median : 0.193   Median :0.00000   Median : 0.47461   Median : 0.0000  
##  Mean   : 1.726   Mean   :0.02278   Mean   : 1.88625   Mean   : 0.3364  
##  3rd Qu.: 1.469   3rd Qu.:0.00000   3rd Qu.: 1.25705   3rd Qu.: 0.1009  
##  Max.   :27.892   Max.   :1.44323   Max.   :66.74878   Max.   :25.1316  
##      pr_pr          shopping_pr               geometry  
##  Min.   : 0.0000   Min.   : 0.0000   MULTIPOLYGON :323  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   epsg:3414    :  0  
##  Median : 0.4777   Median : 0.0000   +proj=tmer...:  0  
##  Mean   : 1.1721   Mean   : 0.3397                      
##  3rd Qu.: 1.0928   3rd Qu.: 0.1166                      
##  Max.   :22.6813   Max.   :12.0904

Exploratory Data Analysis (EDA)

1. EDA using statistical graphics

young <- ggplot(data=residents_mpsz_derived, 
             aes(x= `young_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

economy_active <- ggplot(data=residents_mpsz_derived, 
             aes(x= `economy_active_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

aged <- ggplot(data=residents_mpsz_derived, 
             aes(x= `aged_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

HDB12rm <- ggplot(data=residents_mpsz_derived, 
             aes(x= `HDB12rm_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

HDB34rm <- ggplot(data=residents_mpsz_derived, 
             aes(x= `HDB34rm_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

HDB5rmec <- ggplot(data=residents_mpsz_derived, 
             aes(x= `HDB5rmec_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

condo <- ggplot(data=residents_mpsz_derived, 
             aes(x= `condo_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

landed <- ggplot(data=residents_mpsz_derived, 
             aes(x= `landed_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

pop_d <- ggplot(data=residents_mpsz_derived, 
             aes(x= `pop_density`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")
# multiple histograms are plottted to reveal the distribution of the selected variables in the residents_mpsz_derived data.frame.
ggarrange(young, economy_active, aged, HDB12rm, HDB34rm, HDB5rmec, condo, landed, pop_d, ncol = 3, nrow = 3)

From the histograms above, we observed that the overall distribution of data values are very right skewed.

b <- ggplot(data=residents_mpsz_derived, 
             aes(x= `business_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

i <- ggplot(data=residents_mpsz_derived, 
             aes(x= `industry_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

f <- ggplot(data=residents_mpsz_derived, 
             aes(x= `financial_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

g <- ggplot(data=residents_mpsz_derived, 
             aes(x= `govt_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

p <- ggplot(data=residents_mpsz_derived, 
             aes(x= `pr_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

s <- ggplot(data=residents_mpsz_derived, 
             aes(x= `shopping_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")
# multiple histograms are plottted to reveal the distribution of the selected variables in the residents_mpsz_derived data.frame.
ggarrange(b, i, f, g, p, s, 
          ncol = 3, 
          nrow = 2)

From the histograms above, we observed that the overall distribution of data values are very right skewed.

young.box <- ggplot(data=residents_mpsz_derived, aes(x=`young_pr`)) +
  geom_boxplot(color="black", fill="light blue")

ea.box <- ggplot(data=residents_mpsz_derived, aes(x=`economy_active_pr`)) +
  geom_boxplot(color="black", fill="light blue")

a.box <- ggplot(data=residents_mpsz_derived, aes(x=`aged_pr`)) +
  geom_boxplot(color="black", fill="light blue")

hdb12.box <- ggplot(data=residents_mpsz_derived, aes(x=`HDB12rm_pr`)) +
  geom_boxplot(color="black", fill="light blue")

hdb34.box <- ggplot(data=residents_mpsz_derived, aes(x=`HDB34rm_pr`)) +
  geom_boxplot(color="black", fill="light blue")

hdb5.box <- ggplot(data=residents_mpsz_derived, aes(x=`HDB5rmec_pr`)) +
  geom_boxplot(color="black", fill="light blue")

condo.box <- ggplot(data=residents_mpsz_derived, aes(x=`condo_pr`)) +
  geom_boxplot(color="black", fill="light blue")

landed.box <- ggplot(data=residents_mpsz_derived, aes(x=`landed_pr`)) +
  geom_boxplot(color="black", fill="light blue")

popd.box <- ggplot(data=residents_mpsz_derived, aes(x=`pop_density`)) +
  geom_boxplot(color="black", fill="light blue")

ggarrange(young.box, ea.box, a.box, hdb12.box, hdb34.box, hdb5.box, condo.box, landed.box, popd.box, ncol = 3, nrow = 3)

From the boxplots above, we observed that some variables - aged_pr, HDB12rm_pr and HDB5rmec_pr have outliers and this may cause biasness. We may use standardisation to reduce the biasness.

b.box <- ggplot(data=residents_mpsz_derived, aes(x=`business_pr`)) +
  geom_boxplot(color="black", fill="light blue")

i.box <- ggplot(data=residents_mpsz_derived, aes(x=`industry_pr`)) +
  geom_boxplot(color="black", fill="light blue")

g.box <- ggplot(data=residents_mpsz_derived, aes(x=`financial_pr`)) +
  geom_boxplot(color="black", fill="light blue")

f.box <- ggplot(data=residents_mpsz_derived, aes(x=`govt_pr`)) +
  geom_boxplot(color="black", fill="light blue")

pr.box <- ggplot(data=residents_mpsz_derived, aes(x=`pr_pr`)) +
  geom_boxplot(color="black", fill="light blue")

s.box <- ggplot(data=residents_mpsz_derived, aes(x=`shopping_pr`)) +
  geom_boxplot(color="black", fill="light blue")

ggarrange(b.box, i.box, g.box, f.box, pr.box, s.box, ncol = 3, nrow = 2)

From the boxplots above, we observed that some variables - industry_pr, financial+pr, govt_pr and pr_pr have outliers and this may cause biasness. We may use standardisation to reduce the biasness.

2. EDA using choropleth map

total_residents.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "resident_count",
          n = 5,
          style = "jenks", 
          title = "Total Residents") + 
  tm_borders(alpha = 0.5) 

young.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "young",
          n = 5,
          style = "jenks",
          title = "Number of Young Residents ") + 
  tm_borders(alpha = 0.5) 

ec.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "economy_active",
          n = 5,
          style = "jenks",
          title = "Number of Economy Active Residents ") + 
  tm_borders(alpha = 0.5) 

aged.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "aged",
          n = 5,
          style = "jenks",
          title = "Number of Aged Residents ") + 
  tm_borders(alpha = 0.5) 

tmap_arrange(total_residents.map, young.map, ec.map, aged.map)
## Some legend labels were too wide. These labels have been resized to 0.65. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.

The choropleth maps above clearly show that areas with relatively larger number of residents are also showing relatively higher number of young, economy active, and aged residents.

total_residents.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "resident_count",
          n = 5,
          style = "jenks", 
          title = "Total Residents") + 
  tm_borders(alpha = 0.5) 

HDB12rm.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "HDB12rm",
          n = 5,
          style = "jenks",
          title = "Number of Residents living in HDB12rm") + 
  tm_borders(alpha = 0.5) 

HDB34rm.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "HDB34rm",
          n = 5,
          style = "jenks",
          title = "Number of Residents living in HDB34rm") + 
  tm_borders(alpha = 0.5) 

HDB5rmec.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "HDB5rmec",
          n = 5,
          style = "jenks",
          title = "Number of Residents living in HDB5rmec") + 
  tm_borders(alpha = 0.5) 

condo.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "condo",
          n = 5,
          style = "jenks",
          title = "Number of Residents living in condo") + 
  tm_borders(alpha = 0.5) 

landed.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "landed",
          n = 5,
          style = "jenks",
          title = "Number of Residents living in landed") + 
  tm_borders(alpha = 0.5) 

tmap_arrange(total_residents.map, HDB12rm.map, HDB34rm.map, HDB5rmec.map, condo.map, landed.map)

The choropleth maps above clearly show that areas with relatively larger number of residents are also showing relatively higher number of residents living in HDB12rm, HDB34rm, HDB5rmec, condo and landed.

area.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "SHAPE_Area",
          n = 5,
          style = "jenks",
          title = "Area") + 
  tm_borders(alpha = 0.5) 

business.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "business_count",
          n = 5,
          style = "jenks",
          title = "Number of Business") + 
  tm_borders(alpha = 0.5) 

industry.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "industry_count",
          n = 5,
          style = "jenks",
          title = "Number of Industry") + 
  tm_borders(alpha = 0.5) 

financial.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "financial_count",
          n = 5,
          style = "jenks",
          title = "Number of Financial") + 
  tm_borders(alpha = 0.5) 

govt.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "govt_count",
          n = 5,
          style = "jenks",
          title = "Number of Govt") + 
  tm_borders(alpha = 0.5) 

pr.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "pr_count",
          n = 5,
          style = "jenks",
          title = "Number of Private Residents") + 
  tm_borders(alpha = 0.5) 

shopping.map <- tm_shape(st_make_valid(residents_mpsz_derived)) + 
  tm_fill(col = "shopping_count",
          n = 5,
          style = "jenks",
          title = "Number of Shopping") + 
  tm_borders(alpha = 0.5) 

tmap_arrange(area.map, business.map, industry.map, financial.map, govt.map, pr.map, shopping.map)
## Legend labels were too wide. The labels have been resized to 0.39, 0.34, 0.32, 0.31, 0.31. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.

The choropleth maps above clearly show that areas with relatively larger area does not affect the number of business, industry, financial, government, private residential and shopping.

3. Correlation Analysis

cluster_vars.cor = cor(as.data.frame(residents_mpsz_derived)[,19:27])
corrplot.mixed(cluster_vars.cor,
         lower = "ellipse", 
               upper = "number",
               tl.pos = "lt",
               diag = "l",
               tl.col = "black",
               tl.cex = 1)

The correlation plot above shows that young_pr and economy_active_pr are highly correlated. This suggest that only one of them should be used in the cluster analysis instead of both.

Hierarchy Cluster Analysis

1. Extrating clustering variables

# economy_active_pr is not included in our cluster analysis

r_mpsz <- residents_mpsz_derived %>%
  st_set_geometry(NULL) %>%
  ungroup() %>%
  as.data.frame() %>%
  select("SUBZONE_N", "young_pr", "aged_pr", "HDB12rm_pr", "HDB34rm_pr", "HDB5rmec_pr", "condo_pr", "landed_pr", "pop_density")

head(r_mpsz,10)
##                 SUBZONE_N young_pr   aged_pr HDB12rm_pr HDB34rm_pr HDB5rmec_pr
## 1               ADMIRALTY 321.1362  79.73150  77.521284   433.5298    488.5396
## 2            AIRPORT ROAD   0.0000   0.00000   0.000000     0.0000      0.0000
## 3          ALEXANDRA HILL 240.7700 191.06450 293.851458   455.9690    218.3594
## 4         ALEXANDRA NORTH 283.2031  60.54688   0.000000     0.0000      0.0000
## 5                ALJUNIED 235.1806 156.36724  57.616036   489.9239    165.5590
## 6              ANAK BUKIT 295.6962 137.13884   2.808989   103.2805    150.9831
## 7              ANCHORVALE 327.0400  66.39566  24.059018   368.3529    582.4450
## 8  ANG MO KIO TOWN CENTRE 283.3671 128.06662   0.000000   246.6802    408.9579
## 9                   ANSON   0.0000   0.00000   0.000000     0.0000      0.0000
## 10              BALESTIER 241.4252 165.72813  65.083103   460.0184    154.9094
##      condo_pr landed_pr pop_density
## 1     0.00000   0.00000  0.09682568
## 2     0.00000   0.00000  0.00000000
## 3     0.00000   0.00000  0.13511536
## 4  1000.00000   0.00000  0.03486475
## 5   242.36253  14.04223  0.12609451
## 6   383.87841 346.96027  0.07201392
## 7    25.14303   0.00000  0.22153165
## 8   343.46163   0.00000  0.14020991
## 9     0.00000   0.00000  0.00000000
## 10  275.38309  17.43968  0.15208484
row.names(r_mpsz) <- r_mpsz$"SUBZONE_N"
head(r_mpsz,10)
##                                     SUBZONE_N young_pr   aged_pr HDB12rm_pr
## ADMIRALTY                           ADMIRALTY 321.1362  79.73150  77.521284
## AIRPORT ROAD                     AIRPORT ROAD   0.0000   0.00000   0.000000
## ALEXANDRA HILL                 ALEXANDRA HILL 240.7700 191.06450 293.851458
## ALEXANDRA NORTH               ALEXANDRA NORTH 283.2031  60.54688   0.000000
## ALJUNIED                             ALJUNIED 235.1806 156.36724  57.616036
## ANAK BUKIT                         ANAK BUKIT 295.6962 137.13884   2.808989
## ANCHORVALE                         ANCHORVALE 327.0400  66.39566  24.059018
## ANG MO KIO TOWN CENTRE ANG MO KIO TOWN CENTRE 283.3671 128.06662   0.000000
## ANSON                                   ANSON   0.0000   0.00000   0.000000
## BALESTIER                           BALESTIER 241.4252 165.72813  65.083103
##                        HDB34rm_pr HDB5rmec_pr   condo_pr landed_pr pop_density
## ADMIRALTY                433.5298    488.5396    0.00000   0.00000  0.09682568
## AIRPORT ROAD               0.0000      0.0000    0.00000   0.00000  0.00000000
## ALEXANDRA HILL           455.9690    218.3594    0.00000   0.00000  0.13511536
## ALEXANDRA NORTH            0.0000      0.0000 1000.00000   0.00000  0.03486475
## ALJUNIED                 489.9239    165.5590  242.36253  14.04223  0.12609451
## ANAK BUKIT               103.2805    150.9831  383.87841 346.96027  0.07201392
## ANCHORVALE               368.3529    582.4450   25.14303   0.00000  0.22153165
## ANG MO KIO TOWN CENTRE   246.6802    408.9579  343.46163   0.00000  0.14020991
## ANSON                      0.0000      0.0000    0.00000   0.00000  0.00000000
## BALESTIER                460.0184    154.9094  275.38309  17.43968  0.15208484
r_mpsz <- select(r_mpsz, c(2:9))
head(r_mpsz, 10)
##                        young_pr   aged_pr HDB12rm_pr HDB34rm_pr HDB5rmec_pr
## ADMIRALTY              321.1362  79.73150  77.521284   433.5298    488.5396
## AIRPORT ROAD             0.0000   0.00000   0.000000     0.0000      0.0000
## ALEXANDRA HILL         240.7700 191.06450 293.851458   455.9690    218.3594
## ALEXANDRA NORTH        283.2031  60.54688   0.000000     0.0000      0.0000
## ALJUNIED               235.1806 156.36724  57.616036   489.9239    165.5590
## ANAK BUKIT             295.6962 137.13884   2.808989   103.2805    150.9831
## ANCHORVALE             327.0400  66.39566  24.059018   368.3529    582.4450
## ANG MO KIO TOWN CENTRE 283.3671 128.06662   0.000000   246.6802    408.9579
## ANSON                    0.0000   0.00000   0.000000     0.0000      0.0000
## BALESTIER              241.4252 165.72813  65.083103   460.0184    154.9094
##                          condo_pr landed_pr pop_density
## ADMIRALTY                 0.00000   0.00000  0.09682568
## AIRPORT ROAD              0.00000   0.00000  0.00000000
## ALEXANDRA HILL            0.00000   0.00000  0.13511536
## ALEXANDRA NORTH        1000.00000   0.00000  0.03486475
## ALJUNIED                242.36253  14.04223  0.12609451
## ANAK BUKIT              383.87841 346.96027  0.07201392
## ANCHORVALE               25.14303   0.00000  0.22153165
## ANG MO KIO TOWN CENTRE  343.46163   0.00000  0.14020991
## ANSON                     0.00000   0.00000  0.00000000
## BALESTIER               275.38309  17.43968  0.15208484

2. Data Standardisation

Min-Max standardisation

r_mpsz.std <- normalize(r_mpsz)
r_mpsz.std_df <- as.data.frame(r_mpsz.std)
summary(r_mpsz.std_df)
##     young_pr         aged_pr         HDB12rm_pr        HDB34rm_pr    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.4304   Median :0.1076   Median :0.00000   Median :0.0000  
##  Mean   :0.3369   Mean   :0.1046   Mean   :0.04178   Mean   :0.2650  
##  3rd Qu.:0.5070   3rd Qu.:0.1639   3rd Qu.:0.02383   3rd Qu.:0.5364  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##   HDB5rmec_pr        condo_pr         landed_pr        pop_density     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.02725   Median :0.00000   Median :0.09513  
##  Mean   :0.1395   Mean   :0.19954   Mean   :0.09242   Mean   :0.21806  
##  3rd Qu.:0.2351   3rd Qu.:0.24714   3rd Qu.:0.04831   3rd Qu.:0.40450  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000

Visualising the standardised clustering variables

y <- ggplot(data=residents_mpsz_derived, 
             aes(x= `young_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

r_mpsz_y_df <- as.data.frame(r_mpsz.std_df)
y1 <- ggplot(data=r_mpsz_y_df, 
       aes(x=`young_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

a <- ggplot(data=residents_mpsz_derived, 
             aes(x= `aged_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

r_mpsz_a_df <- as.data.frame(r_mpsz.std_df)
a1 <- ggplot(data=r_mpsz_a_df, 
       aes(x=`aged_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

p <- ggplot(data=residents_mpsz_derived, 
             aes(x= `pop_density`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

r_mpsz_p_df <- as.data.frame(r_mpsz.std_df)
p1 <- ggplot(data=r_mpsz_p_df, 
       aes(x=`pop_density`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

ggarrange(y, y1, a, a1, p, p1, ncol = 2, nrow = 3)

hdb12rm <- ggplot(data=residents_mpsz_derived, 
             aes(x= `HDB12rm_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

r_mpsz_hdb12rm_df <- as.data.frame(r_mpsz.std_df)
hdb12rm1 <- ggplot(data=r_mpsz_hdb12rm_df, 
       aes(x=`HDB12rm_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

hdb34rm <- ggplot(data=residents_mpsz_derived, 
             aes(x= `HDB34rm_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

r_mpsz_hdb34rm_df <- as.data.frame(r_mpsz.std_df)
hdb34rm1 <- ggplot(data=r_mpsz_hdb34rm_df, 
       aes(x=`HDB34rm_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

hdb5rm <- ggplot(data=residents_mpsz_derived, 
             aes(x= `HDB5rmec_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

r_mpsz_hdb5rm_df <- as.data.frame(r_mpsz.std_df)
hdb5rm1 <- ggplot(data=r_mpsz_hdb5rm_df, 
       aes(x=`HDB5rmec_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

ggarrange(hdb12rm, hdb12rm1, hdb34rm, hdb34rm1, hdb5rm, hdb5rm1, ncol = 2, nrow = 3)

c <- ggplot(data=residents_mpsz_derived, 
             aes(x= `condo_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

r_mpsz_condo_df <- as.data.frame(r_mpsz.std_df)
c1 <- ggplot(data=r_mpsz_condo_df, 
       aes(x=`condo_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

l <- ggplot(data=residents_mpsz_derived, 
             aes(x= `landed_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

r_mpsz_landed_df <- as.data.frame(r_mpsz.std_df)
l1 <- ggplot(data=r_mpsz_landed_df, 
       aes(x=`landed_pr`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

ggarrange(c, c1, l, l1, ncol = 2, nrow = 2)

Based on the histograms above, we noticed that overall range of the clustering variables is not very large. Therefore, we do not need to apply standardisation as doing so will change the distribution.

3. Computing proximity matrix

proxmat <- dist(r_mpsz, method = 'euclidean')

4. Computing hierarchical clustering

hclust_ward <- hclust(proxmat, method = 'ward.D')
plot(hclust_ward, cex = 0.6)

5. Selecting the optimal clustering algorithm

m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

ac <- function(x) {
  agnes(r_mpsz, method = x)$ac
}

map_dbl(m, ac)
##   average    single  complete      ward 
## 0.9431697 0.8945523 0.9567745 0.9927800

Interpreting the dendrograms

plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k = 5, border = 2:5)

6. Visually-driven hierarchical clustering analysis

Transforming the data frame into a matrix

r_mpsz_mat <- data.matrix(r_mpsz)

Plotting interactive cluster heatmap using heatmaply()

heatmaply(normalize(r_mpsz_mat),
          Colv=NA,
          dist_method = "euclidean",
          hclust_method = "ward.D",
          seriate = "OLO",
          colors = Blues,
          k_row = 5,
          margins = c(NA,200,60,NA),
          fontsize_row = 4,
          fontsize_col = 5,
          main="Geographic Segmentation of Subzones by Residents indicators",
          xlab = "Residents Indicators",
          ylab = "Subzones"
          )

7. Mapping the clusters formed

groups <- as.factor(cutree(hclust_ward, k=5))
residents_mpsz_clusters <- cbind(residents_mpsz_derived, as.matrix(groups)) %>%
  rename(`CLUSTER`=`as.matrix.groups.`)
qtm(residents_mpsz_clusters, "CLUSTER")
## Warning: The shape residents_mpsz_clusters is invalid. See sf::st_is_valid

Spatially Constrained Clustering - SKATER approach

1. Converting into SpatialPolygonsDataFrame

residents_mpsz_filtered <- residents_mpsz_derived %>%
  filter(resident_count == 0 & business_count == 0 & industry_count == 0 & govt_count == 0 & financial_count == 0 & pr_count == 0 & shopping_count == 0) %>%
  ungroup()

residents_mpsz_filtered_df <- as.data.frame(residents_mpsz_filtered)
residents_filtered <- anti_join(residents_mpsz_derived, residents_mpsz_filtered_df, by='SUBZONE_N')
residents_sp <- as_Spatial(residents_filtered)

2. Computing Neighbour List

residents.nb <- poly2nb(residents_sp)
summary(residents.nb)
## Neighbour list object:
## Number of regions: 300 
## Number of nonzero links: 1758 
## Percentage nonzero weights: 1.953333 
## Average number of links: 5.86 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 15 
##  1  5 13 34 79 74 47 31 12  1  1  1  1 
## 1 least connected region:
## 106 with 1 link
## 1 most connected region:
## 39 with 15 links
plot(residents_sp, border=grey(.5))
plot(residents.nb, coordinates(residents_sp), col="blue", add=TRUE)

3. Computing minimum spanning tree

# Calculating edge costs
lcosts <- nbcosts(residents.nb, r_mpsz)
residents.w <- nb2listw(residents.nb, lcosts, style="B")
## Warning in nb2listw(residents.nb, lcosts, style = "B"): zero sum general weights
summary(residents.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 300 
## Number of nonzero links: 1758 
## Percentage nonzero weights: 1.953333 
## Average number of links: 5.86 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 15 
##  1  5 13 34 79 74 47 31 12  1  1  1  1 
## 1 least connected region:
## 106 with 1 link
## 1 most connected region:
## 39 with 15 links
## 
## Weights style: B 
## Weights constants summary:
##     n    nn      S0         S1          S2
## B 300 90000 1213552 2072096477 22630050243
residents_mst <- mstree(residents.w)
class(residents_mst)
## [1] "mst"    "matrix"
dim(residents_mst)
## [1] 299   3
plot(residents_sp, border=gray(.5))
plot.mst(residents_mst, coordinates(residents_sp), 
     col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)

4. Computing spatially constrained clusters using SKATER method

clust5 <- skater(residents_mst[,1:2], r_mpsz, 4)
ccs5 <- clust5$groups
ccs5
##   [1] 1 1 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 1 4 4 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 2 3 3 1 1 1 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 4 1 1 1 5 1 4 1
## [112] 1 1 1 1 1 4 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 5 4 1 4 1 1 1 1 4 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 5 2 1 1 1 1 1 1 1 1 4 4 1
## [186] 1 1 1 5 1 1 5 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 4 1 1 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 3 1 1 1 1 5 2 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1
table(ccs5)
## ccs5
##   1   2   3   4   5 
## 266   6   7  13   8
plot(residents_sp, border=gray(.5))
plot(clust5, coordinates(residents_sp), cex.lab=.7,
     groups.colors=c("red","green","blue", "brown", "pink"), 
     cex.circles=0.005, add = TRUE)

5. Visualising the clusters in choropleth map

groups_mat <- as.matrix(clust5$groups)
residents_clusters_filtered <- anti_join(residents_mpsz_clusters, residents_mpsz_filtered_df, by='SUBZONE_N')
residents_sf_spatialcluster <- cbind(residents_clusters_filtered, as.factor(groups_mat)) %>%
    rename(`SP_CLUSTER`=`as.factor.groups_mat.`)

qtm(residents_sf_spatialcluster, "SP_CLUSTER")
## Warning: The shape residents_sf_spatialcluster is invalid. See sf::st_is_valid

Based on the choropleth map above, we can observe that:

1. Cluster 2 is located at the centre of the map

2. Cluster 5 is located at the west of the map

3. Cluster 3 and 4 is located at the North-East/East of the map

5.1 Plotting business with clusters

tm_shape(st_make_valid(residents_sf_spatialcluster)) +
  tm_fill("SP_CLUSTER") +
  tm_borders(lwd = 0.1,  alpha = 1) +
  tm_layout(title = "Business") +
  tm_bubbles(col = "business_count",
             size = "business_count",
             border.col = "black",
             border.lwd = 1,
             alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=business_count)) + 
  geom_bar(stat = "identity")

5.2 Plotting industry with clusters

tm_shape(st_make_valid(residents_sf_spatialcluster)) +
  tm_fill("SP_CLUSTER") +
  tm_borders(lwd = 0.1,  alpha = 1) +
  tm_layout(title = "Industry") +
  tm_bubbles(col = "industry_count",
             size = "industry_count",
             border.col = "black",
             border.lwd = 1,
             alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=industry_count)) + 
  geom_bar(stat = "identity")

5.3 Plotting financial with clusters

tm_shape(st_make_valid(residents_sf_spatialcluster)) +
  tm_fill("SP_CLUSTER") +
  tm_borders(lwd = 0.1,  alpha = 1) +
  tm_layout(title = "Financial") +
  tm_bubbles(col = "financial_count",
             size = "financial_count",
             border.col = "black",
             border.lwd = 1,
             alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=financial_count)) + 
  geom_bar(stat = "identity")

5.4 Plotting government with clusters

tm_shape(st_make_valid(residents_sf_spatialcluster)) +
  tm_fill("SP_CLUSTER") +
  tm_borders(lwd = 0.1,  alpha = 1) +
  tm_layout(title = "Government with Embassy") +
  tm_bubbles(col = "govt_count",
             size = "govt_count",
             border.col = "black",
             border.lwd = 1,
             alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=govt_count)) + 
  geom_bar(stat = "identity")

5.5 Plotting Private Residents with clusters

tm_shape(st_make_valid(residents_sf_spatialcluster)) +
  tm_fill("SP_CLUSTER") +
  tm_borders(lwd = 0.1,  alpha = 1) +
  tm_layout(title = "Private Residents") +
  tm_bubbles(col = "pr_count",
             size = "pr_count",
             border.col = "black",
             border.lwd = 1,
             alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=pr_count)) + 
  geom_bar(stat = "identity")

5.6 Plotting Shopping with clusters

tm_shape(st_make_valid(residents_sf_spatialcluster)) +
  tm_fill("SP_CLUSTER") +
  tm_borders(lwd = 0.1,  alpha = 1) +
  tm_layout(title = "Shopping") +
  tm_bubbles(col = "shopping_count",
             size = "shopping_count",
             border.col = "black",
             border.lwd = 1,
             alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=shopping_count)) + 
  geom_bar(stat = "identity")

Based on choropleth maps and bar charts above, we observed that majority of the number of business, industry, financial, government with embassy, private resident and shopping is in cluster 1. This might be due to the fact that the area of cluster 1 is significantly larger than other clusters. Looking at the numbers in other clusters:

1. Cluster 4 and 5 has the most number of business buildings

2. Cluster 4 has the most number of industrial buildings

3. Cluster 3 has the most number of financial buildings

4. Cluster 4 has the most number of government with embassy buildings

5. Cluster 3 has the most number of private residential buildings

6. Cluster 3 has the most number of shopping Malls