1 Overview

In this take-home exercise, you are tasked to segment Singapore at the planning subzone level into homogeneous socioeconomic areas by combining geodemographic data extracted from Singapore Department of Statistics and urban functions extracted from the geospatial data provided.

2 Installing Packages

packages = c('factoextra','rgdal', 'spdep', 'ClustGeo',  'tmap', 'sf', 'ggpubr', 'cluster', 'heatmaply', 'corrplot', 'psych', 'tidyverse')
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
    }
  library(p,character.only = T)
}

3 Data Wrangling

3.1 Importing Data into R environment

business_sf <- st_read(dsn = "data/geospatial", layer = "business")
## Reading layer `business' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 6550 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6147 ymin: 1.24605 xmax: 104.0044 ymax: 1.4698
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
financial_sf <- st_read(dsn = "data/geospatial", layer = "Financial")
## Reading layer `Financial' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3320 features and 29 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6256 ymin: 1.24392 xmax: 103.9998 ymax: 1.46247
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
govtembassy_sf <- st_read(dsn = "data/geospatial", layer = "Govt_Embassy")
## Reading layer `Govt_Embassy' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 443 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6282 ymin: 1.24911 xmax: 103.9884 ymax: 1.45765
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
private_sf <- st_read(dsn = "data/geospatial", layer = "Private residential")
## Reading layer `Private residential' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3604 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.6295 ymin: 1.23943 xmax: 103.9749 ymax: 1.45379
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
shopping_sf <- st_read(dsn = "data/geospatial", layer = "Shopping")
## Reading layer `Shopping' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 511 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 103.679 ymin: 1.24779 xmax: 103.9644 ymax: 1.4535
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
mpsz <- st_read(dsn = "data/geospatial/master-plan-2014-subzone-boundary-web-shp", layer = "MP14_SUBZONE_WEB_PL")
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial\master-plan-2014-subzone-boundary-web-shp' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## epsg (SRID):    NA
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
resident <- read_csv ("data/aspatial/Singapore Residents by Planning AreaSubzone Age Group Sex and Type of Dwelling June 20112019/respopagesextod2011to2019.csv")

3.2 Check CRS of geospatial data

st_crs(business_sf)
## Coordinate Reference System:
##   EPSG: 4326 
##   proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(financial_sf)
## Coordinate Reference System:
##   EPSG: 4326 
##   proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(govtembassy_sf)
## Coordinate Reference System:
##   EPSG: 4326 
##   proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(private_sf)
## Coordinate Reference System:
##   EPSG: 4326 
##   proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(shopping_sf)
## Coordinate Reference System:
##   EPSG: 4326 
##   proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(mpsz)
## Coordinate Reference System:
##   No EPSG code
##   proj4string: "+proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs"

3.3 Set and Transform crs

mpsz3414 <- subset(st_set_crs(mpsz,3414),select=c(SUBZONE_N))
mpsz_area <- subset(st_set_crs(mpsz,3414),select=c(SUBZONE_N,SHAPE_Area)) %>% st_set_geometry(NULL)
business3414 <- st_transform(business_sf,3414)
financial3414 <- st_transform(financial_sf,3414)
govtembassy3414 <- st_transform(govtembassy_sf,3414)
private3414 <- st_transform(private_sf,3414)
shopping3414 <- st_transform(shopping_sf,3414)

3.4 Filter and Extract data in aspatial data

Data to extract in year 2019:  Economy active population (i.e. age 25-64)  Young group (i.e. age 0-24)  Aged group (i.e 65 and above)  Population density  HDB1-2RM dwellers  HDB3-4RM dwellers  HDB5RM-Ec dweller  Condominium and apartment dwellers  Landed property dwellers

3.4.1 Filter time to 2019

residents_2019 <- resident %>%
  filter(Time == 2019)

3.4.2 Reclassify data

Reclassify data according to the data needed to be extracted

3.4.2.1 Reclassify Age Group

Print Current Age group

unique(residents_2019$AG)
##  [1] "0_to_4"      "5_to_9"      "10_to_14"    "15_to_19"    "20_to_24"   
##  [6] "25_to_29"    "30_to_34"    "35_to_39"    "40_to_44"    "45_to_49"   
## [11] "50_to_54"    "55_to_59"    "60_to_64"    "65_to_69"    "70_to_74"   
## [16] "75_to_79"    "80_to_84"    "85_to_89"    "90_and_over"

Transpose age column and group the age groups into Young, ECONOMY Active and Aged.

residents_2019_AG <- residents_2019%>%
  spread(AG, Pop) %>%
  mutate(YOUNG = `0_to_4`+`5_to_9`+`10_to_14`+`15_to_19`+`20_to_24`) %>%
  mutate(`ECONOMY ACTIVE` = `25_to_29`+`30_to_34`+`35_to_39`+`40_to_44`+`45_to_49`+`50_to_54`+`55_to_59`+`60_to_64`)%>%
  mutate(`AGED`= `65_to_69`+`70_to_74`+`75_to_79`+`80_to_84`+`85_to_89`+`90_and_over`) %>%
  mutate(`Pop`= `0_to_4`+`5_to_9`+`10_to_14`+`15_to_19`+`20_to_24`+ `25_to_29`+`30_to_34`+`35_to_39`+`40_to_44`+`45_to_49`+`50_to_54`+`55_to_59`+`60_to_64`+ `65_to_69`+`70_to_74`+`75_to_79`+`80_to_84`+`85_to_89`+`90_and_over`) %>%
  mutate_at(.vars = vars(PA, SZ), .funs = funs(toupper)) %>%
  select(`SZ`, `YOUNG`, `ECONOMY ACTIVE`, `AGED`,`Pop`)
residents_2019_AG
## # A tibble: 5,168 x 5
##    SZ                     YOUNG `ECONOMY ACTIVE`  AGED   Pop
##    <chr>                  <dbl>            <dbl> <dbl> <dbl>
##  1 ANG MO KIO TOWN CENTRE   340              600    80  1020
##  2 ANG MO KIO TOWN CENTRE     0                0     0     0
##  3 ANG MO KIO TOWN CENTRE    50              160    50   260
##  4 ANG MO KIO TOWN CENTRE    80              200    80   360
##  5 ANG MO KIO TOWN CENTRE   220              530   220   970
##  6 ANG MO KIO TOWN CENTRE     0                0     0     0
##  7 ANG MO KIO TOWN CENTRE     0                0     0     0
##  8 ANG MO KIO TOWN CENTRE     0                0     0     0
##  9 ANG MO KIO TOWN CENTRE   340              510    60   910
## 10 ANG MO KIO TOWN CENTRE     0                0     0     0
## # ... with 5,158 more rows

Sum up data by Subzone

residents_2019_AG_SZ <- residents_2019_AG%>%
  group_by(SZ) %>%
  summarise(YOUNG = sum(YOUNG),`ECONOMY ACTIVE` = sum(`ECONOMY ACTIVE`), AGED = sum(AGED),Pop=sum(Pop))
residents_2019_AG_SZ
## # A tibble: 323 x 5
##    SZ                     YOUNG `ECONOMY ACTIVE`  AGED   Pop
##    <chr>                  <dbl>            <dbl> <dbl> <dbl>
##  1 ADMIRALTY               4370             8380  1360 14110
##  2 AIRPORT ROAD               0                0     0     0
##  3 ALEXANDRA HILL          2910             7560  3310 13780
##  4 ALEXANDRA NORTH          580             1420   120  2120
##  5 ALJUNIED                8350            24330  7510 40190
##  6 ANAK BUKIT              5910            12490  3850 22250
##  7 ANCHORVALE             15250            27740  3620 46610
##  8 ANG MO KIO TOWN CENTRE  1360             2790   740  4890
##  9 ANSON                      0                0     0     0
## 10 BALESTIER               7350            19320  6090 32760
## # ... with 313 more rows

Calculate Population density. We join mpsz_area to residents_2019_AG_SZ and obtain Pop/SHAPE_Area

residents_2019_AG_PD <- full_join(residents_2019_AG_SZ,mpsz_area, by = c("SZ"="SUBZONE_N"))
nrow(residents_2019_AG_PD)
## [1] 323
residents_2019_AG_PD$Pop_Density<-residents_2019_AG_PD$Pop/residents_2019_AG_PD$SHAPE_Area
head(residents_2019_AG_PD)
## # A tibble: 6 x 7
##   SZ              YOUNG `ECONOMY ACTIVE`  AGED   Pop SHAPE_Area Pop_Density
##   <chr>           <dbl>            <dbl> <dbl> <dbl>      <dbl>       <dbl>
## 1 ADMIRALTY        4370             8380  1360 14110   1261649.     0.0112 
## 2 AIRPORT ROAD        0                0     0     0    494504.     0      
## 3 ALEXANDRA HILL   2910             7560  3310 13780   1030379.     0.0134 
## 4 ALEXANDRA NORTH   580             1420   120  2120    293706.     0.00722
## 5 ALJUNIED         8350            24330  7510 40190   2959368.     0.0136 
## 6 ANAK BUKIT       5910            12490  3850 22250   2768354.     0.00804

3.4.2.2 Reclassify TOD

Reclassify TOD and omit those that are not needed

Print Current TOD

unique(residents_2019$TOD)
## [1] "HDB 1- and 2-Room Flats"                
## [2] "HDB 3-Room Flats"                       
## [3] "HDB 4-Room Flats"                       
## [4] "HDB 5-Room and Executive Flats"         
## [5] "HUDC Flats (excluding those privatised)"
## [6] "Landed Properties"                      
## [7] "Condominiums and Other Apartments"      
## [8] "Others"

We would like to reclassify the above into the following categories: 1. HDB1-2RM dwellers 2. HDB3-4RM dwellers 3. HDB5RM-Ec dweller 4. Condominium and apartment dwellers 5. Landed property dwellers

Create new columns to reclassify TOD by subzone and Obtain sum population density by Subzone

residents_2019_TOD <- residents_2019%>%
  spread(TOD,Pop)%>%
  mutate("HDB1-2RM dwellers" = `HDB 1- and 2-Room Flats`)%>%
  mutate("HDB3-4RM dwellers" = `HDB 3-Room Flats`+`HDB 4-Room Flats`)%>%
  mutate("HDB5RM-Ec dweller" = `HDB 5-Room and Executive Flats`)%>%
  mutate("Condominium and apartment dwellers" = `Condominiums and Other Apartments`)%>%
  mutate("Landed property dwellers" = `Landed Properties`) %>%
  mutate_at(.vars = vars(PA, SZ), .funs = funs(toupper)) %>%
 select(`SZ`, `HDB1-2RM dwellers`, `HDB3-4RM dwellers`, `HDB5RM-Ec dweller`, `Condominium and apartment dwellers`,`Landed property dwellers`)
head(residents_2019_TOD)
## # A tibble: 6 x 6
##   SZ    `HDB1-2RM dwell~ `HDB3-4RM dwell~ `HDB5RM-Ec dwel~ `Condominium an~
##   <chr>            <dbl>            <dbl>            <dbl>            <dbl>
## 1 ANG ~                0               20               20               40
## 2 ANG ~                0               20               20               50
## 3 ANG ~                0               40               60               80
## 4 ANG ~                0               30               50               70
## 5 ANG ~                0               20               50               80
## 6 ANG ~                0               30               40               90
## # ... with 1 more variable: `Landed property dwellers` <dbl>

Sum up data by Subzone

residents_2019_TOD_SZ <- residents_2019_TOD%>%
  group_by(SZ) %>%
  summarise(`HDB1-2RM dwellers` = sum(`HDB1-2RM dwellers`),`HDB3-4RM dwellers` = sum(`HDB3-4RM dwellers`), `HDB5RM-Ec dweller` = sum(`HDB5RM-Ec dweller`),`Condominium and apartment dwellers`=sum(`Condominium and apartment dwellers`),`Landed property dwellers`=sum(`Landed property dwellers`))
head(residents_2019_TOD_SZ)
## # A tibble: 6 x 6
##   SZ    `HDB1-2RM dwell~ `HDB3-4RM dwell~ `HDB5RM-Ec dwel~ `Condominium an~
##   <chr>            <dbl>            <dbl>            <dbl>            <dbl>
## 1 ADMI~             1140             6750             6220                0
## 2 AIRP~                0                0                0                0
## 3 ALEX~             3910             6320             3120                0
## 4 ALEX~                0                0                0             2120
## 5 ALJU~             2120            18020             6400            11930
## 6 ANAK~              160             2090             3170             9020
## # ... with 1 more variable: `Landed property dwellers` <dbl>

3.4.3 Join back the new age group and TOD to MPSZ by Subzone

Before joining, lets do a count for each dataset

nrow(mpsz3414)
## [1] 323
nrow(residents_2019_AG_PD)
## [1] 323
nrow(residents_2019_TOD_SZ)
## [1] 323

Next we will do a full join of residents_2019_AG_PD and residents_2019_TOD_SZ to MPSZ

mpsz3414_AG<- full_join(mpsz3414, residents_2019_AG_PD, by = c("SUBZONE_N" = "SZ"))

We do a count of rows to ensure the joining of both data

nrow(mpsz3414_AG)
## [1] 323
mpsz3414_AG_TOD<- full_join(mpsz3414_AG, residents_2019_TOD_SZ, by = c("SUBZONE_N" = "SZ"))

We do a count of rows again to ensure the joining of both data

nrow(mpsz3414_AG_TOD)
## [1] 323

3.5 Manipulate Spatial Data

3.5.1 Split Business and Industry

First we need to spilt business and industry

In order to do so, we will have to view the data first.

view(business3414)

FAC_TYPE consists of the 2 types - Business and Industry. Hence, lets look at the unqiue values to obtain the 2 codes.

unique(business3414$FAC_TYPE)
## [1] 5000 9991

By referencing to business3414, we can derive that 5000 represents Business and 9991 represents Industry. Next, we will split the 2. We will first extract Business.

business3414b <- business3414%>%
  filter(FAC_TYPE==5000)
business3414b
## Simple feature collection with 6440 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 3669.148 ymin: 25408.41 xmax: 47034.83 ymax: 50148.54
## epsg (SRID):    3414
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## First 10 features:
##        POI_ID SEQ_NUM FAC_TYPE                        POI_NAME
## 1  1101180209       1     5000                       JOHN CHEN
## 2  1101180210       1     5000    TROPICAL INDUSTRIAL BUILDING
## 3  1101180211       1     5000 LIAN CHEONG INDUSTRIAL BUILDING
## 4  1101180212       1     5000  MALAYSIA GARMENT MANUFACTURERS
## 5  1101180213       1     5000                         UNIGOLD
## 6  1192316144       1     5000             NUS UNIVERSITY HALL
## 7  1144317654       1     5000           SUITES AT BUKIT TIMAH
## 8  1103507488       1     5000                      TIONG HUAT
## 9  1001052867       1     5000  LEE CHOON GUAN TIMBER MERCHANT
## 10 1001052868       1     5000           WEIGHT BRIDGE SERVICE
##                ST_NAME                  geometry
## 1            LITTLE RD POINT (33818.36 35620.16)
## 2            LITTLE RD  POINT (33770.51 35610.2)
## 3            LITTLE RD POINT (33779.41 35612.41)
## 4                 <NA> POINT (33802.78 35598.04)
## 5            LITTLE RD POINT (33835.06 35623.47)
## 6  LOWER KENT RIDGE RD POINT (21813.48 31063.37)
## 7  JALAN JURONG KECHIL POINT (21375.11 35831.37)
## 8   KALLANG PUDDING RD  POINT (33088.33 34439.2)
## 9           PENJURU RD POINT (17103.73 33407.71)
## 10          PENJURU RD   POINT (17178.3 33503.9)

Next, we will extract Industry

business3414I <- business3414%>%
  filter(FAC_TYPE==9991)
business3414I
## Simple feature collection with 110 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 4674.563 ymin: 26960.98 xmax: 46655.31 ymax: 49784.76
## epsg (SRID):    3414
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## First 10 features:
##        POI_ID SEQ_NUM FAC_TYPE
## 1  1110491789       1     9991
## 2  1099992474       1     9991
## 3  1099992477       1     9991
## 4  1099992477       2     9991
## 5  1100464367       1     9991
## 6  1100464367       2     9991
## 7  1100464368       1     9991
## 8  1100464369       1     9991
## 9  1100464369       2     9991
## 10 1108286608       1     9991
##                                                        POI_NAME
## 1                                  KAKI BUKIT INDUSTRIAL ESTATE
## 2                             TERRACE FACTORIES TUAS SOUTH ST 5
## 3               TERRACE FACTORIES TUAS SOUTH ST 5 EAST ENTRANCE
## 4                             JTC TERRACE FACTORIES TUAS S ST 5
## 5  CHANGI BUSINESS PARK CHANGI BUSINESS PARK CENTRAL 1 ENTRANCE
## 6                                CHANGI BUSINESS PARK CENTRAL 1
## 7                                          CHANGI BUSINESS PARK
## 8      CHANGI BUSINESS PARK CHANGI BUSINESS PARK VISTA ENTRANCE
## 9                           CHANGI BUSINESS PARK VISTA ENTRANCE
## 10                                 ANG MO KIO INDUSTRIAL PARK 1
##                           ST_NAME                  geometry
## 1                KAKI BUKIT AVE 1 POINT (35885.03 34987.72)
## 2                 TUAS SOUTH ST 5 POINT (4726.864 30627.49)
## 3                 TUAS SOUTH ST 5 POINT (4825.915 30636.33)
## 4                 TUAS SOUTH ST 5 POINT (4825.915 30636.33)
## 5  CHANGI BUSINESS PARK CENTRAL 1 POINT (42460.02 35003.47)
## 6  CHANGI BUSINESS PARK CENTRAL 1 POINT (42460.02 35003.47)
## 7  CHANGI BUSINESS PARK CENTRAL 2 POINT (42729.34 35182.61)
## 8        CHANGI BUSINESS PARK VIS   POINT (43055.4 35408.2)
## 9        CHANGI BUSINESS PARK VIS   POINT (43055.4 35408.2)
## 10          ANG MO KIO IND PARK 1  POINT (30518.61 37985.3)

3.5.2 Check NA and Unique

We will need to check for NA as we will need to count the number of each urban functions. Hence we need to ensure there is no NA for the column that we are using and ensure that is unique. First we check for NA.

financial3414[rowSums(is.na(financial3414))!=0,]
## Simple feature collection with 3320 features and 29 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 4881.527 ymin: 25171.88 xmax: 46526.16 ymax: 49338.02
## epsg (SRID):    3414
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## First 10 features:
##       LINK_ID     POI_ID SEQ_NUM FAC_TYPE            POI_NAME POI_LANGCD
## 1  1170624361 1132324230       1     3578                 UOB        ENG
## 2  1112103842 1132315471       1     3578                POSB        ENG
## 3  1112103842 1132315472       1     3578                 UOB        ENG
## 4  1112103842 1132315473       1     3578                OCBC        ENG
## 5   864687596 1100784924       1     3578                OCBC        ENG
## 6   902073032 1132324170       1     6000             MAYBANK        ENG
## 7   778516217 1141424387       1     6000 ADPOST MONEYCHANGER        ENG
## 8   880495939 1096910285       1     3578                 UOB        ENG
## 9   866996334 1096910292       1     3578                OCBC        ENG
## 10  880495939 1096910286       1     3578            CITIBANK        ENG
##    POI_NMTYPE POI_ST_NUM ST_NUM_FUL ST_NFUL_LC           ST_NAME ST_LANGCD
## 1           B        201       <NA>       <NA>      YISHUN AVE 2       ENG
## 2           B        375       <NA>       <NA>  COMMONWEALTH AVE       ENG
## 3           B        375       <NA>       <NA>  COMMONWEALTH AVE       ENG
## 4           B        375       <NA>       <NA>  COMMONWEALTH AVE       ENG
## 5           B       <NA>       <NA>       <NA> JURONG WEST ST 51       ENG
## 6           B        707       <NA>       <NA>     EAST COAST RD       ENG
## 7           B        163       <NA>       <NA>        TANGLIN RD       ENG
## 8           B       <NA>       <NA>       <NA>              <NA>      <NA>
## 9           B         11       <NA>       <NA>         ARTS LINK       ENG
## 10          B       <NA>       <NA>       <NA>              <NA>      <NA>
##    POI_ST_SD ACC_TYPE   PH_NUMBER CHAIN_ID NAT_IMPORT PRIVATE IN_VICIN
## 1          L     <NA>        <NA>     6919          N       N        N
## 2          R     <NA>        <NA>     6918          N       N        N
## 3          R     <NA>        <NA>     6919          N       N        N
## 4          R     <NA>        <NA>     6920          N       N        N
## 5          R     <NA>        <NA>     6920          N       N        N
## 6          L     <NA> 18006292266     3657          N       N        N
## 7          R     <NA>    67330779        0          N       N        N
## 8          R     <NA>        <NA>     6919          N       N        N
## 9          R     <NA>        <NA>     6920          N       N        N
## 10         R     <NA>        <NA>     1165          N       N        N
##    NUM_PARENT NUM_CHILD PERCFRREF VANCITY_ID
## 1           0         0        NA          0
## 2           0         0        NA          0
## 3           0         0        NA          0
## 4           0         0        NA          0
## 5           0         0        60          0
## 6           0         0        NA          0
## 7           1         0        50          0
## 8           0         0        20          0
## 9           0         0        NA          0
## 10          0         0        20          0
##                                                              ACT_ADDR
## 1                                                                <NA>
## 2                                                                <NA>
## 3                                                                <NA>
## 4                                                                <NA>
## 5  501 JURONG WEST STREET 51                         SINGAPORE 640501
## 6                                                                <NA>
## 7                                                                <NA>
## 8                                                                <NA>
## 9                                                                <NA>
## 10                                                               <NA>
##    ACT_LANGCD            ACT_ST_NAM ACT_ST_NUM ACT_ADMIN ACT_POSTAL
## 1        <NA>                  <NA>       <NA>      <NA>       <NA>
## 2        <NA>                  <NA>       <NA>      <NA>       <NA>
## 3        <NA>                  <NA>       <NA>      <NA>       <NA>
## 4        <NA>                  <NA>       <NA>      <NA>       <NA>
## 5         ENG JURONG WEST STREET 51        501 SINGAPORE     640501
## 6        <NA>                  <NA>       <NA>      <NA>       <NA>
## 7        <NA>                  <NA>       <NA>      <NA>       <NA>
## 8        <NA>                  <NA>       <NA>      <NA>       <NA>
## 9        <NA>                  <NA>       <NA>      <NA>       <NA>
## 10       <NA>                  <NA>       <NA>      <NA>       <NA>
##                     geometry
## 1  POINT (27966.77 44304.65)
## 2  POINT (24163.96 31606.25)
## 3  POINT (24163.96 31606.25)
## 4  POINT (24163.96 31606.25)
## 5  POINT (15270.94 36919.65)
## 6  POINT (37917.26 32698.88)
## 7  POINT (26981.85 31956.75)
## 8  POINT (21205.83 30939.54)
## 9  POINT (21159.08 30673.06)
## 10 POINT (21205.83 30939.54)

As seen above, POI_ID does not have any NA. this could likely be the unique primary key. Next, lets check for if it consists of only unique values. We do a count for each POI_ID.

F_POI_N <- financial3414 %>% group_by(POI_ID, LINK_ID) %>% summarize(count=n())
F_POI_N
## Simple feature collection with 3293 features and 3 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 4881.527 ymin: 25171.88 xmax: 46526.16 ymax: 49338.02
## epsg (SRID):    3414
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## # A tibble: 3,293 x 4
## # Groups:   POI_ID [3,293]
##      POI_ID    LINK_ID count            geometry
##  *    <dbl>      <dbl> <int>         <POINT [m]>
##  1 36438791 1195622058     1  (17518.91 35275.3)
##  2 36439016 1206451148     1 (20678.49 37946.69)
##  3 36439060  862669111     1 (34824.27 41553.63)
##  4 36439061  862669111     1 (34822.04 41538.15)
##  5 36439318  939808842     1 (33879.48 39781.09)
##  6 36439341 1199010724     1 (32936.92 37359.48)
##  7 36439599  116173695     1 (23708.83 34892.54)
##  8 36439668  914931160     1 (38133.16 32755.28)
##  9 36439689  901144425     1 (25411.55 34020.09)
## 10 36439715 1199568246     1 (31094.04 31778.74)
## # ... with 3,283 more rows
F_POI_N_2 <- F_POI_N %>% filter(count>=2)  %>% st_set_geometry(NULL) 
F_POI_N_2
## # A tibble: 27 x 3
## # Groups:   POI_ID [27]
##        POI_ID    LINK_ID count
##  *      <dbl>      <dbl> <int>
##  1 1040480984 1046226297     2
##  2 1058317297  762719732     2
##  3 1058317303  762719732     2
##  4 1058317333  966347222     2
##  5 1083881074 1128584674     2
##  6 1113017405  845452563     2
##  7 1115796366  778595512     2
##  8 1115796975 1188277710     2
##  9 1115797008  116157295     2
## 10 1115868988  862559104     2
## # ... with 17 more rows

Next, we join back the data to see why the data is duplicated.

F_POI_N_2_all <- left_join(F_POI_N_2,financial3414, c("POI_ID"="POI_ID"))
F_POI_N_2_all
## # A tibble: 54 x 32
## # Groups:   POI_ID [27]
##    POI_ID LINK_ID.x count LINK_ID.y SEQ_NUM FAC_TYPE POI_NAME POI_LANGCD
##     <dbl>     <dbl> <int>     <dbl>   <int>    <int> <fct>    <fct>     
##  1 1.04e9    1.05e9     2    1.05e9       1     6000 CREDIT ~ ENG       
##  2 1.04e9    1.05e9     2    1.05e9       2     6000 CREDIT ~ ENG       
##  3 1.06e9    7.63e8     2    7.63e8       1     6000 BANQUE ~ ENG       
##  4 1.06e9    7.63e8     2    7.63e8       2     6000 BCEE     ENG       
##  5 1.06e9    7.63e8     2    7.63e8       1     6000 DZ BANK~ ENG       
##  6 1.06e9    7.63e8     2    7.63e8       2     6000 DZ BANK  ENG       
##  7 1.06e9    9.66e8     2    9.66e8       1     6000 RAIFFEI~ ENG       
##  8 1.06e9    9.66e8     2    9.66e8       2     6000 RAIFFEI~ ENG       
##  9 1.08e9    1.13e9     2    1.13e9       1     6000 AMEERTE~ ENG       
## 10 1.08e9    1.13e9     2    1.13e9       2     6000 AMEERTE~ ENG       
## # ... with 44 more rows, and 24 more variables: POI_NMTYPE <fct>,
## #   POI_ST_NUM <fct>, ST_NUM_FUL <fct>, ST_NFUL_LC <fct>, ST_NAME <fct>,
## #   ST_LANGCD <fct>, POI_ST_SD <fct>, ACC_TYPE <fct>, PH_NUMBER <fct>,
## #   CHAIN_ID <dbl>, NAT_IMPORT <fct>, PRIVATE <fct>, IN_VICIN <fct>,
## #   NUM_PARENT <int>, NUM_CHILD <int>, PERCFRREF <int>, VANCITY_ID <dbl>,
## #   ACT_ADDR <fct>, ACT_LANGCD <fct>, ACT_ST_NAM <fct>, ACT_ST_NUM <fct>,
## #   ACT_ADMIN <fct>, ACT_POSTAL <fct>, geometry <POINT [m]>

This can be seen that it is due to the SEQ_NUM abd POI_NMTYPE whereby there is SEQ_NUM = 1 and 2 and POI_NMTYPE = B and J for each of the duplicates. The other different field is the POI_NAME which differes slightly. Hence, for the purpose of our spatial data, let’s check if those rows with counts = 1 have any SEQ_NUM = 2.

F_POI_N_1 <- F_POI_N %>% filter(count==1)  %>% st_set_geometry(NULL) 
F_POI_N_1_all <- left_join(F_POI_N_1,financial3414, c("POI_ID"="POI_ID"))
unique(F_POI_N_1_all$SEQ_NUM)
## [1] 1

As seen above, there are no SEQ = 2. Hence, for our spatial data, we will filter all to sequence 1 only to avoid any duplicates.

3.5.3 Filter SEQ_NUM=1 to omit duplicates

business3414b_filter <- business3414b%>%filter(SEQ_NUM==1)

business3414I_filter <- business3414I%>%filter(SEQ_NUM==1)

financial3414_filter <- financial3414%>%filter(SEQ_NUM==1)

govtembassy3414_filter <- govtembassy3414%>%filter(SEQ_NUM==1)

private3414_filter <- private3414%>%filter(SEQ_NUM==1)

shopping3414_filter <- shopping3414%>%filter(SEQ_NUM==1)

3.5.4 Join all spatial data to MPSZ

We will join all the spatial data to mpsz to otbtain the subzone that each point belongs to and count by subzone

3.5.4.1 Join Financial to mpsz

Before joining, lets do a row count.

nrow(financial3414_filter)
## [1] 3293

We join financial3414 to mpsz3414 to obtain the subzone of each point.

mpsz_financial <- st_join(mpsz3414, financial3414_filter, join = st_contains)

We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any financial centers.

nrow(mpsz_financial)
## [1] 3366

Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.

mpsz_financial_sz <- mpsz_financial %>%
  group_by(SUBZONE_N)%>%
  summarise(Financial = sum(!is.na(POI_ID)))
mpsz_financial_sz_df <- mpsz_financial_sz %>% st_set_geometry(NULL)

head(mpsz_financial_sz_df)
## # A tibble: 6 x 2
##   SUBZONE_N       Financial
##   <fct>               <int>
## 1 ADMIRALTY               2
## 2 AIRPORT ROAD            0
## 3 ALEXANDRA HILL         14
## 4 ALEXANDRA NORTH         0
## 5 ALJUNIED               42
## 6 ANAK BUKIT             31

Next, we join the mpsz_financial_sz to mpsz3414_AG_TOD by Subzone

mpsz3414_AG_TOD_F<- full_join(mpsz3414_AG_TOD, mpsz_financial_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))

We then count the number of rows to ensure it adds up to 323.

nrow(mpsz3414_AG_TOD_F)
## [1] 323

3.5.4.2 Join GovtEmbassy to mpsz

Before joining, lets do a row count.

nrow(govtembassy3414_filter)
## [1] 394

We join govtembassy3414 to mpsz3414 to obtain the subzone of each point.

mpsz_govtembassy <- st_join(mpsz3414, govtembassy3414_filter, join = st_contains)

We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any govt embassy.

nrow(mpsz_govtembassy)
## [1] 584

Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.

mpsz_govtembassy_sz <- mpsz_govtembassy %>%
  group_by(SUBZONE_N)%>%
  summarise(GovtEmbassy = sum(!is.na(POI_ID)))

mpsz_govtembassy_sz_df <- mpsz_govtembassy_sz %>% st_set_geometry(NULL)

Next, we join the mpsz_govtembassy_sz_df to mpsz3414_AG_TOD_F

mpsz3414_AG_TOD_F_G<- full_join(mpsz3414_AG_TOD_F, mpsz_govtembassy_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))

We then count the number of rows to ensure it adds up to 323.

nrow(mpsz3414_AG_TOD_F_G)
## [1] 323

3.5.4.3 Join Private to mpsz

Before joining, lets do a row count.

nrow(private3414_filter)
## [1] 3580

We join private3414 to mpsz3414 to obtain the subzone of each point.

mpsz_private <- st_join(mpsz3414, private3414_filter, join = st_contains)

We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any Private.

nrow(mpsz_private)
## [1] 3664

Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.

mpsz_private_sz <- mpsz_private %>%
  group_by(SUBZONE_N)%>%
  summarise(Private = sum(!is.na(POI_ID)))
mpsz_private_sz_df <- mpsz_private_sz %>% st_set_geometry(NULL)

Next, we join the mpsz_private3414_sz_df to mpsz3414_AG_TOD_F_G

mpsz3414_AG_TOD_F_G_P <- full_join(mpsz3414_AG_TOD_F_G, mpsz_private_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))

We then count the number of rows to ensure it adds up to 323.

nrow(mpsz3414_AG_TOD_F_G_P)
## [1] 323

3.5.4.4 Join Shopping to mpsz

Before joining, lets do a row count.

nrow(shopping3414_filter)
## [1] 458

We join shopping3414 to mpsz3414 to obtain the subzone of each point.

mpsz_shopping <- st_join(mpsz3414, shopping3414_filter, join = st_contains)

We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any Shopping Centres

nrow(mpsz_shopping)
## [1] 634

Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.

mpsz_Shopping_sz <- mpsz_shopping %>%
  group_by(SUBZONE_N)%>%
  summarise(Shopping = sum(!is.na(POI_ID)))

mpsz_Shopping_sz_df <- mpsz_Shopping_sz %>% st_set_geometry(NULL)

Next, we join the mpsz_Shopping_sz_df to mpsz3414_AG_TOD_F_G_P

mpsz3414_AG_TOD_F_G_P_S <- full_join(mpsz3414_AG_TOD_F_G_P, mpsz_Shopping_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))

We then count the number of rows to ensure it adds up to 323.

nrow(mpsz3414_AG_TOD_F_G_P_S)
## [1] 323

3.5.4.5 Join Business to mpsz

Before joining, lets do a row count.

nrow(business3414b_filter)
## [1] 6320

We join business3414b to mpsz3414 to obtain the subzone of each point.

mpsz_business<- st_join(mpsz3414, business3414b_filter, join = st_contains)

We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any Business Centres

nrow(mpsz_business)
## [1] 6427

Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.

mpsz_business_sz <- mpsz_business %>%
  group_by(SUBZONE_N)%>%
  summarise(Business = sum(!is.na(POI_ID)))

mpsz_business_sz_df <- mpsz_business_sz %>% st_set_geometry(NULL)

Next, we join the mpsz_business_sz_df to mpsz3414_AG_TOD_F_G_P

mpsz3414_AG_TOD_F_G_P_S_B <- full_join(mpsz3414_AG_TOD_F_G_P_S, mpsz_business_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))

We then count the number of rows to ensure it adds up to 323.

nrow(mpsz3414_AG_TOD_F_G_P_S_B)
## [1] 323

3.4.5.5 Join Industy to mpsz

Before joining, lets do a row count.

nrow(business3414I_filter)
## [1] 95

We join business3414I to mpsz3414 to obtain the subzone of each point.

mpsz_industry<- st_join(mpsz3414, business3414I_filter, join = st_contains)

We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any Industry Centres

nrow(mpsz_industry)
## [1] 369

Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.

mpsz_industry_sz <- mpsz_industry %>%
  group_by(SUBZONE_N)%>%
  summarise(Industry = sum(!is.na(POI_ID)))

mpsz_industry_sz_df <- mpsz_industry_sz %>% st_set_geometry(NULL)

Next, we join the mpsz_industry_sz_df to mpsz3414_AG_TOD_F_G_P

mpsz3414_all <- full_join(mpsz3414_AG_TOD_F_G_P_S_B, mpsz_industry_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))

We then count the number of rows to ensure it adds up to 323.

nrow(mpsz3414_all)
## [1] 323
head(mpsz3414_all)
## Simple feature collection with 6 features and 18 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 24468.89 ymin: 28369.47 xmax: 32362.39 ymax: 30542.74
## epsg (SRID):    3414
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
##        SUBZONE_N YOUNG ECONOMY ACTIVE AGED   Pop SHAPE_Area  Pop_Density
## 1   MARINA SOUTH     0              0    0     0  1630379.3 0.0000000000
## 2   PEARL'S HILL  1100           3420 2110  6630   559816.2 0.0118431719
## 3      BOAT QUAY     0             50   20    70   160807.5 0.0004353031
## 4 HENDERSON HILL  2620           7500 3260 13380   595428.9 0.0224711973
## 5        REDHILL  2840           6260 1630 10730   387429.4 0.0276953656
## 6 ALEXANDRA HILL  2910           7560 3310 13780  1030378.8 0.0133737225
##   HDB1-2RM dwellers HDB3-4RM dwellers HDB5RM-Ec dweller
## 1                 0                 0                 0
## 2              4520              1400                 0
## 3                 0                 0                 0
## 4              3930              7990              1270
## 5              1970              3520              3240
## 6              3910              6320              3120
##   Condominium and apartment dwellers Landed property dwellers Financial
## 1                                  0                        0         3
## 2                                420                        0        24
## 3                                 50                        0         2
## 4                                190                        0         4
## 5                               1930                        0        12
## 6                                  0                        0        14
##   GovtEmbassy Private Shopping Business Industry
## 1           0       0        0        0        0
## 2           1       6        3        6        0
## 3           1       1        1       39        0
## 4           0       5        0        0        0
## 5           0       6        1        1        0
## 6           5       4        1       39        1
##                         geometry
## 1 MULTIPOLYGON (((31495.56 30...
## 2 MULTIPOLYGON (((29092.28 30...
## 3 MULTIPOLYGON (((29932.33 29...
## 4 MULTIPOLYGON (((27131.28 30...
## 5 MULTIPOLYGON (((26451.03 30...
## 6 MULTIPOLYGON (((25899.7 297...

3.4.6 Check for NA

After joining all the data, lets check if there is any NA data.

mpsz3414_all[rowSums(is.na(mpsz3414_all))!=0,]
## Simple feature collection with 0 features and 18 fields
## bbox:           xmin: NA ymin: NA xmax: NA ymax: NA
## epsg (SRID):    3414
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
##  [1] SUBZONE_N                          YOUNG                             
##  [3] ECONOMY ACTIVE                     AGED                              
##  [5] Pop                                SHAPE_Area                        
##  [7] Pop_Density                        HDB1-2RM dwellers                 
##  [9] HDB3-4RM dwellers                  HDB5RM-Ec dweller                 
## [11] Condominium and apartment dwellers Landed property dwellers          
## [13] Financial                          GovtEmbassy                       
## [15] Private                            Shopping                          
## [17] Business                           Industry                          
## [19] geometry                          
## <0 rows> (or 0-length row.names)

4 Exploratory Data Analysis (EDA)

4.1 EDA using statistical graphics

4.1.1 Check Data Range

We first use summary() to derive the summary statistics of mpsz3414_all data.frame.

summary(mpsz3414_all)
##   SUBZONE_N             YOUNG       ECONOMY ACTIVE       AGED      
##  Length:323         Min.   :    0   Min.   :    0   Min.   :    0  
##  Class :character   1st Qu.:    0   1st Qu.:    0   1st Qu.:    0  
##  Mode  :character   Median : 1170   Median : 2840   Median :  730  
##                     Mean   : 3296   Mean   : 7386   Mean   : 1806  
##                     3rd Qu.: 4365   3rd Qu.:10285   3rd Qu.: 3000  
##                     Max.   :34260   Max.   :79780   Max.   :18860  
##       Pop           SHAPE_Area        Pop_Density      HDB1-2RM dwellers
##  Min.   :     0   Min.   :   39438   Min.   :0.00000   Min.   :   0     
##  1st Qu.:     0   1st Qu.:  628261   1st Qu.:0.00000   1st Qu.:   0     
##  Median :  4890   Median : 1229894   Median :0.00591   Median :   0     
##  Mean   : 12487   Mean   : 2420882   Mean   :0.01074   Mean   : 542     
##  3rd Qu.: 17065   3rd Qu.: 2106483   3rd Qu.:0.01993   3rd Qu.: 605     
##  Max.   :132900   Max.   :69748299   Max.   :0.04606   Max.   :4700     
##  HDB3-4RM dwellers HDB5RM-Ec dweller Condominium and apartment dwellers
##  Min.   :    0     Min.   :    0     Min.   :    0                     
##  1st Qu.:    0     1st Qu.:    0     1st Qu.:    0                     
##  Median :    0     Median :    0     Median :  230                     
##  Mean   : 5953     Mean   : 3297     Mean   : 1827                     
##  3rd Qu.: 9705     3rd Qu.: 3660     3rd Qu.: 2835                     
##  Max.   :75000     Max.   :47960     Max.   :16770                     
##  Landed property dwellers   Financial      GovtEmbassy       Private      
##  Min.   :    0.0          Min.   :  0.0   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:    0.0          1st Qu.:  1.0   1st Qu.: 0.00   1st Qu.:  0.00  
##  Median :    0.0          Median :  5.0   Median : 0.00   Median :  4.00  
##  Mean   :  773.9          Mean   : 10.2   Mean   : 1.22   Mean   : 11.08  
##  3rd Qu.:  400.0          3rd Qu.: 13.0   3rd Qu.: 1.00   3rd Qu.: 11.00  
##  Max.   :18820.0          Max.   :132.0   Max.   :17.00   Max.   :215.00  
##     Shopping         Business         Industry               geometry  
##  Min.   : 0.000   Min.   :  0.00   Min.   :0.0000   MULTIPOLYGON :323  
##  1st Qu.: 0.000   1st Qu.:  0.00   1st Qu.:0.0000   epsg:3414    :  0  
##  Median : 0.000   Median :  2.00   Median :0.0000   +proj=tmer...:  0  
##  Mean   : 1.418   Mean   : 19.57   Mean   :0.2941                      
##  3rd Qu.: 1.000   3rd Qu.: 13.50   3rd Qu.:0.0000                      
##  Max.   :27.000   Max.   :303.00   Max.   :5.0000

4.1.2 Check distribution of data

We can plot the distribution of the variables by using appropriate Exploratory Data Analysis (EDA) as shown in the code chunk below. Histogram is useful to identify the overall distribution of the data values (i.e. left skew, right skew or normal distribution)

a <- ggplot(data=mpsz3414_all, aes(x=`YOUNG`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

b <- ggplot(data=mpsz3414_all, aes(x=`ECONOMY ACTIVE`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

c <- ggplot(data=mpsz3414_all, aes(x=`AGED`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

d <- ggplot(data=mpsz3414_all, aes(x=`Pop`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

e <- ggplot(data=mpsz3414_all, aes(x=`HDB1-2RM dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

f <- ggplot(data=mpsz3414_all, aes(x=`HDB3-4RM dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

g <- ggplot(data=mpsz3414_all, aes(x=`HDB5RM-Ec dweller`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

h <- ggplot(data=mpsz3414_all, aes(x=`Condominium and apartment dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

i <- ggplot(data=mpsz3414_all, aes(x=`Landed property dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")


ggarrange(a,b,c,d,e,f,g,h,i,
          ncol = 3,
          nrow = 3)

j <- ggplot(data=mpsz3414_all, aes(x=`Financial`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

k <- ggplot(data=mpsz3414_all, aes(x=`GovtEmbassy`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

l <- ggplot(data=mpsz3414_all, aes(x=`Private`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

m <- ggplot(data=mpsz3414_all, aes(x=`Shopping`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

n <- ggplot(data=mpsz3414_all, aes(x=`Business`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

o <- ggplot(data=mpsz3414_all, aes(x=`Industry`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

ggarrange(j,k,l,m,n,o,
          ncol = 3,
          nrow = 2)

As all the NA values is due to the fact that the total population is 0, the new dervied variabless yield a NA value. Hence, we will have to remove this NA values with 0 as it is irelevant for our analysis.

mpsz3414_all_o <- mpsz3414_all%>%filter(Pop>0)
summary(mpsz3414_all_o)
##   SUBZONE_N             YOUNG       ECONOMY ACTIVE       AGED        
##  Length:234         Min.   :    0   Min.   :   10   Min.   :    0.0  
##  Class :character   1st Qu.:  735   1st Qu.: 1760   1st Qu.:  472.5  
##  Mode  :character   Median : 2730   Median : 6505   Median : 1905.0  
##                     Mean   : 4549   Mean   :10195   Mean   : 2492.7  
##                     3rd Qu.: 6512   3rd Qu.:15582   3rd Qu.: 3547.5  
##                     Max.   :34260   Max.   :79780   Max.   :18860.0  
##       Pop           SHAPE_Area        Pop_Density       HDB1-2RM dwellers
##  Min.   :    50   Min.   :   49626   Min.   :0.000004   Min.   :   0.0   
##  1st Qu.:  2992   1st Qu.:  552455   1st Qu.:0.004062   1st Qu.:   0.0   
##  Median : 11085   Median : 1059527   Median :0.011992   Median :   0.0   
##  Mean   : 17237   Mean   : 1733644   Mean   :0.014820   Mean   : 748.1   
##  3rd Qu.: 26378   3rd Qu.: 1794212   3rd Qu.:0.024407   3rd Qu.:1290.0   
##  Max.   :132900   Max.   :69748299   Max.   :0.046058   Max.   :4700.0   
##  HDB3-4RM dwellers HDB5RM-Ec dweller Condominium and apartment dwellers
##  Min.   :    0     Min.   :    0     Min.   :    0.0                   
##  1st Qu.:    0     1st Qu.:    0     1st Qu.:   52.5                   
##  Median : 5285     Median : 1850     Median : 1285.0                   
##  Mean   : 8217     Mean   : 4552     Mean   : 2521.8                   
##  3rd Qu.:13482     3rd Qu.: 5690     3rd Qu.: 4027.5                   
##  Max.   :75000     Max.   :47960     Max.   :16770.0                   
##  Landed property dwellers   Financial       GovtEmbassy    
##  Min.   :    0.0          Min.   :  0.00   Min.   : 0.000  
##  1st Qu.:    0.0          1st Qu.:  3.00   1st Qu.: 0.000  
##  Median :    0.0          Median :  7.50   Median : 0.000  
##  Mean   : 1068.3          Mean   : 12.38   Mean   : 1.269  
##  3rd Qu.:  887.5          3rd Qu.: 16.00   3rd Qu.: 1.750  
##  Max.   :18820.0          Max.   :132.00   Max.   :17.000  
##     Private          Shopping         Business         Industry     
##  Min.   :  0.00   Min.   : 0.000   Min.   :  0.00   Min.   :0.0000  
##  1st Qu.:  3.00   1st Qu.: 0.000   1st Qu.:  0.00   1st Qu.:0.0000  
##  Median :  7.00   Median : 1.000   Median :  2.00   Median :0.0000  
##  Mean   : 15.16   Mean   : 1.718   Mean   : 10.06   Mean   :0.2137  
##  3rd Qu.: 15.00   3rd Qu.: 2.000   3rd Qu.:  7.00   3rd Qu.:0.0000  
##  Max.   :215.00   Max.   :27.000   Max.   :224.00   Max.   :5.0000  
##           geometry  
##  MULTIPOLYGON :234  
##  epsg:3414    :  0  
##  +proj=tmer...:  0  
##                     
##                     
## 

4.1.3 Check distribution of the derived variables

We can plot the distribution of the variables by using appropriate Exploratory Data Analysis (EDA) as shown in the code chunk below. Histogram is useful to identify the overall distribution of the data values (i.e. left skew, right skew or normal distribution)

youngo <- ggplot(data=mpsz3414_all_o, aes(x=`YOUNG`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

econo <- ggplot(data=mpsz3414_all_o, aes(x=`ECONOMY ACTIVE`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

agedo <- ggplot(data=mpsz3414_all_o, aes(x=`AGED`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

popo <- ggplot(data=mpsz3414_all_o, aes(x=`Pop`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

hdb1o <- ggplot(data=mpsz3414_all_o, aes(x=`HDB1-2RM dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

hdb3o <- ggplot(data=mpsz3414_all_o, aes(x=`HDB3-4RM dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

hdb5o <- ggplot(data=mpsz3414_all_o, aes(x=`HDB5RM-Ec dweller`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

condoo <- ggplot(data=mpsz3414_all_o, aes(x=`Condominium and apartment dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

landedo <- ggplot(data=mpsz3414_all_o, aes(x=`Landed property dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")


ggarrange(youngo,econo,agedo,popo,hdb1o, hdb3o,hdb5o,condoo,landedo,i,
          ncol = 3,
          nrow = 4)

4.1.4 Split Socio-economic and Urban functions

For analysis purpose, we split the socio economic factors and the urban functions

socio <- mpsz3414_all_o %>% select(`SUBZONE_N`,`YOUNG`,`ECONOMY ACTIVE`,`AGED`,`HDB1-2RM dwellers`,`HDB3-4RM dwellers`,`HDB5RM-Ec dweller`,`Condominium and apartment dwellers`,`Landed property dwellers`,`Pop_Density`)
urban <- mpsz3414_all_o %>% select(`Financial`,`GovtEmbassy`,`Private`,`Shopping`,`Business`,`Industry`)

4.2 Correlation Analysis

To perform correlation analyss, we will need to remove the geometry.

socio_df <- socio%>% st_set_geometry(NULL) 
cluster_vars.cor = cor(socio_df[2:10])
corrplot.mixed(cluster_vars.cor,
         lower = "ellipse", 
               upper = "number",
               tl.pos = "lt",
               diag = "l",
               tl.col = "black",width=10, height=4,number.cex = .55)

As seen from abovE: 1. Young has a high correlation with economy active of 0.99 2. Young and HDB 3-4 have a high correlation of 0.91 3. Young and HDB5RM/EC have a high correlation of 0.9242. Economy active and HDB3-4RM have a high correlation of 0.94 4. Economy active and HDB5RM-EC have a high correlation of 0.89 5. Aged has a high correlation of 0.91 with economy active of 0.91 6. Aged and HDB 3-4 RM have a high correlation of 0.91

Based on the high corrrelations, we can see that Economy active people who stays with their children (YOUNG) usually stay in HDB3-4 room or HDB 5 room/Executive condo. Economy active people who stay with their parents (AGED) usuallt stay in HDB 3-4 room. There are also cases where the YOUNG stay with AGED which could be the case that children stay with both their parents(ECONOMY ACTIVE) and grandparents (AGED), o either or. Based on the correlations, ECONOMY ACTIVE has the highest correlation with YOUNG, then ECONOMY ACTIVE and AGED,then AGED and YOUNG. Hence, we will first drop ECONOMY ACTIVE.

After dropping economy active, we are left with 3 high correlations, YOUNG and AGED, YOUNG and HDB3-4 RM, AGED and HDB3-4 RM. As discussed earlier, all 3 age groups have a high correlation with HDB3-4 rooms as families with the different age group tend to stay in HDB 3-4 rooms. Hence, we will remove HDB3-4 RM so that we are able to seperate the age groups.

Between AGED and YOUNG, we decide to drop YOUNG as YOUNG.Apart from the explanations apart, we also drop the following variales due to the it having the highest correlation. again in this case,YOUNG has the highest correlation with other variables as compared to AGED.

new_socio <-  subset(socio_df, select = -c(`ECONOMY ACTIVE`,`HDB3-4RM dwellers`,`YOUNG`)) 
cluster_vars.cor = cor(new_socio[2:7])
corrplot.mixed(cluster_vars.cor,
         lower = "ellipse", 
               upper = "number",
               tl.pos = "lt",
               diag = "l",
               tl.col = "black",width=10, height=4,number.cex = .55)

5 Hierarchy Cluster Analysis

In this section, you will learn how to perform hierarchical cluster analysis. The analysis consists of four major steps:

5.1 Extrating analysis variables

cluster_vars <- new_socio[1:7]
head(cluster_vars,10)
##          SUBZONE_N AGED HDB1-2RM dwellers HDB5RM-Ec dweller
## 1     PEARL'S HILL 2110              4520                 0
## 2        BOAT QUAY   20                 0                 0
## 3   HENDERSON HILL 3260              3930              1270
## 4          REDHILL 1630              1970              3240
## 5   ALEXANDRA HILL 3310              3910              3120
## 6    BUKIT HO SWEE 3590              3700              1900
## 7      CLARKE QUAY   10                 0                 0
## 8  PASIR PANJANG 1  560                 0                 0
## 9        QUEENSWAY   50                 0                 0
## 10 ALEXANDRA NORTH  120                 0                 0
##    Condominium and apartment dwellers Landed property dwellers
## 1                                 420                        0
## 2                                  50                        0
## 3                                 190                        0
## 4                                1930                        0
## 5                                   0                        0
## 6                                 540                        0
## 7                                  10                        0
## 8                                2970                     1430
## 9                                 290                        0
## 10                               2120                        0
##     Pop_Density
## 1  0.0118431719
## 2  0.0004353031
## 3  0.0224711973
## 4  0.0276953656
## 5  0.0133737225
## 6  0.0267883659
## 7  0.0002067649
## 8  0.0040837310
## 9  0.0004591192
## 10 0.0072180927

Next, we need to change the rows by township name instead of row number by using the code chunk below

row.names(cluster_vars) <- cluster_vars$"SUBZONE_N"
head(cluster_vars,10)
##                       SUBZONE_N AGED HDB1-2RM dwellers HDB5RM-Ec dweller
## PEARL'S HILL       PEARL'S HILL 2110              4520                 0
## BOAT QUAY             BOAT QUAY   20                 0                 0
## HENDERSON HILL   HENDERSON HILL 3260              3930              1270
## REDHILL                 REDHILL 1630              1970              3240
## ALEXANDRA HILL   ALEXANDRA HILL 3310              3910              3120
## BUKIT HO SWEE     BUKIT HO SWEE 3590              3700              1900
## CLARKE QUAY         CLARKE QUAY   10                 0                 0
## PASIR PANJANG 1 PASIR PANJANG 1  560                 0                 0
## QUEENSWAY             QUEENSWAY   50                 0                 0
## ALEXANDRA NORTH ALEXANDRA NORTH  120                 0                 0
##                 Condominium and apartment dwellers
## PEARL'S HILL                                   420
## BOAT QUAY                                       50
## HENDERSON HILL                                 190
## REDHILL                                       1930
## ALEXANDRA HILL                                   0
## BUKIT HO SWEE                                  540
## CLARKE QUAY                                     10
## PASIR PANJANG 1                               2970
## QUEENSWAY                                      290
## ALEXANDRA NORTH                               2120
##                 Landed property dwellers  Pop_Density
## PEARL'S HILL                           0 0.0118431719
## BOAT QUAY                              0 0.0004353031
## HENDERSON HILL                         0 0.0224711973
## REDHILL                                0 0.0276953656
## ALEXANDRA HILL                         0 0.0133737225
## BUKIT HO SWEE                          0 0.0267883659
## CLARKE QUAY                            0 0.0002067649
## PASIR PANJANG 1                     1430 0.0040837310
## QUEENSWAY                              0 0.0004591192
## ALEXANDRA NORTH                        0 0.0072180927

Notice that the row number has been replaced into the Subzone name.

Now, we will delete the Subzone field by using the code chunk below.

subzone_var <- select(cluster_vars, c(2:7))
head(subzone_var, 10)
##                 AGED HDB1-2RM dwellers HDB5RM-Ec dweller
## PEARL'S HILL    2110              4520                 0
## BOAT QUAY         20                 0                 0
## HENDERSON HILL  3260              3930              1270
## REDHILL         1630              1970              3240
## ALEXANDRA HILL  3310              3910              3120
## BUKIT HO SWEE   3590              3700              1900
## CLARKE QUAY       10                 0                 0
## PASIR PANJANG 1  560                 0                 0
## QUEENSWAY         50                 0                 0
## ALEXANDRA NORTH  120                 0                 0
##                 Condominium and apartment dwellers
## PEARL'S HILL                                   420
## BOAT QUAY                                       50
## HENDERSON HILL                                 190
## REDHILL                                       1930
## ALEXANDRA HILL                                   0
## BUKIT HO SWEE                                  540
## CLARKE QUAY                                     10
## PASIR PANJANG 1                               2970
## QUEENSWAY                                      290
## ALEXANDRA NORTH                               2120
##                 Landed property dwellers  Pop_Density
## PEARL'S HILL                           0 0.0118431719
## BOAT QUAY                              0 0.0004353031
## HENDERSON HILL                         0 0.0224711973
## REDHILL                                0 0.0276953656
## ALEXANDRA HILL                         0 0.0133737225
## BUKIT HO SWEE                          0 0.0267883659
## CLARKE QUAY                            0 0.0002067649
## PASIR PANJANG 1                     1430 0.0040837310
## QUEENSWAY                              0 0.0004591192
## ALEXANDRA NORTH                        0 0.0072180927

5.2 Data Standardisation

As the data range is very big, we perform standardization.

subzone_var.std <- normalize(subzone_var)
summary(subzone_var.std)
##       AGED         HDB1-2RM dwellers HDB5RM-Ec dweller
##  Min.   :0.00000   Min.   :0.0000    Min.   :0.00000  
##  1st Qu.:0.02505   1st Qu.:0.0000    1st Qu.:0.00000  
##  Median :0.10101   Median :0.0000    Median :0.03857  
##  Mean   :0.13217   Mean   :0.1592    Mean   :0.09490  
##  3rd Qu.:0.18810   3rd Qu.:0.2745    3rd Qu.:0.11864  
##  Max.   :1.00000   Max.   :1.0000    Max.   :1.00000  
##  Condominium and apartment dwellers Landed property dwellers
##  Min.   :0.000000                   Min.   :0.00000         
##  1st Qu.:0.003131                   1st Qu.:0.00000         
##  Median :0.076625                   Median :0.00000         
##  Mean   :0.150378                   Mean   :0.05676         
##  3rd Qu.:0.240161                   3rd Qu.:0.04716         
##  Max.   :1.000000                   Max.   :1.00000         
##   Pop_Density    
##  Min.   :0.0000  
##  1st Qu.:0.0881  
##  Median :0.2603  
##  Mean   :0.3217  
##  3rd Qu.:0.5299  
##  Max.   :1.0000
popd <- ggplot(data=subzone_var.std, aes(x=`Pop_Density`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

hdb1d <- ggplot(data=subzone_var.std, aes(x=`HDB1-2RM dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

AGEDd <- ggplot(data=subzone_var.std, aes(x=`AGED`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

hdb5d <- ggplot(data=subzone_var.std, aes(x=`HDB5RM-Ec dweller`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

condod <- ggplot(data=subzone_var.std, aes(x=`Condominium and apartment dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

landedd <- ggplot(data=subzone_var.std, aes(x=`Landed property dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")


ggarrange(popd,hdb1d,AGEDd,hdb5d,condod,landedd,
          ncol = 3,
          nrow = 2)

As seen from above, even after standardizing the data, it is right skewed. Hence, we need to perform Transformation.

subzone_var.log <- log(subzone_var.std)
d <- ggplot(data=subzone_var.log, aes(x=`Pop_Density`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

e <- ggplot(data=subzone_var.log, aes(x=`HDB1-2RM dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

f <- ggplot(data=subzone_var.log, aes(x=`AGED`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

g <- ggplot(data=subzone_var.log, aes(x=`HDB5RM-Ec dweller`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

h <- ggplot(data=subzone_var.log, aes(x=`Condominium and apartment dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

i <- ggplot(data=subzone_var.log, aes(x=`Landed property dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")


ggarrange(d,e,f,g,h,i,
          ncol = 3,
          nrow = 2)

After performing transformation, there may be NA/Infinite values due to 0 values being transformed. Hence, we need to check for any and replace these data with 0.

subzone_var.log[rowSums(is.na(subzone_var.log))!=0,]
## [1] AGED                               HDB1-2RM dwellers                 
## [3] HDB5RM-Ec dweller                  Condominium and apartment dwellers
## [5] Landed property dwellers           Pop_Density                       
## <0 rows> (or 0-length row.names)
indx <- apply(subzone_var.log, 2, function(x) any(is.na(x) | is.infinite(x)))
indx
##                               AGED                  HDB1-2RM dwellers 
##                               TRUE                               TRUE 
##                  HDB5RM-Ec dweller Condominium and apartment dwellers 
##                               TRUE                               TRUE 
##           Landed property dwellers                        Pop_Density 
##                               TRUE                               TRUE
subzone_var.log[is.na(subzone_var.log)] = 0
subzone_var.replaced <- subzone_var.log %>% mutate_if(is.numeric, function(x) ifelse(is.infinite(x), 0, x))

5.3 Computing proximity matrix

In R, many packages provide functions to calculate distance matrix. We will compute the proximity matrix by using dist() of R.

dist() supports six distance proximity calculations, they are: euclidean, maximum, manhattan, canberra, binary and minkowski. The default is euclidean proximity matrix.

The code chunk below is used to compute the proximity matrix using euclidean method.

proxmat <- dist(subzone_var.replaced, method = 'euclidean')

5.4 Selecting the optimal clustering algorithm

One of the challenge in performing hierarchical clustering is to identify stronger clustering structures. The issue can be solved by using use agnes() function of cluster package. It functions like hclus(), however, with the agnes() function you can also get the agglomerative coefficient, which measures the amount of clustering structure found (values closer to 1 suggest strong clustering structure).

The code chunk below will be used to compute the agglomerative coefficients of all hierarchical clustering algorithms.

m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

ac <- function(x) {
  agnes(subzone_var.replaced, method = x)$ac
}

map_dbl(m, ac)
##   average    single  complete      ward 
## 0.8515852 0.7304796 0.9113168 0.9699733

With reference to the output above, we can see that Ward’s method provides the strongest clustering structure among the four methods assessed. Hence, in the subsequent analysis, only Ward’s method will be used.

5.5 Computing hierarchical clustering

The code chunk below performs hierarchical cluster analysis using ward.D method. The hierarchical clustering output is stored in an object of class hclust which describes the tree produced by the clustering process.

hclust_ward <- hclust(proxmat, method = 'ward.D')

We can then plot the tree by using plot() of R Graphics as shown in the code chunk below.

plot(hclust_ward, cex = 0.6)

There are various techniques to use to identify the optimal number of clusters such as Silhouette Coefficient, or from the dendrogram, a horizontal line that can traverse the maximum distance vertically without intersecting a cluster is drawn and the number of vertical lines in the dendrogram cut by this line represents the optimal number of clusters. In our case, we will use that method and the elbow method. The elbow method looks at the percentage of variance explained as a function of the number of clusters: One should choose a number of clusters so that adding another cluster doesn’t give much better modeling of the data.

Next, we will use the elbow method to determine how many clusters we should use.

fviz_nbclust(subzone_var.replaced, FUN = hcut, method = "wss")

As seen above, 2 has an elbow. However,we decided to try with more clusters as there were more clusters found while doing analysis with 2 clusters. Hence, we tried 3-6 clusters as well. Of those, 5 clusters was the most suitable.

5.5.1 Interpreting the dendrograms

plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k =5, border = 2:5)

5.6 Visually-driven hierarchical clustering analysis

In this section, we will learn how to perform visually-driven hiearchical clustering analysis by using heatmaply package.

With heatmaply, we are able to build both highly interactive cluster heatmap or static cluster heatmap.

5.6.1 Transforming the data frame into a matrix

The code chunk below will be used to transform subzone_var.replaced data frame into a data matrix.

subzone_var_mat <- data.matrix(subzone_var.replaced)

5.6.2 Plotting interactive cluster heatmap using heatmaply()

In the code chunk below, the heatmaply() is used to build an interactive cluster heatmap.

heatmaply(subzone_var_mat,
          Colv=NA,
          dist_method = "euclidean",
          hclust_method = "ward.D",
          seriate = "OLO",
          colors = Blues,
          k_row = 5,
          margins = c(NA,200,60,NA),
          fontsize_row = 4,
          fontsize_col = 5,
          main="Geographic Segmentation of Subzones by Socio economic Indicators",
          xlab = "Socio economic Indicators",
          ylab = "Subzones of Singapore"
          )

5.7 Mapping the clusters formed

With closed examination of the dendragram above, we have decided to retain five clusters.

cutree() of R Base will be used in the code chunk below to derive a 3-cluster model.

groups <- as.factor(cutree(hclust_ward, k=5))

The output is called groups. It is a list object.

In order to visualise the clusters, the groups object need to be appended onto socio simple feature object.

The code chunk below form the join in three steps: 1. the groups list object will be converted into a matrix; 2. cbnd() is used to append groups matrix onto socio to produce an output simple feature object called sg_cluster; and 3. rename of dplyr package is used to rename as.matrix.groups field as CLUSTER.

sg_cluster <- cbind(socio, as.matrix(groups))  %>%
  rename(`CLUSTER`=`as.matrix.groups.`)

Next, qtm() of tmap package is used to plot the choropleth map showing the cluster formed.

qtm(sg_cluster, "CLUSTER")

The choropleth map above reveals the clusters are very fragmented. The is one of the major limitation when non-spatial clustering algorithm such as hierarchical cluster analysis method is used.

6 Spatially Constrained Clustering - SKATER approach

In this section, you swill learn how to derive spatially constrained cluster by using SKATER method.

6.1 Converting into SpatialPolygonsDataFrame

First, we need to convert socio into SpatialPolygonDataFrame. This is because SKATER function only support sp objects such as SpatialPolygonDataFrame.

The code chunk below uses as_Spatial() of sf package to convert socio into a SpatialPolygonDataFrame called sg_sp.

sg_sp <- as_Spatial(socio)

6.2 Computing Neighbour List

6.3 Modelling Queens method

Next, poly2nd() of spdep package will be used to compute the neighbours list from polygon list.

sg.nb <- poly2nb(sg_sp)
summary(sg.nb)
## Neighbour list object:
## Number of regions: 234 
## Number of nonzero links: 1198 
## Percentage nonzero weights: 2.187888 
## Average number of links: 5.119658 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 
##  4 10 25 36 69 41 33 11  5 
## 4 least connected regions:
## 13 204 222 233 with 1 link
## 5 most connected regions:
## 59 66 71 121 131 with 9 links

We can plot the neighbours list on sg_sp by using the code chunk below. Since we now can plot the community area boundaries as well, we plot this graph on top of the map. The first plot command gives the boundaries. This is followed by the plot of the neighbor list object, with coordinates applied to the original SpatialPolygonDataFrame to extract the centroids of the polygons. These are used as the nodes for the graph representation. We also set the color to blue and specify add=TRUE to plot the network on top of the boundaries.

lcosts <- nbcosts(sg.nb, subzone_var)
sg.w <- nb2listw(sg.nb, lcosts, style="B")
summary(sg.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 234 
## Number of nonzero links: 1198 
## Percentage nonzero weights: 2.187888 
## Average number of links: 5.119658 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 
##  4 10 25 36 69 41 33 11  5 
## 4 least connected regions:
## 13 204 222 233 with 1 link
## 5 most connected regions:
## 59 66 71 121 131 with 9 links
## 
## Weights style: B 
## Weights constants summary:
##     n    nn      S0           S1           S2
## B 234 54756 9043408 262731206400 2.560782e+12

6.3.1 Computing minimum spanning tree

The minimum spanning tree is computed by mean of the mstree() of spdep package as shown in the code chunk below.

sg.mst <- mstree(sg.w)

After computing the MST, we can check its class and dimension by using the code chunk below.

class(sg.mst)
## [1] "mst"    "matrix"
dim(sg.mst)
## [1] 233   3

Note that the dimension is 233 and not 234. This is because the minimum spanning tree consists on n-1 edges (links) in order to traverse all the nodes.

We can display the content of sg.mst by using head() as shown in the code chunk below.

head(sg.mst)
##      [,1] [,2]     [,3]
## [1,]  179  133 3905.791
## [2,]  133  126 3353.595
## [3,]  126  107 2941.003
## [4,]  107  114 2884.614
## [5,]  107  127 3610.028
## [6,]  126  140 4078.848

The plot method for the MST include a way to show the observation numbers of the nodes in addition to the edge. As before, we plot this together with the township boundaries. We can see how the initial neighbour list is simplified to just one edge connecting each of the nodes, while passing through all the nodes.

plot(sg_sp, border=gray(.5))
plot.mst(sg.mst, coordinates(sg_sp), 
     col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)

6.4 Computing spatially constrained clusters using SKATER method

The code chunk below compute the spatially constrained cluster using skater() of spdep package.

clust5 <- skater(sg.mst[,1:2], subzone_var, 4)

The skater() takes three mandatory arguments: - the first two columns of the MST matrix (i.e. not the cost), - the data matrix (to update the costs as units are being grouped), and - the number of cuts. Note: It is set to one less than the number of clusters. So, the value specified is not the number of clusters, but the number of cuts in the graph, one less than the number of custers.

The result of the skater() is an object of class skater. We can examine its contents by using the code chunk below.

str(clust5)
## List of 8
##  $ groups      : num [1:234] 4 4 4 4 4 4 4 4 4 4 ...
##  $ edges.groups:List of 5
##   ..$ :List of 3
##   .. ..$ node: num [1:33] 107 126 195 179 133 124 168 164 141 153 ...
##   .. ..$ edge: num [1:32, 1:3] 124 141 160 153 195 164 136 197 168 124 ...
##   .. ..$ ssw : num 219752
##   ..$ :List of 3
##   .. ..$ node: num [1:4] 206 205 200 198
##   .. ..$ edge: num [1:3, 1:3] 206 200 205 205 198 ...
##   .. ..$ ssw : num 14097
##   ..$ :List of 3
##   .. ..$ node: num [1:86] 194 221 196 220 122 223 211 173 81 159 ...
##   .. ..$ edge: num [1:85, 1:3] 221 220 122 223 211 81 232 122 159 219 ...
##   .. ..$ ssw : num 678039
##   ..$ :List of 3
##   .. ..$ node: num [1:109] 144 89 86 83 43 42 17 40 46 9 ...
##   .. ..$ edge: num [1:108, 1:3] 51 117 72 80 59 95 156 91 36 39 ...
##   .. ..$ ssw : num 336728
##   ..$ :List of 3
##   .. ..$ node: num [1:2] 188 138
##   .. ..$ edge: num [1, 1:3] 188 138 21956
##   .. ..$ ssw : num 21956
##  $ not.prune   : NULL
##  $ candidates  : int [1:5] 1 2 3 4 5
##  $ ssto        : num 1623978
##  $ ssw         : num [1:5] 1623978 1514554 1428910 1329531 1270571
##  $ crit        : num [1:2] 1 Inf
##  $ vec.crit    : num [1:234] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "class")= chr "skater"
ccs5 <- clust5$groups
table(ccs5)
## ccs5
##   1   2   3   4   5 
##  33   4  86 109   2

6.5 Visualising the clusters in choropleth map

The code chunk below is used to plot the newly derived clusters by using SKATER method.

groups_mat <- as.matrix(clust5$groups)
sg_sf_spatialcluster <- cbind(sg_cluster, as.factor(groups_mat)) %>%
  rename(`SP_CLUSTER`=`as.factor.groups_mat.`)

The clusters are then plotted in tmap.

tmap_mode("view")
  tm_shape(sg_sf_spatialcluster)+
  tm_borders(alpha = 0.5) +
  tm_fill(col = "SP_CLUSTER", alpha = 0.9, id = "SUBZONE_N")

We can view the subzones in each cluster easily in the data table below

SP_CLUSTER
JURONG GATEWAY 1
TAMAN JURONG 1
LAKESIDE 1
YUHUA EAST 1
BUKIT BATOK SOUTH 1
JURONG WEST CENTRAL 1
TOH GUAN 1
TEBAN GARDENS 1
BOON LAY PLACE 1
BUKIT BATOK EAST 1
BUKIT BATOK WEST 1
BUKIT BATOK CENTRAL 1
HONG KAH 1
BRICKWORKS 1
GUILIN 1
WENYA 1
YUHUA WEST 1
YUNNAN 1
JELEBU 1
KEAT HONG 1
BANGKIT 1
PENG SIANG 1
TECK WHYE 1
CHOA CHU KANG CENTRAL 1
FAJAR 1
SENJA 1
SAUJANA 1
HONG KAH NORTH 1
GOMBAK 1
CHOA CHU KANG NORTH 1
YEW TEE 1
WESTERN WATER CATCHMENT 1
LIM CHU KANG 1
RIVERVALE 2
PUNGGOL FIELD 2
SENGKANG TOWN CENTRE 2
ANCHORVALE 2
TANJONG RHU 3
MOUNTBATTEN 3
SIGLAP 3
BAYSHORE 3
MARINE PARADE 3
BEDOK SOUTH 3
KEMBANGAN 3
KATONG 3
KAMPONG UBI 3
KAKI BUKIT 3
BEDOK RESERVOIR 3
JOO SENG 3
ALJUNIED 3
FRANKEL 3
GEYLANG EAST 3
UPPER PAYA LEBAR 3
SERANGOON CENTRAL 3
BISHAN EAST 3
TAMPINES WEST 3
MARYMOUNT 3
TAI SENG 3
LORONG CHUAN 3
MACPHERSON 3
BEDOK NORTH 3
HOUGANG EAST 3
KANGKAR 3
SEMBAWANG HILLS 3
HOUGANG WEST 3
XILIN 3
SIMEI 3
PASIR RIS WEST 3
YIO CHU KANG WEST 3
TRAFALGAR 3
CHANGI WEST 3
SELETAR HILLS 3
COMPASSVALE 3
YIO CHU KANG EAST 3
LOYANG WEST 3
TAGORE 3
LORONG AH SOO 3
FLORA DRIVE 3
UPPER THOMSON 3
TOWNSVILLE 3
KOVAN 3
CHONG BOON 3
SHANGRI-LA 3
SERANGOON GARDEN 3
HOUGANG CENTRAL 3
LOYANG EAST 3
TAMPINES NORTH 3
CHENG SAN 3
ANG MO KIO TOWN CENTRE 3
KEBUN BAHRU 3
SERANGOON NORTH 3
PASIR RIS CENTRAL 3
PASIR RIS PARK 3
FERNVALE 3
MATILDA 3
WATERWAY EAST 3
CHANGI POINT 3
YISHUN SOUTH 3
LOWER SELETAR 3
NORTHSHORE 3
MANDAI ESTATE 3
YISHUN CENTRAL 3
TURF CLUB 3
WOODLANDS SOUTH 3
WOODGROVE 3
YISHUN EAST 3
YISHUN WEST 3
WOODLANDS EAST 3
SEMBAWANG CENTRAL 3
SEMBAWANG EAST 3
ADMIRALTY 3
SEMBAWANG NORTH 3
NORTHLAND 3
MIDVIEW 3
WOODLANDS WEST 3
SEMBAWANG SPRINGS 3
SPRINGLEAF 3
PUNGGOL TOWN CENTRE 3
NEE SOON 3
SELETAR 3
KHATIB 3
NORTH COAST 3
SEMBAWANG STRAITS 3
PEARL’S HILL 4
BOAT QUAY 4
HENDERSON HILL 4
REDHILL 4
ALEXANDRA HILL 4
BUKIT HO SWEE 4
CLARKE QUAY 4
PASIR PANJANG 1 4
QUEENSWAY 4
ALEXANDRA NORTH 4
INSTITUTION HILL 4
ROBERTSON QUAY 4
SENTOSA 4
MARITIME SQUARE 4
TELOK BLANGAH WAY 4
CECIL 4
KAMPONG TIONG BAHRU 4
TELOK BLANGAH DRIVE 4
PASIR PANJANG 2 4
CENTRAL SUBZONE 4
DEPOT ROAD 4
PEOPLE’S PARK 4
BUKIT MERAH 4
CHINATOWN 4
CHINA SQUARE 4
TIONG BAHRU 4
TIONG BAHRU STATION 4
OXLEY 4
MEI CHIN 4
NATIONAL UNIVERSITY OF S’PORE 4
ONE TREE HILL 4
SOMERSET 4
BENCOOLEN 4
LEONIE HILL 4
PORT 4
DHOBY GHAUT 4
BUGIS 4
VICTORIA 4
PATERSON 4
TELOK BLANGAH RISE 4
TANJONG PAGAR 4
EVERTON PARK 4
TANGLIN HALT 4
MACKENZIE 4
SUNGEI ROAD 4
ONE NORTH 4
COMMONWEALTH 4
DOVER 4
RIDOUT 4
CAIRNHILL 4
CLEMENTI WEST 4
MONK’S HILL 4
CLEMENTI WOODS 4
ORANGE GROVE 4
KAMPONG BUGIS 4
BOULEVARD 4
LITTLE INDIA 4
FORT CANNING 4
FARRER COURT 4
NASSIM 4
WEST COAST 4
CHATSWORTH 4
KAMPONG GLAM 4
SELEGIE 4
MOUNT EMILY 4
CRAWFORD 4
MARGARET DRIVE 4
TANGLIN 4
GEYLANG BAHRU 4
FABER 4
MALCOLM 4
BENDEMEER 4
BALESTIER 4
CORONATION ROAD 4
HOLLAND DRIVE 4
FARRER PARK 4
NEWTON CIRCUS 4
GHIM MOH 4
LAVENDER 4
GOODWOOD PARK 4
SINGAPORE POLYTECHNIC 4
CLEMENTI CENTRAL 4
KAMPONG JAVA 4
BOON KENG 4
ULU PANDAN 4
HOLLAND ROAD 4
SENNETT 4
POTONG PASIR 4
PEI CHUN 4
BOON TECK 4
WOODLEIGH 4
TOA PAYOH WEST 4
MOUNT PLEASANT 4
HILLCREST 4
LORONG 8 TOA PAYOH 4
BRADDELL 4
TYERSALL 4
MOULMEIN 4
CLEMENTI NORTH 4
LEEDON PARK 4
NATURE RESERVE 4
DUNEARN 4
SUNSET WAY 4
KIM KEAT 4
TOA PAYOH CENTRAL 4
ANAK BUKIT 4
SWISS CLUB 4
HILLVIEW 4
DAIRY FARM 4
TAMPINES EAST 5
PASIR RIS DRIVE 5

6.6 Analyse statistics by socio economic indicator

Before we analyse, we join back the data to the urban function data. We do a count of rows after joining to ensure that the number of rows is 234.

socio_urban <- st_join(sg_sf_spatialcluster,urban,join = st_contains)
nrow(socio_urban)
## [1] 234
head(socio_urban)
## Simple feature collection with 6 features and 18 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 24468.89 ymin: 29357.01 xmax: 29947.32 ymax: 30567.91
## epsg (SRID):    3414
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
##        SUBZONE_N YOUNG ECONOMY.ACTIVE AGED HDB1.2RM.dwellers
## 1   PEARL'S HILL  1100           3420 2110              4520
## 2      BOAT QUAY     0             50   20                 0
## 3 HENDERSON HILL  2620           7500 3260              3930
## 4        REDHILL  2840           6260 1630              1970
## 5 ALEXANDRA HILL  2910           7560 3310              3910
## 6  BUKIT HO SWEE  2850           8340 3590              3700
##   HDB3.4RM.dwellers HDB5RM.Ec.dweller Condominium.and.apartment.dwellers
## 1              1400                 0                                420
## 2                 0                 0                                 50
## 3              7990              1270                                190
## 4              3520              3240                               1930
## 5              6320              3120                                  0
## 6              8610              1900                                540
##   Landed.property.dwellers  Pop_Density CLUSTER SP_CLUSTER Financial
## 1                        0 0.0118431719       1          4        24
## 2                        0 0.0004353031       2          4         2
## 3                        0 0.0224711973       3          4         4
## 4                        0 0.0276953656       3          4        12
## 5                        0 0.0133737225       3          4        14
## 6                        0 0.0267883659       3          4         6
##   GovtEmbassy Private Shopping Business Industry
## 1           1       6        3        6        0
## 2           1       1        1       39        0
## 3           0       5        0        0        0
## 4           0       6        1        1        0
## 5           5       4        1       39        1
## 6           3      11        1        6        0
##                         geometry
## 1 MULTIPOLYGON (((29092.28 30...
## 2 MULTIPOLYGON (((29932.33 29...
## 3 MULTIPOLYGON (((27131.28 30...
## 4 MULTIPOLYGON (((26451.03 30...
## 5 MULTIPOLYGON (((25899.7 297...
## 6 MULTIPOLYGON (((27746.95 30...

6.6.1 Pop_Density group

ggplot(data=socio_urban, aes(x=`Pop_Density`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

Cluster 2 subzones have a relatively high population density of between 0.03 to 0.04. Cluster 1 ranges widely from 0.0 to 0.05. cluster 3 ranges from 0.0 to 0.04. Cluster 4 forms the biggest cluster which most of them have a lower density ranging from 0 to 0.03. Cluster 3 has a wide range from 0 to 0.03. Cluster 5 has a moderate to slightly higher population density of between 0.03 to 0.035.

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="Pop_Density",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "quantile",size="Pop_Density")

By looking at the map, we can see that cluster 1 generally have high population density with the exception of water catchment and Lim Chu Kang. All subzones in cluster 4 have a high population density as well. In cluster 4, there are a few that have high population density, with those around one another. One of the highly populated clusters in cluster 4 includes Tiong Bahru Station, Tiong Bahru, Telok Blangah Way, Redhill, Henderson Hill, Kampong Tiong Bahru, Telok Blangah Rise and Bukit Ho Swee. Another is Boon Teck, Toa Payoh Central, Balestier, Braddell, Lorong 8 Toa Payoh, Pei Chun, Kim Keat,Potong Pasir,Beedemeer and Balestier. Cluster 3 has a wide range of population density as seen in the barchart earlier and in the map as well. Areas neaer cluster 2 and also up north in Singapore have a high population density. Cluster 5 have a high population density.

6.6.2 YOUNG

ggplot(data=socio_urban, aes(x=`YOUNG`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="YOUNG",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="YOUNG")

As we can see from the map, Cluster 1 has a moderate number of YOUNG across the subzones. Cluster 2 subzones have a high number of YOUNG group. Cluster 3 has a moderate number of YOUNG across the subzones. Cluster 5 have a high number of YOUNG group. Cluster 4 generally has low number of YOUNG group.

6.6.3 AGED

ggplot(data=socio_urban, aes(x=`AGED`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="AGED",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="AGED")

Cluster 1 less to moderate number of AGED people. Cluster 2 has a moderate, lesser proportion as compared to the YOUNG. Cluser 3 varies with a low to moderate number of AGED people. Cluster 3 has a moderate to high AGED Group and cluster 4 has a low number of aged in general. Cluster 5 have moderate to high number of AGED people.

6.6.4 Economy Active

ggplot(data=socio_urban, aes(x=`ECONOMY.ACTIVE`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="ECONOMY.ACTIVE",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="ECONOMY.ACTIVE",size.lim=c(1))

There are more ECONOMY ACTIVE people as compared to the other 2 groups. Cluster 1 has a moderate to high number of economy active people. Cluster 2 has a higher number of ECONOMY ACTIVE people. Cluster 3 varies, with mostly low to moderate. Cluster 4 has the lowest number of economy active people as compared to the other clutsers. This is due to the low population density in cluster 4.Cluster 5 have moderate to high number of economy active people.

6.6.5 HDB1.2RM.dwellers

ggplot(data=socio_urban, aes(x=`HDB1.2RM.dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="HDB1.2RM.dwellers",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="HDB1.2RM.dwellers",size.lim=c(1))

As compared to the other housing types, HDB1-2 rooms in Singapore. Cluster 1 and Cluster 3 have a wide range, cluster 2 has a low umber of HDB1-2 rooms, cluster 4 are on extreme ends, with mostly having a low number with the exception of one group - Pearl’s Hill, Aleandra Hill, Boat Quay and Redhill. CLuster 4 has a wide range of opposite ends as well.

6.6.6 HDB3.4RM.dwellers

ggplot(data=socio_urban, aes(x=`HDB3.4RM.dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="HDB3.4RM.dwellers",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="HDB3.4RM.dwellers",size.lim=c(1))

CLuster 1 and 2 have a moderate to high number of HDB3-4RM . Cluster 3 varies and cluster 4 has a low number of HDB3-4RM. Cluster 5 have a moderate to high number of HDB3-4 RM.

6.6.7 HDB5RM.Ec.dweller

ggplot(data=socio_urban, aes(x=`HDB5RM.Ec.dweller`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="HDB5RM.Ec.dweller",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="HDB5RM.Ec.dweller",size.lim=c(1))

Cluster 1 have moderate to high number of HDB5 room/EC. Cluster 2 have a high number of HDB5 room/EC. Cluster 3 generally has lower with the exception of Woodlands East. Cluster 4 have the lowest number of HDB5room/EC overall. Cluster 5 have a high number of HDB5RM/EC dwellers.

6.6.8 Condominium.and.apartment.dwellers

ggplot(data=socio_urban, aes(x=`Condominium.and.apartment.dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="Condominium.and.apartment.dwellers",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Condominium.and.apartment.dwellers",size.lim=c(1))

Cluster 1 and 2 have low to moderate number of condominium/apartments, cluster 3 have a generally high number of condominium/apartments in the East. Cluster 4 have a low to moderate number of condominium/apartments. Cluster 5 has a relatively lower number of condominium/apartments.

6.6.9 Landed.property.dwellers

ggplot(data=socio_urban, aes(x=`Landed.property.dwellers`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="Landed.property.dwellers",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Landed.property.dwellers",size.lim=c(1))
## Warning: The shape socio_urban is invalid. See sf::st_is_valid
## Legend for symbol sizes not available in view mode.

Cluster 1 and 2 have relatively low numbers in Landed property, cluster 3 subzones have the highest number as compare to the others. cluster 4 in general have a low to moderae number of landed property. Cluster 5 has no landed property dwellers.

6.8 Analyse by cluster

Filter by cluster

cluster1 <- socio_urban %>%
  filter(SP_CLUSTER==1)

cluster2 <- socio_urban %>%
  filter(SP_CLUSTER==2)

cluster3 <- socio_urban %>%
  filter(SP_CLUSTER==3)

cluster4 <- socio_urban %>%
  filter(SP_CLUSTER==4)

cluster5 <- socio_urban %>%
  filter(SP_CLUSTER==5)
summary(cluster1)
##   SUBZONE_N             YOUNG       ECONOMY.ACTIVE       AGED     
##  Length:33          Min.   :   20   Min.   :   40   Min.   :  10  
##  Class :character   1st Qu.: 3640   1st Qu.: 8960   1st Qu.:1790  
##  Mode  :character   Median : 6100   Median :13680   Median :3030  
##                     Mean   : 6726   Mean   :14576   Mean   :2947  
##                     3rd Qu.: 8610   3rd Qu.:19120   3rd Qu.:3710  
##                     Max.   :19400   Max.   :41080   Max.   :8740  
##  HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
##  Min.   :   0.0    Min.   :    0     Min.   :    0    
##  1st Qu.:   0.0    1st Qu.: 6870     1st Qu.: 3990    
##  Median :  30.0    Median :11350     Median : 6300    
##  Mean   : 680.6    Mean   :12550     Mean   : 8601    
##  3rd Qu.:1330.0    3rd Qu.:16220     3rd Qu.:11720    
##  Max.   :3700.0    Max.   :35950     Max.   :38700    
##  Condominium.and.apartment.dwellers Landed.property.dwellers
##  Min.   :   0                       Min.   :   0            
##  1st Qu.:   0                       1st Qu.:   0            
##  Median :1200                       Median :   0            
##  Mean   :1987                       Mean   : 337            
##  3rd Qu.:3100                       3rd Qu.:   0            
##  Max.   :8670                       Max.   :3590            
##   Pop_Density       CLUSTER SP_CLUSTER   Financial      GovtEmbassy    
##  Min.   :0.000004   1: 0    1:33       Min.   : 0.00   Min.   :0.0000  
##  1st Qu.:0.012054   2: 3    2: 0       1st Qu.: 5.00   1st Qu.:0.0000  
##  Median :0.028496   3:22    3: 0       Median : 6.00   Median :0.0000  
##  Mean   :0.023712   4: 1    4: 0       Mean   :10.48   Mean   :0.6364  
##  3rd Qu.:0.032726   5: 7    5: 0       3rd Qu.:14.00   3rd Qu.:1.0000  
##  Max.   :0.046058                      Max.   :55.00   Max.   :6.0000  
##     Private          Shopping    Business         Industry     
##  Min.   : 0.000   Min.   :0   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.: 2.000   1st Qu.:0   1st Qu.: 0.000   1st Qu.:0.0000  
##  Median : 3.000   Median :1   Median : 0.000   Median :0.0000  
##  Mean   : 3.879   Mean   :1   Mean   : 5.182   Mean   :0.0303  
##  3rd Qu.: 5.000   3rd Qu.:2   3rd Qu.: 3.000   3rd Qu.:0.0000  
##  Max.   :14.000   Max.   :5   Max.   :45.000   Max.   :1.0000  
##           geometry 
##  MULTIPOLYGON :33  
##  epsg:3414    : 0  
##  +proj=tmer...: 0  
##                    
##                    
## 
summary(cluster2)
##   SUBZONE_N             YOUNG       ECONOMY.ACTIVE       AGED     
##  Length:4           Min.   :15250   Min.   :27740   Min.   :3620  
##  Class :character   1st Qu.:15768   1st Qu.:28235   1st Qu.:4265  
##  Mode  :character   Median :16870   Median :32185   Median :5180  
##                     Mean   :16838   Mean   :32383   Mean   :5102  
##                     3rd Qu.:17940   3rd Qu.:36333   3rd Qu.:6018  
##                     Max.   :18360   Max.   :37420   Max.   :6430  
##  HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
##  Min.   : 450      Min.   :13810     Min.   :22650    
##  1st Qu.: 855      1st Qu.:16413     1st Qu.:26820    
##  Median :1330      Median :19635     Median :28295    
##  Mean   :1198      Mean   :19468     Mean   :27865    
##  3rd Qu.:1672      3rd Qu.:22690     3rd Qu.:29340    
##  Max.   :1680      Max.   :24790     Max.   :32220    
##  Condominium.and.apartment.dwellers Landed.property.dwellers
##  Min.   :3710                       Min.   :  0             
##  1st Qu.:4678                       1st Qu.:  0             
##  Median :5205                       Median :125             
##  Mean   :5550                       Mean   :125             
##  3rd Qu.:6078                       3rd Qu.:250             
##  Max.   :8080                       Max.   :250             
##   Pop_Density      CLUSTER SP_CLUSTER   Financial     GovtEmbassy  
##  Min.   :0.03109   1:0     1:0        Min.   : 1.0   Min.   :0.00  
##  1st Qu.:0.03433   2:0     2:4        1st Qu.: 8.5   1st Qu.:0.00  
##  Median :0.03689   3:2     3:0        Median :11.5   Median :0.00  
##  Mean   :0.03681   4:0     4:0        Mean   :12.0   Mean   :0.75  
##  3rd Qu.:0.03937   5:2     5:0        3rd Qu.:15.0   3rd Qu.:0.75  
##  Max.   :0.04236                      Max.   :24.0   Max.   :3.00  
##     Private        Shopping       Business      Industry          geometry
##  Min.   : 9.0   Min.   :0.00   Min.   :0.0   Min.   :0   MULTIPOLYGON :4  
##  1st Qu.: 9.0   1st Qu.:0.75   1st Qu.:0.0   1st Qu.:0   epsg:3414    :0  
##  Median : 9.5   Median :1.50   Median :0.5   Median :0   +proj=tmer...:0  
##  Mean   :10.0   Mean   :1.50   Mean   :1.5   Mean   :0                    
##  3rd Qu.:10.5   3rd Qu.:2.25   3rd Qu.:2.0   3rd Qu.:0                    
##  Max.   :12.0   Max.   :3.00   Max.   :5.0   Max.   :0
summary(cluster3)
##   SUBZONE_N             YOUNG       ECONOMY.ACTIVE       AGED        
##  Length:86          Min.   :    0   Min.   :   10   Min.   :   20.0  
##  Class :character   1st Qu.: 2098   1st Qu.: 4682   1st Qu.:  987.5  
##  Mode  :character   Median : 5485   Median :13080   Median : 2960.0  
##                     Mean   : 6317   Mean   :14102   Mean   : 3357.8  
##                     3rd Qu.: 8695   3rd Qu.:20418   3rd Qu.: 5340.0  
##                     Max.   :31690   Max.   :59340   Max.   :16050.0  
##  HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
##  Min.   :   0.0    Min.   :    0     Min.   :    0    
##  1st Qu.:   0.0    1st Qu.:    0     1st Qu.:    0    
##  Median : 190.0    Median :10355     Median : 3795    
##  Mean   : 930.3    Mean   :11444     Mean   : 5662    
##  3rd Qu.:1437.5    3rd Qu.:16185     3rd Qu.: 9208    
##  Max.   :4700.0    Max.   :56850     Max.   :41850    
##  Condominium.and.apartment.dwellers Landed.property.dwellers
##  Min.   :    0                      Min.   :    0           
##  1st Qu.:    0                      1st Qu.:    0           
##  Median : 2695                      Median :  405           
##  Mean   : 3547                      Mean   : 1992           
##  3rd Qu.: 6108                      3rd Qu.: 2218           
##  Max.   :14960                      Max.   :18820           
##   Pop_Density        CLUSTER SP_CLUSTER   Financial      GovtEmbassy    
##  Min.   :6.639e-05   1:10    1: 0       Min.   : 0.00   Min.   :0.0000  
##  1st Qu.:5.996e-03   2: 0    2: 0       1st Qu.: 2.00   1st Qu.:0.0000  
##  Median :1.486e-02   3:35    3:86       Median : 8.00   Median :0.0000  
##  Mean   :1.544e-02   4:12    4: 0       Mean   :11.76   Mean   :0.6395  
##  3rd Qu.:2.313e-02   5:29    5: 0       3rd Qu.:15.00   3rd Qu.:1.0000  
##  Max.   :3.944e-02                      Max.   :68.00   Max.   :3.0000  
##     Private          Shopping        Business        Industry     
##  Min.   :  0.00   Min.   :0.000   Min.   :  0.0   Min.   :0.0000  
##  1st Qu.:  4.00   1st Qu.:0.000   1st Qu.:  0.0   1st Qu.:0.0000  
##  Median :  9.50   Median :1.000   Median :  1.0   Median :0.0000  
##  Mean   : 19.17   Mean   :1.233   Mean   : 16.2   Mean   :0.4302  
##  3rd Qu.: 18.75   3rd Qu.:2.000   3rd Qu.:  8.5   3rd Qu.:0.0000  
##  Max.   :215.00   Max.   :8.000   Max.   :224.0   Max.   :5.0000  
##           geometry 
##  MULTIPOLYGON :86  
##  epsg:3414    : 0  
##  +proj=tmer...: 0  
##                    
##                    
## 
summary(cluster4)
##   SUBZONE_N             YOUNG      ECONOMY.ACTIVE       AGED     
##  Length:109         Min.   :   0   Min.   :   40   Min.   :   0  
##  Class :character   1st Qu.: 250   1st Qu.:  730   1st Qu.: 170  
##  Mode  :character   Median :1330   Median : 3340   Median : 810  
##                     Mean   :1669   Mean   : 4124   Mean   :1390  
##                     3rd Qu.:2420   3rd Qu.: 6260   3rd Qu.:2310  
##                     Max.   :7930   Max.   :21950   Max.   :7210  
##  HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
##  Min.   :   0.0    Min.   :    0     Min.   :   0.0   
##  1st Qu.:   0.0    1st Qu.:    0     1st Qu.:   0.0   
##  Median :   0.0    Median :    0     Median :   0.0   
##  Mean   : 574.9    Mean   : 3206     Mean   : 957.6   
##  3rd Qu.: 540.0    3rd Qu.: 5930     3rd Qu.:1870.0   
##  Max.   :4520.0    Max.   :23090     Max.   :7810.0   
##  Condominium.and.apartment.dwellers Landed.property.dwellers
##  Min.   :    0                      Min.   :   0.0          
##  1st Qu.:  110                      1st Qu.:   0.0          
##  Median :  770                      Median :   0.0          
##  Mean   : 1746                      Mean   : 615.2          
##  3rd Qu.: 2460                      3rd Qu.: 590.0          
##  Max.   :16770                      Max.   :7600.0          
##   Pop_Density        CLUSTER SP_CLUSTER   Financial       GovtEmbassy    
##  Min.   :6.580e-06   1:26    1:  0      Min.   :  0.00   Min.   : 0.000  
##  1st Qu.:2.672e-03   2:17    2:  0      1st Qu.:  3.00   1st Qu.: 0.000  
##  Median :7.248e-03   3:35    3:  0      Median :  7.00   Median : 1.000  
##  Mean   :1.051e-02   4:15    4:109      Mean   : 12.86   Mean   : 1.954  
##  3rd Qu.:1.690e-02   5:16    5:  0      3rd Qu.: 18.00   3rd Qu.: 3.000  
##  Max.   :4.399e-02                      Max.   :132.00   Max.   :17.000  
##     Private         Shopping         Business         Industry     
##  Min.   :  0.0   Min.   : 0.000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:  3.0   1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.:0.0000  
##  Median :  7.0   Median : 1.000   Median : 3.000   Median :0.0000  
##  Mean   : 15.6   Mean   : 2.257   Mean   : 7.046   Mean   :0.1101  
##  3rd Qu.: 19.0   3rd Qu.: 3.000   3rd Qu.: 8.000   3rd Qu.:0.0000  
##  Max.   :123.0   Max.   :27.000   Max.   :50.000   Max.   :3.0000  
##           geometry  
##  MULTIPOLYGON :109  
##  epsg:3414    :  0  
##  +proj=tmer...:  0  
##                     
##                     
## 
summary(cluster5)
##   SUBZONE_N             YOUNG       ECONOMY.ACTIVE       AGED      
##  Length:2           Min.   :15720   Min.   :33080   Min.   : 6480  
##  Class :character   1st Qu.:20355   1st Qu.:44755   1st Qu.: 9575  
##  Mode  :character   Median :24990   Median :56430   Median :12670  
##                     Mean   :24990   Mean   :56430   Mean   :12670  
##                     3rd Qu.:29625   3rd Qu.:68105   3rd Qu.:15765  
##                     Max.   :34260   Max.   :79780   Max.   :18860  
##  HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
##  Min.   : 830      Min.   :22180     Min.   :30500    
##  1st Qu.:1698      1st Qu.:35385     1st Qu.:34865    
##  Median :2565      Median :48590     Median :39230    
##  Mean   :2565      Mean   :48590     Mean   :39230    
##  3rd Qu.:3432      3rd Qu.:61795     3rd Qu.:43595    
##  Max.   :4300      Max.   :75000     Max.   :47960    
##  Condominium.and.apartment.dwellers Landed.property.dwellers
##  Min.   :1770                       Min.   :0               
##  1st Qu.:2632                       1st Qu.:0               
##  Median :3495                       Median :0               
##  Mean   :3495                       Mean   :0               
##  3rd Qu.:4358                       3rd Qu.:0               
##  Max.   :5220                       Max.   :0               
##   Pop_Density      CLUSTER SP_CLUSTER   Financial   GovtEmbassy  
##  Min.   :0.03062   1:0     1:0        Min.   :11   Min.   :1.00  
##  1st Qu.:0.03140   2:0     2:0        1st Qu.:28   1st Qu.:1.75  
##  Median :0.03217   3:2     3:0        Median :45   Median :2.50  
##  Mean   :0.03217   4:0     4:0        Mean   :45   Mean   :2.50  
##  3rd Qu.:0.03294   5:0     5:2        3rd Qu.:62   3rd Qu.:3.25  
##  Max.   :0.03371                      Max.   :79   Max.   :4.00  
##     Private      Shopping        Business        Industry
##  Min.   : 3   Min.   : 1.00   Min.   : 0.00   Min.   :0  
##  1st Qu.: 9   1st Qu.: 3.25   1st Qu.: 3.75   1st Qu.:0  
##  Median :15   Median : 5.50   Median : 7.50   Median :0  
##  Mean   :15   Mean   : 5.50   Mean   : 7.50   Mean   :0  
##  3rd Qu.:21   3rd Qu.: 7.75   3rd Qu.:11.25   3rd Qu.:0  
##  Max.   :27   Max.   :10.00   Max.   :15.00   Max.   :0  
##           geometry
##  MULTIPOLYGON :2  
##  epsg:3414    :0  
##  +proj=tmer...:0  
##                   
##                   
## 

After perfoming analysis on the socio economic indicators, we can conclude the following of the 4 clusters respectively: Cluster 1: Moderate to High population density. Young children, working parents and elderly live together in HDB (inclusive of all types). Cluster 2: High population density. ECONOMY ACTIVE people living with their YOUNG children in HDB5RM.Ec.dweller Cluster 3: Low to Modeerate population density. Families with YOUNG children living in Condominium/Apartment or Landed Cluster 4: Low population density. Young couples (ECONOMY Active people staying by themselves) living in HDB1-2 Rooms or Condominium/Apartment or Landed Cluster 5: High population density. Have many ECONOMY ACTIVE, AGED and YOUNG people living in HDB (inclusive of all types)

6.7 Analyse statistics by urban functions

6.7.1 Financial

FIrst we will plot the number of fiancial functions by cluster

ggplot(data=socio_urban, aes(x=`Financial`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

Next, we will plot the number of financial functions in each subzone on tmap, with their respective clusters being filled. Those with a value of 0 financial functions will not be shown. This goes the same for the charts below for the other urban functions.

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="Financial",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Financial",size.lim=c(1))
## Warning: The shape socio_urban is invalid. See sf::st_is_valid
## Legend for symbol sizes not available in view mode.

Most financial functions can be found in cluster 3, 4 and 5. As we remember, apart from cluster 3, Cluster 1 has many AGED people. They would also need access to financial functions and thus, more financial functions can be built in cluster 1.

6.7.2 GovtEmbassy

govtembassy_plot <- ggplot(data=socio_urban, aes(x=`GovtEmbassy`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="GovtEmbassy",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="GovtEmbassy",size.lim=c(1))

Most of the goverment embassy are located inn cluster 4. This is mostly due to the locations being central.

6.7.3 Private

ggplot(data=socio_urban, aes(x=`Private`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="Private",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Private",size.lim=c(1))

Most privatse spaces are found in cluster 3 and 4. With high population density in cluster 3. Hence with more demand and more opportunities to build, more private spaces (Upmarket residential area) can be built in cluster 3 subzones.

6.7.4 Shopping

ggplot(data=socio_urban, aes(x=`Shopping`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="Shopping",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Shopping",size.lim=c(1))

Most shopping functions are located in the central and a few in cluster 3. However, there is a high population density in cluster 1 and cluster 2. Hence, more shopping functions can be opened in cluster 1 and 2 to create covenience for the people.

6.7.5 Business

ggplot(data=socio_urban, aes(x=`Business`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="Business",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Business",size.lim=c(1))

Many buiness functions can be found in cluster 3. However, cluster 4 is a good area to increase more business functions as the population density is low and is accesible by other subzones easily (it being the central) and relatively low number of businesses found in cluster 4.

6.7.6 Industry

ggplot(data=socio_urban, aes(x=`Industry`)) +
  geom_histogram(bins=20, color="black", fill="light blue")+
  facet_grid(rows = vars(SP_CLUSTER))

tm_shape(socio_urban) +
  tm_fill(col = "SP_CLUSTER",alpha=0.7) +
  tm_borders(lwd=1,lty="solid") +
  tm_bubbles(col="Industry",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Industry",size.lim=c(1))

Most industries can be found in cluster 3. however, the number of industries are little. with varying population densities in that cluster, more industries can be placed in cluster 3, whereby the subzone’s population density is lesser.

tmap_mode('plot')