packages = c('rgdal', 'spdep', 'ClustGeo', 'tmap', 'sf', 'ggpubr', 'cluster', 'heatmaply', 'corrplot', 'psych', 'tidyverse','units')
for (p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
## Loading required package: rgdal
## Loading required package: sp
## rgdal: version: 1.4-8, (SVN revision 845)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
## Linking to sp version: 1.3-2
## Loading required package: spdep
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
## Loading required package: ClustGeo
## Loading required package: tmap
## Loading required package: ggpubr
## Loading required package: ggplot2
## Loading required package: cluster
## Loading required package: heatmaply
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
##
## ======================
## Welcome to heatmaply version 1.1.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
## ======================
## Loading required package: corrplot
## corrplot 0.84 loaded
## Loading required package: psych
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Loading required package: tidyverse
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.3
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: units
## udunits system database from /Library/Frameworks/R.framework/Versions/3.6/Resources/library/units/share/udunits
business_sf <- st_read(dsn = "data/geospatial", layer = "Business")
## Reading layer `Business' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 6550 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6147 ymin: 1.24605 xmax: 104.0044 ymax: 1.4698
## CRS: 4326
financial_sf <- st_read(dsn = "data/geospatial", layer = "Financial")
## Reading layer `Financial' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3320 features and 29 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6256 ymin: 1.24392 xmax: 103.9998 ymax: 1.46247
## CRS: 4326
govt_embassy_sf <- st_read(dsn = "data/geospatial", layer = "Govt_Embassy")
## Reading layer `Govt_Embassy' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 443 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6282 ymin: 1.24911 xmax: 103.9884 ymax: 1.45765
## CRS: 4326
private_residential_sf <- st_read(dsn = "data/geospatial", layer = "Private residential")
## Reading layer `Private residential' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3604 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6295 ymin: 1.23943 xmax: 103.9749 ymax: 1.45379
## CRS: 4326
shoppings_sf <- st_read(dsn = "data/geospatial", layer = "Shopping")
## Reading layer `Shopping' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 511 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.679 ymin: 1.24779 xmax: 103.9644 ymax: 1.4535
## CRS: 4326
subzone <- st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL")
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
costaloutline <- st_read(dsn = "data/geospatial", layer = "CostalOutline")
## Reading layer `CostalOutline' from data source `/Users/daniel/Google Drive (daniel.soh.2017@smu.edu.sg)/SMU | GD/1. Geospatial Analytics and Applications/Take Home Ex/Ex03/data/geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 60 features and 4 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: 2663.926 ymin: 16357.98 xmax: 56047.79 ymax: 50244.03
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
subzone <- st_transform(subzone, crs=4326)
subzone$area <- set_units(st_area(subzone), km^2)
pop_sz <- read_csv ("data/aspatial/planning-area-subzone-age-group-sex-and-type-of-dwelling-june-2011-2019.csv")
## Parsed with column specification:
## cols(
## planning_area = col_character(),
## subzone = col_character(),
## age_group = col_character(),
## sex = col_character(),
## type_of_dwelling = col_character(),
## resident_count = col_double(),
## year = col_double()
## )
# to upper for joining
pop_sz <- pop_sz %>%
mutate(subzone = toupper(subzone)) %>%
filter(year == 2019)
summary(pop_sz)
## planning_area subzone age_group sex
## Length:98192 Length:98192 Length:98192 Length:98192
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## type_of_dwelling resident_count year
## Length:98192 Min. : 0.00 Min. :2019
## Class :character 1st Qu.: 0.00 1st Qu.:2019
## Mode :character Median : 0.00 Median :2019
## Mean : 41.08 Mean :2019
## 3rd Qu.: 20.00 3rd Qu.:2019
## Max. :2440.00 Max. :2019
Total Population
total_pop_sz <- pop_sz %>%
group_by(subzone) %>%
summarise(total_pop = sum(resident_count))
Filter Economy Active Population
econ_active_pop <- pop_sz %>% filter(age_group == "25_to_29" |
age_group == "30_to_34"|
age_group == "35_to_39" |
age_group == "40_to_44" |
age_group == "45_to_49" |
age_group == "50_to_54" |
age_group == "50_to_54" |
age_group == "55_to_60" |
age_group == "60_to_64"
)
econ_active_pop <- econ_active_pop %>% group_by(subzone) %>%
summarise(econ_active_pop = sum(resident_count))
young_pop <- pop_sz %>% filter(age_group == "0_to_4" |
age_group == "5_to_9"|
age_group == "10_to_14" |
age_group == "15_to_19" |
age_group == "20_to_24") %>%
group_by(subzone) %>%
summarise(young_pop = sum(resident_count))
Aged Group
aged_pop <- pop_sz %>% filter(age_group == "65_to_69" |
age_group == "70_to_74"|
age_group == "75_to_79" |
age_group == "80_to_84" |
age_group == "85_to_89" |
age_group == "90_and_over") %>%
group_by(subzone) %>%
summarise(aged_pop = sum(resident_count))
HDB 1-2Rm
hdb1_2_pop <- pop_sz %>% filter(type_of_dwelling == "HDB 1- and 2-Room Flats") %>%
group_by(subzone) %>%
summarise(hdb1_2_pop = sum(resident_count))
HDB 3-4Rm
hdb3_4_pop <- pop_sz %>% filter(type_of_dwelling == "HDB 3-Room Flats" |type_of_dwelling == "HDB 4-Room Flats") %>%
group_by(subzone) %>%
summarise(hdb3_4_pop = sum(resident_count))
HDB 5RM-EC
hdb5_ec_pop <- pop_sz %>% filter(type_of_dwelling == "HDB 5-Room and Executive Flats") %>%
group_by(subzone) %>%
summarise(hdb5_ec_pop = sum(resident_count))
Condo
condo_pop <- pop_sz %>% filter(type_of_dwelling == "Condominiums and Other Apartments") %>%
group_by(subzone) %>%
summarise(condo_pop = sum(resident_count))
Landed
landed_pop <- pop_sz %>% filter(type_of_dwelling == "Landed Properties") %>%
group_by(subzone) %>%
summarise(landed_pop = sum(resident_count))
Population Density
pop_density <- subzone %>%select('SUBZONE_N', 'area')
pop_density <- left_join(pop_density, total_pop_sz, by = c("SUBZONE_N" = "subzone"))
## Warning: Column `SUBZONE_N`/`subzone` joining factor and character vector,
## coercing into character vector
pop_density <- mutate(pop_density, `pop_density` = `total_pop`/`area`)
sz_pop_main <- left_join(pop_density, econ_active_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, young_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, aged_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, hdb1_2_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, hdb3_4_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, hdb5_ec_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, condo_pop, by = c("SUBZONE_N" = "subzone"))
sz_pop_main <- left_join(sz_pop_main, landed_pop, by = c("SUBZONE_N" = "subzone"))
Using measurements directly will cause biases as total population in each subzone is differnt. In general, the townships with relatively higher total number of population will also have higher number of consititutent population (eg young, aged and various population dewellers)
pop_derived <- sz_pop_main %>%
mutate(`econActivePR` = `econ_active_pop`/`total_pop`) %>%
mutate(`youngPR` = `young_pop`/`total_pop`) %>%
mutate(`agedPR` = `aged_pop`/`total_pop`) %>%
mutate(`hdb12PR` = `hdb1_2_pop`/`total_pop`) %>%
mutate(`hdb34PR` = `hdb3_4_pop`/`total_pop`) %>%
mutate(`hdb5ecPR` = `hdb5_ec_pop`/`total_pop`) %>%
mutate(`condoPR` = `condo_pop`/`total_pop`) %>%
mutate(`landedPR` = `landed_pop`/`total_pop`)
tm_shape(subzone)+
tm_polygons() +
tm_shape(business_sf)+
tm_dots(col="orange")+
tm_shape(financial_sf)+
tm_dots(col="yellow")+
tm_shape(govt_embassy_sf)+
tm_dots(col="green")+
tm_shape(private_residential_sf)+
tm_dots(col="blue")+
tm_shape(shoppings_sf)+
tm_dots(col="red")
## Warning: The shape subzone is invalid. See sf::st_is_valid
## Wrangle Geospatial Data and join with main table (pop_derived) Business
summary(business_sf)
## POI_ID SEQ_NUM FAC_TYPE
## Min. :3.644e+07 Min. :1.000 Min. :5000
## 1st Qu.:9.967e+08 1st Qu.:1.000 1st Qu.:5000
## Median :1.097e+09 Median :1.000 Median :5000
## Mean :9.933e+08 Mean :1.021 Mean :5084
## 3rd Qu.:1.108e+09 3rd Qu.:1.000 3rd Qu.:5000
## Max. :1.204e+09 Max. :3.000 Max. :9991
##
## POI_NAME ST_NAME
## CAMBRIDGE INDUSTRIAL TRUST: 8 TAGORE LN : 83
## DHL : 6 JOO KOON CIR : 80
## NATIONAL OILWELL VARCO : 6 GUL CIR : 62
## ST MICROELECTRONICS : 6 KAKI BUKIT PL : 53
## CWT : 5 KAKI BUKIT IND TER: 52
## HALLIBURTON : 5 (Other) :5941
## (Other) :6514 NA's : 279
## geometry
## POINT :6550
## epsg:4326 : 0
## +proj=long...: 0
##
##
##
##
Differentiate business and industrial points using FAC_TYPE col in business_sf. Business: FAC_TYPE: 5000 Industrial: FAC_TYPE: 9001
table(business_sf$FAC_TYPE)
##
## 5000 9991
## 6440 110
business_sel_sf <- business_sf %>% filter(FAC_TYPE == 5000)
industry_sel_sef <- business_sf %>% filter(FAC_TYPE == 9991)
business_sz <- st_join(business_sel_sf, subzone) %>%
select(SUBZONE_N) %>%
arrange(SUBZONE_N) %>%
add_count(SUBZONE_N, name="business") %>%
distinct(SUBZONE_N, .keep_all = TRUE) %>%
select(SUBZONE_N, business)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
Industrial
industrial_sz <- st_join(industry_sel_sef, subzone) %>%
select(SUBZONE_N) %>%
arrange(SUBZONE_N) %>%
add_count(SUBZONE_N, name="industrial") %>%
distinct(SUBZONE_N, .keep_all = TRUE) %>%
select(SUBZONE_N, industrial)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
Govt Including Embassy
summary(govt_embassy_sf)
## POI_ID SEQ_NUM FAC_TYPE
## Min. :3.644e+07 Min. :1.000 Min. :9525
## 1st Qu.:1.010e+09 1st Qu.:1.000 1st Qu.:9525
## Median :1.058e+09 Median :1.000 Median :9525
## Mean :1.006e+09 Mean :1.111 Mean :9651
## 3rd Qu.:1.113e+09 3rd Qu.:1.000 3rd Qu.:9993
## Max. :1.203e+09 Max. :2.000 Max. :9993
##
## POI_NAME ST_NAME geometry
## ANG MO KIO TOWN COUNCIL : 5 MAXWELL RD: 16 POINT :443
## SEMBAWANG-NEE SOON TOWN COUNCIL: 3 THOMSON RD: 12 epsg:4326 : 0
## ALJUNIED HOUGANG TOWN COUNCIL : 2 ORCHARD RD: 11 +proj=long...: 0
## ALJUNIED TOWN COUNCIL : 2 COLLEGE RD: 10
## BISHAN-TOA PAYOH TOWN COUNCIL : 2 SCOTTS RD : 8
## CENTRAL PROVIDENT FUND BOARD : 2 (Other) :358
## (Other) :427 NA's : 28
govtemb_sz <- st_join(govt_embassy_sf, subzone) %>%
select(SUBZONE_N) %>%
arrange(SUBZONE_N) %>%
add_count(SUBZONE_N, name="govtemb") %>%
distinct(SUBZONE_N, .keep_all = TRUE) %>%
select(SUBZONE_N, govtemb)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
Shopping
summary(shoppings_sf)
## POI_ID SEQ_NUM FAC_TYPE
## Min. :3.644e+07 Min. :1.000 Min. :6512
## 1st Qu.:9.656e+08 1st Qu.:1.000 1st Qu.:6512
## Median :1.070e+09 Median :1.000 Median :6512
## Mean :8.934e+08 Mean :1.108 Mean :6512
## 3rd Qu.:1.104e+09 3rd Qu.:1.000 3rd Qu.:6512
## Max. :1.204e+09 Max. :3.000 Max. :6512
##
## POI_NAME ST_NAME
## BUKIT BATOK WEST SHOPPING CENTRE: 2 ORCHARD RD : 37
## CHANGE ALLEY : 2 BUKIT TIMAH RD: 7
## FARMART CENTRE : 2 SCOTTS RD : 7
## FORTUNE CENTRE : 2 BEACH RD : 6
## HARBOUR FRONT CENTRE ENTRANCE : 2 BENCOOLEN ST : 5
## NEW WORLD CENTRE : 2 (Other) :347
## (Other) :499 NA's :102
## geometry
## POINT :511
## epsg:4326 : 0
## +proj=long...: 0
##
##
##
##
shoppings_sz <- st_join(shoppings_sf, subzone) %>%
select(SUBZONE_N) %>%
arrange(SUBZONE_N) %>%
add_count(SUBZONE_N, name="shooping") %>%
distinct(SUBZONE_N, .keep_all = TRUE) %>%
select(SUBZONE_N, shooping)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
Financial
summary(financial_sf)
## LINK_ID POI_ID SEQ_NUM FAC_TYPE
## Min. :1.161e+08 Min. :3.644e+07 Min. :1.000 Min. :3578
## 1st Qu.:8.594e+08 1st Qu.:1.097e+09 1st Qu.:1.000 1st Qu.:3578
## Median :9.140e+08 Median :1.113e+09 Median :1.000 Median :3578
## Mean :9.092e+08 Mean :1.088e+09 Mean :1.008 Mean :4397
## 3rd Qu.:1.046e+09 3rd Qu.:1.132e+09 3rd Qu.:1.000 3rd Qu.:6000
## Max. :1.224e+09 Max. :1.204e+09 Max. :2.000 Max. :6000
##
## POI_NAME POI_LANGCD POI_NMTYPE POI_ST_NUM ST_NUM_FUL ST_NFUL_LC
## OCBC :788 ENG:3320 B:3293 1 : 212 29A : 1 ENG : 5
## UOB :577 J: 27 10 : 76 333A: 1 NA's:3315
## POSB :564 2 : 53 77B : 1
## DBS :282 11 : 50 7A : 1
## CITIBANK:153 304 : 50 8A : 1
## MAYBANK : 51 (Other):2004 NA's:3315
## (Other) :905 NA's : 875
## ST_NAME ST_LANGCD POI_ST_SD ACC_TYPE PH_NUMBER
## ORCHARD RD : 156 ENG :2926 L:1652 NA's:3320 63396666 : 52
## BEACH RD : 44 NA's: 394 N: 24 63272265 : 16
## NORTH BRIDGE RD : 39 R:1644 18002222121: 15
## COLLYER QUAY : 38 18004383333: 13
## NEW UPP CHANGI RD: 35 18001111111: 11
## (Other) :2614 (Other) : 539
## NA's : 394 NA's :2674
## CHAIN_ID NAT_IMPORT PRIVATE IN_VICIN NUM_PARENT
## Min. : 0 N:3320 N:3320 N:3320 Min. :0.0000
## 1st Qu.: 2526 1st Qu.:0.0000
## Median : 6918 Median :0.0000
## Mean : 5121 Mean :0.3807
## 3rd Qu.: 6920 3rd Qu.:1.0000
## Max. :24982 Max. :2.0000
##
## NUM_CHILD PERCFRREF VANCITY_ID
## Min. :0.0000000 Min. : 1.00 Min. :0
## 1st Qu.:0.0000000 1st Qu.:30.00 1st Qu.:0
## Median :0.0000000 Median :50.00 Median :0
## Mean :0.0003012 Mean :46.87 Mean :0
## 3rd Qu.:0.0000000 3rd Qu.:60.00 3rd Qu.:0
## Max. :1.0000000 Max. :99.00 Max. :0
## NA's :1339
## ACT_ADDR
## 1 KIM SENG PROMENADE SINGAPORE 237994: 7
## 3 TEMASEK BOULEVARD SINGAPORE 038983: 7
## 530 LORONG 6 TOA PAYOH SINGAPORE 310530: 7
## 2 JURONG EAST ST 21 SINGAPORE 609601: 6
## 3D RIVER VALLEY ROAD SINGAPORE 179023: 6
## (Other) : 243
## NA's :3044
## ACT_LANGCD ACT_ST_NAM ACT_ST_NUM ACT_ADMIN
## ENG : 276 DUNEARN ROAD : 7 1 : 20 INGAPORE : 1
## NA's:3044 KIM SENG PROMENADE: 7 3 : 11 SINGAPORE: 275
## LORONG 6 TOA PAYOH: 7 2 : 10 NA's :3044
## PAYA LEBAR ROAD : 7 50 : 8
## TEMASEK BOULEVARD : 7 530 : 7
## (Other) : 241 (Other): 220
## NA's :3044 NA's :3044
## ACT_POSTAL geometry
## 038983 : 7 POINT :3320
## 237994 : 7 epsg:4326 : 0
## 310530 : 7 +proj=long...: 0
## 609601 : 7
## 179023 : 6
## (Other): 242
## NA's :3044
financial_sz <- st_join(financial_sf, subzone) %>%
select(SUBZONE_N) %>%
arrange(SUBZONE_N) %>%
add_count(SUBZONE_N, name="financial") %>%
distinct(SUBZONE_N, .keep_all = TRUE) %>%
select(SUBZONE_N, financial)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
Upmarket residential Area
summary(private_residential_sf)
## POI_ID SEQ_NUM FAC_TYPE
## Min. :3.644e+07 Min. :1.000 Min. :9590
## 1st Qu.:9.968e+08 1st Qu.:1.000 1st Qu.:9590
## Median :1.070e+09 Median :1.000 Median :9590
## Mean :1.052e+09 Mean :1.007 Mean :9590
## 3rd Qu.:1.105e+09 3rd Qu.:1.000 3rd Qu.:9590
## Max. :1.204e+09 Max. :2.000 Max. :9590
##
## POI_NAME ST_NAME geometry
## BLISSFUL VIEW : 3 PASIR PANJANG RD : 45 POINT :3604
## CLEMENTI PARK : 3 UPP EAST COAST RD: 29 epsg:4326 : 0
## COMPASSVALE VIEW : 3 LOR K TELOK KURAU: 26 +proj=long...: 0
## KING'S MANSION : 3 BUKIT TIMAH RD : 24
## MIDPOINT PROPERTIES : 3 EAST COAST RD : 23
## NEE SOON CENTRAL ESTATE: 3 (Other) :3412
## (Other) :3586 NA's : 45
private_residential_sz <- st_join(private_residential_sf, subzone) %>%
select(SUBZONE_N) %>%
arrange(SUBZONE_N) %>%
add_count(SUBZONE_N, name="privateres") %>%
distinct(SUBZONE_N, .keep_all = TRUE) %>%
select(SUBZONE_N, privateres)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
pop_derived <- left_join(pop_derived, business_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, financial_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, govtemb_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, industrial_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, private_residential_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- left_join(pop_derived, shoppings_sz, by = c("SUBZONE_N" = "SUBZONE_N"))
## Warning: Column `SUBZONE_N` joining character vector and factor, coercing into
## character vector
pop_derived <- pop_derived %>%
mutate(`businessPR` = `business`/`total_pop`) %>%
mutate(`financialPR` = `financial`/`total_pop`) %>%
mutate(`privateresPR` = `privateres`/`total_pop`) %>%
mutate(`govtembPR` = `govtemb`/`total_pop`) %>%
mutate(`industrialPR` = `industrial`/`total_pop`) %>%
mutate(`shoppingsPR` = `shooping`/`total_pop`)
showMap <- function(df, segment,chartTitle) {
tm_shape(df)+
tm_polygons(col=segment,legend.hist = TRUE, palette="seq", title=chartTitle)+
tm_layout(legend.outside = TRUE, legend.hist.width = 10, aes.palette = list(seq = "-PiYG"))
}
population_map <- showMap(pop_derived, "total_pop", "Total Popuation")
econ_active_pop <- showMap(pop_derived, "econ_active_pop", "Economy Active Population")
young_map <- showMap(pop_derived, "young_pop", "Young Population")
aged_pop <- showMap(pop_derived, "aged_pop", "Aged Population")
hdb1_2_pop <- showMap(pop_derived, "hdb1_2_pop", "HDB 1- 2Rm Dwellers")
hdb3_4_pop <- showMap(pop_derived, "hdb3_4_pop", "HDB 3- 4Rm Dwellers")
hdb5_ec_pop <- showMap(pop_derived, "hdb5_ec_pop", "HDB 5Rm EC Dwellers")
condo_pop <- showMap(pop_derived, "condo_pop", "Condominium and Apt Dwellers")
landed_pop <- showMap(pop_derived, "landed_pop", "Landed Property Dwellers")
Plot 1/2 - Population Segment
tmap_arrange(population_map, econ_active_pop, young_map, aged_pop, hdb1_2_pop, hdb3_4_pop, ncol = 2)
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.50, 0.46, 0.46. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid
## Warning: The shape df is invalid. See sf::st_is_valid
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.55, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.60, 0.60, 0.60, 0.60. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.51, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
Plot 2/2 - Population Segment
tmap_arrange(hdb5_ec_pop, condo_pop, landed_pop, ncol = 2)
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.51, 0.51, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.55, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Warning: The shape df is invalid. See sf::st_is_valid
## Some legend labels were too wide. These labels have been resized to 0.55, 0.51, 0.51. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
business_map <- showMap(pop_derived, "business", "No Of Business Functions")
financial_map <- showMap(pop_derived, "financial", "No Of Financial Functions")
govtemb_map <- showMap(pop_derived, "govtemb", "No Of Govt/Emb Functions")
industrial_map <- showMap(pop_derived, "industrial", "No Of industrial Functions")
privateres_map <- showMap(pop_derived, "privateres", "No Of Upmarket Residential Area")
shopping_map <- showMap(pop_derived, "shooping", "No Of Shopping Functions")
Plot 2/2 - Population Urban Functions
tmap_arrange(business_map, financial_map, govtemb_map, industrial_map, privateres_map, shopping_map, ncol = 2)
## Warning: The shape df is invalid. See sf::st_is_valid
## Warning: The shape df is invalid. See sf::st_is_valid
## Warning: The shape df is invalid. See sf::st_is_valid
## Warning: The shape df is invalid. See sf::st_is_valid
## Warning: The shape df is invalid. See sf::st_is_valid
## Warning: The shape df is invalid. See sf::st_is_valid
pop_seg <- pop_derived %>%
st_set_geometry(NULL)
## Urban functions
business_hist <- ggplot(data=pop_seg, aes(x=business)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "Business")
financial_hist <- ggplot(data=pop_seg, aes(x=financial)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "Financial")
govtemb_hist <- ggplot(data=pop_seg, aes(x=govtemb)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "Govt Functions")
industrial_hist <- ggplot(data=pop_seg, aes(x=industrial)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "Industrial")
privateres_hist <- ggplot(data=pop_seg, aes(x=privateres)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "Upmarket Residential Areas")
shopping_hist <- ggplot(data=pop_seg, aes(x=shooping)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "Shopping")
## Urban functions PR
businessPR_hist <- ggplot(data=pop_seg, aes(x=businessPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Business")
financialPR_hist <- ggplot(data=pop_seg, aes(x=financialPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Financial")
govtembPR_hist <- ggplot(data=pop_seg, aes(x=govtembPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Govt Functions")
industrialPR_hist <- ggplot(data=pop_seg, aes(x=industrialPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Industrial")
privateresPR_hist <- ggplot(data=pop_seg, aes(x=privateresPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Upmarket Residential Areas")
shoppingPR_hist <- ggplot(data=pop_seg, aes(x=shoppingsPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Shopping")
## Population PR
econActivePR_hist <- ggplot(data=pop_seg, aes(x=econActivePR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Active Population")
youngPR_hist <- ggplot(data=pop_seg, aes(x=youngPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Young Population")
agedPR_hist <- ggplot(data=pop_seg, aes(x=agedPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Aged Population")
hdb12PR_hist <- ggplot(data=pop_seg, aes(x=hdb12PR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR HDB 1-2 Rm Dwellers")
hdb34PR_hist <- ggplot(data=pop_seg, aes(x=hdb34PR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR HDB 3-4 Rm Dwellers")
hdb5ecPR_hist <- ggplot(data=pop_seg, aes(x=hdb5ecPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR HDB 5Rm/EC Dwellers")
condoPR_hist <- ggplot(data=pop_seg, aes(x=condoPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Condo")
landedPR_hist <- ggplot(data=pop_seg, aes(x=landedPR)) +
geom_histogram(bins=20, color="black", fill="light blue")+
scale_x_continuous(name = "PR Landed")
ggarrange(econActivePR_hist,youngPR_hist,agedPR_hist,hdb12PR_hist,hdb34PR_hist,hdb5ecPR_hist, condoPR_hist,landedPR_hist,
nrow = 2,ncol=4)
## Warning: Removed 89 rows containing non-finite values (stat_bin).
## Warning: Removed 89 rows containing non-finite values (stat_bin).
## Warning: Removed 89 rows containing non-finite values (stat_bin).
## Warning: Removed 89 rows containing non-finite values (stat_bin).
## Warning: Removed 89 rows containing non-finite values (stat_bin).
## Warning: Removed 89 rows containing non-finite values (stat_bin).
## Warning: Removed 89 rows containing non-finite values (stat_bin).
## Warning: Removed 89 rows containing non-finite values (stat_bin).
ggarrange(business_hist,financial_hist,govtemb_hist,industrial_hist,privateres_hist,shopping_hist,
nrow = 2,ncol=3)
## Warning: Removed 107 rows containing non-finite values (stat_bin).
## Warning: Removed 73 rows containing non-finite values (stat_bin).
## Warning: Removed 190 rows containing non-finite values (stat_bin).
## Warning: Removed 274 rows containing non-finite values (stat_bin).
## Warning: Removed 84 rows containing non-finite values (stat_bin).
## Warning: Removed 176 rows containing non-finite values (stat_bin).
## Urban functions PR
ggarrange(businessPR_hist,financialPR_hist,govtembPR_hist,industrialPR_hist,privateresPR_hist,shoppingPR_hist,nrow = 2,ncol=3)
## Warning: Removed 168 rows containing non-finite values (stat_bin).
## Warning: Removed 112 rows containing non-finite values (stat_bin).
## Warning: Removed 213 rows containing non-finite values (stat_bin).
## Warning: Removed 297 rows containing non-finite values (stat_bin).
## Warning: Removed 98 rows containing non-finite values (stat_bin).
## Warning: Removed 189 rows containing non-finite values (stat_bin).
# replace NaN to 0
pop_derived_corr <- pop_derived
pop_derived_corr <- st_set_geometry(pop_derived_corr, NULL)
pop_derived_corr <- pop_derived_corr %>% mutate_if(is.numeric, ~replace_na(., 0))
row.names(pop_derived_corr) <- pop_derived_corr$"SUBZONE_N"
pop_derived_corr <- pop_derived_corr %>% select(13:20,21:26)
head(pop_derived_corr,10)
## econActivePR youngPR agedPR hdb12PR hdb34PR hdb5ecPR
## MARINA SOUTH 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000
## PEARL'S HILL 0.4223228 0.1659125 0.3182504 0.6817496 0.2111614 0.00000000
## BOAT QUAY 0.5714286 0.0000000 0.2857143 0.0000000 0.0000000 0.00000000
## HENDERSON HILL 0.4790732 0.1958146 0.2436472 0.2937220 0.5971599 0.09491779
## REDHILL 0.5135135 0.2646785 0.1519105 0.1835974 0.3280522 0.30195713
## ALEXANDRA HILL 0.4644412 0.2111756 0.2402032 0.2837446 0.4586357 0.22641509
## BUKIT HO SWEE 0.4844384 0.1928281 0.2428958 0.2503383 0.5825440 0.12855210
## CLARKE QUAY 0.6666667 0.0000000 0.1666667 0.0000000 0.0000000 0.00000000
## PASIR PANJANG 1 0.5530474 0.2528217 0.1264108 0.0000000 0.0000000 0.00000000
## QUEENSWAY 0.6206897 0.1034483 0.1724138 0.0000000 0.0000000 0.00000000
## condoPR landedPR business financial govtemb industrial
## MARINA SOUTH 0.00000000 0.0000000 0 3 0 0
## PEARL'S HILL 0.06334842 0.0000000 6 25 1 0
## BOAT QUAY 0.71428571 0.0000000 40 2 2 0
## HENDERSON HILL 0.01420030 0.0000000 0 4 0 0
## REDHILL 0.17986952 0.0000000 2 12 0 0
## ALEXANDRA HILL 0.00000000 0.0000000 39 15 7 1
## BUKIT HO SWEE 0.03653586 0.0000000 6 6 4 0
## CLARKE QUAY 0.16666667 0.0000000 12 19 4 0
## PASIR PANJANG 1 0.67042889 0.3227991 16 4 0 0
## QUEENSWAY 1.00000000 0.0000000 0 2 0 0
## privateres shooping
## MARINA SOUTH 0 0
## PEARL'S HILL 6 3
## BOAT QUAY 1 1
## HENDERSON HILL 5 0
## REDHILL 6 1
## ALEXANDRA HILL 4 1
## BUKIT HO SWEE 11 1
## CLARKE QUAY 6 10
## PASIR PANJANG 1 56 0
## QUEENSWAY 1 1
# pop_derived_corr[is.na(pop_derived_corr)] <- 0
# pop_derived_corr <- st_set_geometry(pop_derived_corr, NULL)
# pop_derived_corr <- pop_derived_corr %>% select_if(is.numeric) %>% select(matches('PR|SUBZONE_N', ignore.case = FALSE))
Plot Corr
pop_derived_corr.cor = cor(pop_derived_corr)
corrplot.mixed(pop_derived_corr.cor,
lower = "ellipse",
tl.col = "black",
upper = "number",
tl.pos = "lt",
diag = "l",
number.cex = 0.5)
Remove young PR
pop_derived_corr <- pop_derived_corr %>% select(!econActivePR)
pop_derived_corr.cor = cor(pop_derived_corr)
corrplot.mixed(pop_derived_corr.cor,
lower = "ellipse",
tl.col = "black",
upper = "number",
tl.pos = "lt",
diag = "l",
number.cex = 0.5)
The code chunk below will be used to extract the clustering variables from the simple feature object into data.frame.
cluster_vars <- pop_derived %>%
mutate_if(is.numeric, ~replace_na(., 0)) %>%
st_set_geometry(NULL)
# Set row names as Subzone name
row.names(cluster_vars) <- cluster_vars$"SUBZONE_N"
# delete subzone col
cluster_vars <- select(cluster_vars, !1)
head(cluster_vars,10)
## area total_pop pop_density econ_active_pop
## MARINA SOUTH 1.6303787 [km^2] 0 0.0000 [1/km^2] 0
## PEARL'S HILL 0.5598162 [km^2] 6630 11843.1720 [1/km^2] 2800
## BOAT QUAY 0.1608075 [km^2] 70 435.3031 [1/km^2] 40
## HENDERSON HILL 0.5954289 [km^2] 13380 22471.1982 [1/km^2] 6410
## REDHILL 0.3874294 [km^2] 10730 27695.3679 [1/km^2] 5510
## ALEXANDRA HILL 1.0303786 [km^2] 13780 13373.7249 [1/km^2] 6400
## BUKIT HO SWEE 0.5517320 [km^2] 14780 26788.3660 [1/km^2] 7160
## CLARKE QUAY 0.2901846 [km^2] 60 206.7649 [1/km^2] 40
## PASIR PANJANG 1 1.0847914 [km^2] 4430 4083.7346 [1/km^2] 2450
## QUEENSWAY 0.6316441 [km^2] 290 459.1193 [1/km^2] 180
## young_pop aged_pop hdb1_2_pop hdb3_4_pop hdb5_ec_pop condo_pop
## MARINA SOUTH 0 0 0 0 0 0
## PEARL'S HILL 1100 2110 4520 1400 0 420
## BOAT QUAY 0 20 0 0 0 50
## HENDERSON HILL 2620 3260 3930 7990 1270 190
## REDHILL 2840 1630 1970 3520 3240 1930
## ALEXANDRA HILL 2910 3310 3910 6320 3120 0
## BUKIT HO SWEE 2850 3590 3700 8610 1900 540
## CLARKE QUAY 0 10 0 0 0 10
## PASIR PANJANG 1 1120 560 0 0 0 2970
## QUEENSWAY 30 50 0 0 0 290
## landed_pop econActivePR youngPR agedPR hdb12PR hdb34PR
## MARINA SOUTH 0 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## PEARL'S HILL 0 0.4223228 0.1659125 0.3182504 0.6817496 0.2111614
## BOAT QUAY 0 0.5714286 0.0000000 0.2857143 0.0000000 0.0000000
## HENDERSON HILL 0 0.4790732 0.1958146 0.2436472 0.2937220 0.5971599
## REDHILL 0 0.5135135 0.2646785 0.1519105 0.1835974 0.3280522
## ALEXANDRA HILL 0 0.4644412 0.2111756 0.2402032 0.2837446 0.4586357
## BUKIT HO SWEE 0 0.4844384 0.1928281 0.2428958 0.2503383 0.5825440
## CLARKE QUAY 0 0.6666667 0.0000000 0.1666667 0.0000000 0.0000000
## PASIR PANJANG 1 1430 0.5530474 0.2528217 0.1264108 0.0000000 0.0000000
## QUEENSWAY 0 0.6206897 0.1034483 0.1724138 0.0000000 0.0000000
## hdb5ecPR condoPR landedPR business financial govtemb
## MARINA SOUTH 0.00000000 0.00000000 0.0000000 0 3 0
## PEARL'S HILL 0.00000000 0.06334842 0.0000000 6 25 1
## BOAT QUAY 0.00000000 0.71428571 0.0000000 40 2 2
## HENDERSON HILL 0.09491779 0.01420030 0.0000000 0 4 0
## REDHILL 0.30195713 0.17986952 0.0000000 2 12 0
## ALEXANDRA HILL 0.22641509 0.00000000 0.0000000 39 15 7
## BUKIT HO SWEE 0.12855210 0.03653586 0.0000000 6 6 4
## CLARKE QUAY 0.00000000 0.16666667 0.0000000 12 19 4
## PASIR PANJANG 1 0.00000000 0.67042889 0.3227991 16 4 0
## QUEENSWAY 0.00000000 1.00000000 0.0000000 0 2 0
## industrial privateres shooping businessPR financialPR
## MARINA SOUTH 0 0 0 0.0000000000 Inf
## PEARL'S HILL 0 6 3 0.0009049774 0.0037707391
## BOAT QUAY 0 1 1 0.5714285714 0.0285714286
## HENDERSON HILL 0 5 0 0.0000000000 0.0002989537
## REDHILL 0 6 1 0.0001863933 0.0011183597
## ALEXANDRA HILL 1 4 1 0.0028301887 0.0010885341
## BUKIT HO SWEE 0 11 1 0.0004059540 0.0004059540
## CLARKE QUAY 0 6 10 0.2000000000 0.3166666667
## PASIR PANJANG 1 0 56 0 0.0036117381 0.0009029345
## QUEENSWAY 0 1 1 0.0000000000 0.0068965517
## privateresPR govtembPR industrialPR shoppingsPR
## MARINA SOUTH 0.0000000000 0.0000000000 0.000000e+00 0.000000e+00
## PEARL'S HILL 0.0009049774 0.0001508296 0.000000e+00 4.524887e-04
## BOAT QUAY 0.0142857143 0.0285714286 0.000000e+00 1.428571e-02
## HENDERSON HILL 0.0003736921 0.0000000000 0.000000e+00 0.000000e+00
## REDHILL 0.0005591799 0.0000000000 0.000000e+00 9.319664e-05
## ALEXANDRA HILL 0.0002902758 0.0005079826 7.256894e-05 7.256894e-05
## BUKIT HO SWEE 0.0007442490 0.0002706360 0.000000e+00 6.765900e-05
## CLARKE QUAY 0.1000000000 0.0666666667 0.000000e+00 1.666667e-01
## PASIR PANJANG 1 0.0126410835 0.0000000000 0.000000e+00 0.000000e+00
## QUEENSWAY 0.0034482759 0.0000000000 0.000000e+00 3.448276e-03
In general, multiple variables will be used in cluster analysis. It is not unusual their values range are different. In order to avoid the cluster analysis result is baised to clustering variables with large values, it is useful to standardise the input variables before performing cluster analysis.
In the code chunk below, normalize() of heatmaply package is used to stadardisation the clustering variables by using Min-Max method. The summary() is then used to display the summary statistics of the standardised clustering variables.
pop_derived.std <- normalize(cluster_vars)
pop_derived.std[is.na(pop_derived.std)] <- 0
pop_derived_df <- as.data.frame(pop_derived.std)
names(pop_derived_df)
## [1] "area" "total_pop" "pop_density" "econ_active_pop"
## [5] "young_pop" "aged_pop" "hdb1_2_pop" "hdb3_4_pop"
## [9] "hdb5_ec_pop" "condo_pop" "landed_pop" "econActivePR"
## [13] "youngPR" "agedPR" "hdb12PR" "hdb34PR"
## [17] "hdb5ecPR" "condoPR" "landedPR" "business"
## [21] "financial" "govtemb" "industrial" "privateres"
## [25] "shooping" "businessPR" "financialPR" "privateresPR"
## [29] "govtembPR" "industrialPR" "shoppingsPR"
business_hist <- ggplot(data=pop_derived_df, aes(x=business)) +
geom_histogram(bins=20, color="black", fill="light blue")
industrial_hist <- ggplot(data=pop_derived_df, aes(x=industrial)) +
geom_histogram(bins=20, color="black", fill="light blue")
financial_hist <- ggplot(data=pop_derived_df, aes(x=financial)) +
geom_histogram(bins=20, color="black", fill="light blue")
govtemb_hist <- ggplot(data=pop_derived_df, aes(x=govtemb)) +
geom_histogram(bins=20, color="black", fill="light blue")
privateres_hist <- ggplot(data=pop_derived_df, aes(x=privateres)) +
geom_histogram(bins=20, color="black", fill="light blue")
shopping_hist <- ggplot(data=pop_derived_df, aes(x=shooping)) +
geom_histogram(bins=20, color="black", fill="light blue")
# PR
youngPR_hist <- ggplot(data=pop_derived_df, aes(x=youngPR)) +
geom_histogram(bins=20, color="black", fill="light blue")
agedPR_hist <- ggplot(data=pop_derived_df, aes(x=agedPR)) +
geom_histogram(bins=20, color="black", fill="light blue")
hdb12PR_hist <- ggplot(data=pop_derived_df, aes(x=hdb12PR)) +
geom_histogram(bins=20, color="black", fill="light blue")
hdb34PR_hist <- ggplot(data=pop_derived_df, aes(x=hdb34PR)) +
geom_histogram(bins=20, color="black", fill="light blue")
hdb5ecPR_hist <- ggplot(data=pop_derived_df, aes(x=hdb5ecPR)) +
geom_histogram(bins=20, color="black", fill="light blue")
condoPR_hist <- ggplot(data=pop_derived_df, aes(x=condoPR)) +
geom_histogram(bins=20, color="black", fill="light blue")
landedPR_hist <- ggplot(data=pop_derived_df, aes(x=landedPR)) +
geom_histogram(bins=20, color="black", fill="light blue")
ggarrange(business_hist,industrial_hist,financial_hist,govtemb_hist,privateres_hist,shopping_hist,
ncol = 3,nrow = 3)
ggarrange(youngPR_hist,agedPR_hist,hdb12PR_hist,hdb34PR_hist,hdb5ecPR_hist,condoPR_hist,landedPR_hist, ncol = 3,nrow = 3)
K-means
wssplot <- function(data, nc=15, seed=999){
wss <- (nrow(data)-1)*sum(apply(data,2,var))
for (i in 2:nc){
set.seed(seed)
wss[i] <- sum(kmeans(data, centers=i)$withinss)}
plot(1:nc, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")}
wssplot(pop_derived_df, nc=10)
There is a distinct drop above 4 clusters. After three clusters, this decrease drops off, suggesting that a 3-cluster solution may be a good fit to the data.
k <- 4
In R, many packages provide functions to calculate distance matrix. We will compute the proximity matrix by using dist() of R.
dist() supports six distance proximity calculations, they are: euclidean, maximum, manhattan, canberra, binary and minkowski. The default is euclidean proximity matrix.
The code chunk below is used to compute the proximity matrix using euclidean method.
proxmat <- dist(pop_derived_df, method = 'euclidean')
hclust_ward <- hclust(proxmat, method = 'ward.D')
plot(hclust_ward, cex = 0.6)
Selecting the optimal clustering algorithm
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
ac <- function(x) {
agnes(pop_derived_df, method = x)$ac
}
map_dbl(m, ac)
## average single complete ward
## 0.8735622 0.7759751 0.9110540 0.9771934
In the dendrogram displayed above, each leaf corresponds to one observation. As we move up the tree, observations that are similar to each other are combined into branches, which are themselves fused at a higher height.
The height of the fusion, provided on the vertical axis, indicates the (dis)similarity between two observations. The higher the height of the fusion, the less similar the observations are. Note that, conclusions about the proximity of two observations can be drawn only based on the height where branches containing those two observations first are fused. We cannot use the proximity of two observations along the horizontal axis as a criteria of their similarity.
It’s also possible to draw the dendrogram with a border around the selected clusters by using rect.hclust() of R stats. The argument border is used to specify the border colors for the rectangles.
plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k = k, border = 2:5)
Visually-driven hierarchical clustering analysis
In this section, we will perform visually-driven hiearchical clustering analysis by using heatmaply package.
pop_derived_df_mat <- data.matrix(pop_derived_df)
heatmaply(pop_derived_df_mat,
Colv=NA,
dist_method = "euclidean",
hclust_method = "ward.D",
seriate = "OLO",
colors = Blues,
k_row = k,
margins = c(NA,200,60,NA),
fontsize_row = 4,
fontsize_col = 5,
main="Geographic Segmentation of Subzones by Urban Functions and Population",
xlab = "Urban Functions and Population",
ylab = "Subzones"
)
Mapping the clusters formed
With closed examination of the dendragram above, we have decided to retain four clusters.
cutree() of R Base will be used in the code chunk below to derive a 4-cluster model.
pop_derived_c <- pop_derived
pop_derived_c[is.na(pop_derived_c)] <- 0
groups <- as.factor(cutree(hclust_ward, k=k))
pop_derived_c_cluster <- cbind(pop_derived_c, as.matrix(groups)) %>%
rename(`CLUSTER`=`as.matrix.groups.`)
Plot
hierarchical_map <- qtm(pop_derived_c_cluster, "CLUSTER")
## Warning: The shape pop_derived_c_cluster is invalid. See sf::st_is_valid
hierarchical_map
pop_derived_sp <- as_Spatial(pop_derived_c)
class(pop_derived_sp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
pop_derived.nb <- poly2nb(pop_derived_sp)
summary(pop_derived.nb)
## Neighbour list object:
## Number of regions: 323
## Number of nonzero links: 1938
## Percentage nonzero weights: 1.857585
## Average number of links: 6
## 5 regions with no links:
## 17 18 19 295 302
## Link number distribution:
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 14 17
## 5 2 6 10 26 77 85 52 34 17 3 3 1 1 1
## 2 least connected regions:
## 16 234 with 1 link
## 1 most connected region:
## 313 with 17 links
We note that there are 5 regions with no links. Remove them
plot(pop_derived_sp, border=grey(.5))
plot(pop_derived.nb, coordinates(pop_derived_sp), col="blue", add=TRUE)
Visualised 5 regions with no links.
`%notin%` <- Negate(`%in%`)
# pop_derived_sp_filtered <- pop_derived_sp %>% filter(data$SUBZONE_N %notin% c("NORTH-EASTERN ISLANDS", "PULAU SELETAR", "SEMAKAU", "SOUTHERN GROUP", "SUDONG"))
# pop_derived_filterd_sp <- as_Spatial(pop_derived_c_filtered)
pop_derived_filtered_sf <- pop_derived_c %>% filter(SUBZONE_N %notin% c("NORTH-EASTERN ISLANDS", "PULAU SELETAR", "SEMAKAU", "SOUTHERN GROUP", "SUDONG"))
pop_derived_filtered_sp <- as_Spatial(pop_derived_filtered_sf)
# plot(pop_derived_filtered_sp)
nrow(pop_derived_sp)
## [1] 323
nrow(pop_derived_filtered_sp)
## [1] 318
class(pop_derived_sp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
class(pop_derived_filtered_sp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
Recalc Neighbour list
pop_derived_filtered.nb <- poly2nb(pop_derived_filtered_sp)
summary(pop_derived_filtered.nb)
## Neighbour list object:
## Number of regions: 318
## Number of nonzero links: 1938
## Percentage nonzero weights: 1.916459
## Average number of links: 6.09434
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9 10 11 12 14 17
## 2 6 10 26 77 85 52 34 17 3 3 1 1 1
## 2 least connected regions:
## 16 231 with 1 link
## 1 most connected region:
## 308 with 17 links
plot(pop_derived_filtered_sp, border=grey(.5))
plot(pop_derived_filtered.nb, coordinates(pop_derived_filtered_sp), col="blue", add=TRUE)
lcosts <- nbcosts(pop_derived_filtered.nb, pop_derived_df)
pop_derived_filtered.w <- nb2listw(pop_derived_filtered.nb, lcosts, style="B")
summary(pop_derived_filtered.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 318
## Number of nonzero links: 1938
## Percentage nonzero weights: 1.916459
## Average number of links: 6.09434
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9 10 11 12 14 17
## 2 6 10 26 77 85 52 34 17 3 3 1 1 1
## 2 least connected regions:
## 16 231 with 1 link
## 1 most connected region:
## 308 with 17 links
##
## Weights style: B
## Weights constants summary:
## n nn S0 S1 S2
## B 318 101124 2246.255 6013.154 71179.68
pop_derived_filtered.mst <- mstree(pop_derived_filtered.w)
class(pop_derived_filtered.mst)
## [1] "mst" "matrix"
dim(pop_derived_filtered.mst)
## [1] 317 3
head(pop_derived_filtered.mst)
## [,1] [,2] [,3]
## [1,] 56 96 1.06070658
## [2,] 96 123 0.81471313
## [3,] 123 128 0.06917768
## [4,] 123 79 0.15059524
## [5,] 128 162 0.18875858
## [6,] 162 124 0.14914557
plot(pop_derived_filtered_sp, border=gray(.5))
plot.mst(pop_derived_filtered.mst, coordinates(pop_derived_filtered_sp),
col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)
Computing spatially constrained clusters using SKATER method
clust4 <- skater(pop_derived_filtered.mst[,1:2], pop_derived_df, 4)
str(clust4)
## List of 8
## $ groups : num [1:318] 4 1 1 1 1 1 1 1 1 4 ...
## $ edges.groups:List of 5
## ..$ :List of 3
## .. ..$ node: num [1:229] 33 69 92 161 147 271 308 192 8 25 ...
## .. ..$ edge: num [1:228, 1:3] 147 271 144 249 206 25 131 159 29 171 ...
## .. ..$ ssw : num 194
## ..$ :List of 3
## .. ..$ node: num [1:31] 304 298 313 221 292 303 265 203 280 301 ...
## .. ..$ edge: num [1:30, 1:3] 301 279 292 287 289 302 280 317 315 318 ...
## .. ..$ ssw : num 16.3
## ..$ :List of 3
## .. ..$ node: num [1:36] 283 273 236 276 240 250 284 239 248 267 ...
## .. ..$ edge: num [1:35, 1:3] 236 222 250 248 276 239 240 284 239 300 ...
## .. ..$ ssw : num 25.3
## ..$ :List of 3
## .. ..$ node: num [1:16] 19 18 17 21 20 1 22 11 30 26 ...
## .. ..$ edge: num [1:15, 1:3] 11 30 1 21 11 22 17 26 22 18 ...
## .. ..$ ssw : num 9.35
## ..$ :List of 3
## .. ..$ node: num [1:6] 311 309 277 274 314 258
## .. ..$ edge: num [1:5, 1:3] 311 311 274 309 277 258 309 314 277 274 ...
## .. ..$ ssw : num 3.79
## $ not.prune : NULL
## $ candidates : int [1:5] 1 2 3 4 5
## $ ssto : num 277
## $ ssw : num [1:5] 277 267 259 254 249
## $ crit : num [1:2] 1 Inf
## $ vec.crit : num [1:318] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "class")= chr "skater"
ccs4 <- clust4$groups
ccs4
## [1] 4 1 1 1 1 1 1 1 1 4 4 1 4 1 1 1 4 4 4 4 4 4 1 1 1 4 1 1 1 4 1 4 1 1 1 1 1
## [38] 4 1 1 4 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 2 1 3 1 1 1 1 1 1 1 1 3 3 1 1 1 1 1 2 3
## [223] 3 3 1 1 1 1 1 1 1 1 1 2 1 3 3 3 3 3 1 1 1 1 3 3 3 3 1 3 1 2 1 1 1 2 1 5 3
## [260] 3 2 3 3 3 2 1 3 1 1 1 1 3 3 5 3 3 5 1 2 2 3 1 3 3 2 1 2 2 2 3 2 2 1 2 2 2
## [297] 2 2 2 3 2 2 2 2 3 1 3 1 5 1 5 2 2 5 2 2 2 2
table(ccs4)
## ccs4
## 1 2 3 4 5
## 229 31 36 16 6
plot(pop_derived_filtered_sp, border=gray(.5))
plot(clust4, coordinates(pop_derived_filtered_sp), cex.lab=.7,
groups.colors=c("red","green","blue", "brown", "pink"), cex.circles=0.005, add=TRUE)
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter
Visualising clusters
groups_mat <- as.matrix(clust4$groups)
pop_derived_filtered_sp_Scluster <- cbind(pop_derived_filtered_sf, as.factor(groups_mat)) %>% rename(`SP_CLUSTER`=`as.factor.groups_mat.`)
spatial_map <- qtm(pop_derived_filtered_sp_Scluster, "SP_CLUSTER")
## Warning: The shape pop_derived_filtered_sp_Scluster is invalid. See
## sf::st_is_valid
spatial_map