Import data
1. Planning Area Subzone
pasz <- read_csv ("data/aspatial/planning-area-subzone-age-group-sex-and-type-of-dwelling-june-2011-2019.csv")
## Parsed with column specification:
## cols(
## planning_area = col_character(),
## subzone = col_character(),
## age_group = col_character(),
## sex = col_character(),
## type_of_dwelling = col_character(),
## resident_count = col_double(),
## year = col_double()
## )
any(is.na(pasz))
## [1] FALSE
2. Planning Subzone boundary
mpsz <- st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL") %>%
select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `geometry`)
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## projected CRS: SVY21
mpsz_sf <- st_as_sf(mpsz, 3414)
mpsz_sf3414 <- st_transform(mpsz_sf, 3414)
st_crs(mpsz_sf3414)
## Coordinate Reference System:
## User input: EPSG:3414
## wkt:
## PROJCRS["SVY21 / Singapore TM",
## BASEGEOGCRS["SVY21",
## DATUM["SVY21",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4757]],
## CONVERSION["Singapore Transverse Mercator",
## METHOD["Transverse Mercator",
## ID["EPSG",9807]],
## PARAMETER["Latitude of natural origin",1.36666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",103.833333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["Scale factor at natural origin",1,
## SCALEUNIT["unity",1],
## ID["EPSG",8805]],
## PARAMETER["False easting",28001.642,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",38744.572,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["northing (N)",north,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["easting (E)",east,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["unknown"],
## AREA["Singapore"],
## BBOX[1.13,103.59,1.47,104.07]],
## ID["EPSG",3414]]
3. Business
business <- st_read(dsn = "data/geospatial", layer = "Business") %>%
filter(FAC_TYPE == 5000) %>%
select(`POI_ID`, `geometry`) %>%
rename(business_id = POI_ID)
## Reading layer `Business' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 6550 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6147 ymin: 1.24605 xmax: 104.0044 ymax: 1.4698
## geographic CRS: WGS 84
business_sf <- st_as_sf(business, 4326)
business_sf3414 <- st_transform(business_sf, 3414)
st_crs(business_sf3414)
## Coordinate Reference System:
## User input: EPSG:3414
## wkt:
## PROJCRS["SVY21 / Singapore TM",
## BASEGEOGCRS["SVY21",
## DATUM["SVY21",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4757]],
## CONVERSION["Singapore Transverse Mercator",
## METHOD["Transverse Mercator",
## ID["EPSG",9807]],
## PARAMETER["Latitude of natural origin",1.36666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",103.833333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["Scale factor at natural origin",1,
## SCALEUNIT["unity",1],
## ID["EPSG",8805]],
## PARAMETER["False easting",28001.642,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",38744.572,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["northing (N)",north,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["easting (E)",east,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["unknown"],
## AREA["Singapore"],
## BBOX[1.13,103.59,1.47,104.07]],
## ID["EPSG",3414]]
mpsz_b <- st_join(mpsz_sf3414, business_sf3414) %>%
group_by(SUBZONE_N, REGION_N, SHAPE_Area) %>%
summarise(business_count = sum(!is.na(business_id))) %>%
select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `geometry`)
4. Industry
industry <- st_read(dsn = "data/geospatial", layer = "Business") %>%
filter(FAC_TYPE == 9991) %>%
select(`POI_ID`, `geometry`) %>%
rename(industry_id = POI_ID)
## Reading layer `Business' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 6550 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6147 ymin: 1.24605 xmax: 104.0044 ymax: 1.4698
## geographic CRS: WGS 84
industry_sf <- st_as_sf(industry, 4326)
industry_sf3414 <- st_transform(industry_sf, 3414)
st_crs(industry_sf3414)
## Coordinate Reference System:
## User input: EPSG:3414
## wkt:
## PROJCRS["SVY21 / Singapore TM",
## BASEGEOGCRS["SVY21",
## DATUM["SVY21",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4757]],
## CONVERSION["Singapore Transverse Mercator",
## METHOD["Transverse Mercator",
## ID["EPSG",9807]],
## PARAMETER["Latitude of natural origin",1.36666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",103.833333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["Scale factor at natural origin",1,
## SCALEUNIT["unity",1],
## ID["EPSG",8805]],
## PARAMETER["False easting",28001.642,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",38744.572,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["northing (N)",north,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["easting (E)",east,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["unknown"],
## AREA["Singapore"],
## BBOX[1.13,103.59,1.47,104.07]],
## ID["EPSG",3414]]
mpsz_bi <- st_join(mpsz_b, industry_sf3414) %>%
group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count) %>%
summarise(industry_count = sum(!is.na(industry_id))) %>%
select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `geometry`)
5. Financial
financial <- st_read(dsn = "data/geospatial", layer = "Financial") %>%
select(`POI_ID`, `geometry`) %>%
rename(financial_id = POI_ID)
## Reading layer `Financial' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3320 features and 29 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6256 ymin: 1.24392 xmax: 103.9998 ymax: 1.46247
## geographic CRS: WGS 84
financial_sf <- st_as_sf(financial, 4326)
financial_sf3414 <- st_transform(financial_sf, 3414)
st_crs(financial_sf3414)
## Coordinate Reference System:
## User input: EPSG:3414
## wkt:
## PROJCRS["SVY21 / Singapore TM",
## BASEGEOGCRS["SVY21",
## DATUM["SVY21",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4757]],
## CONVERSION["Singapore Transverse Mercator",
## METHOD["Transverse Mercator",
## ID["EPSG",9807]],
## PARAMETER["Latitude of natural origin",1.36666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",103.833333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["Scale factor at natural origin",1,
## SCALEUNIT["unity",1],
## ID["EPSG",8805]],
## PARAMETER["False easting",28001.642,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",38744.572,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["northing (N)",north,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["easting (E)",east,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["unknown"],
## AREA["Singapore"],
## BBOX[1.13,103.59,1.47,104.07]],
## ID["EPSG",3414]]
mpsz_bif <- st_join(mpsz_bi, financial_sf3414) %>%
group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count, industry_count) %>%
summarise(financial_count = sum(!is.na(financial_id))) %>%
select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `financial_count`, `geometry`)
6. Government including embassy
govt <- st_read(dsn = "data/geospatial", layer = "Govt_Embassy") %>%
select(`POI_ID`, `geometry`) %>%
rename(govt_id = POI_ID)
## Reading layer `Govt_Embassy' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 443 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6282 ymin: 1.24911 xmax: 103.9884 ymax: 1.45765
## geographic CRS: WGS 84
govt_sf <- st_as_sf(govt, 4326)
govt_sf3414 <- st_transform(govt_sf, 3414)
st_crs(govt_sf3414)
## Coordinate Reference System:
## User input: EPSG:3414
## wkt:
## PROJCRS["SVY21 / Singapore TM",
## BASEGEOGCRS["SVY21",
## DATUM["SVY21",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4757]],
## CONVERSION["Singapore Transverse Mercator",
## METHOD["Transverse Mercator",
## ID["EPSG",9807]],
## PARAMETER["Latitude of natural origin",1.36666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",103.833333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["Scale factor at natural origin",1,
## SCALEUNIT["unity",1],
## ID["EPSG",8805]],
## PARAMETER["False easting",28001.642,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",38744.572,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["northing (N)",north,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["easting (E)",east,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["unknown"],
## AREA["Singapore"],
## BBOX[1.13,103.59,1.47,104.07]],
## ID["EPSG",3414]]
mpsz_bifg <- st_join(mpsz_bif, govt_sf3414) %>%
group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count, industry_count, financial_count) %>%
summarise(govt_count = sum(!is.na(govt_id))) %>%
select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `financial_count`, `govt_count`, `geometry`)
7. Private Residential
pr <- st_read(dsn = "data/geospatial", layer = "Private residential") %>%
select(`POI_ID`, `geometry`) %>%
rename(pr_id = POI_ID)
## Reading layer `Private residential' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3604 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6295 ymin: 1.23943 xmax: 103.9749 ymax: 1.45379
## geographic CRS: WGS 84
pr_sf <- st_as_sf(pr, 4326)
pr_sf3414 <- st_transform(pr_sf, 3414)
st_crs(pr_sf3414)
## Coordinate Reference System:
## User input: EPSG:3414
## wkt:
## PROJCRS["SVY21 / Singapore TM",
## BASEGEOGCRS["SVY21",
## DATUM["SVY21",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4757]],
## CONVERSION["Singapore Transverse Mercator",
## METHOD["Transverse Mercator",
## ID["EPSG",9807]],
## PARAMETER["Latitude of natural origin",1.36666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",103.833333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["Scale factor at natural origin",1,
## SCALEUNIT["unity",1],
## ID["EPSG",8805]],
## PARAMETER["False easting",28001.642,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",38744.572,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["northing (N)",north,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["easting (E)",east,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["unknown"],
## AREA["Singapore"],
## BBOX[1.13,103.59,1.47,104.07]],
## ID["EPSG",3414]]
mpsz_bifgp <- st_join(mpsz_bifg, pr_sf3414) %>%
group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count, industry_count, financial_count, govt_count) %>%
summarise(pr_count = sum(!is.na(pr_id))) %>%
select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `financial_count`, `govt_count`, `pr_count`, `geometry`)
8. Shopping
shopping <- st_read(dsn = "data/geospatial", layer = "Shopping") %>%
select(`POI_ID`, `geometry`) %>%
rename(shopping_id = POI_ID)
## Reading layer `Shopping' from data source `C:\Users\Jenny\Documents\SMU Documents\YEAR 3\Year 3 Semester 2\IS415 - Geospatial Analytics and Applications\Take-home exercises\IS415_Take-home_Ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 511 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.679 ymin: 1.24779 xmax: 103.9644 ymax: 1.4535
## geographic CRS: WGS 84
shopping_sf <- st_as_sf(shopping, 4326)
shopping_sf3414 <- st_transform(shopping_sf, 3414)
st_crs(shopping_sf3414)
## Coordinate Reference System:
## User input: EPSG:3414
## wkt:
## PROJCRS["SVY21 / Singapore TM",
## BASEGEOGCRS["SVY21",
## DATUM["SVY21",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4757]],
## CONVERSION["Singapore Transverse Mercator",
## METHOD["Transverse Mercator",
## ID["EPSG",9807]],
## PARAMETER["Latitude of natural origin",1.36666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",103.833333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["Scale factor at natural origin",1,
## SCALEUNIT["unity",1],
## ID["EPSG",8805]],
## PARAMETER["False easting",28001.642,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",38744.572,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["northing (N)",north,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["easting (E)",east,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## USAGE[
## SCOPE["unknown"],
## AREA["Singapore"],
## BBOX[1.13,103.59,1.47,104.07]],
## ID["EPSG",3414]]
mpsz_bifgps <- st_join(mpsz_bifgp, shopping_sf3414) %>%
group_by(SUBZONE_N, REGION_N, SHAPE_Area, business_count, industry_count, financial_count, govt_count, pr_count) %>%
summarise(shopping_count = sum(!is.na(shopping_id))) %>%
select(`SUBZONE_N`, `REGION_N`, `SHAPE_Area`, `business_count`, `industry_count`, `financial_count`, `govt_count`, `pr_count`, `shopping_count`, `geometry`)
Exploratory Data Analysis (EDA)
1. EDA using statistical graphics
young <- ggplot(data=residents_mpsz_derived,
aes(x= `young_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
economy_active <- ggplot(data=residents_mpsz_derived,
aes(x= `economy_active_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
aged <- ggplot(data=residents_mpsz_derived,
aes(x= `aged_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
HDB12rm <- ggplot(data=residents_mpsz_derived,
aes(x= `HDB12rm_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
HDB34rm <- ggplot(data=residents_mpsz_derived,
aes(x= `HDB34rm_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
HDB5rmec <- ggplot(data=residents_mpsz_derived,
aes(x= `HDB5rmec_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
condo <- ggplot(data=residents_mpsz_derived,
aes(x= `condo_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
landed <- ggplot(data=residents_mpsz_derived,
aes(x= `landed_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
pop_d <- ggplot(data=residents_mpsz_derived,
aes(x= `pop_density`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
# multiple histograms are plottted to reveal the distribution of the selected variables in the residents_mpsz_derived data.frame.
ggarrange(young, economy_active, aged, HDB12rm, HDB34rm, HDB5rmec, condo, landed, pop_d, ncol = 3, nrow = 3)

From the histograms above, we observed that the overall distribution of data values are very right skewed.
b <- ggplot(data=residents_mpsz_derived,
aes(x= `business_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
i <- ggplot(data=residents_mpsz_derived,
aes(x= `industry_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
f <- ggplot(data=residents_mpsz_derived,
aes(x= `financial_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
g <- ggplot(data=residents_mpsz_derived,
aes(x= `govt_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
p <- ggplot(data=residents_mpsz_derived,
aes(x= `pr_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
s <- ggplot(data=residents_mpsz_derived,
aes(x= `shopping_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
# multiple histograms are plottted to reveal the distribution of the selected variables in the residents_mpsz_derived data.frame.
ggarrange(b, i, f, g, p, s,
ncol = 3,
nrow = 2)

From the histograms above, we observed that the overall distribution of data values are very right skewed.
young.box <- ggplot(data=residents_mpsz_derived, aes(x=`young_pr`)) +
geom_boxplot(color="black", fill="light blue")
ea.box <- ggplot(data=residents_mpsz_derived, aes(x=`economy_active_pr`)) +
geom_boxplot(color="black", fill="light blue")
a.box <- ggplot(data=residents_mpsz_derived, aes(x=`aged_pr`)) +
geom_boxplot(color="black", fill="light blue")
hdb12.box <- ggplot(data=residents_mpsz_derived, aes(x=`HDB12rm_pr`)) +
geom_boxplot(color="black", fill="light blue")
hdb34.box <- ggplot(data=residents_mpsz_derived, aes(x=`HDB34rm_pr`)) +
geom_boxplot(color="black", fill="light blue")
hdb5.box <- ggplot(data=residents_mpsz_derived, aes(x=`HDB5rmec_pr`)) +
geom_boxplot(color="black", fill="light blue")
condo.box <- ggplot(data=residents_mpsz_derived, aes(x=`condo_pr`)) +
geom_boxplot(color="black", fill="light blue")
landed.box <- ggplot(data=residents_mpsz_derived, aes(x=`landed_pr`)) +
geom_boxplot(color="black", fill="light blue")
popd.box <- ggplot(data=residents_mpsz_derived, aes(x=`pop_density`)) +
geom_boxplot(color="black", fill="light blue")
ggarrange(young.box, ea.box, a.box, hdb12.box, hdb34.box, hdb5.box, condo.box, landed.box, popd.box, ncol = 3, nrow = 3)

From the boxplots above, we observed that some variables - aged_pr, HDB12rm_pr and HDB5rmec_pr have outliers and this may cause biasness. We may use standardisation to reduce the biasness.
b.box <- ggplot(data=residents_mpsz_derived, aes(x=`business_pr`)) +
geom_boxplot(color="black", fill="light blue")
i.box <- ggplot(data=residents_mpsz_derived, aes(x=`industry_pr`)) +
geom_boxplot(color="black", fill="light blue")
g.box <- ggplot(data=residents_mpsz_derived, aes(x=`financial_pr`)) +
geom_boxplot(color="black", fill="light blue")
f.box <- ggplot(data=residents_mpsz_derived, aes(x=`govt_pr`)) +
geom_boxplot(color="black", fill="light blue")
pr.box <- ggplot(data=residents_mpsz_derived, aes(x=`pr_pr`)) +
geom_boxplot(color="black", fill="light blue")
s.box <- ggplot(data=residents_mpsz_derived, aes(x=`shopping_pr`)) +
geom_boxplot(color="black", fill="light blue")
ggarrange(b.box, i.box, g.box, f.box, pr.box, s.box, ncol = 3, nrow = 2)

From the boxplots above, we observed that some variables - industry_pr, financial+pr, govt_pr and pr_pr have outliers and this may cause biasness. We may use standardisation to reduce the biasness.
2. EDA using choropleth map
total_residents.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "resident_count",
n = 5,
style = "jenks",
title = "Total Residents") +
tm_borders(alpha = 0.5)
young.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "young",
n = 5,
style = "jenks",
title = "Number of Young Residents ") +
tm_borders(alpha = 0.5)
ec.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "economy_active",
n = 5,
style = "jenks",
title = "Number of Economy Active Residents ") +
tm_borders(alpha = 0.5)
aged.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "aged",
n = 5,
style = "jenks",
title = "Number of Aged Residents ") +
tm_borders(alpha = 0.5)
tmap_arrange(total_residents.map, young.map, ec.map, aged.map)
## Some legend labels were too wide. These labels have been resized to 0.65. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.

The choropleth maps above clearly show that areas with relatively larger number of residents are also showing relatively higher number of young, economy active, and aged residents.
total_residents.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "resident_count",
n = 5,
style = "jenks",
title = "Total Residents") +
tm_borders(alpha = 0.5)
HDB12rm.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "HDB12rm",
n = 5,
style = "jenks",
title = "Number of Residents living in HDB12rm") +
tm_borders(alpha = 0.5)
HDB34rm.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "HDB34rm",
n = 5,
style = "jenks",
title = "Number of Residents living in HDB34rm") +
tm_borders(alpha = 0.5)
HDB5rmec.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "HDB5rmec",
n = 5,
style = "jenks",
title = "Number of Residents living in HDB5rmec") +
tm_borders(alpha = 0.5)
condo.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "condo",
n = 5,
style = "jenks",
title = "Number of Residents living in condo") +
tm_borders(alpha = 0.5)
landed.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "landed",
n = 5,
style = "jenks",
title = "Number of Residents living in landed") +
tm_borders(alpha = 0.5)
tmap_arrange(total_residents.map, HDB12rm.map, HDB34rm.map, HDB5rmec.map, condo.map, landed.map)

The choropleth maps above clearly show that areas with relatively larger number of residents are also showing relatively higher number of residents living in HDB12rm, HDB34rm, HDB5rmec, condo and landed.
area.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "SHAPE_Area",
n = 5,
style = "jenks",
title = "Area") +
tm_borders(alpha = 0.5)
business.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "business_count",
n = 5,
style = "jenks",
title = "Number of Business") +
tm_borders(alpha = 0.5)
industry.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "industry_count",
n = 5,
style = "jenks",
title = "Number of Industry") +
tm_borders(alpha = 0.5)
financial.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "financial_count",
n = 5,
style = "jenks",
title = "Number of Financial") +
tm_borders(alpha = 0.5)
govt.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "govt_count",
n = 5,
style = "jenks",
title = "Number of Govt") +
tm_borders(alpha = 0.5)
pr.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "pr_count",
n = 5,
style = "jenks",
title = "Number of Private Residents") +
tm_borders(alpha = 0.5)
shopping.map <- tm_shape(st_make_valid(residents_mpsz_derived)) +
tm_fill(col = "shopping_count",
n = 5,
style = "jenks",
title = "Number of Shopping") +
tm_borders(alpha = 0.5)
tmap_arrange(area.map, business.map, industry.map, financial.map, govt.map, pr.map, shopping.map)
## Legend labels were too wide. The labels have been resized to 0.39, 0.34, 0.32, 0.31, 0.31. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.

The choropleth maps above clearly show that areas with relatively larger area does not affect the number of business, industry, financial, government, private residential and shopping.
3. Correlation Analysis
cluster_vars.cor = cor(as.data.frame(residents_mpsz_derived)[,19:27])
corrplot.mixed(cluster_vars.cor,
lower = "ellipse",
upper = "number",
tl.pos = "lt",
diag = "l",
tl.col = "black",
tl.cex = 1)

The correlation plot above shows that young_pr and economy_active_pr are highly correlated. This suggest that only one of them should be used in the cluster analysis instead of both.
Hierarchy Cluster Analysis
2. Data Standardisation
Min-Max standardisation
r_mpsz.std <- normalize(r_mpsz)
r_mpsz.std_df <- as.data.frame(r_mpsz.std)
summary(r_mpsz.std_df)
## young_pr aged_pr HDB12rm_pr HDB34rm_pr
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.4304 Median :0.1076 Median :0.00000 Median :0.0000
## Mean :0.3369 Mean :0.1046 Mean :0.04178 Mean :0.2650
## 3rd Qu.:0.5070 3rd Qu.:0.1639 3rd Qu.:0.02383 3rd Qu.:0.5364
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## HDB5rmec_pr condo_pr landed_pr pop_density
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.02725 Median :0.00000 Median :0.09513
## Mean :0.1395 Mean :0.19954 Mean :0.09242 Mean :0.21806
## 3rd Qu.:0.2351 3rd Qu.:0.24714 3rd Qu.:0.04831 3rd Qu.:0.40450
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
Visualising the standardised clustering variables
y <- ggplot(data=residents_mpsz_derived,
aes(x= `young_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
r_mpsz_y_df <- as.data.frame(r_mpsz.std_df)
y1 <- ggplot(data=r_mpsz_y_df,
aes(x=`young_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
ggtitle("Min-Max Standardisation")
a <- ggplot(data=residents_mpsz_derived,
aes(x= `aged_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
r_mpsz_a_df <- as.data.frame(r_mpsz.std_df)
a1 <- ggplot(data=r_mpsz_a_df,
aes(x=`aged_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
ggtitle("Min-Max Standardisation")
p <- ggplot(data=residents_mpsz_derived,
aes(x= `pop_density`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
r_mpsz_p_df <- as.data.frame(r_mpsz.std_df)
p1 <- ggplot(data=r_mpsz_p_df,
aes(x=`pop_density`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
ggtitle("Min-Max Standardisation")
ggarrange(y, y1, a, a1, p, p1, ncol = 2, nrow = 3)

hdb12rm <- ggplot(data=residents_mpsz_derived,
aes(x= `HDB12rm_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
r_mpsz_hdb12rm_df <- as.data.frame(r_mpsz.std_df)
hdb12rm1 <- ggplot(data=r_mpsz_hdb12rm_df,
aes(x=`HDB12rm_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
ggtitle("Min-Max Standardisation")
hdb34rm <- ggplot(data=residents_mpsz_derived,
aes(x= `HDB34rm_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
r_mpsz_hdb34rm_df <- as.data.frame(r_mpsz.std_df)
hdb34rm1 <- ggplot(data=r_mpsz_hdb34rm_df,
aes(x=`HDB34rm_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
ggtitle("Min-Max Standardisation")
hdb5rm <- ggplot(data=residents_mpsz_derived,
aes(x= `HDB5rmec_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
r_mpsz_hdb5rm_df <- as.data.frame(r_mpsz.std_df)
hdb5rm1 <- ggplot(data=r_mpsz_hdb5rm_df,
aes(x=`HDB5rmec_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
ggtitle("Min-Max Standardisation")
ggarrange(hdb12rm, hdb12rm1, hdb34rm, hdb34rm1, hdb5rm, hdb5rm1, ncol = 2, nrow = 3)

c <- ggplot(data=residents_mpsz_derived,
aes(x= `condo_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
r_mpsz_condo_df <- as.data.frame(r_mpsz.std_df)
c1 <- ggplot(data=r_mpsz_condo_df,
aes(x=`condo_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
ggtitle("Min-Max Standardisation")
l <- ggplot(data=residents_mpsz_derived,
aes(x= `landed_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue")
r_mpsz_landed_df <- as.data.frame(r_mpsz.std_df)
l1 <- ggplot(data=r_mpsz_landed_df,
aes(x=`landed_pr`)) +
geom_histogram(bins=20,
color="black",
fill="light blue") +
ggtitle("Min-Max Standardisation")
ggarrange(c, c1, l, l1, ncol = 2, nrow = 2)

Based on the histograms above, we noticed that overall range of the clustering variables is not very large. Therefore, we do not need to apply standardisation as doing so will change the distribution.
3. Computing proximity matrix
proxmat <- dist(r_mpsz, method = 'euclidean')
4. Computing hierarchical clustering
hclust_ward <- hclust(proxmat, method = 'ward.D')
plot(hclust_ward, cex = 0.6)

5. Selecting the optimal clustering algorithm
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
ac <- function(x) {
agnes(r_mpsz, method = x)$ac
}
map_dbl(m, ac)
## average single complete ward
## 0.9431697 0.8945523 0.9567745 0.9927800
The value that is closer to 1 suggest strong clustering structure. Ward method gives us the largest agglomerative coefficients. Hence the recommended method to use is ward.
Interpreting the dendrograms
plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k = 5, border = 2:5)

6. Visually-driven hierarchical clustering analysis
Plotting interactive cluster heatmap using heatmaply()
heatmaply(normalize(r_mpsz_mat),
Colv=NA,
dist_method = "euclidean",
hclust_method = "ward.D",
seriate = "OLO",
colors = Blues,
k_row = 5,
margins = c(NA,200,60,NA),
fontsize_row = 4,
fontsize_col = 5,
main="Geographic Segmentation of Subzones by Residents indicators",
xlab = "Residents Indicators",
ylab = "Subzones"
)
Spatially Constrained Clustering - SKATER approach
1. Converting into SpatialPolygonsDataFrame
residents_mpsz_filtered <- residents_mpsz_derived %>%
filter(resident_count == 0 & business_count == 0 & industry_count == 0 & govt_count == 0 & financial_count == 0 & pr_count == 0 & shopping_count == 0) %>%
ungroup()
residents_mpsz_filtered_df <- as.data.frame(residents_mpsz_filtered)
residents_filtered <- anti_join(residents_mpsz_derived, residents_mpsz_filtered_df, by='SUBZONE_N')
residents_sp <- as_Spatial(residents_filtered)
2. Computing Neighbour List
residents.nb <- poly2nb(residents_sp)
summary(residents.nb)
## Neighbour list object:
## Number of regions: 300
## Number of nonzero links: 1758
## Percentage nonzero weights: 1.953333
## Average number of links: 5.86
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9 10 11 12 15
## 1 5 13 34 79 74 47 31 12 1 1 1 1
## 1 least connected region:
## 106 with 1 link
## 1 most connected region:
## 39 with 15 links
plot(residents_sp, border=grey(.5))
plot(residents.nb, coordinates(residents_sp), col="blue", add=TRUE)

3. Computing minimum spanning tree
# Calculating edge costs
lcosts <- nbcosts(residents.nb, r_mpsz)
residents.w <- nb2listw(residents.nb, lcosts, style="B")
## Warning in nb2listw(residents.nb, lcosts, style = "B"): zero sum general weights
summary(residents.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 300
## Number of nonzero links: 1758
## Percentage nonzero weights: 1.953333
## Average number of links: 5.86
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9 10 11 12 15
## 1 5 13 34 79 74 47 31 12 1 1 1 1
## 1 least connected region:
## 106 with 1 link
## 1 most connected region:
## 39 with 15 links
##
## Weights style: B
## Weights constants summary:
## n nn S0 S1 S2
## B 300 90000 1213552 2072096477 22630050243
residents_mst <- mstree(residents.w)
class(residents_mst)
## [1] "mst" "matrix"
dim(residents_mst)
## [1] 299 3
plot(residents_sp, border=gray(.5))
plot.mst(residents_mst, coordinates(residents_sp),
col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)

4. Computing spatially constrained clusters using SKATER method
clust5 <- skater(residents_mst[,1:2], r_mpsz, 4)
ccs5 <- clust5$groups
ccs5
## [1] 1 1 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 1 4 4 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 2 3 3 1 1 1 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 4 1 1 1 5 1 4 1
## [112] 1 1 1 1 1 4 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 5 4 1 4 1 1 1 1 4 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 5 2 1 1 1 1 1 1 1 1 4 4 1
## [186] 1 1 1 5 1 1 5 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 4 1 1 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 3 1 1 1 1 5 2 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1
table(ccs5)
## ccs5
## 1 2 3 4 5
## 266 6 7 13 8
plot(residents_sp, border=gray(.5))
plot(clust5, coordinates(residents_sp), cex.lab=.7,
groups.colors=c("red","green","blue", "brown", "pink"),
cex.circles=0.005, add = TRUE)

5. Visualising the clusters in choropleth map
groups_mat <- as.matrix(clust5$groups)
residents_clusters_filtered <- anti_join(residents_mpsz_clusters, residents_mpsz_filtered_df, by='SUBZONE_N')
residents_sf_spatialcluster <- cbind(residents_clusters_filtered, as.factor(groups_mat)) %>%
rename(`SP_CLUSTER`=`as.factor.groups_mat.`)
qtm(residents_sf_spatialcluster, "SP_CLUSTER")
## Warning: The shape residents_sf_spatialcluster is invalid. See sf::st_is_valid

Based on the choropleth map above, we can observe that:
1. Cluster 2 is located at the centre of the map
2. Cluster 5 is located at the west of the map
3. Cluster 3 and 4 is located at the North-East/East of the map
5.1 Plotting business with clusters
tm_shape(st_make_valid(residents_sf_spatialcluster)) +
tm_fill("SP_CLUSTER") +
tm_borders(lwd = 0.1, alpha = 1) +
tm_layout(title = "Business") +
tm_bubbles(col = "business_count",
size = "business_count",
border.col = "black",
border.lwd = 1,
alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=business_count)) +
geom_bar(stat = "identity")

5.2 Plotting industry with clusters
tm_shape(st_make_valid(residents_sf_spatialcluster)) +
tm_fill("SP_CLUSTER") +
tm_borders(lwd = 0.1, alpha = 1) +
tm_layout(title = "Industry") +
tm_bubbles(col = "industry_count",
size = "industry_count",
border.col = "black",
border.lwd = 1,
alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=industry_count)) +
geom_bar(stat = "identity")

5.3 Plotting financial with clusters
tm_shape(st_make_valid(residents_sf_spatialcluster)) +
tm_fill("SP_CLUSTER") +
tm_borders(lwd = 0.1, alpha = 1) +
tm_layout(title = "Financial") +
tm_bubbles(col = "financial_count",
size = "financial_count",
border.col = "black",
border.lwd = 1,
alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=financial_count)) +
geom_bar(stat = "identity")

5.4 Plotting government with clusters
tm_shape(st_make_valid(residents_sf_spatialcluster)) +
tm_fill("SP_CLUSTER") +
tm_borders(lwd = 0.1, alpha = 1) +
tm_layout(title = "Government with Embassy") +
tm_bubbles(col = "govt_count",
size = "govt_count",
border.col = "black",
border.lwd = 1,
alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=govt_count)) +
geom_bar(stat = "identity")

5.5 Plotting Private Residents with clusters
tm_shape(st_make_valid(residents_sf_spatialcluster)) +
tm_fill("SP_CLUSTER") +
tm_borders(lwd = 0.1, alpha = 1) +
tm_layout(title = "Private Residents") +
tm_bubbles(col = "pr_count",
size = "pr_count",
border.col = "black",
border.lwd = 1,
alpha = 0.7)

ggplot(residents_sf_spatialcluster, aes(x=SP_CLUSTER, y=pr_count)) +
geom_bar(stat = "identity")

Based on choropleth maps and bar charts above, we observed that majority of the number of business, industry, financial, government with embassy, private resident and shopping is in cluster 1. This might be due to the fact that the area of cluster 1 is significantly larger than other clusters. Looking at the numbers in other clusters:
1. Cluster 4 and 5 has the most number of business buildings
2. Cluster 4 has the most number of industrial buildings
3. Cluster 3 has the most number of financial buildings
4. Cluster 4 has the most number of government with embassy buildings
5. Cluster 3 has the most number of private residential buildings
6. Cluster 3 has the most number of shopping Malls