In this take-home exercise, you are tasked to segment Singapore at the planning subzone level into homogeneous socioeconomic areas by combining geodemographic data extracted from Singapore Department of Statistics and urban functions extracted from the geospatial data provided.
packages = c('factoextra','rgdal', 'spdep', 'ClustGeo', 'tmap', 'sf', 'ggpubr', 'cluster', 'heatmaply', 'corrplot', 'psych', 'tidyverse')
for (p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
business_sf <- st_read(dsn = "data/geospatial", layer = "business")
## Reading layer `business' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 6550 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6147 ymin: 1.24605 xmax: 104.0044 ymax: 1.4698
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
financial_sf <- st_read(dsn = "data/geospatial", layer = "Financial")
## Reading layer `Financial' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3320 features and 29 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6256 ymin: 1.24392 xmax: 103.9998 ymax: 1.46247
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
govtembassy_sf <- st_read(dsn = "data/geospatial", layer = "Govt_Embassy")
## Reading layer `Govt_Embassy' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 443 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6282 ymin: 1.24911 xmax: 103.9884 ymax: 1.45765
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
private_sf <- st_read(dsn = "data/geospatial", layer = "Private residential")
## Reading layer `Private residential' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3604 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.6295 ymin: 1.23943 xmax: 103.9749 ymax: 1.45379
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
shopping_sf <- st_read(dsn = "data/geospatial", layer = "Shopping")
## Reading layer `Shopping' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 511 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 103.679 ymin: 1.24779 xmax: 103.9644 ymax: 1.4535
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
mpsz <- st_read(dsn = "data/geospatial/master-plan-2014-subzone-boundary-web-shp", layer = "MP14_SUBZONE_WEB_PL")
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `C:\Year 3\term 3\gis\Take hom assignments\Take-home_ex03\data\geospatial\master-plan-2014-subzone-boundary-web-shp' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## epsg (SRID): NA
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
resident <- read_csv ("data/aspatial/Singapore Residents by Planning AreaSubzone Age Group Sex and Type of Dwelling June 20112019/respopagesextod2011to2019.csv")
st_crs(business_sf)
## Coordinate Reference System:
## EPSG: 4326
## proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(financial_sf)
## Coordinate Reference System:
## EPSG: 4326
## proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(govtembassy_sf)
## Coordinate Reference System:
## EPSG: 4326
## proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(private_sf)
## Coordinate Reference System:
## EPSG: 4326
## proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(shopping_sf)
## Coordinate Reference System:
## EPSG: 4326
## proj4string: "+proj=longlat +datum=WGS84 +no_defs"
st_crs(mpsz)
## Coordinate Reference System:
## No EPSG code
## proj4string: "+proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs"
mpsz3414 <- subset(st_set_crs(mpsz,3414),select=c(SUBZONE_N))
mpsz_area <- subset(st_set_crs(mpsz,3414),select=c(SUBZONE_N,SHAPE_Area)) %>% st_set_geometry(NULL)
business3414 <- st_transform(business_sf,3414)
financial3414 <- st_transform(financial_sf,3414)
govtembassy3414 <- st_transform(govtembassy_sf,3414)
private3414 <- st_transform(private_sf,3414)
shopping3414 <- st_transform(shopping_sf,3414)
Data to extract in year 2019: Economy active population (i.e. age 25-64) Young group (i.e. age 0-24) Aged group (i.e 65 and above) Population density HDB1-2RM dwellers HDB3-4RM dwellers HDB5RM-Ec dweller Condominium and apartment dwellers Landed property dwellers
residents_2019 <- resident %>%
filter(Time == 2019)
Reclassify data according to the data needed to be extracted
Print Current Age group
unique(residents_2019$AG)
## [1] "0_to_4" "5_to_9" "10_to_14" "15_to_19" "20_to_24"
## [6] "25_to_29" "30_to_34" "35_to_39" "40_to_44" "45_to_49"
## [11] "50_to_54" "55_to_59" "60_to_64" "65_to_69" "70_to_74"
## [16] "75_to_79" "80_to_84" "85_to_89" "90_and_over"
Transpose age column and group the age groups into Young, ECONOMY Active and Aged.
residents_2019_AG <- residents_2019%>%
spread(AG, Pop) %>%
mutate(YOUNG = `0_to_4`+`5_to_9`+`10_to_14`+`15_to_19`+`20_to_24`) %>%
mutate(`ECONOMY ACTIVE` = `25_to_29`+`30_to_34`+`35_to_39`+`40_to_44`+`45_to_49`+`50_to_54`+`55_to_59`+`60_to_64`)%>%
mutate(`AGED`= `65_to_69`+`70_to_74`+`75_to_79`+`80_to_84`+`85_to_89`+`90_and_over`) %>%
mutate(`Pop`= `0_to_4`+`5_to_9`+`10_to_14`+`15_to_19`+`20_to_24`+ `25_to_29`+`30_to_34`+`35_to_39`+`40_to_44`+`45_to_49`+`50_to_54`+`55_to_59`+`60_to_64`+ `65_to_69`+`70_to_74`+`75_to_79`+`80_to_84`+`85_to_89`+`90_and_over`) %>%
mutate_at(.vars = vars(PA, SZ), .funs = funs(toupper)) %>%
select(`SZ`, `YOUNG`, `ECONOMY ACTIVE`, `AGED`,`Pop`)
residents_2019_AG
## # A tibble: 5,168 x 5
## SZ YOUNG `ECONOMY ACTIVE` AGED Pop
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ANG MO KIO TOWN CENTRE 340 600 80 1020
## 2 ANG MO KIO TOWN CENTRE 0 0 0 0
## 3 ANG MO KIO TOWN CENTRE 50 160 50 260
## 4 ANG MO KIO TOWN CENTRE 80 200 80 360
## 5 ANG MO KIO TOWN CENTRE 220 530 220 970
## 6 ANG MO KIO TOWN CENTRE 0 0 0 0
## 7 ANG MO KIO TOWN CENTRE 0 0 0 0
## 8 ANG MO KIO TOWN CENTRE 0 0 0 0
## 9 ANG MO KIO TOWN CENTRE 340 510 60 910
## 10 ANG MO KIO TOWN CENTRE 0 0 0 0
## # ... with 5,158 more rows
Sum up data by Subzone
residents_2019_AG_SZ <- residents_2019_AG%>%
group_by(SZ) %>%
summarise(YOUNG = sum(YOUNG),`ECONOMY ACTIVE` = sum(`ECONOMY ACTIVE`), AGED = sum(AGED),Pop=sum(Pop))
residents_2019_AG_SZ
## # A tibble: 323 x 5
## SZ YOUNG `ECONOMY ACTIVE` AGED Pop
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ADMIRALTY 4370 8380 1360 14110
## 2 AIRPORT ROAD 0 0 0 0
## 3 ALEXANDRA HILL 2910 7560 3310 13780
## 4 ALEXANDRA NORTH 580 1420 120 2120
## 5 ALJUNIED 8350 24330 7510 40190
## 6 ANAK BUKIT 5910 12490 3850 22250
## 7 ANCHORVALE 15250 27740 3620 46610
## 8 ANG MO KIO TOWN CENTRE 1360 2790 740 4890
## 9 ANSON 0 0 0 0
## 10 BALESTIER 7350 19320 6090 32760
## # ... with 313 more rows
Calculate Population density. We join mpsz_area to residents_2019_AG_SZ and obtain Pop/SHAPE_Area
residents_2019_AG_PD <- full_join(residents_2019_AG_SZ,mpsz_area, by = c("SZ"="SUBZONE_N"))
nrow(residents_2019_AG_PD)
## [1] 323
residents_2019_AG_PD$Pop_Density<-residents_2019_AG_PD$Pop/residents_2019_AG_PD$SHAPE_Area
head(residents_2019_AG_PD)
## # A tibble: 6 x 7
## SZ YOUNG `ECONOMY ACTIVE` AGED Pop SHAPE_Area Pop_Density
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ADMIRALTY 4370 8380 1360 14110 1261649. 0.0112
## 2 AIRPORT ROAD 0 0 0 0 494504. 0
## 3 ALEXANDRA HILL 2910 7560 3310 13780 1030379. 0.0134
## 4 ALEXANDRA NORTH 580 1420 120 2120 293706. 0.00722
## 5 ALJUNIED 8350 24330 7510 40190 2959368. 0.0136
## 6 ANAK BUKIT 5910 12490 3850 22250 2768354. 0.00804
Reclassify TOD and omit those that are not needed
Print Current TOD
unique(residents_2019$TOD)
## [1] "HDB 1- and 2-Room Flats"
## [2] "HDB 3-Room Flats"
## [3] "HDB 4-Room Flats"
## [4] "HDB 5-Room and Executive Flats"
## [5] "HUDC Flats (excluding those privatised)"
## [6] "Landed Properties"
## [7] "Condominiums and Other Apartments"
## [8] "Others"
We would like to reclassify the above into the following categories: 1. HDB1-2RM dwellers 2. HDB3-4RM dwellers 3. HDB5RM-Ec dweller 4. Condominium and apartment dwellers 5. Landed property dwellers
Create new columns to reclassify TOD by subzone and Obtain sum population density by Subzone
residents_2019_TOD <- residents_2019%>%
spread(TOD,Pop)%>%
mutate("HDB1-2RM dwellers" = `HDB 1- and 2-Room Flats`)%>%
mutate("HDB3-4RM dwellers" = `HDB 3-Room Flats`+`HDB 4-Room Flats`)%>%
mutate("HDB5RM-Ec dweller" = `HDB 5-Room and Executive Flats`)%>%
mutate("Condominium and apartment dwellers" = `Condominiums and Other Apartments`)%>%
mutate("Landed property dwellers" = `Landed Properties`) %>%
mutate_at(.vars = vars(PA, SZ), .funs = funs(toupper)) %>%
select(`SZ`, `HDB1-2RM dwellers`, `HDB3-4RM dwellers`, `HDB5RM-Ec dweller`, `Condominium and apartment dwellers`,`Landed property dwellers`)
head(residents_2019_TOD)
## # A tibble: 6 x 6
## SZ `HDB1-2RM dwell~ `HDB3-4RM dwell~ `HDB5RM-Ec dwel~ `Condominium an~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ANG ~ 0 20 20 40
## 2 ANG ~ 0 20 20 50
## 3 ANG ~ 0 40 60 80
## 4 ANG ~ 0 30 50 70
## 5 ANG ~ 0 20 50 80
## 6 ANG ~ 0 30 40 90
## # ... with 1 more variable: `Landed property dwellers` <dbl>
Sum up data by Subzone
residents_2019_TOD_SZ <- residents_2019_TOD%>%
group_by(SZ) %>%
summarise(`HDB1-2RM dwellers` = sum(`HDB1-2RM dwellers`),`HDB3-4RM dwellers` = sum(`HDB3-4RM dwellers`), `HDB5RM-Ec dweller` = sum(`HDB5RM-Ec dweller`),`Condominium and apartment dwellers`=sum(`Condominium and apartment dwellers`),`Landed property dwellers`=sum(`Landed property dwellers`))
head(residents_2019_TOD_SZ)
## # A tibble: 6 x 6
## SZ `HDB1-2RM dwell~ `HDB3-4RM dwell~ `HDB5RM-Ec dwel~ `Condominium an~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ADMI~ 1140 6750 6220 0
## 2 AIRP~ 0 0 0 0
## 3 ALEX~ 3910 6320 3120 0
## 4 ALEX~ 0 0 0 2120
## 5 ALJU~ 2120 18020 6400 11930
## 6 ANAK~ 160 2090 3170 9020
## # ... with 1 more variable: `Landed property dwellers` <dbl>
Before joining, lets do a count for each dataset
nrow(mpsz3414)
## [1] 323
nrow(residents_2019_AG_PD)
## [1] 323
nrow(residents_2019_TOD_SZ)
## [1] 323
Next we will do a full join of residents_2019_AG_PD and residents_2019_TOD_SZ to MPSZ
mpsz3414_AG<- full_join(mpsz3414, residents_2019_AG_PD, by = c("SUBZONE_N" = "SZ"))
We do a count of rows to ensure the joining of both data
nrow(mpsz3414_AG)
## [1] 323
mpsz3414_AG_TOD<- full_join(mpsz3414_AG, residents_2019_TOD_SZ, by = c("SUBZONE_N" = "SZ"))
We do a count of rows again to ensure the joining of both data
nrow(mpsz3414_AG_TOD)
## [1] 323
First we need to spilt business and industry
In order to do so, we will have to view the data first.
view(business3414)
FAC_TYPE consists of the 2 types - Business and Industry. Hence, lets look at the unqiue values to obtain the 2 codes.
unique(business3414$FAC_TYPE)
## [1] 5000 9991
By referencing to business3414, we can derive that 5000 represents Business and 9991 represents Industry. Next, we will split the 2. We will first extract Business.
business3414b <- business3414%>%
filter(FAC_TYPE==5000)
business3414b
## Simple feature collection with 6440 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 3669.148 ymin: 25408.41 xmax: 47034.83 ymax: 50148.54
## epsg (SRID): 3414
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## First 10 features:
## POI_ID SEQ_NUM FAC_TYPE POI_NAME
## 1 1101180209 1 5000 JOHN CHEN
## 2 1101180210 1 5000 TROPICAL INDUSTRIAL BUILDING
## 3 1101180211 1 5000 LIAN CHEONG INDUSTRIAL BUILDING
## 4 1101180212 1 5000 MALAYSIA GARMENT MANUFACTURERS
## 5 1101180213 1 5000 UNIGOLD
## 6 1192316144 1 5000 NUS UNIVERSITY HALL
## 7 1144317654 1 5000 SUITES AT BUKIT TIMAH
## 8 1103507488 1 5000 TIONG HUAT
## 9 1001052867 1 5000 LEE CHOON GUAN TIMBER MERCHANT
## 10 1001052868 1 5000 WEIGHT BRIDGE SERVICE
## ST_NAME geometry
## 1 LITTLE RD POINT (33818.36 35620.16)
## 2 LITTLE RD POINT (33770.51 35610.2)
## 3 LITTLE RD POINT (33779.41 35612.41)
## 4 <NA> POINT (33802.78 35598.04)
## 5 LITTLE RD POINT (33835.06 35623.47)
## 6 LOWER KENT RIDGE RD POINT (21813.48 31063.37)
## 7 JALAN JURONG KECHIL POINT (21375.11 35831.37)
## 8 KALLANG PUDDING RD POINT (33088.33 34439.2)
## 9 PENJURU RD POINT (17103.73 33407.71)
## 10 PENJURU RD POINT (17178.3 33503.9)
Next, we will extract Industry
business3414I <- business3414%>%
filter(FAC_TYPE==9991)
business3414I
## Simple feature collection with 110 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 4674.563 ymin: 26960.98 xmax: 46655.31 ymax: 49784.76
## epsg (SRID): 3414
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## First 10 features:
## POI_ID SEQ_NUM FAC_TYPE
## 1 1110491789 1 9991
## 2 1099992474 1 9991
## 3 1099992477 1 9991
## 4 1099992477 2 9991
## 5 1100464367 1 9991
## 6 1100464367 2 9991
## 7 1100464368 1 9991
## 8 1100464369 1 9991
## 9 1100464369 2 9991
## 10 1108286608 1 9991
## POI_NAME
## 1 KAKI BUKIT INDUSTRIAL ESTATE
## 2 TERRACE FACTORIES TUAS SOUTH ST 5
## 3 TERRACE FACTORIES TUAS SOUTH ST 5 EAST ENTRANCE
## 4 JTC TERRACE FACTORIES TUAS S ST 5
## 5 CHANGI BUSINESS PARK CHANGI BUSINESS PARK CENTRAL 1 ENTRANCE
## 6 CHANGI BUSINESS PARK CENTRAL 1
## 7 CHANGI BUSINESS PARK
## 8 CHANGI BUSINESS PARK CHANGI BUSINESS PARK VISTA ENTRANCE
## 9 CHANGI BUSINESS PARK VISTA ENTRANCE
## 10 ANG MO KIO INDUSTRIAL PARK 1
## ST_NAME geometry
## 1 KAKI BUKIT AVE 1 POINT (35885.03 34987.72)
## 2 TUAS SOUTH ST 5 POINT (4726.864 30627.49)
## 3 TUAS SOUTH ST 5 POINT (4825.915 30636.33)
## 4 TUAS SOUTH ST 5 POINT (4825.915 30636.33)
## 5 CHANGI BUSINESS PARK CENTRAL 1 POINT (42460.02 35003.47)
## 6 CHANGI BUSINESS PARK CENTRAL 1 POINT (42460.02 35003.47)
## 7 CHANGI BUSINESS PARK CENTRAL 2 POINT (42729.34 35182.61)
## 8 CHANGI BUSINESS PARK VIS POINT (43055.4 35408.2)
## 9 CHANGI BUSINESS PARK VIS POINT (43055.4 35408.2)
## 10 ANG MO KIO IND PARK 1 POINT (30518.61 37985.3)
We will need to check for NA as we will need to count the number of each urban functions. Hence we need to ensure there is no NA for the column that we are using and ensure that is unique. First we check for NA.
financial3414[rowSums(is.na(financial3414))!=0,]
## Simple feature collection with 3320 features and 29 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 4881.527 ymin: 25171.88 xmax: 46526.16 ymax: 49338.02
## epsg (SRID): 3414
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## First 10 features:
## LINK_ID POI_ID SEQ_NUM FAC_TYPE POI_NAME POI_LANGCD
## 1 1170624361 1132324230 1 3578 UOB ENG
## 2 1112103842 1132315471 1 3578 POSB ENG
## 3 1112103842 1132315472 1 3578 UOB ENG
## 4 1112103842 1132315473 1 3578 OCBC ENG
## 5 864687596 1100784924 1 3578 OCBC ENG
## 6 902073032 1132324170 1 6000 MAYBANK ENG
## 7 778516217 1141424387 1 6000 ADPOST MONEYCHANGER ENG
## 8 880495939 1096910285 1 3578 UOB ENG
## 9 866996334 1096910292 1 3578 OCBC ENG
## 10 880495939 1096910286 1 3578 CITIBANK ENG
## POI_NMTYPE POI_ST_NUM ST_NUM_FUL ST_NFUL_LC ST_NAME ST_LANGCD
## 1 B 201 <NA> <NA> YISHUN AVE 2 ENG
## 2 B 375 <NA> <NA> COMMONWEALTH AVE ENG
## 3 B 375 <NA> <NA> COMMONWEALTH AVE ENG
## 4 B 375 <NA> <NA> COMMONWEALTH AVE ENG
## 5 B <NA> <NA> <NA> JURONG WEST ST 51 ENG
## 6 B 707 <NA> <NA> EAST COAST RD ENG
## 7 B 163 <NA> <NA> TANGLIN RD ENG
## 8 B <NA> <NA> <NA> <NA> <NA>
## 9 B 11 <NA> <NA> ARTS LINK ENG
## 10 B <NA> <NA> <NA> <NA> <NA>
## POI_ST_SD ACC_TYPE PH_NUMBER CHAIN_ID NAT_IMPORT PRIVATE IN_VICIN
## 1 L <NA> <NA> 6919 N N N
## 2 R <NA> <NA> 6918 N N N
## 3 R <NA> <NA> 6919 N N N
## 4 R <NA> <NA> 6920 N N N
## 5 R <NA> <NA> 6920 N N N
## 6 L <NA> 18006292266 3657 N N N
## 7 R <NA> 67330779 0 N N N
## 8 R <NA> <NA> 6919 N N N
## 9 R <NA> <NA> 6920 N N N
## 10 R <NA> <NA> 1165 N N N
## NUM_PARENT NUM_CHILD PERCFRREF VANCITY_ID
## 1 0 0 NA 0
## 2 0 0 NA 0
## 3 0 0 NA 0
## 4 0 0 NA 0
## 5 0 0 60 0
## 6 0 0 NA 0
## 7 1 0 50 0
## 8 0 0 20 0
## 9 0 0 NA 0
## 10 0 0 20 0
## ACT_ADDR
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 501 JURONG WEST STREET 51 SINGAPORE 640501
## 6 <NA>
## 7 <NA>
## 8 <NA>
## 9 <NA>
## 10 <NA>
## ACT_LANGCD ACT_ST_NAM ACT_ST_NUM ACT_ADMIN ACT_POSTAL
## 1 <NA> <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA> <NA>
## 5 ENG JURONG WEST STREET 51 501 SINGAPORE 640501
## 6 <NA> <NA> <NA> <NA> <NA>
## 7 <NA> <NA> <NA> <NA> <NA>
## 8 <NA> <NA> <NA> <NA> <NA>
## 9 <NA> <NA> <NA> <NA> <NA>
## 10 <NA> <NA> <NA> <NA> <NA>
## geometry
## 1 POINT (27966.77 44304.65)
## 2 POINT (24163.96 31606.25)
## 3 POINT (24163.96 31606.25)
## 4 POINT (24163.96 31606.25)
## 5 POINT (15270.94 36919.65)
## 6 POINT (37917.26 32698.88)
## 7 POINT (26981.85 31956.75)
## 8 POINT (21205.83 30939.54)
## 9 POINT (21159.08 30673.06)
## 10 POINT (21205.83 30939.54)
As seen above, POI_ID does not have any NA. this could likely be the unique primary key. Next, lets check for if it consists of only unique values. We do a count for each POI_ID.
F_POI_N <- financial3414 %>% group_by(POI_ID, LINK_ID) %>% summarize(count=n())
F_POI_N
## Simple feature collection with 3293 features and 3 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 4881.527 ymin: 25171.88 xmax: 46526.16 ymax: 49338.02
## epsg (SRID): 3414
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## # A tibble: 3,293 x 4
## # Groups: POI_ID [3,293]
## POI_ID LINK_ID count geometry
## * <dbl> <dbl> <int> <POINT [m]>
## 1 36438791 1195622058 1 (17518.91 35275.3)
## 2 36439016 1206451148 1 (20678.49 37946.69)
## 3 36439060 862669111 1 (34824.27 41553.63)
## 4 36439061 862669111 1 (34822.04 41538.15)
## 5 36439318 939808842 1 (33879.48 39781.09)
## 6 36439341 1199010724 1 (32936.92 37359.48)
## 7 36439599 116173695 1 (23708.83 34892.54)
## 8 36439668 914931160 1 (38133.16 32755.28)
## 9 36439689 901144425 1 (25411.55 34020.09)
## 10 36439715 1199568246 1 (31094.04 31778.74)
## # ... with 3,283 more rows
F_POI_N_2 <- F_POI_N %>% filter(count>=2) %>% st_set_geometry(NULL)
F_POI_N_2
## # A tibble: 27 x 3
## # Groups: POI_ID [27]
## POI_ID LINK_ID count
## * <dbl> <dbl> <int>
## 1 1040480984 1046226297 2
## 2 1058317297 762719732 2
## 3 1058317303 762719732 2
## 4 1058317333 966347222 2
## 5 1083881074 1128584674 2
## 6 1113017405 845452563 2
## 7 1115796366 778595512 2
## 8 1115796975 1188277710 2
## 9 1115797008 116157295 2
## 10 1115868988 862559104 2
## # ... with 17 more rows
Next, we join back the data to see why the data is duplicated.
F_POI_N_2_all <- left_join(F_POI_N_2,financial3414, c("POI_ID"="POI_ID"))
F_POI_N_2_all
## # A tibble: 54 x 32
## # Groups: POI_ID [27]
## POI_ID LINK_ID.x count LINK_ID.y SEQ_NUM FAC_TYPE POI_NAME POI_LANGCD
## <dbl> <dbl> <int> <dbl> <int> <int> <fct> <fct>
## 1 1.04e9 1.05e9 2 1.05e9 1 6000 CREDIT ~ ENG
## 2 1.04e9 1.05e9 2 1.05e9 2 6000 CREDIT ~ ENG
## 3 1.06e9 7.63e8 2 7.63e8 1 6000 BANQUE ~ ENG
## 4 1.06e9 7.63e8 2 7.63e8 2 6000 BCEE ENG
## 5 1.06e9 7.63e8 2 7.63e8 1 6000 DZ BANK~ ENG
## 6 1.06e9 7.63e8 2 7.63e8 2 6000 DZ BANK ENG
## 7 1.06e9 9.66e8 2 9.66e8 1 6000 RAIFFEI~ ENG
## 8 1.06e9 9.66e8 2 9.66e8 2 6000 RAIFFEI~ ENG
## 9 1.08e9 1.13e9 2 1.13e9 1 6000 AMEERTE~ ENG
## 10 1.08e9 1.13e9 2 1.13e9 2 6000 AMEERTE~ ENG
## # ... with 44 more rows, and 24 more variables: POI_NMTYPE <fct>,
## # POI_ST_NUM <fct>, ST_NUM_FUL <fct>, ST_NFUL_LC <fct>, ST_NAME <fct>,
## # ST_LANGCD <fct>, POI_ST_SD <fct>, ACC_TYPE <fct>, PH_NUMBER <fct>,
## # CHAIN_ID <dbl>, NAT_IMPORT <fct>, PRIVATE <fct>, IN_VICIN <fct>,
## # NUM_PARENT <int>, NUM_CHILD <int>, PERCFRREF <int>, VANCITY_ID <dbl>,
## # ACT_ADDR <fct>, ACT_LANGCD <fct>, ACT_ST_NAM <fct>, ACT_ST_NUM <fct>,
## # ACT_ADMIN <fct>, ACT_POSTAL <fct>, geometry <POINT [m]>
This can be seen that it is due to the SEQ_NUM abd POI_NMTYPE whereby there is SEQ_NUM = 1 and 2 and POI_NMTYPE = B and J for each of the duplicates. The other different field is the POI_NAME which differes slightly. Hence, for the purpose of our spatial data, let’s check if those rows with counts = 1 have any SEQ_NUM = 2.
F_POI_N_1 <- F_POI_N %>% filter(count==1) %>% st_set_geometry(NULL)
F_POI_N_1_all <- left_join(F_POI_N_1,financial3414, c("POI_ID"="POI_ID"))
unique(F_POI_N_1_all$SEQ_NUM)
## [1] 1
As seen above, there are no SEQ = 2. Hence, for our spatial data, we will filter all to sequence 1 only to avoid any duplicates.
business3414b_filter <- business3414b%>%filter(SEQ_NUM==1)
business3414I_filter <- business3414I%>%filter(SEQ_NUM==1)
financial3414_filter <- financial3414%>%filter(SEQ_NUM==1)
govtembassy3414_filter <- govtembassy3414%>%filter(SEQ_NUM==1)
private3414_filter <- private3414%>%filter(SEQ_NUM==1)
shopping3414_filter <- shopping3414%>%filter(SEQ_NUM==1)
We will join all the spatial data to mpsz to otbtain the subzone that each point belongs to and count by subzone
Before joining, lets do a row count.
nrow(financial3414_filter)
## [1] 3293
We join financial3414 to mpsz3414 to obtain the subzone of each point.
mpsz_financial <- st_join(mpsz3414, financial3414_filter, join = st_contains)
We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any financial centers.
nrow(mpsz_financial)
## [1] 3366
Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.
mpsz_financial_sz <- mpsz_financial %>%
group_by(SUBZONE_N)%>%
summarise(Financial = sum(!is.na(POI_ID)))
mpsz_financial_sz_df <- mpsz_financial_sz %>% st_set_geometry(NULL)
head(mpsz_financial_sz_df)
## # A tibble: 6 x 2
## SUBZONE_N Financial
## <fct> <int>
## 1 ADMIRALTY 2
## 2 AIRPORT ROAD 0
## 3 ALEXANDRA HILL 14
## 4 ALEXANDRA NORTH 0
## 5 ALJUNIED 42
## 6 ANAK BUKIT 31
Next, we join the mpsz_financial_sz to mpsz3414_AG_TOD by Subzone
mpsz3414_AG_TOD_F<- full_join(mpsz3414_AG_TOD, mpsz_financial_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))
We then count the number of rows to ensure it adds up to 323.
nrow(mpsz3414_AG_TOD_F)
## [1] 323
Before joining, lets do a row count.
nrow(govtembassy3414_filter)
## [1] 394
We join govtembassy3414 to mpsz3414 to obtain the subzone of each point.
mpsz_govtembassy <- st_join(mpsz3414, govtembassy3414_filter, join = st_contains)
We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any govt embassy.
nrow(mpsz_govtembassy)
## [1] 584
Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.
mpsz_govtembassy_sz <- mpsz_govtembassy %>%
group_by(SUBZONE_N)%>%
summarise(GovtEmbassy = sum(!is.na(POI_ID)))
mpsz_govtembassy_sz_df <- mpsz_govtembassy_sz %>% st_set_geometry(NULL)
Next, we join the mpsz_govtembassy_sz_df to mpsz3414_AG_TOD_F
mpsz3414_AG_TOD_F_G<- full_join(mpsz3414_AG_TOD_F, mpsz_govtembassy_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))
We then count the number of rows to ensure it adds up to 323.
nrow(mpsz3414_AG_TOD_F_G)
## [1] 323
Before joining, lets do a row count.
nrow(private3414_filter)
## [1] 3580
We join private3414 to mpsz3414 to obtain the subzone of each point.
mpsz_private <- st_join(mpsz3414, private3414_filter, join = st_contains)
We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any Private.
nrow(mpsz_private)
## [1] 3664
Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.
mpsz_private_sz <- mpsz_private %>%
group_by(SUBZONE_N)%>%
summarise(Private = sum(!is.na(POI_ID)))
mpsz_private_sz_df <- mpsz_private_sz %>% st_set_geometry(NULL)
Next, we join the mpsz_private3414_sz_df to mpsz3414_AG_TOD_F_G
mpsz3414_AG_TOD_F_G_P <- full_join(mpsz3414_AG_TOD_F_G, mpsz_private_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))
We then count the number of rows to ensure it adds up to 323.
nrow(mpsz3414_AG_TOD_F_G_P)
## [1] 323
Before joining, lets do a row count.
nrow(shopping3414_filter)
## [1] 458
We join shopping3414 to mpsz3414 to obtain the subzone of each point.
mpsz_shopping <- st_join(mpsz3414, shopping3414_filter, join = st_contains)
We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any Shopping Centres
nrow(mpsz_shopping)
## [1] 634
Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.
mpsz_Shopping_sz <- mpsz_shopping %>%
group_by(SUBZONE_N)%>%
summarise(Shopping = sum(!is.na(POI_ID)))
mpsz_Shopping_sz_df <- mpsz_Shopping_sz %>% st_set_geometry(NULL)
Next, we join the mpsz_Shopping_sz_df to mpsz3414_AG_TOD_F_G_P
mpsz3414_AG_TOD_F_G_P_S <- full_join(mpsz3414_AG_TOD_F_G_P, mpsz_Shopping_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))
We then count the number of rows to ensure it adds up to 323.
nrow(mpsz3414_AG_TOD_F_G_P_S)
## [1] 323
Before joining, lets do a row count.
nrow(business3414b_filter)
## [1] 6320
We join business3414b to mpsz3414 to obtain the subzone of each point.
mpsz_business<- st_join(mpsz3414, business3414b_filter, join = st_contains)
We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any Business Centres
nrow(mpsz_business)
## [1] 6427
Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.
mpsz_business_sz <- mpsz_business %>%
group_by(SUBZONE_N)%>%
summarise(Business = sum(!is.na(POI_ID)))
mpsz_business_sz_df <- mpsz_business_sz %>% st_set_geometry(NULL)
Next, we join the mpsz_business_sz_df to mpsz3414_AG_TOD_F_G_P
mpsz3414_AG_TOD_F_G_P_S_B <- full_join(mpsz3414_AG_TOD_F_G_P_S, mpsz_business_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))
We then count the number of rows to ensure it adds up to 323.
nrow(mpsz3414_AG_TOD_F_G_P_S_B)
## [1] 323
Before joining, lets do a row count.
nrow(business3414I_filter)
## [1] 95
We join business3414I to mpsz3414 to obtain the subzone of each point.
mpsz_industry<- st_join(mpsz3414, business3414I_filter, join = st_contains)
We do a count again after joining. As seen below, there is an increase in the number of rows. This is due to the fact that some subzones do not contain any Industry Centres
nrow(mpsz_industry)
## [1] 369
Next, we will group by Subzone and count the rows. We then extract the dataframe from the sf.
mpsz_industry_sz <- mpsz_industry %>%
group_by(SUBZONE_N)%>%
summarise(Industry = sum(!is.na(POI_ID)))
mpsz_industry_sz_df <- mpsz_industry_sz %>% st_set_geometry(NULL)
Next, we join the mpsz_industry_sz_df to mpsz3414_AG_TOD_F_G_P
mpsz3414_all <- full_join(mpsz3414_AG_TOD_F_G_P_S_B, mpsz_industry_sz_df, by = c("SUBZONE_N" = "SUBZONE_N"))
We then count the number of rows to ensure it adds up to 323.
nrow(mpsz3414_all)
## [1] 323
head(mpsz3414_all)
## Simple feature collection with 6 features and 18 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 24468.89 ymin: 28369.47 xmax: 32362.39 ymax: 30542.74
## epsg (SRID): 3414
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## SUBZONE_N YOUNG ECONOMY ACTIVE AGED Pop SHAPE_Area Pop_Density
## 1 MARINA SOUTH 0 0 0 0 1630379.3 0.0000000000
## 2 PEARL'S HILL 1100 3420 2110 6630 559816.2 0.0118431719
## 3 BOAT QUAY 0 50 20 70 160807.5 0.0004353031
## 4 HENDERSON HILL 2620 7500 3260 13380 595428.9 0.0224711973
## 5 REDHILL 2840 6260 1630 10730 387429.4 0.0276953656
## 6 ALEXANDRA HILL 2910 7560 3310 13780 1030378.8 0.0133737225
## HDB1-2RM dwellers HDB3-4RM dwellers HDB5RM-Ec dweller
## 1 0 0 0
## 2 4520 1400 0
## 3 0 0 0
## 4 3930 7990 1270
## 5 1970 3520 3240
## 6 3910 6320 3120
## Condominium and apartment dwellers Landed property dwellers Financial
## 1 0 0 3
## 2 420 0 24
## 3 50 0 2
## 4 190 0 4
## 5 1930 0 12
## 6 0 0 14
## GovtEmbassy Private Shopping Business Industry
## 1 0 0 0 0 0
## 2 1 6 3 6 0
## 3 1 1 1 39 0
## 4 0 5 0 0 0
## 5 0 6 1 1 0
## 6 5 4 1 39 1
## geometry
## 1 MULTIPOLYGON (((31495.56 30...
## 2 MULTIPOLYGON (((29092.28 30...
## 3 MULTIPOLYGON (((29932.33 29...
## 4 MULTIPOLYGON (((27131.28 30...
## 5 MULTIPOLYGON (((26451.03 30...
## 6 MULTIPOLYGON (((25899.7 297...
After joining all the data, lets check if there is any NA data.
mpsz3414_all[rowSums(is.na(mpsz3414_all))!=0,]
## Simple feature collection with 0 features and 18 fields
## bbox: xmin: NA ymin: NA xmax: NA ymax: NA
## epsg (SRID): 3414
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## [1] SUBZONE_N YOUNG
## [3] ECONOMY ACTIVE AGED
## [5] Pop SHAPE_Area
## [7] Pop_Density HDB1-2RM dwellers
## [9] HDB3-4RM dwellers HDB5RM-Ec dweller
## [11] Condominium and apartment dwellers Landed property dwellers
## [13] Financial GovtEmbassy
## [15] Private Shopping
## [17] Business Industry
## [19] geometry
## <0 rows> (or 0-length row.names)
We first use summary() to derive the summary statistics of mpsz3414_all data.frame.
summary(mpsz3414_all)
## SUBZONE_N YOUNG ECONOMY ACTIVE AGED
## Length:323 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Mode :character Median : 1170 Median : 2840 Median : 730
## Mean : 3296 Mean : 7386 Mean : 1806
## 3rd Qu.: 4365 3rd Qu.:10285 3rd Qu.: 3000
## Max. :34260 Max. :79780 Max. :18860
## Pop SHAPE_Area Pop_Density HDB1-2RM dwellers
## Min. : 0 Min. : 39438 Min. :0.00000 Min. : 0
## 1st Qu.: 0 1st Qu.: 628261 1st Qu.:0.00000 1st Qu.: 0
## Median : 4890 Median : 1229894 Median :0.00591 Median : 0
## Mean : 12487 Mean : 2420882 Mean :0.01074 Mean : 542
## 3rd Qu.: 17065 3rd Qu.: 2106483 3rd Qu.:0.01993 3rd Qu.: 605
## Max. :132900 Max. :69748299 Max. :0.04606 Max. :4700
## HDB3-4RM dwellers HDB5RM-Ec dweller Condominium and apartment dwellers
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median : 230
## Mean : 5953 Mean : 3297 Mean : 1827
## 3rd Qu.: 9705 3rd Qu.: 3660 3rd Qu.: 2835
## Max. :75000 Max. :47960 Max. :16770
## Landed property dwellers Financial GovtEmbassy Private
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 1.0 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.0 Median : 5.0 Median : 0.00 Median : 4.00
## Mean : 773.9 Mean : 10.2 Mean : 1.22 Mean : 11.08
## 3rd Qu.: 400.0 3rd Qu.: 13.0 3rd Qu.: 1.00 3rd Qu.: 11.00
## Max. :18820.0 Max. :132.0 Max. :17.00 Max. :215.00
## Shopping Business Industry geometry
## Min. : 0.000 Min. : 0.00 Min. :0.0000 MULTIPOLYGON :323
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.:0.0000 epsg:3414 : 0
## Median : 0.000 Median : 2.00 Median :0.0000 +proj=tmer...: 0
## Mean : 1.418 Mean : 19.57 Mean :0.2941
## 3rd Qu.: 1.000 3rd Qu.: 13.50 3rd Qu.:0.0000
## Max. :27.000 Max. :303.00 Max. :5.0000
We can plot the distribution of the variables by using appropriate Exploratory Data Analysis (EDA) as shown in the code chunk below. Histogram is useful to identify the overall distribution of the data values (i.e. left skew, right skew or normal distribution)
a <- ggplot(data=mpsz3414_all, aes(x=`YOUNG`)) +
geom_histogram(bins=20, color="black", fill="light blue")
b <- ggplot(data=mpsz3414_all, aes(x=`ECONOMY ACTIVE`)) +
geom_histogram(bins=20, color="black", fill="light blue")
c <- ggplot(data=mpsz3414_all, aes(x=`AGED`)) +
geom_histogram(bins=20, color="black", fill="light blue")
d <- ggplot(data=mpsz3414_all, aes(x=`Pop`)) +
geom_histogram(bins=20, color="black", fill="light blue")
e <- ggplot(data=mpsz3414_all, aes(x=`HDB1-2RM dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
f <- ggplot(data=mpsz3414_all, aes(x=`HDB3-4RM dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
g <- ggplot(data=mpsz3414_all, aes(x=`HDB5RM-Ec dweller`)) +
geom_histogram(bins=20, color="black", fill="light blue")
h <- ggplot(data=mpsz3414_all, aes(x=`Condominium and apartment dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
i <- ggplot(data=mpsz3414_all, aes(x=`Landed property dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
ggarrange(a,b,c,d,e,f,g,h,i,
ncol = 3,
nrow = 3)
j <- ggplot(data=mpsz3414_all, aes(x=`Financial`)) +
geom_histogram(bins=20, color="black", fill="light blue")
k <- ggplot(data=mpsz3414_all, aes(x=`GovtEmbassy`)) +
geom_histogram(bins=20, color="black", fill="light blue")
l <- ggplot(data=mpsz3414_all, aes(x=`Private`)) +
geom_histogram(bins=20, color="black", fill="light blue")
m <- ggplot(data=mpsz3414_all, aes(x=`Shopping`)) +
geom_histogram(bins=20, color="black", fill="light blue")
n <- ggplot(data=mpsz3414_all, aes(x=`Business`)) +
geom_histogram(bins=20, color="black", fill="light blue")
o <- ggplot(data=mpsz3414_all, aes(x=`Industry`)) +
geom_histogram(bins=20, color="black", fill="light blue")
ggarrange(j,k,l,m,n,o,
ncol = 3,
nrow = 2)
As all the NA values is due to the fact that the total population is 0, the new dervied variabless yield a NA value. Hence, we will have to remove this NA values with 0 as it is irelevant for our analysis.
mpsz3414_all_o <- mpsz3414_all%>%filter(Pop>0)
summary(mpsz3414_all_o)
## SUBZONE_N YOUNG ECONOMY ACTIVE AGED
## Length:234 Min. : 0 Min. : 10 Min. : 0.0
## Class :character 1st Qu.: 735 1st Qu.: 1760 1st Qu.: 472.5
## Mode :character Median : 2730 Median : 6505 Median : 1905.0
## Mean : 4549 Mean :10195 Mean : 2492.7
## 3rd Qu.: 6512 3rd Qu.:15582 3rd Qu.: 3547.5
## Max. :34260 Max. :79780 Max. :18860.0
## Pop SHAPE_Area Pop_Density HDB1-2RM dwellers
## Min. : 50 Min. : 49626 Min. :0.000004 Min. : 0.0
## 1st Qu.: 2992 1st Qu.: 552455 1st Qu.:0.004062 1st Qu.: 0.0
## Median : 11085 Median : 1059527 Median :0.011992 Median : 0.0
## Mean : 17237 Mean : 1733644 Mean :0.014820 Mean : 748.1
## 3rd Qu.: 26378 3rd Qu.: 1794212 3rd Qu.:0.024407 3rd Qu.:1290.0
## Max. :132900 Max. :69748299 Max. :0.046058 Max. :4700.0
## HDB3-4RM dwellers HDB5RM-Ec dweller Condominium and apartment dwellers
## Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 52.5
## Median : 5285 Median : 1850 Median : 1285.0
## Mean : 8217 Mean : 4552 Mean : 2521.8
## 3rd Qu.:13482 3rd Qu.: 5690 3rd Qu.: 4027.5
## Max. :75000 Max. :47960 Max. :16770.0
## Landed property dwellers Financial GovtEmbassy
## Min. : 0.0 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.: 3.00 1st Qu.: 0.000
## Median : 0.0 Median : 7.50 Median : 0.000
## Mean : 1068.3 Mean : 12.38 Mean : 1.269
## 3rd Qu.: 887.5 3rd Qu.: 16.00 3rd Qu.: 1.750
## Max. :18820.0 Max. :132.00 Max. :17.000
## Private Shopping Business Industry
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. :0.0000
## 1st Qu.: 3.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.:0.0000
## Median : 7.00 Median : 1.000 Median : 2.00 Median :0.0000
## Mean : 15.16 Mean : 1.718 Mean : 10.06 Mean :0.2137
## 3rd Qu.: 15.00 3rd Qu.: 2.000 3rd Qu.: 7.00 3rd Qu.:0.0000
## Max. :215.00 Max. :27.000 Max. :224.00 Max. :5.0000
## geometry
## MULTIPOLYGON :234
## epsg:3414 : 0
## +proj=tmer...: 0
##
##
##
We can plot the distribution of the variables by using appropriate Exploratory Data Analysis (EDA) as shown in the code chunk below. Histogram is useful to identify the overall distribution of the data values (i.e. left skew, right skew or normal distribution)
youngo <- ggplot(data=mpsz3414_all_o, aes(x=`YOUNG`)) +
geom_histogram(bins=20, color="black", fill="light blue")
econo <- ggplot(data=mpsz3414_all_o, aes(x=`ECONOMY ACTIVE`)) +
geom_histogram(bins=20, color="black", fill="light blue")
agedo <- ggplot(data=mpsz3414_all_o, aes(x=`AGED`)) +
geom_histogram(bins=20, color="black", fill="light blue")
popo <- ggplot(data=mpsz3414_all_o, aes(x=`Pop`)) +
geom_histogram(bins=20, color="black", fill="light blue")
hdb1o <- ggplot(data=mpsz3414_all_o, aes(x=`HDB1-2RM dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
hdb3o <- ggplot(data=mpsz3414_all_o, aes(x=`HDB3-4RM dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
hdb5o <- ggplot(data=mpsz3414_all_o, aes(x=`HDB5RM-Ec dweller`)) +
geom_histogram(bins=20, color="black", fill="light blue")
condoo <- ggplot(data=mpsz3414_all_o, aes(x=`Condominium and apartment dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
landedo <- ggplot(data=mpsz3414_all_o, aes(x=`Landed property dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
ggarrange(youngo,econo,agedo,popo,hdb1o, hdb3o,hdb5o,condoo,landedo,i,
ncol = 3,
nrow = 4)
For analysis purpose, we split the socio economic factors and the urban functions
socio <- mpsz3414_all_o %>% select(`SUBZONE_N`,`YOUNG`,`ECONOMY ACTIVE`,`AGED`,`HDB1-2RM dwellers`,`HDB3-4RM dwellers`,`HDB5RM-Ec dweller`,`Condominium and apartment dwellers`,`Landed property dwellers`,`Pop_Density`)
urban <- mpsz3414_all_o %>% select(`Financial`,`GovtEmbassy`,`Private`,`Shopping`,`Business`,`Industry`)
To perform correlation analyss, we will need to remove the geometry.
socio_df <- socio%>% st_set_geometry(NULL)
cluster_vars.cor = cor(socio_df[2:10])
corrplot.mixed(cluster_vars.cor,
lower = "ellipse",
upper = "number",
tl.pos = "lt",
diag = "l",
tl.col = "black",width=10, height=4,number.cex = .55)
As seen from abovE: 1. Young has a high correlation with economy active of 0.99 2. Young and HDB 3-4 have a high correlation of 0.91 3. Young and HDB5RM/EC have a high correlation of 0.9242. Economy active and HDB3-4RM have a high correlation of 0.94 4. Economy active and HDB5RM-EC have a high correlation of 0.89 5. Aged has a high correlation of 0.91 with economy active of 0.91 6. Aged and HDB 3-4 RM have a high correlation of 0.91
Based on the high corrrelations, we can see that Economy active people who stays with their children (YOUNG) usually stay in HDB3-4 room or HDB 5 room/Executive condo. Economy active people who stay with their parents (AGED) usuallt stay in HDB 3-4 room. There are also cases where the YOUNG stay with AGED which could be the case that children stay with both their parents(ECONOMY ACTIVE) and grandparents (AGED), o either or. Based on the correlations, ECONOMY ACTIVE has the highest correlation with YOUNG, then ECONOMY ACTIVE and AGED,then AGED and YOUNG. Hence, we will first drop ECONOMY ACTIVE.
After dropping economy active, we are left with 3 high correlations, YOUNG and AGED, YOUNG and HDB3-4 RM, AGED and HDB3-4 RM. As discussed earlier, all 3 age groups have a high correlation with HDB3-4 rooms as families with the different age group tend to stay in HDB 3-4 rooms. Hence, we will remove HDB3-4 RM so that we are able to seperate the age groups.
Between AGED and YOUNG, we decide to drop YOUNG as YOUNG.Apart from the explanations apart, we also drop the following variales due to the it having the highest correlation. again in this case,YOUNG has the highest correlation with other variables as compared to AGED.
new_socio <- subset(socio_df, select = -c(`ECONOMY ACTIVE`,`HDB3-4RM dwellers`,`YOUNG`))
cluster_vars.cor = cor(new_socio[2:7])
corrplot.mixed(cluster_vars.cor,
lower = "ellipse",
upper = "number",
tl.pos = "lt",
diag = "l",
tl.col = "black",width=10, height=4,number.cex = .55)
In this section, you will learn how to perform hierarchical cluster analysis. The analysis consists of four major steps:
cluster_vars <- new_socio[1:7]
head(cluster_vars,10)
## SUBZONE_N AGED HDB1-2RM dwellers HDB5RM-Ec dweller
## 1 PEARL'S HILL 2110 4520 0
## 2 BOAT QUAY 20 0 0
## 3 HENDERSON HILL 3260 3930 1270
## 4 REDHILL 1630 1970 3240
## 5 ALEXANDRA HILL 3310 3910 3120
## 6 BUKIT HO SWEE 3590 3700 1900
## 7 CLARKE QUAY 10 0 0
## 8 PASIR PANJANG 1 560 0 0
## 9 QUEENSWAY 50 0 0
## 10 ALEXANDRA NORTH 120 0 0
## Condominium and apartment dwellers Landed property dwellers
## 1 420 0
## 2 50 0
## 3 190 0
## 4 1930 0
## 5 0 0
## 6 540 0
## 7 10 0
## 8 2970 1430
## 9 290 0
## 10 2120 0
## Pop_Density
## 1 0.0118431719
## 2 0.0004353031
## 3 0.0224711973
## 4 0.0276953656
## 5 0.0133737225
## 6 0.0267883659
## 7 0.0002067649
## 8 0.0040837310
## 9 0.0004591192
## 10 0.0072180927
Next, we need to change the rows by township name instead of row number by using the code chunk below
row.names(cluster_vars) <- cluster_vars$"SUBZONE_N"
head(cluster_vars,10)
## SUBZONE_N AGED HDB1-2RM dwellers HDB5RM-Ec dweller
## PEARL'S HILL PEARL'S HILL 2110 4520 0
## BOAT QUAY BOAT QUAY 20 0 0
## HENDERSON HILL HENDERSON HILL 3260 3930 1270
## REDHILL REDHILL 1630 1970 3240
## ALEXANDRA HILL ALEXANDRA HILL 3310 3910 3120
## BUKIT HO SWEE BUKIT HO SWEE 3590 3700 1900
## CLARKE QUAY CLARKE QUAY 10 0 0
## PASIR PANJANG 1 PASIR PANJANG 1 560 0 0
## QUEENSWAY QUEENSWAY 50 0 0
## ALEXANDRA NORTH ALEXANDRA NORTH 120 0 0
## Condominium and apartment dwellers
## PEARL'S HILL 420
## BOAT QUAY 50
## HENDERSON HILL 190
## REDHILL 1930
## ALEXANDRA HILL 0
## BUKIT HO SWEE 540
## CLARKE QUAY 10
## PASIR PANJANG 1 2970
## QUEENSWAY 290
## ALEXANDRA NORTH 2120
## Landed property dwellers Pop_Density
## PEARL'S HILL 0 0.0118431719
## BOAT QUAY 0 0.0004353031
## HENDERSON HILL 0 0.0224711973
## REDHILL 0 0.0276953656
## ALEXANDRA HILL 0 0.0133737225
## BUKIT HO SWEE 0 0.0267883659
## CLARKE QUAY 0 0.0002067649
## PASIR PANJANG 1 1430 0.0040837310
## QUEENSWAY 0 0.0004591192
## ALEXANDRA NORTH 0 0.0072180927
Notice that the row number has been replaced into the Subzone name.
Now, we will delete the Subzone field by using the code chunk below.
subzone_var <- select(cluster_vars, c(2:7))
head(subzone_var, 10)
## AGED HDB1-2RM dwellers HDB5RM-Ec dweller
## PEARL'S HILL 2110 4520 0
## BOAT QUAY 20 0 0
## HENDERSON HILL 3260 3930 1270
## REDHILL 1630 1970 3240
## ALEXANDRA HILL 3310 3910 3120
## BUKIT HO SWEE 3590 3700 1900
## CLARKE QUAY 10 0 0
## PASIR PANJANG 1 560 0 0
## QUEENSWAY 50 0 0
## ALEXANDRA NORTH 120 0 0
## Condominium and apartment dwellers
## PEARL'S HILL 420
## BOAT QUAY 50
## HENDERSON HILL 190
## REDHILL 1930
## ALEXANDRA HILL 0
## BUKIT HO SWEE 540
## CLARKE QUAY 10
## PASIR PANJANG 1 2970
## QUEENSWAY 290
## ALEXANDRA NORTH 2120
## Landed property dwellers Pop_Density
## PEARL'S HILL 0 0.0118431719
## BOAT QUAY 0 0.0004353031
## HENDERSON HILL 0 0.0224711973
## REDHILL 0 0.0276953656
## ALEXANDRA HILL 0 0.0133737225
## BUKIT HO SWEE 0 0.0267883659
## CLARKE QUAY 0 0.0002067649
## PASIR PANJANG 1 1430 0.0040837310
## QUEENSWAY 0 0.0004591192
## ALEXANDRA NORTH 0 0.0072180927
As the data range is very big, we perform standardization.
subzone_var.std <- normalize(subzone_var)
summary(subzone_var.std)
## AGED HDB1-2RM dwellers HDB5RM-Ec dweller
## Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.02505 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.10101 Median :0.0000 Median :0.03857
## Mean :0.13217 Mean :0.1592 Mean :0.09490
## 3rd Qu.:0.18810 3rd Qu.:0.2745 3rd Qu.:0.11864
## Max. :1.00000 Max. :1.0000 Max. :1.00000
## Condominium and apartment dwellers Landed property dwellers
## Min. :0.000000 Min. :0.00000
## 1st Qu.:0.003131 1st Qu.:0.00000
## Median :0.076625 Median :0.00000
## Mean :0.150378 Mean :0.05676
## 3rd Qu.:0.240161 3rd Qu.:0.04716
## Max. :1.000000 Max. :1.00000
## Pop_Density
## Min. :0.0000
## 1st Qu.:0.0881
## Median :0.2603
## Mean :0.3217
## 3rd Qu.:0.5299
## Max. :1.0000
popd <- ggplot(data=subzone_var.std, aes(x=`Pop_Density`)) +
geom_histogram(bins=20, color="black", fill="light blue")
hdb1d <- ggplot(data=subzone_var.std, aes(x=`HDB1-2RM dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
AGEDd <- ggplot(data=subzone_var.std, aes(x=`AGED`)) +
geom_histogram(bins=20, color="black", fill="light blue")
hdb5d <- ggplot(data=subzone_var.std, aes(x=`HDB5RM-Ec dweller`)) +
geom_histogram(bins=20, color="black", fill="light blue")
condod <- ggplot(data=subzone_var.std, aes(x=`Condominium and apartment dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
landedd <- ggplot(data=subzone_var.std, aes(x=`Landed property dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
ggarrange(popd,hdb1d,AGEDd,hdb5d,condod,landedd,
ncol = 3,
nrow = 2)
As seen from above, even after standardizing the data, it is right skewed. Hence, we need to perform Transformation.
subzone_var.log <- log(subzone_var.std)
d <- ggplot(data=subzone_var.log, aes(x=`Pop_Density`)) +
geom_histogram(bins=20, color="black", fill="light blue")
e <- ggplot(data=subzone_var.log, aes(x=`HDB1-2RM dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
f <- ggplot(data=subzone_var.log, aes(x=`AGED`)) +
geom_histogram(bins=20, color="black", fill="light blue")
g <- ggplot(data=subzone_var.log, aes(x=`HDB5RM-Ec dweller`)) +
geom_histogram(bins=20, color="black", fill="light blue")
h <- ggplot(data=subzone_var.log, aes(x=`Condominium and apartment dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
i <- ggplot(data=subzone_var.log, aes(x=`Landed property dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")
ggarrange(d,e,f,g,h,i,
ncol = 3,
nrow = 2)
After performing transformation, there may be NA/Infinite values due to 0 values being transformed. Hence, we need to check for any and replace these data with 0.
subzone_var.log[rowSums(is.na(subzone_var.log))!=0,]
## [1] AGED HDB1-2RM dwellers
## [3] HDB5RM-Ec dweller Condominium and apartment dwellers
## [5] Landed property dwellers Pop_Density
## <0 rows> (or 0-length row.names)
indx <- apply(subzone_var.log, 2, function(x) any(is.na(x) | is.infinite(x)))
indx
## AGED HDB1-2RM dwellers
## TRUE TRUE
## HDB5RM-Ec dweller Condominium and apartment dwellers
## TRUE TRUE
## Landed property dwellers Pop_Density
## TRUE TRUE
subzone_var.log[is.na(subzone_var.log)] = 0
subzone_var.replaced <- subzone_var.log %>% mutate_if(is.numeric, function(x) ifelse(is.infinite(x), 0, x))
In R, many packages provide functions to calculate distance matrix. We will compute the proximity matrix by using dist() of R.
dist() supports six distance proximity calculations, they are: euclidean, maximum, manhattan, canberra, binary and minkowski. The default is euclidean proximity matrix.
The code chunk below is used to compute the proximity matrix using euclidean method.
proxmat <- dist(subzone_var.replaced, method = 'euclidean')
One of the challenge in performing hierarchical clustering is to identify stronger clustering structures. The issue can be solved by using use agnes() function of cluster package. It functions like hclus(), however, with the agnes() function you can also get the agglomerative coefficient, which measures the amount of clustering structure found (values closer to 1 suggest strong clustering structure).
The code chunk below will be used to compute the agglomerative coefficients of all hierarchical clustering algorithms.
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
ac <- function(x) {
agnes(subzone_var.replaced, method = x)$ac
}
map_dbl(m, ac)
## average single complete ward
## 0.8515852 0.7304796 0.9113168 0.9699733
With reference to the output above, we can see that Ward’s method provides the strongest clustering structure among the four methods assessed. Hence, in the subsequent analysis, only Ward’s method will be used.
The code chunk below performs hierarchical cluster analysis using ward.D method. The hierarchical clustering output is stored in an object of class hclust which describes the tree produced by the clustering process.
hclust_ward <- hclust(proxmat, method = 'ward.D')
We can then plot the tree by using plot() of R Graphics as shown in the code chunk below.
plot(hclust_ward, cex = 0.6)
There are various techniques to use to identify the optimal number of clusters such as Silhouette Coefficient, or from the dendrogram, a horizontal line that can traverse the maximum distance vertically without intersecting a cluster is drawn and the number of vertical lines in the dendrogram cut by this line represents the optimal number of clusters. In our case, we will use that method and the elbow method. The elbow method looks at the percentage of variance explained as a function of the number of clusters: One should choose a number of clusters so that adding another cluster doesn’t give much better modeling of the data.
Next, we will use the elbow method to determine how many clusters we should use.
fviz_nbclust(subzone_var.replaced, FUN = hcut, method = "wss")
As seen above, 2 has an elbow. However,we decided to try with more clusters as there were more clusters found while doing analysis with 2 clusters. Hence, we tried 3-6 clusters as well. Of those, 5 clusters was the most suitable.
plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k =5, border = 2:5)
In this section, we will learn how to perform visually-driven hiearchical clustering analysis by using heatmaply package.
With heatmaply, we are able to build both highly interactive cluster heatmap or static cluster heatmap.
The code chunk below will be used to transform subzone_var.replaced data frame into a data matrix.
subzone_var_mat <- data.matrix(subzone_var.replaced)
In the code chunk below, the heatmaply() is used to build an interactive cluster heatmap.
heatmaply(subzone_var_mat,
Colv=NA,
dist_method = "euclidean",
hclust_method = "ward.D",
seriate = "OLO",
colors = Blues,
k_row = 5,
margins = c(NA,200,60,NA),
fontsize_row = 4,
fontsize_col = 5,
main="Geographic Segmentation of Subzones by Socio economic Indicators",
xlab = "Socio economic Indicators",
ylab = "Subzones of Singapore"
)
With closed examination of the dendragram above, we have decided to retain five clusters.
cutree() of R Base will be used in the code chunk below to derive a 3-cluster model.
groups <- as.factor(cutree(hclust_ward, k=5))
The output is called groups. It is a list object.
In order to visualise the clusters, the groups object need to be appended onto socio simple feature object.
The code chunk below form the join in three steps: 1. the groups list object will be converted into a matrix; 2. cbnd() is used to append groups matrix onto socio to produce an output simple feature object called sg_cluster; and 3. rename of dplyr package is used to rename as.matrix.groups field as CLUSTER.
sg_cluster <- cbind(socio, as.matrix(groups)) %>%
rename(`CLUSTER`=`as.matrix.groups.`)
Next, qtm() of tmap package is used to plot the choropleth map showing the cluster formed.
qtm(sg_cluster, "CLUSTER")
The choropleth map above reveals the clusters are very fragmented. The is one of the major limitation when non-spatial clustering algorithm such as hierarchical cluster analysis method is used.
In this section, you swill learn how to derive spatially constrained cluster by using SKATER method.
First, we need to convert socio into SpatialPolygonDataFrame. This is because SKATER function only support sp objects such as SpatialPolygonDataFrame.
The code chunk below uses as_Spatial() of sf package to convert socio into a SpatialPolygonDataFrame called sg_sp.
sg_sp <- as_Spatial(socio)
Next, poly2nd() of spdep package will be used to compute the neighbours list from polygon list.
sg.nb <- poly2nb(sg_sp)
summary(sg.nb)
## Neighbour list object:
## Number of regions: 234
## Number of nonzero links: 1198
## Percentage nonzero weights: 2.187888
## Average number of links: 5.119658
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9
## 4 10 25 36 69 41 33 11 5
## 4 least connected regions:
## 13 204 222 233 with 1 link
## 5 most connected regions:
## 59 66 71 121 131 with 9 links
We can plot the neighbours list on sg_sp by using the code chunk below. Since we now can plot the community area boundaries as well, we plot this graph on top of the map. The first plot command gives the boundaries. This is followed by the plot of the neighbor list object, with coordinates applied to the original SpatialPolygonDataFrame to extract the centroids of the polygons. These are used as the nodes for the graph representation. We also set the color to blue and specify add=TRUE to plot the network on top of the boundaries.
lcosts <- nbcosts(sg.nb, subzone_var)
sg.w <- nb2listw(sg.nb, lcosts, style="B")
summary(sg.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 234
## Number of nonzero links: 1198
## Percentage nonzero weights: 2.187888
## Average number of links: 5.119658
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9
## 4 10 25 36 69 41 33 11 5
## 4 least connected regions:
## 13 204 222 233 with 1 link
## 5 most connected regions:
## 59 66 71 121 131 with 9 links
##
## Weights style: B
## Weights constants summary:
## n nn S0 S1 S2
## B 234 54756 9043408 262731206400 2.560782e+12
The minimum spanning tree is computed by mean of the mstree() of spdep package as shown in the code chunk below.
sg.mst <- mstree(sg.w)
After computing the MST, we can check its class and dimension by using the code chunk below.
class(sg.mst)
## [1] "mst" "matrix"
dim(sg.mst)
## [1] 233 3
Note that the dimension is 233 and not 234. This is because the minimum spanning tree consists on n-1 edges (links) in order to traverse all the nodes.
We can display the content of sg.mst by using head() as shown in the code chunk below.
head(sg.mst)
## [,1] [,2] [,3]
## [1,] 179 133 3905.791
## [2,] 133 126 3353.595
## [3,] 126 107 2941.003
## [4,] 107 114 2884.614
## [5,] 107 127 3610.028
## [6,] 126 140 4078.848
The plot method for the MST include a way to show the observation numbers of the nodes in addition to the edge. As before, we plot this together with the township boundaries. We can see how the initial neighbour list is simplified to just one edge connecting each of the nodes, while passing through all the nodes.
plot(sg_sp, border=gray(.5))
plot.mst(sg.mst, coordinates(sg_sp),
col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)
The code chunk below compute the spatially constrained cluster using skater() of spdep package.
clust5 <- skater(sg.mst[,1:2], subzone_var, 4)
The skater() takes three mandatory arguments: - the first two columns of the MST matrix (i.e. not the cost), - the data matrix (to update the costs as units are being grouped), and - the number of cuts. Note: It is set to one less than the number of clusters. So, the value specified is not the number of clusters, but the number of cuts in the graph, one less than the number of custers.
The result of the skater() is an object of class skater. We can examine its contents by using the code chunk below.
str(clust5)
## List of 8
## $ groups : num [1:234] 4 4 4 4 4 4 4 4 4 4 ...
## $ edges.groups:List of 5
## ..$ :List of 3
## .. ..$ node: num [1:33] 107 126 195 179 133 124 168 164 141 153 ...
## .. ..$ edge: num [1:32, 1:3] 124 141 160 153 195 164 136 197 168 124 ...
## .. ..$ ssw : num 219752
## ..$ :List of 3
## .. ..$ node: num [1:4] 206 205 200 198
## .. ..$ edge: num [1:3, 1:3] 206 200 205 205 198 ...
## .. ..$ ssw : num 14097
## ..$ :List of 3
## .. ..$ node: num [1:86] 194 221 196 220 122 223 211 173 81 159 ...
## .. ..$ edge: num [1:85, 1:3] 221 220 122 223 211 81 232 122 159 219 ...
## .. ..$ ssw : num 678039
## ..$ :List of 3
## .. ..$ node: num [1:109] 144 89 86 83 43 42 17 40 46 9 ...
## .. ..$ edge: num [1:108, 1:3] 51 117 72 80 59 95 156 91 36 39 ...
## .. ..$ ssw : num 336728
## ..$ :List of 3
## .. ..$ node: num [1:2] 188 138
## .. ..$ edge: num [1, 1:3] 188 138 21956
## .. ..$ ssw : num 21956
## $ not.prune : NULL
## $ candidates : int [1:5] 1 2 3 4 5
## $ ssto : num 1623978
## $ ssw : num [1:5] 1623978 1514554 1428910 1329531 1270571
## $ crit : num [1:2] 1 Inf
## $ vec.crit : num [1:234] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "class")= chr "skater"
ccs5 <- clust5$groups
table(ccs5)
## ccs5
## 1 2 3 4 5
## 33 4 86 109 2
The code chunk below is used to plot the newly derived clusters by using SKATER method.
groups_mat <- as.matrix(clust5$groups)
sg_sf_spatialcluster <- cbind(sg_cluster, as.factor(groups_mat)) %>%
rename(`SP_CLUSTER`=`as.factor.groups_mat.`)
The clusters are then plotted in tmap.
tmap_mode("view")
tm_shape(sg_sf_spatialcluster)+
tm_borders(alpha = 0.5) +
tm_fill(col = "SP_CLUSTER", alpha = 0.9, id = "SUBZONE_N")
We can view the subzones in each cluster easily in the data table below
| SP_CLUSTER | |
|---|---|
| JURONG GATEWAY | 1 |
| TAMAN JURONG | 1 |
| LAKESIDE | 1 |
| YUHUA EAST | 1 |
| BUKIT BATOK SOUTH | 1 |
| JURONG WEST CENTRAL | 1 |
| TOH GUAN | 1 |
| TEBAN GARDENS | 1 |
| BOON LAY PLACE | 1 |
| BUKIT BATOK EAST | 1 |
| BUKIT BATOK WEST | 1 |
| BUKIT BATOK CENTRAL | 1 |
| HONG KAH | 1 |
| BRICKWORKS | 1 |
| GUILIN | 1 |
| WENYA | 1 |
| YUHUA WEST | 1 |
| YUNNAN | 1 |
| JELEBU | 1 |
| KEAT HONG | 1 |
| BANGKIT | 1 |
| PENG SIANG | 1 |
| TECK WHYE | 1 |
| CHOA CHU KANG CENTRAL | 1 |
| FAJAR | 1 |
| SENJA | 1 |
| SAUJANA | 1 |
| HONG KAH NORTH | 1 |
| GOMBAK | 1 |
| CHOA CHU KANG NORTH | 1 |
| YEW TEE | 1 |
| WESTERN WATER CATCHMENT | 1 |
| LIM CHU KANG | 1 |
| RIVERVALE | 2 |
| PUNGGOL FIELD | 2 |
| SENGKANG TOWN CENTRE | 2 |
| ANCHORVALE | 2 |
| TANJONG RHU | 3 |
| MOUNTBATTEN | 3 |
| SIGLAP | 3 |
| BAYSHORE | 3 |
| MARINE PARADE | 3 |
| BEDOK SOUTH | 3 |
| KEMBANGAN | 3 |
| KATONG | 3 |
| KAMPONG UBI | 3 |
| KAKI BUKIT | 3 |
| BEDOK RESERVOIR | 3 |
| JOO SENG | 3 |
| ALJUNIED | 3 |
| FRANKEL | 3 |
| GEYLANG EAST | 3 |
| UPPER PAYA LEBAR | 3 |
| SERANGOON CENTRAL | 3 |
| BISHAN EAST | 3 |
| TAMPINES WEST | 3 |
| MARYMOUNT | 3 |
| TAI SENG | 3 |
| LORONG CHUAN | 3 |
| MACPHERSON | 3 |
| BEDOK NORTH | 3 |
| HOUGANG EAST | 3 |
| KANGKAR | 3 |
| SEMBAWANG HILLS | 3 |
| HOUGANG WEST | 3 |
| XILIN | 3 |
| SIMEI | 3 |
| PASIR RIS WEST | 3 |
| YIO CHU KANG WEST | 3 |
| TRAFALGAR | 3 |
| CHANGI WEST | 3 |
| SELETAR HILLS | 3 |
| COMPASSVALE | 3 |
| YIO CHU KANG EAST | 3 |
| LOYANG WEST | 3 |
| TAGORE | 3 |
| LORONG AH SOO | 3 |
| FLORA DRIVE | 3 |
| UPPER THOMSON | 3 |
| TOWNSVILLE | 3 |
| KOVAN | 3 |
| CHONG BOON | 3 |
| SHANGRI-LA | 3 |
| SERANGOON GARDEN | 3 |
| HOUGANG CENTRAL | 3 |
| LOYANG EAST | 3 |
| TAMPINES NORTH | 3 |
| CHENG SAN | 3 |
| ANG MO KIO TOWN CENTRE | 3 |
| KEBUN BAHRU | 3 |
| SERANGOON NORTH | 3 |
| PASIR RIS CENTRAL | 3 |
| PASIR RIS PARK | 3 |
| FERNVALE | 3 |
| MATILDA | 3 |
| WATERWAY EAST | 3 |
| CHANGI POINT | 3 |
| YISHUN SOUTH | 3 |
| LOWER SELETAR | 3 |
| NORTHSHORE | 3 |
| MANDAI ESTATE | 3 |
| YISHUN CENTRAL | 3 |
| TURF CLUB | 3 |
| WOODLANDS SOUTH | 3 |
| WOODGROVE | 3 |
| YISHUN EAST | 3 |
| YISHUN WEST | 3 |
| WOODLANDS EAST | 3 |
| SEMBAWANG CENTRAL | 3 |
| SEMBAWANG EAST | 3 |
| ADMIRALTY | 3 |
| SEMBAWANG NORTH | 3 |
| NORTHLAND | 3 |
| MIDVIEW | 3 |
| WOODLANDS WEST | 3 |
| SEMBAWANG SPRINGS | 3 |
| SPRINGLEAF | 3 |
| PUNGGOL TOWN CENTRE | 3 |
| NEE SOON | 3 |
| SELETAR | 3 |
| KHATIB | 3 |
| NORTH COAST | 3 |
| SEMBAWANG STRAITS | 3 |
| PEARL’S HILL | 4 |
| BOAT QUAY | 4 |
| HENDERSON HILL | 4 |
| REDHILL | 4 |
| ALEXANDRA HILL | 4 |
| BUKIT HO SWEE | 4 |
| CLARKE QUAY | 4 |
| PASIR PANJANG 1 | 4 |
| QUEENSWAY | 4 |
| ALEXANDRA NORTH | 4 |
| INSTITUTION HILL | 4 |
| ROBERTSON QUAY | 4 |
| SENTOSA | 4 |
| MARITIME SQUARE | 4 |
| TELOK BLANGAH WAY | 4 |
| CECIL | 4 |
| KAMPONG TIONG BAHRU | 4 |
| TELOK BLANGAH DRIVE | 4 |
| PASIR PANJANG 2 | 4 |
| CENTRAL SUBZONE | 4 |
| DEPOT ROAD | 4 |
| PEOPLE’S PARK | 4 |
| BUKIT MERAH | 4 |
| CHINATOWN | 4 |
| CHINA SQUARE | 4 |
| TIONG BAHRU | 4 |
| TIONG BAHRU STATION | 4 |
| OXLEY | 4 |
| MEI CHIN | 4 |
| NATIONAL UNIVERSITY OF S’PORE | 4 |
| ONE TREE HILL | 4 |
| SOMERSET | 4 |
| BENCOOLEN | 4 |
| LEONIE HILL | 4 |
| PORT | 4 |
| DHOBY GHAUT | 4 |
| BUGIS | 4 |
| VICTORIA | 4 |
| PATERSON | 4 |
| TELOK BLANGAH RISE | 4 |
| TANJONG PAGAR | 4 |
| EVERTON PARK | 4 |
| TANGLIN HALT | 4 |
| MACKENZIE | 4 |
| SUNGEI ROAD | 4 |
| ONE NORTH | 4 |
| COMMONWEALTH | 4 |
| DOVER | 4 |
| RIDOUT | 4 |
| CAIRNHILL | 4 |
| CLEMENTI WEST | 4 |
| MONK’S HILL | 4 |
| CLEMENTI WOODS | 4 |
| ORANGE GROVE | 4 |
| KAMPONG BUGIS | 4 |
| BOULEVARD | 4 |
| LITTLE INDIA | 4 |
| FORT CANNING | 4 |
| FARRER COURT | 4 |
| NASSIM | 4 |
| WEST COAST | 4 |
| CHATSWORTH | 4 |
| KAMPONG GLAM | 4 |
| SELEGIE | 4 |
| MOUNT EMILY | 4 |
| CRAWFORD | 4 |
| MARGARET DRIVE | 4 |
| TANGLIN | 4 |
| GEYLANG BAHRU | 4 |
| FABER | 4 |
| MALCOLM | 4 |
| BENDEMEER | 4 |
| BALESTIER | 4 |
| CORONATION ROAD | 4 |
| HOLLAND DRIVE | 4 |
| FARRER PARK | 4 |
| NEWTON CIRCUS | 4 |
| GHIM MOH | 4 |
| LAVENDER | 4 |
| GOODWOOD PARK | 4 |
| SINGAPORE POLYTECHNIC | 4 |
| CLEMENTI CENTRAL | 4 |
| KAMPONG JAVA | 4 |
| BOON KENG | 4 |
| ULU PANDAN | 4 |
| HOLLAND ROAD | 4 |
| SENNETT | 4 |
| POTONG PASIR | 4 |
| PEI CHUN | 4 |
| BOON TECK | 4 |
| WOODLEIGH | 4 |
| TOA PAYOH WEST | 4 |
| MOUNT PLEASANT | 4 |
| HILLCREST | 4 |
| LORONG 8 TOA PAYOH | 4 |
| BRADDELL | 4 |
| TYERSALL | 4 |
| MOULMEIN | 4 |
| CLEMENTI NORTH | 4 |
| LEEDON PARK | 4 |
| NATURE RESERVE | 4 |
| DUNEARN | 4 |
| SUNSET WAY | 4 |
| KIM KEAT | 4 |
| TOA PAYOH CENTRAL | 4 |
| ANAK BUKIT | 4 |
| SWISS CLUB | 4 |
| HILLVIEW | 4 |
| DAIRY FARM | 4 |
| TAMPINES EAST | 5 |
| PASIR RIS DRIVE | 5 |
Before we analyse, we join back the data to the urban function data. We do a count of rows after joining to ensure that the number of rows is 234.
socio_urban <- st_join(sg_sf_spatialcluster,urban,join = st_contains)
nrow(socio_urban)
## [1] 234
head(socio_urban)
## Simple feature collection with 6 features and 18 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 24468.89 ymin: 29357.01 xmax: 29947.32 ymax: 30567.91
## epsg (SRID): 3414
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +units=m +no_defs
## SUBZONE_N YOUNG ECONOMY.ACTIVE AGED HDB1.2RM.dwellers
## 1 PEARL'S HILL 1100 3420 2110 4520
## 2 BOAT QUAY 0 50 20 0
## 3 HENDERSON HILL 2620 7500 3260 3930
## 4 REDHILL 2840 6260 1630 1970
## 5 ALEXANDRA HILL 2910 7560 3310 3910
## 6 BUKIT HO SWEE 2850 8340 3590 3700
## HDB3.4RM.dwellers HDB5RM.Ec.dweller Condominium.and.apartment.dwellers
## 1 1400 0 420
## 2 0 0 50
## 3 7990 1270 190
## 4 3520 3240 1930
## 5 6320 3120 0
## 6 8610 1900 540
## Landed.property.dwellers Pop_Density CLUSTER SP_CLUSTER Financial
## 1 0 0.0118431719 1 4 24
## 2 0 0.0004353031 2 4 2
## 3 0 0.0224711973 3 4 4
## 4 0 0.0276953656 3 4 12
## 5 0 0.0133737225 3 4 14
## 6 0 0.0267883659 3 4 6
## GovtEmbassy Private Shopping Business Industry
## 1 1 6 3 6 0
## 2 1 1 1 39 0
## 3 0 5 0 0 0
## 4 0 6 1 1 0
## 5 5 4 1 39 1
## 6 3 11 1 6 0
## geometry
## 1 MULTIPOLYGON (((29092.28 30...
## 2 MULTIPOLYGON (((29932.33 29...
## 3 MULTIPOLYGON (((27131.28 30...
## 4 MULTIPOLYGON (((26451.03 30...
## 5 MULTIPOLYGON (((25899.7 297...
## 6 MULTIPOLYGON (((27746.95 30...
ggplot(data=socio_urban, aes(x=`Pop_Density`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
Cluster 2 subzones have a relatively high population density of between 0.03 to 0.04. Cluster 1 ranges widely from 0.0 to 0.05. cluster 3 ranges from 0.0 to 0.04. Cluster 4 forms the biggest cluster which most of them have a lower density ranging from 0 to 0.03. Cluster 3 has a wide range from 0 to 0.03. Cluster 5 has a moderate to slightly higher population density of between 0.03 to 0.035.
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="Pop_Density",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "quantile",size="Pop_Density")
By looking at the map, we can see that cluster 1 generally have high population density with the exception of water catchment and Lim Chu Kang. All subzones in cluster 4 have a high population density as well. In cluster 4, there are a few that have high population density, with those around one another. One of the highly populated clusters in cluster 4 includes Tiong Bahru Station, Tiong Bahru, Telok Blangah Way, Redhill, Henderson Hill, Kampong Tiong Bahru, Telok Blangah Rise and Bukit Ho Swee. Another is Boon Teck, Toa Payoh Central, Balestier, Braddell, Lorong 8 Toa Payoh, Pei Chun, Kim Keat,Potong Pasir,Beedemeer and Balestier. Cluster 3 has a wide range of population density as seen in the barchart earlier and in the map as well. Areas neaer cluster 2 and also up north in Singapore have a high population density. Cluster 5 have a high population density.
ggplot(data=socio_urban, aes(x=`YOUNG`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="YOUNG",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="YOUNG")
As we can see from the map, Cluster 1 has a moderate number of YOUNG across the subzones. Cluster 2 subzones have a high number of YOUNG group. Cluster 3 has a moderate number of YOUNG across the subzones. Cluster 5 have a high number of YOUNG group. Cluster 4 generally has low number of YOUNG group.
ggplot(data=socio_urban, aes(x=`AGED`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="AGED",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="AGED")
Cluster 1 less to moderate number of AGED people. Cluster 2 has a moderate, lesser proportion as compared to the YOUNG. Cluser 3 varies with a low to moderate number of AGED people. Cluster 3 has a moderate to high AGED Group and cluster 4 has a low number of aged in general. Cluster 5 have moderate to high number of AGED people.
ggplot(data=socio_urban, aes(x=`ECONOMY.ACTIVE`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="ECONOMY.ACTIVE",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="ECONOMY.ACTIVE",size.lim=c(1))
There are more ECONOMY ACTIVE people as compared to the other 2 groups. Cluster 1 has a moderate to high number of economy active people. Cluster 2 has a higher number of ECONOMY ACTIVE people. Cluster 3 varies, with mostly low to moderate. Cluster 4 has the lowest number of economy active people as compared to the other clutsers. This is due to the low population density in cluster 4.Cluster 5 have moderate to high number of economy active people.
ggplot(data=socio_urban, aes(x=`HDB1.2RM.dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="HDB1.2RM.dwellers",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="HDB1.2RM.dwellers",size.lim=c(1))
As compared to the other housing types, HDB1-2 rooms in Singapore. Cluster 1 and Cluster 3 have a wide range, cluster 2 has a low umber of HDB1-2 rooms, cluster 4 are on extreme ends, with mostly having a low number with the exception of one group - Pearl’s Hill, Aleandra Hill, Boat Quay and Redhill. CLuster 4 has a wide range of opposite ends as well.
ggplot(data=socio_urban, aes(x=`HDB3.4RM.dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="HDB3.4RM.dwellers",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="HDB3.4RM.dwellers",size.lim=c(1))
CLuster 1 and 2 have a moderate to high number of HDB3-4RM . Cluster 3 varies and cluster 4 has a low number of HDB3-4RM. Cluster 5 have a moderate to high number of HDB3-4 RM.
ggplot(data=socio_urban, aes(x=`HDB5RM.Ec.dweller`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="HDB5RM.Ec.dweller",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="HDB5RM.Ec.dweller",size.lim=c(1))
Cluster 1 have moderate to high number of HDB5 room/EC. Cluster 2 have a high number of HDB5 room/EC. Cluster 3 generally has lower with the exception of Woodlands East. Cluster 4 have the lowest number of HDB5room/EC overall. Cluster 5 have a high number of HDB5RM/EC dwellers.
ggplot(data=socio_urban, aes(x=`Condominium.and.apartment.dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="Condominium.and.apartment.dwellers",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Condominium.and.apartment.dwellers",size.lim=c(1))
Cluster 1 and 2 have low to moderate number of condominium/apartments, cluster 3 have a generally high number of condominium/apartments in the East. Cluster 4 have a low to moderate number of condominium/apartments. Cluster 5 has a relatively lower number of condominium/apartments.
ggplot(data=socio_urban, aes(x=`Landed.property.dwellers`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="Landed.property.dwellers",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Landed.property.dwellers",size.lim=c(1))
## Warning: The shape socio_urban is invalid. See sf::st_is_valid
## Legend for symbol sizes not available in view mode.
Cluster 1 and 2 have relatively low numbers in Landed property, cluster 3 subzones have the highest number as compare to the others. cluster 4 in general have a low to moderae number of landed property. Cluster 5 has no landed property dwellers.
Filter by cluster
cluster1 <- socio_urban %>%
filter(SP_CLUSTER==1)
cluster2 <- socio_urban %>%
filter(SP_CLUSTER==2)
cluster3 <- socio_urban %>%
filter(SP_CLUSTER==3)
cluster4 <- socio_urban %>%
filter(SP_CLUSTER==4)
cluster5 <- socio_urban %>%
filter(SP_CLUSTER==5)
summary(cluster1)
## SUBZONE_N YOUNG ECONOMY.ACTIVE AGED
## Length:33 Min. : 20 Min. : 40 Min. : 10
## Class :character 1st Qu.: 3640 1st Qu.: 8960 1st Qu.:1790
## Mode :character Median : 6100 Median :13680 Median :3030
## Mean : 6726 Mean :14576 Mean :2947
## 3rd Qu.: 8610 3rd Qu.:19120 3rd Qu.:3710
## Max. :19400 Max. :41080 Max. :8740
## HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
## Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 6870 1st Qu.: 3990
## Median : 30.0 Median :11350 Median : 6300
## Mean : 680.6 Mean :12550 Mean : 8601
## 3rd Qu.:1330.0 3rd Qu.:16220 3rd Qu.:11720
## Max. :3700.0 Max. :35950 Max. :38700
## Condominium.and.apartment.dwellers Landed.property.dwellers
## Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0
## Median :1200 Median : 0
## Mean :1987 Mean : 337
## 3rd Qu.:3100 3rd Qu.: 0
## Max. :8670 Max. :3590
## Pop_Density CLUSTER SP_CLUSTER Financial GovtEmbassy
## Min. :0.000004 1: 0 1:33 Min. : 0.00 Min. :0.0000
## 1st Qu.:0.012054 2: 3 2: 0 1st Qu.: 5.00 1st Qu.:0.0000
## Median :0.028496 3:22 3: 0 Median : 6.00 Median :0.0000
## Mean :0.023712 4: 1 4: 0 Mean :10.48 Mean :0.6364
## 3rd Qu.:0.032726 5: 7 5: 0 3rd Qu.:14.00 3rd Qu.:1.0000
## Max. :0.046058 Max. :55.00 Max. :6.0000
## Private Shopping Business Industry
## Min. : 0.000 Min. :0 Min. : 0.000 Min. :0.0000
## 1st Qu.: 2.000 1st Qu.:0 1st Qu.: 0.000 1st Qu.:0.0000
## Median : 3.000 Median :1 Median : 0.000 Median :0.0000
## Mean : 3.879 Mean :1 Mean : 5.182 Mean :0.0303
## 3rd Qu.: 5.000 3rd Qu.:2 3rd Qu.: 3.000 3rd Qu.:0.0000
## Max. :14.000 Max. :5 Max. :45.000 Max. :1.0000
## geometry
## MULTIPOLYGON :33
## epsg:3414 : 0
## +proj=tmer...: 0
##
##
##
summary(cluster2)
## SUBZONE_N YOUNG ECONOMY.ACTIVE AGED
## Length:4 Min. :15250 Min. :27740 Min. :3620
## Class :character 1st Qu.:15768 1st Qu.:28235 1st Qu.:4265
## Mode :character Median :16870 Median :32185 Median :5180
## Mean :16838 Mean :32383 Mean :5102
## 3rd Qu.:17940 3rd Qu.:36333 3rd Qu.:6018
## Max. :18360 Max. :37420 Max. :6430
## HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
## Min. : 450 Min. :13810 Min. :22650
## 1st Qu.: 855 1st Qu.:16413 1st Qu.:26820
## Median :1330 Median :19635 Median :28295
## Mean :1198 Mean :19468 Mean :27865
## 3rd Qu.:1672 3rd Qu.:22690 3rd Qu.:29340
## Max. :1680 Max. :24790 Max. :32220
## Condominium.and.apartment.dwellers Landed.property.dwellers
## Min. :3710 Min. : 0
## 1st Qu.:4678 1st Qu.: 0
## Median :5205 Median :125
## Mean :5550 Mean :125
## 3rd Qu.:6078 3rd Qu.:250
## Max. :8080 Max. :250
## Pop_Density CLUSTER SP_CLUSTER Financial GovtEmbassy
## Min. :0.03109 1:0 1:0 Min. : 1.0 Min. :0.00
## 1st Qu.:0.03433 2:0 2:4 1st Qu.: 8.5 1st Qu.:0.00
## Median :0.03689 3:2 3:0 Median :11.5 Median :0.00
## Mean :0.03681 4:0 4:0 Mean :12.0 Mean :0.75
## 3rd Qu.:0.03937 5:2 5:0 3rd Qu.:15.0 3rd Qu.:0.75
## Max. :0.04236 Max. :24.0 Max. :3.00
## Private Shopping Business Industry geometry
## Min. : 9.0 Min. :0.00 Min. :0.0 Min. :0 MULTIPOLYGON :4
## 1st Qu.: 9.0 1st Qu.:0.75 1st Qu.:0.0 1st Qu.:0 epsg:3414 :0
## Median : 9.5 Median :1.50 Median :0.5 Median :0 +proj=tmer...:0
## Mean :10.0 Mean :1.50 Mean :1.5 Mean :0
## 3rd Qu.:10.5 3rd Qu.:2.25 3rd Qu.:2.0 3rd Qu.:0
## Max. :12.0 Max. :3.00 Max. :5.0 Max. :0
summary(cluster3)
## SUBZONE_N YOUNG ECONOMY.ACTIVE AGED
## Length:86 Min. : 0 Min. : 10 Min. : 20.0
## Class :character 1st Qu.: 2098 1st Qu.: 4682 1st Qu.: 987.5
## Mode :character Median : 5485 Median :13080 Median : 2960.0
## Mean : 6317 Mean :14102 Mean : 3357.8
## 3rd Qu.: 8695 3rd Qu.:20418 3rd Qu.: 5340.0
## Max. :31690 Max. :59340 Max. :16050.0
## HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
## Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0
## Median : 190.0 Median :10355 Median : 3795
## Mean : 930.3 Mean :11444 Mean : 5662
## 3rd Qu.:1437.5 3rd Qu.:16185 3rd Qu.: 9208
## Max. :4700.0 Max. :56850 Max. :41850
## Condominium.and.apartment.dwellers Landed.property.dwellers
## Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0
## Median : 2695 Median : 405
## Mean : 3547 Mean : 1992
## 3rd Qu.: 6108 3rd Qu.: 2218
## Max. :14960 Max. :18820
## Pop_Density CLUSTER SP_CLUSTER Financial GovtEmbassy
## Min. :6.639e-05 1:10 1: 0 Min. : 0.00 Min. :0.0000
## 1st Qu.:5.996e-03 2: 0 2: 0 1st Qu.: 2.00 1st Qu.:0.0000
## Median :1.486e-02 3:35 3:86 Median : 8.00 Median :0.0000
## Mean :1.544e-02 4:12 4: 0 Mean :11.76 Mean :0.6395
## 3rd Qu.:2.313e-02 5:29 5: 0 3rd Qu.:15.00 3rd Qu.:1.0000
## Max. :3.944e-02 Max. :68.00 Max. :3.0000
## Private Shopping Business Industry
## Min. : 0.00 Min. :0.000 Min. : 0.0 Min. :0.0000
## 1st Qu.: 4.00 1st Qu.:0.000 1st Qu.: 0.0 1st Qu.:0.0000
## Median : 9.50 Median :1.000 Median : 1.0 Median :0.0000
## Mean : 19.17 Mean :1.233 Mean : 16.2 Mean :0.4302
## 3rd Qu.: 18.75 3rd Qu.:2.000 3rd Qu.: 8.5 3rd Qu.:0.0000
## Max. :215.00 Max. :8.000 Max. :224.0 Max. :5.0000
## geometry
## MULTIPOLYGON :86
## epsg:3414 : 0
## +proj=tmer...: 0
##
##
##
summary(cluster4)
## SUBZONE_N YOUNG ECONOMY.ACTIVE AGED
## Length:109 Min. : 0 Min. : 40 Min. : 0
## Class :character 1st Qu.: 250 1st Qu.: 730 1st Qu.: 170
## Mode :character Median :1330 Median : 3340 Median : 810
## Mean :1669 Mean : 4124 Mean :1390
## 3rd Qu.:2420 3rd Qu.: 6260 3rd Qu.:2310
## Max. :7930 Max. :21950 Max. :7210
## HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
## Min. : 0.0 Min. : 0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0.0
## Median : 0.0 Median : 0 Median : 0.0
## Mean : 574.9 Mean : 3206 Mean : 957.6
## 3rd Qu.: 540.0 3rd Qu.: 5930 3rd Qu.:1870.0
## Max. :4520.0 Max. :23090 Max. :7810.0
## Condominium.and.apartment.dwellers Landed.property.dwellers
## Min. : 0 Min. : 0.0
## 1st Qu.: 110 1st Qu.: 0.0
## Median : 770 Median : 0.0
## Mean : 1746 Mean : 615.2
## 3rd Qu.: 2460 3rd Qu.: 590.0
## Max. :16770 Max. :7600.0
## Pop_Density CLUSTER SP_CLUSTER Financial GovtEmbassy
## Min. :6.580e-06 1:26 1: 0 Min. : 0.00 Min. : 0.000
## 1st Qu.:2.672e-03 2:17 2: 0 1st Qu.: 3.00 1st Qu.: 0.000
## Median :7.248e-03 3:35 3: 0 Median : 7.00 Median : 1.000
## Mean :1.051e-02 4:15 4:109 Mean : 12.86 Mean : 1.954
## 3rd Qu.:1.690e-02 5:16 5: 0 3rd Qu.: 18.00 3rd Qu.: 3.000
## Max. :4.399e-02 Max. :132.00 Max. :17.000
## Private Shopping Business Industry
## Min. : 0.0 Min. : 0.000 Min. : 0.000 Min. :0.0000
## 1st Qu.: 3.0 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.:0.0000
## Median : 7.0 Median : 1.000 Median : 3.000 Median :0.0000
## Mean : 15.6 Mean : 2.257 Mean : 7.046 Mean :0.1101
## 3rd Qu.: 19.0 3rd Qu.: 3.000 3rd Qu.: 8.000 3rd Qu.:0.0000
## Max. :123.0 Max. :27.000 Max. :50.000 Max. :3.0000
## geometry
## MULTIPOLYGON :109
## epsg:3414 : 0
## +proj=tmer...: 0
##
##
##
summary(cluster5)
## SUBZONE_N YOUNG ECONOMY.ACTIVE AGED
## Length:2 Min. :15720 Min. :33080 Min. : 6480
## Class :character 1st Qu.:20355 1st Qu.:44755 1st Qu.: 9575
## Mode :character Median :24990 Median :56430 Median :12670
## Mean :24990 Mean :56430 Mean :12670
## 3rd Qu.:29625 3rd Qu.:68105 3rd Qu.:15765
## Max. :34260 Max. :79780 Max. :18860
## HDB1.2RM.dwellers HDB3.4RM.dwellers HDB5RM.Ec.dweller
## Min. : 830 Min. :22180 Min. :30500
## 1st Qu.:1698 1st Qu.:35385 1st Qu.:34865
## Median :2565 Median :48590 Median :39230
## Mean :2565 Mean :48590 Mean :39230
## 3rd Qu.:3432 3rd Qu.:61795 3rd Qu.:43595
## Max. :4300 Max. :75000 Max. :47960
## Condominium.and.apartment.dwellers Landed.property.dwellers
## Min. :1770 Min. :0
## 1st Qu.:2632 1st Qu.:0
## Median :3495 Median :0
## Mean :3495 Mean :0
## 3rd Qu.:4358 3rd Qu.:0
## Max. :5220 Max. :0
## Pop_Density CLUSTER SP_CLUSTER Financial GovtEmbassy
## Min. :0.03062 1:0 1:0 Min. :11 Min. :1.00
## 1st Qu.:0.03140 2:0 2:0 1st Qu.:28 1st Qu.:1.75
## Median :0.03217 3:2 3:0 Median :45 Median :2.50
## Mean :0.03217 4:0 4:0 Mean :45 Mean :2.50
## 3rd Qu.:0.03294 5:0 5:2 3rd Qu.:62 3rd Qu.:3.25
## Max. :0.03371 Max. :79 Max. :4.00
## Private Shopping Business Industry
## Min. : 3 Min. : 1.00 Min. : 0.00 Min. :0
## 1st Qu.: 9 1st Qu.: 3.25 1st Qu.: 3.75 1st Qu.:0
## Median :15 Median : 5.50 Median : 7.50 Median :0
## Mean :15 Mean : 5.50 Mean : 7.50 Mean :0
## 3rd Qu.:21 3rd Qu.: 7.75 3rd Qu.:11.25 3rd Qu.:0
## Max. :27 Max. :10.00 Max. :15.00 Max. :0
## geometry
## MULTIPOLYGON :2
## epsg:3414 :0
## +proj=tmer...:0
##
##
##
After perfoming analysis on the socio economic indicators, we can conclude the following of the 4 clusters respectively: Cluster 1: Moderate to High population density. Young children, working parents and elderly live together in HDB (inclusive of all types). Cluster 2: High population density. ECONOMY ACTIVE people living with their YOUNG children in HDB5RM.Ec.dweller Cluster 3: Low to Modeerate population density. Families with YOUNG children living in Condominium/Apartment or Landed Cluster 4: Low population density. Young couples (ECONOMY Active people staying by themselves) living in HDB1-2 Rooms or Condominium/Apartment or Landed Cluster 5: High population density. Have many ECONOMY ACTIVE, AGED and YOUNG people living in HDB (inclusive of all types)
FIrst we will plot the number of fiancial functions by cluster
ggplot(data=socio_urban, aes(x=`Financial`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
Next, we will plot the number of financial functions in each subzone on tmap, with their respective clusters being filled. Those with a value of 0 financial functions will not be shown. This goes the same for the charts below for the other urban functions.
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="Financial",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Financial",size.lim=c(1))
## Warning: The shape socio_urban is invalid. See sf::st_is_valid
## Legend for symbol sizes not available in view mode.
Most financial functions can be found in cluster 3, 4 and 5. As we remember, apart from cluster 3, Cluster 1 has many AGED people. They would also need access to financial functions and thus, more financial functions can be built in cluster 1.
govtembassy_plot <- ggplot(data=socio_urban, aes(x=`GovtEmbassy`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="GovtEmbassy",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="GovtEmbassy",size.lim=c(1))
Most of the goverment embassy are located inn cluster 4. This is mostly due to the locations being central.
ggplot(data=socio_urban, aes(x=`Private`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="Private",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Private",size.lim=c(1))
Most privatse spaces are found in cluster 3 and 4. With high population density in cluster 3. Hence with more demand and more opportunities to build, more private spaces (Upmarket residential area) can be built in cluster 3 subzones.
ggplot(data=socio_urban, aes(x=`Shopping`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="Shopping",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Shopping",size.lim=c(1))
Most shopping functions are located in the central and a few in cluster 3. However, there is a high population density in cluster 1 and cluster 2. Hence, more shopping functions can be opened in cluster 1 and 2 to create covenience for the people.
ggplot(data=socio_urban, aes(x=`Business`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="Business",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Business",size.lim=c(1))
Many buiness functions can be found in cluster 3. However, cluster 4 is a good area to increase more business functions as the population density is low and is accesible by other subzones easily (it being the central) and relatively low number of businesses found in cluster 4.
ggplot(data=socio_urban, aes(x=`Industry`)) +
geom_histogram(bins=20, color="black", fill="light blue")+
facet_grid(rows = vars(SP_CLUSTER))
tm_shape(socio_urban) +
tm_fill(col = "SP_CLUSTER",alpha=0.7) +
tm_borders(lwd=1,lty="solid") +
tm_bubbles(col="Industry",alpha=0.8, border.lwd = 0, border.alpha = 0.2,palette="YlOrRd",style = "jenks",size="Industry",size.lim=c(1))
Most industries can be found in cluster 3. however, the number of industries are little. with varying population densities in that cluster, more industries can be placed in cluster 3, whereby the subzone’s population density is lesser.
tmap_mode('plot')