packages = c('sp', 'rgdal', 'rgeos', 'sf', 'spdep', 'tmap', 'tidyverse')
for (p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
set.ZeroPolicyOption(TRUE)
## [1] FALSE
In our analysis we need these data
The CSV files are imported with the read_csv() function from the readr package.
busstop_volume <- read_csv("data/aspatial/passenger volume by busstop.csv")
## Parsed with column specification:
## cols(
## YEAR_MONTH = col_character(),
## DAY_TYPE = col_character(),
## TIME_PER_HOUR = col_double(),
## PT_TYPE = col_character(),
## PT_CODE = col_character(),
## TOTAL_TAP_IN_VOLUME = col_double(),
## TOTAL_TAP_OUT_VOLUME = col_double()
## )
population <- read_csv("data/aspatial/respopagesextod2011to2019.csv")
## Parsed with column specification:
## cols(
## PA = col_character(),
## SZ = col_character(),
## AG = col_character(),
## Sex = col_character(),
## TOD = col_character(),
## Pop = col_double(),
## Time = col_double()
## )
The polygon feature data are imported with the st_read() function from the sf package as simple feature dataframes.
sf_mpsz <- st_read(dsn = "data/geospatial",
layer = "MP14_SUBZONE_WEB_PL")
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `C:\Users\weich\Downloads\SMU2020\IS415\IS415_Take-home_Ex01\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
sf_busstop <- st_read(dsn = "data/geospatial",
layer = "BusStop")
## Reading layer `BusStop' from data source `C:\Users\weich\Downloads\SMU2020\IS415\IS415_Take-home_Ex01\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 5040 features and 3 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 4427.938 ymin: 26482.1 xmax: 48282.5 ymax: 52983.82
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
At the end of data cleaning, we want a dataframe consisting of
Steps
population_agg <- population %>%
filter(Time == 2019) %>%
mutate_at(.vars=vars(SZ), .funs=toupper) %>%
group_by(SZ) %>%
summarise(Population = sum(Pop))
population_agg
## # A tibble: 323 x 2
## SZ Population
## <chr> <dbl>
## 1 ADMIRALTY 14110
## 2 AIRPORT ROAD 0
## 3 ALEXANDRA HILL 13780
## 4 ALEXANDRA NORTH 2120
## 5 ALJUNIED 40190
## 6 ANAK BUKIT 22250
## 7 ANCHORVALE 46610
## 8 ANG MO KIO TOWN CENTRE 4890
## 9 ANSON 0
## 10 BALESTIER 32760
## # ... with 313 more rows
bus_volume_agg <- busstop_volume %>%
group_by(PT_CODE) %>%
summarise(Tap_in = sum(TOTAL_TAP_IN_VOLUME), Tap_out = sum(TOTAL_TAP_OUT_VOLUME))
bus_volume_agg
## # A tibble: 5,030 x 3
## PT_CODE Tap_in Tap_out
## <chr> <dbl> <dbl>
## 1 01012 35842 42043
## 2 01013 24545 14435
## 3 01019 18858 28741
## 4 01029 37311 24811
## 5 01039 64089 59856
## 6 01059 14719 80673
## 7 01109 5575 11148
## 8 01112 213014 117028
## 9 01113 149798 71915
## 10 01119 70805 75543
## # ... with 5,020 more rows
busvolume_busstop <- inner_join(sf_busstop, bus_volume_agg,
by = c("BUS_STOP_N" = "PT_CODE"))
## Warning: Column `BUS_STOP_N`/`PT_CODE` joining factor and character vector,
## coercing into character vector
busvolume_busstop
## Simple feature collection with 4962 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 4427.938 ymin: 26482.1 xmax: 48282.5 ymax: 52983.82
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
## First 10 features:
## BUS_STOP_N BUS_ROOF_N LOC_DESC Tap_in Tap_out
## 1 78221 B06 <NA> 22198 26620
## 2 63359 B01 HOUGANG SWIM CPLX 14410 16905
## 3 64141 B13 AFT JLN TELAWI 3441 6524
## 4 83139 B07 AFT JOO CHIAT PL 4325 3585
## 5 55231 B02 OPP SBST EAST DISTRICT 15052 5462
## 6 55351 B03 OPP FUDU WALK P/G 222 1321
## 7 92089 B10 CHIJ KATONG CON 21151 10545
## 8 80271 B06 OPP BLK 2 3575 19784
## 9 82059 B02 ONE AMBER 12792 2988
## 10 97089 B02 OPP SELARANG PK DRUG REH. 3284 1387
## geometry
## 1 POINT (42227.96 39563.16)
## 2 POINT (34065.75 39047.46)
## 3 POINT (36335.3 38525.74)
## 4 POINT (36530.26 32981.18)
## 5 POINT (29669.93 40841.51)
## 6 POINT (28404.77 41300.92)
## 7 POINT (37378.19 32166.81)
## 8 POINT (33524.15 31938.86)
## 9 POINT (35142.59 31500.49)
## 10 POINT (44329.52 39204.74)
sf_busstop_zone <- st_join(busvolume_busstop, sf_mpsz, join = st_intersects)
sf_busstop_zone
## Simple feature collection with 4962 features and 20 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 4427.938 ymin: 26482.1 xmax: 48282.5 ymax: 52983.82
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
## First 10 features:
## BUS_STOP_N BUS_ROOF_N LOC_DESC Tap_in Tap_out OBJECTID
## 1 78221 B06 <NA> 22198 26620 246
## 2 63359 B01 HOUGANG SWIM CPLX 14410 16905 208
## 3 64141 B13 AFT JLN TELAWI 3441 6524 209
## 4 83139 B07 AFT JOO CHIAT PL 4325 3585 167
## 5 55231 B02 OPP SBST EAST DISTRICT 15052 5462 228
## 6 55351 B03 OPP FUDU WALK P/G 222 1321 230
## 7 92089 B10 CHIJ KATONG CON 21151 10545 103
## 8 80271 B06 OPP BLK 2 3575 19784 68
## 9 82059 B02 ONE AMBER 12792 2988 103
## 10 97089 B02 OPP SELARANG PK DRUG REH. 3284 1387 221
## SUBZONE_NO SUBZONE_N SUBZONE_C CA_IND PLN_AREA_N PLN_AREA_C
## 1 5 PASIR RIS DRIVE PRSZ05 N PASIR RIS PR
## 2 4 HOUGANG WEST HGSZ04 N HOUGANG HG
## 3 3 PAYA LEBAR WEST PLSZ03 N PAYA LEBAR PL
## 4 5 FRANKEL BDSZ05 N BEDOK BD
## 5 10 YIO CHU KANG AMSZ10 N ANG MO KIO AM
## 6 8 TAGORE AMSZ08 N ANG MO KIO AM
## 7 3 MARINE PARADE MPSZ03 N MARINE PARADE MP
## 8 5 TANJONG RHU KLSZ05 N KALLANG KL
## 9 3 MARINE PARADE MPSZ03 N MARINE PARADE MP
## 10 2 CHANGI WEST CHSZ02 N CHANGI CH
## REGION_N REGION_C INC_CRC FMEL_UPD_D X_ADDR Y_ADDR
## 1 EAST REGION ER 2B73597AF3F174F3 2014-12-05 42032.46 38988.22
## 2 NORTH-EAST REGION NER 27B0E0BDE3579753 2014-12-05 33849.22 39506.87
## 3 EAST REGION ER 4BADA58E2612A6E2 2014-12-05 36649.31 39403.49
## 4 EAST REGION ER B34F041CC4B050EC 2014-12-05 37694.55 33007.27
## 5 NORTH-EAST REGION NER 5E2B16BB381BAEA4 2014-12-05 29443.74 40836.02
## 6 NORTH-EAST REGION NER DECC20CBAF2CD0C4 2014-12-05 27445.77 40971.42
## 7 CENTRAL REGION CR 44613908A004F20C 2014-12-05 36263.98 31684.27
## 8 CENTRAL REGION CR 928DCE8E44F904C8 2014-12-05 32740.12 31438.31
## 9 CENTRAL REGION CR 44613908A004F20C 2014-12-05 36263.98 31684.27
## 10 EAST REGION ER 7460F2CFB1D7D36C 2014-12-05 44092.34 38928.66
## SHAPE_Leng SHAPE_Area geometry
## 1 5311.060 1639669.0 POINT (42227.96 39563.16)
## 2 6424.482 1328876.2 POINT (34065.75 39047.46)
## 3 5898.476 609894.7 POINT (36335.3 38525.74)
## 4 8750.386 4297140.7 POINT (36530.26 32981.18)
## 5 4533.218 909418.6 POINT (29669.93 40841.51)
## 6 8162.600 3334192.4 POINT (28404.77 41300.92)
## 7 6019.279 1160017.5 POINT (37378.19 32166.81)
## 8 6515.759 2434594.0 POINT (33524.15 31938.86)
## 9 6019.279 1160017.5 POINT (35142.59 31500.49)
## 10 14918.108 4848516.7 POINT (44329.52 39204.74)
busstop_planarea_population <- left_join(sf_busstop_zone, population_agg,
by = c("SUBZONE_N" = "SZ"))
## Warning: Column `SUBZONE_N`/`SZ` joining factor and character vector, coercing
## into character vector
busstop_planarea_population
## Simple feature collection with 4962 features and 21 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 4427.938 ymin: 26482.1 xmax: 48282.5 ymax: 52983.82
## proj4string: +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
## First 10 features:
## BUS_STOP_N BUS_ROOF_N LOC_DESC Tap_in Tap_out OBJECTID
## 1 78221 B06 <NA> 22198 26620 246
## 2 63359 B01 HOUGANG SWIM CPLX 14410 16905 208
## 3 64141 B13 AFT JLN TELAWI 3441 6524 209
## 4 83139 B07 AFT JOO CHIAT PL 4325 3585 167
## 5 55231 B02 OPP SBST EAST DISTRICT 15052 5462 228
## 6 55351 B03 OPP FUDU WALK P/G 222 1321 230
## 7 92089 B10 CHIJ KATONG CON 21151 10545 103
## 8 80271 B06 OPP BLK 2 3575 19784 68
## 9 82059 B02 ONE AMBER 12792 2988 103
## 10 97089 B02 OPP SELARANG PK DRUG REH. 3284 1387 221
## SUBZONE_NO SUBZONE_N SUBZONE_C CA_IND PLN_AREA_N PLN_AREA_C
## 1 5 PASIR RIS DRIVE PRSZ05 N PASIR RIS PR
## 2 4 HOUGANG WEST HGSZ04 N HOUGANG HG
## 3 3 PAYA LEBAR WEST PLSZ03 N PAYA LEBAR PL
## 4 5 FRANKEL BDSZ05 N BEDOK BD
## 5 10 YIO CHU KANG AMSZ10 N ANG MO KIO AM
## 6 8 TAGORE AMSZ08 N ANG MO KIO AM
## 7 3 MARINE PARADE MPSZ03 N MARINE PARADE MP
## 8 5 TANJONG RHU KLSZ05 N KALLANG KL
## 9 3 MARINE PARADE MPSZ03 N MARINE PARADE MP
## 10 2 CHANGI WEST CHSZ02 N CHANGI CH
## REGION_N REGION_C INC_CRC FMEL_UPD_D X_ADDR Y_ADDR
## 1 EAST REGION ER 2B73597AF3F174F3 2014-12-05 42032.46 38988.22
## 2 NORTH-EAST REGION NER 27B0E0BDE3579753 2014-12-05 33849.22 39506.87
## 3 EAST REGION ER 4BADA58E2612A6E2 2014-12-05 36649.31 39403.49
## 4 EAST REGION ER B34F041CC4B050EC 2014-12-05 37694.55 33007.27
## 5 NORTH-EAST REGION NER 5E2B16BB381BAEA4 2014-12-05 29443.74 40836.02
## 6 NORTH-EAST REGION NER DECC20CBAF2CD0C4 2014-12-05 27445.77 40971.42
## 7 CENTRAL REGION CR 44613908A004F20C 2014-12-05 36263.98 31684.27
## 8 CENTRAL REGION CR 928DCE8E44F904C8 2014-12-05 32740.12 31438.31
## 9 CENTRAL REGION CR 44613908A004F20C 2014-12-05 36263.98 31684.27
## 10 EAST REGION ER 7460F2CFB1D7D36C 2014-12-05 44092.34 38928.66
## SHAPE_Leng SHAPE_Area Population geometry
## 1 5311.060 1639669.0 55280 POINT (42227.96 39563.16)
## 2 6424.482 1328876.2 45530 POINT (34065.75 39047.46)
## 3 5898.476 609894.7 0 POINT (36335.3 38525.74)
## 4 8750.386 4297140.7 34340 POINT (36530.26 32981.18)
## 5 4533.218 909418.6 0 POINT (29669.93 40841.51)
## 6 8162.600 3334192.4 8040 POINT (28404.77 41300.92)
## 7 6019.279 1160017.5 26850 POINT (37378.19 32166.81)
## 8 6515.759 2434594.0 11030 POINT (33524.15 31938.86)
## 9 6019.279 1160017.5 26850 POINT (35142.59 31500.49)
## 10 14918.108 4848516.7 1310 POINT (44329.52 39204.74)
bs_pa_pop_agg <- busstop_planarea_population %>%
group_by(SUBZONE_N, Population) %>%
summarise(Tap_in = sum(Tap_in), Tap_out = sum(Tap_out)) %>%
drop_na() %>%
select(-geometry)
bs_pa_pop_agg
## # A tibble: 305 x 4
## # Groups: SUBZONE_N [305]
## SUBZONE_N Population Tap_in Tap_out
## <chr> <dbl> <dbl> <dbl>
## 1 ADMIRALTY 14110 178509 228889
## 2 AIRPORT ROAD 0 22692 24524
## 3 ALEXANDRA HILL 13780 497549 603813
## 4 ALEXANDRA NORTH 2120 59359 41292
## 5 ALJUNIED 40190 1613882 1704099
## 6 ANAK BUKIT 22250 799579 745657
## 7 ANCHORVALE 46610 472053 474260
## 8 ANG MO KIO TOWN CENTRE 4890 1100348 1052820
## 9 ANSON 0 73701 59522
## 10 BALESTIER 32760 1061622 1065839
## # ... with 295 more rows
sf_all <- left_join(sf_mpsz, bs_pa_pop_agg) %>%
mutate_at(vars(c("Population","Tap_in","Tap_out")), ~replace(., is.na(.), 0))
## Joining, by = "SUBZONE_N"
## Warning: Column `SUBZONE_N` joining factor and character vector, coercing into
## character vector
sf_mpsz3414 <- st_set_crs(sf_all, 3414)
## Warning: st_crs<- : replacing crs does not reproject data; use st_transform for
## that
sf_mpsz3414
## Simple feature collection with 323 features and 18 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## CRS: EPSG:3414
## First 10 features:
## OBJECTID SUBZONE_NO SUBZONE_N SUBZONE_C CA_IND PLN_AREA_N
## 1 1 1 MARINA SOUTH MSSZ01 Y MARINA SOUTH
## 2 2 1 PEARL'S HILL OTSZ01 Y OUTRAM
## 3 3 3 BOAT QUAY SRSZ03 Y SINGAPORE RIVER
## 4 4 8 HENDERSON HILL BMSZ08 N BUKIT MERAH
## 5 5 3 REDHILL BMSZ03 N BUKIT MERAH
## 6 6 7 ALEXANDRA HILL BMSZ07 N BUKIT MERAH
## 7 7 9 BUKIT HO SWEE BMSZ09 N BUKIT MERAH
## 8 8 2 CLARKE QUAY SRSZ02 Y SINGAPORE RIVER
## 9 9 13 PASIR PANJANG 1 QTSZ13 N QUEENSTOWN
## 10 10 7 QUEENSWAY QTSZ07 N QUEENSTOWN
## PLN_AREA_C REGION_N REGION_C INC_CRC FMEL_UPD_D X_ADDR
## 1 MS CENTRAL REGION CR 5ED7EB253F99252E 2014-12-05 31595.84
## 2 OT CENTRAL REGION CR 8C7149B9EB32EEFC 2014-12-05 28679.06
## 3 SR CENTRAL REGION CR C35FEFF02B13E0E5 2014-12-05 29654.96
## 4 BM CENTRAL REGION CR 3775D82C5DDBEFBD 2014-12-05 26782.83
## 5 BM CENTRAL REGION CR 85D9ABEF0A40678F 2014-12-05 26201.96
## 6 BM CENTRAL REGION CR 9D286521EF5E3B59 2014-12-05 25358.82
## 7 BM CENTRAL REGION CR 7839A8577144EFE2 2014-12-05 27680.06
## 8 SR CENTRAL REGION CR 48661DC0FBA09F7A 2014-12-05 29253.21
## 9 QT CENTRAL REGION CR 1F721290C421BFAB 2014-12-05 22077.34
## 10 QT CENTRAL REGION CR 3580D2AFFBEE914C 2014-12-05 24168.31
## Y_ADDR SHAPE_Leng SHAPE_Area Population Tap_in Tap_out
## 1 29220.19 5267.381 1630379.3 0 9220 11366
## 2 29782.05 3506.107 559816.2 6630 272549 212781
## 3 29974.66 1740.926 160807.5 70 51458 113105
## 4 29933.77 3313.625 595428.9 13380 385622 340528
## 5 30005.70 2825.594 387429.4 10730 337565 327084
## 6 29991.38 4428.913 1030378.8 13780 497549 603813
## 7 30230.86 3275.312 551732.0 14780 188085 192011
## 8 30222.86 2208.619 290184.7 60 244731 155963
## 9 29893.78 6571.323 1084792.3 4430 317152 333296
## 10 30104.18 3454.239 631644.3 290 113621 99051
## geometry
## 1 MULTIPOLYGON (((31495.56 30...
## 2 MULTIPOLYGON (((29092.28 30...
## 3 MULTIPOLYGON (((29932.33 29...
## 4 MULTIPOLYGON (((27131.28 30...
## 5 MULTIPOLYGON (((26451.03 30...
## 6 MULTIPOLYGON (((25899.7 297...
## 7 MULTIPOLYGON (((27746.95 30...
## 8 MULTIPOLYGON (((29351.26 29...
## 9 MULTIPOLYGON (((20996.49 30...
## 10 MULTIPOLYGON (((24472.11 29...
Before starting any analysis, we can conduct exploratory data analysis to understand our data.
The choropleth map taps on shades of colours to display observations in a given area.
qtm(sf_mpsz3414, fill="Population",
fill.palette="Blues",
fill.title="SG population by subzone planning area")
## Warning: The shape sf_mpsz3414 is invalid. See sf::st_is_valid
From the above plot, we can tell that some subzones have a significantly higher population than the other subzones.
qtm_in <- qtm(sf_mpsz3414, "Tap_in",
fill.palette="Greens",
fill.title="Bus tap-in volume by subzone planning area")
## Warning: The shape sf_mpsz3414 is invalid. See sf::st_is_valid
qtm_out <- qtm(sf_mpsz3414, "Tap_out",
fill.palette="Reds",
fill.title="Bus tap-out volume by subzone planning area")
## Warning: The shape sf_mpsz3414 is invalid. See sf::st_is_valid
tmap_arrange(qtm_in, qtm_out, asp=1, ncol=2)
From the above two plots, we can tell that some subzones have a significantly higher tap-in and tap-out volume than the other subzones. Some of these subzones coincide with the higher population subzone plot. However, a statistical analysis still needs to be conducted to ascertain whether there is any relationship between the observations.
To understand the relationship between commuter flows and population at the planning subzone level, we can plot simple linear regression models. Here, we can create two regression models, one for tap in data as independent variable and the other with tap out data as independent variable. In both cases, population data is the dependent variable.
lm1 <- lm(Tap_in~Population,data=sf_mpsz3414)
summary(lm1)
##
## Call:
## lm(formula = Tap_in ~ Population, data = sf_mpsz3414)
##
## Residuals:
## Min 1Q Median 3Q Max
## -866858 -114168 -64076 32371 1960937
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.142e+05 1.970e+04 5.796 1.62e-08 ***
## Population 1.951e+01 9.016e-01 21.641 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 290700 on 321 degrees of freedom
## Multiple R-squared: 0.5933, Adjusted R-squared: 0.5921
## F-statistic: 468.3 on 1 and 321 DF, p-value: < 2.2e-16
Tap-in linear regression (LM1): Tap_in = 1.245e+05 + 1.924e+01Population
lm2 <- lm(Tap_out~Population,data=sf_mpsz3414)
summary(lm2)
##
## Call:
## lm(formula = Tap_out ~ Population, data = sf_mpsz3414)
##
## Residuals:
## Min 1Q Median 3Q Max
## -826264 -114610 -55088 31666 1654642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.146e+05 1.908e+04 6.007 5.11e-09 ***
## Population 1.943e+01 8.732e-01 22.246 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 281500 on 321 degrees of freedom
## Multiple R-squared: 0.6066, Adjusted R-squared: 0.6053
## F-statistic: 494.9 on 1 and 321 DF, p-value: < 2.2e-16
Tap-out linear regression (LM2): Tap_out = 1.249e+05 + 1.916e+01Population
ggplot(sf_mpsz3414,aes(x=Population, y=Tap_out)) +
geom_point(color='green4') +
geom_smooth(method='lm', formula= y~x) +
geom_segment(aes(xend=Population, yend = predict(lm1))) +
ggtitle("Linear regression between tap-out volume and population") +
xlab("Tap-out volume")
Analysis:
ggplot(sf_mpsz3414,aes(x=Population,y=Tap_out)) +
geom_point(color='red4') +
geom_smooth(method='lm', formula= y~x) +
geom_segment(aes(xend=Population, yend = predict(lm2))) +
ggtitle("Linear regression between tap-in volume and population") +
xlab("Tap-in volume")
Analysis:
plot(sf_mpsz3414$Population, residuals(lm1), main="Relationship between residuals of LM1 and Population", xlab="Population by Planning Subzone", ylab="Residuals")
plot(sf_mpsz3414$Population, residuals(lm2), main="Relationship between residuals of LM2 and Population", xlab="Population by Planning Subzone", ylab="Residuals")
Analysis:
Spatial autocorrelation measures whether observations at spatial locations are similar to each other. In this analysis, we are concerned about whether the residuals of our linear regression model display a particular relationship.
We form out hypothesis here:
Confidence interval for this analysis is 95%.
sf_mpsz3414$Tap_in_resid <- residuals(lm1)
sf_mpsz3414$Tap_out_resid <- residuals(lm2)
qtm_resid_in <- qtm(sf_mpsz3414, "Tap_in_resid",
fill.palette="Greens",
fill.title="Bus tap-in volume by subzone planning area")
## Warning: The shape sf_mpsz3414 is invalid. See sf::st_is_valid
qtm_resid_out <- qtm(sf_mpsz3414, "Tap_out_resid",
fill.palette="Reds",
fill.title="Bus tap-out volume by subzone planning area")
## Warning: The shape sf_mpsz3414 is invalid. See sf::st_is_valid
tmap_arrange(qtm_resid_in, qtm_resid_out, asp=1, ncol=2)
## Some legend labels were too wide. These labels have been resized to 0.60, 0.65, 0.59, 0.59. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Some legend labels were too wide. These labels have been resized to 0.60, 0.65, 0.59, 0.59. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
Analysis:
There are various methods to calculate weight matrices
Subsequently, row-standardisation can be applied to these weights.
wm_q <- poly2nb(sf_mpsz3414, queen=TRUE)
summary(wm_q)
## Neighbour list object:
## Number of regions: 323
## Number of nonzero links: 1934
## Percentage nonzero weights: 1.853751
## Average number of links: 5.987616
## 5 regions with no links:
## 17 18 19 295 302
## Link number distribution:
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 14 17
## 5 2 6 10 26 77 87 51 34 16 3 3 1 1 1
## 2 least connected regions:
## 16 234 with 1 link
## 1 most connected region:
## 313 with 17 links
wm_r <- poly2nb(sf_mpsz3414, queen=FALSE)
summary(wm_r)
## Neighbour list object:
## Number of regions: 323
## Number of nonzero links: 1680
## Percentage nonzero weights: 1.610291
## Average number of links: 5.201238
## 5 regions with no links:
## 17 18 19 295 302
## Link number distribution:
##
## 0 1 2 3 4 5 6 7 8 9 10 11 13 14
## 5 2 6 19 73 97 65 29 17 5 2 1 1 1
## 2 least connected regions:
## 16 234 with 1 link
## 1 most connected region:
## 313 with 14 links
centroids <- st_centroid(sf_mpsz3414$geometry)
coords <- st_coordinates(centroids)
par(mfrow=c(1,2))
plot(sf_mpsz3414$geometry, border="lightgrey", main="Queen Contiguity", asp = 1)
plot(wm_q, coords, pch = 19, cex = 0.6, add = TRUE, col= "red" )
plot(sf_mpsz3414$geometry, border="lightgrey", main="Rook Contiguity", asp = 1)
plot(wm_r, coords, pch = 19, cex = 0.6, add = TRUE, col = "red")
tm_shape(sf_mpsz3414)+
tm_fill(title = "Busstop locations") +
tm_borders(alpha = 1) +
tm_shape(sf_busstop_zone)+
tm_dots(size = 0.02)
## Warning: The shape sf_mpsz3414 is invalid. See sf::st_is_valid
On analysis of the above plot that maps the bus stop locations in Singapore, there is no particular relationship of the bus stop locations (e.g. whether the bus stops are located diagonally across the grids). Since there is little reason bus stop routes would not go diagonally across the grids, we will use the Queen contiguity based neighbour in our analysis.
k1 <- knn2nb(knearneigh(coords))
k1dists <- unlist(nbdists(k1, coords, longlat = FALSE))
summary(k1dists)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 182.5 618.1 904.8 1012.4 1180.0 5954.4
To ensure that all subzones have at least 1 neighbour, we set the max to 6000.
wm_d <- dnearneigh(coords, 0, 6000, longlat = FALSE)
wm_d
## Neighbour list object:
## Number of regions: 323
## Number of nonzero links: 20552
## Percentage nonzero weights: 19.69922
## Average number of links: 63.62848
The adaptive distance weight matrix ensures that each subzone has at least k nearest neighbours. Here, we use k=6.
wm_knn <- knn2nb(knearneigh(coords, k=6, longlat = FALSE))
wm_knn
## Neighbour list object:
## Number of regions: 323
## Number of nonzero links: 1938
## Percentage nonzero weights: 1.857585
## Average number of links: 6
## Non-symmetric neighbours list
par(mfrow=c(1,2))
plot(sf_mpsz3414$geometry, border="lightgrey", main= "Fixed-distance", asp = 1)
plot(wm_d, coords, pch = 19, cex = 0.6, add = TRUE, col = "red")
plot(sf_mpsz3414$geometry, border="lightgrey", main= "KNN 6 Neighbours", asp = 1)
plot(wm_knn, coords, pch = 19, cex = 0.6, add = TRUE, col = "red")
The subzones are not geographically organized such that they lie within a similar distance to each other. For instance, the islands to the south of Singapore are far from the mainland and have much fewer close neighbours compared to the other subzones. This is proven by the cut-off distance calculation done earlier. The shortest distance is 182.5m while the furthest distance is 5954.4m. Therefore, a more ideal measurement style would be the adaptive distance weights.
Since the QUEEN contiguity based neighbour is more useful for our case, we will use it to calculate the inverse distance weights matrix.
dist <- nbdists(wm_q, coords, longlat = FALSE)
ids <- lapply(dist, function(x) 1/(x))
head(ids)
## [[1]]
## [1] 0.0008639904 0.0007820725 0.0006389516 0.0011788831 0.0007072448
##
## [[2]]
## [1] 0.0010053075 0.0009130871 0.0013815042 0.0014707896 0.0011895686
## [6] 0.0025427862 0.0012752370 0.0011882456 0.0010944296
##
## [[3]]
## [1] 0.001005307 0.002117558 0.004246161 0.001961539 0.002786005 0.001267224
##
## [[4]]
## [1] 0.0017085086 0.0010580384 0.0015851372 0.0009068772 0.0014430880
## [6] 0.0016868603
##
## [[5]]
## [1] 0.0017085086 0.0011858756 0.0016141745 0.0015587598 0.0006863876
##
## [[6]]
## [1] 0.0011858756 0.0008362313 0.0005228567 0.0007684273 0.0016794719
## [6] 0.0008535163 0.0010335223 0.0009209275
Using the weight matrix calculated using the QUEEN contiguity based neighbours method, we can calculat the row-standardised weights matrix via the nb2listw() function.
rswm_q <- nb2listw(wm_q, style="W", zero.policy = TRUE)
summary(rswm_q)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 323
## Number of nonzero links: 1934
## Percentage nonzero weights: 1.853751
## Average number of links: 5.987616
## 5 regions with no links:
## 17 18 19 295 302
## Link number distribution:
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 14 17
## 5 2 6 10 26 77 87 51 34 16 3 3 1 1 1
## 2 least connected regions:
## 16 234 with 1 link
## 1 most connected region:
## 313 with 17 links
##
## Weights style: W
## Weights constants summary:
## n nn S0 S1 S2
## W 318 101124 318 111.2781 1309.875
lm.morantest(lm1, listw=rswm_q, zero.policy=TRUE)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = Tap_in ~ Population, data = sf_mpsz3414)
## weights: rswm_q
##
## Moran I statistic standard deviate = -0.61856, p-value = 0.7319
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## -0.024396964 -0.004125139 0.001074037
Analysis:
lm.morantest(lm2, listw=rswm_q, zero.policy=TRUE)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = Tap_out ~ Population, data = sf_mpsz3414)
## weights: rswm_q
##
## Moran I statistic standard deviate = -0.041918, p-value = 0.5167
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## -0.005498911 -0.004125139 0.001074037
Analysis:
We can use the Monte Carlo simulation to confirm our analysis.
set.seed(1234)
moran.mc(sf_mpsz3414$Tap_in_resid, listw = rswm_q, nsim = 999, zero.policy = TRUE, na.action=na.omit)
##
## Monte-Carlo simulation of Moran I
##
## data: sf_mpsz3414$Tap_in_resid
## weights: rswm_q
## number of simulations + 1: 1000
##
## statistic = -0.024397, observed rank = 252, p-value = 0.748
## alternative hypothesis: greater
Analysis:
set.seed(1234)
moran.mc(sf_mpsz3414$Tap_out_resid, listw = rswm_q, nsim = 999, zero.policy = TRUE, na.action=na.omit)
##
## Monte-Carlo simulation of Moran I
##
## data: sf_mpsz3414$Tap_out_resid
## weights: rswm_q
## number of simulations + 1: 1000
##
## statistic = -0.0054989, observed rank = 500, p-value = 0.5
## alternative hypothesis: greater
Analysis:
To analyse whether there are clustering or outliers, we can use the local Moran’s I statistic.
We will first convert our simple features dataframe to a SpatialPolygonsDataframe object of the sp package.
sp_mpsz3414 <- as_Spatial(sf_mpsz3414)
fips <- order(sp_mpsz3414$SUBZONE_N)
localMI_in <- localmoran(sp_mpsz3414$Tap_in, rswm_q)
head(localMI_in)
## Ii E.Ii Var.Ii Z.Ii Pr(z > 0)
## 1 0.475096895 -0.00310559 0.1877985 1.103483372 0.1349087
## 2 0.055932961 -0.00310559 0.1030858 0.183880768 0.4270535
## 3 0.269686390 -0.00310559 0.1560312 0.690598849 0.2449088
## 4 -0.005779640 -0.00310559 0.1560312 -0.006769611 0.5027007
## 5 0.003985369 -0.00310559 0.1877985 0.016362850 0.4934725
## 6 -0.106275623 -0.00310559 0.1163221 -0.302497911 0.6188637
localMI_out <- localmoran(sp_mpsz3414$Tap_out, rswm_q)
head(localMI_out)
## Ii E.Ii Var.Ii Z.Ii Pr(z > 0)
## 1 0.458101684 -0.00310559 0.1880673 1.06350485 0.1437765
## 2 0.098600557 -0.00310559 0.1032313 0.31654938 0.3757928
## 3 0.239412865 -0.00310559 0.1562538 0.61352105 0.2697659
## 4 0.003393259 -0.00310559 0.1562538 0.01644073 0.4934414
## 5 0.005395233 -0.00310559 0.1880673 0.01960218 0.4921804
## 6 -0.191190385 -0.00310559 0.1164869 -0.55108052 0.7092108
sp_mpsz3414.localMI_in <- cbind(sp_mpsz3414,localMI_in)
sp_mpsz3414.localMI_out <- cbind(sp_mpsz3414,localMI_out)
localMI.map <- tm_shape(sp_mpsz3414.localMI_in) +
tm_fill(col = "Ii",
style = "pretty",
title = "local moran statistics") +
tm_borders(alpha = 0.5)
pvalue.map <- tm_shape(sp_mpsz3414.localMI_in) +
tm_fill(col = "Pr.z...0.",
breaks=c(-Inf, 0.001, 0.01, 0.05, 0.1, Inf),
palette="-Blues",
title = "local Moran's I p-values") +
tm_borders(alpha = 0.5)
tmap_arrange(localMI.map, pvalue.map, asp=1, ncol=2)
## Warning: The shape sp_mpsz3414.localMI_in is invalid. See sf::st_is_valid
## Variable(s) "Ii" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Warning: The shape sp_mpsz3414.localMI_in is invalid. See sf::st_is_valid
localMI.map <- tm_shape(sp_mpsz3414.localMI_out) +
tm_fill(col = "Ii",
style = "pretty",
title = "local moran statistics") +
tm_borders(alpha = 0.5)
pvalue.map <- tm_shape(sp_mpsz3414.localMI_out) +
tm_fill(col = "Pr.z...0.",
breaks=c(-Inf, 0.001, 0.01, 0.05, 0.1, Inf),
palette="-Blues",
title = "local Moran's I p-values") +
tm_borders(alpha = 0.5)
tmap_arrange(localMI.map, pvalue.map, asp=1, ncol=2)
## Warning: The shape sp_mpsz3414.localMI_out is invalid. See sf::st_is_valid
## Variable(s) "Ii" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Warning: The shape sp_mpsz3414.localMI_out is invalid. See sf::st_is_valid
The LISA cluster map enables us to view the significant locations by spatial autocorrelation.
The Moran scatterplot enables us to view the relationship between the chosen variable and the average value of the same variable at neighbouring locations.
Here, we are interested in the tap-in and out volume with respect to neigbouring subzones.
sp_mpsz3414.localMI_in$Z.Tap_in <- scale(sp_mpsz3414.localMI_in$Tap_in) %>% as.vector
moran.plot(sp_mpsz3414.localMI_in$Z.Tap_in, rswm_q, labels=as.character(sp_mpsz3414.localMI_in$SUBZONE_N), xlab="Tap-in volume", ylab="Spatially Lag Tap-in volume")
sp_mpsz3414.localMI_out$Z.Tap_out <- scale(sp_mpsz3414.localMI_out$Tap_out) %>% as.vector
moran.plot(sp_mpsz3414.localMI_out$Z.Tap_out, rswm_q, labels=as.character(sp_mpsz3414.localMI_out$SUBZONE_N), xlab="Tap-out volume", ylab="Spatially Lag Tap-out volume")
quadrant_in <- vector(mode="numeric",length=nrow(localMI_in))
DV_in <- sp_mpsz3414$Tap_in - mean(sp_mpsz3414$Tap_in)
C_mI_in <- localMI_in[,1] - mean(localMI_in[,1])
signif <- 0.05
quadrant_in[DV_in >0 & C_mI_in>0] <- 4
quadrant_in[DV_in <0 & C_mI_in<0] <- 1
quadrant_in[DV_in <0 & C_mI_in>0] <- 2
quadrant_in[DV_in >0 & C_mI_in<0] <- 3
quadrant_in[localMI_in[,5]>signif] <- 0
quadrant_out <- vector(mode="numeric",length=nrow(localMI_in))
DV_out <- sp_mpsz3414$Tap_out - mean(sp_mpsz3414$Tap_out)
C_mI_out <- localMI_out[,1] - mean(localMI_out[,1])
signif <- 0.05
quadrant_out[DV_out >0 & C_mI_out>0] <- 4
quadrant_out[DV_out <0 & C_mI_out<0] <- 1
quadrant_out[DV_out <0 & C_mI_out>0] <- 2
quadrant_out[DV_out >0 & C_mI_out<0] <- 3
quadrant_out[localMI_in[,5]>signif] <- 0
The LISA map enables us to view the significant regions, as well as the positive/negative relationships of these significant regions.
sp_mpsz3414.localMI_in$quadrant_in <- quadrant_in
sp_mpsz3414.localMI_out$quadrant_out <- quadrant_out
colors <- c("#ffffff", "#2c7bb6", "#abd9e9", "#fdae61", "#d7191c")
clusters <- c("insignificant", "low-low", "low-high", "high-low", "high-high")
localMI_in.map <- tm_shape(sp_mpsz3414.localMI_in) +
tm_fill(col = "quadrant_in", style = "cat", palette = colors[c(sort(unique(quadrant_in)))+1], labels = clusters[c(sort(unique(quadrant_in)))+1], popup.vars = c("Postal.Code")) +
tm_view(set.zoom.limits = c(11,17)) +
tm_borders(alpha=0.5)
localMI_out.map <- tm_shape(sp_mpsz3414.localMI_out) +
tm_fill(col = "quadrant_out", style = "cat", palette = colors[c(sort(unique(quadrant_out)))+1], labels = clusters[c(sort(unique(quadrant_out)))+1], popup.vars = c("Postal.Code")) +
tm_view(set.zoom.limits = c(11,17)) +
tm_borders(alpha=0.5)
tmap_arrange(localMI_in.map, localMI_out.map, asp=1, ncol=2)
## Warning: The shape sp_mpsz3414.localMI_in is invalid. See sf::st_is_valid
## Warning: The shape sp_mpsz3414.localMI_out is invalid. See sf::st_is_valid
Analysis:
As mentioned in the fixed vs adaptive distance analysis above, adaptive distancing is a better metrics. Therefore, we will calculate the Gi statistics based on adaptive distancing.
wm_knn_lw <- nb2listw(wm_knn, style = 'B')
gi.adaptive_in <- localG(sp_mpsz3414$Tap_in, wm_knn_lw)
sp_mpsz3414.gi <- cbind(sp_mpsz3414, as.matrix(gi.adaptive_in))
names(sp_mpsz3414.gi)[length(names(sp_mpsz3414.gi))] <- "gstat_adaptive_in"
gi.adaptive_out <- localG(sp_mpsz3414$Tap_out, wm_knn_lw)
sp_mpsz3414.gi <- cbind(sp_mpsz3414.gi, as.matrix(gi.adaptive_out))
names(sp_mpsz3414.gi)[length(names(sp_mpsz3414.gi))] <- "gstat_adaptive_out"
gstat_in.map <- tm_shape(sp_mpsz3414.gi) +
tm_fill(col = "gstat_adaptive_in",
style = "pretty",
palette = "Greens",
title = "local Gi") +
tm_borders(alpha = 0.5)
gstat_out.map <- tm_shape(sp_mpsz3414.gi) +
tm_fill(col = "gstat_adaptive_out",
style = "pretty",
palette = "Reds",
title = "local Gi") +
tm_borders(alpha = 0.5)
tmap_arrange(gstat_in.map, gstat_out.map, asp=1, ncol=2)
## Warning: The shape sp_mpsz3414.gi is invalid. See sf::st_is_valid
## Warning: The shape sp_mpsz3414.gi is invalid. See sf::st_is_valid
Analysis: