Importing required packages

packages = c('rgdal', 'spdep',  'tmap', 'tidyverse', 'sf', 'ggpubr')
for (p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
## Loading required package: rgdal
## Loading required package: sp
## rgdal: version: 1.4-8, (SVN revision 845)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
##  GDAL binary built with GEOS: FALSE 
##  Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
##  Linking to sp version: 1.3-2
## Loading required package: spdep
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
## Loading required package: tmap
## Loading required package: tidyverse
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: ggpubr

Importing data

Importing tap in tap out data (already provided)

tapData <- read_csv("data/aspatial/passenger volume by busstop.csv")
## Parsed with column specification:
## cols(
##   YEAR_MONTH = col_character(),
##   DAY_TYPE = col_character(),
##   TIME_PER_HOUR = col_double(),
##   PT_TYPE = col_character(),
##   PT_CODE = col_character(),
##   TOTAL_TAP_IN_VOLUME = col_double(),
##   TOTAL_TAP_OUT_VOLUME = col_double()
## )
#singapore <- st_read(dsn = "data/geospatial/master-plan-2014-subzone-boundary-no-sea/master-plan-2014-subzone-boundary-no-sea-shp", layer = "MP14_SUBZONE_NO_SEA_PL")
singapore <- st_read(dsn = "data/geospatial/d2", layer = "MP14_SUBZONE_WEB_PL")
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `/Users/Amey/Desktop/Y2S2/IS415/assign1/data/geospatial/d2' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs
population <- read_csv("data/aspatial/respopagesextod2011to2019.csv")
## Parsed with column specification:
## cols(
##   PA = col_character(),
##   SZ = col_character(),
##   AG = col_character(),
##   Sex = col_character(),
##   TOD = col_character(),
##   Pop = col_double(),
##   Time = col_double()
## )
busStopLocation <- st_read(dsn="data/geospatial/BusStopLocation_Jan2020", layer= "BusStop")
## Reading layer `BusStop' from data source `/Users/Amey/Desktop/Y2S2/IS415/assign1/data/geospatial/BusStopLocation_Jan2020' using driver `ESRI Shapefile'
## Simple feature collection with 5040 features and 3 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 4427.938 ymin: 26482.1 xmax: 48282.5 ymax: 52983.82
## proj4string:    +proj=tmerc +lat_0=1.366666666666667 +lon_0=103.8333333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +datum=WGS84 +units=m +no_defs

Population data preperation

population <-population %>%
  filter(Time==2019) %>%
  group_by(SZ) %>%
  summarise(value=sum(Pop)) %>%
  rename( "SUBZONE_N"=SZ)

Data wrangling for population/subzone

population$SUBZONE_N <- toupper(population$SUBZONE_N)
singapore <- left_join(singapore, population, by="SUBZONE_N")

Data wrangling for busstop/tapin-out

tapData <- tapData %>%
  select(PT_CODE, TOTAL_TAP_IN_VOLUME, TOTAL_TAP_OUT_VOLUME)%>%
  group_by(PT_CODE) %>%
  summarise(TAP_IN = sum(TOTAL_TAP_IN_VOLUME),
            TAP_OUT = sum(TOTAL_TAP_OUT_VOLUME))%>%
  rename("BUS_STOP_N"=PT_CODE)

Join location of bus stop

busStopLocation <- st_set_crs(busStopLocation, 3414)
singapore <- st_set_crs(singapore, 3414)

st_transform(busStopLocation,3414)
## Simple feature collection with 5040 features and 3 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 4427.938 ymin: 26482.1 xmax: 48282.5 ymax: 52983.82
## CRS:            EPSG:3414
## First 10 features:
##    BUS_STOP_N BUS_ROOF_N                  LOC_DESC                  geometry
## 1       78221        B06                      <NA> POINT (42227.96 39563.16)
## 2       63359        B01         HOUGANG SWIM CPLX POINT (34065.75 39047.46)
## 3       64141        B13            AFT JLN TELAWI  POINT (36335.3 38525.74)
## 4       83139        B07          AFT JOO CHIAT PL POINT (36530.26 32981.18)
## 5       55231        B02    OPP SBST EAST DISTRICT POINT (29669.93 40841.51)
## 6       55351        B03         OPP FUDU WALK P/G POINT (28404.77 41300.92)
## 7       92089        B10           CHIJ KATONG CON POINT (37378.19 32166.81)
## 8       80271        B06                 OPP BLK 2 POINT (33524.15 31938.86)
## 9       82059        B02                 ONE AMBER POINT (35142.59 31500.49)
## 10      97089        B02 OPP SELARANG PK DRUG REH. POINT (44329.52 39204.74)
st_transform(singapore, 3414)
## Simple feature collection with 323 features and 16 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## CRS:            EPSG:3414
## First 10 features:
##    OBJECTID SUBZONE_NO       SUBZONE_N SUBZONE_C CA_IND      PLN_AREA_N
## 1         1          1    MARINA SOUTH    MSSZ01      Y    MARINA SOUTH
## 2         2          1    PEARL'S HILL    OTSZ01      Y          OUTRAM
## 3         3          3       BOAT QUAY    SRSZ03      Y SINGAPORE RIVER
## 4         4          8  HENDERSON HILL    BMSZ08      N     BUKIT MERAH
## 5         5          3         REDHILL    BMSZ03      N     BUKIT MERAH
## 6         6          7  ALEXANDRA HILL    BMSZ07      N     BUKIT MERAH
## 7         7          9   BUKIT HO SWEE    BMSZ09      N     BUKIT MERAH
## 8         8          2     CLARKE QUAY    SRSZ02      Y SINGAPORE RIVER
## 9         9         13 PASIR PANJANG 1    QTSZ13      N      QUEENSTOWN
## 10       10          7       QUEENSWAY    QTSZ07      N      QUEENSTOWN
##    PLN_AREA_C       REGION_N REGION_C          INC_CRC FMEL_UPD_D   X_ADDR
## 1          MS CENTRAL REGION       CR 5ED7EB253F99252E 2014-12-05 31595.84
## 2          OT CENTRAL REGION       CR 8C7149B9EB32EEFC 2014-12-05 28679.06
## 3          SR CENTRAL REGION       CR C35FEFF02B13E0E5 2014-12-05 29654.96
## 4          BM CENTRAL REGION       CR 3775D82C5DDBEFBD 2014-12-05 26782.83
## 5          BM CENTRAL REGION       CR 85D9ABEF0A40678F 2014-12-05 26201.96
## 6          BM CENTRAL REGION       CR 9D286521EF5E3B59 2014-12-05 25358.82
## 7          BM CENTRAL REGION       CR 7839A8577144EFE2 2014-12-05 27680.06
## 8          SR CENTRAL REGION       CR 48661DC0FBA09F7A 2014-12-05 29253.21
## 9          QT CENTRAL REGION       CR 1F721290C421BFAB 2014-12-05 22077.34
## 10         QT CENTRAL REGION       CR 3580D2AFFBEE914C 2014-12-05 24168.31
##      Y_ADDR SHAPE_Leng SHAPE_Area value                       geometry
## 1  29220.19   5267.381  1630379.3     0 MULTIPOLYGON (((31495.56 30...
## 2  29782.05   3506.107   559816.2  6630 MULTIPOLYGON (((29092.28 30...
## 3  29974.66   1740.926   160807.5    70 MULTIPOLYGON (((29932.33 29...
## 4  29933.77   3313.625   595428.9 13380 MULTIPOLYGON (((27131.28 30...
## 5  30005.70   2825.594   387429.4 10730 MULTIPOLYGON (((26451.03 30...
## 6  29991.38   4428.913  1030378.8 13780 MULTIPOLYGON (((25899.7 297...
## 7  30230.86   3275.312   551732.0 14780 MULTIPOLYGON (((27746.95 30...
## 8  30222.86   2208.619   290184.7    60 MULTIPOLYGON (((29351.26 29...
## 9  29893.78   6571.323  1084792.3  4430 MULTIPOLYGON (((20996.49 30...
## 10 30104.18   3454.239   631644.3   290 MULTIPOLYGON (((24472.11 29...
busStops <- left_join(tapData,busStopLocation)
## Joining, by = "BUS_STOP_N"
busStops <- busStops %>%
  select(BUS_STOP_N, TAP_IN, TAP_OUT, geometry)

Making subzone table

subZone <- singapore %>%
  select(OBJECTID, SUBZONE_N, value, geometry)%>%
  rename("Area"=OBJECTID)

Assigning subzone to bus stop

xy <- st_coordinates(singapore)

x <- xy[,1]
y <- xy[,2]
pol <- xy[,5]
poly <- data.frame(pol,x,y)

points <- st_coordinates(busStopLocation)
x <- points[,1]
y <- points[,2]
object <- busStopLocation$BUS_STOP_N

points <- data.frame(object,x,y)

library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
x <- split(poly$x, poly$pol)
y <- split(poly$y, poly$pol)

todo <- 1:nrow(points)
Area <- rep.int("", nrow(points))
pol <- names(x)

# loop through polygons
for (i in 1:length(x)) {
  # the vertices of i-th polygon
  bnd <- cbind(x[[i]], y[[i]])
  # points to allocate
  xy <- with(points, cbind(x[todo], y[todo]))
  inbnd <- in.out(bnd, xy)
  # allocation
  Area[todo[inbnd]] <- pol[i]
  # update 'todo'
  todo <- todo[!inbnd]
  }

points$Area <- Area
points2 <- points %>%
  select(object,Area) %>%
  rename("BUS_STOP_N"=object)
table <- left_join(busStops,points2)
## Joining, by = "BUS_STOP_N"
table <- table %>%
  select(BUS_STOP_N, TAP_IN, TAP_OUT, Area)
table$Area <- as.numeric(table$Area)
table <- left_join(subZone,table)
## Joining, by = "Area"

Data preperation for regression model

data <- table %>%
  select( SUBZONE_N,value,TAP_IN,TAP_OUT) %>%
  group_by(SUBZONE_N,value) %>%
  summarise(TAP_IN=sum(TAP_IN),
            TAP_OUT=sum(TAP_OUT)) %>%
  drop_na()

Creating regression for tap in

stats <- lm(TAP_IN~value,data)
summary(stats)
## 
## Call:
## lm(formula = TAP_IN ~ value, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -865527 -124146  -64237   39142 1949703 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.254e+05  2.110e+04   5.944 7.64e-09 ***
## value       1.926e+01  9.383e-01  20.524  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 298200 on 303 degrees of freedom
## Multiple R-squared:  0.5816, Adjusted R-squared:  0.5802 
## F-statistic: 421.2 on 1 and 303 DF,  p-value: < 2.2e-16
stats$coefficients
##  (Intercept)        value 
## 125402.22897     19.25786
data$tapINres <- stats$residuals

plot(data$value, data$TAP_IN, main="Regression for Population on Tap In volumes in Buses",xlab="Population in Subzone",ylab="Tap In")
abline(reg = stats,col='red')

From the regression model, we can derive the following:
(1) The linear equation is y = 19.26x + 125402
(2) The slope of the regression line is 19.26, which signifies a positive relationship between population and tap in volumes in buses
(3) The intercept is 125402
(4) The R-squared value is 0.5816, which signifies that 58.16% of the variation in tap in data is due to the variation in population.

Spatial Autocorrelaion

The following code will create a choropleth map in order to visualise the Tap In Residual Data among different subzones in Singapore.

data2 <- st_as_sf(data)
tm_shape(data2)+
  tm_fill(col="tapINres",
          style = "pretty",
          palette = "-RdBu",
          title = "Tap in Residual data")+
  tm_view(set.zoom.limits = c(11,17)) +
  tm_borders(alpha = 0.5)
## Variable(s) "tapINres" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

Negative residuals (Subzones in blue) imply over prediction of data whereas Positive residuals (Subzones in red) imply underprediction of data. Visually, spatial correlation for residuals is not evident but a more formal test needs to be conducted to confirm this. Therefore, global Moran’s test is performed on the residual data to validate if there is spatial correlation.
Firstly, weight matrix is created. The Queens weight matrix is created first in order to find out the average number of neighbours each subzone has and identify the subzones with lower number of connections.

Creating Queen’s weight matrix

wm_q <- poly2nb(data2, queen = TRUE)
summary(wm_q)
## Neighbour list object:
## Number of regions: 305 
## Number of nonzero links: 1844 
## Percentage nonzero weights: 1.982263 
## Average number of links: 6.045902 
## Link number distribution:
## 
##  2  3  4  5  6  7  8  9 10 11 12 14 17 
##  6 11 27 79 76 50 36 13  2  2  1  1  1 
## 6 least connected regions:
## 2 41 132 170 228 275 with 2 links
## 1 most connected region:
## 39 with 17 links
plot(data2$geometry, border="lightgrey")
plot(wm_q, st_centroid(data2$geometry), add = TRUE, col= "red")

From the summary data of the Queen’s weighed matrix, each subzone has an average of 6 connections. However, there are 6 subzones with only 2 links. In order to make our weighed matrix uniform, an adaptive weighed matrix is being used with each subzone being assigned 7 neighbours.

Adaptive matrix

coords <- st_centroid(data2$geometry)

wm_knn8 <- knn2nb(knearneigh(coords, k=7))

plot(data2$geometry, border="lightgrey")
plot(wm_knn8, st_centroid(data2$geometry), add = TRUE, col= "green")

Standardise the rows

The following code will standarise the weight for every subzone. As 7 neighbours are assigned to each subzone, the weight will therefore be 1/7 for each neighbour.

rswm_knn8 <- nb2listw(wm_knn8, style="W", zero.policy = TRUE)

Creating spatial lag values

Spatial lag values assign the average Tap In value of neighbours for each subzone.

tapINres.lag <- lag.listw(rswm_knn8, data2$tapINres)
data2$tapinLAG <- tapINres.lag

Comparing Tap In Residual values and Spatially lagged Tap In Values

tmap_mode("plot")
## tmap mode set to plotting
a <- tm_shape(data2)+
  tm_fill(col = "tapINres",
          title = "Tap In Residuals")+
  tm_layout(legend.outside = TRUE) +
  tm_borders(alpha = 0.5)

b <- tm_shape(data2)+
  tm_fill(col = "tapinLAG",
          title = "Tap in Residual Lag")+
  tm_layout(legend.outside = TRUE) +
  tm_borders(alpha = 0.5)


tmap_arrange(a, b,asp=1, nrow = 1, sync = TRUE)
## Variable(s) "tapINres" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Some legend labels were too wide. These labels have been resized to 0.38, 0.64, 0.41, 0.37, 0.37. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.
## Variable(s) "tapinLAG" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Some legend labels were too wide. These labels have been resized to 0.41, 0.41, 0.41, 0.64, 0.44, 0.44, 0.44. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.

Moran’s I for regression residuals

The null hypothesis is that the residuals obtained from the regression model of the relationship between population and bus tap in is randomised among different subzones in Singapore.

The alternative hypothesis is that the residuals obtained from the regression model of the relationship between population and bus tap in is not randomised among different subzones in Singapore.

The alpha (significance level) for this test is 0.05.

lm.morantest(stats, listw=rswm_knn8, zero.policy=TRUE, alternative = "greater",
 spChk=NULL, resfun=weighted.residuals, naSubset=TRUE)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = TAP_IN ~ value, data = data)
## weights: rswm_knn8
## 
## Moran I statistic standard deviate = -0.35694, p-value = 0.6394
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     -0.014424884     -0.004254316      0.000811893

The p-value is 0.724, which is above 0.05 and hence we do not reject the null hypothesis. However, to confirm our findings, we will run a Monte Carlo simulation 1000 times.

Monte Carlo for Moran’s I test

set.seed(1234)
bperm= moran.mc(data2$tapINres, listw=rswm_knn8, nsim=999, zero.policy = TRUE, na.action=na.omit)
bperm
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  data2$tapINres 
## weights: rswm_knn8  
## number of simulations + 1: 1000 
## 
## statistic = -0.014425, observed rank = 371, p-value = 0.629
## alternative hypothesis: greater
hist(bperm$res, freq=TRUE, breaks=20, xlab="Simulated Moran's I", main = "Monte Carlo Simulation Findings")
abline(v=-0.023, col="red")

The Moran’s I in the Monte Carlo simulation is in the middle, validating our findings of an insignificant p-value from the Moran’s I test.

Conclusion

The p-value is 0.724, which is above 0.05 and hence we do not reject the null hypothesis. The moran’s I value is -0.0223 which is very close to 0, implying that the residual data obtained from the population-tapin regression model conforms to spatial randomness among different subzones in Singapore. From the results of the test, we can conclude that the errors in the regression model are independent of one another, and their values do not depend on the value of residuals at neighbouring locations, i.e. there is no spatial correlation in the residuals.

Visualising spatial lag value using Moran’s scatterplot

MS <- moran.plot(data2$tapINres, rswm_knn8, zero.policy = TRUE, spChk=FALSE, labels=as.character(data2$SUBZONE_N), xlab="residual", ylab="Spatially Lag residual")

Spatial Correlogram

Spatial Correlograms are plotted in order to identify the autocorrelation of spatial observations when we increase the distance (lag) between them.

MI_corr <- sp.correlogram(wm_knn8, data2$tapINres, order=6, method="I", style="W")
plot(MI_corr, main="Spatial Correlogram")

The findings from the above graph show that the Moran’s I value are all around 0 even when we increase the distance between them.

Analysis of Tap In Data

Now, the Tap In Data itself is analysed. The Data is first analysed globally through Global Moran’s I test in order to find if there is clustering and then analysed locally through local Moran’s I test in order to find signifcant clusters, outliers, cold spots and hot spots.

Global Moran’s I for Tap In Data

The null hypothesis is that the volume of Bus Tap In is randomised among different subzones in Singapore.

The alternative hypothesis is that the volume of Bus Tap In is not randomised among different subzones in Singapore.

The alpha (significance level) for this test is 0.05.

moran.test(data2$TAP_IN, listw=rswm_knn8, zero.policy = TRUE, na.action=na.omit,alternative = "two.sided")
## 
##  Moran I test under randomisation
## 
## data:  data2$TAP_IN  
## weights: rswm_knn8    
## 
## Moran I statistic standard deviate = 6.2505, p-value = 4.092e-10
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.1714396388     -0.0032894737      0.0007814573

The p-value is 4.092e-10, which is below 0.05 and hence we reject the null hypothesis. However, to confirm our findings, we will run a Monte Carlo simulation 1000 times.

set.seed(1234)
bperm= moran.mc(data2$TAP_IN, listw=rswm_knn8, nsim=999, zero.policy = TRUE, na.action=na.omit )
bperm
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  data2$TAP_IN 
## weights: rswm_knn8  
## number of simulations + 1: 1000 
## 
## statistic = 0.17144, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
hist(bperm$res, freq=TRUE, breaks=20, xlab="Simulated Moran's I", main="Findings from Monte Carlo Simulation")
abline(v=0.171, col="red")

The Moran’s I value is in the extreme right when compared with the Monte Carlo simulation i, validating our findings of an significant p-value from the Moran’s I test.

Conclusion

The p-value is 4.092e-10, which is below 0.05 and hence we reject the null hypothesis. Therefore, the volume of Bus Tap In is not randomised among different subzones in Singapore, implying a spatial relationship between them. Furthermore, the Moran’s I value is 0.171 which is greather than 0, implying that the Tap In data is positively clustered among different subzones in Singapore.

Local Moran’s I

From the code below, Local Moran’s I test is performed locally through which I value and p-value is obtained for every subzone.

fips <- order(data2$SUBZONE_N)
localMI <- localmoran(data2$TAP_IN, rswm_knn8)
head(localMI)
##            Ii         E.Ii    Var.Ii       Z.Ii  Pr(z > 0)
## 1  0.11306988 -0.003289474 0.1329659  0.3191032 0.37482413
## 2 -0.05217577 -0.003289474 0.1329659 -0.1340655 0.55332460
## 3 -0.06442490 -0.003289474 0.1329659 -0.1676574 0.56657360
## 4  0.07531885 -0.003289474 0.1329659  0.2155750 0.41465953
## 5  0.50976203 -0.003289474 0.1329659  1.4069893 0.07971528
## 6 -0.24083763 -0.003289474 0.1329659 -0.6514506 0.74262218
#printCoefmat(data.frame(localMI[fips,], row.names=data2$SUBZONE_N[fips]), check.names=FALSE)

data2.localMI <- cbind(data2,localMI)

A choropleth map of the local moran’s I value is plotted beside a choropleth map indicating the regions which have a significant p-value, i.e. a p-value which is less than 0.05.

tmap_mode("plot")
## tmap mode set to plotting
localMI.map <- tm_shape(data2.localMI) +
  tm_polygons(col = "Ii",
          style = "pretty",
          title = "Local Moran's I value",
          palette= "-RdBu") +
  tm_layout(legend.outside = TRUE) +
  tm_view(set.zoom.limits = c(10,16)) +
  tm_borders(alpha = 0.5)

pvalue.map <- tm_shape(data2.localMI) +
  tm_polygons(col = "Pr.z...0.",
          breaks=c(-Inf,0.05, Inf),
          palette=c("blue","lightgrey"),
          title = "Local Moran's p-values") +
  tm_view(set.zoom.limits = c(10,16)) +
  tm_borders(alpha = 0.5)

tmap_arrange(localMI.map, pvalue.map, asp=1, nrow=1, sync = TRUE)
## Variable(s) "Ii" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

The dark blue regions on the map on the right shows the regions which have p-values less than 0.05 and is hence statistically significant to determine these regions to be either clusters or outliers. Clusters exhibit positive I values, which are shaded red in the map on the left. Outliers exhibit negative I values, which are shaded blue in the map on the left.

To understand characteristics of these regions, LISA cluster map is created!

Creating LISA Cluster Map

Plotting Moran Scatterplot

nci <- moran.plot(data2$TAP_IN, rswm_knn8, labels=as.character(data2$SUBZONE_N), xlab="Residual Values", ylab="Spatially Lag residuals")

## Moran Scatterplot with standardised variable

data2$Z.TAP_IN <- scale(data2$TAP_IN) %>% as.vector
nci2 <- moran.plot(data2$Z.TAP_IN, rswm_knn8, labels=as.character(data2$SUBZONE_N), xlab="z-residual", ylab="Spatially Lag z-residual")

## Lisa map preperation

quadrant <- vector(mode="numeric",length=nrow(localMI))
TAP_IN.lag <- lag.listw(rswm_knn8, data2$TAP_IN)
data2$TAP_IN_LAG <- TAP_IN.lag

#DV <- data2$tapINres - mean(data2$tapINres)
#C_mI <- localMI[,1] - mean(localMI[,1])
DV <- data2$Z.TAP_IN
C_mI <- data2$TAP_IN_LAG
signif <- 0.05
quadrant[DV >0 & C_mI>0] <- 4
quadrant[DV <0 & C_mI<0] <- 1
quadrant[DV <0 & C_mI>0] <- 2
quadrant[DV >0 & C_mI<0] <- 3
quadrant[localMI[,5]>signif] <- 0

data2.localMI$quadrant <- quadrant
colors <- c("#ffffff", "#2c7bb6", "#abd9e9", "#fdae61", "#d7191c")
clusters <- c("insignificant", "low-low", "low-high", "high-low", "high-high")

tmap_mode("plot")
## tmap mode set to plotting
tm_shape(data2.localMI) +
  tm_fill(col = "quadrant", style = "cat", palette = colors[c(sort(unique(quadrant)))+1], labels = clusters[c(sort(unique(quadrant)))+1], popup.vars = c("SUBZONE_N")) +
  tm_view(set.zoom.limits = c(11,17)) +
  tm_borders(alpha=0.5)

As seen in the map above, there are various subzones in the “high-high” quadrant. High-High refers to a high tap-in value in the subzone as well as high tap in values in their neighbours. Groupings of these regions can be found majorly in the East part of Singapore (Geylang East, Kampong Ubi, Kembangan, Frankel, Bedok South, Bedok North, Bedok Reservoir, Tampines West, Simei, Tampines East, Pasir Drive, Pasir Ris Central ), the North part of Singapore (North Coast, Senoko North, Woodlands regional center, Woodlands East, Yishun West, Yishun Central, Yishun East, Yishun South ) and some locations in North-East and Western region of Singapore.

Performing Getis-Ord Statistics to identify Hot spots and Cold Spots

The Gi value is computed for each subzone through which the p-value is calculated for each subzone.

fips <- order(data2$TAP_IN)
gi.fixed <- localG(data2$TAP_IN, rswm_knn8)
gi.fixed
##   [1] -0.692900774  0.172496682 -0.668196746 -0.294558240  0.536332011
##   [6] -0.696162886  2.129695313  0.532449385 -1.181351913  0.694748729
##  [11]  0.107348685 -1.531747799  3.232264605  2.722712611  3.735994412
##  [16]  2.594988541 -0.887240437  0.394942062 -1.629446616  0.145190717
##  [21] -0.121252791 -1.429223575  0.970492360  3.121439941  1.066412123
##  [26] -1.459856689  0.892843837 -0.674421021  0.263402345 -1.180778867
##  [31] -0.262078921  0.241689495  0.482820521  1.183358197 -0.644068095
##  [36] -0.296996897 -1.528680977 -1.525968283 -1.202574122 -0.705526149
##  [41] -0.025207695  2.183129496 -1.399743343  0.651968391  2.575523156
##  [46] -1.380093101 -1.417732394  0.168704327  0.787679345  1.869132827
##  [51] -0.840526796 -0.948072043 -1.404607237 -0.571890079 -0.270211281
##  [56]  0.363309647  0.505555665 -1.496898947 -0.877334557  2.845893828
##  [61] -1.028824493 -0.931242870  0.117839526  0.453786016 -0.390213249
##  [66] -1.310186876 -0.774584595 -1.247533490  1.025531928 -1.067061602
##  [71] -0.122104758  0.238303878 -1.130843106 -1.150449711 -0.507741643
##  [76]  3.520703988 -1.106992818  3.176698908 -0.582679185 -0.406249345
##  [81]  1.926280494 -0.878473937  0.364488540 -0.617988496  3.363767246
##  [86]  0.169356267 -1.661863547 -1.643735001 -0.441271062 -1.334141426
##  [91] -0.255490895 -1.092166186 -0.803533470  0.021318542  0.070898501
##  [96]  1.237004210  0.738979610  1.037638786 -1.153141837  0.136473510
## [101] -1.053292027 -0.424941451 -0.995808166 -0.140325818 -0.093033083
## [106] -0.760001333  0.227250133 -0.318519665  0.925137434 -0.359474559
## [111]  0.798363208 -0.448951047 -0.945548445 -0.643936347 -0.336431147
## [116]  2.779277432  0.733786473  2.665771735  0.733241570  0.917605234
## [121]  4.233850465 -0.890573064  1.024216797  1.139159101  0.167066378
## [126]  2.397226016  1.828967530  1.069861902 -1.128886918 -1.088992999
## [131] -0.727684889 -0.880730712 -0.898070682 -1.048462130 -0.564019370
## [136]  0.768360223  0.547697849  0.828635452  0.850731008  0.019274897
## [141]  0.008103716 -1.462838413  0.901951648 -1.167440633  1.172921472
## [146]  1.856883465  1.752318317 -0.904386423 -0.715862166 -0.008855043
## [151] -1.649881826  1.010315661 -0.479683807  0.934323700  1.029944075
## [156] -1.262043098 -0.860171690  3.012686796 -0.404665030 -0.708757536
## [161] -0.084524774  1.453402912 -1.175743160 -0.501678651 -0.151742887
## [166]  0.225507819 -0.709281172  2.403663038  1.876955899  1.219520054
## [171] -0.892724162 -1.125659383 -0.911951474 -1.037582437  0.023937963
## [176]  0.073782694 -1.089280099 -0.736140572  2.747764933  2.319360584
## [181]  0.346560485  0.039216373  2.957887428 -0.855057533 -0.186671100
## [186] -1.439330704  0.128826223 -0.752229949 -1.066700196  0.395871073
## [191] -1.344948377 -1.027328469 -1.599518730 -1.838681063 -1.572931555
## [196] -0.923679777 -0.510232156  0.960153399  1.113352974 -0.753062198
## [201] -1.790872040 -0.237967235 -0.365829026 -1.092362001  1.204032147
## [206] -1.344543446 -0.661625567  1.491506783 -1.921599832  0.266824871
## [211] -1.055907756  0.786494166 -0.635899756  0.269773642  0.001075172
## [216]  0.702060882  0.860166739 -0.744635289  0.702039689 -0.284810207
## [221]  1.009054191 -0.978727501  0.979199360 -0.769548890  0.514445516
## [226]  0.465856184  2.571500972 -0.658771796 -0.072770467  1.193367159
## [231]  1.244437715  0.992805367  1.077400880 -1.590367947  1.050033551
## [236]  5.756743968 -0.235868005  0.477753081 -1.651360039 -0.781645731
## [241] -1.789586127 -0.728530134  0.911556513 -0.575906902 -0.674033239
## [246] -0.333865005  0.160812412  2.024036555  4.353318272  4.797062248
## [251] -0.922440338 -0.847584309 -1.203576371 -0.047675090 -0.855724331
## [256]  0.982722665 -0.102005190 -0.405576704  0.391657742  0.845271379
## [261] -1.440658428 -0.640719692 -0.468776634 -0.696080866  0.023594178
## [266]  1.184571733  0.693770169 -0.427800679  2.015648468  0.842967499
## [271] -1.837531496 -1.487705259 -1.514537581 -1.803709311 -1.752212766
## [276]  1.557431030 -0.825423689 -1.280570135 -0.331240998 -0.109162346
## [281]  0.583079597 -1.282618350  1.320646393  3.431667973  0.246756024
## [286]  2.702048880  1.232042500  1.431559276  0.919350410  1.767497392
## [291]  2.034615790 -1.033767491  5.684970762  0.329862564  0.437249562
## [296]  0.496210255 -0.153259856  0.048939378  1.332971654  1.824028289
## [301]  1.863602796  1.381284025  0.918718046  1.011728104  0.877530388
## attr(,"gstari")
## [1] FALSE
## attr(,"call")
## localG(x = data2$TAP_IN, listw = rswm_knn8)
## attr(,"class")
## [1] "localG"
data2.gi <- cbind(data2, as.matrix(gi.fixed))
names(data2.gi)[9] <- "gstat"

pvalue2sided<-2*pnorm(-abs(data2.gi$gstat))
data2.gi$pval <- pvalue2sided

Visualise Zones with Significant P-value

a<-tm_shape(data2.gi) +
  tm_fill(col = "gstat",
          style="pretty",
          palette="-RdBu",
          title = "Gi values") +
  tm_view(set.zoom.limits = c(10,17)) +
  tm_borders(alpha = 0.5)

b<-tm_shape(data2.gi) +
  tm_fill(col = "pval",
          breaks=c(-Inf,0.05, Inf),
          palette=c("blue","lightgrey"),
          title = "p-values") +
  tm_view(set.zoom.limits = c(10,17)) +
  tm_borders(alpha = 0.5)

tmap_arrange(a, b, asp=1, nrow=1, sync = TRUE)
## Variable(s) "gstat" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

data2.gi <- data2.gi %>%
  mutate("Type"= case_when(pval<0.05 & gstat<=0 ~ "Significant Cold Spot",
                          pval>=0.05 & gstat<=0 ~ "Insignificant Cold Spot",
                          pval<0.05 & gstat>0 ~ "Significant Hot Spot",
                          pval>=0.05 & gstat>0 ~ "Insignificant Hot Spot"))

tm_shape(data2.gi) +
  tm_fill(col = "Type",
          style = "pretty",
          palette= c("lightblue","lightcoral","red"),
          title = "Gi Analysis") +
  tm_view(set.zoom.limits = c(11,17)) +
  tm_borders(alpha = 0.5)

In the choropleth map above, subzones shaded in red are hot spot areas and subzones shaded in blue are the cold spot areas. The subzones shaded in dark red are significant Hot Spots, however, there are no significant cold spots. Hot Spots are present in all regions other than Central Singapore (Eastern, Western and Northern), indicating excessive usage of bus transportation in residential area as compared to the business district. The cold spot areas, on the other hand, are mainly subzones in Central Singapore.

Creating Regression Model for tap out

stats2 <- lm(TAP_OUT~value,data)
summary(stats2)
## 
## Call:
## lm(formula = TAP_OUT ~ value, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -824702 -123573  -59179   34208 1643316 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.259e+05  2.043e+04   6.163 2.27e-09 ***
## value       1.917e+01  9.088e-01  21.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 288800 on 303 degrees of freedom
## Multiple R-squared:  0.5948, Adjusted R-squared:  0.5934 
## F-statistic: 444.7 on 1 and 303 DF,  p-value: < 2.2e-16
stats2$coefficients
##  (Intercept)        value 
## 125936.35239     19.16619
data$tapOUTres <- stats2$residuals

plot(data$value, data$TAP_OUT, main="Regression for Population on Tap Out volumes in Buses",xlab="Population in Subzone",ylab="Tap Out")
abline(reg = stats2,col='red')

From the regression model, we can derive the following:
(1) The linear equation is y = 19.17x + 125936
(2) The slope of the regression line is 19.17, which signifies a positive relationship between population and tap in volumes in buses
(3) The intercept is 125936
(4) The R-squared value is 0.5948, which signifies that 59.48% of the variation in tap in data is due to the variation in population.

Comparison of TAP-IN and TAP-OUT regression models

plot(data$value, data$TAP_OUT, main="Regression for Population on Tap Out volumes in Buses",xlab="Population in Subzone",ylab="Tap Out")
abline(reg = stats2,col='red')
abline(reg = stats,col='blue')

The co-inciding lines suggest that both tap in and tap out follow a similar relationship with the population in the subzone!

Coming back to the analysis of Tap Out data, we will now find out if any spatial autocorrelation is present in the residual data of the regression model. ## Spatial Autocorrelaion

tmap_mode("plot")
## tmap mode set to plotting
data3 <- st_as_sf(data)
tm_shape(data3)+
  tm_fill(col="tapOUTres",
          style = "pretty",
          palette = "-RdBu",
          title = "Tap Out Residual data")+
  tm_borders(alpha = 0.5)
## Variable(s) "tapOUTres" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

tmap_mode("plot")
## tmap mode set to plotting

Negative residuals (Subzones in blue) imply over prediction of data whereas Positive residuals (Subzones in red) imply underprediction of data. Visually, spatial correlation for residuals is not evident but a more formal test needs to be conducted to confirm this. Therefore, global Moran’s test is performed on the residual data to validate if there is spatial correlation.

Moran’s I for regression residuals

The null hypothesis is that the residuals obtained from the regression model of the relationship between population and bus tap out is randomised among different subzones in Singapore.

The alternative hypothesis is that the residuals obtained from the regression model of the relationship between population and bus tap out is not randomised among different subzones in Singapore.

The alpha (significance level) for this test is 0.05.

lm.morantest(stats2, listw=rswm_knn8, zero.policy=TRUE, alternative = "greater",
 spChk=NULL, resfun=weighted.residuals, naSubset=TRUE)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = TAP_OUT ~ value, data = data)
## weights: rswm_knn8
## 
## Moran I statistic standard deviate = 0.058411, p-value = 0.4767
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     -0.002589963     -0.004254316      0.000811893

The p-value is 0.478, which is above 0.05 and hence we do not reject the null hypothesis. However, to confirm our findings, we will run a Monte Carlo simulation 1000 times. ### Monte Carlo simulation for Moran’s I

set.seed(1234)
bperm= moran.mc(data3$tapOUTres, listw=rswm_knn8, nsim=999, zero.policy = TRUE, na.action=na.omit)
bperm
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  data3$tapOUTres 
## weights: rswm_knn8  
## number of simulations + 1: 1000 
## 
## statistic = -0.00259, observed rank = 529, p-value = 0.471
## alternative hypothesis: greater
hist(bperm$res, freq=TRUE, breaks=20, xlab="Simulated Moran's I", main = "Findings from Monte Carlo Simulation")
abline(v=-0.002589963, col="red")

The Moran’s I in the Monte Carlo simulation is in the middle, validating our findings of an insignificant p-value from the Moran’s I test.

Conclusion

The p-value is 0.478, which is above 0.05 and hence we do not reject the null hypothesis. The moran’s I value is -0.002589963 which is very close to 0, implying that the residual data obtained from the population-tapin regression model conforms to spatial randomness among different subzones in Singapore. From the results of the test, we can conclude that the errors in the regression model are independent of one another, and their values do not depend on the value of residuals at neighbouring locations, i.e. there is no spatial correlation in the residuals.

Visualising spatial lag value using Moran’s scatterplot

MS <- moran.plot(data3$tapOUTres, rswm_knn8, zero.policy = TRUE, spChk=FALSE, labels=as.character(data2$SUBZONE_N), xlab="residual", ylab="Spatially Lag residual")

Spatial Correlogram

Spatial Correlograms are plotted in order to identify the autocorrelation of spatial observations when we increase the distance (lag) between them.

MI_corr <- sp.correlogram(wm_knn8, data2$tapINres, order=6, method="I", style="B")
plot(MI_corr, main = "Spatial Correlogram")

The findings from the above graph show that the Moran’s I value are all around 0 even when we increase the distance between them.

Analysis of Tap Out Data

Global Moran’s I for Tap Out Data

The null hypothesis is that the volume of Bus Tap Out is randomised among different subzones in Singapore.

The alternative hypothesis is that the volume of Bus Tap Out is not randomised among different subzones in Singapore.

The alpha (significance level) for this test is 0.05.

moran.test(data3$TAP_OUT, listw=rswm_knn8, zero.policy = TRUE, na.action=na.omit,alternative = "two.sided")
## 
##  Moran I test under randomisation
## 
## data:  data3$TAP_OUT  
## weights: rswm_knn8    
## 
## Moran I statistic standard deviate = 6.4916, p-value = 8.493e-11
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.1783104209     -0.0032894737      0.0007825763

The p-value is 8.493e-11, which is below 0.05 and hence we reject the null hypothesis. However, to confirm our findings, we will run a Monte Carlo simulation 1000 times.

Monte Carlo Simulation for Moran’s i

set.seed(1234)
bperm= moran.mc(data3$TAP_OUT, listw=rswm_knn8, nsim=999, zero.policy = TRUE, na.action=na.omit)
bperm
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  data3$TAP_OUT 
## weights: rswm_knn8  
## number of simulations + 1: 1000 
## 
## statistic = 0.17831, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
hist(bperm$res, freq=TRUE, breaks=20, xlab="Simulated Moran's I", main="Findings from Monte Carlo Simulation")
abline(v=0.19, col="red")

The Moran’s I value is in the extreme right when compared with the Monte Carlo simulation i, validating our findings of an significant p-value from the Moran’s I test.

Conclusion

The p-value is 8.493e-11, which is below 0.05 and hence we reject the null hypothesis. Therefore, the volume of Bus Tap Out is not randomised among different subzones in Singapore, implying a spatial relationship between them. Furthermore, the Moran’s I value is 0.171 which is greather than 0, implying that the Tap In data is positively clustered among different subzones in Singapore.

Local Moran’s I for Tap Out data

From the code below, Local Moran’s I test is performed locally through which I value and p-value is obtained for every subzone.

fips <- order(data3$SUBZONE_N)
localMI <- localmoran(data3$TAP_OUT, rswm_knn8)
head(localMI)
##            Ii         E.Ii    Var.Ii       Z.Ii Pr(z > 0)
## 1  0.09232491 -0.003289474 0.1331512  0.2620298 0.3966492
## 2 -0.08047330 -0.003289474 0.1331512 -0.2115211 0.5837597
## 3 -0.10798507 -0.003289474 0.1331512 -0.2869167 0.6129119
## 4  0.09849363 -0.003289474 0.1331512  0.2789350 0.3901473
## 5  0.38825119 -0.003289474 0.1331512  1.0730112 0.1416330
## 6 -0.21862028 -0.003289474 0.1331512 -0.5901108 0.7224418
#printCoefmat(data.frame(localMI[fips,], row.names=data2$SUBZONE_N[fips]), check.names=FALSE)
data3.localMI <- cbind(data3,localMI)

A choropleth map of the Local Moran’s I value is plotted beside a choropleth map indicating the regions which have a significant p-value, i.e. a p-value which is less than 0.05.

tmap_mode("plot")
## tmap mode set to plotting
localMI.map <- tm_shape(data3.localMI) +
  tm_fill(col = "Ii",
          style = "pretty",
          title = "Local Moran's I value",
          palette= "-RdBu") +
  tm_layout(legend.outside = TRUE) +
  tm_view(set.zoom.limits = c(10,16)) +
  tm_borders(alpha = 0.5)

pvalue.map <- tm_shape(data3.localMI) +
  tm_fill(col = "Pr.z...0.",
          breaks=c(-Inf,0.05, Inf),
          palette=c("blue","lightgrey"),
          title = "local Moran's I p-values") +
  tm_layout(legend.outside = TRUE) +
  tm_view(set.zoom.limits = c(10,16)) +
  tm_borders(alpha = 0.5)

tmap_arrange(localMI.map, pvalue.map, asp=1, nrow=1, sync = TRUE)
## Variable(s) "Ii" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Some legend labels were too wide. These labels have been resized to 0.58. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.

tmap_mode("plot")
## tmap mode set to plotting

The dark blue regions on the map on the right shows the regions which have p-values less than 0.05 and is hence statistically significant to determine these regions to be either clusters or outliers. Clusters exhibit positive I values, which are shaded red in the map on the left. Outliers exhibit negative I values, which are shaded blue in the map on the left.

To understand characteristics of these regions, LISA cluster map is created! # Creating LISA clusters map

Plotting Moran Scatterplot

nci <- moran.plot(data3$TAP_OUT, rswm_knn8, labels=as.character(data3$SUBZONE_N), xlab="Residual Values", ylab="Spatially Lag residuals")

Moran Scatterplot with standardised variable

data3$Z.TAP_OUT <- scale(data3$TAP_OUT) %>% as.vector
nci2 <- moran.plot(data3$Z.TAP_OUT, rswm_knn8, labels=as.character(data3$SUBZONE_N), xlab="z-residual", ylab="Spatially Lag z-residual")

Visualising the LISA Cluster map

quadrant <- vector(mode="numeric",length=nrow(localMI))
TAP_OUT.lag <- lag.listw(rswm_knn8, data3$TAP_OUT)
data3$TAP_OUT_LAG <- TAP_OUT.lag

#DV <- data2$tapINres - mean(data2$tapINres)
#C_mI <- localMI[,1] - mean(localMI[,1])
DV <- data3$Z.TAP_OUT
C_mI <- data3$TAP_OUT_LAG
signif <- 0.05
quadrant[DV >0 & C_mI>0] <- 4
quadrant[DV <0 & C_mI<0] <- 1
quadrant[DV <0 & C_mI>0] <- 2
quadrant[DV >0 & C_mI<0] <- 3
quadrant[localMI[,5]>signif] <- 0

data3.localMI$quadrant <- quadrant
colors <- c("#ffffff", "#2c7bb6", "#abd9e9", "#fdae61", "#d7191c")
clusters <- c("insignificant", "low-low", "low-high", "high-low", "high-high")

tmap_mode("plot")
## tmap mode set to plotting
tm_shape(data3.localMI) +
  tm_fill(col = "quadrant", style = "cat", palette = colors[c(sort(unique(quadrant)))+1], labels = clusters[c(sort(unique(quadrant)))+1], popup.vars = c("SUBZONE_N")) +
  tm_view(set.zoom.limits = c(11,17)) +
  tm_borders(alpha=0.5)

As seen in the map above, there are various subzones in the “high-high” quadrant. High-High refers to a high tap-out value in the subzone as well as high tap in values in their neighbours. Groupings of these regions can be found majorly in the East part of Singapore (Geylang East, Kampong Ubi, Kembangan, Frankel, Bedok South, Bedok North, Tampines West, Simei, Tampines East, Pasir Drive, Pasir Ris Central ), the North part of Singapore (North Coast, Senoko North, Woodlands regional center, Woodlands East, Yishun West, Yishun Central, Yishun East ) and some locations in North-East and Western region of Singapore.

We will now perform another local statistics to find out hot spots and cold spots.

Performing Getis-Ord Statistics to identify Hot spots and Cold Spots

The Gi value is computed for each subzone through which the p-value is calculated for each subzone.

fips <- order(data3$TAP_OUT)
gi.fixed <- localG(data3$TAP_OUT, rswm_knn8)

data3.gi <- cbind(data3, as.matrix(gi.fixed))
names(data3.gi)[9] <- "gstat"
pvalue2sided<-2*pnorm(-abs(data3.gi$gstat))
data3.gi$pval <- pvalue2sided
summary(data3.gi$gstat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.85784 -0.84453 -0.02267  0.15080  0.93063  5.88742
a<-tm_shape(data3.gi) +
  tm_fill(col = "gstat",
          style = "pretty",
          palette="-RdBu",
          title = "local Gi") +
  tm_view(set.zoom.limits = c(10,17)) +
  tm_borders(alpha = 0.5)

b<-tm_shape(data3.gi) +
  tm_fill(col = "pval",
          breaks=c(-Inf,0.05, Inf),
          palette=c("blue1","grey90"),
          title = "p-values") +
  tm_view(set.zoom.limits = c(10,17)) +
  tm_borders(alpha = 0.5)

tmap_arrange(a, b, asp=1, nrow=1, sync = TRUE)
## Variable(s) "gstat" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

These two maps can be combined to obtain values a single map which can be classified as follows:
(1) Insignificant Cold Spot [Negative Gi Value, p-value>=0.05]
(2) Significant Cold Spot [Negative Gi Value, p-value<0.05]
(3) Insignificant Hot Spot [Positive Gi Value, p-value>=0.05]
(4) Significant Hot Spot [Positive Gi Value, p-value<0.05]

data3.gi <- data3.gi %>%
  mutate("Type"= case_when(pval<0.05 & gstat<=0 ~ "Significant Cold Spot",
                          pval>=0.05 & gstat<=0 ~ "Insignificant Cold Spot",
                          pval<0.05 & gstat>0 ~ "Significant Hot Spot",
                          pval>=0.05 & gstat>0 ~ "Insignificant Hot Spot"))

tm_shape(data3.gi) +
  tm_fill(col = "Type",
          style = "pretty",
          palette= c("lightblue","lightcoral","red"),
          title = "Gi Analysis") +
  tm_view(set.zoom.limits = c(11,17)) +
  tm_borders(alpha = 0.5)

tmap_mode("plot")
## tmap mode set to plotting

In the choropleth map above, subzones shaded in red are hot spot areas and subzones shaded in blue are the cold spot areas. The subzones shaded in dark red are significant Hot Spots, however, there are no significant cold spots. Hot Spots are present in all regions other than Central Singapore (Eastern, Western and Northern), indicating excessive usage of bus transportation in residential area as compared to the business district. The cold spot areas, on the other hand, are mainly subzones in Central Singapore. This resembles to the findings from the Tap In Data.

Final Conclusion

Moran’s test was widely used in this assignment to determine the presence of spatial autocorrelation. Global Moran’s I test for Tap in and Tap out residual data indicated that there is autocorrelation among the residual data. This validates that there is no systematic error in the regression models, and the deviations are justified. By performing Global Moran’s I test on Tap in and Tap out data, a positive autocorrelation was found indicating that there is spatial relationship between the tap data and subzones. Two statistical tests in local analysis (Getis-Ord and Local Moran’s) indicated the presence of “high” clusters among the suburban regions of Singapore.