Link: https://rpubs.com/Huiling/take-home_ex01

Loading of Packages

packages = c('rgdal', 'sf', 'spdep', 'tmap', 'tidyverse', 'httr', 'mapview', 'rvest')
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p,character.only = T)
}

## Loading required package: rgdal

## Loading required package: sp

## rgdal: version: 1.4-8, (SVN revision 845)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
##  Path to GDAL shared files: C:/Users/User/Documents/R/win-library/3.6/rgdal/gdal
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: C:/Users/User/Documents/R/win-library/3.6/rgdal/proj
##  Linking to sp version: 1.4-1

## Loading required package: sf

## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3

## Loading required package: spdep

## Loading required package: spData

## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`

## Loading required package: tmap

## Loading required package: tidyverse

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

## Loading required package: httr

## Loading required package: mapview

## Loading required package: rvest

## Loading required package: xml2

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:purrr':
## 
##     pluck

## The following object is masked from 'package:readr':
## 
##     guess_encoding

Loading of Files

Load CSV data

Passenger_Volume <- read_csv("data/aspatial/passenger volume by busstop.csv")

## Parsed with column specification:
## cols(
##   YEAR_MONTH = col_character(),
##   DAY_TYPE = col_character(),
##   TIME_PER_HOUR = col_double(),
##   PT_TYPE = col_character(),
##   PT_CODE = col_character(),
##   TOTAL_TAP_IN_VOLUME = col_double(),
##   TOTAL_TAP_OUT_VOLUME = col_double()
## )

Population <- read_csv("data/aspatial/Singapore Residents by Planning AreaSubzone Age Group Sex and Type of Dwelling June 20112019/respopagesextod2011to2019.csv")

## Parsed with column specification:
## cols(
##   PA = col_character(),
##   SZ = col_character(),
##   AG = col_character(),
##   Sex = col_character(),
##   TOD = col_character(),
##   Pop = col_double(),
##   Time = col_double()
## )

BusStop_Location <- readOGR(dsn = "data/geospatial/BusStopLocation_Jan2020", 
                  layer = "BusStop")

## OGR data source with driver: ESRI Shapefile 
## Source: "C:\IS415-Geospatial Analytics and Applications\Take Home Exercise\Take-home_Ex01\data\geospatial\BusStopLocation_Jan2020", layer: "BusStop"
## with 5040 features
## It has 3 fields

Subzone <- readOGR(dsn = "data/geospatial/master-plan-2019-planning-area-boundary-no-sea", 
                  layer = "MP14_SUBZONE_WEB_PL")

## OGR data source with driver: ESRI Shapefile 
## Source: "C:\IS415-Geospatial Analytics and Applications\Take Home Exercise\Take-home_Ex01\data\geospatial\master-plan-2019-planning-area-boundary-no-sea", layer: "MP14_SUBZONE_WEB_PL"
## with 323 features
## It has 15 fields

Converting of data

Subzone <- st_as_sf(Subzone, 3414)

BusStop_Location <- st_as_sf(BusStop_Location, 3414)

Data Preparation

Population2019 <- Population %>%
  filter(Time == 2019) %>%
  spread(AG, Pop) %>% 
  mutate(`TOTAL`=rowSums(.[5:22])) %>%
  mutate_at(.vars = vars(PA, SZ), .funs = funs(toupper)) %>%
  group_by(SZ) %>%
  summarise(TOTAL_POPULATION = sum(TOTAL))

Passenger_Volume2020 <- Passenger_Volume %>%
   group_by(PT_CODE) %>%
   summarise_at(c("TOTAL_TAP_IN_VOLUME", "TOTAL_TAP_OUT_VOLUME"), sum)

Passenger_Volume2020$TOTAL_TAP_IN_VOLUME[is.na(Passenger_Volume2020$TOTAL_TAP_IN_VOLUME)] <- 0
Passenger_Volume2020$TOTAL_TAP_OUT_VOLUME[is.na(Passenger_Volume2020$TOTAL_TAP_OUT_VOLUME)] <- 0

Joining of tables

BusStop_Location<- left_join(BusStop_Location, Passenger_Volume2020, 
                              by = c("BUS_STOP_N" = "PT_CODE"))

Subzone<- left_join(Subzone, Population2019, 
                              by = c("SUBZONE_N" = "SZ"))

subzone_busstop <- st_join(Subzone, BusStop_Location)

Question 1 - Linear Regression

I have calibrated a simple linear regression to reveal the relation between public bus commuters’ flows (i.e. tap-in or tap-out) data and residential population at the planning sub-zone level.

Preparation of data

linear_regression <- subzone_busstop %>%
                     group_by(SUBZONE_N) %>%
                     summarise_at(c("TOTAL_TAP_IN_VOLUME", "TOTAL_TAP_OUT_VOLUME", "TOTAL_POPULATION"), sum)

linear_regression$TOTAL_TAP_IN_VOLUME[is.na(linear_regression$TOTAL_TAP_IN_VOLUME)] <- 0
linear_regression$TOTAL_TAP_OUT_VOLUME[is.na(linear_regression$TOTAL_TAP_OUT_VOLUME)] <- 0
linear_regression$TOTAL_POPULATION[is.na(linear_regression$TOTAL_POPULATION)] <- 0

Using Simple Linear Regression

I will first use the lm method to check the values for both the relationship between residential population and tap in as well as residential population between tap-out.

fips <- order(linear_regression$SUBZONE_N) 
slr = lm(TOTAL_POPULATION~TOTAL_TAP_IN_VOLUME + TOTAL_TAP_OUT_VOLUME, data=linear_regression)
summary(slr)

## 
## Call:
## lm(formula = TOTAL_POPULATION ~ TOTAL_TAP_IN_VOLUME + TOTAL_TAP_OUT_VOLUME, 
##     data = linear_regression)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2193963  -386510  -225203   122297 11875154 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           4.026e+05  7.395e+04   5.445 1.03e-07 ***
## TOTAL_TAP_IN_VOLUME  -2.196e+00  9.688e-01  -2.266 0.024103 *  
## TOTAL_TAP_OUT_VOLUME  3.752e+00  9.850e-01   3.809 0.000167 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1024000 on 320 degrees of freedom
## Multiple R-squared:  0.2258, Adjusted R-squared:  0.221 
## F-statistic: 46.67 on 2 and 320 DF,  p-value: < 2.2e-16

Based on the results above, it can be observed if the p-value of the coefficients is bigger than 0.05 which means that this predictor is not meaningful for the regression. If the p-value of the coefficients is smaller than 0.05, it means that the attribute is an excellent addition to the model. When we look at the P-Vales for slr, it can be observed that the attributes TAP_IN_VOLUME and TAP_OUT_VOLUME affects TOTAL POPULATION which is the number of residential population.

Besides that, it can be seen that the value of the multiplied R-square and Adjusted R-square is bigger than 0.5, which means both values are rather important.

plot(slr$residuals, pch = 16, col = "red")

It can be seen that the residuals are rather random, so this means that the simple linear model will be able to generalize well.

par(mfrow=c(1,2))
scatter.smooth(x=linear_regression$TOTAL_POPULATION, y=linear_regression$TOTAL_TAP_IN_VOLUME, main="Residential Population ~ Public Bus Commuters (In)")
scatter.smooth(x=linear_regression$TOTAL_POPULATION, y=linear_regression$TOTAL_TAP_OUT_VOLUME, main="Residential Population ~ Public Bus Commuters (Out)")

However, when you look at the 2 scatter plots diagram above. It is rather hard to see the distinction because the points are clumped together at the bottom left of the graphs which is hard to view. This makes it hard to determine whether there is a linearly increasing relationship between the ‘population’ and ‘tap in/tap out’ variables. Hence, we will check whether the values are close to normality.

par(mfrow=c(2, 2)) 
plot(density(linear_regression$TOTAL_POPULATION), main="Density Plot: Population", ylab="Frequency", sub=paste("Skewness:", round(e1071::skewness(linear_regression$TOTAL_POPULATION), 2))) 
polygon(density(linear_regression$TOTAL_POPULATION), col="red")
plot(density(linear_regression$TOTAL_TAP_IN_VOLUME), main="Density Plot: Tap in volume", ylab="Frequency", sub=paste("Skewness:", round(e1071::skewness(linear_regression$TOTAL_TAP_IN_VOLUME), 2)))
polygon(density(linear_regression$TOTAL_TAP_IN_VOLUME), col="red")
plot(density(linear_regression$TOTAL_TAP_OUT_VOLUME), main="Density Plot: Tap out volume", ylab="Frequency", sub=paste("Skewness:", round(e1071::skewness(linear_regression$TOTAL_TAP_OUT_VOLUME), 2)))
polygon(density(linear_regression$TOTAL_TAP_OUT_VOLUME), col="red")

It can be seen that there seems to be a slight skewness to the right. Hence, I will apply common transformations for right-skewed data: square root, cube root, and log. This would help to improves the distribution of the data.

cor(linear_regression$TOTAL_POPULATION, linear_regression$TOTAL_TAP_IN_VOLUME)

## [1] 0.4367158

cor(linear_regression$TOTAL_POPULATION, linear_regression$TOTAL_TAP_OUT_VOLUME)

## [1] 0.461949

After testing the coorelation, it can be seen there seems to be somewhat of a high coorelation between total population with tap in volume and total population with tap out volume. However, in order to fully ensure that there is a relationship between residential population and tap in as well as residential population between tap-out. We will be transforming the data.

Transforming of data

linear_regression1 <- linear_regression %>%
                      group_by(SUBZONE_N) %>%
                      summarise_at(c("TOTAL_TAP_IN_VOLUME", "TOTAL_TAP_OUT_VOLUME", "TOTAL_POPULATION"), log) %>%
                      filter_if(~is.numeric(.), all_vars(!is.infinite(.)))

slr1 = lm(TOTAL_POPULATION~TOTAL_TAP_IN_VOLUME + TOTAL_TAP_OUT_VOLUME, data=linear_regression1)
summary(slr1)

## 
## Call:
## lm(formula = TOTAL_POPULATION ~ TOTAL_TAP_IN_VOLUME + TOTAL_TAP_OUT_VOLUME, 
##     data = linear_regression1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.23386 -0.45270  0.06608  0.55313  2.28221 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.81007    0.43869  13.244  < 2e-16 ***
## TOTAL_TAP_IN_VOLUME   0.03412    0.13567   0.251 0.801654    
## TOTAL_TAP_OUT_VOLUME  0.56361    0.14304   3.940 0.000104 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7729 on 267 degrees of freedom
## Multiple R-squared:  0.5111, Adjusted R-squared:  0.5074 
## F-statistic: 139.6 on 2 and 267 DF,  p-value: < 2.2e-16

Based on the results above, it can be observed if the p-value of the coefficients is bigger than 0.05 which means that this predictor is not meaningful for the regression. If the p-value of the coefficients is smaller than 0.05, it means that the attribute is an excellent addition to the model. When we look at the P-Vales for slr, it can be observed that the attributes TAP_IN_VOLUME is not meaningful for this regression. But on the other hand, the tap out volume is an excellent addition to the model.

Besides that, it can be seen that the value of the multiplied R-square and Adjusted R-square is bigger than 0.5, which means both values are rather important.

plot(slr1$residuals, pch = 16, col = "red")

It can be seen that the residuals are rather random, so this means that the simple linear model will be able to generalize well.

par(mfrow=c(1,2))
scatter.smooth(x=linear_regression1$TOTAL_POPULATION, y=linear_regression1$TOTAL_TAP_IN_VOLUME, main="Residential Population ~ Public Bus Commuters (In)")
scatter.smooth(x=linear_regression1$TOTAL_POPULATION, y=linear_regression1$TOTAL_TAP_OUT_VOLUME, main="Residential Population ~ Public Bus Commuters (Out)")

However, when you look at the 2 scatter plots diagram above, it has improved from the previous graphs. It makes it much easier to see that there is a linearly increasing relationship between the ‘population’ and ‘tap in/tap out’ variables.

par(mfrow=c(2, 2)) 
plot(density(linear_regression1$TOTAL_POPULATION), main="Density Plot: Population", ylab="Frequency", sub=paste("Skewness:", round(e1071::skewness(linear_regression1$TOTAL_POPULATION), 2))) 
polygon(density(linear_regression1$TOTAL_POPULATION), col="red")
plot(density(linear_regression1$TOTAL_TAP_IN_VOLUME), main="Density Plot: Tap in volume", ylab="Frequency", sub=paste("Skewness:", round(e1071::skewness(linear_regression1$TOTAL_TAP_IN_VOLUME), 2)))
polygon(density(linear_regression1$TOTAL_TAP_IN_VOLUME), col="red")
plot(density(linear_regression1$TOTAL_TAP_OUT_VOLUME), main="Density Plot: Tap out volume", ylab="Frequency", sub=paste("Skewness:", round(e1071::skewness(linear_regression1$TOTAL_TAP_OUT_VOLUME), 2)))
polygon(density(linear_regression1$TOTAL_TAP_OUT_VOLUME), col="red")

When we look at the normality, the skewness is much better than before as well.

cor(linear_regression1$TOTAL_POPULATION, linear_regression1$TOTAL_TAP_IN_VOLUME)

## [1] 0.6947402

cor(linear_regression1$TOTAL_POPULATION, linear_regression1$TOTAL_TAP_OUT_VOLUME)

## [1] 0.7148276

After testing the coorelation, it can be seen there seems to be somewhat of a high coorelation between total population with tap in volume and total population with tap out volume.

Hence, with all these diagrams. We are able to see that there is a linearly increasing relationship between population and tap in as well as population and tap out.

Question 2

We will be performing a autocorrelation analysis on the residual of the regression model to test if the model conforms to the randomization assumption.

For our hypothesis,

Our null hypothesis will be testing whether the residual of the regression model is randomly distruted among the different subzones in Singapore. Our alternate hypothesis will be testing whether the residual of the regression model is not randomly distruted among the different subzones in Singapore.

The confidence interval that we will be using is 95%.

tmap_mode("view")

## tmap mode set to interactive viewing

tm_shape(linear_regression)+
  tm_fill("TOTAL_POPULATION",
          n = 6,
          style = "quantile",
          palette = "Blues") + 
  tm_borders(alpha = 0.5)

tm_shape(linear_regression)+
  tm_fill("TOTAL_TAP_IN_VOLUME",
          n = 6,
          style = "quantile",
          palette = "Blues") + 
  tm_borders(alpha = 0.5)

tm_shape(linear_regression)+
  tm_fill("TOTAL_TAP_OUT_VOLUME",
          n = 6,
          style = "quantile",
          palette = "Blues") + 
  tm_borders(alpha = 0.5)

We will first transform the subzone_busstop data into a spatialpointsdataframe so as to perform the weight matrix. I will be using the skewed data for analysis, hence we will be using knearest neighbour as our weighted matrix.

wm <- as_Spatial(linear_regression)

wm <- spTransform(wm, CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))

Since there are only 323 elemnts, we will be setting the number of neighbours as 20.

coords <- coordinates(wm)
wm_knn20 <- knn2nb(knearneigh(coords, k=20))
wm_knn20

## Neighbour list object:
## Number of regions: 323 
## Number of nonzero links: 6460 
## Percentage nonzero weights: 6.19195 
## Average number of links: 20 
## Non-symmetric neighbours list

str(wm_knn20)

## List of 323
##  $ : int [1:20] 127 150 151 164 177 228 229 231 232 233 ...
##  $ : int [1:20] 20 67 99 100 101 107 113 120 121 130 ...
##  $ : int [1:20] 4 36 45 61 68 92 119 126 153 159 ...
##  $ : int [1:20] 3 35 36 45 68 92 102 119 135 153 ...
##  $ : int [1:20] 18 23 65 83 84 107 114 115 116 117 ...
##  $ : int [1:20] 31 32 33 56 57 66 74 89 94 96 ...
##  $ : int [1:20] 62 78 99 100 101 121 143 161 178 207 ...
##  $ : int [1:20] 21 46 52 124 141 160 226 230 235 243 ...
##  $ : int [1:20] 12 22 38 39 48 49 54 55 60 73 ...
##  $ : int [1:20] 18 25 27 77 83 114 118 129 133 137 ...
##  $ : int [1:20] 31 32 40 50 51 66 75 82 86 89 ...
##  $ : int [1:20] 22 28 30 38 39 48 49 53 55 60 ...
##  $ : int [1:20] 14 15 16 72 79 81 84 113 120 122 ...
##  $ : int [1:20] 13 15 16 72 79 81 84 113 120 122 ...
##  $ : int [1:20] 2 13 14 16 67 81 84 113 120 125 ...
##  $ : int [1:20] 13 14 15 72 79 81 84 113 120 122 ...
##  $ : int [1:20] 28 30 53 69 77 80 104 117 118 133 ...
##  $ : int [1:20] 10 23 25 77 83 114 115 116 118 129 ...
##  $ : int [1:20] 24 47 90 91 106 110 111 112 128 138 ...
##  $ : int [1:20] 18 25 67 83 107 115 129 130 139 140 ...
##  $ : int [1:20] 8 20 25 27 46 52 129 139 141 160 ...
##  $ : int [1:20] 12 28 38 39 48 49 53 55 60 69 ...
##  $ : int [1:20] 5 18 30 65 77 83 114 115 116 117 ...
##  $ : int [1:20] 19 29 34 47 97 108 111 112 128 132 ...
##  $ : int [1:20] 10 18 20 21 27 83 118 129 139 141 ...
##  $ : int [1:20] 35 37 45 69 87 102 104 135 147 165 ...
##  $ : int [1:20] 10 18 21 25 52 129 139 141 149 160 ...
##  $ : int [1:20] 17 22 30 53 55 60 69 80 104 117 ...
##  $ : int [1:20] 31 32 33 34 86 89 94 97 98 108 ...
##  $ : int [1:20] 17 22 28 53 60 65 69 77 80 104 ...
##  $ : int [1:20] 6 29 32 33 34 74 86 89 94 98 ...
##  $ : int [1:20] 6 29 31 33 34 66 74 86 89 94 ...
##  $ : int [1:20] 6 29 31 32 34 57 74 89 94 98 ...
##  $ : int [1:20] 29 31 32 33 74 86 89 94 97 98 ...
##  $ : int [1:20] 4 36 45 55 92 102 119 135 180 182 ...
##  $ : int [1:20] 3 4 35 45 68 73 92 119 153 159 ...
##  $ : int [1:20] 17 26 69 87 102 104 118 135 147 165 ...
##  $ : int [1:20] 9 12 22 39 48 49 53 55 60 73 ...
##  $ : int [1:20] 9 12 22 38 48 49 53 55 60 73 ...
##  $ : int [1:20] 11 66 75 82 86 93 94 105 124 172 ...
##  $ : int [1:20] 13 14 15 16 42 43 44 79 145 146 ...
##  $ : int [1:20] 13 14 15 16 41 43 44 79 145 146 ...
##  $ : int [1:20] 41 44 63 79 145 146 187 188 189 190 ...
##  $ : int [1:20] 14 15 16 41 43 79 145 146 187 188 ...
##  $ : int [1:20] 3 4 26 35 36 92 102 135 153 170 ...
##  $ : int [1:20] 8 21 52 124 141 160 226 230 235 242 ...
##  $ : int [1:20] 19 24 34 97 108 110 111 112 128 132 ...
##  $ : int [1:20] 9 12 22 28 38 39 49 53 55 60 ...
##  $ : int [1:20] 9 12 22 38 39 48 55 60 73 80 ...
##  $ : int [1:20] 11 29 31 51 66 75 82 86 89 94 ...
##  $ : int [1:20] 11 29 50 66 75 82 86 89 98 105 ...
##  $ : int [1:20] 8 21 27 46 124 130 139 141 160 226 ...
##  $ : int [1:20] 12 17 22 28 30 48 55 60 65 69 ...
##  $ : int [1:20] 9 38 39 48 49 73 119 162 196 200 ...
##  $ : int [1:20] 22 28 38 48 49 53 60 69 80 102 ...
##  $ : int [1:20] 6 33 57 58 59 70 74 85 96 103 ...
##  $ : int [1:20] 6 33 56 58 59 70 74 85 96 103 ...
##  $ : int [1:20] 56 57 59 70 74 85 96 103 171 179 ...
##  $ : int [1:20] 56 57 58 70 74 85 95 96 171 179 ...
##  $ : int [1:20] 12 22 28 30 38 39 48 49 53 55 ...
##  $ : int [1:20] 3 4 64 68 70 76 85 95 126 134 ...
##  $ : int [1:20] 7 78 99 100 101 121 130 142 143 161 ...
##  $ : int [1:20] 7 62 100 142 143 161 178 189 190 191 ...
##  $ : int [1:20] 61 70 71 76 85 93 95 96 134 153 ...
##  $ : int [1:20] 17 23 28 30 53 77 114 116 117 133 ...
##  $ : int [1:20] 6 11 29 31 32 33 40 50 75 82 ...
##  $ : int [1:20] 2 20 99 100 101 107 113 120 121 130 ...
##  $ : int [1:20] 3 4 36 45 92 119 126 153 159 163 ...
##  $ : int [1:20] 17 22 28 30 37 53 55 80 102 104 ...
##  $ : int [1:20] 56 57 58 59 61 85 95 96 126 134 ...
##  $ : int [1:20] 26 37 64 76 87 93 134 149 166 168 ...
##  $ : int [1:20] 5 13 14 16 23 81 84 113 115 116 ...
##  $ : int [1:20] 9 35 38 39 48 49 54 55 119 162 ...
##  $ : int [1:20] 6 31 32 33 56 57 58 59 103 108 ...
##  $ : int [1:20] 11 40 50 51 66 82 86 89 94 98 ...
##  $ : int [1:20] 45 61 64 71 85 93 95 96 134 153 ...
##  $ : int [1:20] 17 28 30 65 69 104 114 117 118 133 ...
##  $ : int [1:20] 7 62 99 100 101 161 206 207 211 217 ...
##  $ : int [1:20] 14 15 16 41 43 44 145 146 187 188 ...
##  $ : int [1:20] 17 22 28 30 48 53 55 60 69 102 ...
##  $ : int [1:20] 2 5 13 14 15 16 72 84 113 120 ...
##  $ : int [1:20] 11 50 51 66 75 86 105 123 131 152 ...
##  $ : int [1:20] 5 10 18 20 23 107 114 115 116 118 ...
##  $ : int [1:20] 5 23 72 81 83 107 113 115 116 120 ...
##  $ : int [1:20] 56 57 59 61 64 70 76 95 96 134 ...
##  $ : int [1:20] 11 29 31 32 33 34 50 51 66 75 ...
##  $ : int [1:20] 26 37 69 77 104 118 135 147 149 165 ...
##  $ : int [1:20] 1 131 150 152 164 176 215 228 231 232 ...
##  $ : int [1:20] 6 29 31 32 33 34 66 86 94 98 ...
##  $ : int [1:20] 19 47 91 106 109 110 112 128 138 202 ...
##  $ : int [1:20] 19 24 47 90 106 110 112 128 138 202 ...
##  $ : int [1:20] 3 4 35 36 45 68 102 119 135 180 ...
##  $ : int [1:20] 61 64 71 76 85 87 95 96 134 149 ...
##  $ : int [1:20] 6 11 29 31 32 33 34 66 75 86 ...
##  $ : int [1:20] 3 61 64 70 76 85 96 126 134 153 ...
##  $ : int [1:20] 6 56 57 59 61 64 70 74 76 85 ...
##  $ : int [1:20] 24 29 31 34 47 89 98 108 111 112 ...
##  $ : int [1:20] 29 31 32 33 34 50 51 66 86 89 ...
##  $ : int [1:20] 2 7 62 67 100 101 121 130 140 142 ...
##   [list output truncated]
##  - attr(*, "region.id")= chr [1:323] "1" "2" "3" "4" ...
##  - attr(*, "call")= language knearneigh(x = coords, k = 20)
##  - attr(*, "sym")= logi FALSE
##  - attr(*, "type")= chr "knn"
##  - attr(*, "knn-k")= num 20
##  - attr(*, "class")= chr "nb"

Based on the weighted matrix that we have, now we will compute the spatial autocorrelation analysis on the residual of the regression model to test if the model conforms to the randomization assumption. We will be using Moran’s I.

fips <- order(linear_regression$SUBZONE_N) 
lm.morantest(slr, nb2listw(wm_knn20, style="W"), resfun=weighted.residuals)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = TOTAL_POPULATION ~ TOTAL_TAP_IN_VOLUME +
## TOTAL_TAP_OUT_VOLUME, data = linear_regression)
## weights: nb2listw(wm_knn20, style = "W")
## 
## Moran I statistic standard deviate = 8.6158, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.1347985259    -0.0033538306     0.0002571162

Based on the above statistics, we can conclude that the P-Value is 0.00000000000000022 which is smaller than 0.05. Hence, this means that we can reject the null hypothesis. We can then draw the conclusion that the distribution of the residual of the the regression model is not randomly distributed. This will allow us to infer that the distribution is a cluster distribution since the Moran I value is 0.13 which is positive spatial autocorrelation which means that it shows signs of clustering. However it is not a strong cluster as it is rather close to 0.

Question 3

I will be performinga localized geospatial statistics analysis by using commuters’ tap-in and tap-out data to identify geographical clustering.

I will first be doing on Tap in data.

wm_knn20 <- nb2listw(wm_knn20, style="W", zero.policy = TRUE)
ffips <- order(linear_regression$SUBZONE_N) 
localMI <- localmoran(linear_regression$TOTAL_TAP_IN_VOLUME, wm_knn20)
head(localMI)

##            Ii        E.Ii     Var.Ii       Z.Ii    Pr(z > 0)
## 1  0.04549968 -0.00310559 0.04622938  0.2260603 0.4105772576
## 2 -0.19326217 -0.00310559 0.04622938 -0.8844073 0.8117618132
## 3 -0.09339476 -0.00310559 0.04622938 -0.4199297 0.6627316050
## 4  0.12038482 -0.00310559 0.04622938  0.5743468 0.2828665673
## 5  0.77135664 -0.00310559 0.04622938  3.6019794 0.0001579017
## 6  0.33461883 -0.00310559 0.04622938  1.5707369 0.0581218868

wm.localMI <- cbind(wm,localMI)

Plotting of choropleth map

localMI.map <- tm_shape(wm.localMI) +
  tm_fill(col = "Ii", 
          style = "pretty", 
          palette="RdGy",
          title = "local moran statistics") +
  tm_borders(alpha = 0.5)

pvalue.map <- tm_shape(wm.localMI) +
  tm_fill(col = "Pr.z...0.", 
          breaks=c(-Inf, 0.001, 0.01, 0.05, 0.1, Inf),
          palette="-Blues", 
          title = "local Moran's I p-vaiues") +
  tm_borders(alpha = 0.5)

tmap_arrange(localMI.map, pvalue.map, asp=1, ncol=2)

## Warning: The shape wm.localMI is invalid. See sf::st_is_valid

## Variable(s) "Ii" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

## Warning: The shape wm.localMI is invalid. See sf::st_is_valid

From here, we can see that in the local moran statistisc, those with value negative are the outliers and those with the positive values are the clusters. Hence, we can see that Geylang East and Kembangan is one of the clusters available. We can also see that Geylang East and Kembangan has one of the lowest p-values which makes it statistically more significant.

ffips <- order(linear_regression$SUBZONE_N) 
localMI1 <- localmoran(linear_regression$TOTAL_TAP_OUT_VOLUME, wm_knn20)
head(localMI1)

##            Ii        E.Ii     Var.Ii        Z.Ii    Pr(z > 0)
## 1  0.01510003 -0.00310559 0.04632076  0.08458972 0.4662937869
## 2 -0.18857966 -0.00310559 0.04632076 -0.86177791 0.8055951284
## 3 -0.14548423 -0.00310559 0.04632076 -0.66154136 0.7458674004
## 4  0.12690884 -0.00310559 0.04632076  0.60409285 0.2728909580
## 5  0.78938920 -0.00310559 0.04632076  3.68221014 0.0001156103
## 6  0.28979963 -0.00310559 0.04632076  1.36094087 0.0867661878

wm.localMI1 <- cbind(wm,localMI1)