library(tidyverse)
library(tmap)
library(geojsonio)
library(plotly)
library(rgdal)
library(sf)
library(sp)
library(spdep)
library(janitor)
library(here)
library(broom)
library(tidypredict)
library(car)
library(corrr)
library(ggcorrplot)
library(spatialreg)
library(spgwr)

Load Data

Londonwards <- st_read(here("../wk1",
               "statistical-gis-boundaries-london", 
               "ESRI", "London_Ward_CityMerged.shp"))
## Reading layer `London_Ward_CityMerged' from data source 
##   `N:\GIS\wk1\statistical-gis-boundaries-london\ESRI\London_Ward_CityMerged.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 625 features and 7 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 503568.2 ymin: 155850.8 xmax: 561957.5 ymax: 200933.9
## Projected CRS: OSGB 1936 / British National Grid
# qtm(Londonwards)

# specify some likely 'n/a' values
# also specify Latin1 as encoding as there is a pound sign (£) in one of the column headers
LondonWardProfiles <- read_csv("https://data.london.gov.uk/download/ward-profiles-and-atlas/772d2d64-e8c6-46cb-86f9-e52b4c7851bc/ward-profiles-excel-version.csv", 
                               na = c("", "NA", "n/a"), 
                               locale = locale(encoding = 'latin1'), 
                               col_names = TRUE)
## Rows: 660 Columns: 67
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (3): Ward name, Old code, New code
## dbl (64): Population - 2015, Children aged 0-15 - 2015, Working-age (16-64) ...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# check all of the columns have been read in correctly
Datatypelist <- LondonWardProfiles %>% 
  summarise_all(class) %>%
  pivot_longer(everything(), 
               names_to="All_variables", 
               values_to="Variable_class")
# Datatypelist
# merge boundaries and data
LonWardProfiles <- Londonwards %>%
  left_join(.,
            LondonWardProfiles, 
            by = c("GSS_CODE" = "New code"))

# let's map our dependent variable to see if the join has worked:
tmap_mode("plot")
## tmap mode set to plotting
qtm(LonWardProfiles, 
    fill = "Average GCSE capped point scores - 2014", 
    borders = NULL,  
    fill.palette = "Blues")

london_schools <- read_csv("https://data.london.gov.uk/download/london-schools-atlas/57046151-39a0-45d9-8dc0-27ea7fd02de8/all_schools_xy_2016.csv")
## Rows: 3889 Columns: 24
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (14): SCHOOL_NAM, TYPE, PHASE, ADDRESS, TOWN, POSTCODE, STATUS, GENDER, ...
## dbl  (7): URN, EASTING, NORTHING, map_icon_l, Primary, x, y
## lgl  (2): NEW_URN, OLD_URN
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# from the coordinate values stored in the x and y columns, which look like they are latitude and longitude values, create a new points dataset
lon_schools_sf <- st_as_sf(london_schools, 
                           coords = c("x","y"), 
                           crs = 4326)

lond_sec_schools_sf <- lon_schools_sf %>%
  filter(PHASE=="Secondary")

tm_shape(Londonwards) + 
  tm_polygons(col = NA, alpha = 0.5) +
  tm_shape(lond_sec_schools_sf) +
  tm_dots(col = "blue")

Linear Regression

# scatter plot
q <- qplot(x = `Unauthorised Absence in All Schools (%) - 2013`, 
           y = `Average GCSE capped point scores - 2014`, 
           data = LonWardProfiles)

# plot with a regression line
q + stat_smooth(method="lm", se=FALSE, size=1)
## `geom_smooth()` using formula 'y ~ x'

# + geom_jitter()
# jittering moves the points so they are not all on top of each other,
# but it displaces points from their true values, so it is best avoided here

LonWardProfiles <- LonWardProfiles %>%
  clean_names()

# run the linear regression model and store its outputs in an object called model1
Regressiondata <- LonWardProfiles%>%
  dplyr::select(average_gcse_capped_point_scores_2014, 
                unauthorised_absence_in_all_schools_percent_2013)

model1 <- Regressiondata %>%
  lm(average_gcse_capped_point_scores_2014 ~
       unauthorised_absence_in_all_schools_percent_2013,
     data = .)

summary(model1)
## 
## Call:
## lm(formula = average_gcse_capped_point_scores_2014 ~ unauthorised_absence_in_all_schools_percent_2013, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.753 -10.223  -1.063   8.547  61.842 
## 
## Coefficients:
##                                                  Estimate Std. Error t value
## (Intercept)                                       371.471      2.165   171.6
## unauthorised_absence_in_all_schools_percent_2013  -41.237      1.927   -21.4
##                                                  Pr(>|t|)    
## (Intercept)                                        <2e-16 ***
## unauthorised_absence_in_all_schools_percent_2013   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.39 on 624 degrees of freedom
## Multiple R-squared:  0.4233, Adjusted R-squared:  0.4224 
## F-statistic:   458 on 1 and 624 DF,  p-value: < 2.2e-16
tidy(model1)
## # A tibble: 2 x 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                                 371.       2.16     172.  0       
## 2 unauthorised_absence_in_all_schools_per~    -41.2      1.93     -21.4 1.27e-76
glance(model1)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.423         0.422  16.4      458. 1.27e-76     1 -2638. 5282. 5296.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# from the broom package

# see the predictions for each point
# Regressiondata %>%
#   tidypredict_to_column(model1)

Assumptions of Linear Regression

Assumption 1: Linear relationship between x and y

First, check the distribution of our variables. If both variables are roughly normally distributed, then any correlation between them is likely to take the form of a linear relationship.

# basic histogram of counts
# ggplot(LonWardProfiles, aes(x=average_gcse_capped_point_scores_2014)) + 
#   geom_histogram(binwidth = 5)
  
ggplot(LonWardProfiles, aes(x=average_gcse_capped_point_scores_2014)) + 
  geom_histogram(aes(y = ..density..), 
                 # use density so that we can overlay the smoothed density estimate
                 binwidth = 5) + 
  geom_density(colour="red", 
               size=1, 
               adjust=1)

ggplot(LonWardProfiles, 
       aes(x = unauthorised_absence_in_all_schools_percent_2013)) +
  geom_histogram(aes(y = ..density..),
                 binwidth = 0.1) + 
  geom_density(colour="red",
               size=1, 
               adjust=1)

ggplot(LonWardProfiles, aes(x=median_house_price_2014)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(x = median_house_price_2014, 
      y = average_gcse_capped_point_scores_2014, 
      data = LonWardProfiles)

# log transformation
ggplot(LonWardProfiles, aes(x=log(median_house_price_2014))) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# symbox function from car package
# look for the most normal distribution
# -1 here is the only power that doesn't have any outliers
symbox(~median_house_price_2014, 
       LonWardProfiles, 
       na.rm = T,
       powers = seq(-3, 3, by = .5))

ggplot(LonWardProfiles, aes(x=(median_house_price_2014)^-1)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(x = (median_house_price_2014)^-1, 
      y = average_gcse_capped_point_scores_2014,
      data = LonWardProfiles)

qplot(x = log(median_house_price_2014), 
      y = average_gcse_capped_point_scores_2014, 
      data = LonWardProfiles)

# be careful about interpreting models after transforming the variables
# https://data.library.virginia.edu/interpreting-log-transformations-in-a-linear-model/

Assumption 2: Normally distributed residuals

# save the residuals into your dataframe
model_data <- model1 %>%
  augment(., Regressiondata)

# plot residuals
qplot(x = .resid, 
      data = model_data,
      main = "Distribution of residuals",
      xlab = "residuals", 
      ylab = "count") +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Assumption 3: No multicollinearity in independent variables

# add another variable (house price) into the model
Regressiondata2 <- LonWardProfiles %>%
  dplyr::select(average_gcse_capped_point_scores_2014,
                unauthorised_absence_in_all_schools_percent_2013,
                median_house_price_2014)

model2 <- lm(average_gcse_capped_point_scores_2014 ~ unauthorised_absence_in_all_schools_percent_2013 + 
               I(median_house_price_2014^-1), 
             data = Regressiondata2)

#show the summary of those outputs
tidy(model2)
## # A tibble: 3 x 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                                3.80e2      2.31    165.   0       
## 2 unauthorised_absence_in_all_schools_pe~   -3.52e1      1.97    -17.9  4.02e-58
## 3 I(median_house_price_2014^-1)             -5.42e6 654847.       -8.28 7.62e-16
glance(model2)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.480         0.479  15.6      288. 2.60e-89     2 -2605. 5219. 5237.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# and for future use, write the residuals out
model_data2 <- model2 %>%
  augment(., Regressiondata2)

# also add them to the shapelayer
LonWardProfiles <- LonWardProfiles %>%
  mutate(model2resids = residuals(model2))

Correlation <- LonWardProfiles %>%
  st_drop_geometry() %>%
  dplyr::select(average_gcse_capped_point_scores_2014,
                unauthorised_absence_in_all_schools_percent_2013,
                median_house_price_2014) %>%
  mutate(transformed_house_price = I(median_house_price_2014^-1)) %>%
  rename(., 
         unauthorised_absence = unauthorised_absence_in_all_schools_percent_2013) %>%
  correlate() %>%
  # just focus on unauthorised absence and house prices
  # remove the dependent variable
  focus(-average_gcse_capped_point_scores_2014, 
        -median_house_price_2014,
        mirror = TRUE)
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlation <- Correlation %>% 
  column_to_rownames(., var = "term")

# corr_matrix <- data.matrix(Correlation)

# visualise the correlation matrix
ggcorrplot(Correlation, type = "lower")

# correlation matrix with more variables
Correlation_all <- LonWardProfiles %>%
  st_drop_geometry() %>%
  dplyr::select(c(10:74)) %>%
  correlate()
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
rplot(Correlation_all)
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

# another way is to check VIF
vif(model2)
## unauthorised_absence_in_all_schools_percent_2013 
##                                         1.156823 
##                    I(median_house_price_2014^-1) 
##                                         1.156823

Assumption 4: Homoscedasticity

Equal variance: the residuals should have roughly constant variance across the range of fitted values (check the Scale-Location panel in the diagnostic plots below).

# print some model diagnostics
par(mar=c(1,1,1,1))
par(mfrow=c(2,2)) #plot to 2 by 2 array
plot(model2)

par(mfrow=c(1,1)) #change back
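
As an optional, more formal check (a sketch assuming the lmtest package is installed; it is not loaded above), the Breusch-Pagan test can back up what the diagnostic plots show: a small p-value suggests the residual variance is not constant.

# Breusch-Pagan test for heteroscedasticity in model2
library(lmtest)
bptest(model2)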

Assumption 5: Independent residuals

For non-spatial data, we can check for autocorrelation in the residuals with the Durbin-Watson test.

For spatial data, the residuals should not show spatial autocorrelation. Last week we looked for spatial clustering using Moran’s I and related statistics – these are measures of spatial autocorrelation.
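
As a reminder, for a variable x with spatial weights w_ij, Moran’s I takes the form:

I = (n / S0) * Σi Σj w_ij (x_i - mean(x)) (x_j - mean(x)) / Σi (x_i - mean(x))^2,   where S0 = Σi Σj w_ij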

# run durbin-watson test
# DW <- durbinWatsonTest(model2)
# tidy(DW)

# map the residuals and look for obvious patterns
# e.g. some blue areas next to other blue areas and some red/orange areas next to other red/orange areas
# see tmaptools::palette_explorer() for the named palettes
# qtm(LonWardProfiles, fill = "model2resids", fill.palette="PiYG")

tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(LonWardProfiles) +
  tm_polygons("model2resids",
              palette = "RdYlBu") +
  tm_shape(lond_sec_schools_sf) + 
  tm_dots(col = "TYPE")
## Variable(s) "model2resids" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
# preparation for Moran's I
# calculate the centroids of all Wards in London
coordsW <- LonWardProfiles %>%
  st_centroid() %>%
  st_geometry()
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
# plot(coordsW)

# the poly2nb function builds a neighbours list based on regions with contiguous boundaries, that is sharing one or more boundary point
# queen = TRUE means a single shared boundary point meets the contiguity condition
LWard_nb <- LonWardProfiles %>%
  poly2nb(., queen=T)
# summary(LWard_nb)

plot(LWard_nb, st_geometry(coordsW), col="red")
plot(LonWardProfiles$geometry, add = T)

# knearneigh returns a matrix with the indices of the k nearest neighbours of each point
knn_wards <- coordsW %>%
  knearneigh(., k=4)
## Warning in knearneigh(., k = 4): knearneigh: identical points found
## Warning in knearneigh(., k = 4): knearneigh: kd_tree not available for identical
## points
# convert the knn object returned by knearneigh into a neighbours list
LWard_knn <- knn_wards %>%
  knn2nb()

plot(LWard_knn, st_geometry(coordsW), col="blue")
plot(LonWardProfiles$geometry, add = T)

# add spatial weights to neighbours list
Lward.queens_weight <- LWard_nb %>%
  nb2listw(., style="C")
# C is globally standardised (sums over all links to n)
# B is the basic binary coding
# W is row standardised (sums over all links to n)

Lward.knn_4_weight <- LWard_knn %>%
  nb2listw(., style="C")

# run Moran's I test for spatial autocorrelation
Queen <- moran.test(LonWardProfiles$model2resids, 
                    Lward.queens_weight) %>%
  tidy()

# Queen

Nearest_neighbour <- moran.test(LonWardProfiles$model2resids,
                                Lward.knn_4_weight) %>%
  tidy()

# Nearest_neighbour

We can see that the Moran’s I statistic is somewhere between 0.26 and 0.29. Remembering that Moran’s I ranges between -1 and +1 (with 0 indicating no spatial autocorrelation), we can conclude that there is weak to moderate spatial autocorrelation in our residuals.

How to deal with this?

  • Spatially lagged regression model
  • Spatial error model

Spatially lagged regression model

The model incorporates spatial dependence explicitly by adding a “spatially lagged” dependent variable y on the right-hand side of the regression equation.

This decomposes the error term of the standard model into a spatially lagged term for the dependent variable (which is correlated with the dependent variable) and an independent error term.

Rho is the coefficient on the spatial lag – the value of the dependent variable in the surrounding areas, as defined by the spatial weights matrix. The lag acts as an extra explanatory variable that accounts for the clustering identified by Moran’s I.
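
In standard notation, the spatially lagged model takes the form:

y = ρWy + Xβ + ε

where Wy is the spatially lagged dependent variable built from the spatial weights matrix W, ρ is the spatial autoregressive parameter estimated below, and ε is the error term.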

# original Model
# model2 <- lm(average_gcse_capped_point_scores_2014 ~
#                unauthorised_absence_in_all_schools_percent_2013 +
#                I(median_house_price_2014^-1),
#              data = Regressiondata2)
# 
# tidy(model2)

# run the spatially-lagged regression model with the spatial weights matrix from earlier
# Spatial simultaneous autoregressive lag model estimation
# raising house price to the power of -1 causes an error, so use log transformation instead
splag_model <- lagsarlm(average_gcse_capped_point_scores_2014 ~
                          unauthorised_absence_in_all_schools_percent_2013 +
                          log(median_house_price_2014), 
                        data = Regressiondata2, 
                        Lward.queens_weight)

# what do the outputs show?
tidy(splag_model)
## # A tibble: 4 x 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 rho                                       5.16e-3   0.00759     0.679 4.97e- 1
## 2 (Intercept)                               2.02e+2  20.1        10.1   0       
## 3 unauthorised_absence_in_all_schools_per~ -3.62e+1   1.91      -18.9   0       
## 4 log(median_house_price_2014)              1.26e+1   1.53        8.21  2.22e-16
# results show that rho is statistically insignificant
# there is an insignificant and small effect associated with the spatially lagged dependent variable

# model stats
glance(splag_model)
## # A tibble: 1 x 6
##   r.squared   AIC   BIC deviance logLik  nobs
##       <dbl> <dbl> <dbl>    <dbl>  <dbl> <int>
## 1     0.484 5217. 5239.  150150. -2604.   626
t <- summary(splag_model)

# now use knn to run a spatially-lagged regression model
splag_model_knn4 <- lagsarlm(average_gcse_capped_point_scores_2014 ~
                               unauthorised_absence_in_all_schools_percent_2013 +
                               log(median_house_price_2014), 
                             data = Regressiondata2,
                             Lward.knn_4_weight)

# what do the outputs show?
tidy(splag_model_knn4)
## # A tibble: 4 x 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 rho                                         0.374    0.0409      9.14 0       
## 2 (Intercept)                               116.      20.1         5.76 8.39e- 9
## 3 unauthorised_absence_in_all_schools_per~  -28.5      1.97      -14.5  0       
## 4 log(median_house_price_2014)                9.29     1.48        6.28 3.36e-10
# results are significant

# check that the residuals from the spatially lagged model are now no-longer exhibiting spatial autocorrelation
# write out the residuals
LonWardProfiles <- LonWardProfiles %>%
  mutate(splag_model_knn_resids = residuals(splag_model_knn4))

KNN4Moran <- moran.test(LonWardProfiles$splag_model_knn_resids, 
                        Lward.knn_4_weight) %>% 
  tidy()

KNN4Moran
## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic p.value method             alternative
##       <dbl>     <dbl>     <dbl>     <dbl>   <dbl> <chr>              <chr>      
## 1    0.0468   -0.0016  0.000717      1.81  0.0353 Moran I test unde~ greater
# Moran’s I has dropped close to 0, suggesting little spatial autocorrelation remains in the residuals

Spatial error model

The spatial error model treats spatial autocorrelation primarily as a nuisance: it focuses on estimating the parameters for the independent variables of interest, and essentially disregards the possibility that the observed correlation reflects something meaningful about the data-generating process.
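
In standard notation, the spatial error model takes the form:

y = Xβ + u,   u = λWu + ε

where λ (the lambda term in the output below) measures the strength of the spatial dependence in the error term.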

# Spatial simultaneous autoregressive error model estimation
sem_model <- errorsarlm(average_gcse_capped_point_scores_2014 ~ unauthorised_absence_in_all_schools_percent_2013 + 
                           log(median_house_price_2014), 
                         data = Regressiondata2,
                         Lward.knn_4_weight)

tidy(sem_model)
## # A tibble: 4 x 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                               154.      28.2         5.44 5.28e- 8
## 2 unauthorised_absence_in_all_schools_per~  -32.3      2.22      -14.5  0       
## 3 log(median_house_price_2014)               16.2      2.12        7.62 2.55e-14
## 4 lambda                                      0.475    0.0451     10.5  0

Lagrange Multiplier Test

This test can help us decide between the lag model and the error model. It expects row-standardised spatial weights (style = "W").

Lward.queens_weight_ROW <- LWard_nb %>%
  nb2listw(., style="W")

lm.LMtests(model2, Lward.queens_weight_ROW, 
           test = c("LMerr","LMlag","RLMerr","RLMlag","SARMA"))
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = average_gcse_capped_point_scores_2014 ~
## unauthorised_absence_in_all_schools_percent_2013 +
## I(median_house_price_2014^-1), data = Regressiondata2)
## weights: Lward.queens_weight_ROW
## 
## LMerr = 136.23, df = 1, p-value < 2.2e-16
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = average_gcse_capped_point_scores_2014 ~
## unauthorised_absence_in_all_schools_percent_2013 +
## I(median_house_price_2014^-1), data = Regressiondata2)
## weights: Lward.queens_weight_ROW
## 
## LMlag = 96.234, df = 1, p-value < 2.2e-16
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = average_gcse_capped_point_scores_2014 ~
## unauthorised_absence_in_all_schools_percent_2013 +
## I(median_house_price_2014^-1), data = Regressiondata2)
## weights: Lward.queens_weight_ROW
## 
## RLMerr = 40.454, df = 1, p-value = 2.013e-10
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = average_gcse_capped_point_scores_2014 ~
## unauthorised_absence_in_all_schools_percent_2013 +
## I(median_house_price_2014^-1), data = Regressiondata2)
## weights: Lward.queens_weight_ROW
## 
## RLMlag = 0.45683, df = 1, p-value = 0.4991
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = average_gcse_capped_point_scores_2014 ~
## unauthorised_absence_in_all_schools_percent_2013 +
## I(median_house_price_2014^-1), data = Regressiondata2)
## weights: Lward.queens_weight_ROW
## 
## SARMA = 136.69, df = 2, p-value < 2.2e-16
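
A common way to read these diagnostics: when both LMerr and LMlag are significant, consult the robust versions. Here RLMerr is significant while RLMlag is not, which would point towards the spatial error model.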

Dummy variables

extradata <- read_csv("https://www.dropbox.com/s/qay9q1jwpffxcqj/LondonAdditionalDataFixed.csv?raw=1")
## Rows: 625 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): WardName, WardCode, Wardcode, Candidate, InnerOuter
## dbl (6): PctSharedOwnership2011, PctRentFree2011, x, y, AvgGCSE2011, UnauthA...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# add the extra data too
LonWardProfiles <- LonWardProfiles %>%
  left_join(., 
            extradata, 
            by = c("gss_code" = "Wardcode")) %>%
  clean_names()

# print some of the column names
LonWardProfiles %>%
  names() %>%
  tail(., n=10)
##  [1] "ward_code"                "pct_shared_ownership2011"
##  [3] "pct_rent_free2011"        "candidate"               
##  [5] "inner_outer"              "x"                       
##  [7] "y"                        "avg_gcse2011"            
##  [9] "unauth_absence_schools11" "geometry"
p <- ggplot(LonWardProfiles, 
            aes(x = unauth_absence_schools11, 
                y = average_gcse_capped_point_scores_2014))
p + geom_point(aes(colour = inner_outer)) 

# first, let's make sure R is reading our inner_outer variable as a factor
# see what it is at the moment...
# isitfactor <- LonWardProfiles %>%
#   dplyr::select(inner_outer) %>%
#   summarise_all(class)
# 
# isitfactor

typeof(LonWardProfiles$inner_outer)
## [1] "character"
# change to factor (aka categorical data)
LonWardProfiles <- LonWardProfiles %>%
  mutate(inner_outer = as.factor(inner_outer))

# now run the model
model3 <- lm(average_gcse_capped_point_scores_2014 ~ unauthorised_absence_in_all_schools_percent_2013 + 
               log(median_house_price_2014) + 
               inner_outer, 
             data = LonWardProfiles)

tidy(model3)
## # A tibble: 4 x 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                                  97.6     24.1       4.06 5.62e- 5
## 2 unauthorised_absence_in_all_schools_per~    -30.1      2.03    -14.8  7.02e-43
## 3 log(median_house_price_2014)                 19.8      1.74     11.4  2.09e-27
## 4 inner_outerOuter                             10.9      1.51      7.24 1.30e-12
# check which is our reference group
contrasts(LonWardProfiles$inner_outer)
##       Outer
## Inner     0
## Outer     1
# change reference group if needed
# LonWardProfiles <- LonWardProfiles %>%
#   mutate(inner_outer = relevel(inner_outer, 
#                                ref="Outer"))
# 
# model3 <- lm(average_gcse_capped_point_scores_2014 ~ unauthorised_absence_in_all_schools_percent_2013 + 
#                log(median_house_price_2014) + 
#                inner_outer, 
#              data = LonWardProfiles)
# 
# tidy(model3)

Geographically Weighted Regression

The basic idea behind GWR is to explore how the relationship between a dependent variable (Y) and one or more independent variables (the Xs) might vary geographically.

Spatial nonstationarity refers to variations in the relationship between an outcome variable and a set of predictor variables across space.

Instead of assuming that a single model can be fitted to the entire study region, it looks for geographical differences.

Apply this when you think there is some local variation that is not captured in the global model.

GWR operates by moving a search window from one regression point to the next, working sequentially through all the existing regression points in the dataset.
For a data set of 150 observations GWR will fit 150 weighted regression models.
GWR tutorial
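
Formally, instead of a single global set of coefficients, GWR estimates a separate set of coefficients at each regression point (u_i, v_i):

y_i = β0(u_i, v_i) + Σk βk(u_i, v_i) x_ik + ε_i

so every ward gets its own coefficient estimates, with nearby observations given more weight than distant ones.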

What area should the search window cover each time?

# select some variables from the data file
myvars <- LonWardProfiles %>%
  dplyr::select(average_gcse_capped_point_scores_2014,
                unauthorised_absence_in_all_schools_percent_2013,
                median_house_price_2014,
                rate_of_job_seekers_allowance_jsa_claimants_2015,
                percent_with_level_4_qualifications_and_above_2011,
                inner_outer)

# check their correlations are OK
# pearson correlation is only appropriate for interval or ratio data
# we are only concerned about multicollinearity between predictor variables
Correlation_myvars <- myvars %>%
  st_drop_geometry() %>%
  mutate(transformed_house_price = log(median_house_price_2014)) %>%
  dplyr::select(-inner_outer, 
                -average_gcse_capped_point_scores_2014, 
                -median_house_price_2014) %>%
  rename(., 
         unauthorised_absence = unauthorised_absence_in_all_schools_percent_2013,
         percent_qualified = percent_with_level_4_qualifications_and_above_2011,
         jsa_claimants_rate = rate_of_job_seekers_allowance_jsa_claimants_2015)

# get correlation matrix
cormat <- cor(Correlation_myvars, use="complete.obs", method="pearson")

# significance test
sig1 <- corrplot::cor.mtest(Correlation_myvars, conf.level = .95)

# create a correlogram
corrplot::corrplot(cormat, type="lower",
                   method = "circle", 
                   order = "original", 
                   tl.cex = 0.7,
                   p.mat = sig1$p, sig.level = .05, 
                   col = viridis::viridis(100, option = "plasma"),
                   diag = FALSE)

# looks like percent_qualified and transformed_house_price are quite highly correlated
# the size of the circle reflects the strength of the relationships as captured by the Pearson correlation coefficient 
# crosses indicate statistically insignificant relationships at the 95% level of confidence

# pairs(Correlation_myvars)

final_eq <- average_gcse_capped_point_scores_2014 ~ 
  unauthorised_absence_in_all_schools_percent_2013 + 
  log(median_house_price_2014) +
  inner_outer +
  rate_of_job_seekers_allowance_jsa_claimants_2015 +
  percent_with_level_4_qualifications_and_above_2011

# run a final OLS model
model_final <- lm(final_eq, 
                  data = myvars)

tidy(model_final)
## # A tibble: 6 x 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                               230.      28.5         8.05 4.23e-15
## 2 unauthorised_absence_in_all_schools_per~  -23.6      2.16      -10.9  1.53e-25
## 3 log(median_house_price_2014)                8.41     2.26        3.72 2.18e- 4
## 4 inner_outerOuter                           10.4      1.65        6.30 5.71e-10
## 5 rate_of_job_seekers_allowance_jsa_claim~   -2.81     0.635      -4.43 1.12e- 5
## 6 percent_with_level_4_qualifications_and~    0.413    0.0784      5.27 1.91e- 7
vif(model_final)
##   unauthorised_absence_in_all_schools_percent_2013 
##                                           1.668610 
##                       log(median_house_price_2014) 
##                                           2.977369 
##                                        inner_outer 
##                                           1.991621 
##   rate_of_job_seekers_allowance_jsa_claimants_2015 
##                                           2.067424 
## percent_with_level_4_qualifications_and_above_2011 
##                                           3.154922
# The VIFs are below 10 indicating that multicollinearity is not highly problematic.

LonWardProfiles <- LonWardProfiles %>%
  mutate(model_final_res = residuals(model_final))

par(mfrow=c(2,2))
plot(model_final)

qtm(LonWardProfiles, fill = "model_final_res")
## Variable(s) "model_final_res" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
final_model_Moran <- moran.test(LonWardProfiles$model_final_res,
                                Lward.knn_4_weight) %>%
  tidy()

final_model_Moran
## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic  p.value method            alternative
##       <dbl>     <dbl>     <dbl>     <dbl>    <dbl> <chr>             <chr>      
## 1     0.224   -0.0016  0.000718      8.42 1.91e-17 Moran I test und~ greater
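
The residuals of this final OLS model are still spatially autocorrelated (Moran’s I of roughly 0.22 with a very small p-value), so a single global model is not capturing all of the structure in the data. This motivates fitting a geographically weighted regression.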

Search for optimal kernel bandwidth

coordsW2 <- st_coordinates(coordsW)

LonWardProfiles2 <- cbind(LonWardProfiles, coordsW2)
# find optimal kernel bandwidth
GWR_bandwidth <- gwr.sel(final_eq,
                        data = LonWardProfiles2, 
                        coords = cbind(LonWardProfiles2$X, LonWardProfiles2$Y),
                        # provide an adaptive bandwidth as opposed to a fixed bandwidth
                        adapt = TRUE)
## Adaptive q: 0.381966 CV score: 124832.2 
## Adaptive q: 0.618034 CV score: 126396.1 
## Adaptive q: 0.236068 CV score: 122752.4 
## Adaptive q: 0.145898 CV score: 119960.5 
## Adaptive q: 0.09016994 CV score: 116484.6 
## Adaptive q: 0.05572809 CV score: 112628.7 
## Adaptive q: 0.03444185 CV score: 109427.7 
## Adaptive q: 0.02128624 CV score: 107562.9 
## Adaptive q: 0.01315562 CV score: 108373.2 
## Adaptive q: 0.02161461 CV score: 107576.6 
## Adaptive q: 0.0202037 CV score: 107505.1 
## Adaptive q: 0.01751157 CV score: 107333 
## Adaptive q: 0.01584775 CV score: 107175.5 
## Adaptive q: 0.01481944 CV score: 107564.8 
## Adaptive q: 0.01648327 CV score: 107187.9 
## Adaptive q: 0.01603246 CV score: 107143.9 
## Adaptive q: 0.01614248 CV score: 107153.1 
## Adaptive q: 0.01607315 CV score: 107147.2 
## Adaptive q: 0.01596191 CV score: 107143 
## Adaptive q: 0.01592122 CV score: 107154.4 
## Adaptive q: 0.01596191 CV score: 107143
# adapt = TRUE: find the proportion between 0 and 1 of observations to include in weighting scheme (k-nearest neighbours)
# adapt = FALSE: find global bandwidth

GWR_bandwidth
## [1] 0.01596191
# The optimal bandwidth is about 0.016, meaning 1.6% of the total spatial units should be used for each local regression, based on k-nearest neighbours. This is about 10 of the 626 wards.

Fit GWR model

# fit the gwr model based on adaptive bandwidth
gwr_abw = gwr(final_eq, 
                data = LonWardProfiles2, 
                coords = cbind(LonWardProfiles2$X, LonWardProfiles2$Y), 
                adapt = GWR_bandwidth, 
                # matrix output
                hatmatrix = TRUE, 
                # standard error
                se.fit = TRUE)
## Warning in sqrt(betase): NaNs produced

## Warning in sqrt(betase): NaNs produced
# print the results of the model
gwr_abw
## Call:
## gwr(formula = final_eq, data = LonWardProfiles2, coords = cbind(LonWardProfiles2$X, 
##     LonWardProfiles2$Y), adapt = GWR_bandwidth, hatmatrix = TRUE, 
##     se.fit = TRUE)
## Kernel function: gwr.Gauss 
## Adaptive quantile: 0.01596191 (about 9 of 626 data points)
## Summary of GWR coefficient estimates at data points:
## Warning in print.gwr(x): NAs in coefficients dropped
##                                                          Min.    1st Qu.
## X.Intercept.                                       -345.64311  -22.88658
## unauthorised_absence_in_all_schools_percent_2013    -47.06120  -31.08397
## log.median_house_price_2014.                         -0.55994   11.18979
## inner_outerOuter                                     -3.98708    3.33210
## rate_of_job_seekers_allowance_jsa_claimants_2015      1.43895   10.72734
## percent_with_level_4_qualifications_and_above_2011   -0.06701    0.49946
##                                                        Median    3rd Qu.
## X.Intercept.                                         76.12583  165.54827
## unauthorised_absence_in_all_schools_percent_2013    -14.04901   -5.00033
## log.median_house_price_2014.                         18.00032   22.78750
## inner_outerOuter                                      6.58838   10.44459
## rate_of_job_seekers_allowance_jsa_claimants_2015     16.11748   26.08932
## percent_with_level_4_qualifications_and_above_2011    0.72555    1.07515
##                                                          Max.   Global
## X.Intercept.                                        310.98676 229.5693
## unauthorised_absence_in_all_schools_percent_2013      6.79870 -23.6167
## log.median_house_price_2014.                         44.78874   8.4136
## inner_outerOuter                                     24.36827  10.3690
## rate_of_job_seekers_allowance_jsa_claimants_2015     52.82565  -2.8135
## percent_with_level_4_qualifications_and_above_2011    3.04231   0.4127
## Number of data points: 626 
## Effective number of parameters (residual: 2traceS - traceS'S): 160.9269 
## Effective degrees of freedom (residual: 2traceS - traceS'S): 465.0731 
## Sigma (residual: 2traceS - traceS'S): 12.35905 
## Effective number of parameters (model: traceS): 116.0071 
## Effective degrees of freedom (model: traceS): 509.9929 
## Sigma (model: traceS): 11.80222 
## Sigma (ML): 10.65267 
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 5026.882 
## AIC (GWR p. 96, eq. 4.22): 4854.513 
## Residual sum of squares: 71038.1 
## Quasi-global R2: 0.7557128
results <- as.data.frame(gwr_abw$SDF)
names(results)
##  [1] "sum.w"                                                    
##  [2] "X.Intercept."                                             
##  [3] "unauthorised_absence_in_all_schools_percent_2013"         
##  [4] "log.median_house_price_2014."                             
##  [5] "inner_outerOuter"                                         
##  [6] "rate_of_job_seekers_allowance_jsa_claimants_2015"         
##  [7] "percent_with_level_4_qualifications_and_above_2011"       
##  [8] "X.Intercept._se"                                          
##  [9] "unauthorised_absence_in_all_schools_percent_2013_se"      
## [10] "log.median_house_price_2014._se"                          
## [11] "inner_outerOuter_se"                                      
## [12] "rate_of_job_seekers_allowance_jsa_claimants_2015_se"      
## [13] "percent_with_level_4_qualifications_and_above_2011_se"    
## [14] "gwr.e"                                                    
## [15] "pred"                                                     
## [16] "pred.se"                                                  
## [17] "localR2"                                                  
## [18] "rate_of_job_seekers_allowance_jsa_claimants_2015_EDF"     
## [19] "X.Intercept._se_EDF"                                      
## [20] "unauthorised_absence_in_all_schools_percent_2013_se_EDF"  
## [21] "log.median_house_price_2014._se_EDF"                      
## [22] "inner_outerOuter_se_EDF"                                  
## [23] "rate_of_job_seekers_allowance_jsa_claimants_2015_se_EDF"  
## [24] "percent_with_level_4_qualifications_and_above_2011_se_EDF"
## [25] "pred.se.1"                                                
## [26] "coord.x"                                                  
## [27] "coord.y"
# save localR2 and coefficients to original data frame
LonWardProfiles2$abw_localR2 <- results$localR2

LonWardProfiles2$coef_unauth_abs <- results$unauthorised_absence_in_all_schools_percent_2013

LonWardProfiles2$coef_house_price <- results$log.median_house_price_2014.

LonWardProfiles2$coef_outer <- results$inner_outerOuter
# note the reference category: Inner = 0 (reference), Outer = 1, so this is the coefficient for Outer wards

LonWardProfiles2$coef_jsa <- results$rate_of_job_seekers_allowance_jsa_claimants_2015

LonWardProfiles2$coef_lvl4qual <- results$percent_with_level_4_qualifications_and_above_2011

# map local R2
tmap_mode("plot")
## tmap mode set to plotting
legend_title = expression("Adaptive bandwidth: Local R2")

map_abgwr1 = tm_shape(LonWardProfiles2) +
  tm_fill(col = "abw_localR2", title = legend_title, 
          palette = "magma", style = "cont") +
  tm_borders(col = "white", lwd = .1) + 
  tm_compass(type = "arrow", position = c("right", "top") , size = 5) + 
  tm_scale_bar(breaks = c(0,1,2), text.size = 0.7, position =  c("center", "bottom")) +
  tm_layout(bg.color = "white")

map_abgwr1

# simple way to plot coefficients
# tm_shape(LonWardProfiles2) +
#   tm_polygons(col = "coef_unauth_abs", 
#               palette = "RdBu", 
#               alpha = 0.5)

# nicer maps
# use tmaptools::palette_explorer() to choose palette

# unauthorised_absence_in_all_schools_percent_2013
legend_title = expression("Unauthorised absence")

map_abgwr2 = tm_shape(LonWardProfiles2) +
  tm_fill(col = "coef_unauth_abs", title = legend_title, 
          palette = "RdBu", style = "cont") + 
  tm_borders(col = "white", lwd = .1)  +
  tm_compass(type = "arrow", position = c("right", "top") , size = 5) +
  tm_scale_bar(breaks = c(0,1,2), text.size = 0.7, position =  c("center", "bottom")) + 
  tm_layout(bg.color = "white")

# log(median_house_price)
legend_title = expression("Log of median house price")

map_abgwr3 = tm_shape(LonWardProfiles2) +
  tm_fill(col = "coef_house_price", title = legend_title, 
          palette = "RdBu", style = "cont") + 
  tm_borders(col = "white", lwd = .1)  + 
  tm_scale_bar(breaks = c(0,1,2), text.size = 0.7, position =  c("center", "bottom")) + 
  tm_layout(bg.color = "white") 

tmap_arrange(map_abgwr2, map_abgwr3)
## Variable(s) "coef_unauth_abs" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Variable(s) "coef_house_price" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Variable(s) "coef_unauth_abs" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Variable(s) "coef_house_price" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

Assessing statistical significance

Roughly, if the t value of a coefficient (its estimate divided by its standard error) has an absolute value greater than 1.96 and the sample is sufficiently large, then the coefficient is statistically significant at the 95% confidence level.

# compute t statistic
LonWardProfiles2$t_unauth_abs = results$unauthorised_absence_in_all_schools_percent_2013 / results$unauthorised_absence_in_all_schools_percent_2013_se

# categorise t values
LonWardProfiles2$t_unauth_abs_cat <- cut(LonWardProfiles2$t_unauth_abs,
                     breaks = c(min(LonWardProfiles2$t_unauth_abs), 
                              -1.96, 1.96, 
                              max(LonWardProfiles2$t_unauth_abs)),
                     labels = c("sig","nonsig", "sig"),
                     # include the minimum t value rather than dropping it as NA
                     include.lowest = TRUE)

# map statistically significant coefs for unauthorised absence
legend_title = expression("Unauthorised absence: significant")

map_sig = tm_shape(LonWardProfiles2) + 
  tm_fill(col = "t_unauth_abs_cat", title = legend_title, 
          legend.hist = TRUE, midpoint = NA, 
          textNA = "", colorNA = "white") +  
  tm_borders(col = "white", lwd = .1)  + 
  tm_compass(type = "arrow", position = c("right", "top") , size = 5) + 
  tm_scale_bar(breaks = c(0,1,2), text.size = 0.7, position =  c("center", "bottom")) + 
  tm_layout(bg.color = "white", legend.outside = TRUE)

map_sig

# repeat for other variables

# run the significance test
# sigTest = abs(results$log.median_house_price_2014.) - 
#   2 * results$log.median_house_price_2014._se
# 
# # store significance results
# LonWardProfiles2 <- LonWardProfiles2 %>%
#   mutate(GWRHousePriceSig = sigTest)
# 
# tm_shape(LonWardProfiles2) +
#   tm_polygons(col = "GWRHousePriceSig",
#               palette = "RdYlBu")