GISProject

R Markdown

get the obesity data from the data folder

rawData <- read_csv("data/EnglandObesity.csv", 
                               na = c("", "NA", "n/a"),
                               col_names = TRUE, 
                               locale = locale(encoding = 'Latin1'))

## Parsed with column specification:
## cols(
##   `UTLA code` = col_character(),
##   `Local Authority` = col_character(),
##   EMPLOY = col_double(),
##   UNEMPLOY = col_double(),
##   ECO_INACT = col_double(),
##   SMOK = col_double(),
##   ALCOHOL = col_double(),
##   PHY_ACT = col_double(),
##   DAY5 = col_double(),
##   OBESE = col_double()
## )

# rawData <- rawData %>%clean_names()

draw the pair plots to have a look at the distribution and roughly relationships of the data

ggpairs(select(rawData,c(3:10)))

draw the box plot to see the range, average and other information of the data

par(mfrow=c(1,2))
boxplot(select(rawData,c(3,7:9)))
boxplot(select(rawData,c(4:6,10)))

For the skewed-distribution attribute, use Tukey’s ladder to see how to transform them to be more like normal distributions

symbox(~ALCOHOL, 
       rawData, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

symbox(~EMPLOY, 
       rawData, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

symbox(~UNEMPLOY, 
       rawData, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

symbox(~ECO_INACT, 
       rawData, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

Take ALCOHOL for example to check the distribution of the transformed data by histogram

ggplot(rawData, aes(x=log(ALCOHOL))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

use the transformed data to build the linear model. Get the information of the model coefficients and the model itself

tukeyData <- rawData %>%
  mutate(ALCOHOL = log(ALCOHOL),
         OBESE = OBESE,
         EMPLOY = EMPLOY^2,
         UNEMPLOY = log(UNEMPLOY),
         DAY5 = DAY5,
         SMOK = SMOK,
         PHY_ACT = PHY_ACT,
         ECO_INACT = log(ECO_INACT))
OLSModel <- lm(OBESE ~ SMOK + 
                 ALCOHOL + 
                 PHY_ACT +
                 EMPLOY +
                 UNEMPLOY +
                 ECO_INACT +
                 DAY5, 
               data = tukeyData)
tidy(OLSModel)

## # A tibble: 8 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept) 21.8      73.6        0.296  7.68e- 1
## 2 SMOK         0.178     0.107      1.67   9.72e- 2
## 3 ALCOHOL      3.32      1.51       2.20   2.96e- 2
## 4 PHY_ACT     -0.513     0.0687    -7.46   7.97e-12
## 5 EMPLOY       0.00180   0.00455    0.396  6.93e- 1
## 6 UNEMPLOY     0.125     2.87       0.0435 9.65e- 1
## 7 ECO_INACT    1.43     14.3        0.0999 9.21e- 1
## 8 DAY5         0.00773   0.0618     0.125  9.01e- 1

glance(OLSModel)

## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.504         0.480  2.94      20.7 5.70e-19     7  -370.  759.  786.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

#summary(OLSModel)

Use VIF and correlation matrix to see whether there is multicolinearity among the attributes. Firstly, remove the attribute whose VIF is the highest and larger than 10.

# Variance Inflation Factor 
vif(OLSModel)

##       SMOK    ALCOHOL    PHY_ACT     EMPLOY   UNEMPLOY  ECO_INACT       DAY5 
##   1.939512   1.783955   2.224730 182.836535  19.575199 113.816333   2.058593

OLSModel <- lm(OBESE ~ SMOK + 
                 ALCOHOL + 
                 PHY_ACT +
                 # EMPLOY +
                 UNEMPLOY +
                 ECO_INACT +
                 DAY5, 
               data = tukeyData)
# residual error
s2 = sum(summary(OLSModel)$residuals^2)/143
# correlation matrix
Correlation_all<- rawData  %>%
  dplyr::select(c(3:10))%>%
  correlate()

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

rplot(Correlation_all)

## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

Correlation_all

## # A tibble: 8 x 9
##   term      EMPLOY UNEMPLOY ECO_INACT   SMOK ALCOHOL PHY_ACT   DAY5  OBESE
##   <chr>      <dbl>    <dbl>     <dbl>  <dbl>   <dbl>   <dbl>  <dbl>  <dbl>
## 1 EMPLOY    NA       -0.791    -0.967 -0.555  -0.460   0.584  0.612 -0.286
## 2 UNEMPLOY  -0.791   NA         0.611  0.544   0.421  -0.581 -0.586  0.337
## 3 ECO_INACT -0.967    0.611    NA      0.493   0.422  -0.515 -0.553  0.233
## 4 SMOK      -0.555    0.544     0.493 NA       0.576  -0.485 -0.498  0.411
## 5 ALCOHOL   -0.460    0.421     0.422  0.576  NA      -0.551 -0.351  0.486
## 6 PHY_ACT    0.584   -0.581    -0.515 -0.485  -0.551  NA      0.623 -0.666
## 7 DAY5       0.612   -0.586    -0.553 -0.498  -0.351   0.623 NA     -0.374
## 8 OBESE     -0.286    0.337     0.233  0.411   0.486  -0.666 -0.374 NA

Repeat the step of deleting the highest VIF until all the value is within 10. Then calculate the AIC and BIC of the model without the removed attributes, and search for the most suitable model

OLSModel <- lm(OBESE ~ SMOK + 
                 ALCOHOL + 
                 PHY_ACT +
                 # EMPLOY +
                 UNEMPLOY +
                 ECO_INACT +
                 DAY5, 
               data = tukeyData)
tidy(OLSModel)

## # A tibble: 7 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept) 50.7        9.78       5.18  7.38e- 7
## 2 SMOK         0.180      0.107      1.69  9.36e- 2
## 3 ALCOHOL      3.31       1.51       2.20  2.93e- 2
## 4 PHY_ACT     -0.514      0.0684    -7.52  5.66e-12
## 5 UNEMPLOY    -0.947      0.937     -1.01  3.14e- 1
## 6 ECO_INACT   -4.20       1.81      -2.32  2.19e- 2
## 7 DAY5         0.00931    0.0615     0.151 8.80e- 1

glance(OLSModel)

## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.504         0.483  2.93      24.2 1.19e-19     6  -371.  757.  781.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

vif(OLSModel)

##      SMOK   ALCOHOL   PHY_ACT  UNEMPLOY ECO_INACT      DAY5 
##  1.937520  1.783879  2.217240  2.100975  1.826710  2.050030

# BIC
step(OLSModel, direction = "both", k=log(nrow(tukeyData)))

## Start:  AIC=350.55
## OBESE ~ SMOK + ALCOHOL + PHY_ACT + UNEMPLOY + ECO_INACT + DAY5
## 
##             Df Sum of Sq    RSS    AIC
## - DAY5       1      0.20 1229.0 345.56
## - UNEMPLOY   1      8.79 1237.6 346.60
## - SMOK       1     24.49 1253.3 348.49
## - ALCOHOL    1     41.66 1270.4 350.54
## <none>                   1228.8 350.55
## - ECO_INACT  1     46.12 1274.9 351.06
## - PHY_ACT    1    485.37 1714.2 395.47
## 
## Step:  AIC=345.56
## OBESE ~ SMOK + ALCOHOL + PHY_ACT + UNEMPLOY + ECO_INACT
## 
##             Df Sum of Sq    RSS    AIC
## - UNEMPLOY   1      9.78 1238.8 341.74
## - SMOK       1     24.51 1253.5 343.51
## <none>                   1229.0 345.56
## - ALCOHOL    1     43.57 1272.5 345.77
## - ECO_INACT  1     49.18 1278.2 346.43
## + DAY5       1      0.20 1228.8 350.55
## - PHY_ACT    1    559.11 1788.1 396.79
## 
## Step:  AIC=341.74
## OBESE ~ SMOK + ALCOHOL + PHY_ACT + ECO_INACT
## 
##             Df Sum of Sq    RSS    AIC
## - SMOK       1     17.68 1256.4 338.85
## <none>                   1238.8 341.74
## - ALCOHOL    1     46.51 1285.3 342.26
## - ECO_INACT  1     74.18 1312.9 345.45
## + UNEMPLOY   1      9.78 1229.0 345.56
## + DAY5       1      1.18 1237.6 346.60
## - PHY_ACT    1    568.47 1807.2 393.38
## 
## Step:  AIC=338.85
## OBESE ~ ALCOHOL + PHY_ACT + ECO_INACT
## 
##             Df Sum of Sq    RSS    AIC
## <none>                   1256.4 338.85
## - ECO_INACT  1     60.31 1316.7 340.87
## + SMOK       1     17.68 1238.8 341.74
## - ALCOHOL    1     82.76 1339.2 343.41
## + UNEMPLOY   1      2.95 1253.5 343.51
## + DAY5       1      0.00 1256.4 343.86
## - PHY_ACT    1    608.54 1865.0 393.09

## 
## Call:
## lm(formula = OBESE ~ ALCOHOL + PHY_ACT + ECO_INACT, data = tukeyData)
## 
## Coefficients:
## (Intercept)      ALCOHOL      PHY_ACT    ECO_INACT  
##     48.2297       4.2609      -0.5015      -4.2547

# AIC
step(OLSModel, direction = "both")

## Start:  AIC=329.47
## OBESE ~ SMOK + ALCOHOL + PHY_ACT + UNEMPLOY + ECO_INACT + DAY5
## 
##             Df Sum of Sq    RSS    AIC
## - DAY5       1      0.20 1229.0 327.50
## - UNEMPLOY   1      8.79 1237.6 328.54
## <none>                   1228.8 329.47
## - SMOK       1     24.49 1253.3 330.43
## - ALCOHOL    1     41.66 1270.4 332.47
## - ECO_INACT  1     46.12 1274.9 333.00
## - PHY_ACT    1    485.37 1714.2 377.41
## 
## Step:  AIC=327.5
## OBESE ~ SMOK + ALCOHOL + PHY_ACT + UNEMPLOY + ECO_INACT
## 
##             Df Sum of Sq    RSS    AIC
## - UNEMPLOY   1      9.78 1238.8 326.68
## <none>                   1229.0 327.50
## - SMOK       1     24.51 1253.5 328.46
## + DAY5       1      0.20 1228.8 329.47
## - ALCOHOL    1     43.57 1272.5 330.72
## - ECO_INACT  1     49.18 1278.2 331.38
## - PHY_ACT    1    559.11 1788.1 381.74
## 
## Step:  AIC=326.68
## OBESE ~ SMOK + ALCOHOL + PHY_ACT + ECO_INACT
## 
##             Df Sum of Sq    RSS    AIC
## <none>                   1238.8 326.68
## - SMOK       1     17.68 1256.4 326.81
## + UNEMPLOY   1      9.78 1229.0 327.50
## + DAY5       1      1.18 1237.6 328.54
## - ALCOHOL    1     46.51 1285.3 330.21
## - ECO_INACT  1     74.18 1312.9 333.41
## - PHY_ACT    1    568.47 1807.2 381.34

## 
## Call:
## lm(formula = OBESE ~ SMOK + ALCOHOL + PHY_ACT + ECO_INACT, data = tukeyData)
## 
## Coefficients:
## (Intercept)         SMOK      ALCOHOL      PHY_ACT    ECO_INACT  
##     50.4179       0.1433       3.4528      -0.4894      -4.9004

Consider the result using Mallow’s Cp statistic and each regressor’s p-value in two model to find the final model

OLSModel <- lm(OBESE ~ 
                 SMOK +
                 ALCOHOL + 
                 PHY_ACT +
                 # EMPLOY +
                 # UNEMPLOY +
                 ECO_INACT,
                 # DAY5, 
               data = tukeyData)
summary(OLSModel)

## 
## Call:
## lm(formula = OBESE ~ SMOK + ALCOHOL + PHY_ACT + ECO_INACT, data = tukeyData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8082 -1.7321  0.3107  2.1128  6.1905 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  50.4179     9.4169   5.354 3.29e-07 ***
## SMOK          0.1433     0.0996   1.438  0.15245    
## ALCOHOL       3.4528     1.4798   2.333  0.02100 *  
## PHY_ACT      -0.4894     0.0600  -8.157 1.49e-13 ***
## ECO_INACT    -4.9004     1.6631  -2.947  0.00374 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.923 on 145 degrees of freedom
## Multiple R-squared:  0.4999, Adjusted R-squared:  0.4861 
## F-statistic: 36.23 on 4 and 145 DF,  p-value: < 2.2e-16

SSE1 = sum(summary(OLSModel)$residuals^2)
cp1 = SSE1/s2 - (150-2*5)
print(paste("cp1:", cp1))

## [1] "cp1: 4.1605096775167"

OLSModel <- lm(OBESE ~ 
                 # SMOK + 
                 ALCOHOL + 
                 PHY_ACT +
                 # EMPLOY +
                 # UNEMPLOY +
                 ECO_INACT,
                 # DAY5, 
               data = tukeyData)
summary(OLSModel)

## 
## Call:
## lm(formula = OBESE ~ ALCOHOL + PHY_ACT + ECO_INACT, data = tukeyData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.317 -1.764  0.160  1.843  6.141 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 48.22972    9.32717   5.171 7.55e-07 ***
## ALCOHOL      4.26090    1.37399   3.101  0.00231 ** 
## PHY_ACT     -0.50145    0.05963  -8.409 3.42e-14 ***
## ECO_INACT   -4.25475    1.60720  -2.647  0.00900 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.934 on 146 degrees of freedom
## Multiple R-squared:  0.4927, Adjusted R-squared:  0.4823 
## F-statistic: 47.28 on 3 and 146 DF,  p-value: < 2.2e-16

SSE2 = sum(summary(OLSModel)$residuals^2)
cp2 = SSE2/s2 - (150-2*4)
print(paste("cp2:", cp2))

## [1] "cp2: 4.217752987106"

# choose the model that its Cp is closer to p than other models. As there is no lack of fit when the value of cp is approximately p.

check whether the residuals in the model is normally distributed

# add residuals to the data
modelData <- tukeyData %>%
  select(c(1,5,7:8,10)) %>%
  mutate(OLSModelresids = residuals(OLSModel))

model_data <- OLSModel %>% augment(., modelData)
#plot residuals
model_data%>%
dplyr::select(.resid)%>%
  pull()%>%
  qplot()+ 
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

check for homo/heteroscedasticity

par(mfrow=c(2,2)) 
plot(OLSModel)

Read the map data and merge it with the data we get above. plot the obesity rate and other attributes on the map

EnglandWales <- st_read(here::here("data","Counties_and_Unitary_Authorities__December_2016__Boundaries-shp","Counties_and_Unitary_Authorities__December_2016__Boundaries.shp"))

## Reading layer `Counties_and_Unitary_Authorities__December_2016__Boundaries' from data source `/cloud/project/data/Counties_and_Unitary_Authorities__December_2016__Boundaries-shp/Counties_and_Unitary_Authorities__December_2016__Boundaries.shp' using driver `ESRI Shapefile'
## Simple feature collection with 174 features and 10 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 83757.91 ymin: 5556.599 xmax: 655510.8 ymax: 657536.3
## proj4string:    +proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +datum=OSGB36 +units=m +no_defs

England <- EnglandWales %>%
  dplyr::filter(str_detect(ctyua16cd, "^E"))%>%
  st_transform(., 27700) %>%
  select(c(ctyua16cd,geometry))

geometryData <- England%>%
  left_join(.,
            modelData, 
            by = c("ctyua16cd" = "UTLA code")) %>%
  na.omit()


# qtm(geometryData)
# drqw the distribution of obese on the map
tmap_mode("plot")

## tmap mode set to plotting

tm1 <- tm_shape(geometryData) + 
  tm_polygons("OBESE",palette="RdBu",breaks=c(10,22,25,28,35))+
  tm_legend(show=FALSE)+
  tm_layout(frame=FALSE,bg.color = "transparent")
  # tm_credits("(a)", position=c(0.1,0.85), size=1.5)
legend <- tm_shape(geometryData) + 
  tm_polygons("OBESE",palette="RdBu",breaks=c(10,22,25,28,35))+
  tm_scale_bar(position=c(0,0.1), text.size=0.6)+
  tm_compass(north=0, position=c(0,0.8))+
  tm_layout(legend.only = TRUE, legend.position=c(0,0.4),asp=0.1)
t=tmap_arrange(tm1,legend)
t

# draw the distribution of other attributes on the map
tmap_mode("plot")

## tmap mode set to plotting

tm1 <- tm_shape(geometryData) + 
  tm_polygons("ALCOHOL",palette="RdBu", breaks=c(3.6, 3.9, 4.1, 4.3, 5.0))+
  # tm_legend()+
  tm_layout(frame=FALSE,bg.color = "transparent",legend.position=c(0.1,0.4)) +
  tm_credits("(a)", position=c(0.65,0.85), size=1.5)
tm2 <- tm_shape(geometryData) + 
  tm_polygons("PHY_ACT",palette="RdBu",breaks=c(40,55,58,61,70))+
  # tm_legend(show=FALSE)+
  tm_layout(frame=FALSE,bg.color = "transparent",legend.position=c(0.1,0.45)) +
  tm_credits("(b)", position=c(0.65,0.85), size=1.5)
tm3 <- tm_shape(geometryData) + 
  tm_polygons("ECO_INACT",palette="RdBu",breaks=c(2.6,2.9,3.0,3.1,3.6))+
  # tm_legend(show=FALSE)+
  tm_layout(frame=FALSE,bg.color = "transparent",legend.position=c(0.1,0.5)) +
  tm_credits("(c)", position=c(0.65,0.85), size=1.5)
legend <- tm_shape(geometryData) + 
  tm_polygons("ALCOHOL",palette="RdBu")+
  tm_scale_bar(position=c(0.3,0.2), text.size=3)+
  tm_compass(north=0, position=c(0.4,0.5),size = 2)+
  tm_layout(legend.only = TRUE, legend.position=c(3,0.4),asp=0.1)
t=tmap_arrange(tm1,tm2,tm3,legend, nrow=2)
t

Calculate the Moran’s I statistic for both Queen’s case neighbours and k-nearest neighbours of 4

#  spatial autocorrelation Moran’s I
#calculate the centroids
coordsW <- geometryData%>%
  st_centroid()%>%
  st_geometry()

## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x

# plot(coordsW)

#generate a spatial weights matrix 
#queen's case neighbours
LWard_nb <- geometryData %>%
  poly2nb(., queen=T)
#or nearest neighbours
knn_wards <-coordsW %>%
  knearneigh(., k=4)
LWard_knn <- knn_wards %>%
  knn2nb()
# plot(LWard_nb, st_geometry(coordsW), col="red")
# plot(LWard_knn, st_geometry(coordsW), col="blue")
# plot(geometryData)

#create a spatial weights matrix object from these weights
Lward.queens_weight <- LWard_nb %>%
  nb2listw(., style="C",zero.policy = T)
Lward.knn_4_weight <- LWard_knn %>%
  nb2listw(., style="C")
Queen <- geometryData %>%
  st_drop_geometry()%>%
  dplyr::select(OLSModelresids)%>%
  pull()%>%
  moran.test(., Lward.queens_weight,zero.policy = T)%>%
  tidy()
Queen

## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic  p.value method            alternative
##       <dbl>     <dbl>     <dbl>     <dbl>    <dbl> <chr>             <chr>      
## 1     0.415  -0.00676   0.00269      8.12 2.31e-16 Moran I test und… greater

Nearest_neighbour <- geometryData %>%
  st_drop_geometry()%>%
  dplyr::select(OLSModelresids)%>%
  pull()%>%
  moran.test(., Lward.knn_4_weight)%>%
  tidy()
Nearest_neighbour

## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic  p.value method            alternative
##       <dbl>     <dbl>     <dbl>     <dbl>    <dbl> <chr>             <chr>      
## 1     0.474  -0.00671   0.00278      9.11 4.05e-20 Moran I test und… greater

#Remembering that Moran’s I ranges from between -1 and +1 (0 indicating no spatial autocorrelation) we can conclude that there is some moderate spatial autocorrelation in our residuals.

construct the GWR model use the attributes we select by MLR

st_crs(geometryData) = 27700
geometryDataSP <- geometryData %>%
  as(., "Spatial")
st_crs(coordsW) = 27700
coordsWSP <- coordsW %>%
  as(., "Spatial")
#calculate kernel bandwidth
GWRbandwidth <- gwr.sel(OBESE ~ ALCOHOL +
                 # SMOK +
                 PHY_ACT +
                 # EMPLOY +
                 # UNEMPLOY +
                 ECO_INACT, 
                 # DAY5,
                        data = geometryDataSP, 
                        coords=coordsWSP,
                        adapt=T,
                        gweight = gwr.bisquare, 
                        verbose = FALSE,
                        method = "AIC")

## Warning in gwr.sel(OBESE ~ ALCOHOL + PHY_ACT + ECO_INACT, data =
## geometryDataSP, : data is Spatial* object, ignoring coords argument

gwr.model = gwr(OBESE ~ ALCOHOL +  
                 # SMOK +
                 PHY_ACT +
                 # EMPLOY +
                 # UNEMPLOY +
                 ECO_INACT, 
                 # DAY5,
                    data=geometryDataSP, 
                    coords=coordsWSP, 
                    adapt = GWRbandwidth,
                    gweight = gwr.bisquare,
                    hatmatrix=TRUE, 
                    se.fit=TRUE)

## Warning in gwr(OBESE ~ ALCOHOL + PHY_ACT + ECO_INACT, data = geometryDataSP, :
## data is Spatial* object, ignoring coords argument

gwr.model

## Call:
## gwr(formula = OBESE ~ ALCOHOL + PHY_ACT + ECO_INACT, data = geometryDataSP, 
##     coords = coordsWSP, gweight = gwr.bisquare, adapt = GWRbandwidth, 
##     hatmatrix = TRUE, se.fit = TRUE)
## Kernel function: gwr.bisquare 
## Adaptive quantile: 0.5031398 (about 75 of 150 data points)
## Summary of GWR coefficient estimates at data points:
##                   Min.   1st Qu.    Median   3rd Qu.      Max.  Global
## X.Intercept. 29.900495 57.043361 62.441404 86.995783 89.205814 48.2297
## ALCOHOL      -2.931730 -1.998932 -0.441513  1.291099  8.216442  4.2609
## PHY_ACT      -0.633899 -0.592649 -0.469005 -0.431624 -0.357972 -0.5015
## ECO_INACT    -8.963359 -7.654702 -4.273400 -3.178335  0.070912 -4.2547
## Number of data points: 150 
## Effective number of parameters (residual: 2traceS - traceS'S): 20.9273 
## Effective degrees of freedom (residual: 2traceS - traceS'S): 129.0727 
## Sigma (residual: 2traceS - traceS'S): 2.622095 
## Effective number of parameters (model: traceS): 16.30293 
## Effective degrees of freedom (model: traceS): 133.6971 
## Sigma (model: traceS): 2.576349 
## Sigma (ML): 2.432316 
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 731.75 
## AIC (GWR p. 96, eq. 4.22): 708.6376 
## Residual sum of squares: 887.4241 
## Quasi-global R2: 0.641725

gwrResults <- as.data.frame(gwr.model$SDF)
names(gwrResults)

##  [1] "sum.w"               "X.Intercept."        "ALCOHOL"            
##  [4] "PHY_ACT"             "ECO_INACT"           "X.Intercept._se"    
##  [7] "ALCOHOL_se"          "PHY_ACT_se"          "ECO_INACT_se"       
## [10] "gwr.e"               "pred"                "pred.se"            
## [13] "localR2"             "X.Intercept._se_EDF" "ALCOHOL_se_EDF"     
## [16] "PHY_ACT_se_EDF"      "ECO_INACT_se_EDF"    "pred.se.1"

show the distribution of the coefficients on the map

#attach coefficients to original SF
geometryDataSP2 <- geometryData %>%
  mutate(coef_ALCOHOL = gwrResults$ALCOHOL,
         localR2 = gwrResults$localR2,
         coef_ECO_INACT = gwrResults$ECO_INACT,
         coef_PHY_ACT = gwrResults$PHY_ACT
         )
         # coef_physically_active_adults = gwrResults$PHY_ACT,
         # coef_alcohol_related_admissions = gwrResults$ALCOHOL,
         # coef_ilo_economically_inactive = gwrResults$ECO_INACT)

# the coefficients of three attributes
tmap_mode("plot")

## tmap mode set to plotting

tm1 <- tm_shape(geometryDataSP2) + 
  tm_polygons("coef_ALCOHOL",palette="RdBu",midpoint = 0)+
  # tm_legend()+
  tm_layout(frame=FALSE,bg.color = "transparent",legend.position=c(0.1,0.52)) +
  tm_credits("(a)", position=c(0.65,0.85), size=1.5)
tm2 <- tm_shape(geometryDataSP2) + 
  tm_polygons("coef_PHY_ACT",palette="RdBu",midpoint = 0)+
  # tm_legend(show=FALSE)+
  tm_layout(frame=FALSE,bg.color = "transparent",legend.position=c(0.1,0.45)) +
  tm_credits("(b)", position=c(0.65,0.85), size=1.5)
tm3 <- tm_shape(geometryDataSP2) + 
  tm_polygons("coef_ECO_INACT",palette="RdBu",midpoint = 0)+
  # tm_legend(show=FALSE)+
  tm_layout(frame=FALSE,bg.color = "transparent",legend.position=c(0.1,0.32)) +
  tm_credits("(c)", position=c(0.65,0.85), size=1.5)
legend <- tm_shape(geometryDataSP2) + 
  tm_polygons("coef_ECO_INACT",palette="RdBu",midpoint = 0)+
  tm_scale_bar(position=c(0.3,0.2), text.size=3)+
  tm_compass(north=0, position=c(0.4,0.5),size = 2)+
  tm_layout(legend.only = TRUE, legend.position=c(3,0.4),asp=0.1)
t=tmap_arrange(tm1,tm2,tm3,legend, nrow=2)
t

# tm_shape(geometryDataSP2) +
#   tm_polygons(col = "localR2", 
#               palette = "RdBu", 
#               alpha = 0.5,
#               midpoint = NA)

to see whether the coefficient estimate is more than 2 standard errors away from zero

#run the significance test
PHY_ACTSigTest = abs(gwrResults$PHY_ACT)-2 * gwrResults$PHY_ACT_se
# smokingSigTest1 = 2*gwrResults$UNEMPLOY / gwrResults$EMPLOY_se
#store significance results
ALCOHOLSigTest = abs(gwrResults$ALCOHOL)-2 * gwrResults$ALCOHOL_se
ECO_INACTSigTest = abs(gwrResults$ECO_INACT)-2 * gwrResults$ECO_INACT_se
geometryDataSP2 <- geometryDataSP2 %>%
  mutate(PHY_ACTSigTest = PHY_ACTSigTest,
         ALCOHOLSigTest = ALCOHOLSigTest,
         ECO_INACTSigTest = ECO_INACTSigTest)

# the value bigger than zero is statistically significiant
tmap_mode("plot")

## tmap mode set to plotting

tm1 <- tm_shape(geometryDataSP2) + 
  tm_polygons("ALCOHOLSigTest",palette="RdBu",midpoint = 0)+
  # tm_legend()+
  tm_layout(frame=FALSE,bg.color = "transparent",legend.position=c(0.1,0.4)) +
  tm_credits("(a)", position=c(0.65,0.85), size=1.5)
tm2 <- tm_shape(geometryDataSP2) + 
  tm_polygons("PHY_ACTSigTest",palette="RdBu",midpoint = 0)+
  # tm_legend(show=FALSE)+
  tm_layout(frame=FALSE,bg.color = "transparent",legend.position=c(0.1,0.45)) +
  tm_credits("(b)", position=c(0.65,0.85), size=1.5)
tm3 <- tm_shape(geometryDataSP2) + 
  tm_polygons("ECO_INACTSigTest",palette="RdBu",midpoint = 0)+
  # tm_legend(show=FALSE)+
  tm_layout(frame=FALSE,bg.color = "transparent",legend.position=c(0.1,0.5)) +
  tm_credits("(c)", position=c(0.65,0.85), size=1.5)
legend <- tm_shape(geometryDataSP2) + 
  tm_polygons("ECO_INACTSigTest",palette="RdBu",midpoint = 0)+
  tm_scale_bar(position=c(0.3,0.2), text.size=3)+
  tm_compass(north=0, position=c(0.4,0.5),size = 2)+
  tm_layout(legend.only = TRUE, legend.position=c(3,0.4),asp=0.1)
t=tmap_arrange(tm1,tm2,tm3,legend, nrow=2)
t

have a look at the local R2 of the model

tmap_mode("plot")

## tmap mode set to plotting

tm1 <- tm_shape(geometryDataSP2) + 
  tm_polygons("localR2",palette="RdBu")+
  tm_legend(show=FALSE)+
  tm_layout(frame=FALSE,bg.color = "transparent")
  # tm_credits("(a)", position=c(0.1,0.85), size=1.5)
legend <- tm_shape(geometryDataSP2) + 
  tm_polygons("localR2",palette="RdBu")+
  tm_scale_bar(position=c(0,0.1), text.size=0.6)+
  tm_compass(north=0, position=c(0,0.8))+
  tm_layout(legend.only = TRUE, legend.position=c(0,0.4),asp=0.1)
t=tmap_arrange(tm1,legend)
t

Some codes refer from : https://andrewmaclachlan.github.io/CASA0005repo/gwr-and-spatially-lagged-regression.html

GISProject

Xiaohan Feng

R Markdown