1. Prepocess

library(broom)
library(corrplot)
library(grid)
library(gridExtra)
library(htmlwidgets)
library(leaflet)
library(moments) 
library(psych)
library(plyr)
library(rgdal)
library(rgeos)
library(spdep)
library(sf)
library(sp)
library(spgwr)
library(tmap)
library(tmaptools)
library(tidyverse)
library(car)
# Change the presentation of decimal numbers to 4 and avoid scientific notation
options(prompt="R> ", digits=4, scipen=999)

Download the data source we uploaded online earlier.

download.file("https://github.com/lizhiyuan913/GIS-assessment/archive/main.zip", destfile="source.zip")
unzip("source.zip")

Then read in the data, and have an overview.

Automobile.data <- read.csv("./GIS-assessment-main/data.csv", sep=',')
glimpse(Automobile.data)
## Rows: 73
## Columns: 12
## $ 市县名称                                                             <chr> "杭...
## $ city.name                                                            <chr> ...
## $ car.ownership.per.100.people.unit.1000people.                        <dbl> ...
## $ population.10000people.                                              <dbl> ...
## $ per.capita.GDP.yuan.                                                 <int> ...
## $ household.price.yuan...                                              <int> ...
## $ population.density.person.k..                                        <dbl> ...
## $ urbanization...                                                      <dbl> ...
## $ Per.capita.milage.of.roadway.network                                 <dbl> ...
## $ Ratio.of.bus.passengers.to.population.1000.person.times.1000.person. <dbl> ...
## $ whether.downtown.of.a.prefecture.level.city.dummy.variable.          <int> ...
## $ whether.municipal.district.dummy.variable.                           <int> ...

Chinese names of cities here are used for joining datas.

variable.name <- c("car.ownership", "population", "per.GDP", "household.price", "population.density", "urbanization", "milage.roadway", "ratio.bus", "prefectrue", "municipal.district")
names(Automobile.data)[3:12] <- variable.name

Read in the map.

Output.Areas <- readOGR("./GIS-assessment-main", "district", use_iconv = TRUE, encoding = "UTF-8")
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\VIAN33\Documents\WeChat Files\wxid_9cp3t5anw7pd12\FileStorage\File\2021-01\GIS-assessment-main", layer: "district"
## with 73 features
## It has 4 fields

Name the columns.

names(Output.Areas)[1:2] <- c("市级","县级")

Merge the two datasets.

OA.Automobile <- merge(Output.Areas, Automobile.data, by.x="县级", by.y="市县名称", all=FALSE)
head(OA.Automobile, 5)
## class       : SpatialPolygonsDataFrame 
## features    : 5 
## extent      : 118.3, 120.4, 29.19, 30.45  (xmin, xmax, ymin, ymax)
## crs         : +proj=longlat +datum=WGS84 +no_defs 
## variables   : 15
## names       :   县级,   市级,    Shape_Leng,      Shape_Area, city.name, car.ownership, population, per.GDP, household.price, population.density, urbanization, milage.roadway,   ratio.bus, prefectrue, municipal.district 
## min values  : 淳安县, 杭州市, 2.02686291242, 0.0650807384866,    Chunan,   16.59413408,       35.8,   68407,           14077,        81.05048676,           50,     1.15134189,  3.20754717,          0,                  0 
## max values  : 临安区, 杭州市, 3.88099095089,  0.411349378838,     Linan,   28.51087515,        857,  148784,           34647,         1033.52629,           90,    7.715083799, 11.22905028,          1,                  1
any(is.na(OA.Automobile@data)) #check missing value
## [1] FALSE

Firstly, we calculate Moran’s I for every varaible to see if it is geo-related.

nb <- poly2nb(OA.Automobile)
lw <- nb2listw(nb, style="W", zero.policy=TRUE)

Some variables are very geographically related.

for(i in names(OA.Automobile)[6:15]){
global.moran <- moran.test(OA.Automobile[[i]],lw, na.action = na.exclude, zero.policy=TRUE)
print(i)
print(global.moran$estimate[1])}
## [1] "car.ownership"
## Moran I statistic 
##            0.1964 
## [1] "population"
## Moran I statistic 
##            0.1181 
## [1] "per.GDP"
## Moran I statistic 
##            0.6084 
## [1] "household.price"
## Moran I statistic 
##            0.2067 
## [1] "population.density"
## Moran I statistic 
##            0.4483 
## [1] "urbanization"
## Moran I statistic 
##            0.1938 
## [1] "milage.roadway"
## Moran I statistic 
##            0.5811 
## [1] "ratio.bus"
## Moran I statistic 
##            0.3554 
## [1] "prefectrue"
## Moran I statistic 
##          -0.09234 
## [1] "municipal.district"
## Moran I statistic 
##            0.1211

Lets have a look at the dependent variables.

qtm(OA.Automobile, fill=variable.name[2:8], fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), n=9, title=variable.name[2:8]) + tm_scale_bar(position=c("left", "bottom"))

Then we have a look at the distribution of the car ownership – the independent variable.

boxplot(Automobile.data[3], main="boxplot of car ownership")

qtm(OA.Automobile, fill="car.ownership", fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title="boxplot of car ownership") + tm_scale_bar(position=c("left", "bottom"))

We can see there are two strange places.
After checking the data, we notice that data for Jindong district is 0 and remove it.

Automobile.data <- Automobile.data[-42,]
OA.Automobile <- merge(Output.Areas, Automobile.data, by.x="县级", by.y="市县名称", all=FALSE)

2. Regression

Set up the regression pattern. Perform a linear regression first.

formula <- paste0("car.ownership ~", paste(names(Automobile.data)[4:12], collapse=' + '))
model <- lm(formula, data = OA.Automobile)
glance(model)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.440         0.359  5.47      5.42 1.84e-5     9  -219.  460.  485.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

We can see the R2 is 0.4 and the p value is low.
The residuals in my model is normally distributed from the above. So we can proceed on other validation.

par(mfrow=c(2,2))
plot(model)

3. Skewness Test

Firstly, We draw see the distribution of variable.

multi.hist(Automobile.data[4:10])

Then We run skewness test.

skewness(Automobile.data[4:10])
##         population            per.GDP    household.price population.density 
##             4.5786             0.6771             1.4238             0.9688 
##       urbanization     milage.roadway          ratio.bus 
##             0.5689             2.3596             1.2082

We can see some variables turn out to be positively-skewed. So let’s try symbox.

for(i in c(4,6,7,9,10)){
symbox(Automobile.data[[i]])}

We choose to do the log transformation to skewed variables.

for(i in c(4,6,7,9,10)){
Automobile.data[i] <- log10(Automobile.data[i])}
skewness(Automobile.data[4:10])
##         population            per.GDP    household.price population.density 
##            0.09817            0.67714            0.55627           -0.42585 
##       urbanization     milage.roadway          ratio.bus 
##            0.56888            0.22024           -0.40566

Now all skewness is under control.

4. VIF Test

par(mai=c(0,0,40,0), mex=2)
corrplot(cor(Automobile.data[4:10]), cl.pos="b", tl.col="black", main="Correlation matrix of the independent variables", mar=c(0,0,1,0), tl.cex=0.6)

We calculate the VIF to test multi-correlation.

car::vif(model)
##         population            per.GDP    household.price population.density 
##              2.513              2.428              2.892              2.790 
##       urbanization     milage.roadway          ratio.bus         prefectrue 
##              3.835              2.132              1.517              2.195 
## municipal.district 
##              1.691

All the VIFs are under 5 thus do not need to remove. Now try again.

model <- lm(formula, data = OA.Automobile)
glance(model)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.440         0.359  5.47      5.42 1.84e-5     9  -219.  460.  485.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
par(mfrow=c(2,2))
plot(model)

5. Model Interpretation

Now let’s see the variables’ importance. We begin with the standardized coefficients of the variables.

OA.Automobile.data.standardized <- lapply(OA.Automobile@data[, colnames(OA.Automobile@data)[6:15]], scale) 
model <- lm(formula, data = OA.Automobile.data.standardized)
model$coefficients
##            (Intercept)             population                per.GDP 
## -0.0000000000000002451 -0.1226112251978570128 -0.0809917292503943564 
##        household.price     population.density           urbanization 
##  0.4462026129776222594  0.0484180978199241080  0.2731129521868498333 
##         milage.roadway              ratio.bus             prefectrue 
## -0.3100273105529269824 -0.0693344842429929470 -0.1859919311704416600 
##     municipal.district 
## -0.1131985211680714698
par(mai=c(1,2,1,1))
barplot(model$coefficients, horiz=TRUE, las=1, main="Standardized coefficients of the variables")

We try to remove the variables one for a time and see how much the adjusted R2 drop.

original <- summary(model)$adj.r.squared
for (i in 1:9){
formula.compare <- paste0("car.ownership ~", paste(names(Automobile.data)[4:12][-i], collapse=' + '))
model.compare <- lm(formula.compare, data=OA.Automobile)
print(names(Automobile.data)[4:12][i])
print(original - summary(model.compare)$adj.r.squared)}
## [1] "population"
## [1] -0.003434
## [1] "per.GDP"
## [1] -0.007131
## [1] "household.price"
## [1] 0.06742
## [1] "population.density"
## [1] -0.009229
## [1] "urbanization"
## [1] 0.01174
## [1] "milage.roadway"
## [1] 0.04063
## [1] "ratio.bus"
## [1] -0.006605
## [1] "prefectrue"
## [1] 0.007585
## [1] "municipal.district"
## [1] -0.001637

It turns out that Road Milage,Household and Urbanization are the 3 most important variables. This is the same as above.

resids <- residuals(model)
map.resids <- cbind(OA.Automobile, resids) 
names(map.resids)[length(names(map.resids))] <- "resids"

Draw residuals on the map.

qtm(map.resids, fill = "resids", text="city.name", scale=0.6, fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title="residuals") + tm_scale_bar(position=c("left", "bottom"))

We can see the residuals are very geo-related.So we decide to try GWR on it.

6. Geographic Weighted Regression

GWRbandwidth <- gwr.sel(formula, data = OA.Automobile, adapt = TRUE)
## Adaptive q: 0.382 CV score: 2490 
## Adaptive q: 0.618 CV score: 2651 
## Adaptive q: 0.2361 CV score: 2276 
## Adaptive q: 0.1459 CV score: 2094 
## Adaptive q: 0.09017 CV score: 2075 
## Adaptive q: 0.1033 CV score: 2041 
## Adaptive q: 0.1156 CV score: 2051 
## Adaptive q: 0.1065 CV score: 2040 
## Adaptive q: 0.1057 CV score: 2040 
## Adaptive q: 0.1059 CV score: 2040 
## Adaptive q: 0.1058 CV score: 2040 
## Adaptive q: 0.1058 CV score: 2040 
## Adaptive q: 0.1058 CV score: 2040

Train our GWR model.

gwr.model = gwr(formula,
                data = OA.Automobile,
                adapt=GWRbandwidth,
                hatmatrix=TRUE,
                se.fit=TRUE)
gwr.model
## Call:
## gwr(formula = formula, data = OA.Automobile, adapt = GWRbandwidth, 
##     hatmatrix = TRUE, se.fit = TRUE)
## Kernel function: gwr.Gauss 
## Adaptive quantile: 0.1058 (about 7 of 72 data points)
## Summary of GWR coefficient estimates at data points:
##                           Min.     1st Qu.      Median     3rd Qu.        Max.
## X.Intercept.       -14.5940736  -7.1086118  -3.7570997   2.8721890  18.6470875
## population          -0.0228107  -0.0153691  -0.0113684  -0.0036852   0.0387347
## per.GDP             -0.0001122  -0.0000224   0.0000192   0.0001131   0.0002054
## household.price      0.0000859   0.0002307   0.0004008   0.0005170   0.0007724
## population.density  -0.0068098   0.0000259   0.0016807   0.0039227   0.0142429
## urbanization         0.1136966   0.2174272   0.2720037   0.3887964   0.5798661
## milage.roadway      -1.9356064  -0.8427146  -0.3909161  -0.0872519   2.1475596
## ratio.bus           -0.3334740  -0.0272611   0.0930551   0.1619113   0.2162950
## prefectrue         -10.8604121  -7.0904722  -5.2410947  -3.5639391  -0.1917933
## municipal.district -13.3438148  -7.9175550  -6.8557015  -4.5399086   1.4777459
##                    Global
## X.Intercept.         9.70
## population          -0.01
## per.GDP              0.00
## household.price      0.00
## population.density   0.00
## urbanization         0.18
## milage.roadway      -0.73
## ratio.bus           -0.04
## prefectrue          -3.39
## municipal.district  -2.45
## Number of data points: 72 
## Effective number of parameters (residual: 2traceS - traceS'S): 41.73 
## Effective degrees of freedom (residual: 2traceS - traceS'S): 30.27 
## Sigma (residual: 2traceS - traceS'S): 4.475 
## Effective number of parameters (model: traceS): 33.13 
## Effective degrees of freedom (model: traceS): 38.87 
## Sigma (model: traceS): 3.948 
## Sigma (ML): 2.901 
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 491 
## AIC (GWR p. 96, eq. 4.22): 390.8 
## Residual sum of squares: 606 
## Quasi-global R2: 0.8174

After the GWR, the R2 is now up to 0.82, which is a huge imporvement from OLS.

results <-as.data.frame(gwr.model$SDF)
gwr.map <- cbind(OA.Automobile, as.matrix(results))
variable.number.name <- paste(variable.name,".number", sep = "")
variable.coef.name <- paste(variable.name[-1],".coef", sep = "")
names(gwr.map)[6:15] <- variable.number.name
names(gwr.map)[18:26] <- variable.coef.name

We draw the local R2 to have a better look.

qtm(gwr.map, fill = "localR2",text="city.name", scale=0.6, fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title="localR2", format=) + tm_scale_bar(position=c("left", "bottom"))

We draw the parameters of every feature to have a better look.

par(mfrow=c(3,4))
qtm(gwr.map, fill = variable.coef.name, scale=0.6, fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title=variable.coef.name) + tm_scale_bar(position=c("left", "bottom"))

7. Online Presentation

Draw the local R2, the 3 most impactable variables and their coefficients as different layers in one leaflet map.

pick.variable.coef.name <- c("milage.roadway.coef", "urbanization.coef", "household.price.coef")
pick.variable.number.name <- c( "milage.roadway.number", "household.price.number", "urbanization.number")
leaflet.tmap <- tm_shape(gwr.map) + tm_polygons(
c(pick.variable.coef.name, "localR2", pick.variable.number.name), palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), n=9) + tm_facets(as.layers = TRUE)
tmap_leaflet(leaflet.tmap)