GWR-Based Analysis of Influencing Factors on Car Ownership in Zhejiang Province, China

1. Prepocess

library(broom)
library(corrplot)
library(grid)
library(gridExtra)
library(htmlwidgets)
library(leaflet)
library(moments) 
library(psych)
library(plyr)
library(rgdal)
library(rgeos)
library(spdep)
library(sf)
library(sp)
library(spgwr)
library(tmap)
library(tmaptools)
library(tidyverse)
library(car)
# Change the presentation of decimal numbers to 4 and avoid scientific notation
options(prompt="R> ", digits=4, scipen=999)

Download the data source we uploaded online earlier.

download.file("https://github.com/lizhiyuan913/GIS-assessment/archive/main.zip", destfile="source.zip")
unzip("source.zip")

Then read in the data, and have an overview.

Automobile.data <- read.csv("./GIS-assessment-main/data.csv", sep=',')
glimpse(Automobile.data)

## Rows: 73
## Columns: 12
## $ 市县名称                                                             <chr> "杭...
## $ city.name                                                            <chr> ...
## $ car.ownership.per.100.people.unit.1000people.                        <dbl> ...
## $ population.10000people.                                              <dbl> ...
## $ per.capita.GDP.yuan.                                                 <int> ...
## $ household.price.yuan...                                              <int> ...
## $ population.density.person.k..                                        <dbl> ...
## $ urbanization...                                                      <dbl> ...
## $ Per.capita.milage.of.roadway.network                                 <dbl> ...
## $ Ratio.of.bus.passengers.to.population.1000.person.times.1000.person. <dbl> ...
## $ whether.downtown.of.a.prefecture.level.city.dummy.variable.          <int> ...
## $ whether.municipal.district.dummy.variable.                           <int> ...

Chinese names of cities here are used for joining datas.

variable.name <- c("car.ownership", "population", "per.GDP", "household.price", "population.density", "urbanization", "milage.roadway", "ratio.bus", "prefectrue", "municipal.district")
names(Automobile.data)[3:12] <- variable.name

Read in the map.

Output.Areas <- readOGR("./GIS-assessment-main", "district", use_iconv = TRUE, encoding = "UTF-8")

## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\VIAN33\Documents\WeChat Files\wxid_9cp3t5anw7pd12\FileStorage\File\2021-01\GIS-assessment-main", layer: "district"
## with 73 features
## It has 4 fields

Name the columns.

names(Output.Areas)[1:2] <- c("市级","县级")

Merge the two datasets.

OA.Automobile <- merge(Output.Areas, Automobile.data, by.x="县级", by.y="市县名称", all=FALSE)

head(OA.Automobile, 5)

## class       : SpatialPolygonsDataFrame 
## features    : 5 
## extent      : 118.3, 120.4, 29.19, 30.45  (xmin, xmax, ymin, ymax)
## crs         : +proj=longlat +datum=WGS84 +no_defs 
## variables   : 15
## names       :   县级,   市级,    Shape_Leng,      Shape_Area, city.name, car.ownership, population, per.GDP, household.price, population.density, urbanization, milage.roadway,   ratio.bus, prefectrue, municipal.district 
## min values  : 淳安县, 杭州市, 2.02686291242, 0.0650807384866,    Chunan,   16.59413408,       35.8,   68407,           14077,        81.05048676,           50,     1.15134189,  3.20754717,          0,                  0 
## max values  : 临安区, 杭州市, 3.88099095089,  0.411349378838,     Linan,   28.51087515,        857,  148784,           34647,         1033.52629,           90,    7.715083799, 11.22905028,          1,                  1

any(is.na(OA.Automobile@data)) #check missing value

## [1] FALSE

Firstly, we calculate Moran’s I for every varaible to see if it is geo-related.

nb <- poly2nb(OA.Automobile)
lw <- nb2listw(nb, style="W", zero.policy=TRUE)

Some variables are very geographically related.

for(i in names(OA.Automobile)[6:15]){
global.moran <- moran.test(OA.Automobile[[i]],lw, na.action = na.exclude, zero.policy=TRUE)
print(i)
print(global.moran$estimate[1])}

## [1] "car.ownership"
## Moran I statistic 
##            0.1964 
## [1] "population"
## Moran I statistic 
##            0.1181 
## [1] "per.GDP"
## Moran I statistic 
##            0.6084 
## [1] "household.price"
## Moran I statistic 
##            0.2067 
## [1] "population.density"
## Moran I statistic 
##            0.4483 
## [1] "urbanization"
## Moran I statistic 
##            0.1938 
## [1] "milage.roadway"
## Moran I statistic 
##            0.5811 
## [1] "ratio.bus"
## Moran I statistic 
##            0.3554 
## [1] "prefectrue"
## Moran I statistic 
##          -0.09234 
## [1] "municipal.district"
## Moran I statistic 
##            0.1211

Lets have a look at the dependent variables.

qtm(OA.Automobile, fill=variable.name[2:8], fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), n=9, title=variable.name[2:8]) + tm_scale_bar(position=c("left", "bottom"))

Then we have a look at the distribution of the car ownership – the independent variable.

boxplot(Automobile.data[3], main="boxplot of car ownership")

qtm(OA.Automobile, fill="car.ownership", fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title="boxplot of car ownership") + tm_scale_bar(position=c("left", "bottom"))

We can see there are two strange places.
After checking the data, we notice that data for Jindong district is 0 and remove it.

Automobile.data <- Automobile.data[-42,]
OA.Automobile <- merge(Output.Areas, Automobile.data, by.x="县级", by.y="市县名称", all=FALSE)

2. Regression

Set up the regression pattern. Perform a linear regression first.

formula <- paste0("car.ownership ~", paste(names(Automobile.data)[4:12], collapse=' + '))
model <- lm(formula, data = OA.Automobile)
glance(model)

## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.440         0.359  5.47      5.42 1.84e-5     9  -219.  460.  485.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

We can see the R2 is 0.4 and the p value is low.
The residuals in my model is normally distributed from the above. So we can proceed on other validation.

par(mfrow=c(2,2))
plot(model)

3. Skewness Test

Firstly, We draw see the distribution of variable.

multi.hist(Automobile.data[4:10])

Then We run skewness test.

skewness(Automobile.data[4:10])

##         population            per.GDP    household.price population.density 
##             4.5786             0.6771             1.4238             0.9688 
##       urbanization     milage.roadway          ratio.bus 
##             0.5689             2.3596             1.2082

We can see some variables turn out to be positively-skewed. So let’s try symbox.

for(i in c(4,6,7,9,10)){
symbox(Automobile.data[[i]])}

We choose to do the log transformation to skewed variables.

for(i in c(4,6,7,9,10)){
Automobile.data[i] <- log10(Automobile.data[i])}

skewness(Automobile.data[4:10])

##         population            per.GDP    household.price population.density 
##            0.09817            0.67714            0.55627           -0.42585 
##       urbanization     milage.roadway          ratio.bus 
##            0.56888            0.22024           -0.40566

Now all skewness is under control.

4. VIF Test

par(mai=c(0,0,40,0), mex=2)
corrplot(cor(Automobile.data[4:10]), cl.pos="b", tl.col="black", main="Correlation matrix of the independent variables", mar=c(0,0,1,0), tl.cex=0.6)

We calculate the VIF to test multi-correlation.

car::vif(model)

##         population            per.GDP    household.price population.density 
##              2.513              2.428              2.892              2.790 
##       urbanization     milage.roadway          ratio.bus         prefectrue 
##              3.835              2.132              1.517              2.195 
## municipal.district 
##              1.691

All the VIFs are under 5 thus do not need to remove. Now try again.

model <- lm(formula, data = OA.Automobile)
glance(model)

## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.440         0.359  5.47      5.42 1.84e-5     9  -219.  460.  485.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

par(mfrow=c(2,2))
plot(model)

5. Model Interpretation

Now let’s see the variables’ importance. We begin with the standardized coefficients of the variables.

OA.Automobile.data.standardized <- lapply(OA.Automobile@data[, colnames(OA.Automobile@data)[6:15]], scale) 
model <- lm(formula, data = OA.Automobile.data.standardized)
model$coefficients

##            (Intercept)             population                per.GDP 
## -0.0000000000000002451 -0.1226112251978570128 -0.0809917292503943564 
##        household.price     population.density           urbanization 
##  0.4462026129776222594  0.0484180978199241080  0.2731129521868498333 
##         milage.roadway              ratio.bus             prefectrue 
## -0.3100273105529269824 -0.0693344842429929470 -0.1859919311704416600 
##     municipal.district 
## -0.1131985211680714698

par(mai=c(1,2,1,1))
barplot(model$coefficients, horiz=TRUE, las=1, main="Standardized coefficients of the variables")

We try to remove the variables one for a time and see how much the adjusted R2 drop.

original <- summary(model)$adj.r.squared
for (i in 1:9){
formula.compare <- paste0("car.ownership ~", paste(names(Automobile.data)[4:12][-i], collapse=' + '))
model.compare <- lm(formula.compare, data=OA.Automobile)
print(names(Automobile.data)[4:12][i])
print(original - summary(model.compare)$adj.r.squared)}

## [1] "population"
## [1] -0.003434
## [1] "per.GDP"
## [1] -0.007131
## [1] "household.price"
## [1] 0.06742
## [1] "population.density"
## [1] -0.009229
## [1] "urbanization"
## [1] 0.01174
## [1] "milage.roadway"
## [1] 0.04063
## [1] "ratio.bus"
## [1] -0.006605
## [1] "prefectrue"
## [1] 0.007585
## [1] "municipal.district"
## [1] -0.001637

It turns out that Road Milage，Household and Urbanization are the 3 most important variables. This is the same as above.

resids <- residuals(model)
map.resids <- cbind(OA.Automobile, resids) 
names(map.resids)[length(names(map.resids))] <- "resids"

Draw residuals on the map.

qtm(map.resids, fill = "resids", text="city.name", scale=0.6, fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title="residuals") + tm_scale_bar(position=c("left", "bottom"))

We can see the residuals are very geo-related.So we decide to try GWR on it.

6. Geographic Weighted Regression

GWRbandwidth <- gwr.sel(formula, data = OA.Automobile, adapt = TRUE)

## Adaptive q: 0.382 CV score: 2490 
## Adaptive q: 0.618 CV score: 2651 
## Adaptive q: 0.2361 CV score: 2276 
## Adaptive q: 0.1459 CV score: 2094 
## Adaptive q: 0.09017 CV score: 2075 
## Adaptive q: 0.1033 CV score: 2041 
## Adaptive q: 0.1156 CV score: 2051 
## Adaptive q: 0.1065 CV score: 2040 
## Adaptive q: 0.1057 CV score: 2040 
## Adaptive q: 0.1059 CV score: 2040 
## Adaptive q: 0.1058 CV score: 2040 
## Adaptive q: 0.1058 CV score: 2040 
## Adaptive q: 0.1058 CV score: 2040

Train our GWR model.

gwr.model = gwr(formula,
                data = OA.Automobile,
                adapt=GWRbandwidth,
                hatmatrix=TRUE,
                se.fit=TRUE)
gwr.model

## Call:
## gwr(formula = formula, data = OA.Automobile, adapt = GWRbandwidth, 
##     hatmatrix = TRUE, se.fit = TRUE)
## Kernel function: gwr.Gauss 
## Adaptive quantile: 0.1058 (about 7 of 72 data points)
## Summary of GWR coefficient estimates at data points:
##                           Min.     1st Qu.      Median     3rd Qu.        Max.
## X.Intercept.       -14.5940736  -7.1086118  -3.7570997   2.8721890  18.6470875
## population          -0.0228107  -0.0153691  -0.0113684  -0.0036852   0.0387347
## per.GDP             -0.0001122  -0.0000224   0.0000192   0.0001131   0.0002054
## household.price      0.0000859   0.0002307   0.0004008   0.0005170   0.0007724
## population.density  -0.0068098   0.0000259   0.0016807   0.0039227   0.0142429
## urbanization         0.1136966   0.2174272   0.2720037   0.3887964   0.5798661
## milage.roadway      -1.9356064  -0.8427146  -0.3909161  -0.0872519   2.1475596
## ratio.bus           -0.3334740  -0.0272611   0.0930551   0.1619113   0.2162950
## prefectrue         -10.8604121  -7.0904722  -5.2410947  -3.5639391  -0.1917933
## municipal.district -13.3438148  -7.9175550  -6.8557015  -4.5399086   1.4777459
##                    Global
## X.Intercept.         9.70
## population          -0.01
## per.GDP              0.00
## household.price      0.00
## population.density   0.00
## urbanization         0.18
## milage.roadway      -0.73
## ratio.bus           -0.04
## prefectrue          -3.39
## municipal.district  -2.45
## Number of data points: 72 
## Effective number of parameters (residual: 2traceS - traceS'S): 41.73 
## Effective degrees of freedom (residual: 2traceS - traceS'S): 30.27 
## Sigma (residual: 2traceS - traceS'S): 4.475 
## Effective number of parameters (model: traceS): 33.13 
## Effective degrees of freedom (model: traceS): 38.87 
## Sigma (model: traceS): 3.948 
## Sigma (ML): 2.901 
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 491 
## AIC (GWR p. 96, eq. 4.22): 390.8 
## Residual sum of squares: 606 
## Quasi-global R2: 0.8174

After the GWR, the R2 is now up to 0.82, which is a huge imporvement from OLS.

results <-as.data.frame(gwr.model$SDF)

gwr.map <- cbind(OA.Automobile, as.matrix(results))

variable.number.name <- paste(variable.name,".number", sep = "")
variable.coef.name <- paste(variable.name[-1],".coef", sep = "")

names(gwr.map)[6:15] <- variable.number.name
names(gwr.map)[18:26] <- variable.coef.name

We draw the local R2 to have a better look.

qtm(gwr.map, fill = "localR2",text="city.name", scale=0.6, fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title="localR2", format=) + tm_scale_bar(position=c("left", "bottom"))

We draw the parameters of every feature to have a better look.

par(mfrow=c(3,4))
qtm(gwr.map, fill = variable.coef.name, scale=0.6, fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title=variable.coef.name) + tm_scale_bar(position=c("left", "bottom"))

7. Online Presentation

Draw the local R2, the 3 most impactable variables and their coefficients as different layers in one leaflet map.

pick.variable.coef.name <- c("milage.roadway.coef", "urbanization.coef", "household.price.coef")
pick.variable.number.name <- c( "milage.roadway.number", "household.price.number", "urbanization.number")

leaflet.tmap <- tm_shape(gwr.map) + tm_polygons(
c(pick.variable.coef.name, "localR2", pick.variable.number.name), palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), n=9) + tm_facets(as.layers = TRUE)
tmap_leaflet(leaflet.tmap)