library(broom)
library(corrplot)
library(grid)
library(gridExtra)
library(htmlwidgets)
library(leaflet)
library(moments)
library(psych)
library(plyr)
library(rgdal)
library(rgeos)
library(spdep)
library(sf)
library(sp)
library(spgwr)
library(tmap)
library(tmaptools)
library(tidyverse)
library(car)
# Change the presentation of decimal numbers to 4 and avoid scientific notation
options(prompt="R> ", digits=4, scipen=999)
Download the data source we uploaded online earlier.
download.file("https://github.com/lizhiyuan913/GIS-assessment/archive/main.zip", destfile="source.zip")
unzip("source.zip")
Then read in the data, and have an overview.
Automobile.data <- read.csv("./GIS-assessment-main/data.csv", sep=',')
glimpse(Automobile.data)
## Rows: 73
## Columns: 12
## $ 市县名称 <chr> "杭...
## $ city.name <chr> ...
## $ car.ownership.per.100.people.unit.1000people. <dbl> ...
## $ population.10000people. <dbl> ...
## $ per.capita.GDP.yuan. <int> ...
## $ household.price.yuan... <int> ...
## $ population.density.person.k.. <dbl> ...
## $ urbanization... <dbl> ...
## $ Per.capita.milage.of.roadway.network <dbl> ...
## $ Ratio.of.bus.passengers.to.population.1000.person.times.1000.person. <dbl> ...
## $ whether.downtown.of.a.prefecture.level.city.dummy.variable. <int> ...
## $ whether.municipal.district.dummy.variable. <int> ...
Chinese names of cities here are used for joining datas.
variable.name <- c("car.ownership", "population", "per.GDP", "household.price", "population.density", "urbanization", "milage.roadway", "ratio.bus", "prefectrue", "municipal.district")
names(Automobile.data)[3:12] <- variable.name
Read in the map.
Output.Areas <- readOGR("./GIS-assessment-main", "district", use_iconv = TRUE, encoding = "UTF-8")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\VIAN33\Documents\WeChat Files\wxid_9cp3t5anw7pd12\FileStorage\File\2021-01\GIS-assessment-main", layer: "district"
## with 73 features
## It has 4 fields
Name the columns.
names(Output.Areas)[1:2] <- c("市级","县级")
Merge the two datasets.
OA.Automobile <- merge(Output.Areas, Automobile.data, by.x="县级", by.y="市县名称", all=FALSE)
head(OA.Automobile, 5)
## class : SpatialPolygonsDataFrame
## features : 5
## extent : 118.3, 120.4, 29.19, 30.45 (xmin, xmax, ymin, ymax)
## crs : +proj=longlat +datum=WGS84 +no_defs
## variables : 15
## names : 县级, 市级, Shape_Leng, Shape_Area, city.name, car.ownership, population, per.GDP, household.price, population.density, urbanization, milage.roadway, ratio.bus, prefectrue, municipal.district
## min values : 淳安县, 杭州市, 2.02686291242, 0.0650807384866, Chunan, 16.59413408, 35.8, 68407, 14077, 81.05048676, 50, 1.15134189, 3.20754717, 0, 0
## max values : 临安区, 杭州市, 3.88099095089, 0.411349378838, Linan, 28.51087515, 857, 148784, 34647, 1033.52629, 90, 7.715083799, 11.22905028, 1, 1
any(is.na(OA.Automobile@data)) #check missing value
## [1] FALSE
Firstly, we calculate Moran’s I for every varaible to see if it is geo-related.
nb <- poly2nb(OA.Automobile)
lw <- nb2listw(nb, style="W", zero.policy=TRUE)
Some variables are very geographically related.
for(i in names(OA.Automobile)[6:15]){
global.moran <- moran.test(OA.Automobile[[i]],lw, na.action = na.exclude, zero.policy=TRUE)
print(i)
print(global.moran$estimate[1])}
## [1] "car.ownership"
## Moran I statistic
## 0.1964
## [1] "population"
## Moran I statistic
## 0.1181
## [1] "per.GDP"
## Moran I statistic
## 0.6084
## [1] "household.price"
## Moran I statistic
## 0.2067
## [1] "population.density"
## Moran I statistic
## 0.4483
## [1] "urbanization"
## Moran I statistic
## 0.1938
## [1] "milage.roadway"
## Moran I statistic
## 0.5811
## [1] "ratio.bus"
## Moran I statistic
## 0.3554
## [1] "prefectrue"
## Moran I statistic
## -0.09234
## [1] "municipal.district"
## Moran I statistic
## 0.1211
Lets have a look at the dependent variables.
qtm(OA.Automobile, fill=variable.name[2:8], fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), n=9, title=variable.name[2:8]) + tm_scale_bar(position=c("left", "bottom"))
Then we have a look at the distribution of the car ownership – the independent variable.
boxplot(Automobile.data[3], main="boxplot of car ownership")
qtm(OA.Automobile, fill="car.ownership", fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title="boxplot of car ownership") + tm_scale_bar(position=c("left", "bottom"))
We can see there are two strange places.
After checking the data, we notice that data for Jindong district is 0 and remove it.
Automobile.data <- Automobile.data[-42,]
OA.Automobile <- merge(Output.Areas, Automobile.data, by.x="县级", by.y="市县名称", all=FALSE)
Set up the regression pattern. Perform a linear regression first.
formula <- paste0("car.ownership ~", paste(names(Automobile.data)[4:12], collapse=' + '))
model <- lm(formula, data = OA.Automobile)
glance(model)
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.440 0.359 5.47 5.42 1.84e-5 9 -219. 460. 485.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
We can see the R2 is 0.4 and the p value is low.
The residuals in my model is normally distributed from the above. So we can proceed on other validation.
par(mfrow=c(2,2))
plot(model)
Firstly, We draw see the distribution of variable.
multi.hist(Automobile.data[4:10])
Then We run skewness test.
skewness(Automobile.data[4:10])
## population per.GDP household.price population.density
## 4.5786 0.6771 1.4238 0.9688
## urbanization milage.roadway ratio.bus
## 0.5689 2.3596 1.2082
We can see some variables turn out to be positively-skewed. So let’s try symbox.
for(i in c(4,6,7,9,10)){
symbox(Automobile.data[[i]])}
We choose to do the log transformation to skewed variables.
for(i in c(4,6,7,9,10)){
Automobile.data[i] <- log10(Automobile.data[i])}
skewness(Automobile.data[4:10])
## population per.GDP household.price population.density
## 0.09817 0.67714 0.55627 -0.42585
## urbanization milage.roadway ratio.bus
## 0.56888 0.22024 -0.40566
Now all skewness is under control.
par(mai=c(0,0,40,0), mex=2)
corrplot(cor(Automobile.data[4:10]), cl.pos="b", tl.col="black", main="Correlation matrix of the independent variables", mar=c(0,0,1,0), tl.cex=0.6)
We calculate the VIF to test multi-correlation.
car::vif(model)
## population per.GDP household.price population.density
## 2.513 2.428 2.892 2.790
## urbanization milage.roadway ratio.bus prefectrue
## 3.835 2.132 1.517 2.195
## municipal.district
## 1.691
All the VIFs are under 5 thus do not need to remove. Now try again.
model <- lm(formula, data = OA.Automobile)
glance(model)
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.440 0.359 5.47 5.42 1.84e-5 9 -219. 460. 485.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
par(mfrow=c(2,2))
plot(model)
Now let’s see the variables’ importance. We begin with the standardized coefficients of the variables.
OA.Automobile.data.standardized <- lapply(OA.Automobile@data[, colnames(OA.Automobile@data)[6:15]], scale)
model <- lm(formula, data = OA.Automobile.data.standardized)
model$coefficients
## (Intercept) population per.GDP
## -0.0000000000000002451 -0.1226112251978570128 -0.0809917292503943564
## household.price population.density urbanization
## 0.4462026129776222594 0.0484180978199241080 0.2731129521868498333
## milage.roadway ratio.bus prefectrue
## -0.3100273105529269824 -0.0693344842429929470 -0.1859919311704416600
## municipal.district
## -0.1131985211680714698
par(mai=c(1,2,1,1))
barplot(model$coefficients, horiz=TRUE, las=1, main="Standardized coefficients of the variables")
We try to remove the variables one for a time and see how much the adjusted R2 drop.
original <- summary(model)$adj.r.squared
for (i in 1:9){
formula.compare <- paste0("car.ownership ~", paste(names(Automobile.data)[4:12][-i], collapse=' + '))
model.compare <- lm(formula.compare, data=OA.Automobile)
print(names(Automobile.data)[4:12][i])
print(original - summary(model.compare)$adj.r.squared)}
## [1] "population"
## [1] -0.003434
## [1] "per.GDP"
## [1] -0.007131
## [1] "household.price"
## [1] 0.06742
## [1] "population.density"
## [1] -0.009229
## [1] "urbanization"
## [1] 0.01174
## [1] "milage.roadway"
## [1] 0.04063
## [1] "ratio.bus"
## [1] -0.006605
## [1] "prefectrue"
## [1] 0.007585
## [1] "municipal.district"
## [1] -0.001637
It turns out that Road Milage,Household and Urbanization are the 3 most important variables. This is the same as above.
resids <- residuals(model)
map.resids <- cbind(OA.Automobile, resids)
names(map.resids)[length(names(map.resids))] <- "resids"
Draw residuals on the map.
qtm(map.resids, fill = "resids", text="city.name", scale=0.6, fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title="residuals") + tm_scale_bar(position=c("left", "bottom"))
We can see the residuals are very geo-related.So we decide to try GWR on it.
GWRbandwidth <- gwr.sel(formula, data = OA.Automobile, adapt = TRUE)
## Adaptive q: 0.382 CV score: 2490
## Adaptive q: 0.618 CV score: 2651
## Adaptive q: 0.2361 CV score: 2276
## Adaptive q: 0.1459 CV score: 2094
## Adaptive q: 0.09017 CV score: 2075
## Adaptive q: 0.1033 CV score: 2041
## Adaptive q: 0.1156 CV score: 2051
## Adaptive q: 0.1065 CV score: 2040
## Adaptive q: 0.1057 CV score: 2040
## Adaptive q: 0.1059 CV score: 2040
## Adaptive q: 0.1058 CV score: 2040
## Adaptive q: 0.1058 CV score: 2040
## Adaptive q: 0.1058 CV score: 2040
Train our GWR model.
gwr.model = gwr(formula,
data = OA.Automobile,
adapt=GWRbandwidth,
hatmatrix=TRUE,
se.fit=TRUE)
gwr.model
## Call:
## gwr(formula = formula, data = OA.Automobile, adapt = GWRbandwidth,
## hatmatrix = TRUE, se.fit = TRUE)
## Kernel function: gwr.Gauss
## Adaptive quantile: 0.1058 (about 7 of 72 data points)
## Summary of GWR coefficient estimates at data points:
## Min. 1st Qu. Median 3rd Qu. Max.
## X.Intercept. -14.5940736 -7.1086118 -3.7570997 2.8721890 18.6470875
## population -0.0228107 -0.0153691 -0.0113684 -0.0036852 0.0387347
## per.GDP -0.0001122 -0.0000224 0.0000192 0.0001131 0.0002054
## household.price 0.0000859 0.0002307 0.0004008 0.0005170 0.0007724
## population.density -0.0068098 0.0000259 0.0016807 0.0039227 0.0142429
## urbanization 0.1136966 0.2174272 0.2720037 0.3887964 0.5798661
## milage.roadway -1.9356064 -0.8427146 -0.3909161 -0.0872519 2.1475596
## ratio.bus -0.3334740 -0.0272611 0.0930551 0.1619113 0.2162950
## prefectrue -10.8604121 -7.0904722 -5.2410947 -3.5639391 -0.1917933
## municipal.district -13.3438148 -7.9175550 -6.8557015 -4.5399086 1.4777459
## Global
## X.Intercept. 9.70
## population -0.01
## per.GDP 0.00
## household.price 0.00
## population.density 0.00
## urbanization 0.18
## milage.roadway -0.73
## ratio.bus -0.04
## prefectrue -3.39
## municipal.district -2.45
## Number of data points: 72
## Effective number of parameters (residual: 2traceS - traceS'S): 41.73
## Effective degrees of freedom (residual: 2traceS - traceS'S): 30.27
## Sigma (residual: 2traceS - traceS'S): 4.475
## Effective number of parameters (model: traceS): 33.13
## Effective degrees of freedom (model: traceS): 38.87
## Sigma (model: traceS): 3.948
## Sigma (ML): 2.901
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 491
## AIC (GWR p. 96, eq. 4.22): 390.8
## Residual sum of squares: 606
## Quasi-global R2: 0.8174
After the GWR, the R2 is now up to 0.82, which is a huge imporvement from OLS.
results <-as.data.frame(gwr.model$SDF)
gwr.map <- cbind(OA.Automobile, as.matrix(results))
variable.number.name <- paste(variable.name,".number", sep = "")
variable.coef.name <- paste(variable.name[-1],".coef", sep = "")
names(gwr.map)[6:15] <- variable.number.name
names(gwr.map)[18:26] <- variable.coef.name
We draw the local R2 to have a better look.
qtm(gwr.map, fill = "localR2",text="city.name", scale=0.6, fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title="localR2", format=) + tm_scale_bar(position=c("left", "bottom"))
We draw the parameters of every feature to have a better look.
par(mfrow=c(3,4))
qtm(gwr.map, fill = variable.coef.name, scale=0.6, fill.palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), title=variable.coef.name) + tm_scale_bar(position=c("left", "bottom"))
Draw the local R2, the 3 most impactable variables and their coefficients as different layers in one leaflet map.
pick.variable.coef.name <- c("milage.roadway.coef", "urbanization.coef", "household.price.coef")
pick.variable.number.name <- c( "milage.roadway.number", "household.price.number", "urbanization.number")
leaflet.tmap <- tm_shape(gwr.map) + tm_polygons(
c(pick.variable.coef.name, "localR2", pick.variable.number.name), palette=c("#ffffd9",
"#edf8b1",
"#c7e9b4",
"#7fcdbb",
"#41b6c4",
"#1d91c0",
"#225ea8",
"#253494",
"#081d58"), n=9) + tm_facets(as.layers = TRUE)
tmap_leaflet(leaflet.tmap)