Set up

packages = c('tidyverse', 'funModeling', 'knitr', 'rgdal', 'maptools', 'sf','raster','spatstat', 'tmap','tmaptools', 'gridExtra', 'leaflet', 'OpenStreetMap', 'microbenchmark', 'doParallel', 'foreach', 'GWmodel', 'car', 'httr', 'jsonlite', 'kableExtra')
for (p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
options(kableExtra.auto_format = FALSE)

1 Loading the Data

load("data/listings_gwr1_v3.RData")
load("data/listings_gwr3_v1.RData")

We load the data that contains our dependent and independent variables for the regression analysis. listings_gwr1 contains the variables for 7272 listings. listings_gwr3 filters out listings that do not have a review, meaning that they have had at least one visitor, and has 4466 listings.

2 Bandwidth selection

During model calibration, bandwidths are tested and assigned cross validation (CV) scores; the bandwidth with the lowest CV score produces the lowest root mean square prediction error. We use bw.gwr() to select the best bandwidth to be used in our GWR analysis. We use 2 different approaches (AICc, CV) and use 2 different kernels (gaussian, bisquare) to identify the bandwidths.

2.1 Adaptive bandwidths

# Adaptive bandwidths
bwidth_adaptive <- list()

for (i in seq_along(kerneltype)) {
  kernel_name <- kerneltype[i]
  bwidth_adaptive[[kernel_name]] <- list()
  for (j in seq_along(approachtype)) {
    approach_name <- approachtype[j]
    bwidth_adaptive[[kernel_name]][[approach_name]] <- bw.gwr(total_price ~ bathrooms + bedrooms + private + entire + shared + number_of_reviews + guests_included + superhost + mrt_350m + mrt_700m + tourdistindex + hotels_250m + hotels_500m + hotels_1000m + hotels_2000m + hosp_500m + hosp_1000m + hosp_2000m + hosp_5000m + mall_500m + mall_1000m + mall_2000m + malldistindex + flexible + moderate, data=listings_gwr1, approach=approachtype[j], kernel=kerneltype[i], adaptive = TRUE)
  }
}

We create a list object with the various bandwidths calculated to be able to call them up later. They can be accessed in the form bwidth_{fixed/adaptive}[[kerneltype]][[approachtype]]. The above code above gives us the adapative bandwidths.

2.2 Saving bandwidth data

save(bwidth_adaptive, file="data/GWRbandwidth_adaptive.RData")

The best adaptive bandwidth using AIC and CV is 134 and 109 respectively using a Gaussian kernel; the best bandwidth using AIC and CV is 1596 and 1598 respectively using a Bisquare kernel. We save the bandwidths to fit our basic gwr model.

3 Basic GWR

# Create vector of all independent variables
all_indepvar <- c("bathrooms", "bedrooms", "private", "entire", "shared", "number_of_reviews", "guests_included", "superhost", "mrt_350m", "mrt_700m", "tourdistindex", "hotels_250m", "hotels_500m", "hotels_1000m", "hotels_2000m", "hosp_500m", "hosp_1000m", "hosp_2000m", "hosp_5000m", "mall_500m", "mall_1000m", "mall_2000m", "malldistindex", "flexible", "moderate")

We create a vector of all the independent variables.

3.1 Model Selection

We use the gwr.model.selection() function to go through models with different permutations of the independent variables. This returns an object with the model, the independent variables and the kernel, AIC, AICc, and RSS of the models. As we will be doing this for the 4 different permutations, we create functions to run the model selection, help select the best models by RSS, AIC and AICc and then the best model amongst the three with the highest adjusted R2 value.

3.1.1 Helper Functions

3.1.1.1 Select models with lowest AIC/AICc/RSS

# Function to select models with lowest AIC, AICc and RSS
model_type <- c("best_AIC", "best_AICc", "best_RSS")

best_model <- function(modelsel, indepvar) {
  sorted.models <- gwr.model.sort(modelsel, numVars = length(indepvar), ruler.vector = modelsel[[2]][,2])
  modelsel.df <- data.frame()
  for (i in seq_along(sorted.models[[1]])) {
  modelsel.df[i, "model"] <-sorted.models[[1]][[i]][[1]]
  }
  res_1 <- sorted.models[[2]] %>% as.data.frame()
  colnames(res_1) <- c("bandwidth", "AIC", "AICc", "RSS")

  modelsel.df <- cbind(modelsel.df, res_1)

  obj_name <- rbind(
    modelsel.df[which.min(modelsel.df$AIC),],
    modelsel.df[which.min(modelsel.df$AICc),],
    modelsel.df[which.min(modelsel.df$RSS),]
  )
  modelselname <- rep(deparse(substitute(modelsel)), 3)
  obj_name <- cbind(obj_name, model_type, modelselname)
}

We create a function to sort the models and select the 3 models with the lowest AIC, AICc, and RSS, extract the model into a dataframe.

3.1.1.2 Functions to get best model by adjusted R2

best_model_gwr <- function(bestmodelobj, kernel_sel) {
  basegwr <- list()
  for (i in seq_along(bestmodelobj$model)) {
    x = bestmodelobj$model_type[i]
     basegwr[[x]] <- gwr.basic(formula=bestmodelobj$model[i], data=listings_gwr1, kernel=kernel_sel, adaptive = TRUE, bw=bestmodelobj$bandwidth[i], cv=TRUE)
  }
  return(basegwr)
}

best_model_adjr2 <- function(x) {
  diagnostics <- data.frame()
  for (i in seq_along(x)) {
    temp <- x[[i]]$GW.diagnostic %>% as.data.frame()
    temp <- cbind(temp, x[[i]]$GW.arguments %>% as.data.frame())
    temp <- cbind(temp, model_type=names(x)[i])
    diagnostics <- rbind(diagnostics, temp)
  }
  diagnostics[which.max(diagnostics$gwR2.adj),]
}

The above functions help us to run the basic gwr function on the 3 selected models using the given bandwidths and kernel and select the best model with the highest adjusted R2.

3.1.2 Adaptive bw models

modelselect.adaptive <- list()
approachtype2 <- c("AIC", "CV")

for (i in seq_along(kerneltype)) {
  kernel_name <- kerneltype[i]
  modelselect.adaptive[[kernel_name]] <- list()
  for (j in seq_along(approachtype2)) {
    approach_name <- approachtype2[j]
    approach_bw <- approachtype[j]
    modelselect.adaptive[[kernel_name]][[approach_name]] <- gwr.model.selection(DeVar = "total_price", InDeVars = all_indepvar, data=listings_gwr1, approach = approach_name, kernel=kernel_name, adaptive = TRUE, bw=bwidth_adaptive[[kernel_name]][[approach_bw]], parallel.method = "cluster")
  }
}

We create a list object of model selections in the form x[[kerneltype]][[approachtype]]. We then use the helper functions to select the best models from each of these kernel types and approach to get the best model.

3.1.3 Best adaptive bw models

The sections below use the helper functions to obtain the best model with the highest adjusted R2 for a GWR using the different kernel and approaches. The results are saved in an RData file that can be accessed for further analysis.

3.1.3.1 Best Gaussian, AIC model

# AIC, Gaussian
# Select model with lowest AIC, AICc, RSS
selgwr.aicg <- best_model(modelselect.adaptive$gaussian$AICc, all_indepvar)
selgwr.aicg.top <- best_model_gwr(selgwr.aicg, "gaussian")

# Select model with best adjusted R2
selgwr.aicg.best <- best_model_adjr2(selgwr.aicg.top)
save(selgwr.aicg, selgwr.aicg.top, selgwr.aicg.best, file="data/aic_gaussian_best.RData")

3.1.3.2 Best Gaussian CV model

# CV, Gaussian
# Select model with lowest AIC, AICc, RSS
ptm <- proc.time()
selgwr.cvg <- best_model(modelselect.adaptive$gaussian$CV, all_indepvar)
selgwr.cvg.top <- best_model_gwr(selgwr.cvg, "gaussian")

# Select model with best adjusted R2
selgwr.cvg.best <- best_model_adjr2(selgwr.cvg.top)

save(selgwr.cvg, selgwr.cvg.top, selgwr.cvg.best, file="data/cv_gaussian_best.RData")
proc.time() - ptm

3.1.3.3 Best Bisquare AIC model

# AIC, Bisquare
# Select model with lowest AIC, AICc, RSS
ptm <- proc.time()
selgwr.aicb <- best_model(modelselect.adaptive$bisquare$AICc, all_indepvar)
selgwr.aicb.top <- best_model_gwr(selgwr.aicb, "bisquare")

# Select model with best adjusted R2
selgwr.aicb.best <- best_model_adjr2(selgwr.aicb.top)
save(selgwr.aicb, selgwr.aicb.top, selgwr.aicb.best, file="data/aic_bisquare_best.RData")
proc.time() - ptm

3.1.3.4 Best Bisquare CV model

# CV, Bisquare
# Select model with lowest AIC, AICc, RSS
ptm <- proc.time()
selgwr.cvb <- best_model(modelselect.adaptive$bisquare$CV, all_indepvar)
selgwr.cvb.top <- best_model_gwr(selgwr.cvb, "bisquare")

# Select model with best adjusted R2
selgwr.cvb.best <- best_model_adjr2(selgwr.cvb.top)
save(selgwr.cvb, selgwr.cvb.top, selgwr.cvb.best, file="data/cv_bisquare_best.RData")
proc.time() - ptm

3.1.4 Model Selection

# Select best model out of the various model selections
results <- rbind(selgwr.aicg.best, 
                 selgwr.cvg.best, 
                 selgwr.aicb.best,
                 selgwr.cvb.best)
results[which.max(results$gwR2.adj),]
save(results, file="data/results_adaptive.RData")

We bind the results of the best models from the 4 permutations and find the model with the highest adjusted R2. This is the model using a Gaussian kernel and the CV approach, with the lowest AICc value.

3.1.5 Best model

results.best <- results[which.max(results$gwR2.adj),]
best <- gwr.basic(results.best$formula, data=listings_gwr1, kernel = "gaussian", bw=results.best$bw, adaptive = TRUE)
results.best$formula
save(results.best, best, file="data/GWRbestmodel.RData")

We obtain the formula for the model and get obtain the results for the best GWR regression model.

Appendix 3 - GWR Bandwidth and Model Selection

Geospatial Analysis of Airbnb in Singapore

Clara Chua

25/04/2020 (updated: 2021-06-09)