This document contains the R code to accompany the hard copy submission of the NCG602A regression assignment. All data and graphics presented in the hard copy of the assignment are reproduced here alongside the code used to generate them.
The following code is contained within Functions.R. These functions are required for various parts of the analysis.
############################################################
# Functions for Regression Analysis #
############################################################
############################################################
# Choropleth Maps using GISTools #
############################################################
require(GISTools)
# Standard choropleth map
plotMap <- function(Var, Title, LegendTitle) {
  shades <- auto.shading(Var, n = 7, cols = brewer.pal(7, "YlOrBr"))
  choropleth(Gedu.counties, Var, shading = shades)
  choro.legend(1059710, 3872640, shades, title = LegendTitle, under = "<", over = ">", between = "-",
               fmt = "%3.1f", cex = 0.8)
  title(Title)
}
# OLS standardised residuals (manual breaks, adjusted rounding)
plotResiduals <- function(Var, Title, LegendTitle) {
  shades <- shading(breaks = c(-2, -1, 0, 1, 2), cols = rev(brewer.pal(6, "Spectral")))
  choropleth(Gedu.counties, Var, shading = shades)
  choro.legend(1059710, 3872640, shades, title = LegendTitle, under = "<", over = ">",
               fmt = "%1.0f", cex = 0.8)
  title(Title)
}
# GWR intercept, PctFB
plotGWR <- function(Var, Title, LegendTitle) {
  shades <- auto.shading(Var, n = 11, cols = rev(brewer.pal(11, "Spectral")))
  choropleth(Gedu.gwr$SDF, Var, shading = shades)
  choro.legend(1059710, 3872640, shades, title = LegendTitle, under = "<", over = ">", between = "to",
               fmt = "%3.1f", cex = 0.8)
  title(Title)
}
# GWR PctPov (adjusted rounding)
plotGWR.1 <- function(Var, Title, LegendTitle) {
  shades <- auto.shading(Var, n = 11, cols = rev(brewer.pal(11, "Spectral")))
  choropleth(Gedu.gwr$SDF, Var, shading = shades)
  choro.legend(1059710, 3872640, shades, title = LegendTitle, under = "<", over = ">", between = "to",
               fmt = "%3.2f", cex = 0.8)
  title(Title)
}
# GWR PctBlack (adjusted rounding)
plotGWR.2 <- function(Var, Title, LegendTitle) {
  shades <- auto.shading(Var, n = 11, cols = rev(brewer.pal(11, "Spectral")))
  choropleth(Gedu.gwr$SDF, Var, shading = shades)
  choro.legend(1059710, 3872640, shades, title = LegendTitle, under = "<", over = ">", between = "to",
               fmt = "%3.3f", cex = 0.8)
  title(Title)
}
############################################################
# Scatterplots using ggplot2 #
############################################################
require(ggplot2)
require(gridExtra)
createPlot <- function(Data, Var1, Var2, x, y, xlab, ylab) {
  ggplot(Data, aes_string(x = x, y = y)) +
    geom_point() +
    geom_vline(xintercept = mean(Var1), lty = "dashed") +
    geom_hline(yintercept = mean(Var2), lty = "dashed") +
    geom_smooth(method = "lm", colour = "red", se = F) +
    labs(x = xlab, y = ylab) +
    theme_light()
}
############################################################
# Correlation Matrix Values and Styling #
############################################################
require(reshape2)
lower.values <- function(cor.matrix) {
  cor.matrix[upper.tri(cor.matrix)] <- NA
  return(cor.matrix)
}
upper.values <- function(cor.matrix) {
  cor.matrix[lower.tri(cor.matrix)] <- NA
  return(cor.matrix)
}
strip.elements <- theme(
  axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 12, colour = "black"),
  axis.text.y = element_text(size = 12, colour = "black"),
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.ticks = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_blank(),
  panel.background = element_blank(),
  legend.justification = c(0, 1),
  legend.position = c(0.1, 0.9),
  legend.direction = "horizontal"
)
############################################################
# Calculate Predicted R-squared #
############################################################
prs <- function(model) {
  lm.variance <- anova(model)
  tss <- sum(lm.variance$"Sum Sq")
  predicted.r.squared <- 1 - PRESS(model) / tss
  return(predicted.r.squared)
}
PRESS <- function(model) {
  pr <- residuals(model) / (1 - lm.influence(model)$hat)
  PRESS <- sum(pr^2)
  return(PRESS)
}
Data for Georgia are included in the GWmodel library. The Georgia data set provides a data frame (Gedu.df) containing socio-economic information for each county in the state, and GeorgiaCounties provides a spatial polygons data frame (Gedu.counties) containing the county geometries. The following code loads the required libraries and functions as well as the Georgia data. Each county is uniquely identified by its AreaKey/AREAKEY value. AKOrder matches the rows of the two data frames on this column so that they can be merged.
# Libraries
require(GWmodel)
require(car)
require(spdep)
require(MuMIn)
require(rgdal)
# Working directory
setwd("C:/Users/Sean/Desktop/Assignments/Regression Analysis/R")
# Functions (defined above, contained within Functions.R)
# source("Functions.R")
# Georgia data
data(Georgia)
data(GeorgiaCounties)
# Match and merge the data frames
AKOrder <- match(Gedu.counties$AREAKEY, Gedu.df$AreaKey)
Gedu.counties@data <- data.frame(Gedu.counties@data, Gedu.df[AKOrder, ])
It is generally helpful to look at a data set before any models are fitted or hypotheses formally tested. Exploratory Data Analysis (EDA) can be used to get a “feel” for the data and is helpful for determining if any relationships exist between variables.
We can begin with a simple five number summary of the variables we’re interested in.
# Five number summary
summary(Gedu.counties@data[, 14:18])
## PctBach PctEld PctFB PctPov
## Min. : 4.20 Min. : 1.46 Min. :0.040 Min. : 2.60
## 1st Qu.: 7.60 1st Qu.: 9.81 1st Qu.:0.415 1st Qu.:14.05
## Median : 9.40 Median :12.07 Median :0.720 Median :18.60
## Mean :10.95 Mean :11.74 Mean :1.131 Mean :19.34
## 3rd Qu.:12.00 3rd Qu.:13.70 3rd Qu.:1.265 3rd Qu.:24.65
## Max. :37.50 Max. :22.96 Max. :6.740 Max. :35.90
## PctBlack
## Min. : 0.00
## 1st Qu.:11.75
## Median :27.64
## Mean :27.39
## 3rd Qu.:40.06
## Max. :79.64
Whilst a five number summary is useful, a map is a much better exploratory tool when spatial data are being considered. Choropleth maps for each variable are created using GISTools and the plotMap function.
# PctBach
plotMap(Gedu.counties$PctBach, "% Educated to Degree Level (PctBach)", "Percent (%)")
# PctEld
plotMap(Gedu.counties$PctEld, "% Over the Age of 65 (PctEld)", "Percent (%)")
# PctFB
plotMap(Gedu.counties$PctFB, "% Born Outside of the United States (PctFB)", "Percent (%)")
# PctPov
plotMap(Gedu.counties$PctPov, "% Living Below the Poverty Line (PctPov)", "Percent (%)")
# PctBlack
plotMap(Gedu.counties$PctBlack, "% Ethnic Black (PctBlack)", "Percent (%)")
Scatterplots can also be used to analyse the relationship between educational attainment and socio-economic variables. These are produced using ggplot2 and the createPlot function.
# Scatterplots
sp1 <- createPlot(Gedu.df, Gedu.df$PctEld, Gedu.df$PctBach, "PctEld", "PctBach",
"% Over the Age of 65", "% Educated to Degree Level")
sp2 <- createPlot(Gedu.df, Gedu.df$PctFB, Gedu.df$PctBach, "PctFB", "PctBach",
"% Born Outside of the United States", " % Educated to Degree Level")
sp3 <- createPlot(Gedu.df, Gedu.df$PctPov, Gedu.df$PctBach, "PctPov", "PctBach",
"% Living Below the Poverty Line", "% Educated to Degree Level")
sp4 <- createPlot(Gedu.df, Gedu.df$PctBlack, Gedu.df$PctBach, "PctBlack", "PctBach",
"% Ethnic Black", "% Educated to Degree Level")
# Grid arrangement
grid.arrange(sp1, sp2, sp3, sp4, nrow = 2, ncol = 2)
Before a regression model is fitted to the data, it is worth checking for the presence of multicollinearity as this can lead to a loss of precision and power in the parameter estimates. A visual correlation matrix calculating Pearson’s r pairwise for each set of variables can be created using ggplot2.
# Standard correlation matrix
cor.matrix <- round(cor(Gedu.counties@data[, 14:18]), 2)
# Get relevant values
cor.melt <- melt(upper.values(cor.matrix), na.rm = T)
# Visual plot
vcm <- ggplot(data = cor.melt, aes(Var2, Var1, fill = value)) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Pearson Correlation") +
  guides(fill = guide_colourbar(barwidth = 7, barheight = 1, title.position = "top", title.hjust = 0.5)) +
  geom_tile(colour = "white") +
  geom_text(aes(Var2, Var1, label = value), colour = "black", size = 4) +
  coord_fixed() +
  theme_minimal() +
  strip.elements
# Output
vcm
Earlier analyses highlight a negative relationship between PctBach and PctBlack though the correlation here is quite weak. This correlation can be tested using cor.test.
# Correlation test
cor.test(Gedu.counties$PctBach, Gedu.counties$PctBlack)
##
## Pearson's product-moment correlation
##
## data: Gedu.counties$PctBach and Gedu.counties$PctBlack
## t = -1.3777, df = 157, p-value = 0.1703
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.26050822 0.04715686
## sample estimates:
## cor
## -0.1092925
There is insufficient evidence to reject the null hypothesis: the sample correlation is not significantly different from zero and may well have arisen by chance.
Returning to the issue of multicollinearity, a more reliable check involves calculating the Variance Inflation Factor (VIF) for each predictor. The VIF is the factor by which the variance of a regression coefficient is inflated relative to what it would be if the predictors were uncorrelated. As a rule of thumb, VIFs greater than 5 indicate multicollinearity and VIFs greater than 10 indicate severe multicollinearity. VIFs can be calculated using the vif function from the car package.
# Dummy model with all potential predictors
dummy <- lm(PctBach ~ PctEld + PctFB + PctPov + PctBlack, data = Gedu.counties)
# Variance Inflation Factors
vif(dummy)
## PctEld PctFB PctPov PctBlack
## 1.769013 1.334895 3.148163 2.328246
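To make the definition concrete, the VIF for a single predictor can be reproduced by hand as 1 / (1 - r-squared) from an auxiliary regression of that predictor on the remaining predictors. A minimal sketch for PctPov, which should agree with the value reported by vif above (the object name aux is purely illustrative):
# Manual VIF for PctPov: regress it on the other predictors and
# compute 1 / (1 - R^2) of that auxiliary regression
aux <- lm(PctPov ~ PctEld + PctFB + PctBlack, data = Gedu.counties)
1 / (1 - summary(aux)$r.squared)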
The square root of the VIF indicates how many times larger the standard error of a coefficient is than it would be in the absence of multicollinearity.
# Square root of VIFs
sqrt(vif(dummy))
## PctEld PctFB PctPov PctBlack
## 1.330042 1.155376 1.774306 1.525859
The VIFs indicate that multicollinearity is unlikely to be a problem.
Four models were fitted and evaluated on the basis of five criteria: the unadjusted r-squared, the adjusted r-squared, the predicted r-squared, the Akaike Information Criterion (AIC), and the Akaike weights.
# Fit four regression models
m1 <- lm(PctBach ~ PctEld, data = Gedu.counties)
m2 <- lm(PctBach ~ PctEld + PctFB, data = Gedu.counties)
m3 <- lm(PctBach ~ PctEld + PctFB + PctPov, data = Gedu.counties)
m4 <- lm(PctBach ~ PctEld + PctFB + PctPov + PctBlack, data = Gedu.counties)
The unadjusted r-squared measures the proportion of variance in the dependent variable that is explained by the predictor variables (i.e. the model). The unadjusted r-squared will always increase when new terms are added to a model, even by chance.
# Unadjusted r-squared
print(c(summary(m1)$r.squared, summary(m2)$r.squared, summary(m3)$r.squared, summary(m4)$r.squared))
## [1] 0.2102183 0.4749891 0.4926663 0.5162791
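As a sanity check on the definition, the unadjusted r-squared of m4 can be reproduced directly from the residual and total sums of squares (a minimal sketch; the objects rss and tss are created purely for illustration and should match the last value above):
# Unadjusted r-squared for m4 by hand: 1 - RSS / TSS
rss <- sum(residuals(m4)^2)
tss <- sum((Gedu.counties$PctBach - mean(Gedu.counties$PctBach))^2)
1 - rss / tss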
The adjusted r-squared will only increase if new terms improve the predictive power of the model more than would be expected by chance.
# Adjusted r-squared
print(c(summary(m1)$adj.r.squared, summary(m2)$adj.r.squared, summary(m3)$adj.r.squared,
summary(m4)$adj.r.squared))
## [1] 0.2051879 0.4682582 0.4828469 0.5037149
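The adjustment can likewise be reproduced by hand from the unadjusted r-squared, the number of observations n and the number of predictors p; the following sketch for m4 should match the last value above:
# Adjusted r-squared for m4 by hand: 1 - (1 - R^2) * (n - 1) / (n - p - 1)
n <- nrow(Gedu.counties@data)   # 159 counties
p <- 4                          # predictors in m4
1 - (1 - summary(m4)$r.squared) * (n - 1) / (n - p - 1)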
The predicted r-squared is a measure of the predictive power of a model. It is calculated using the predicted residual error sum of squares (PRESS) statistic and the total sum of squares (TSS) statistic. These calculations are performed in the prs function.
# Predicted r-squared
print(c(prs(m1), prs(m2), prs(m3), prs(m4)))
## [1] 0.1822080 0.4247562 0.4333449 0.4459278
The Akaike Information Criterion (AIC) considers the residual error of the model with respect to the number of observations and number of predictors. Lower AIC scores indicate better predictive ability. AIC scores can be calculated using the AIC function.
print(c(AIC(m1), AIC(m2), AIC(m3), AIC(m4)))
## [1] 971.9981 909.0725 905.6267 900.0487
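For a linear model, R derives the AIC from the log-likelihood as -2 * log-likelihood + 2k, where k counts the estimated parameters (the regression coefficients plus the residual variance). The following check for m4 should reproduce the last value above:
# AIC for m4 computed directly from the log-likelihood
-2 * as.numeric(logLik(m4)) + 2 * attr(logLik(m4), "df")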
AIC scores can be weighted to give the probability that model i is the best model at minimising information loss given the data set and the set of candidate models. Weighted AIC scores can be calculated using the Weights function in the MuMIn package.
# Weighted AIC
akaike.weights <- c(AIC(m1), AIC(m2), AIC(m3), AIC(m4))
Weights(akaike.weights)
## model weights
## [1] 0.000 0.010 0.057 0.932
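The weights follow directly from the AIC differences: each model receives exp(-delta / 2) normalised over the candidate set, where delta is the difference between its AIC and the smallest AIC. A minimal sketch that should reproduce the Weights output:
# Akaike weights by hand
delta <- akaike.weights - min(akaike.weights)
exp(-delta / 2) / sum(exp(-delta / 2))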
Based on the five criteria outlined above, the fourth model (m4) performs the best. We can view a summary of the model:
summary(m4)
##
## Call:
## lm(formula = PctBach ~ PctEld + PctFB + PctPov + PctBlack, data = Gedu.counties)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.1414 -2.0934 -0.2113 1.6709 19.9212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.67113 1.63682 7.741 1.22e-12 ***
## PctEld -0.10531 0.13784 -0.764 0.446047
## PctFB 2.54521 0.29839 8.530 1.29e-14 ***
## PctPov -0.28291 0.07810 -3.622 0.000396 ***
## PctBlack 0.07685 0.02803 2.742 0.006834 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.013 on 154 degrees of freedom
## Multiple R-squared: 0.5163, Adjusted R-squared: 0.5037
## F-statistic: 41.09 on 4 and 154 DF, p-value: < 2.2e-16
Based on the summary there is little evidence that PctEld contributes to explaining variation in PctBach; we cannot reject the null hypothesis that the difference between this parameter estimate and zero is due to chance.
Does removing PctEld improve the predictive power of the model?
We can fit a fifth model:
# Fit fifth model
m5 <- lm(PctBach ~ PctFB + PctPov + PctBlack, data = Gedu.counties)
And calculate the same performance criteria:
# Performance criteria
print(c(summary(m5)$r.squared, summary(m5)$adj.r.squared, prs(m5), AIC(m5)))
## [1] 0.5144457 0.5050479 0.4531972 898.6501310
# Recalculate Akaike Weights
akaike.weights <- c(AIC(m1), AIC(m2), AIC(m3), AIC(m4), AIC(m5))
Weights(akaike.weights)
## model weights
## [1] 0.000 0.004 0.020 0.324 0.652
The new model that excludes PctEld performs better on every criterion except the unadjusted r-squared, which necessarily falls when a term is removed. This model will be used for the regression analysis.
We can view a summary of the final model.
# Regression model
Gedu.ols <- lm(PctBach ~ PctFB + PctPov + PctBlack, data = Gedu.counties)
# Summary
summary(Gedu.ols)
##
## Call:
## lm(formula = PctBach ~ PctFB + PctPov + PctBlack, data = Gedu.counties)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.5233 -2.1449 -0.1925 1.7846 20.2514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.77059 1.13415 10.378 < 2e-16 ***
## PctFB 2.62551 0.27889 9.414 < 2e-16 ***
## PctPov -0.30964 0.06973 -4.440 1.7e-05 ***
## PctBlack 0.08015 0.02766 2.898 0.0043 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.008 on 155 degrees of freedom
## Multiple R-squared: 0.5144, Adjusted R-squared: 0.505
## F-statistic: 54.74 on 3 and 155 DF, p-value: < 2.2e-16
Based on this, educational attainment is predicted by the formula:
PctBach = 11.77 + 2.63 PctFB - 0.31 PctPov + 0.08 PctBlack
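As an illustration, the fitted model can be used to predict educational attainment for a hypothetical county; the predictor values below are invented purely for demonstration:
# Predicted % educated to degree level for a hypothetical county with
# 2% foreign born, 20% below the poverty line and 30% ethnic black
new.county <- data.frame(PctFB = 2, PctPov = 20, PctBlack = 30)
predict(Gedu.ols, newdata = new.county)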
The accuracy of the model can be determined by plotting the standardised residuals. The standardised residuals have a mean of 0 and a variance of 1. Values above 2 or below -2 indicate comparatively poor performance and can be regarded as interesting or unusual. Standardised residuals are obtained using the stdres function from the MASS package. The map is plotted using GISTools and the plotResiduals function.
# Add residuals and standardised residuals
require(MASS)
Gedu.counties$residuals <- residuals.lm(Gedu.ols)
Gedu.counties$stdres <- stdres(Gedu.ols)
# Plot residuals
plotResiduals(Gedu.counties$stdres, "OLS Standardised Residuals", "Residual")
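For reference, stdres returns the internally studentised residuals, each residual divided by the residual standard error scaled by its leverage. The following sketch should agree with the values stored above:
# Standardised residuals by hand: e_i / (s * sqrt(1 - h_ii))
e <- residuals(Gedu.ols)
h <- lm.influence(Gedu.ols)$hat
s <- summary(Gedu.ols)$sigma
head(e / (s * sqrt(1 - h)))
head(Gedu.counties$stdres)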
A worrying trend is that positive residuals tend to have positive residuals as neighbours whilst negative residuals tend to have negative residuals as neighbours. This may indicate the presence of spatial autocorrelation. A fundamental assumption of regression is that the residuals are independent and identically distributed (i.i.d.). Together with the normality assumption of the linear model, this implies that the residuals should show no spatial autocorrelation, should follow a normal distribution, and should have constant variance (homoscedasticity).
Each of these assumptions can be tested.
The most common test for spatial autocorrelation is Moran's I. The null hypothesis is that the spatial arrangement of the residuals is due to chance. Moran's I is calculated using lm.morantest after constructing a row-standardised spatial weights matrix with poly2nb and nb2listw.
# Spatial weights matrix
Gedu.nb <- poly2nb(Gedu.counties)
Gedu.listw <- nb2listw(Gedu.nb)
# Moran's I
lm.morantest(Gedu.ols, Gedu.listw)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = PctBach ~ PctFB + PctPov + PctBlack, data =
## Gedu.counties)
## weights: Gedu.listw
##
## Moran I statistic standard deviate = 2.1536, p-value = 0.01564
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.088562186 -0.015021024 0.002313443
This can also be represented visually using moran.plot.
# Moran plot
moran.plot(Gedu.counties$residuals, Gedu.listw, xlab = "Residuals", ylab = "Spatially Lagged Residuals")
The null hypothesis can be rejected: the residuals are more spatially clustered than would be expected by chance.
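As a cross-check, Moran's I can also be computed for the residuals treated as an ordinary variable using moran.test; because lm.morantest adjusts the expectation and variance for the residuals coming from a fitted model, the two tests will not agree exactly:
# Moran's I of the residuals, ignoring that they come from a fitted model
moran.test(Gedu.counties$residuals, Gedu.listw)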
Normality can be explored visually using a histogram and a quantile-quantile plot.
# Histogram
hist(Gedu.counties$residuals, freq = F, breaks = 12, main = "", xlab = "Residuals")
lines(density(Gedu.counties$residuals), col = "red")
lines(seq(-15, 20, .5), dnorm(seq(-15, 20, .5), mean(Gedu.counties$residuals), sd(Gedu.counties$residuals)), col = "blue")
legend("topright",
legend = c("Empirical distribution", "Normal distribution"),
col = c("red", "blue"), lty = 1, lwd = 1, cex = 0.7)
# Quantile-quantile plot
qqnorm(Gedu.counties$residuals, main = "", ylab = "Residuals")
qqline(Gedu.counties$residuals)
Normality can be tested formally using the Shapiro-Wilk test, provided by shapiro.test. The null hypothesis is that the data are normally distributed.
# Shapiro-Wilk test
shapiro.test(Gedu.counties$residuals)
##
## Shapiro-Wilk normality test
##
## data: Gedu.counties$residuals
## W = 0.89949, p-value = 5.788e-09
The null hypothesis that the residuals are normally distributed can be rejected.
Homoscedasticity can be tested using the Breusch-Pagan test for non-constant variance. The null hypothesis is that the residuals are homoscedastic. The Breusch-Pagan test is provided by the bptest function from the lmtest package.
# Breusch-Pagan test
lmtest::bptest(Gedu.ols)
##
## studentized Breusch-Pagan test
##
## data: Gedu.ols
## BP = 36.743, df = 3, p-value = 5.214e-08
The null hypothesis that the residuals are homoscedastic can be rejected.
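The car package, which is already loaded, provides a related score test of non-constant variance against the fitted values; it is included here only as an alternative check and its statistic will differ from the Breusch-Pagan result above:
# Alternative check: non-constant variance score test from car
ncvTest(Gedu.ols)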
The standard regression model fails all of the tests for independent and identically distributed residuals. This suggests that a purely aspatial approach to statistical inference is inadequate for these data. A possible way around this is to incorporate spatial structure into the model through Geographically Weighted Regression (GWR).
Standard regression assumes that the relationships being modelled are stationary over space. However, there is often good reason to doubt this assumption. The spatial autocorrelation in the residuals may arise because variation that should be captured by spatially varying regression parameters is instead displaced into the residuals. GWR accounts for this by allowing the parameter estimates to vary over space. With GWR, parameters can be estimated anywhere in the study region given a dependent variable and one or more independent variables measured at known locations.
GWR requires specification of the appropriate coordinate reference system (CRS) using proj4string and calibration of an optimal bandwidth using bw.gwr.
# Projection
proj4string(Gedu.counties) <- CRS("+init=epsg:32616")
# Optimal bandwidth using a bisquare adaptive kernel
Gedu.bw <- bw.gwr(PctBach ~ PctFB + PctPov + PctBlack, data = Gedu.counties, approach = "AIC",
kernel = "bisquare", adaptive = T)
## Adaptive bandwidth (number of nearest neighbours): 105 AICc value: 872.3882
## Adaptive bandwidth (number of nearest neighbours): 73 AICc value: 879.2741
## Adaptive bandwidth (number of nearest neighbours): 126 AICc value: 871.144
## Adaptive bandwidth (number of nearest neighbours): 138 AICc value: 871.6328
## Adaptive bandwidth (number of nearest neighbours): 117 AICc value: 871.064
## Adaptive bandwidth (number of nearest neighbours): 113 AICc value: 871.4861
## Adaptive bandwidth (number of nearest neighbours): 121 AICc value: 871.1724
## Adaptive bandwidth (number of nearest neighbours): 116 AICc value: 871.218
## Adaptive bandwidth (number of nearest neighbours): 119 AICc value: 871.2819
## Adaptive bandwidth (number of nearest neighbours): 117 AICc value: 871.064
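To make explicit what the calibration does, the local coefficients at a single county can be sketched as a weighted least squares fit, using bisquare weights derived from the distances to all counties and the adaptive bandwidth found above. This is only an illustrative approximation of what gwr.basic computes internally, so the values are indicative rather than exact:
# Illustrative local fit at the first county's centroid (not the GWmodel internals)
coords <- coordinates(Gedu.counties)
d <- sqrt(rowSums(sweep(coords, 2, coords[1, ])^2))   # distances to every county
b <- sort(d)[Gedu.bw]                                 # adaptive bandwidth expressed as a distance
w <- ifelse(d < b, (1 - (d / b)^2)^2, 0)              # bisquare kernel weights
local.fit <- lm(PctBach ~ PctFB + PctPov + PctBlack, data = Gedu.counties@data, weights = w)
coef(local.fit)                                       # local coefficient estimates at this point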
We can now fit the model and view a summary of it.
# Fit the GWR model
Gedu.gwr <- gwr.basic(PctBach ~ PctFB + PctPov + PctBlack, data = Gedu.counties, bw = Gedu.bw,
kernel = "bisquare", adaptive = T)
# Summary
print(Gedu.gwr)
## ***********************************************************************
## * Package GWmodel *
## ***********************************************************************
## Program starts at: 2018-01-28 00:46:23
## Call:
## gwr.basic(formula = PctBach ~ PctFB + PctPov + PctBlack, data = Gedu.counties,
## bw = Gedu.bw, kernel = "bisquare", adaptive = T)
##
## Dependent (y) variable: PctBach
## Independent variables: PctFB PctPov PctBlack
## Number of data points: 159
## ***********************************************************************
## * Results of Global Regression *
## ***********************************************************************
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.5233 -2.1449 -0.1925 1.7846 20.2514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.77059 1.13415 10.378 < 2e-16 ***
## PctFB 2.62551 0.27889 9.414 < 2e-16 ***
## PctPov -0.30964 0.06973 -4.440 1.7e-05 ***
## PctBlack 0.08015 0.02766 2.898 0.0043 **
##
## ---Significance stars
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Residual standard error: 4.008 on 155 degrees of freedom
## Multiple R-squared: 0.5144
## Adjusted R-squared: 0.505
## F-statistic: 54.74 on 3 and 155 DF, p-value: < 2.2e-16
## ***Extra Diagnostic information
## Residual sum of squares: 2489.959
## Sigma(hat): 3.982413
## AIC: 898.6501
## AICc: 899.0423
## ***********************************************************************
## * Results of Geographically Weighted Regression *
## ***********************************************************************
##
## *********************Model calibration information*********************
## Kernel function: bisquare
## Adaptive bandwidth: 117 (number of nearest neighbours)
## Regression points: the same locations as observations are used.
## Distance metric: Euclidean distance metric is used.
##
## ****************Summary of GWR coefficient estimates:******************
## Min. 1st Qu. Median 3rd Qu. Max.
## Intercept 8.883446 9.428867 10.581766 11.758524 12.8296
## PctFB 0.919668 1.519431 3.145136 3.850875 4.1379
## PctPov -0.383448 -0.290742 -0.200941 -0.144741 -0.0670
## PctBlack -0.031607 0.013804 0.042514 0.094850 0.1383
## ************************Diagnostic information*************************
## Number of data points: 159
## Effective number of parameters (2trace(S) - trace(S'S)): 14.29423
## Effective degrees of freedom (n-2trace(S) + trace(S'S)): 144.7058
## AICc (GWR book, Fotheringham, et al. 2002, p. 61, eq 2.33): 871.064
## AIC (GWR book, Fotheringham, et al. 2002,GWR p. 96, eq. 4.22): 855.8672
## Residual sum of squares: 1890.092
## R-square value: 0.6314228
## Adjusted R-square value: 0.5947609
##
## ***********************************************************************
## Program stops at: 2018-01-28 00:46:24
Judging by the AICc (871.1 against 899.0) and the adjusted r-squared (0.59 against 0.51), the GW model is a much better fit to the data than the global regression model.
Maps of the GW parameter estimates can be created to determine how well the model captures the spatial variation in the data. These are plotted using GISTools and the plotGWR functions.
# GW intercept
plotGWR(Gedu.gwr$SDF$Intercept, "GW Intercept", "Intercept")
# GW PctFB
plotGWR(Gedu.gwr$SDF$PctFB, "GW PctFB Parameter Estimates", "Estimate")
# GW PctPov
plotGWR.1(Gedu.gwr$SDF$PctPov, "GW PctPov Parameter Estimates", "Estimate")
# GW PctBlack
plotGWR.2(Gedu.gwr$SDF$PctBlack, "GW PctBlack Parameter Estimates", "Estimate")
The model appears to capture the spatial variability quite well. A Monte Carlo test for significant spatial variability in the parameters can be run with montecarlo.gwr using a distance matrix created by gw.dist. The random nature of the test means the results presented here will not be identical to those presented in the hard copy of the assignment.
# Monte Carlo test for significant spatial variability.
DM <- gw.dist(dp.locat=coordinates(Gedu.counties))
mc <- montecarlo.gwr(PctBach ~ PctFB + PctPov + PctBlack, data = Gedu.counties, bw = Gedu.bw,
kernel = "bisquare", adaptive = T, dMat = DM, nsim = 1000)
##
## Tests based on the Monte Carlo significance test
##
## p-value
## (Intercept) 0.7882
## PctFB 0.0090
## PctPov 0.6144
## PctBlack 0.0829
More information on GWR is provided by Brunsdon et al. (1996), Fotheringham et al. (1998), and Fotheringham et al. (2002).
There are significant issues associated with spatial data that limit the applicability of standard regression models. A non-spatial model is often a poor fit for spatial data because it ignores spatial dependence and non-stationarity; models that account for spatial structure tend to perform far better, though this is hardly surprising. Geographically Weighted Regression is a particularly useful tool in this regard.
Brunsdon, C., Fotheringham, A. S. and Charlton, M. E. (1996) Geographically Weighted Regression: A Method for Exploring Spatial Nonstationarity. Geographical Analysis, 28(4), 281-298.
Fotheringham, A. S., Charlton, M. E. and Brunsdon, C. (1998) Geographically weighted regression: a natural evolution of the expansion method for spatial data analysis. Environment and Planning A, 30, 1905-1927.
Fotheringham, A. S., Brunsdon, C. and Charlton, M. E. (2002) Geographically Weighted Regression: The Analysis of Spatially Varying Relationships. Chichester: John Wiley & Sons.