Libraries

Load the required libraries, then read in the datasets.

library(tidyverse)
library(tmap)
library(geojsonio)
library(plotly)
library(rgdal)
library(broom)
library(mapview)
library(sf)
library(sp)
library(car)
library(spdep)
library(spatstat)
library(fs)
library(janitor)
library(rgeos)
library(tmaptools)
library(spgwr)
library(RColorBrewer)
library(GWmodel)
library(gstat)
library(raster)

#load datasets
nycmap <- st_read(here::here("CASA005_practice",
                             "nyc", 
                             "nynta.shp"))%>%
  st_transform(., 32017)
## Reading layer `nynta' from data source `C:\Users\18811\Documents\CASA005_practice\nyc\nynta.shp' using driver `ESRI Shapefile'
## Simple feature collection with 195 features and 7 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 913175.1 ymin: 120121.9 xmax: 1067383 ymax: 272844.3
## projected CRS:  NAD83 / New York Long Island (ftUS)
#2016_School_Explorer
shsat <- read_csv("CASA005_practice/nyc/SHSATrefine.csv")%>%
  st_as_sf(., coords = c("Longitude", "Latitude"), 
           crs = 4326) %>%
  st_transform(., 32017)
names(shsat)
##  [1] "SchoolName"                            
##  [2] "Location_Code"                         
##  [3] "District"                              
##  [4] "Zip"                                   
##  [5] "Economic_Need_Index"                   
##  [6] "School_Income_Estimate"                
##  [7] "Minorities"                            
##  [8] "Black_or_Hispanic"                     
##  [9] "Percent Hispanic"                      
## [10] "White"                                 
## [11] "Student_Attendance_Rate"               
## [12] "Percent_of_Students_Chronically_Absent"
## [13] "Rigorous_Instruction"                  
## [14] "Collaborative_Teachers"                
## [15] "Supportive_Environment"                
## [16] "Effective_School_Leadership"           
## [17] "Strong_Family-Community_Ties"          
## [18] "Trust"                                 
## [19] "Average_ELA_Proficiency"               
## [20] "Average_Math_Proficiency"              
## [21] "ELAPassRate"                           
## [22] "ELApass"                               
## [23] "ELAWhiteRate"                          
## [24] "MathTesters"                           
## [25] "Mathpass"                              
## [26] "Mathpassrate"                          
## [27] "MathWhitePass"                         
## [28] "ELA_limited_pass"                      
## [29] "ELA_eco_pass"                          
## [30] "Math_limited_pass"                     
## [31] "Math_eco_pass"                         
## [32] "geometry"
safety <- readr::read_csv("CASA005_practice/nyc/safety2016.csv") %>%
  st_as_sf(., coords = c("Longitude", "Latitude"), 
           crs = 4326) %>%
  st_transform(., 32017)
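
A quick sanity check (a sketch) that all three layers now share the same projected CRS before any joins:

#both comparisons should return TRUE
st_crs(nycmap) == st_crs(shsat)
st_crs(nycmap) == st_crs(safety)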

EDA

Check for missing values (NAs) and fill them with the column means, standardise the income estimate, and create an overall proficiency column.

#fill each NA with the column mean
fill_cols <- c("Average_ELA_Proficiency", "Average_Math_Proficiency",
               "Economic_Need_Index", "School_Income_Estimate",
               "Black_or_Hispanic", "Supportive_Environment",
               "Student_Attendance_Rate")

shsat <- shsat %>%
  mutate(across(all_of(fill_cols),
                ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)))

# standardise the income (note: scale() returns a one-column matrix)
shsat$School_Income_Estimate <- scale(shsat$School_Income_Estimate)

shsat <- shsat %>%
  mutate(Total_Proficiency = (Average_ELA_Proficiency + Average_Math_Proficiency) / 2)

Plot histograms to check the data distributions.

ggplot(data=shsat, aes(x=Student_Attendance_Rate)) + 
  geom_histogram(
                 fill="white",
                 col= 'black',
                 na.rm = TRUE) + 
  labs(title="Histogram for Student Attendance Rate", x="Student_Attendance_Rate", y="Count")

ggplot(data=shsat, aes(x=Economic_Need_Index)) + 
  geom_histogram(
                 fill="white",
                 col= 'black',
                 na.rm = TRUE) + 
  labs(title="Histogram for Economic Need Index", x="Economic_Need_Index", y="Count")

ggplot(data=shsat, aes(x=Average_ELA_Proficiency)) + 
  geom_histogram(
    fill="white",
    col= 'black',
    na.rm = TRUE) + 
  labs(title="Histogram for Average_ELA_Proficiency", x="Average_ELA_Proficiency", y="Count")

ggplot(data=shsat, aes(x=Average_Math_Proficiency)) + 
  geom_histogram(
    fill="white",
    col= 'black',
    na.rm = TRUE) + 
  labs(title="Histogram for Average Math Proficiency", x="Average_Math_Proficiency", y="Count")

ggplot(data=shsat, aes(x=School_Income_Estimate)) + 
  geom_histogram(
    fill="white",
    col= 'black',
    na.rm = TRUE) + 
  labs(title="Histogram for School Income", x="School_Income_Estimate", y="Count")

#handling outliers: cap values beyond 1.5*IQR at the 5th/95th percentiles
x <- shsat$School_Income_Estimate
qnt <- quantile(x, probs=c(.25, .75), na.rm = TRUE)
caps <- quantile(x, probs=c(.05, .95), na.rm = TRUE)
H <- 1.5 * IQR(x, na.rm = TRUE)
x[x < (qnt[1] - H)] <- caps[1]
x[x > (qnt[2] + H)] <- caps[2]
#note: assign x back (shsat$School_Income_Estimate <- x) if the caps should take effect

Check the school distribution across the city

window <- as.owin(nycmap)
plot(window)

school <- distinct(shsat)

#use the de-duplicated points for the point pattern
school0 <- school %>%
  as(., 'Spatial')

school0.ppp <- ppp(x=school0@coords[,1],
                   y=school0@coords[,2],
                   window=window)
## Warning: data contain duplicated points
school0.ppp %>%
  density(., sigma=3500) %>%
  plot()

Spatial join and neighbourhood summaries

Count the schools in each neighbourhood (NTA code) with a spatial join, then summarise the neighbourhood-level information as variables for fitting the models.

points_sf_joined <- nycmap%>%
  st_join(shsat)%>%
  add_count(NTACode)%>%
  janitor::clean_names()%>%
  #calculate area
  mutate(area=st_area(.))%>%
  #then density of schools per neighbourhood
  mutate(density=n/area)

#fill remaining NAs with the column mean
fill_cols2 <- c("total_proficiency", "black_or_hispanic", "economic_need_index",
                "supportive_environment", "student_attendance_rate",
                "school_income_estimate")

points_sf_joined <- points_sf_joined %>%
  mutate(across(all_of(fill_cols2),
                ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)))

points_sf_joined0<- points_sf_joined %>%                    
  group_by(nta_code, nta_name) %>%         
  summarise(ela = median(average_ela_proficiency),
            mat = median(average_math_proficiency),
            economic_need_index = median(economic_need_index),
            black_or_hispanic = median(black_or_hispanic),
            total_proficiency = median(total_proficiency),
            supportive_environment = median(supportive_environment),
            student_attendance_rate = median(student_attendance_rate),
            school_income_estimate = median(school_income_estimate),
            school_count= first(n),
            density = first(density))

Summarise the safety dataset in the same way, then merge the two datasets.

safety_sf_joined <- nycmap%>%
  st_join(safety)%>%
  add_count(NTACode)%>%
  janitor::clean_names()%>%
  #calculate area
  mutate(area=st_area(.))%>%
  #then density of the points per neighbourhood
  mutate(density=n/area)

safety_sf_joined0<- safety_sf_joined %>%                    
  group_by(nta_code) %>%         
  summarise(
    census= first(census_tract),
    nocrime= mean(avg_of_no_crim))

safety_sf_joined0$nocrime[is.na(safety_sf_joined0$nocrime)] <- median(
  safety_sf_joined0$nocrime, na.rm=TRUE)

safe <- st_drop_geometry(safety_sf_joined0)

mods <- merge(x=points_sf_joined0, y=safe)
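
A quick check (a sketch) that the merge kept one row per neighbourhood:

nrow(mods)          #expect 195 neighbourhoods
anyNA(mods$nocrime) #expect FALSE after the median fill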

Plot the basic distributions of the variables.

You could make similar plots for all of the variables.

tmap_mode("plot")
Colours<- rev(brewer.pal(8, "RdYlBu"))
tm1 <- tm_shape(mods) + 
  tm_polygons("total_proficiency",
              border.alpha=0.05,
              style="jenks", 
              palette=Colours) + 
  tm_legend(show=TRUE)+
  tm_layout(frame=FALSE)+
  tm_credits("(a)", position=c(0.1,0.1), size=0.8)

tm2 <- tm_shape(mods)+ 
  tm_polygons('black_or_hispanic',
              style="jenks",
              palette="GnBu")+
  tm_legend(show=TRUE)+
  tm_layout(frame=FALSE)+
  tm_credits("(b)", position=c(0.1,0.1), size=0.8)

tm3 <- tm_shape(mods)+ 
  tm_polygons('economic_need_index',
              style="fisher",
              palette="GnBu")+
  tm_legend(show=TRUE)+
  tm_layout(frame=FALSE)+
  tm_credits("(c)", position=c(0.1,0.1), size=0.8)


Counting <- tm_shape(mods) +
  tm_polygons("nocrime",
              style="quantile",
              palette="GnBu") +
  tm_scale_bar(position=c(0.65,0.04), text.size=0.6)+
  tm_compass(north=0, position=c(0.77,0.14))+
  tm_layout(frame=FALSE)+
  tm_credits("(d)", position=c(0.1,0.1), size = 0.8)

t <- tmap_arrange(tm1, tm2, tm3, Counting, ncol=2)

t
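
To map every variable without repeating the block above, a small helper can be looped over the column names. A minimal sketch (plot_var and the vars vector are illustrative, not part of the original analysis):

plot_var <- function(data, var) {
  #one jenks choropleth per variable name
  tm_shape(data) +
    tm_polygons(var, style = "jenks", palette = "GnBu") +
    tm_layout(frame = FALSE)
}

vars <- c("total_proficiency", "black_or_hispanic",
          "economic_need_index", "nocrime")
maps <- lapply(vars, function(v) plot_var(mods, v))
tmap_arrange(maps, ncol = 2)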

If you fit a linear model, you need to check the model fit, as below:

model_final <- lm(total_proficiency ~ supportive_environment + school_income_estimate +
                    economic_need_index + student_attendance_rate + black_or_hispanic + nocrime, 
                  data = mods)
tidy(model_final)
## # A tibble: 7 x 5
##   term                    estimate std.error statistic  p.value
##   <chr>                      <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)              -2.48     0.955       -2.60 1.01e- 2
## 2 supportive_environment    0.454    0.368        1.23 2.19e- 1
## 3 school_income_estimate    0.0145   0.0134       1.08 2.79e- 1
## 4 economic_need_index      -0.308    0.0892      -3.46 6.77e- 4
## 5 student_attendance_rate   5.72     0.974        5.88 1.89e- 8
## 6 black_or_hispanic        -0.597    0.0642      -9.29 3.82e-17
## 7 nocrime                  -0.0117   0.00848     -1.38 1.70e- 1
glance(model_final)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.829         0.824 0.143      152. 2.06e-69     6   105. -195. -169.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Map the model residuals in space.

We create a vector of the residuals and then add it as a column to the spatial dataset.

model_res <- residuals(model_final)

map.resids <- cbind(mods, model_res) 

tm_shape(map.resids) +
  tm_polygons("model_res",
              style="fisher",
              palette='RdYlBu',
              midpoint=NA,
              title="model residuals") +
  tm_scale_bar(position = c("right", "bottom"), width = 0.15) +
  tm_compass(north=0, position=c(0.9,0.9))

Check further diagnostics: the residual sum of squares, the standard diagnostic plots, the residual distribution, and multicollinearity (VIF values well below 10 suggest collinearity is not a concern):

sum(resid(model_final)^2)
## [1] 3.870442
par(mfrow=c(2,2))
plot(model_final)

ggplot(data.frame(model_res), aes(x = model_res)) + 
  geom_histogram()

#VIF
vif(model_final)
##  supportive_environment  school_income_estimate     economic_need_index 
##                1.668277                1.198979                2.620484 
## student_attendance_rate       black_or_hispanic                 nocrime 
##                2.358333                3.447366                1.136783

Spatial autocorrelation

Build a binary matrix of queen’s-case neighbours (contiguity including edges and corners) and run the global Moran’s I test.

Moran’s I tells us whether values are clustered (close to 1) or dispersed (close to -1); values near 0 suggest spatial randomness.
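
For reference, the global statistic has the standard form, with spatial weights $w_{ij}$ over $n$ areas:

$$I = \frac{n}{\sum_i \sum_j w_{ij}} \cdot \frac{\sum_i \sum_j w_{ij}\,(x_i - \bar{x})(x_j - \bar{x})}{\sum_i (x_i - \bar{x})^2}$$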

coordsW <- mods%>%
  st_centroid()%>%
  st_geometry()
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
plot(coordsW,axes=TRUE)

#create a neighbours list

NYC_nb <- mods %>%
  poly2nb(., queen=T)

#plot them
plot(NYC_nb, st_geometry(coordsW), col="red")
#add a map underneath
plot(points_sf_joined0$geometry, add=T)

#create a spatial weights object from these weights
NYC.lw <- NYC_nb %>%
  nb2listw(., style="C")

I_NYC_Global_Density <- mods %>%
  pull(density) %>%
  as.vector()%>%
  moran.test(., NYC.lw)

I_NYC_Global_Density
## 
##  Moran I test under randomisation
## 
## data:  .  
## weights: NYC.lw    
## 
## Moran I statistic standard deviate = 10.99, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.448669159      -0.005154639       0.001705357
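
The observed Moran’s I of about 0.45 (expectation ≈ -0.005, p < 0.001) indicates moderate positive spatial autocorrelation: neighbourhoods with similar school densities tend to sit near one another.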

LISA Cluster

Then we calculate local versions of the Moran’s I statistic; LISA decomposes the global Moran’s I into a contribution from each area.
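
The local statistic for area $i$ takes the standard form:

$$I_i = \frac{x_i - \bar{x}}{m_2} \sum_j w_{ij}\,(x_j - \bar{x}), \qquad m_2 = \frac{1}{n}\sum_i (x_i - \bar{x})^2$$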

locm <- localmoran(mods$total_proficiency, NYC.lw)
summary(locm)
##        Ii                E.Ii                Var.Ii            Z.Ii         
##  Min.   :-0.88029   Min.   :-0.0343088   Min.   :0.0321   Min.   :-1.87246  
##  1st Qu.: 0.01702   1st Qu.:-0.0055636   1st Qu.:0.1264   1st Qu.: 0.04838  
##  Median : 0.32694   Median :-0.0046363   Median :0.1572   Median : 0.85788  
##  Mean   : 0.44852   Mean   :-0.0051546   Mean   :0.1722   Mean   : 1.14202  
##  3rd Qu.: 0.70309   3rd Qu.:-0.0037091   3rd Qu.:0.1876   3rd Qu.: 1.70442  
##  Max.   : 2.91426   Max.   :-0.0009273   Max.   :0.9679   Max.   : 7.36194  
##    Pr(z > 0)      
##  Min.   :0.00000  
##  1st Qu.:0.04419  
##  Median :0.19548  
##  Mean   :0.26817  
##  3rd Qu.:0.48071  
##  Max.   :0.96943
# manually build a Moran scatter plot: first standardise the variable
mods$stotal_proficiency <- scale(mods$total_proficiency)  #save to a new column

# create a lagged variable
mods$lag_stotal_proficiency <- lag.listw(NYC.lw, mods$stotal_proficiency)

summary(mods$stotal_proficiency)
##        V1         
##  Min.   :-1.4431  
##  1st Qu.:-0.8801  
##  Median :-0.1760  
##  Mean   : 0.0000  
##  3rd Qu.: 0.6699  
##  Max.   : 2.7391
summary(mods$lag_stotal_proficiency)
##        V1          
##  Min.   :-2.82847  
##  1st Qu.:-0.47904  
##  Median :-0.07134  
##  Mean   :-0.02514  
##  3rd Qu.: 0.46553  
##  Max.   : 2.92758
plot(x = mods$stotal_proficiency, y = mods$lag_stotal_proficiency, main = "Moran Scatter Plot")
abline(h = 0, v = 0)
abline(lm(mods$lag_stotal_proficiency ~ mods$stotal_proficiency), lty = 4, lwd = 4, col = "red")

Lisa0 <- st_as_sf(mods) %>% 
  mutate(quad_sig = ifelse(mods$stotal_proficiency > 0 & 
                             mods$lag_stotal_proficiency > 0 & 
                             locm[,5] <= 0.05, 
                           "high-high",
                           ifelse(mods$stotal_proficiency <= 0 & 
                                    mods$lag_stotal_proficiency <= 0 & 
                                    locm[,5] <= 0.05, 
                                  "low-low", 
                                  ifelse(mods$stotal_proficiency > 0 & 
                                           mods$lag_stotal_proficiency <= 0 & 
                                           locm[,5] <= 0.05, 
                                         "high-low",
                                         ifelse(mods$stotal_proficiency <= 0 & 
                                                  mods$lag_stotal_proficiency > 0 & 
                                                  locm[,5] <= 0.05,
                                                "low-high", 
                                                "non-significant")))))
tmap_mode("plot")
colors <- c("lightpink", "skyblue2", "lightgrey")  #one colour per category present; add more if all four quadrant types occur
tm_shape(Lisa0) +
  tm_polygons(col = "quad_sig", 
              palette = colors, 
              title = 'LISA Cluster') +
  tm_scale_bar(position=c(0.6,0.05), text.size=0.6)+
  tm_compass(north=0, position=c(0.90,0.90))

Geographically weighted regression

Calculate the kernel bandwidth and then run the GWR model.
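
GWR fits a separate regression at each location, so the coefficients become functions of the coordinates $(u_i, v_i)$ of area $i$:

$$y_i = \beta_0(u_i, v_i) + \sum_k \beta_k(u_i, v_i)\,x_{ik} + \varepsilon_i$$

The kernel bandwidth controls how quickly the weight given to neighbouring observations decays with distance.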

st_crs(coordsW) = 32017
#convert to sp objects, since spgwr expects Spatial* data
coordsWSP <- coordsW %>%
  as(., "Spatial")

mapSP <- map.resids %>%
  as(., "Spatial")

GWRbandwidth <- gwr.sel(total_proficiency ~ supportive_environment + school_income_estimate +
                          economic_need_index + student_attendance_rate + black_or_hispanic + nocrime, 
                        data = mapSP, 
                        coords = coordsWSP,
                        adapt = T)
## Warning in gwr.sel(total_proficiency ~ supportive_environment +
## school_income_estimate + : data is Spatial* object, ignoring coords argument
## Adaptive q: 0.381966 CV score: 4.053566 
## Adaptive q: 0.618034 CV score: 4.149797 
## Adaptive q: 0.236068 CV score: 3.947978 
## Adaptive q: 0.145898 CV score: 3.897545 
## Adaptive q: 0.09016994 CV score: 3.782142 
## Adaptive q: 0.05572809 CV score: 3.817536 
## Adaptive q: 0.08790196 CV score: 3.787306 
## Adaptive q: 0.1114562 CV score: 3.861069 
## Adaptive q: 0.09830056 CV score: 3.829602 
## Adaptive q: 0.09327556 CV score: 3.788925 
## Adaptive q: 0.09040731 CV score: 3.781696 
## Adaptive q: 0.09150288 CV score: 3.779862 
## Adaptive q: 0.09217999 CV score: 3.778906 
## Adaptive q: 0.09259846 CV score: 3.781819 
## Adaptive q: 0.09192136 CV score: 3.779256 
## Adaptive q: 0.09233983 CV score: 3.779082 
## Adaptive q: 0.0921393 CV score: 3.77896 
## Adaptive q: 0.09222068 CV score: 3.778853 
## Adaptive q: 0.09226619 CV score: 3.778794 
## Adaptive q: 0.09226619 CV score: 3.778794
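
The selected adaptive bandwidth is about 0.092: each local regression is weighted towards roughly the nearest 9% of observations (about 17 of the 195 neighbourhoods, as reported below).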
#run the gwr model
gwr.model = gwr(total_proficiency ~ supportive_environment + school_income_estimate +
                  economic_need_index + student_attendance_rate + black_or_hispanic + nocrime, 
                data = mapSP, 
                coords=coordsWSP, 
                adapt=GWRbandwidth, 
                hatmatrix=TRUE, 
                se.fit=TRUE)
## Warning in gwr(total_proficiency ~ supportive_environment +
## school_income_estimate + : data is Spatial* object, ignoring coords argument
## Warning in proj4string(data): CRS object has comment, which is lost in output
#print the results of the model
gwr.model
## Call:
## gwr(formula = total_proficiency ~ supportive_environment + school_income_estimate + 
##     economic_need_index + student_attendance_rate + black_or_hispanic + 
##     nocrime, data = mapSP, coords = coordsWSP, adapt = GWRbandwidth, 
##     hatmatrix = TRUE, se.fit = TRUE)
## Kernel function: gwr.Gauss 
## Adaptive quantile: 0.09226619 (about 17 of 195 data points)
## Summary of GWR coefficient estimates at data points:
##                               Min.    1st Qu.     Median    3rd Qu.       Max.
## X.Intercept.            -4.8322445 -2.1131241 -1.2060899 -0.4698210  0.6627262
## supportive_environment  -0.6650413  0.2691013  0.6592462  1.0408853  1.7314576
## school_income_estimate  -0.0662997  0.0011687  0.0227317  0.0418063  0.0597035
## economic_need_index     -0.7527037 -0.5651436 -0.4667472 -0.3857994 -0.2274892
## student_attendance_rate  2.6448136  3.7644875  4.2549328  5.0073408  7.6956999
## black_or_hispanic       -0.8361899 -0.6640008 -0.6082671 -0.5572373 -0.3977022
## nocrime                 -0.0313645 -0.0209492 -0.0147250 -0.0088831  0.0078490
##                          Global
## X.Intercept.            -2.4811
## supportive_environment   0.4542
## school_income_estimate   0.0145
## economic_need_index     -0.3083
## student_attendance_rate  5.7233
## black_or_hispanic       -0.5967
## nocrime                 -0.0117
## Number of data points: 195 
## Effective number of parameters (residual: 2traceS - traceS'S): 48.99332 
## Effective degrees of freedom (residual: 2traceS - traceS'S): 146.0067 
## Sigma (residual: 2traceS - traceS'S): 0.1233924 
## Effective number of parameters (model: traceS): 36.55145 
## Effective degrees of freedom (model: traceS): 158.4486 
## Sigma (model: traceS): 0.1184488 
## Sigma (ML): 0.106772 
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): -225.4576 
## AIC (GWR p. 96, eq. 4.22): -282.5156 
## Residual sum of squares: 2.223052 
## Quasi-global R2: 0.9019905
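
The quasi-global R2 of about 0.90 improves on the global model’s 0.83, suggesting that letting the coefficients vary over space captures structure the global model misses.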

Map results

You could map the variables whose coefficients change significantly across space, or map the local R2 and the standard errors to aid interpretation.

results <- as.data.frame(gwr.model$SDF)
names(results)
##  [1] "sum.w"                          "X.Intercept."                  
##  [3] "supportive_environment"         "school_income_estimate"        
##  [5] "economic_need_index"            "student_attendance_rate"       
##  [7] "black_or_hispanic"              "nocrime"                       
##  [9] "X.Intercept._se"                "supportive_environment_se"     
## [11] "school_income_estimate_se"      "economic_need_index_se"        
## [13] "student_attendance_rate_se"     "black_or_hispanic_se"          
## [15] "nocrime_se"                     "gwr.e"                         
## [17] "pred"                           "pred.se"                       
## [19] "localR2"                        "X.Intercept._se_EDF"           
## [21] "supportive_environment_se_EDF"  "school_income_estimate_se_EDF" 
## [23] "economic_need_index_se_EDF"     "student_attendance_rate_se_EDF"
## [25] "black_or_hispanic_se_EDF"       "nocrime_se_EDF"                
## [27] "pred.se.1"
#run the significance test: the coefficient is roughly significant (95% level) where |estimate| > 2*SE, i.e. sigTest > 0
sigTest <- abs(gwr.model$SDF$"black_or_hispanic") - 2 * gwr.model$SDF$"black_or_hispanic_se"


#store significance results
mods <- mods %>%
  mutate(GWRminorSig = sigTest)

modsfinal <- mods %>%
  mutate(coefEco = results$economic_need_index,
         coefMinor = results$black_or_hispanic,
         coefAte = results$student_attendance_rate,
         coefCrime = results$nocrime,
         localR2 = results$localR2)
summary(mods)
##    nta_code           nta_name              ela             mat       
##  Length:195         Length:195         Min.   :2.210   Min.   :2.220  
##  Class :character   Class :character   1st Qu.:2.370   1st Qu.:2.435  
##  Mode  :character   Mode  :character   Median :2.567   Median :2.740  
##                                        Mean   :2.628   Mean   :2.782  
##                                        3rd Qu.:2.817   3rd Qu.:3.095  
##                                        Max.   :3.700   Max.   :3.830  
##                                        NA's   :8       NA's   :8      
##  economic_need_index black_or_hispanic total_proficiency supportive_environment
##  Min.   :0.1790      Min.   :0.0900    Min.   :2.210     Min.   :0.8200        
##  1st Qu.:0.5055      1st Qu.:0.3475    1st Qu.:2.402     1st Qu.:0.8750        
##  Median :0.6730      Median :0.7314    Median :2.643     Median :0.8900        
##  Mean   :0.6320      Mean   :0.6432    Mean   :2.703     Mean   :0.8982        
##  3rd Qu.:0.7790      3rd Qu.:0.9400    3rd Qu.:2.933     3rd Qu.:0.9200        
##  Max.   :0.9115      Max.   :0.9800    Max.   :3.640     Max.   :0.9800        
##                                                                                
##  student_attendance_rate school_income_estimate  school_count   
##  Min.   :0.9000          Min.   :-1.4933        Min.   : 1.000  
##  1st Qu.:0.9300          1st Qu.: 0.0000        1st Qu.: 3.000  
##  Median :0.9400          Median : 0.0000        Median : 5.000  
##  Mean   :0.9395          Mean   : 0.2400        Mean   : 6.564  
##  3rd Qu.:0.9500          3rd Qu.: 0.4252        3rd Qu.: 8.000  
##  Max.   :0.9700          Max.   : 4.4761        Max.   :26.000  
##                                                                 
##     density              census          nocrime                 geometry  
##  Min.   :3.223e-09   Min.   :     1   Min.   : 0.2842   MULTIPOLYGON :195  
##  1st Qu.:7.744e-08   1st Qu.:   179   1st Qu.: 1.3475   epsg:32017   :  0  
##  Median :1.708e-07   Median :   381   Median : 1.8872   +proj=tmer...:  0  
##  Mean   :2.548e-07   Mean   :  8096   Mean   : 2.1951                      
##  3rd Qu.:3.445e-07   3rd Qu.:  1104   3rd Qu.: 2.7113                      
##  Max.   :1.579e-06   Max.   :155102   Max.   :10.9625                      
##                      NA's   :6                                             
##  stotal_proficiency.V1 lag_stotal_proficiency.V1  GWRminorSig    
##  Min.   :-1.4430568    Min.   :-2.8284689        Min.   :0.1151  
##  1st Qu.:-0.8800795    1st Qu.:-0.4790368        1st Qu.:0.3078  
##  Median :-0.1759508    Median :-0.0713432        Median :0.3606  
##  Mean   : 0.0000000    Mean   :-0.0251374        Mean   :0.3619  
##  3rd Qu.: 0.6699360    3rd Qu.: 0.4655309        3rd Qu.:0.4232  
##  Max.   : 2.7390605    Max.   : 2.9275777        Max.   :0.5466  
## 
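
Before mapping the coefficients, you could map the significance measure itself. A minimal sketch (GWRminorSig is positive everywhere here, so every neighbourhood passes the rough 2-standard-error test):

#a sketch: map |coefficient| - 2*SE for black_or_hispanic
tm_shape(mods) +
  tm_polygons("GWRminorSig",
              style = "jenks",
              palette = "GnBu",
              title = "|coef| - 2*SE")
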
tmap_mode("plot")
tm11 <- tm_shape(modsfinal) + 
  tm_polygons(col = "coefEco",
              border.alpha=0.2,
              style="jenks", 
              palette='RdYlBu') + 
  tm_legend(show=TRUE)+
  tm_layout(frame=FALSE)+
  tm_credits("(a)", position=c(0.1,0.1), size=0.8)

tm22 <- tm_shape(modsfinal)+ 
  tm_polygons(col = "coefMinor",
              style="jenks",
              palette="GnBu")+
  tm_legend(show=TRUE)+
  tm_layout(frame=FALSE)+
  tm_credits("(b)", position=c(0.1,0.1), size=0.8)

tm33 <- tm_shape(modsfinal)+ 
  tm_polygons(col = 'coefCrime',
              style="fisher",
              midpoint = NA,
              palette="GnBu")+
  tm_legend(show=TRUE)+
  tm_layout(frame=FALSE)+
  tm_credits("(c)", position=c(0.1,0.1), size=0.8)


Countings <- tm_shape(modsfinal) +
  tm_polygons(col = 'coefAte',
              style="quantile",
              palette="GnBu") +
  tm_scale_bar(position=c(0.65,0.04), text.size=0.6)+
  tm_compass(north=0, position=c(0.77,0.14))+
  tm_layout(frame=FALSE)+
  tm_credits("(d)", position=c(0.1,0.1), size = 0.8)

t0 <- tmap_arrange(tm11, tm22, tm33, Countings, ncol=2)

t0

Reference

Mendez, C. (2020). Geographically weighted regression models: A tutorial using the spgwr package in R. RStudio/RPubs. Available at: https://rpubs.com/quarcs-lab/tutorial-gwr1

Data source

You can find the data on GitHub: https://github.com/hui0kookie/files