DEM 7263 - Homework Spatially Autoregressive Models 1

In this assignment, I use 12-month enrollment data from IPEDS for the year 2016 and ACS 5-year estimates for the year 2015. I aggregate all Hispanic students who were enrolled in a post-graduate program by state and utilize the proportion of Non-Resident Aliens also enrolled in post-graduate programs, poverty rates and unemployment rates; these last two variables were obtained from the ACS data. In terms of analysis, I set neighbors using a queen style weight matrix and then use an OLS model with the outcome variable of the proportion of male Hispanics enrolled in a post-graduate program. I plot the residuals and try different SAR models including a Spatial Lag Model and Spatial Error Model.

Part 1: Creating merged File with IPEDS and ACS Information

Merge two IPEDS file (institution characteristics and student enrollment patterns) with various 2016 variables: 12-Month Enrollment in Graduate Programs by Institution)

ipeds_map1<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds_Hisp_ResStatus_2.csv")
ipeds_map11<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds_institutions.csv")

ipeds_map2<-merge(ipeds_map1,ipeds_map11, by = "unitid")

Rename variables, remove duplicates and create proportions from IPEDS data

#ipedsmerged_map2a2<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds.csv")
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

ipeds_map2<-rename(ipeds_map2, institution = instnm.x, year = year.x, state = stabbr, county_code = countycd , year = year.y, total_enrollment = efytotlt, total_female_enrollment = efytotlw, total_male_enrollment = efytotlm, non_resident_alien_total= efynralt, non_resident_alien_male = efynralm, non_resident_alien_female = efynralw, count_hispanic_male = efyhispm, count_hispanic_female = efyhispw, count_hispanic_total = efyhispt)

#Remove duplicated columns
ipeds_map2<-ipeds_map2[, !duplicated(colnames(ipeds_map2))]

Getting ACS Summary Files

Extract from ACS summary file merged_map2a profile variables from 2016 for all states The merged profile tables are very useful because they contain lots of pre-calculated variables.

allstates_acs<-get_acs(geography = "state", year = 2016,
                variables=c("DP05_0001E", "DP03_0119PE", "DP04_0003PE", "DP02_0004PE", "DP03_0005PE"),
                geometry = T, output = "wide")

## Getting data from the 2012-2016 5-year ACS

## Using the ACS Data Profile

#create a state FIPS code - 5 digit
allstates_acs$state<-substr(allstates_acs$GEOID, 1, 5)


#rename variables and filter missing cases
allstates_acs2<-allstates_acs%>%
  mutate(totalpop= DP05_0001E,povertyrate=DP03_0119PE,vacancyrate=DP04_0003PE, marriagerate=DP02_0004PE, unemploymentrate=DP03_0005PE)
#st_transform(crs = 102009)%>%
 # filter(complete.cases(povertyrate,vacancyrate,marriagerate,unemploymentrate))

class(allstates_acs2)

## [1] "sf"         "data.frame"

#ipedsmerged_map2a2[is.na(ipedsmerged_map2a2)] <- 0

Creating Shapefile and mapping university locations

library(sf)
library(RQGIS)

## Loading required package: reticulate

library(mapview)

##Set up QGIS environment
#This lets R find where your QGIS binaries are located. `set_env()` should work without you specifying the path, but in my case I had to tell R where Qgis lives.

my_env<-set_env(root="C:/Program Files/QGIS 2.18")
#set_env()

edupoints<-st_as_sf(ipeds_map2, coords=c("longitud","latitude"), crs=4269, agr="constant")
#I name it the name of the first cell. 
mapview(edupoints["institution"])

colnames(edupoints)

##  [1] "unitid"                    "institution"              
##  [3] "year"                      "effylev"                  
##  [5] "total_enrollment"          "total_male_enrollment"    
##  [7] "total_female_enrollment"   "count_hispanic_total"     
##  [9] "count_hispanic_male"       "count_hispanic_female"    
## [11] "non_resident_alien_total"  "non_resident_alien_male"  
## [13] "non_resident_alien_female" "idx_e12"                  
## [15] "instnm.y"                  "instnm.1"                 
## [17] "state"                     "zip"                      
## [19] "fips"                      "county_code"              
## [21] "geometry"

Saving IPEDS spatial merged_map2a out of R

library(sf)
st_write(edupoints,dsn = "ipeds_points", layer="ipeds_map2", driver = "ESRI Shapefile", delete_dsn = T)

Aggregate IPEDS data by State FIPS Code and Create Proportions

library(dplyr)
ipeds_map<-aggregate(cbind(total_enrollment,total_male_enrollment,total_female_enrollment,non_resident_alien_total,non_resident_alien_male,non_resident_alien_female,count_hispanic_male,count_hispanic_female)~fips,data=ipeds_map2, FUN=sum)

ipeds_map<-mutate(ipeds_map, Prop_Hisp_Male=count_hispanic_male/total_male_enrollment*100, Prop_Hisp_Female=count_hispanic_female/total_female_enrollment*100, Prop_NRAlien_Male=non_resident_alien_male/total_male_enrollment*100, Prop_NRAlien_Female=non_resident_alien_female/total_female_enrollment*100)

Merging IPEDS data with ACS data

library(haven)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(dplyr)

#ommitting non-continental US states
ipeds_map$fips[1:7]<-c("01","02","04","05","06","08","09")
project_merge<-left_join(ipeds_map,allstates_acs2, by=c('fips'='GEOID'))
class(project_merge)

## [1] "data.frame"

merged_map<-st_as_sf(project_merge)
##Removed non-continental states 
merged_map<-merged_map[-c(02, 12),]

st_write(merged_map,dsn = "Ipeds_ACS_shapefile", layer = "project_merge", driver = "ESRI Shapefile", delete_dsn = T )
class(merged_map)

Part 2: OLS Model

Establishing Spatial Weights

library(nortest)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:purrr':
## 
##     some

## The following object is masked from 'package:dplyr':
## 
##     recode

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(classInt)
library(spdep)

## Loading required package: sp

## Loading required package: spData

## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source'))`

library(rgdal)

## rgdal: version: 1.3-6, (SVN revision 773)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
##  Path to GDAL shared files: C:/Users/PCMcC/Documents/R/win-library/3.5/rgdal/gdal
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: C:/Users/PCMcC/Documents/R/win-library/3.5/rgdal/proj
##  Linking to sp version: 1.3-1

library(RColorBrewer)

#Make a queen style weight matrix
usb<-poly2nb(as(merged_map, "Spatial"), queen=T)
summary(usb)

## Neighbour list object:
## Number of regions: 49 
## Number of nonzero links: 218 
## Percentage nonzero weights: 9.07955 
## Average number of links: 4.44898 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8 
##  1  5  9 10 10 10  2  2 
## 1 least connected region:
## 20 with 1 link
## 2 most connected regions:
## 26 43 with 8 links

usbw<-nb2listw(usb, glist=NULL, style="W", zero.policy = NULL)

Definining Outcome and Variables

I expect for the proportion of male Hispanics enrolled in graduate programs to be less in states that have higher levels of unemployment and poverty.

Fitting OLS Model

fit2<-lm(scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + scale(povertyrate) + scale(Prop_NRAlien_Male), data=merged_map )
summary(fit2)

## 
## Call:
## lm(formula = scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + 
##     scale(povertyrate) + scale(Prop_NRAlien_Male), data = merged_map)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4960 -0.6231 -0.1202  0.2355  4.1841 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)               2.083e-17  1.389e-01   0.000    1.000
## scale(unemploymentrate)   1.936e-01  1.750e-01   1.107    0.274
## scale(povertyrate)        1.926e-01  1.710e-01   1.126    0.266
## scale(Prop_NRAlien_Male) -2.878e-02  1.468e-01  -0.196    0.845
## 
## Residual standard error: 0.972 on 45 degrees of freedom
## Multiple R-squared:  0.1143, Adjusted R-squared:  0.0552 
## F-statistic: 1.935 on 3 and 45 DF,  p-value: 0.1375

Plotting the Model Residuals

In plotting the residuals and reviewing the Moran I result, we see a high clustering effect taking place in the dataset.

merged_map$resid<-rstudent(fit2)

merged_map2<-merged_map %>%
  mutate(cv_cut=cut(resid, breaks = quantile(resid, na.rm=T, p=seq(0,1,length.out = 6)),include.lowest = T))
#changed to ggplot in order to map residuals from an sf merged_map2a frame object
p2<-ggplot(merged_map2, aes( fill=cv_cut, color=cv_cut))+
  geom_sf() + ggtitle("Residuals ", subtitle = "U.S. States")+ scale_fill_brewer(palette = "Blues") + scale_color_brewer(palette = "Blues")
p2

lm.morantest(fit2, listw = usbw, resfun = rstudent)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = scale(Prop_Hisp_Male) ~
## scale(unemploymentrate) + scale(povertyrate) +
## scale(Prop_NRAlien_Male), data = merged_map)
## weights: usbw
## 
## Moran I statistic standard deviate = 2.7766, p-value = 0.002746
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##      0.208120547     -0.046453427      0.008406179

Fitting Alternative SAR Models

fit.lag<-lagsarlm(scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + scale(povertyrate) + scale(Prop_NRAlien_Male), data=merged_map2, listw=usbw, type="lag")
summary(fit.lag, Nagelkerke=T)

## 
## Call:lagsarlm(formula = scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + 
##     scale(povertyrate) + scale(Prop_NRAlien_Male), data = merged_map2, 
##     listw = usbw, type = "lag")
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15559 -0.49944 -0.15230  0.18613  3.93309 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)               0.020904   0.121750  0.1717   0.8637
## scale(unemploymentrate)   0.173757   0.154414  1.1253   0.2605
## scale(povertyrate)        0.149804   0.151402  0.9894   0.3224
## scale(Prop_NRAlien_Male) -0.012279   0.128697 -0.0954   0.9240
## 
## Rho: 0.46329, LR test value: 5.8948, p-value: 0.015186
## Asymptotic standard error: 0.14641
##     z-value: 3.1643, p-value: 0.0015545
## Wald statistic: 10.013, p-value: 0.0015545
## 
## Log likelihood: -63.10302 for lag model
## ML residual variance (sigma squared): 0.72596, (sigma: 0.85203)
## Nagelkerke pseudo-R-squared: 0.21465 
## Number of observations: 49 
## Number of parameters estimated: 6 
## AIC: 138.21, (AIC for lm: 142.1)
## LM test for residual autocorrelation
## test value: 1.6145, p-value: 0.20386

#Test for homoskedasticity
bptest.sarlm(fit.lag)

## 
##  studentized Breusch-Pagan test
## 
## data:  
## BP = 7.8278, df = 3, p-value = 0.04971

fit.err<-errorsarlm(scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + scale(povertyrate) + scale(Prop_NRAlien_Male), data=merged_map2, listw=usbw)
summary(fit.err, Nagelkerke=T)

## 
## Call:
## errorsarlm(formula = scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + 
##     scale(povertyrate) + scale(Prop_NRAlien_Male), data = merged_map2, 
##     listw = usbw)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.09515 -0.50825 -0.12062  0.19210  3.85829 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)              0.026879   0.231733  0.1160   0.9077
## scale(unemploymentrate)  0.247234   0.180906  1.3666   0.1717
## scale(povertyrate)       0.137403   0.178294  0.7707   0.4409
## scale(Prop_NRAlien_Male) 0.011020   0.134447  0.0820   0.9347
## 
## Lambda: 0.47521, LR test value: 5.8527, p-value: 0.015553
## Asymptotic standard error: 0.14959
##     z-value: 3.1768, p-value: 0.0014893
## Wald statistic: 10.092, p-value: 0.0014893
## 
## Log likelihood: -63.12405 for error model
## ML residual variance (sigma squared): 0.72413, (sigma: 0.85096)
## Nagelkerke pseudo-R-squared: 0.21397 
## Number of observations: 49 
## Number of parameters estimated: 6 
## AIC: 138.25, (AIC for lm: 142.1)

bptest.sarlm(fit.err)

## 
##  studentized Breusch-Pagan test
## 
## data:  
## BP = 6.9661, df = 3, p-value = 0.07299

Model Comparison

AIC(fit2)

## [1] 142.1008

AIC(fit.lag)

## [1] 138.206

AIC(fit.err)

## [1] 138.2481

The Spatial Lag Model appears to be the best test fitting based on the lowest AIC of 138.206.

Fitting Spatial Lag Model to OLS

library (spdep)
fit3<-lagsarlm(scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + scale(povertyrate) + scale(Prop_NRAlien_Male), data=merged_map, listw=usbw )
summary(fit3)

## 
## Call:lagsarlm(formula = scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + 
##     scale(povertyrate) + scale(Prop_NRAlien_Male), data = merged_map, 
##     listw = usbw)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15559 -0.49944 -0.15230  0.18613  3.93309 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)               0.020904   0.121750  0.1717   0.8637
## scale(unemploymentrate)   0.173757   0.154414  1.1253   0.2605
## scale(povertyrate)        0.149804   0.151402  0.9894   0.3224
## scale(Prop_NRAlien_Male) -0.012279   0.128697 -0.0954   0.9240
## 
## Rho: 0.46329, LR test value: 5.8948, p-value: 0.015186
## Asymptotic standard error: 0.14641
##     z-value: 3.1643, p-value: 0.0015545
## Wald statistic: 10.013, p-value: 0.0015545
## 
## Log likelihood: -63.10302 for lag model
## ML residual variance (sigma squared): 0.72596, (sigma: 0.85203)
## Number of observations: 49 
## Number of parameters estimated: 6 
## AIC: 138.21, (AIC for lm: 142.1)
## LM test for residual autocorrelation
## test value: 1.6145, p-value: 0.20386