In this assignment, I use 12-month enrollment data from IPEDS for the year 2016 and ACS 5-year estimates for the year 2015. I aggregate all Hispanic students who were enrolled in a post-graduate program by state and utilize the proportion of Non-Resident Aliens also enrolled in post-graduate programs, poverty rates and unemployment rates; these last two variables were obtained from the ACS data. In terms of analysis, I set neighbors using a queen style weight matrix and then use an OLS model with the outcome variable of the proportion of male Hispanics enrolled in a post-graduate program. I plot the residuals and try different SAR models including a Spatial Lag Model and Spatial Error Model.
ipeds_map1<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds_Hisp_ResStatus_2.csv")
ipeds_map11<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds_institutions.csv")
ipeds_map2<-merge(ipeds_map1,ipeds_map11, by = "unitid")
#ipedsmerged_map2a2<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ipeds_map2<-rename(ipeds_map2, institution = instnm.x, year = year.x, state = stabbr, county_code = countycd , year = year.y, total_enrollment = efytotlt, total_female_enrollment = efytotlw, total_male_enrollment = efytotlm, non_resident_alien_total= efynralt, non_resident_alien_male = efynralm, non_resident_alien_female = efynralw, count_hispanic_male = efyhispm, count_hispanic_female = efyhispw, count_hispanic_total = efyhispt)
#Remove duplicated columns
ipeds_map2<-ipeds_map2[, !duplicated(colnames(ipeds_map2))]
Extract from ACS summary file merged_map2a profile variables from 2016 for all states The merged profile tables are very useful because they contain lots of pre-calculated variables.
allstates_acs<-get_acs(geography = "state", year = 2016,
variables=c("DP05_0001E", "DP03_0119PE", "DP04_0003PE", "DP02_0004PE", "DP03_0005PE"),
geometry = T, output = "wide")
## Getting data from the 2012-2016 5-year ACS
## Using the ACS Data Profile
#create a state FIPS code - 5 digit
allstates_acs$state<-substr(allstates_acs$GEOID, 1, 5)
#rename variables and filter missing cases
allstates_acs2<-allstates_acs%>%
mutate(totalpop= DP05_0001E,povertyrate=DP03_0119PE,vacancyrate=DP04_0003PE, marriagerate=DP02_0004PE, unemploymentrate=DP03_0005PE)
#st_transform(crs = 102009)%>%
# filter(complete.cases(povertyrate,vacancyrate,marriagerate,unemploymentrate))
class(allstates_acs2)
## [1] "sf" "data.frame"
#ipedsmerged_map2a2[is.na(ipedsmerged_map2a2)] <- 0
library(sf)
library(RQGIS)
## Loading required package: reticulate
library(mapview)
##Set up QGIS environment
#This lets R find where your QGIS binaries are located. `set_env()` should work without you specifying the path, but in my case I had to tell R where Qgis lives.
my_env<-set_env(root="C:/Program Files/QGIS 2.18")
#set_env()
edupoints<-st_as_sf(ipeds_map2, coords=c("longitud","latitude"), crs=4269, agr="constant")
#I name it the name of the first cell.
mapview(edupoints["institution"])
colnames(edupoints)
## [1] "unitid" "institution"
## [3] "year" "effylev"
## [5] "total_enrollment" "total_male_enrollment"
## [7] "total_female_enrollment" "count_hispanic_total"
## [9] "count_hispanic_male" "count_hispanic_female"
## [11] "non_resident_alien_total" "non_resident_alien_male"
## [13] "non_resident_alien_female" "idx_e12"
## [15] "instnm.y" "instnm.1"
## [17] "state" "zip"
## [19] "fips" "county_code"
## [21] "geometry"
library(sf)
st_write(edupoints,dsn = "ipeds_points", layer="ipeds_map2", driver = "ESRI Shapefile", delete_dsn = T)
library(dplyr)
ipeds_map<-aggregate(cbind(total_enrollment,total_male_enrollment,total_female_enrollment,non_resident_alien_total,non_resident_alien_male,non_resident_alien_female,count_hispanic_male,count_hispanic_female)~fips,data=ipeds_map2, FUN=sum)
ipeds_map<-mutate(ipeds_map, Prop_Hisp_Male=count_hispanic_male/total_male_enrollment*100, Prop_Hisp_Female=count_hispanic_female/total_female_enrollment*100, Prop_NRAlien_Male=non_resident_alien_male/total_male_enrollment*100, Prop_NRAlien_Female=non_resident_alien_female/total_female_enrollment*100)
library(haven)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(dplyr)
#ommitting non-continental US states
ipeds_map$fips[1:7]<-c("01","02","04","05","06","08","09")
project_merge<-left_join(ipeds_map,allstates_acs2, by=c('fips'='GEOID'))
class(project_merge)
## [1] "data.frame"
merged_map<-st_as_sf(project_merge)
##Removed non-continental states
merged_map<-merged_map[-c(02, 12),]
st_write(merged_map,dsn = "Ipeds_ACS_shapefile", layer = "project_merge", driver = "ESRI Shapefile", delete_dsn = T )
class(merged_map)
library(nortest)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(classInt)
library(spdep)
## Loading required package: sp
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source'))`
library(rgdal)
## rgdal: version: 1.3-6, (SVN revision 773)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
## Path to GDAL shared files: C:/Users/PCMcC/Documents/R/win-library/3.5/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
## Path to PROJ.4 shared files: C:/Users/PCMcC/Documents/R/win-library/3.5/rgdal/proj
## Linking to sp version: 1.3-1
library(RColorBrewer)
#Make a queen style weight matrix
usb<-poly2nb(as(merged_map, "Spatial"), queen=T)
summary(usb)
## Neighbour list object:
## Number of regions: 49
## Number of nonzero links: 218
## Percentage nonzero weights: 9.07955
## Average number of links: 4.44898
## Link number distribution:
##
## 1 2 3 4 5 6 7 8
## 1 5 9 10 10 10 2 2
## 1 least connected region:
## 20 with 1 link
## 2 most connected regions:
## 26 43 with 8 links
usbw<-nb2listw(usb, glist=NULL, style="W", zero.policy = NULL)
I expect for the proportion of male Hispanics enrolled in graduate programs to be less in states that have higher levels of unemployment and poverty.
fit2<-lm(scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + scale(povertyrate) + scale(Prop_NRAlien_Male), data=merged_map )
summary(fit2)
##
## Call:
## lm(formula = scale(Prop_Hisp_Male) ~ scale(unemploymentrate) +
## scale(povertyrate) + scale(Prop_NRAlien_Male), data = merged_map)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4960 -0.6231 -0.1202 0.2355 4.1841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.083e-17 1.389e-01 0.000 1.000
## scale(unemploymentrate) 1.936e-01 1.750e-01 1.107 0.274
## scale(povertyrate) 1.926e-01 1.710e-01 1.126 0.266
## scale(Prop_NRAlien_Male) -2.878e-02 1.468e-01 -0.196 0.845
##
## Residual standard error: 0.972 on 45 degrees of freedom
## Multiple R-squared: 0.1143, Adjusted R-squared: 0.0552
## F-statistic: 1.935 on 3 and 45 DF, p-value: 0.1375
In plotting the residuals and reviewing the Moran I result, we see a high clustering effect taking place in the dataset.
merged_map$resid<-rstudent(fit2)
merged_map2<-merged_map %>%
mutate(cv_cut=cut(resid, breaks = quantile(resid, na.rm=T, p=seq(0,1,length.out = 6)),include.lowest = T))
#changed to ggplot in order to map residuals from an sf merged_map2a frame object
p2<-ggplot(merged_map2, aes( fill=cv_cut, color=cv_cut))+
geom_sf() + ggtitle("Residuals ", subtitle = "U.S. States")+ scale_fill_brewer(palette = "Blues") + scale_color_brewer(palette = "Blues")
p2
lm.morantest(fit2, listw = usbw, resfun = rstudent)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = scale(Prop_Hisp_Male) ~
## scale(unemploymentrate) + scale(povertyrate) +
## scale(Prop_NRAlien_Male), data = merged_map)
## weights: usbw
##
## Moran I statistic standard deviate = 2.7766, p-value = 0.002746
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.208120547 -0.046453427 0.008406179
fit.lag<-lagsarlm(scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + scale(povertyrate) + scale(Prop_NRAlien_Male), data=merged_map2, listw=usbw, type="lag")
summary(fit.lag, Nagelkerke=T)
##
## Call:lagsarlm(formula = scale(Prop_Hisp_Male) ~ scale(unemploymentrate) +
## scale(povertyrate) + scale(Prop_NRAlien_Male), data = merged_map2,
## listw = usbw, type = "lag")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15559 -0.49944 -0.15230 0.18613 3.93309
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.020904 0.121750 0.1717 0.8637
## scale(unemploymentrate) 0.173757 0.154414 1.1253 0.2605
## scale(povertyrate) 0.149804 0.151402 0.9894 0.3224
## scale(Prop_NRAlien_Male) -0.012279 0.128697 -0.0954 0.9240
##
## Rho: 0.46329, LR test value: 5.8948, p-value: 0.015186
## Asymptotic standard error: 0.14641
## z-value: 3.1643, p-value: 0.0015545
## Wald statistic: 10.013, p-value: 0.0015545
##
## Log likelihood: -63.10302 for lag model
## ML residual variance (sigma squared): 0.72596, (sigma: 0.85203)
## Nagelkerke pseudo-R-squared: 0.21465
## Number of observations: 49
## Number of parameters estimated: 6
## AIC: 138.21, (AIC for lm: 142.1)
## LM test for residual autocorrelation
## test value: 1.6145, p-value: 0.20386
#Test for homoskedasticity
bptest.sarlm(fit.lag)
##
## studentized Breusch-Pagan test
##
## data:
## BP = 7.8278, df = 3, p-value = 0.04971
fit.err<-errorsarlm(scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + scale(povertyrate) + scale(Prop_NRAlien_Male), data=merged_map2, listw=usbw)
summary(fit.err, Nagelkerke=T)
##
## Call:
## errorsarlm(formula = scale(Prop_Hisp_Male) ~ scale(unemploymentrate) +
## scale(povertyrate) + scale(Prop_NRAlien_Male), data = merged_map2,
## listw = usbw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.09515 -0.50825 -0.12062 0.19210 3.85829
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.026879 0.231733 0.1160 0.9077
## scale(unemploymentrate) 0.247234 0.180906 1.3666 0.1717
## scale(povertyrate) 0.137403 0.178294 0.7707 0.4409
## scale(Prop_NRAlien_Male) 0.011020 0.134447 0.0820 0.9347
##
## Lambda: 0.47521, LR test value: 5.8527, p-value: 0.015553
## Asymptotic standard error: 0.14959
## z-value: 3.1768, p-value: 0.0014893
## Wald statistic: 10.092, p-value: 0.0014893
##
## Log likelihood: -63.12405 for error model
## ML residual variance (sigma squared): 0.72413, (sigma: 0.85096)
## Nagelkerke pseudo-R-squared: 0.21397
## Number of observations: 49
## Number of parameters estimated: 6
## AIC: 138.25, (AIC for lm: 142.1)
bptest.sarlm(fit.err)
##
## studentized Breusch-Pagan test
##
## data:
## BP = 6.9661, df = 3, p-value = 0.07299
AIC(fit2)
## [1] 142.1008
AIC(fit.lag)
## [1] 138.206
AIC(fit.err)
## [1] 138.2481
The Spatial Lag Model appears to be the best test fitting based on the lowest AIC of 138.206.
library (spdep)
fit3<-lagsarlm(scale(Prop_Hisp_Male) ~ scale(unemploymentrate) + scale(povertyrate) + scale(Prop_NRAlien_Male), data=merged_map, listw=usbw )
summary(fit3)
##
## Call:lagsarlm(formula = scale(Prop_Hisp_Male) ~ scale(unemploymentrate) +
## scale(povertyrate) + scale(Prop_NRAlien_Male), data = merged_map,
## listw = usbw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15559 -0.49944 -0.15230 0.18613 3.93309
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.020904 0.121750 0.1717 0.8637
## scale(unemploymentrate) 0.173757 0.154414 1.1253 0.2605
## scale(povertyrate) 0.149804 0.151402 0.9894 0.3224
## scale(Prop_NRAlien_Male) -0.012279 0.128697 -0.0954 0.9240
##
## Rho: 0.46329, LR test value: 5.8948, p-value: 0.015186
## Asymptotic standard error: 0.14641
## z-value: 3.1643, p-value: 0.0015545
## Wald statistic: 10.013, p-value: 0.0015545
##
## Log likelihood: -63.10302 for lag model
## ML residual variance (sigma squared): 0.72596, (sigma: 0.85203)
## Number of observations: 49
## Number of parameters estimated: 6
## AIC: 138.21, (AIC for lm: 142.1)
## LM test for residual autocorrelation
## test value: 1.6145, p-value: 0.20386