DEM 7263 - Homework Generalized Linear Models for Spatial Count data

In this assignment, I continue to use 12-month enrollment data from IPEDS for the year 2016 and ACS 5-year estimates for the year 2015. In previous assignments, I have used proportion however, I use counts of enrolled individuals in post-graduate education programs and those of male Hispanics. I select male Hispanics as they are commonely underepresented when compared to female Hispanics at the graduate level.

I use a Poisson and a Negative Binomial Model in which I set the number of male Hispanics enrolled in post-graduate education programs as a numerator and the total number of individuals enrolled in post-graduate education as the denominator. Counts are aggregated by state.

In terms of predictors, I use poverty rate (classified by low, middle and high), vacancy rate, unemployment rate and marriage rate by state.

Merge two IPEDS file (institution characteristics and student enrollment patterns) with various 2016 variables: 12-Month Enrollment in Graduate Programs by Institution)

ipeds_map1<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds_Hisp_ResStatus_2.csv")
ipeds_map11<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds_institutions.csv")

ipeds_map2<-merge(ipeds_map1,ipeds_map11, by = "unitid")

Rename variables, remove duplicates and create proportions from IPEDS data

#ipedsmerged_map2a2<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds.csv")
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

ipeds_map2<-rename(ipeds_map2, institution = instnm.x, year = year.x, state = stabbr, county_code = countycd , year = year.y, total_enrollment = efytotlt, total_female_enrollment = efytotlw, total_male_enrollment = efytotlm, non_resident_alien_total= efynralt, non_resident_alien_male = efynralm, non_resident_alien_female = efynralw, count_hispanic_male = efyhispm, count_hispanic_female = efyhispw, count_hispanic_total = efyhispt)

#Remove duplicated columns
ipeds_map2<-ipeds_map2[, !duplicated(colnames(ipeds_map2))]

Getting ACS Summary Files

Extract from ACS summary file merged_map2a profile variables from 2016 for all states The merged profile tables are very useful because they contain lots of pre-calculated variables.

allstates_acs<-get_acs(geography = "state", year = 2016,
                variables=c("DP05_0001E", "DP03_0119PE", "DP04_0003PE", "DP02_0004PE", "DP03_0005PE"),
                geometry = T, output = "wide")

## Getting data from the 2012-2016 5-year ACS

## Using the ACS Data Profile

#create a state FIPS code - 5 digit
allstates_acs$state<-substr(allstates_acs$GEOID, 1, 5)


#rename variables and filter missing cases
allstates_acs2<-allstates_acs%>%
  mutate(totalpop= DP05_0001E,povertyrate=DP03_0119PE,vacancyrate=DP04_0003PE, marriagerate=DP02_0004PE, unemploymentrate=DP03_0005PE)
#st_transform(crs = 102009)%>%
 # filter(complete.cases(povertyrate,vacancyrate,marriagerate,unemploymentrate))

Creating Shapefile and mapping university locations

library(sf)
library(RQGIS)

## Loading required package: reticulate

library(mapview)

##Set up QGIS environment
#This lets R find where your QGIS binaries are located. `set_env()` should work without you specifying the path, but in my case I had to tell R where Qgis lives.

my_env<-set_env(root="C:/Program Files/QGIS 2.18")
#set_env()

edupoints<-st_as_sf(ipeds_map2, coords=c("longitud","latitude"), crs=4269, agr="constant")
#I name it the name of the first cell. 
#mapview(edupoints["institution"])
#colnames(edupoints)

Saving IPEDS spatial merged_map2a out of R

library(sf)
st_write(edupoints,dsn = "ipeds_points", layer="ipeds_map2", driver = "ESRI Shapefile", delete_dsn = T)

Aggregate IPEDS data by State FIPS Code and Create Proportions

library(dplyr)
ipeds_map<-aggregate(cbind(total_enrollment,total_male_enrollment,total_female_enrollment,non_resident_alien_total,non_resident_alien_male,non_resident_alien_female,count_hispanic_male,count_hispanic_female)~fips,data=ipeds_map2, FUN=sum)

ipeds_map<-mutate(ipeds_map, Prop_Hisp_Male=count_hispanic_male/total_male_enrollment*100, Prop_Hisp_Female=count_hispanic_female/total_female_enrollment*100, Prop_NRAlien_Male=non_resident_alien_male/total_male_enrollment*100, Prop_NRAlien_Female=non_resident_alien_female/total_female_enrollment*100)

Merging IPEDS data with ACS data

library(haven)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(dplyr)

#ommitting non-continental US states
ipeds_map$fips[1:7]<-c("01","02","04","05","06","08","09")
project_merge<-left_join(ipeds_map,allstates_acs2, by=c('fips'='GEOID'))
#class(project_merge)

merged_map<-st_as_sf(project_merge)
##Removed non-continental states 
merged_map<-merged_map[-c(02, 12),]

st_write(merged_map,dsn = "Ipeds_ACS_shapefile", layer = "project_merge", driver = "ESRI Shapefile", delete_dsn = T )
#class(merged_map)
names(merged_map)

Part 2.1: Poisson disribution for counts or rates

The Poisson model is selected because For the Poisson model, the dataset includes counts for enrollmment which are strictly positive values.

Among the most relevant observations, we see that the number of male Hispanics with a post-graduate education is lower in areas with low poverty and higher in states with greater unemploymployment rate and marriage rates. The risk ratios confirm these results showing that states with greater unemployent are 30% more likely to have a larger number of Male Hispanics enrolled in Post-graduate education whereas areas with less poverty are almost 50% less likely to have Male Hispanics pursue graduate education.

#quantile(merged_map$povertyrate)
merged_map$povertyq<- ifelse(merged_map$povertyrate>=12.8, "high",
                                     ifelse(merged_map$povertyrate >= 8.3 & merged_map$povertyrate <=12.7, "middle",
                                            ifelse(merged_map$povertyrate <=8.2, "low",NA)))

fit_pois<- glm(count_hispanic_male ~ offset(log(total_enrollment)) + povertyq + scale(vacancyrate) + scale(unemploymentrate) + scale(marriagerate), 
               family=poisson, 
               data=merged_map)
summary(fit_pois)

## 
## Call:
## glm(formula = count_hispanic_male ~ offset(log(total_enrollment)) + 
##     povertyq + scale(vacancyrate) + scale(unemploymentrate) + 
##     scale(marriagerate), family = poisson, data = merged_map)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -37.888  -17.089   -6.781    1.021   75.860  
## 
## Coefficients:
##                          Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)             -3.451646   0.006131 -563.015  < 2e-16 ***
## povertyqlow             -0.536934   0.010519  -51.044  < 2e-16 ***
## povertyqmiddle          -0.214533   0.007605  -28.209  < 2e-16 ***
## scale(vacancyrate)      -0.025951   0.003857   -6.729 1.71e-11 ***
## scale(unemploymentrate)  0.263047   0.004951   53.129  < 2e-16 ***
## scale(marriagerate)      0.171120   0.004552   37.596  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 36717  on 48  degrees of freedom
## Residual deviance: 29013  on 43  degrees of freedom
## AIC: 29448
## 
## Number of Fisher Scoring iterations: 5

If we exponentiate the coefficients from the model, I get the risk ratios:

exp(coef(fit_pois))

##             (Intercept)             povertyqlow          povertyqmiddle 
##              0.03169344              0.58453797              0.80691781 
##      scale(vacancyrate) scale(unemploymentrate)     scale(marriagerate) 
##              0.97438262              1.30088844              1.18663269

Part 2.2: Checking for Overdispersion

The model shows extreme overdispersion as demonstrated by the value of 25. Also in testing the Goodness of Fit Statistic we can confirm that this model does not fit the data.

#Model Fit from the Summary Function
scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale

## [1] 25.97528

#Goodness of Fit Statistic
1-pchisq(fit_pois$deviance, df = fit_pois$df.residual)

## [1] 0

Part 3: Negative Binomial disribution and Checking for Overdispersion

In using the negative binomial model, we can see that the dispersion decreases to 3.5 which is much lower than the 25 value from the poisson model.

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

fit_nb<- glm.nb(count_hispanic_male ~ offset(log(total_enrollment)) + povertyq + scale(vacancyrate) + scale(unemploymentrate) + scale(marriagerate), 
               data=merged_map)

summary(fit_nb)

## 
## Call:
## glm.nb(formula = count_hispanic_male ~ offset(log(total_enrollment)) + 
##     povertyq + scale(vacancyrate) + scale(unemploymentrate) + 
##     scale(marriagerate), data = merged_map, init.theta = 3.592510147, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9984  -0.7847  -0.1700   0.2745   2.8783  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -3.67110    0.15977 -22.978  < 2e-16 ***
## povertyqlow             -0.29043    0.23444  -1.239  0.21541    
## povertyqmiddle          -0.20466    0.19877  -1.030  0.30318    
## scale(vacancyrate)      -0.02040    0.07911  -0.258  0.79654    
## scale(unemploymentrate)  0.25844    0.09523   2.714  0.00665 ** 
## scale(marriagerate)      0.08993    0.09330   0.964  0.33515    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(3.5925) family taken to be 1)
## 
##     Null deviance: 63.027  on 48  degrees of freedom
## Residual deviance: 51.140  on 43  degrees of freedom
## AIC: 762.25
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  3.593 
##           Std. Err.:  0.700 
## 
##  2 x log-likelihood:  -748.252

Again, we see the same overall interpretation of the model effects, but the risk ratios are different compared to the Poisson model:

The risk ratios are modified when using the Negative Binomial Model, however similar trends than the ones shown using the Poisson model are also found.

exp(coef(fit_nb))

##             (Intercept)             povertyqlow          povertyqmiddle 
##              0.02544847              0.74793855              0.81492570 
##      scale(vacancyrate) scale(unemploymentrate)     scale(marriagerate) 
##              0.97981006              1.29490726              1.09409415

Part 4: Filtering GLMs for spatial autocorrelation

Autocorrelation in GLMs

In looking at the possibility of clustering taking place in our data, we see that there is a very small autocorrelation in the residuals as demonstrated by the Moran I value of .16.

library(spdep)

## Loading required package: sp

## Loading required package: spData

## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source'))`

nbs<-knearneigh(coordinates(as(merged_map, "Spatial")), k = 4, longlat = T)
nbs<-knn2nb(nbs, sym = T)
us.wt4<-nb2listw(nbs, style = "W")
lm.morantest(fit_pois, listw = us.wt4)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm(formula = count_hispanic_male ~
## offset(log(total_enrollment)) + povertyq + scale(vacancyrate) +
## scale(unemploymentrate) + scale(marriagerate), family = poisson,
## data = merged_map)
## weights: us.wt4
## 
## Moran I statistic standard deviate = 0.87967, p-value = 0.1895
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     9.046573e-02    -3.316748e-05     1.058389e-02

lm.morantest(fit_nb, listw = us.wt4)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm.nb(formula = count_hispanic_male ~
## offset(log(total_enrollment)) + povertyq + scale(vacancyrate) +
## scale(unemploymentrate) + scale(marriagerate), data = merged_map,
## init.theta = 3.592510147, link = log)
## weights: us.wt4
## 
## Moran I statistic standard deviate = 1.8268, p-value = 0.03386
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##      0.161831874     -0.013590502      0.009221004

Removing Spatial Autocorrelation

In reviewing the autocorrelation of the residuals, I am unable to find significant eigenvectors. Further analysis using smaller units such as counties may be more likely to show clustering.

me.fit<-ME(count_hispanic_male ~ offset(log(total_enrollment)) + povertyq + scale(vacancyrate) + scale(unemploymentrate) + scale(marriagerate), family=negative.binomial(3.593),
           data=merged_map, listw = us.wt4, verbose=T,alpha=.5 )

## eV[,41], I: -3.110593e-05 ZI: NA, pr(ZI): 0.44
## eV[,30], I: 3.06864e-05 ZI: NA, pr(ZI): 0.45
## eV[,20], I: 2.566513e-05 ZI: NA, pr(ZI): 0.4
## eV[,18], I: 2.566513e-05 ZI: NA, pr(ZI): 0.39
## eV[,17], I: -0.000387106 ZI: NA, pr(ZI): 0.4
## eV[,23], I: -1.212028e-05 ZI: NA, pr(ZI): 0.47
## eV[,45], I: -0.0001404604 ZI: NA, pr(ZI): 0.42
## eV[,14], I: 0.000131513 ZI: NA, pr(ZI): 0.44
## eV[,16], I: -0.0001172024 ZI: NA, pr(ZI): 0.39
## eV[,13], I: 0.0001230917 ZI: NA, pr(ZI): 0.44
## eV[,10], I: 0.000154246 ZI: NA, pr(ZI): 0.37

## Warning: glm.fit: algorithm did not converge

## eV[,27], I: 0.0004981136 ZI: NA, pr(ZI): 0.51

me.fit

##    Eigenvector ZI pr(ZI)
## 0           NA NA   0.39
## 1           41 NA   0.44
## 2           30 NA   0.45
## 3           20 NA   0.40
## 4           18 NA   0.39
## 5           17 NA   0.40
## 6           23 NA   0.47
## 7           45 NA   0.42
## 8           14 NA   0.44
## 9           16 NA   0.39
## 10          13 NA   0.44
## 11          10 NA   0.37
## 12          27 NA   0.51

fits<-data.frame(fitted(me.fit))
merged_map$me23<-fits$vec23

library(tigris)

## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.

## 
## Attaching package: 'tigris'

## The following object is masked from 'package:graphics':
## 
##     plot

library(sf)
library(ggplot2)

merged_map%>%
  ggplot()+
  geom_sf(aes(fill=me23, color=me23))+
  scale_color_viridis_c()+
  scale_fill_viridis_c(na.value = "grey50")+
  geom_sf(data=merged_map, color="black")+
  coord_sf(crs = 102008)+
  ggtitle(label = "Spatial Distribution Moran Eigenvector 23")

Based on the values of the eigenvector it uneceesary for this assignment to clean the autocorrelation of the eigenvectors.

#clean.nb<-glm.nb(count_hispanic_male ~ offset(log(total_enrollment)) + povertyq + scale(vacancyrate) + scale(unemploymentrate) + scale(marriagerate) + fitted(me.fit), merged_map)
#summary(clean.nb)
#lm.morantest(clean.nb, listw=us.wt4)
#AIC(clean.nb)
#AIC(fit_nb)