In this assignment, I continue to use 12-month enrollment data from IPEDS for the year 2016 and ACS 5-year estimates for the year 2015. In previous assignments, I have used proportion however, I use counts of enrolled individuals in post-graduate education programs and those of male Hispanics. I select male Hispanics as they are commonely underepresented when compared to female Hispanics at the graduate level.
I use a Poisson and a Negative Binomial Model in which I set the number of male Hispanics enrolled in post-graduate education programs as a numerator and the total number of individuals enrolled in post-graduate education as the denominator. Counts are aggregated by state.
In terms of predictors, I use poverty rate (classified by low, middle and high), vacancy rate, unemployment rate and marriage rate by state.
ipeds_map1<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds_Hisp_ResStatus_2.csv")
ipeds_map11<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds_institutions.csv")
ipeds_map2<-merge(ipeds_map1,ipeds_map11, by = "unitid")
#ipedsmerged_map2a2<-read.csv("C:/Users/PCMcC/Documents/Spatial Demography/Homeworks/HW2/Ipeds.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ipeds_map2<-rename(ipeds_map2, institution = instnm.x, year = year.x, state = stabbr, county_code = countycd , year = year.y, total_enrollment = efytotlt, total_female_enrollment = efytotlw, total_male_enrollment = efytotlm, non_resident_alien_total= efynralt, non_resident_alien_male = efynralm, non_resident_alien_female = efynralw, count_hispanic_male = efyhispm, count_hispanic_female = efyhispw, count_hispanic_total = efyhispt)
#Remove duplicated columns
ipeds_map2<-ipeds_map2[, !duplicated(colnames(ipeds_map2))]
Extract from ACS summary file merged_map2a profile variables from 2016 for all states The merged profile tables are very useful because they contain lots of pre-calculated variables.
allstates_acs<-get_acs(geography = "state", year = 2016,
variables=c("DP05_0001E", "DP03_0119PE", "DP04_0003PE", "DP02_0004PE", "DP03_0005PE"),
geometry = T, output = "wide")
## Getting data from the 2012-2016 5-year ACS
## Using the ACS Data Profile
#create a state FIPS code - 5 digit
allstates_acs$state<-substr(allstates_acs$GEOID, 1, 5)
#rename variables and filter missing cases
allstates_acs2<-allstates_acs%>%
mutate(totalpop= DP05_0001E,povertyrate=DP03_0119PE,vacancyrate=DP04_0003PE, marriagerate=DP02_0004PE, unemploymentrate=DP03_0005PE)
#st_transform(crs = 102009)%>%
# filter(complete.cases(povertyrate,vacancyrate,marriagerate,unemploymentrate))
library(sf)
library(RQGIS)
## Loading required package: reticulate
library(mapview)
##Set up QGIS environment
#This lets R find where your QGIS binaries are located. `set_env()` should work without you specifying the path, but in my case I had to tell R where Qgis lives.
my_env<-set_env(root="C:/Program Files/QGIS 2.18")
#set_env()
edupoints<-st_as_sf(ipeds_map2, coords=c("longitud","latitude"), crs=4269, agr="constant")
#I name it the name of the first cell.
#mapview(edupoints["institution"])
#colnames(edupoints)
library(sf)
st_write(edupoints,dsn = "ipeds_points", layer="ipeds_map2", driver = "ESRI Shapefile", delete_dsn = T)
library(dplyr)
ipeds_map<-aggregate(cbind(total_enrollment,total_male_enrollment,total_female_enrollment,non_resident_alien_total,non_resident_alien_male,non_resident_alien_female,count_hispanic_male,count_hispanic_female)~fips,data=ipeds_map2, FUN=sum)
ipeds_map<-mutate(ipeds_map, Prop_Hisp_Male=count_hispanic_male/total_male_enrollment*100, Prop_Hisp_Female=count_hispanic_female/total_female_enrollment*100, Prop_NRAlien_Male=non_resident_alien_male/total_male_enrollment*100, Prop_NRAlien_Female=non_resident_alien_female/total_female_enrollment*100)
library(haven)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(dplyr)
#ommitting non-continental US states
ipeds_map$fips[1:7]<-c("01","02","04","05","06","08","09")
project_merge<-left_join(ipeds_map,allstates_acs2, by=c('fips'='GEOID'))
#class(project_merge)
merged_map<-st_as_sf(project_merge)
##Removed non-continental states
merged_map<-merged_map[-c(02, 12),]
st_write(merged_map,dsn = "Ipeds_ACS_shapefile", layer = "project_merge", driver = "ESRI Shapefile", delete_dsn = T )
#class(merged_map)
names(merged_map)
The Poisson model is selected because For the Poisson model, the dataset includes counts for enrollmment which are strictly positive values.
Among the most relevant observations, we see that the number of male Hispanics with a post-graduate education is lower in areas with low poverty and higher in states with greater unemploymployment rate and marriage rates. The risk ratios confirm these results showing that states with greater unemployent are 30% more likely to have a larger number of Male Hispanics enrolled in Post-graduate education whereas areas with less poverty are almost 50% less likely to have Male Hispanics pursue graduate education.
#quantile(merged_map$povertyrate)
merged_map$povertyq<- ifelse(merged_map$povertyrate>=12.8, "high",
ifelse(merged_map$povertyrate >= 8.3 & merged_map$povertyrate <=12.7, "middle",
ifelse(merged_map$povertyrate <=8.2, "low",NA)))
fit_pois<- glm(count_hispanic_male ~ offset(log(total_enrollment)) + povertyq + scale(vacancyrate) + scale(unemploymentrate) + scale(marriagerate),
family=poisson,
data=merged_map)
summary(fit_pois)
##
## Call:
## glm(formula = count_hispanic_male ~ offset(log(total_enrollment)) +
## povertyq + scale(vacancyrate) + scale(unemploymentrate) +
## scale(marriagerate), family = poisson, data = merged_map)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -37.888 -17.089 -6.781 1.021 75.860
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.451646 0.006131 -563.015 < 2e-16 ***
## povertyqlow -0.536934 0.010519 -51.044 < 2e-16 ***
## povertyqmiddle -0.214533 0.007605 -28.209 < 2e-16 ***
## scale(vacancyrate) -0.025951 0.003857 -6.729 1.71e-11 ***
## scale(unemploymentrate) 0.263047 0.004951 53.129 < 2e-16 ***
## scale(marriagerate) 0.171120 0.004552 37.596 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 36717 on 48 degrees of freedom
## Residual deviance: 29013 on 43 degrees of freedom
## AIC: 29448
##
## Number of Fisher Scoring iterations: 5
If we exponentiate the coefficients from the model, I get the risk ratios:
exp(coef(fit_pois))
## (Intercept) povertyqlow povertyqmiddle
## 0.03169344 0.58453797 0.80691781
## scale(vacancyrate) scale(unemploymentrate) scale(marriagerate)
## 0.97438262 1.30088844 1.18663269
The model shows extreme overdispersion as demonstrated by the value of 25. Also in testing the Goodness of Fit Statistic we can confirm that this model does not fit the data.
#Model Fit from the Summary Function
scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale
## [1] 25.97528
#Goodness of Fit Statistic
1-pchisq(fit_pois$deviance, df = fit_pois$df.residual)
## [1] 0
In using the negative binomial model, we can see that the dispersion decreases to 3.5 which is much lower than the 25 value from the poisson model.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
fit_nb<- glm.nb(count_hispanic_male ~ offset(log(total_enrollment)) + povertyq + scale(vacancyrate) + scale(unemploymentrate) + scale(marriagerate),
data=merged_map)
summary(fit_nb)
##
## Call:
## glm.nb(formula = count_hispanic_male ~ offset(log(total_enrollment)) +
## povertyq + scale(vacancyrate) + scale(unemploymentrate) +
## scale(marriagerate), data = merged_map, init.theta = 3.592510147,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9984 -0.7847 -0.1700 0.2745 2.8783
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.67110 0.15977 -22.978 < 2e-16 ***
## povertyqlow -0.29043 0.23444 -1.239 0.21541
## povertyqmiddle -0.20466 0.19877 -1.030 0.30318
## scale(vacancyrate) -0.02040 0.07911 -0.258 0.79654
## scale(unemploymentrate) 0.25844 0.09523 2.714 0.00665 **
## scale(marriagerate) 0.08993 0.09330 0.964 0.33515
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(3.5925) family taken to be 1)
##
## Null deviance: 63.027 on 48 degrees of freedom
## Residual deviance: 51.140 on 43 degrees of freedom
## AIC: 762.25
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 3.593
## Std. Err.: 0.700
##
## 2 x log-likelihood: -748.252
Again, we see the same overall interpretation of the model effects, but the risk ratios are different compared to the Poisson model:
The risk ratios are modified when using the Negative Binomial Model, however similar trends than the ones shown using the Poisson model are also found.
exp(coef(fit_nb))
## (Intercept) povertyqlow povertyqmiddle
## 0.02544847 0.74793855 0.81492570
## scale(vacancyrate) scale(unemploymentrate) scale(marriagerate)
## 0.97981006 1.29490726 1.09409415
In looking at the possibility of clustering taking place in our data, we see that there is a very small autocorrelation in the residuals as demonstrated by the Moran I value of .16.
library(spdep)
## Loading required package: sp
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source'))`
nbs<-knearneigh(coordinates(as(merged_map, "Spatial")), k = 4, longlat = T)
nbs<-knn2nb(nbs, sym = T)
us.wt4<-nb2listw(nbs, style = "W")
lm.morantest(fit_pois, listw = us.wt4)
##
## Global Moran I for regression residuals
##
## data:
## model: glm(formula = count_hispanic_male ~
## offset(log(total_enrollment)) + povertyq + scale(vacancyrate) +
## scale(unemploymentrate) + scale(marriagerate), family = poisson,
## data = merged_map)
## weights: us.wt4
##
## Moran I statistic standard deviate = 0.87967, p-value = 0.1895
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 9.046573e-02 -3.316748e-05 1.058389e-02
lm.morantest(fit_nb, listw = us.wt4)
##
## Global Moran I for regression residuals
##
## data:
## model: glm.nb(formula = count_hispanic_male ~
## offset(log(total_enrollment)) + povertyq + scale(vacancyrate) +
## scale(unemploymentrate) + scale(marriagerate), data = merged_map,
## init.theta = 3.592510147, link = log)
## weights: us.wt4
##
## Moran I statistic standard deviate = 1.8268, p-value = 0.03386
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.161831874 -0.013590502 0.009221004
In reviewing the autocorrelation of the residuals, I am unable to find significant eigenvectors. Further analysis using smaller units such as counties may be more likely to show clustering.
me.fit<-ME(count_hispanic_male ~ offset(log(total_enrollment)) + povertyq + scale(vacancyrate) + scale(unemploymentrate) + scale(marriagerate), family=negative.binomial(3.593),
data=merged_map, listw = us.wt4, verbose=T,alpha=.5 )
## eV[,41], I: -3.110593e-05 ZI: NA, pr(ZI): 0.44
## eV[,30], I: 3.06864e-05 ZI: NA, pr(ZI): 0.45
## eV[,20], I: 2.566513e-05 ZI: NA, pr(ZI): 0.4
## eV[,18], I: 2.566513e-05 ZI: NA, pr(ZI): 0.39
## eV[,17], I: -0.000387106 ZI: NA, pr(ZI): 0.4
## eV[,23], I: -1.212028e-05 ZI: NA, pr(ZI): 0.47
## eV[,45], I: -0.0001404604 ZI: NA, pr(ZI): 0.42
## eV[,14], I: 0.000131513 ZI: NA, pr(ZI): 0.44
## eV[,16], I: -0.0001172024 ZI: NA, pr(ZI): 0.39
## eV[,13], I: 0.0001230917 ZI: NA, pr(ZI): 0.44
## eV[,10], I: 0.000154246 ZI: NA, pr(ZI): 0.37
## Warning: glm.fit: algorithm did not converge
## eV[,27], I: 0.0004981136 ZI: NA, pr(ZI): 0.51
me.fit
## Eigenvector ZI pr(ZI)
## 0 NA NA 0.39
## 1 41 NA 0.44
## 2 30 NA 0.45
## 3 20 NA 0.40
## 4 18 NA 0.39
## 5 17 NA 0.40
## 6 23 NA 0.47
## 7 45 NA 0.42
## 8 14 NA 0.44
## 9 16 NA 0.39
## 10 13 NA 0.44
## 11 10 NA 0.37
## 12 27 NA 0.51
fits<-data.frame(fitted(me.fit))
merged_map$me23<-fits$vec23
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
##
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
##
## plot
library(sf)
library(ggplot2)
merged_map%>%
ggplot()+
geom_sf(aes(fill=me23, color=me23))+
scale_color_viridis_c()+
scale_fill_viridis_c(na.value = "grey50")+
geom_sf(data=merged_map, color="black")+
coord_sf(crs = 102008)+
ggtitle(label = "Spatial Distribution Moran Eigenvector 23")
Based on the values of the eigenvector it uneceesary for this assignment to clean the autocorrelation of the eigenvectors.
#clean.nb<-glm.nb(count_hispanic_male ~ offset(log(total_enrollment)) + povertyq + scale(vacancyrate) + scale(unemploymentrate) + scale(marriagerate) + fitted(me.fit), merged_map)
#summary(clean.nb)
#lm.morantest(clean.nb, listw=us.wt4)
#AIC(clean.nb)
#AIC(fit_nb)