For this project, we utilize the Gini Index which is a summary measure of income inequality to see the dispersion of income accross the United States. Gender FIPS Code Name of Institution Number of degrees awarded masters and doctoral level 2013-2017

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##census_api_key(key="95a91ddea073de2c8d473e622e3cb99007afdfee", install=T)

Extract from ACS summary file data profile variables from 2015 for all states The data profile tables are very useful because they contain lots of pre-calculated variables.

allstates_acs<-get_acs(geography = "state", year = 2015,
                variables=c( "B19083_001"),
                summary_var = "B01001_001",
                geometry = T, output = "wide")
## Getting data from the 2011-2015 5-year ACS
#create a state FIPS code - 5 digit
allstates_acs$state<-substr(allstates_acs$GEOID, 1, 5)
#rename variables and filter missing cases
allstates_acs2<-allstates_acs%>%
  mutate(gini1= B19083_001E, gini2=B19083_001M)
#st_transform(crs = 102740)
class(allstates_acs2)
## [1] "sf"         "data.frame"

Maping of Gini Index

library(classInt)

#devtools::install_github("tidyverse/ggplot2",force=TRUE)
#devtools::install_github("oswaldosantos/ggsn",force=TRUE)
#devtools::install_github("jannes-m/RQGIS",force=TRUE)
library(dplyr)
library(ggplot2)
library(RQGIS)
## Loading required package: reticulate
library(ggsn)
## Loading required package: grid
library(classInt)
ginimap<-allstates_acs2 %>%
  mutate(cv_cut=cut(gini1, breaks =  quantile(gini1, na.rm=T, p=seq(0,1,length.out = 6)),include.lowest = T))

library(ggsn)  
p1<-ggplot(ginimap, aes( fill=cv_cut, color=cv_cut))+
  geom_sf() + 
  ggtitle("GINI Coefficient, 2015 ", 
          subtitle = "U.S. States")+
  scale_fill_brewer(palette = "Blues") + 
  scale_color_brewer(palette = "Blues")+
  #scale_fill_manual(values=myfil)+
  
    theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
  north(ginimap)+
  scalebar(ginimap, dist = 5,  dd2km =T, model="GRS80", st.size = 2)
p1

##PART II. IPEDS DATA

library(tigris)
## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
## 
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
## 
##     plot
library(sp)
library(tidyverse)
data(fips_codes)
ipedsdata<- read.csv("C:/Users/PCMcC/Documents/GIS Class/Assignments/Data_6-24-2018---118.csv", header = TRUE)
names(ipedsdata) #print the column names
##  [1] "UnitID"                                      
##  [2] "Institution.Name"                            
##  [3] "Grand.total..C2016_C..Master.s.degree."      
##  [4] "Grand.total.men..C2016_C..Master.s.degree."  
##  [5] "Grand.total.women..C2016_C..Master.s.degree."
##  [6] "Grand.total..C2016_C..Doctor.s.degree."      
##  [7] "Grand.total.men..C2016_C..Doctor.s.degree."  
##  [8] "Grand.total.women..C2016_C..Doctor.s.degree."
##  [9] "Institution..entity..name..HD2015."          
## [10] "ZIP.code..HD2015."                           
## [11] "Latitude.location.of.institution..HD2015."   
## [12] "Longitude.location.of.institution..HD2015."  
## [13] "City.location.of.institution..HD2015."       
## [14] "FIPS.state.code..HD2017."                    
## [15] "X"
ipedsdata2<-ipedsdata%>%
  transmute(UnitID=UnitID, longitude= Longitude.location.of.institution..HD2015.,latitude=Latitude.location.of.institution..HD2015.,Total_Masters=Grand.total..C2016_C..Master.s.degree.,Total_Masters_Men=Grand.total.men..C2016_C..Master.s.degree.,Total_Masters_Women=Grand.total.women..C2016_C..Master.s.degree.,Total_Doctoral=Grand.total..C2016_C..Doctor.s.degree., Total_Doctoral_Men=Grand.total.men..C2016_C..Doctor.s.degree., Total_Doctoral_Women=Grand.total.women..C2016_C..Doctor.s.degree., Zip_Code=ZIP.code..HD2015., City=City.location.of.institution..HD2015., FIPS_State=FIPS.state.code..HD2017., Institution.Name=Institution..entity..name..HD2015.) %>%
#  st_transform(crs = 102740)%>%
  filter(complete.cases(longitude,latitude,Total_Masters,Total_Masters_Women,Total_Masters_Men))
ipedsdata2[is.na(ipedsdata2)] <- 0
library(sf)
library(RQGIS) 
library(mapview)
edupoints<-st_as_sf(ipedsdata2, coords=c("longitude","latitude"), crs=4269, agr="constant")
#I name it the name of the first cell. 
mapview(edupoints["UnitID"])
colnames(edupoints)
##  [1] "UnitID"               "Total_Masters"        "Total_Masters_Men"   
##  [4] "Total_Masters_Women"  "Total_Doctoral"       "Total_Doctoral_Men"  
##  [7] "Total_Doctoral_Women" "Zip_Code"             "City"                
## [10] "FIPS_State"           "Institution.Name"     "geometry"

Saving IPEDS spatial data out of R

R is good at reading these data in, projecting them and we can also write the data out again, in case we wanted to share them or use them in QGIS.

st_write(edupoints,dsn = "C:/Users/PCMcC/Documents/GIS Class", layer = "ipeds_proj2", driver = "ESRI Shapefile", delete_dsn = T )

Saving GINI Index spatial data out of R

st_write(ginimap,dsn = "C:/Users/PCMcC/Documents/GIS Class/shapefiles", layer = "gini_proj3", driver = "ESRI Shapefile", delete_dsn = T )

Mapping of 2015 IPEDS Data

edupoints<-st_as_sf(ipedsdata2, coords=c("longitude","latitude"), crs=4269, agr="constant")
#I name it the name of the first cell. 
mapview(edupoints,zcol=c("Total_Masters_Women", "Total_Masters_Men", "Total_Masters","Total_Doctoral", "Total_Doctoral_Men","Total_Doctoral_Women"), legend = TRUE)
colnames(edupoints)
##  [1] "UnitID"               "Total_Masters"        "Total_Masters_Men"   
##  [4] "Total_Masters_Women"  "Total_Doctoral"       "Total_Doctoral_Men"  
##  [7] "Total_Doctoral_Women" "Zip_Code"             "City"                
## [10] "FIPS_State"           "Institution.Name"     "geometry"

Create Proportions for IPEDS Data

library(dplyr)
ipedsdatap<-aggregate(cbind(Total_Masters,Total_Doctoral,Total_Masters_Men,Total_Masters_Women,Total_Doctoral_Men,Total_Doctoral_Women )~FIPS_State,data=ipedsdata2, FUN=sum)

ipedsdatap<-mutate(ipedsdatap, Total_Masters_Men_P=Total_Masters_Men/Total_Masters*100, Total_Masters_Women_P=Total_Masters_Women/Total_Masters*100,Total_Doctoral_Men_P=Total_Doctoral_Men/Total_Doctoral*100,Total_Doctoral_Women_P=Total_Doctoral_Women/Total_Doctoral*100)

Merge IPEDS Data with GINI ACS Data

library(haven)
library(survey)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(dplyr)

ipedsdatap$FIPS_State[1:7]<-c("01","02","04","05","06","08","09")
project_merge<-left_join(ipedsdatap,allstates_acs2, by=c('FIPS_State'='GEOID'))
class(project_merge)
## [1] "data.frame"

Maping of Merged Data

library(dplyr)
library(ggplot2)
library(RQGIS)
library(ggsn)
library(classInt)
merged_map<-project_merge %>%
  mutate(cv_cut=cut(gini1, breaks =  quantile(gini1, na.rm=T, p=seq(0,1,length.out = 6)),include.lowest = T))

library(ggsn)  
p2<-ggplot(merged_map, aes( fill=cv_cut, color=cv_cut))+
  geom_sf() + 
  ggtitle("GINI Coefficient, 2015 ", 
          subtitle = "U.S. States")+
  scale_fill_brewer(palette = "Blues") + 
  scale_color_brewer(palette = "Blues")+
  #scale_fill_manual(values=myfil)+
  
    theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
  north(ginimap)+
  scalebar(merged_map, dist = 5,  dd2km =T, model="GRS80", st.size = 2)
## Warning in min(data$long): no non-missing arguments to min; returning Inf
## Warning in max(data$long): no non-missing arguments to max; returning -Inf
## Warning in min(data$lat): no non-missing arguments to min; returning Inf
## Warning in max(data$lat): no non-missing arguments to max; returning -Inf
## Warning in sin(lat): NaNs produced
## Warning in cos(phi): NaNs produced
## Warning in sin(phi): NaNs produced

## Warning in sin(phi): NaNs produced
## Warning in sin(lat): NaNs produced
## Warning in cos(phi): NaNs produced
## Warning in sin(phi): NaNs produced

## Warning in sin(phi): NaNs produced
p2
## Warning: Removed 2 rows containing missing values (geom_text).

Saving Merged File as Spatial Data Out of R to Continue Mapping

merged_map<-st_as_sf(merged_map)

st_write(merged_map,dsn = "C:/Users/PCMcC/Documents/GIS Class/shapefiles", layer = "project_merge", driver = "ESRI Shapefile", delete_dsn = T )
class(ginimap)

class(merged_map)

Basic Regression Analysis

I examine the correlation between the predictors which in this case include the different levels of education as well as gender.

library(readr)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(dplyr)

fit<-lm(gini1~Total_Masters_Men+Total_Masters_Women+Total_Doctoral_Women+Total_Doctoral_Men,data=merged_map)
fit1<-lm(gini1~Total_Masters_Men,data=merged_map)
fit2<-lm(gini1~Total_Masters_Women,data=merged_map)
fit3<-lm(gini1~Total_Doctoral_Men,data=merged_map)
fit4<-lm(gini1~Total_Doctoral_Men,data=merged_map)


summary (fit1)
## 
## Call:
## lm(formula = gini1 ~ Total_Masters_Men, data = merged_map)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.035234 -0.013424 -0.002272  0.011615  0.071835 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.530e-01  3.547e-03 127.690  < 2e-16 ***
## Total_Masters_Men 1.467e-06  3.774e-07   3.887 0.000305 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01887 on 49 degrees of freedom
## Multiple R-squared:  0.2357, Adjusted R-squared:  0.2201 
## F-statistic: 15.11 on 1 and 49 DF,  p-value: 0.0003048
summary (fit2)
## 
## Call:
## lm(formula = gini1 ~ Total_Masters_Women, data = merged_map)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.034899 -0.012384 -0.002538  0.011083  0.071985 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.526e-01  3.588e-03 126.146  < 2e-16 ***
## Total_Masters_Women 1.056e-06  2.683e-07   3.936 0.000261 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01882 on 49 degrees of freedom
## Multiple R-squared:  0.2403, Adjusted R-squared:  0.2247 
## F-statistic:  15.5 on 1 and 49 DF,  p-value: 0.0002612
summary (fit3)
## 
## Call:
## lm(formula = gini1 ~ Total_Doctoral_Men, data = merged_map)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.034291 -0.012588 -0.001689  0.011768  0.069448 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.522e-01  3.528e-03 128.171  < 2e-16 ***
## Total_Doctoral_Men 6.167e-06  1.483e-06   4.158 0.000129 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01856 on 49 degrees of freedom
## Multiple R-squared:  0.2608, Adjusted R-squared:  0.2457 
## F-statistic: 17.29 on 1 and 49 DF,  p-value: 0.0001287
summary (fit4)
## 
## Call:
## lm(formula = gini1 ~ Total_Doctoral_Men, data = merged_map)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.034291 -0.012588 -0.001689  0.011768  0.069448 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.522e-01  3.528e-03 128.171  < 2e-16 ***
## Total_Doctoral_Men 6.167e-06  1.483e-06   4.158 0.000129 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01856 on 49 degrees of freedom
## Multiple R-squared:  0.2608, Adjusted R-squared:  0.2457 
## F-statistic: 17.29 on 1 and 49 DF,  p-value: 0.0001287

Testing for Multicollinearity

vif(fit)
##    Total_Masters_Men  Total_Masters_Women Total_Doctoral_Women 
##             67.59436             44.12020            112.42851 
##   Total_Doctoral_Men 
##            147.25061

Additional Analysis

#Men Masters
A<-ggplot(merged_map, aes(gini1,Total_Masters_Men_P),xlab="Gini Index Value", ylab="% Males with Masters Degrees")
A<-A+geom_point(aes(colour=Total_Masters_Men_P))
A<-A+geom_line(aes(colour=Total_Masters_Men_P,group=Total_Masters_Men_P))
A<-A+geom_text(aes(label=ifelse(Total_Masters_Men_P>46,as.character(NAME),'')))
A<-A+geom_text(aes(label=ifelse(Total_Masters_Men_P<30,as.character(NAME),'')))

A<-A+ylab("% Masters Degrees")
A<-A+xlab("Gini Index Value")
A+ggtitle("Percentage of Master's Degrees Awarded to Males and Gini Index Value by State")
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

#Women Masters
B<-ggplot(merged_map, aes(gini1,Total_Masters_Women_P),xlab="Gini Index Value", ylab="% Females with Masters Degrees")
B<-B+geom_point(aes(colour=Total_Masters_Women_P))
B<-B+geom_line(aes(colour=Total_Masters_Women_P,group=Total_Masters_Women_P))
B<-B+geom_text(aes(label=ifelse(Total_Masters_Women_P>66.2,as.character(NAME),'')))
B<-B+geom_text(aes(label=ifelse(Total_Masters_Women_P<54,as.character(NAME),'')))



B<-B+ylab("% Masters Degrees")
B<-B+xlab("Gini Index Value")
B+ggtitle("Percentage of Master's Degrees Awarded to Females and Gini Index Value by State")
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

#Women Doctoral
C<-ggplot(merged_map, aes(gini1,Total_Doctoral_Women_P),xlab="Gini Index Value", ylab="% Females with Doctoral Degrees")
C<-C+geom_point(aes(colour=Total_Doctoral_Women_P))
C<-C+geom_line(aes(colour=Total_Doctoral_Women_P,group=Total_Doctoral_Women_P))
C<-C+geom_text(aes(label=ifelse(Total_Doctoral_Women_P>58,as.character(NAME),'')))
C<-C+geom_text(aes(label=ifelse(Total_Doctoral_Women_P<47,as.character(NAME),'')))


C<-C+ylab("% Doctoral Degrees")
C<-C+xlab("Gini Index Value")
C+ggtitle("Percentage of Doctoral Degrees Awarded to Females and Gini Index Value by State")
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

#Men Doctoral
D<-ggplot(merged_map, aes(gini1,Total_Doctoral_Men_P),xlab="Gini Index Value", ylab="% Males with Doctoral Degrees")
D<-D+geom_point(aes(colour=Total_Doctoral_Men_P))
D<-D+geom_line(aes(colour=Total_Doctoral_Men_P,group=Total_Doctoral_Women_P))
D<-D+geom_text(aes(label=ifelse(Total_Doctoral_Men_P>53,as.character(NAME),'')))
D<-D+geom_text(aes(label=ifelse(Total_Doctoral_Men_P<41,as.character(NAME),'')))


D<-D+ylab("% Doctoral Degrees")
D<-D+xlab("Gini Index Value")
D+ggtitle("Percentage of Doctoral Degrees Awarded to Males and Gini Index Value by State")
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

##Chi Square Analysis We compute chi-square analyses to observe all the various interactions with the gini index values.

#column gini index by males with a masters degree
chisq.test(table(merged_map$gini1, merged_map$Total_Masters_Men))
## Warning in chisq.test(table(merged_map$gini1, merged_map
## $Total_Masters_Men)): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(merged_map$gini1, merged_map$Total_Masters_Men)
## X-squared = 2499, df = 2450, p-value = 0.2404
#column gini index by females with a masters degree
chisq.test(table(merged_map$gini1, merged_map$Total_Masters_Women))
## Warning in chisq.test(table(merged_map$gini1, merged_map
## $Total_Masters_Women)): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(merged_map$gini1, merged_map$Total_Masters_Women)
## X-squared = 2499, df = 2450, p-value = 0.2404
#column gini index by females with a doctoral degree
chisq.test(table(merged_map$gini1, merged_map$Total_Doctoral_Women))
## Warning in chisq.test(table(merged_map$gini1, merged_map
## $Total_Doctoral_Women)): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(merged_map$gini1, merged_map$Total_Doctoral_Women)
## X-squared = 2499, df = 2450, p-value = 0.2404
#column gini index by males with a doctoral degree
chisq.test(table(merged_map$gini1, merged_map$Total_Doctoral_Men))
## Warning in chisq.test(table(merged_map$gini1, merged_map
## $Total_Doctoral_Men)): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(merged_map$gini1, merged_map$Total_Doctoral_Men)
## X-squared = 2448, df = 2401, p-value = 0.2472

Spatial Statistics

#Remove Alaska from calculations in order to compute Moran I from continental USA
library(dplyr)
merged_map_moran<-merged_map [-c(02,12),]
library(spdep)
nbsR<-poly2nb(as(merged_map_moran, "Spatial"), queen=F, row.names = merged_map_moran$FIPS_State)

plot(as(merged_map_moran, "Spatial"))
plot(nbsR, coords= coordinates(as(merged_map_moran, "Spatial")), add= T, col="red", cex=.75, pch='.')

###Moran I Statistic

library(spdep)
library(RColorBrewer)

#Queen Spatial weight matrix, row standardized
nbs<-poly2nb(as(merged_map_moran,"Spatial"), queen = T)
nbs
wts<-nb2listw(neighbours = nbs, style = "W")
moran.test(merged_map_moran$gini1, listw = wts)

Moran I Scatter plot

merged_map_moran$gini1<-as.numeric(scale(merged_map_moran$gini1))
moran.plot(x = merged_map_moran$gini1, listw = wts, labels = F, xlab = "Gini Coefficient in States- z score", ylab="Lagged Gini Coefficient in States - z score", main="Moran I Scatterplot of Gini Coefficient in the United States, 2015")