Alternative title + sign from the rally at the MA State House today: Melt the ICE
intro
This project is motivated by recent (Monday 7/6) ICE news here.
The NSF puts out data on doctoral degree earners and their citizenship – the most recent data covers 2006-2016. See table 7-4 here.
I use this data to show summary stats and graphs on the prevalence of international students (called "temporary residents" in the NSF data) in S&E PhD programs.
prep/clean
The NSF data is not in a natural shape for me to plot it. So, first things first. I need to clean it up. I just care about the permanent resident/US citizen and temporary resident dichotomy for now.
library(tidyverse);library(readxl)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.0     ✓ purrr   0.3.4
✓ tibble  3.0.1     ✓ dplyr   0.8.5
✓ tidyr   1.0.3     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.5.0
── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
# Load the raw NSF workbook as-is; the sheet's first row ("Table 7-4", then
# blanks) ends up as the column names, which is why readxl reports new names.
nsf <- read_xlsx(path = "degrees_nsf.xlsx")
New names:
* `` -> ...2
* `` -> ...3
* `` -> ...4
* `` -> ...5
* `` -> ...6
* ...
# Remove unneeded rows: the two rows sitting above the real header row, and
# the ethnicity/race breakdown rows (only the citizenship split is used below).
# The labels are matched exactly as they appear in the sheet; the trailing
# "a"/"b"/"c" are presumably footnote markers from the spreadsheet.
race_ethnicity_rows <- c(
  "Hispanic or Latinoa",
  "Non-Hispanic or Latino",
  "American Indian or Alaska Native",
  "Asian",
  "Asian or Pacific Islanderb",
  "Black or African American",
  "Native Hawaiian or Other Pacific Islander",
  "White",
  "More than one racec",
  "Other or unknown race and ethnicity"
)
nsf1 <- nsf %>%
  slice(-(1:2)) %>%
  # The is.na() guard keeps the behaviour of the former chain of `!=` tests,
  # which also dropped rows whose label is missing.
  filter(
    !is.na(`Table 7-4`),
    !(`Table 7-4` %in% race_ethnicity_rows)
  )
# Promote the first remaining row to column names. tibble >= 3.0.0 requires a
# character vector here, so flatten the one-row tibble instead of assigning it
# directly (which raised a lifecycle warning).
names(nsf1) <- as.character(unlist(nsf1[1, ]))
nsf1 <- nsf1[-1, ]
# Labels of the two citizenship rows that follow every field row in nsf1.
citizenship_rows <- c(
  "U.S. citizen and permanent resident",
  "Temporary resident"
)
# Field rows are everything that is not a citizenship row; `order` records
# their position in the table.
types <- nsf1 %>%
  filter(
    !is.na(`Field, citizenship, ethnicity, and race`),
    !(`Field, citizenship, ethnicity, and race` %in% citizenship_rows)
  ) %>%
  select(`Field, citizenship, ethnicity, and race`) %>%
  mutate(order = row_number())
# Repeat each field label three times (field total + the two citizenship rows)
# so that it lines up row-for-row with nsf1.
types3 <- bind_rows(types, types, types) %>%
  arrange(order)
# Fail loudly if the "three rows per field" layout does not hold.
stopifnot(nrow(types3) == nrow(nsf1))
# Name both label columns *before* binding. The previous version renamed the
# auto-generated duplicate column (`...race1`), a suffix that depends on the
# dplyr version's name-repair rules.
nsf2 <- nsf1 %>%
  rename(citizen = `Field, citizenship, ethnicity, and race`) %>%
  bind_cols(select(types3, type = `Field, citizenship, ethnicity, and race`)) %>%
  filter(citizen %in% citizenship_rows)
# Reshape to one row per field x citizenship x year. Year columns are the ones
# whose name starts with "2"; `value1` is the numeric copy of the count
# (the counts are presumably read in as text, hence the conversion).
nsf_long <- pivot_longer(nsf2, cols = starts_with("2"), names_to = "year")
nsf_long <- mutate(nsf_long, value1 = as.double(value))
# Drop the catch-all "Other" categories.
nsf_long <- filter(nsf_long, type != "Other")
summary stats
What % of all doctoral degree recipients are temporary residents?
# Doctoral degrees by citizenship across all fields, summed over all years.
all_counts <- nsf_long %>%
  filter(type == "All degrees") %>%
  group_by(citizen) %>%
  summarise(n = sum(value1))
all_counts
# Temporary-resident share, derived from the table above instead of from
# totals copied by hand out of the printed output.
with(all_counts, n[citizen == "Temporary resident"] / sum(n))
[1] 0.2725827
What % of S+E doctoral degree recipients are temporary residents?
# S&E doctoral degrees by citizenship, summed over all years.
se_counts <- nsf_long %>%
  filter(type == "All S&E") %>%
  group_by(citizen) %>%
  summarise(n = sum(value1))
se_counts
# Temporary-resident share, derived from the table above instead of from
# totals copied by hand out of the printed output.
with(se_counts, n[citizen == "Temporary resident"] / sum(n))
[1] 0.3782416
Economics?
# Economics doctoral degrees by citizenship, summed over all years.
econ_counts <- nsf_long %>%
  filter(type == "Economics") %>%
  group_by(citizen) %>%
  summarise(n = sum(value1))
econ_counts
# Temporary-resident share, derived from the table above instead of from
# totals copied by hand out of the printed output.
with(econ_counts, n[citizen == "Temporary resident"] / sum(n))
[1] 0.6039324
Graph for raw counts by citizenship and field
# Keep only the most detailed fields: drop the aggregate rows so each panel
# is a single field rather than a roll-up.
aggregate_fields <- c(
  "All degrees",
  "All S&E",
  "Non-S&E",
  "Science",
  "Engineering",
  "Earth, atmospheric, and ocean sciences",
  "Physical sciences",
  "Social sciences"
)
nsf_long_fields <- nsf_long %>%
  filter(!is.na(type), !(type %in% aggregate_fields)) %>%
  mutate(
    # Shorten the long labels so they fit in the facet strips.
    type = recode(
      type,
      "Political science and public administration" = "Poli sci & public admin",
      "Mathematics and statistics" = "Mathematics & statistics",
      "Ethnic and area studies" = "Ethnic & area studies"
    ),
    citizen = recode(
      citizen,
      "U.S. citizen and permanent resident" = "U.S. citizen or permanent resident"
    )
  )

# Stacked area of raw degree counts per year, one free-scaled panel per field.
ggplot(
  data = nsf_long_fields,
  mapping = aes(x = year, y = value1, fill = citizen, group = citizen)
) +
  theme_minimal() +
  theme(
    text = element_text(family = "Palatino", size = 13),
    plot.title.position = "plot",
    legend.position = "top",
    plot.title = element_text(size = 22),
    axis.title.y = element_text(size = 15)
  ) +
  geom_area(position = "stack") +
  scale_fill_manual(values = c("#009E73", "#999999"), name = "Citizenship") +
  facet_wrap(~type, scales = "free") +
  expand_limits(y = 0) +
  # Label every other year to keep the small panels readable.
  scale_x_discrete(
    labels = c("'06", "", "'08", "", "'10", "", "'12", "", "'14", "", "'16")
  ) +
  labs(
    title = "How Many Science & Engineering Doctoral Degree Earners are International Students?",
    subtitle = "NSF Data on Doctoral Degrees, 2006-2016",
    x = "",
    y = "Number of Doctoral Degrees Awarded",
    caption = "Viz by Alex Albright"
  )
ggsave("phd_count_by_citizenship.png", width = 12, height = 10, dpi = 300)

Graph by percent temporary resident within field
Nightingale plot
# Share of each field's degrees, per year, earned by temporary residents vs.
# U.S. citizens/permanent residents, in long form for a stacked plot.
nsf_long_fields1 <- nsf_long_fields %>%
  group_by(year, type) %>%
  mutate(tot = sum(value1)) %>%
  # Grouping is only needed for the per-field-year total.
  ungroup() %>%
  filter(citizen == "Temporary resident") %>%
  mutate(
    temp_perc = value1 / tot,
    cit_perm_perc = 1 - temp_perc
  ) %>%
  select(-c(value, value1, tot)) %>%
  # Select the share columns by name; positional selection (4:5) silently
  # depends on the upstream column order.
  pivot_longer(cols = c(temp_perc, cit_perm_perc)) %>%
  mutate(
    name = recode(
      name,
      temp_perc = "% Temporary resident",
      cit_perm_perc = "% U.S. citizen or permanent resident"
    )
  )

# Nightingale (polar stacked bar) plot: one rose per field, one wedge per year.
ggplot(
  data = nsf_long_fields1,
  mapping = aes(x = year, y = value, fill = name, group = name)
) +
  theme_minimal() +
  theme(
    text = element_text(family = "Palatino", size = 13),
    plot.margin = margin(0.1, 0.1, 0.1, 0.1, "cm"),
    plot.title.position = "plot",
    legend.position = "top",
    plot.title = element_text(size = 21),
    strip.text = element_text(size = 9),
    axis.text.x = element_text(size = 8),
    axis.title.y = element_text(size = 15)
  ) +
  geom_col(position = "stack") +
  coord_polar() +
  facet_wrap(~type, nrow = 5) +
  scale_fill_manual(values = c("#009E73", "#999999"), name = "Citizenship") +
  scale_x_discrete(
    labels = c(
      "'06", "'07", "'08", "'09", "'10",
      "'11", "'12", "'13", "'14", "'15", "'16"
    )
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "",
    y = "Percent of Doctoral Degrees Awarded",
    caption = "Viz by Alex Albright"
  ) +
  ggtitle(
    "What % of Science & Engineering Doctoral Degree Earners\nare International Students?",
    subtitle = "NSF Data on Doctoral Degrees, 2006-2016"
  )
ggsave("phd_per_by_citizenship.png", width = 8, height = 10, dpi = 300)

just a simple bar chart
Color economics for emphasis.
# One row per field: share of all degrees (every year pooled) that went to
# temporary residents, plus a 0/1 flag used to highlight Economics.
field_p <- nsf_long_fields %>%
  ungroup() %>%
  group_by(type) %>%
  mutate(
    temp_perc =
      sum(value1[which(citizen == "Temporary resident")]) / sum(value1)
  ) %>%
  filter(citizen == "Temporary resident") %>%
  distinct(type, temp_perc) %>%
  mutate(econ = as.numeric(type == "Economics"))

# Horizontal bars, ordered by share; Economics is drawn in the accent colour.
ggplot(
  data = field_p,
  mapping = aes(
    x = reorder(type, -temp_perc),
    y = temp_perc,
    fill = factor(econ),
    group = factor(econ)
  )
) +
  theme_minimal() +
  theme(
    text = element_text(family = "Palatino", size = 13),
    legend.position = "none",
    plot.title.position = "plot",
    plot.title = element_text(size = 20.5),
    strip.text = element_text(size = 9),
    axis.title.x = element_text(size = 18),
    axis.text = element_text(size = 15)
  ) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c("#999999", "#E69F00")) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, .7, .1)
  ) +
  labs(
    title = "What % of Science & Engineering Doctoral Degree Earners are International Students?",
    subtitle = "NSF Data on Doctoral Degrees, 2006-2016",
    x = "",
    y = "Percent of Doctoral Degrees Awarded to Temporary Residents",
    caption = "Viz by Alex Albright"
  )
ggsave("phd_per_temp.png", width = 11, height = 10, dpi = 300)

diagram for how NSF categorizes things
See here for source. More on the DiagrammeR package here.
library(DiagrammeR)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(DiagrammeRsvg)
library(magrittr)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
library(rsvg)
# Graphviz DOT specification (as a single string) for the NSF field hierarchy,
# laid out left-to-right. Each node label is a '@@N' placeholder that is
# substituted with the matching '[N]: ...' footnote listed after the closing
# brace. Edges run from each aggregate category to its sub-fields.
# NOTE(review): the labels here are hand-typed and differ slightly from the
# data labels used for plotting (e.g. 'Area and ethnic studies' here vs.
# "Ethnic and area studies" in the data) -- confirm which wording is intended.
graph<-"digraph {
graph [layout = dot, rankdir = LR]
# node definitions with substituted label text
node [fontname = Palatino, shape = rectangle]
tab1 [label = '@@1']
tab2 [label = '@@2']
tab3 [label = '@@3']
tab4 [label = '@@4']
tab5 [label = '@@5']
tab6 [label = '@@6']
tab7 [label = '@@7']
tab8 [label = '@@8']
tab9 [label = '@@9']
tab10 [label = '@@10']
tab11 [label = '@@11']
tab12 [label = '@@12']
tab13 [label = '@@13']
tab14 [label = '@@14']
tab15 [label = '@@15']
tab16 [label = '@@16']
tab17 [label = '@@17']
tab18 [label = '@@18']
tab19 [label = '@@19']
tab20 [label = '@@20']
tab21 [label = '@@21']
tab22 [label = '@@22']
tab23 [label = '@@23']
tab24 [label = '@@24']
tab25 [label = '@@25']
tab26 [label = '@@26']
tab27 [label = '@@27']
tab28 [label = '@@28']
tab29 [label = '@@29']
tab30 [label = '@@30']
tab31 [label = '@@31']
tab32 [label = '@@32']
tab33 [label = '@@33']
tab34 [label = '@@34']
tab35 [label = '@@35']
tab36 [label = '@@36']
# edge definitions with the node IDs
tab1 -> tab2;
tab1 -> tab3;
tab2 -> tab4;
tab2 -> tab5;
tab4 -> tab6;
tab4 -> tab7;
tab4 -> tab8;
tab4 -> tab9;
tab4 -> tab10;
tab4 -> tab11;
tab4 -> tab12;
tab4 -> tab13;
tab5 -> tab14;
tab5 -> tab15;
tab5 -> tab16;
tab5 -> tab17;
tab5 -> tab18;
tab5 -> tab19;
tab5 -> tab20;
tab5 -> tab21;
tab9 -> tab22;
tab9 -> tab23;
tab9 -> tab24;
tab11 -> tab25;
tab11 -> tab26;
tab11 -> tab27;
tab11 -> tab28;
tab13 -> tab29;
tab13 -> tab30;
tab13 -> tab31;
tab13 -> tab32;
tab13 -> tab33;
tab13 -> tab34;
tab13 -> tab35;
tab13 -> tab36;
}
[1]: 'All PhDs'
[2]: 'S&E'
[3]: 'Non-S&E'
[4]: 'Science'
[5]: 'Engineering'
[6]: 'Agricultural Sciences'
[7]: 'Biological Sciences'
[8]: 'Computer Sciences'
[9]: 'Earth, Atmospheric, & Ocean Sciences'
[10]: 'Mathematics and statistics'
[11]: 'Physical sciences'
[12]: 'Psychology'
[13]: 'Social sciences'
[14]: 'Aerospace'
[15]: 'Chemical'
[16]: 'Civil'
[17]: 'Electrical'
[18]: 'Industrial'
[19]: 'Materials'
[20]: 'Mechanical'
[21]: 'Other'
[22]: 'Atmospheric sciences'
[23]: 'Earth sciences'
[24]: 'Ocean sciences'
[25]: 'Astronomy'
[26]: 'Chemistry'
[27]: 'Physics'
[28]: 'Other'
[29]: 'Anthropology'
[30]: 'Area and ethnic studies'
[31]: 'Economics'
[32]: 'History of science'
[33]: 'Linguistics'
[34]: 'Political science & public administration'
[35]: 'Sociology'
[36]: 'Other'
"
# Render the DOT spec, export the widget as SVG text, and rasterise that SVG
# to a PNG file on disk.
nsf_cat_svg <- export_svg(grViz(graph))
rsvg_png(charToRaw(nsf_cat_svg), "nsf_cat.png")
