About the data:
The overall data contains the number of students enrolled in the fall, by race/ethnicity, gender, attendance (full- or part-time) status and level of student for selected major fields of study.
The major fields of study included are Education, Engineering, Biological Sciences/Life Sciences, Mathematics, Physical Sciences, Business Management and Administrative Services, Law, Dentistry, and Medicine.
The data contains enrollment data for 4-year institutions only. Institutions with traditional academic year calendar systems (semester, quarter, trimester or 4-1-4) report their enrollment as of October 15 or the official fall reporting date of the institution. Institutions with calendar systems that differ by program or allow continuous enrollment report students that are enrolled at any time between August 1 and October 31.
The data has multiple records per institution. Each record is uniquely identified by the IPEDS ID (UNITID) together with the combination of major field of study, attendance status, and level of student (EFCIPLEV). Each record contains the total enrollment, the enrollment for men and for women, and the enrollment for men and for women in every race/ethnicity category.
Corresponding figure is here.
# Collapse one year's enrollment table into a single summary row; used below
# to build the `gender` data frame.
#   df   - data frame with the columns EFTOTLM, EFTOTLW and EFTOTLT
#   year - value stored in the `year` column of the result
# Returns a one-row data frame with columns year, total_men, total_women and
# grand_total. Missing values are skipped when summing.
# NOTE(review): every row of `df` is summed, and other parts of this file treat
# some rows as aggregates (e.g. LSTUDY 1 is labelled "All students total"), so
# confirm that these sums do not double count.
calculate_totals <- function(df, year) {
  data.frame(
    year = year,
    total_men = sum(df$EFTOTLM, na.rm = TRUE),
    total_women = sum(df$EFTOTLW, na.rm = TRUE),
    grand_total = sum(df$EFTOTLT, na.rm = TRUE)
  )
}
# years for the df's: for each entry a data frame named df<year>
# (df2004, df2006, ...) is looked up by name
years <- c(2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020, 2022)
# Compute one totals row per year, then bind all rows in a single call rather
# than growing `gender` inside a loop (which re-copies the accumulated rows on
# every pass and leaves loop temporaries in the workspace).
# The empty data frame stays the first input so the class of the result is
# derived from the same first input as before.
gender <- bind_rows(
  data.frame(),
  lapply(years, function(yr) calculate_totals(get(paste0("df", yr)), yr))
)
Corresponding figure is here.
# Total enrollment per major field (CIPCODE) for one year's data frame.
#   df   - data frame with CIPCODE and EFTOTLT columns
#   year - value written to the `Year` column of every output row
# Returns an ungrouped table with one row per CIPCODE and the columns CIPCODE,
# grand_total and Year. Missing EFTOTLT values are ignored.
# NOTE(review): all rows for a CIPCODE are summed; other parts of this file
# treat some rows as aggregates (e.g. "All students total"), so confirm the
# per-field sums do not double count.
cipcode_totals <- function(df, year) {
  by_field <- group_by(df, CIPCODE)
  field_totals <- summarise(
    by_field,
    grand_total = sum(EFTOTLT, na.rm = TRUE),
    .groups = "drop"
  )
  mutate(field_totals, Year = year)
}
# Per-field totals for every entry in `years`, bound together in one call
# instead of growing `cipcode` inside a loop. Each year's data frame is looked
# up by name (df<year>). The empty data frame stays the first input so the
# class of the result is derived from the same first input as before.
cipcode <- bind_rows(
  data.frame(),
  lapply(years, function(yr) cipcode_totals(get(paste0("df", yr)), yr))
)
# relabeling
# Print the CIPCODE column as a factor to see which codes are present; the
# result is not stored, the call is only there for its printed output.
as.factor(cipcode$CIPCODE)
## [1] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## [10] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## [19] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## [28] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## [37] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## [46] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## [55] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## [64] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## [73] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## [82] 13 14 22.0101 26 27 40 51.0401 51.1201 52
## Levels: 13 14 22.0101 26 27 40 51.0401 51.1201 52
# Replace each code with the name of its field of study, using a lookup keyed
# by the code's text form (the strings shown as factor levels above). A code
# that is not in the lookup becomes NA.
cipcode <- cipcode %>%
  mutate(CIPCODE = unname(c(
    "13" = "Education",
    "14" = "Engineering",
    "22.0101" = "Law",
    "26" = "Biological Sciences/Life Sciences",
    "27" = "Mathematics",
    "40" = "Physical Sciences",
    "51.0401" = "Dentistry",
    "51.1201" = "Medicine",
    "52" = "Business Management and Administrative Services"
  )[as.character(CIPCODE)]))
# creating life science majors and medicine vs others
# Rows for "Biological Sciences/Life Sciences" and "Medicine" keep their own
# name in `Major`; every other field is pooled under "Other". Enrollment is
# then totalled per Year and Major.
biomed <- cipcode %>%
  mutate(Major = case_when(
    CIPCODE %in% c("Biological Sciences/Life Sciences", "Medicine") ~ CIPCODE,
    TRUE ~ "Other"
  )) %>%
  group_by(Year, Major) %>%
  # .groups = "drop" returns an ungrouped result directly, so no trailing
  # ungroup() is needed and summarise() does not print its regrouping message
  summarise(grand_total = sum(grand_total), .groups = "drop")
Corresponding figure is here.
# Level-of-study totals for one year's data frame. The labels applied further
# down in this file name the four codes: all students total, undergraduate
# total, graduate and first professional.
#   df   - data frame with LSTUDY and EFTOTLT columns
#   year - value written to the `Year` column of every output row
# Only rows whose LSTUDY is 1, 2, 12 or 16 are kept. Returns an ungrouped table
# with one row per remaining LSTUDY value and the columns LSTUDY, grand_total
# and Year. Missing EFTOTLT values are ignored.
lstudy_totals <- function(df, year) {
  kept_rows <- filter(df, LSTUDY %in% c(1, 2, 12, 16))
  level_totals <- summarise(
    group_by(kept_rows, LSTUDY),
    grand_total = sum(EFTOTLT, na.rm = TRUE),
    .groups = "drop"
  )
  mutate(level_totals, Year = year)
}
# Level-of-study totals for every entry in `years`, bound together in one call
# instead of growing `lstudy` inside a loop. Each year's data frame is looked
# up by name (df<year>). The empty data frame stays the first input so the
# class of the result is derived from the same first input as before.
lstudy <- bind_rows(
  data.frame(),
  lapply(years, function(yr) lstudy_totals(get(paste0("df", yr)), yr))
)
# relabeling
# Replace each LSTUDY code with its display label, using a lookup keyed by the
# code's text form. A code that is not in the lookup becomes NA.
lstudy <- lstudy %>%
  mutate(LSTUDY = unname(c(
    "1" = "All students total",
    "2" = "All students, Undergraduate total",
    "12" = "All students, Graduate",
    "16" = "All students, First professional"
  )[as.character(LSTUDY)]))
Corresponding figure is here.
# Biological/life sciences subset: totals for the three EFCIPLEV codes that
# are labelled further down in this file as all students total, undergraduate
# total and graduate total.
#   df   - data frame with EFCIPLEV and EFTOTLT columns
#   year - value written to the `Year` column of every output row
# Only rows whose EFCIPLEV is 301, 302 or 312 are kept. Returns an ungrouped
# table with one row per remaining EFCIPLEV value and the columns EFCIPLEV,
# grand_total and Year. Missing EFTOTLT values are ignored.
biostat_totals <- function(df, year) {
  kept_rows <- filter(df, EFCIPLEV %in% c(301, 302, 312))
  level_totals <- summarise(
    group_by(kept_rows, EFCIPLEV),
    grand_total = sum(EFTOTLT, na.rm = TRUE),
    .groups = "drop"
  )
  mutate(level_totals, Year = year)
}
# Biological/life sciences totals for every entry in `years`, bound together
# in one call instead of growing `biostat` inside a loop. Each year's data
# frame is looked up by name (df<year>). The empty data frame stays the first
# input so the class of the result is derived from the same first input as
# before.
biostat <- bind_rows(
  data.frame(),
  lapply(years, function(yr) biostat_totals(get(paste0("df", yr)), yr))
)
# relabeling
# Replace each EFCIPLEV code with its display label, using a lookup keyed by
# the code's text form. A code that is not in the lookup becomes NA.
biostat <- biostat %>%
  mutate(EFCIPLEV = unname(c(
    "301" = "All students total",
    "302" = "Undergraduate total",
    "312" = "Graduate total"
  )[as.character(EFCIPLEV)]))
# Undergraduate breakdown (degree/certificate seeking): totals for the four
# EFCIPLEV codes that are labelled further down in this file as undergraduate
# total, total degree-seeking, first-time and other.
#   df   - data frame with EFCIPLEV and EFTOTLT columns
#   year - value written to the `Year` column of every output row
# Only rows whose EFCIPLEV is 302, 303, 304 or 305 are kept. Returns an
# ungrouped table with one row per remaining EFCIPLEV value and the columns
# EFCIPLEV, grand_total and Year. Missing EFTOTLT values are ignored.
biounder_totals <- function(df, year) {
  kept_rows <- filter(df, EFCIPLEV %in% c(302, 303, 304, 305))
  level_totals <- summarise(
    group_by(kept_rows, EFCIPLEV),
    grand_total = sum(EFTOTLT, na.rm = TRUE),
    .groups = "drop"
  )
  mutate(level_totals, Year = year)
}
# Undergraduate breakdown for every entry in `years`, bound together in one
# call instead of growing `biounder` inside a loop. Each year's data frame is
# looked up by name (df<year>). The empty data frame stays the first input so
# the class of the result is derived from the same first input as before.
biounder <- bind_rows(
  data.frame(),
  lapply(years, function(yr) biounder_totals(get(paste0("df", yr)), yr))
)
# relabel
# Replace each EFCIPLEV code with its display label, using a lookup keyed by
# the code's text form. A code that is not in the lookup becomes NA.
biounder <- biounder %>%
  mutate(EFCIPLEV = unname(c(
    "302" = "Undergraduate total",
    "303" = "Total Degree-Seeking",
    "304" = "First-time",
    "305" = "Other"
  )[as.character(EFCIPLEV)]))
Corresponding figure is here.
# Medicine only: totals for the three EFCIPLEV codes that are labelled further
# down in this file as all students, full time and part time.
#   df   - data frame with EFCIPLEV and EFTOTLT columns
#   year - value written to the `Year` column of every output row
# Only rows whose EFCIPLEV is 916, 936 or 956 are kept. Returns an ungrouped
# table with one row per remaining EFCIPLEV value and the columns EFCIPLEV,
# grand_total and Year. Missing EFTOTLT values are ignored.
med_totals <- function(df, year) {
  kept_rows <- filter(df, EFCIPLEV %in% c(916, 936, 956))
  level_totals <- summarise(
    group_by(kept_rows, EFCIPLEV),
    grand_total = sum(EFTOTLT, na.rm = TRUE),
    .groups = "drop"
  )
  mutate(level_totals, Year = year)
}
# Medicine totals for every entry in `years`, bound together in one call
# instead of growing `medicine` inside a loop. Each year's data frame is looked
# up by name (df<year>). The empty data frame stays the first input so the
# class of the result is derived from the same first input as before.
medicine <- bind_rows(
  data.frame(),
  lapply(years, function(yr) med_totals(get(paste0("df", yr)), yr))
)
# relabel
# Replace each EFCIPLEV code with its display label, using a lookup keyed by
# the code's text form. A code that is not in the lookup becomes NA.
medicine <- medicine %>%
  mutate(EFCIPLEV = unname(c(
    "916" = "All students",
    "936" = "Full time",
    "956" = "Part time"
  )[as.character(EFCIPLEV)]))
Data cleaning process is here.
# Line-and-point chart of the three `gender` series (women, men, combined)
# against year. Each layer maps colour to a constant label, so the label
# becomes that series' legend entry.
genderfig <- ggplot(data = gender, mapping = aes(x = year)) +
  # lines first, then points on top, in the same series order
  geom_line(aes(y = total_women, color = "Total Women"), linewidth = 0.75) +
  geom_line(aes(y = total_men, color = "Total Men"), linewidth = 0.75) +
  geom_line(aes(y = grand_total, color = "Combined Total"), linewidth = 0.75) +
  geom_point(aes(y = total_women, color = "Total Women"), size = 1) +
  geom_point(aes(y = total_men, color = "Total Men"), size = 1) +
  geom_point(aes(y = grand_total, color = "Combined Total"), size = 1) +
  # axis breaks every four years; enrollment shown with thousands separators
  scale_x_continuous(breaks = c(2002, 2006, 2010, 2014, 2018, 2022)) +
  scale_y_continuous(labels = scales::label_comma()) +
  scale_color_futurama(name = "Gender") +
  labs(
    title = "Fall Enrollment for Four-Year Institutions by Gender (2004-2022)",
    x = "Year",
    y = "Enrollment"
  ) +
  theme_light() +
  # centred title, horizontal legend below the panel
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.box = "horizontal",
    legend.position = "bottom"
  )
genderfig
Data cleaning process is here.
# Line-and-point chart of `cipcode`: one coloured series per CIPCODE value,
# enrollment total against Year.
cipcodefig <- ggplot(
  data = cipcode,
  mapping = aes(x = Year, y = grand_total, color = CIPCODE, group = CIPCODE)
) +
  geom_line(linewidth = 0.75) +
  geom_point(size = 1) +
  # axis breaks every four years; enrollment shown with thousands separators
  scale_x_continuous(breaks = c(2002, 2006, 2010, 2014, 2018, 2022)) +
  scale_y_continuous(labels = scales::label_comma()) +
  scale_color_futurama() +
  labs(
    title = "Fall Enrollment for Four-Year Institutions by Major Field of Study (2004-2022)",
    x = "Year",
    y = "Enrollment"
  ) +
  theme_light() +
  # centred title, horizontal legend below the panel
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.box = "horizontal",
    legend.position = "bottom"
  )
cipcodefig
# Line-and-point chart of `biomed`: one coloured series per Major value,
# enrollment total against Year.
biomedfig <- ggplot(
  data = biomed,
  mapping = aes(x = Year, y = grand_total, color = Major, group = Major)
) +
  geom_line(linewidth = 0.75) +
  geom_point(size = 1) +
  # axis breaks every four years; enrollment shown with thousands separators
  scale_x_continuous(breaks = c(2002, 2006, 2010, 2014, 2018, 2022)) +
  scale_y_continuous(labels = scales::label_comma()) +
  scale_color_futurama() +
  labs(
    title = "Fall Enrollment Biological Sciences/Life Sciences and Medicine vs Other Majors",
    x = "Year",
    y = "Enrollment"
  ) +
  theme_light() +
  # centred title, horizontal legend below the panel
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.box = "horizontal",
    legend.position = "bottom"
  )
biomedfig
Data cleaning process is here.
# Line-and-point chart of `biostat`: one coloured series per EFCIPLEV label,
# enrollment total against Year.
biostatfig <- ggplot(
  data = biostat,
  mapping = aes(x = Year, y = grand_total, color = EFCIPLEV, group = EFCIPLEV)
) +
  geom_line(linewidth = 0.75) +
  geom_point(size = 1) +
  # axis breaks every four years; enrollment shown with thousands separators
  scale_x_continuous(breaks = c(2002, 2006, 2010, 2014, 2018, 2022)) +
  scale_y_continuous(labels = scales::label_comma()) +
  scale_color_futurama() +
  labs(
    title = "Fall Level of Study Biological Sciences/Life Sciences (2004-2022)",
    x = "Year",
    y = "Enrollment"
  ) +
  theme_light() +
  # centred title, horizontal legend below the panel
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.box = "horizontal",
    legend.position = "bottom"
  )
biostatfig
Data cleaning process is here.
# Line-and-point chart of `biounder`: one coloured series per EFCIPLEV label,
# enrollment total against Year.
biounderfig <- ggplot(
  data = biounder,
  mapping = aes(x = Year, y = grand_total, color = EFCIPLEV, group = EFCIPLEV)
) +
  geom_line(linewidth = 0.75) +
  geom_point(size = 1) +
  # axis breaks every four years; enrollment shown with thousands separators
  scale_x_continuous(breaks = c(2002, 2006, 2010, 2014, 2018, 2022)) +
  scale_y_continuous(labels = scales::label_comma()) +
  scale_color_futurama() +
  labs(
    title = "Fall Undergraduate Categorization for Biological Sciences/Life Sciences (2004-2022)",
    x = "Year",
    y = "Enrollment"
  ) +
  theme_light() +
  # centred title, horizontal legend below the panel
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.box = "horizontal",
    legend.position = "bottom"
  )
biounderfig
Data cleaning process is here.
# Line-and-point chart of `medicine`: one coloured series per EFCIPLEV label,
# enrollment total against Year.
medicinefig <- ggplot(
  data = medicine,
  mapping = aes(x = Year, y = grand_total, color = EFCIPLEV, group = EFCIPLEV)
) +
  geom_line(linewidth = 0.75) +
  geom_point(size = 1) +
  # axis breaks every four years; enrollment shown with thousands separators
  scale_x_continuous(breaks = c(2002, 2006, 2010, 2014, 2018, 2022)) +
  scale_y_continuous(labels = scales::label_comma()) +
  scale_color_futurama() +
  labs(
    title = "Fall Level of Study Medicine Students (2004-2022)",
    x = "Year",
    y = "Enrollment"
  ) +
  theme_light() +
  # centred title, horizontal legend below the panel
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.box = "horizontal",
    legend.position = "bottom"
  )
medicinefig