1 Overview

The following code is designed to clean and prepare data for analyzing degree enrollments and completions by various student and institution characteristics. The data was downloaded from IPEDS and spans the years 2004-2022. This document is divided into two sections, both of which contain further details about the data used.

2 Data Cleaning

# 2004-2007: these four yearly files have the same number of variables, so
# they are stacked first and cleaned in a single pass
enr04_07 <- rbind(enr2004, enr2005, enr2006, enr2007)
rm(enr2004, enr2005, enr2006, enr2007)

enr04_07 <- enr04_07 %>%
  # drop every column whose name contains "XFYRAC"
  select(-contains("XFYRAC")) %>%
  # replace the level codes with readable labels
  mutate(
    EFFYLEV = recode(EFFYLEV, !!!c("1" = "All students total",
                                   "2" = "Undergraduate",
                                   "3" = "First professional",
                                   "4" = "Graduate")),
    LSTUDY = recode(LSTUDY, !!!c("1" = "Undergraduate",
                                 "2" = "First professional",
                                 "3" = "Graduate",
                                 "999" = "Generated total"))
  ) %>%
  # harmonised column name = FYRACExx column it is taken from
  rename(all_of(c(
    NONRESM = "FYRACE01", NONRESW = "FYRACE02", NONREST = "FYRACE17",
    BLACKM  = "FYRACE03", BLACKW  = "FYRACE04", BLACKT  = "FYRACE18",
    AIANM   = "FYRACE05", AIANW   = "FYRACE06", AIANT   = "FYRACE19",
    AAPIM   = "FYRACE07", AAPIW   = "FYRACE08", AAPIT   = "FYRACE20",
    HISPM   = "FYRACE09", HISPW   = "FYRACE10", HISPT   = "FYRACE21",
    WHITEM  = "FYRACE11", WHITEW  = "FYRACE12", WHITET  = "FYRACE22",
    UNRACEM = "FYRACE13", UNRACEW = "FYRACE14", UNRACET = "FYRACE23",
    TOTALM  = "FYRACE15", TOTALW  = "FYRACE16", TOTAL   = "FYRACE24"
  ))) %>%
  # no FYRACExx column is mapped to the MULTRACE* names for these years, so
  # the three columns are added as all-NA placeholders
  mutate(MULTRACEM = NA, MULTRACEW = NA, MULTRACET = NA)


# 2008-2010: stack the three yearly files, then clean them in a single pass
enr08_10 <- rbind(enr2008, enr2009, enr2010)
rm(enr2008, enr2009, enr2010)

enr08_10 <- enr08_10 %>%
  # drop every column whose name contains one of these patterns
  select(-contains(c("XFYRAC", "XEFY", "XDVEY", "XEY"))) %>%
  # keep only the identifier, count and institution columns, in this order
  select(
    UNITID, YEAR, EFFYLEV, LSTUDY,
    EFYNRALM, EFYNRALW, EFYUNKNM, EFYUNKNW, EFYTOTLM, EFYTOTLW,
    EFYNRALT, EFYUNKNT, EFYTOTLT,
    EFY2MORM, EFY2MORW, EFY2MORT,
    DVEYAIT, DVEYAIM, DVEYAIW,
    DVEYAPT, DVEYAPM, DVEYAPW,
    DVEYBKT, DVEYBKM, DVEYBKW,
    DVEYHST, DVEYHSM, DVEYHSW,
    DVEYWHT, DVEYWHM, DVEYWHW,
    STABBR, SECTOR, ICLEVEL, HLOFFER, UGOFFER, GROFFER,
    HOSPITAL, MEDICAL, CARNEGIE, CYACTIVE
  ) %>%
  # replace the level codes with readable labels
  mutate(
    EFFYLEV = recode(EFFYLEV, !!!c("1" = "All students total",
                                   "2" = "Undergraduate",
                                   "3" = "First professional",
                                   "4" = "Graduate")),
    LSTUDY = recode(LSTUDY, !!!c("1" = "Undergraduate",
                                 "2" = "First professional",
                                 "3" = "Graduate",
                                 "999" = "Generated total"))
  ) %>%
  # harmonised column name = EFY*/DVEY* column it is taken from
  rename(all_of(c(
    NONRESM   = "EFYNRALM", NONRESW   = "EFYNRALW", NONREST   = "EFYNRALT",
    BLACKM    = "DVEYBKM",  BLACKW    = "DVEYBKW",  BLACKT    = "DVEYBKT",
    AIANM     = "DVEYAIM",  AIANW     = "DVEYAIW",  AIANT     = "DVEYAIT",
    AAPIM     = "DVEYAPM",  AAPIW     = "DVEYAPW",  AAPIT     = "DVEYAPT",
    HISPM     = "DVEYHSM",  HISPW     = "DVEYHSW",  HISPT     = "DVEYHST",
    WHITEM    = "DVEYWHM",  WHITEW    = "DVEYWHW",  WHITET    = "DVEYWHT",
    UNRACEM   = "EFYUNKNM", UNRACEW   = "EFYUNKNW", UNRACET   = "EFYUNKNT",
    TOTALM    = "EFYTOTLM", TOTALW    = "EFYTOTLW", TOTAL     = "EFYTOTLT",
    MULTRACEM = "EFY2MORM", MULTRACEW = "EFY2MORW", MULTRACET = "EFY2MORT"
  )))

# 2011-2013: stack the three yearly files, then clean them in a single pass

enr11_13 <- rbind(enr2011, enr2012, enr2013)
rm(enr2011, enr2012, enr2013)

enr11_13 <- enr11_13 %>% 
  # drop every column whose name contains "XEY" or "XEFY"
  select(-contains(c("XEY", "XEFY"))) %>% 
  # replace the level codes with readable labels (no "First professional"
  # code is mapped for these years)
  mutate(EFFYLEV = recode(EFFYLEV, 
                          '1' = 'All students total', 
                          '2' = 'Undergraduate', 
                          '4' = 'Graduate')) %>% 
  mutate(LSTUDY = recode(LSTUDY, 
                         '1' = 'Undergraduate', 
                         '3' = 'Graduate', 
                         '999' = 'Generated total')) %>% 
  # Combine the EFYASIA* and EFYNHPI* counts into single AAPI* columns, the
  # layout used for the earlier years. A single missing component counts as
  # zero, but the result stays NA when BOTH components are missing;
  # rowSums(na.rm = TRUE) alone would report 0 for such rows.
  mutate(AAPIM = replace(rowSums(.[, c("EFYASIAM", "EFYNHPIM")], na.rm = TRUE),
                         is.na(EFYASIAM) & is.na(EFYNHPIM), NA),
         AAPIW = replace(rowSums(.[, c("EFYASIAW", "EFYNHPIW")], na.rm = TRUE),
                         is.na(EFYASIAW) & is.na(EFYNHPIW), NA),
         AAPIT = replace(rowSums(.[, c("EFYASIAT", "EFYNHPIT")], na.rm = TRUE),
                         is.na(EFYASIAT) & is.na(EFYNHPIT), NA)) %>%
  # the component columns are no longer needed once the AAPI* sums exist
  subset(., select = -c(EFYASIAM, EFYNHPIM, EFYASIAW, EFYNHPIW, EFYASIAT, EFYNHPIT)) %>% 
  # harmonised column name = EFY* column it is taken from
  rename(NONRESM = EFYNRALM,
         NONRESW = EFYNRALW,
         NONREST = EFYNRALT,
         
         BLACKM = EFYBKAAM,
         BLACKW = EFYBKAAW,
         BLACKT = EFYBKAAT,
         
         AIANM = EFYAIANM,
         AIANW = EFYAIANW,
         AIANT = EFYAIANT,

         HISPM = EFYHISPM,
         HISPW = EFYHISPW,
         HISPT = EFYHISPT,
         
         WHITEM = EFYWHITM,
         WHITEW = EFYWHITW,
         WHITET = EFYWHITT,
         
         UNRACEM = EFYUNKNM,
         UNRACEW = EFYUNKNW,
         UNRACET = EFYUNKNT,
         
         TOTALM = EFYTOTLM,
         TOTALW = EFYTOTLW,
         TOTAL = EFYTOTLT,
         
         MULTRACEM = EFY2MORM,
         MULTRACEW = EFY2MORW,
         MULTRACET = EFY2MORT) 
  


# 2014-2019: stack the six yearly files, then clean them in a single pass

enr14_19 <- rbind(enr2014, enr2015, enr2016, enr2017, enr2018, enr2019)
rm(enr2014, enr2015, enr2016, enr2017, enr2018, enr2019)

enr14_19 <- enr14_19 %>% 
  # drop every column whose name contains "XEY" or "XEFY"
  select(-contains(c("XEY", "XEFY"))) %>% 
  # replace the level codes with readable labels (no "First professional"
  # code is mapped for these years)
  mutate(EFFYLEV = recode(EFFYLEV, 
                          '1' = 'All students total', 
                          '2' = 'Undergraduate', 
                          '4' = 'Graduate')) %>% 
  mutate(LSTUDY = recode(LSTUDY, 
                         '1' = 'Undergraduate', 
                         '3' = 'Graduate', 
                         '999' = 'Generated total')) %>% 
  # Combine the EFYASIA* and EFYNHPI* counts into single AAPI* columns, the
  # layout used for the earlier years. A single missing component counts as
  # zero, but the result stays NA when BOTH components are missing;
  # rowSums(na.rm = TRUE) alone would report 0 for such rows.
  mutate(AAPIM = replace(rowSums(.[, c("EFYASIAM", "EFYNHPIM")], na.rm = TRUE),
                         is.na(EFYASIAM) & is.na(EFYNHPIM), NA),
         AAPIW = replace(rowSums(.[, c("EFYASIAW", "EFYNHPIW")], na.rm = TRUE),
                         is.na(EFYASIAW) & is.na(EFYNHPIW), NA),
         AAPIT = replace(rowSums(.[, c("EFYASIAT", "EFYNHPIT")], na.rm = TRUE),
                         is.na(EFYASIAT) & is.na(EFYNHPIT), NA)) %>%
  # the component columns are no longer needed once the AAPI* sums exist
  subset(., select = -c(EFYASIAM, EFYNHPIM, EFYASIAW, EFYNHPIW, EFYASIAT, EFYNHPIT)) %>% 
  # harmonised column name = EFY* column it is taken from
  rename(NONRESM = EFYNRALM,
         NONRESW = EFYNRALW,
         NONREST = EFYNRALT,
         
         BLACKM = EFYBKAAM,
         BLACKW = EFYBKAAW,
         BLACKT = EFYBKAAT,
         
         AIANM = EFYAIANM,
         AIANW = EFYAIANW,
         AIANT = EFYAIANT,

         HISPM = EFYHISPM,
         HISPW = EFYHISPW,
         HISPT = EFYHISPT,
         
         WHITEM = EFYWHITM,
         WHITEW = EFYWHITW,
         WHITET = EFYWHITT,
         
         UNRACEM = EFYUNKNM,
         UNRACEW = EFYUNKNW,
         UNRACET = EFYUNKNT,
         
         TOTALM = EFYTOTLM,
         TOTALW = EFYTOTLW,
         TOTAL = EFYTOTLT,
         
         MULTRACEM = EFY2MORM,
         MULTRACEW = EFY2MORW,
         MULTRACET = EFY2MORT) 


# 2020-2021: stack the two yearly files, then clean them in a single pass

enr20_21 <- rbind(enr2020, enr2021)
rm(enr2020, enr2021)


enr20_21 <- enr20_21 %>% 
  # drop every column whose name contains "XEY" or "XEFY"
  select(-contains(c("XEY", "XEFY"))) %>% 
  # drop rows coded -2 in EFFYLEV; rows with a missing EFFYLEV are dropped as
  # well, because filter() keeps only rows where the condition is TRUE
  filter(EFFYLEV != "-2") %>% 
  # replace the level codes with readable labels (no "First professional"
  # code is mapped for these years)
  mutate(EFFYLEV = recode(EFFYLEV, 
                          '1' = 'All students total', 
                          '2' = 'Undergraduate', 
                          '4' = 'Graduate')) %>% 
  mutate(LSTUDY = recode(LSTUDY, 
                         '1' = 'Undergraduate', 
                         '3' = 'Graduate', 
                         '999' = 'Generated total')) %>% 
  # Combine the EFYASIA* and EFYNHPI* counts into single AAPI* columns, the
  # layout used for the earlier years. A single missing component counts as
  # zero, but the result stays NA when BOTH components are missing;
  # rowSums(na.rm = TRUE) alone would report 0 for such rows.
  mutate(AAPIM = replace(rowSums(.[, c("EFYASIAM", "EFYNHPIM")], na.rm = TRUE),
                         is.na(EFYASIAM) & is.na(EFYNHPIM), NA),
         AAPIW = replace(rowSums(.[, c("EFYASIAW", "EFYNHPIW")], na.rm = TRUE),
                         is.na(EFYASIAW) & is.na(EFYNHPIW), NA),
         AAPIT = replace(rowSums(.[, c("EFYASIAT", "EFYNHPIT")], na.rm = TRUE),
                         is.na(EFYASIAT) & is.na(EFYNHPIT), NA)) %>%
  # remove the component columns and EFFYALEV, none of which appear in the
  # final `enrollments` column list
  subset(., select = -c(EFYASIAM, EFYNHPIM, EFYASIAW, EFYNHPIW, EFYASIAT, EFYNHPIT, EFFYALEV)) %>% 
  # harmonised column name = EFY* column it is taken from
  rename(NONRESM = EFYNRALM,
         NONRESW = EFYNRALW,
         NONREST = EFYNRALT,
         
         BLACKM = EFYBKAAM,
         BLACKW = EFYBKAAW,
         BLACKT = EFYBKAAT,
         
         AIANM = EFYAIANM,
         AIANW = EFYAIANW,
         AIANT = EFYAIANT,

         HISPM = EFYHISPM,
         HISPW = EFYHISPW,
         HISPT = EFYHISPT,
         
         WHITEM = EFYWHITM,
         WHITEW = EFYWHITW,
         WHITET = EFYWHITT,
         
         UNRACEM = EFYUNKNM,
         UNRACEW = EFYUNKNW,
         UNRACET = EFYUNKNT,
         
         TOTALM = EFYTOTLM,
         TOTALW = EFYTOTLW,
         TOTAL = EFYTOTLT,
         
         MULTRACEM = EFY2MORM,
         MULTRACEW = EFY2MORW,
         MULTRACET = EFY2MORT)
  

# 2022: a single yearly file, cleaned on its own

enr2022 <- enr2022 %>% 
  # remove four EFYGU* columns, which are not part of the final
  # `enrollments` column list
  subset(., select = -c(EFYGUUN, EFYGUAN, EFYGUTOT, EFYGUKN)) %>% 
  # drop every column whose name contains "XEY" or "XEFY"
  select(-contains(c("XEY", "XEFY"))) %>% 
  # drop rows coded -2 in EFFYLEV; rows with a missing EFFYLEV are dropped as
  # well, because filter() keeps only rows where the condition is TRUE
  filter(EFFYLEV != "-2") %>% 
  # replace the level codes with readable labels (no "First professional"
  # code is mapped for this year)
  mutate(EFFYLEV = recode(EFFYLEV, 
                          '1' = 'All students total', 
                          '2' = 'Undergraduate', 
                          '4' = 'Graduate')) %>% 
  mutate(LSTUDY = recode(LSTUDY, 
                         '1' = 'Undergraduate', 
                         '3' = 'Graduate', 
                         '999' = 'Generated total')) %>% 
  # Combine the EFYASIA* and EFYNHPI* counts into single AAPI* columns, the
  # layout used for the earlier years. A single missing component counts as
  # zero, but the result stays NA when BOTH components are missing;
  # rowSums(na.rm = TRUE) alone would report 0 for such rows.
  mutate(AAPIM = replace(rowSums(.[, c("EFYASIAM", "EFYNHPIM")], na.rm = TRUE),
                         is.na(EFYASIAM) & is.na(EFYNHPIM), NA),
         AAPIW = replace(rowSums(.[, c("EFYASIAW", "EFYNHPIW")], na.rm = TRUE),
                         is.na(EFYASIAW) & is.na(EFYNHPIW), NA),
         AAPIT = replace(rowSums(.[, c("EFYASIAT", "EFYNHPIT")], na.rm = TRUE),
                         is.na(EFYASIAT) & is.na(EFYNHPIT), NA)) %>%
  # remove the component columns and EFFYALEV, none of which appear in the
  # final `enrollments` column list
  subset(., select = -c(EFYASIAM, EFYNHPIM, EFYASIAW, EFYNHPIW, EFYASIAT, EFYNHPIT, EFFYALEV)) %>% 
  # harmonised column name = EFY* column it is taken from
  rename(NONRESM = EFYNRALM,
         NONRESW = EFYNRALW,
         NONREST = EFYNRALT,
         
         BLACKM = EFYBKAAM,
         BLACKW = EFYBKAAW,
         BLACKT = EFYBKAAT,
         
         AIANM = EFYAIANM,
         AIANW = EFYAIANW,
         AIANT = EFYAIANT,

         HISPM = EFYHISPM,
         HISPW = EFYHISPW,
         HISPT = EFYHISPT,
         
         WHITEM = EFYWHITM,
         WHITEW = EFYWHITW,
         WHITET = EFYWHITT,
         
         UNRACEM = EFYUNKNM,
         UNRACEW = EFYUNKNW,
         UNRACET = EFYUNKNT,
         
         TOTALM = EFYTOTLM,
         TOTALW = EFYTOTLW,
         TOTAL = EFYTOTLT,
         
         MULTRACEM = EFY2MORM,
         MULTRACEW = EFY2MORW,
         MULTRACET = EFY2MORT)

# Stack the six cleaned period tables into one data frame, then free the
# per-period objects
enrollments <- rbind(
  enr04_07, enr08_10, enr11_13,
  enr14_19, enr20_21, enr2022
)
rm(enr04_07, enr08_10, enr11_13, enr14_19, enr20_21, enr2022)

# Keep the analysis columns in a fixed order: identifiers, totals, the
# race/ethnicity counts, then the institution characteristics
enrollments <- enrollments %>%
  select(
    UNITID, YEAR, EFFYLEV, LSTUDY,
    TOTAL, TOTALM, TOTALW,
    WHITEM, WHITEW, WHITET,
    BLACKM, BLACKW, BLACKT,
    HISPM, HISPW, HISPT,
    AAPIM, AAPIW, AAPIT,
    AIANM, AIANW, AIANT,
    UNRACEM, UNRACEW, UNRACET,
    MULTRACEM, MULTRACEW, MULTRACET,
    NONRESM, NONRESW, NONREST,
    STABBR, SECTOR, ICLEVEL, HLOFFER,
    UGOFFER, GROFFER, HOSPITAL, MEDICAL,
    CARNEGIE, CYACTIVE
  )

3 Enrollment Figures

The following figures have been developed to analyze enrollments by various institution characteristics. Data was downloaded from IPEDS for 2004-2022 and contains the unduplicated head count of students enrolled over a 12-month period for both undergraduate and graduate levels. These enrollment data are particularly valuable for institutions that use non-traditional calendar systems and offer short-term programs. Because this enrollment measure encompasses an entire year, it provides a more complete picture of the number of students these schools serve. Each record is uniquely defined by the variables IPEDS ID (UNITID) and level of enrollment (EFFYLEV). Each record contains the total enrollment, the enrollment for men and women, and the enrollment for men and women in each race/ethnicity category.

3.1 By Level of Study

# Total enrollment per year for each level of study.
# "First professional" is excluded: only the 2004-2010 recodes produce that
# label, so it would not form a series across all years.

total_summary <- enrollments %>%
  filter(EFFYLEV != "First professional") %>% 
  group_by(YEAR, EFFYLEV) %>%
  # na.rm = TRUE (as in the other summaries) so that one missing TOTAL does
  # not turn a whole year/level sum into NA; .groups = "drop" returns an
  # ungrouped table
  summarize(TOTAL = sum(TOTAL, na.rm = TRUE), .groups = "drop")

# fix the legend order of the levels
total_summary$EFFYLEV <- factor(total_summary$EFFYLEV, levels = c("All students total", "Undergraduate", "Graduate"))


# one line-plus-points series per level of study
ggplot(total_summary, aes(x = YEAR, y = TOTAL, color = EFFYLEV, group = EFFYLEV)) +
  geom_line() +
  geom_point() +
  labs(title = "Enrollments by Level of Study",
       x = "Year",
       y = "Total Number of Students",
       color = "Level of Study") +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()

3.2 By Sex of Students

# Yearly enrollment of women and men, taken from the
# "All students total" rows only

gender_summary <- enrollments %>%
  filter(EFFYLEV == "All students total") %>%
  group_by(YEAR) %>%
  summarize(
    TOTALW = sum(TOTALW, na.rm = TRUE),
    TOTALM = sum(TOTALM, na.rm = TRUE)
  )

# One line-plus-points series per sex; colours are keyed by the series label
ggplot(data = gender_summary, mapping = aes(x = YEAR)) +
  geom_line(mapping = aes(y = TOTALW, color = "Women")) +
  geom_point(mapping = aes(y = TOTALW, color = "Women")) +
  geom_line(mapping = aes(y = TOTALM, color = "Men")) +
  geom_point(mapping = aes(y = TOTALM, color = "Men")) +
  scale_color_manual(values = c(Men = "#00Bfc4", Women = "#F8766D")) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Enrollments by Sex of Student") +
  xlab("Year") +
  ylab("Total Number of Students") +
  labs(color = "Sex of Student") +
  theme_minimal()

3.3 By Institutional Level

# Total enrollment per year for each institution level (ICLEVEL), using the
# "All students total" rows only; rows coded -3 in ICLEVEL are excluded
level_summary <- enrollments %>% 
  filter(ICLEVEL != "-3", EFFYLEV == "All students total") %>% 
  group_by(YEAR, ICLEVEL) %>% 
  # .groups = "drop" returns an ungrouped table
  summarize(LVLTOTL = sum(TOTAL, na.rm = TRUE), .groups = "drop")

# Label the ICLEVEL codes 1-3 for the legend; any other code becomes NA.
# (The third label previously ended with a stray closing parenthesis.)
level_summary$ICLEVEL <- factor(level_summary$ICLEVEL, levels = c(1, 2, 3),
                               labels = c("Four or more years", "At least 2 but less than 4 years", "Less than 2 years/Below Associate's"))

# one line-plus-points series per institution level
ggplot(level_summary, aes(x = YEAR, y = LVLTOTL, color = ICLEVEL, group = ICLEVEL)) +
  geom_line() +
  geom_point() +
  labs(title = "Total Number of Students by Level of Institution",
       x = "Year",
       y = "Total Number of Students",
       color = "Level of Institution") +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()

3.4 By Red vs Blue States

# State abbreviations assigned to each category
blue_states <- c(
  "CA", "NY", "IL", "WA", "OR", "NV", "CO", "NM", "VA", "MD", "DE", "NJ", "CT",
  "RI", "MA", "VT", "NH", "ME", "HI", "AZ", "GA", "MN", "WI", "MI", "PA", "DC"
)
red_states <- c(
  "TX", "FL", "NC", "SC", "TN", "AL", "MS", "LA", "AR", "MO", "KY", "WV", "OH",
  "IN", "OK", "KS", "NE", "SD", "ND", "MT", "WY", "ID", "UT", "AK", "IA"
)

# Tag every enrollment row with its state category; anything in neither list
# (including a missing STABBR) is labelled "Other"
enrollments$PTX <- case_when(
  enrollments$STABBR %in% blue_states ~ "Blue",
  enrollments$STABBR %in% red_states ~ "Red",
  TRUE ~ "Other"
)

# Mean TOTAL per year, level of study and state category, for Blue/Red rows
# only and without the "First professional" level
redblue_summary <- enrollments %>%
  filter(
    PTX %in% c("Blue", "Red"),
    EFFYLEV != "First professional"
  ) %>%
  group_by(YEAR, EFFYLEV, PTX) %>%
  summarize(AVGTOTL = mean(TOTAL, na.rm = TRUE))

# fix the order of the levels of study
redblue_summary$EFFYLEV <- factor(
  redblue_summary$EFFYLEV,
  levels = c("All students total", "Undergraduate", "Graduate")
)

# One colour per category/level pair; the names follow the labels produced by
# interaction(PTX, EFFYLEV)
custom_colors <- c(
  "Blue.All students total" = "#112e51",
  "Red.All students total" = "#981b1e",
  "Blue.Undergraduate" = "#005fa3",
  "Red.Undergraduate" = "#cc393e",
  "Blue.Graduate" = "#0092D1",
  "Red.Graduate" = "#e59393"
)

ggplot(
  data = redblue_summary,
  mapping = aes(
    x = YEAR,
    y = AVGTOTL,
    color = interaction(PTX, EFFYLEV),
    group = interaction(PTX, EFFYLEV)
  )
) +
  geom_line() +
  geom_point() +
  scale_color_manual(values = custom_colors) +
  labs(
    title = "Average Number of Students by Level of Study and State Category",
    x = "Year",
    y = "Average Number of Students",
    color = "State Category and Level of Study"
  ) +
  theme_minimal()

4 Degrees Conferred

The following figures have been developed to analyze degree completion by various student characteristics. The data covers 2004-2022 and catalogs the number of programs offered and the number of programs offered via distance education, by award level. The type of program is categorized according to the 2020 Classification of Instructional Programs (CIP), a detailed coding system for postsecondary instructional programs. The 2010 CIP was used to categorize programs for Completions data collected from 2009-10 through 2018-19.

For the purposes of this summary, Doctor’s degrees include medicine (MD), dentistry (DDS and DMD), and other medical specialty degrees that were classified as first-professional prior to 2010-11.

4.1 By Sex of Students and Level of Degree

# Reshape `health` to long format: every <Degree>_<Gender> column becomes one
# row per original row, with the column name split at "_" into Degree and
# Gender and the cell value stored in Count
health_summary <- health %>%
  pivot_longer(cols = c(Bachelors_Total, Bachelors_Male, Bachelors_Female,
                        Masters_Total, Masters_Male, Masters_Female,
                        Doctor_Total, Doctor_Male, Doctor_Female),
               names_to = c("Degree", "Gender"),
               names_sep = "_",
               values_to = "Count")

health_summary$Degree <- factor(health_summary$Degree, levels = c("Bachelors", "Masters", "Doctor"))

# Line-plus-points plot of degrees conferred per year for one degree level,
# with one series per value of Gender.
#   data          long-format table with Year, Degree, Gender and Count columns
#   degree_level  value of Degree to plot ("Bachelors", "Masters" or "Doctor")
#   degree_label  possessive form used at the start of the title
# Returns the ggplot object, so a top-level call prints the figure.
plot_health_degrees <- function(data, degree_level, degree_label) {
  data %>% 
    filter(Degree == degree_level) %>% 
    ggplot(., aes(x = Year, y = Count, color = Gender, group = Gender)) +
    geom_line() +
    geom_point() +
    # Year is on a discrete axis; only every fourth label is shown
    scale_x_discrete(breaks = c("2004-05", "2008-09", "2012-13", "2016-17", "2020-21")) +
    labs(title = paste0(degree_label, " Degrees in Health Professions and Related Programs \nConferred by Postsecondary Institutions"),
         x = "Year",
         y = "Total Number of Degrees",
         color = "Sex of Student") +
    scale_y_continuous(labels = scales::comma) +
    # unnamed values are matched to the Gender values in their scale order
    scale_color_manual(values=c("#F8766D","#00Bfc4","#7CAE00")) +
    theme_minimal()
}

# Bachelors
plot_health_degrees(health_summary, "Bachelors", "Bachelor's")

# Masters
plot_health_degrees(health_summary, "Masters", "Master's")

#Doctors

plot_health_degrees(health_summary, "Doctor", "Doctor's")

IPEDS Analysis (2004-2022)

Esther Sanchez

2024-06-27