path <- "/Users/frenandezlawrence/Downloads/Fall_2022-2024_Firsttime_Fulltime_Cohort_data_with_Unmet_Need.xlsx"
sheets <- excel_sheets(path)
sheets
[1] "Variables" "Fall 2022 Financial Aid" "Fall 2022 Enrollment" "Fall 2023 Financial Aid" "Fall 2023 Enrollment" "Fall 2024 Financial Aid Prelim"
[7] "Fall 2024 Enrollment Prelim" "Data Dictionary"
df_example <- read_xlsx(path, sheet = sheets[2]) %>%
clean_names()
head(df_example)
#The below is used to standardize Dependency Status column to ensure that all three years represented using the same format.
dep_lookup <- c(
"0" = "Unkn",
"1" = "Dependent",
"2" = "Independent",
"3" = "Unkn"
)
#The below is used to standardize Race column to ensure that all three years represented using the same format.
race_text_to_code <- c(
"African-American"= 1,
"Native-American" = 2,
"Asian" = 3,
"Hispanic" = 4,
"White" = 5,
"Native-Hawaiian" = 6,
"Mulitracial" = 7,
"International" = 8
)
## Enrollment 2022 Data
enr22_raw <- read_xlsx(path, sheet = "Fall 2022 Enrollment")
enr22 <- enr22_raw %>%
clean_names() %>%
{ if (!"cip_code" %in% names(.)) mutate(., cip_code = NA_character_) else . } %>%
mutate(
study_id = as.character(study_id),
zip_code = str_pad(as.character(zip_code), 5, "left", "0"),
retained_fall2023 = case_when(retained_fall2023 %in% c("Yes","Y","TRUE") ~ TRUE,
retained_fall2023 %in% c("No","N","FALSE") ~ FALSE,
TRUE ~ NA),
casa_student = case_when(casa_student %in% c("Yes","Y","TRUE") ~ TRUE,
casa_student %in% c("No","N","FALSE") ~ FALSE,
TRUE ~ NA),
returned_spring2023 = case_when(returned_spring2023 %in% c("Yes","Y","TRUE") ~ TRUE,
returned_spring2023 %in% c("No","N","FALSE") ~ FALSE,
TRUE ~ NA),
gender = factor(gender, levels = c("F","M")),
tuition_status = as.factor(tuition_status),
newrace = case_when(
newrace %in% names(race_text_to_code) ~ race_text_to_code[newrace],
TRUE ~ as.numeric(newrace)),
high_school_gpa = as.numeric(high_school_gpa),
sat_2016_math_score = as.numeric(sat_2016_math_score),
sat_2016_ebrw_score = as.numeric(sat_2016_ebrw_score),
age = as.integer(age),
cip_code = as.character(cip_code),
admission_test_flag = as.integer(admission_test_flag),,
distance_education_enrollment = as.integer(distance_education_enrollment),
act_math_score = as.integer(act_math_score),
act_english_score = as.integer(act_english_score),
act_reading_score = as.integer(act_reading_score),
act_science_score = as.integer(act_science_score),
act_composite_score = as.integer(act_composite_score),
admission_exemption = as.integer(admission_exemption),
math_remedial_assessment = as.integer(math_remedial_assessment),
english_remedial_assessment = as.integer(english_remedial_assessment),
reading_remedial_assessment = as.integer(reading_remedial_assessment),
military_status = as.integer(military_status),
majors = as.character(majors),
term_credit_hours_attempted2022fall = as.integer(term_credit_hours_attempted2022fall),
semesterhoursearned2022fall = as.integer(semesterhoursearned2022fall),
cumulativegpa2022fall = as.numeric(cumulativegpa2022fall),
term_credit_hours_attempted2023spring = as.integer(term_credit_hours_attempted2023spring),
semesterhoursearned2023spring = as.integer(semesterhoursearned2023spring),
semestergpa2023spring = as.numeric(semestergpa2023spring),
cumulativehoursearned2023spring = as.integer(cumulativehoursearned2023spring),
cumulativegpa2023spring = as.numeric(cumulativegpa2023spring),
college = as.character(college),
collection_term = "Fall",
collection_year = 2022
)
## Financial Aid 2022 Data
fa22_raw <- read_xlsx(path, sheet = "Fall 2022 Financial Aid")
fa22 <- fa22_raw %>%
clean_names() %>%
transmute(
study_id = as.character(study_id),
family_size = as.integer(family_size),
dependency_status = factor(dependency_status, levels = c("Dependent","Independent","Unkn")),
expected_family_contribution = as.numeric(expected_family_contribution),
cost_of_attendance = as.numeric(cost_of_attendance),
financial_aid_disbursement_amount = as.numeric(financial_aid_disbursement_amount),
pell_recipient = case_when(pell_recipient %in% c("Yes","Y","TRUE") ~ TRUE,
pell_recipient %in% c("No","N","FALSE") ~ FALSE,
TRUE ~ NA),
unmetneed_amount = as.numeric(unmetneed_amount),
unmetneed_category = as.integer(unmetneed_category),
family_income_category = as.integer(family_income_category),
first_generation = factor(first_generation, levels = c("Yes","No","Unkn"))
)
cohort22 <- left_join(enr22, fa22, by = "study_id")
##Looping and Data Cleansing of All Three Cohorts
year_sheets <- tribble(
~year, ~fa_sheet, ~enr_sheet,
2022, "Fall 2022 Financial Aid", "Fall 2022 Enrollment",
2023, "Fall 2023 Financial Aid", "Fall 2023 Enrollment",
2024, "Fall 2024 Financial Aid Prelim", "Fall 2024 Enrollment Prelim"
)
vars_raw <- read_xlsx(path, sheet = "Variables", col_names = FALSE)
New names:
• `` -> `...1`
• `` -> `...2`
all_cohorts <- purrr::pmap_dfr(
.l = list(
year = year_sheets$year,
fa_sheet = year_sheets$fa_sheet,
enr_sheet = year_sheets$enr_sheet),
.f = function(year, fa_sheet, enr_sheet) {
next_retained <- paste0("retained_fall", year + 1)
next_returned <- paste0("returned_spring", year + 1)
next_termcreditattemptedfall <- paste0("term_credit_hours_attempted", year,"fall")
next_semesterhoursearnedfall <- paste0("semesterhoursearned", year,"fall")
next_cumulativegpafall <- paste0("cumulativegpa", year,"fall")
next_termcreditattemptedspring <- paste0("term_credit_hours_attempted", year + 1,"spring")
next_semesterhoursearnedspring <- paste0("semesterhoursearned", year + 1,"spring")
next_semestergpaspring <- paste0("semestergpa", year + 1,"spring")
next_cumulativehoursearnedspring <- paste0("cumulativehoursearned", year + 1,"spring")
next_cumulativegpaspring <- paste0("cumulativegpa", year + 1,"spring")
#Enrollment Fields Cleaning
enr_raw <- read_xlsx(path, sheet = enr_sheet)
enr <- enr_raw %>%
clean_names() %>%
{ if (!"cip_code" %in% names(.)) mutate(., cip_code = NA_character_) else . } %>%
mutate(
across(
any_of(next_retained),
~ case_when(
.x %in% c("Yes","Y","TRUE") ~ TRUE,
.x %in% c("No","N","FALSE") ~ FALSE,
TRUE ~ NA)),
study_id = as.character(study_id),
zip_code = str_pad(as.character(zip_code), 5, "left", "0"),
casa_student = (casa_student == "Yes"),
across(
any_of(next_retained),
~ case_when(
.x %in% c("Yes","Y","TRUE") ~ TRUE,
.x %in% c("No","N","FALSE") ~ FALSE,
TRUE ~ NA)),
gender = factor(gender, levels = c("F","M")),
tuition_status = factor(tolower(as.character(tuition_status))),
newrace_str = as.character(newrace),
newrace = if_else(
newrace_str %in% names(race_text_to_code),
race_text_to_code[newrace_str],
as.numeric(newrace_str)),
high_school_gpa = as.numeric(high_school_gpa),
sat_2016_math_score = as.numeric(sat_2016_math_score),
sat_2016_ebrw_score = as.numeric(sat_2016_ebrw_score),
age = as.integer(age),
cip_code = as.character(cip_code),
admission_test_flag = as.integer(admission_test_flag),
admission_exemption = as.character(admission_exemption),
distance_education_enrollment = as.integer(distance_education_enrollment),
act_math_score = as.integer(act_math_score),
act_english_score = as.integer(act_english_score),
act_reading_score = as.integer(act_reading_score),
act_science_score = as.integer(act_science_score),
act_composite_score = as.integer(act_composite_score),
admission_exemption = as.integer(admission_exemption),
math_remedial_assessment = as.integer(math_remedial_assessment),
english_remedial_assessment = as.integer(english_remedial_assessment),
reading_remedial_assessment = as.integer(reading_remedial_assessment),
military_status = as.integer(military_status),
majors = as.character(majors),
#Handling of Dynamic Column Term Credit Attempted Fall 2022-2024
across(any_of(next_termcreditattemptedfall),
as.integer),
#Handling of Dynamic Column Semester Hours Earned Fall 2022-2024
across(any_of(next_semesterhoursearnedfall),
as.integer),
#Handling of Dynamic Column Cumulative GPA Fall 2022-2024
across(any_of(next_cumulativegpafall ),
~ round(as.numeric(.x), 2)),
#Handling of Dynamic Column Term Credit Attempted Spring 2023-2025
across(any_of(next_termcreditattemptedspring),
as.integer),
#Handling of Dynamic Column Semester Hours Earned Spring 2023-2025
across(any_of( next_semesterhoursearnedspring),
as.integer),
#Handling of Dynamic Column GPA Spring 2023-2025
across(any_of(next_semestergpaspring),
~ round(as.numeric(.x), 2)),
#Handling of Dynamic Column Cumulative Hours Earned Spring 2023-2025
across(any_of(next_cumulativehoursearnedspring),
as.integer),
#Handling of Dynamic Column Cumulative GPA Spring 2023-2025
across(any_of(next_cumulativegpaspring ),
~ round(as.numeric(.x), 2)),
college = as.character(college),
collection_term = "Fall",
collection_year = year)
#Enrollment Data Cleaning
fa_raw <- read_xlsx(path, sheet = fa_sheet)
fa <- fa_raw %>%
clean_names() %>%
transmute(
study_id = as.character(study_id),
family_size = as.integer(family_size),
dependency_status = as.character(dependency_status),
#Mapping any numeric values using dep_lookup, Otherwise we will keep text
dependency_status = if_else(
dependency_status %in% names(dep_lookup),
dep_lookup[dependency_status],
dependency_status),
dependency_status = factor(
dependency_status,
levels = c("Dependent","Independent","Unkn")),
expected_family_contribution = as.numeric(expected_family_contribution),
cost_of_attendance = as.numeric(cost_of_attendance),
financial_aid_disbursement_amount = as.numeric(financial_aid_disbursement_amount),
pell_recipient = case_when(pell_recipient %in% c("Yes","Y","TRUE") ~ TRUE,
pell_recipient %in% c("No","N","FALSE") ~ FALSE,
TRUE ~ NA),
unmetneed_amount = as.numeric(unmetneed_amount),
unmetneed_category = as.integer(unmetneed_category),
family_income_category = as.integer(family_income_category),
first_generation = factor(first_generation, levels = c("Yes","No","Unkn")))
#Joining and Returning the data
df_year <- left_join(enr, fa, by = "study_id")%>%
#Filtering Out the Columns that have over 70% of data missing
select(
-sat_2016_math_score,
-sat_2016_ebrw_score,
-act_math_score,
-act_english_score,
-act_reading_score,
-act_science_score,
-act_composite_score,
-newrace_str) %>%
mutate(
admission_exemption = as.character(admission_exemption),
admission_exemption = coalesce(na_if(admission_exemption, ""), "1"),
admission_exemption = as.integer(admission_exemption)) %>%
#Replacing blanks with 9 which represents race as unknown
mutate(
newrace = as.character(newrace),
newrace = coalesce(na_if(newrace, ""), "9"),
newrace = as.integer(newrace)) %>%
#Handling Dependency Status Blanks
mutate(
dependency_status = as.character(dependency_status),
dependency_status = coalesce(na_if(dependency_status, ""), "Unkn"),
dependency_status = factor(dependency_status,levels = c("Dependent","Independent","Unkn"))) %>%
#Identifying, Replacing Outliers and Imputing Missing Values with Mean
mutate(
across(
c(expected_family_contribution, unmetneed_amount, cost_of_attendance, financial_aid_disbursement_amount, high_school_gpa,family_size),
~ {
#Computing the mean for each column on a yearly basis
med <- median(.x, na.rm = TRUE)
#Imputing Missing Values
x0 <- if_else(is.na(.x), med, .x)
#Computing IQR Boundaries
q <- quantile(x0, c(0.25, 0.75), na.rm = TRUE)
iqr <- diff(q)
lower <- q[1] - 1.5 * iqr
upper <- q[2] + 1.5 * iqr
#Replacing outliers with the year’s mean
ifelse(x0 < lower | x0 > upper, med, x0)
}
)
) %>%
#Dropping Records which are NA since it is less than 5% of data set
filter(
if_all(
any_of(c(
paste0("term_credit_hours_attempted", year + 1, "spring"),
paste0("semestergpa", year + 1, "spring"),
paste0("cumulativehoursearned", year + 1, "spring"),
paste0("cumulativegpa", year + 1, "spring")
)),
~ !is.na(.x)
)
)
##The Below Lines of Codes represents lookup tables for varibales.
##For Numeric Variables a label has been created
##For Text Fields a Code has been created to used for modelling
family_inc_lookup <- tibble(
family_income_category = 1:10,
family_income_label = c("Lowest Thru 25,000", "25,001 Thru 50,000", "50,001 Thru 75,000",
"75,001 Thru 100,000", "100,001 Thru 125,000", "125,001 Thru 150,000",
"150,001 Thru 175,000", "175,001 Thru 200,000", "200,001 Thru 225,000",
"225,001+"))
unmetneed_cat_lookup <- tibble(
unmetneed_category = 1:7,
unmetneed_category_label = c("$1 Thru $5000", "$5001 Thru $7500", "$7501 Thru $10000",
"$10001 Thru $15000", "$15001 Thru Highest", "No Unmet Need",
"Unknown Unmet Need"))
distance_edu_lookup <- tibble(
distance_education_enrollment = 1:3,
distance_education_label = c("Enrolled Exclusively in Distance Education", "Enrolled in Some but Not All Distance Education", "Not Enrolled in Any Distance Education"))
admission_test_lookup <- tibble(
admission_test_flag = 0:5,
admission_test_flag_label = c("No Test Required for Admission", "SAT Test Required", "ACT Test Required",
"Either ACT or SAT Required", "Institution Test Required", "Because of Admission Exemption"))
military_status_lookup <- tibble(
military_status = 1:5,
military_status_label = c("Active Duty Member of Any of the US Uniform Forces", "Veteran or Former Active Duty Member of Any of The US Uniformed Forces", "Reserve Duty Member of Any of The US Armed Forces, Including The National Guard", "Spouse or Dependent Child of an Active, Reserve, or Former Member of The US Uniformed Forces", "None of the above"))
frace_lookup <- tibble(
newrace = 1:9,
race_label = c("African-Americans", "Native-Americans", "Asian",
"Hispanic", "White", "Native-Hawaiians",
"MultiRacials", "International","Unknown"))
dependency_status_lookup <- tibble(
dependency_status = c("Unkn", "Dependent","Independent"),
dependency_code = 0:2)
gender_lookup <- tibble(
gender = c("M","F"),
gender_code = 1:2)
tuition_status_lookup <- tibble(
tuition_status = c("in-state", "out-of-state"),
tuition_status_code = 2:3)
first_generation_lookup <- tibble(
first_generation = c("No", "Yes", "Unkn"),
first_generation_code = c(0,1,9))
#Adding the Additional Columns to dataset
df_year <- df_year %>%
left_join(family_inc_lookup, by = "family_income_category") %>%
left_join(unmetneed_cat_lookup, by = "unmetneed_category") %>%
left_join(distance_edu_lookup, by = "distance_education_enrollment") %>%
left_join(admission_test_lookup, by = "admission_test_flag") %>%
left_join(military_status_lookup, by = "military_status") %>%
left_join(frace_lookup, by = "newrace") %>%
left_join(dependency_status_lookup, by = "dependency_status") %>%
left_join(gender_lookup, by = "gender") %>%
left_join(tuition_status_lookup, by = "tuition_status") %>%
left_join(first_generation_lookup, by = "first_generation")
#Writing Each Year Data to A separate CSV to Upload to Tableau
file_name <- paste0("/Users/frenandezlawrence/Downloads/Team_B",year,"Cleaned_Data.csv")
write.csv(df_year, file = file_name, row.names = FALSE)
df_year
}
)
#Performing Uniqueness Check
all_cohorts %>%
count(collection_year, study_id) %>%
filter(n > 1)
# Performing Missingness Check
all_cohorts %>%
summarise_all(~ mean(is.na(.))) %>%
pivot_longer(everything(), names_to = "var", values_to = "pct_missing")
# Perfoming a final look at dataset
#The NAs displaying in the glimpse is for the dynamic columns which are not not applicable for all years.
glimpse(all_cohorts)
Rows: 5,957
Columns: 70
$ collection_term <chr> "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fall", "Fal…
$ collection_year <dbl> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, …
$ study_id <chr> "20227000000002", "20227000000004", "20227000000005", "20227000000006", "20227000000007", "20227000000008", "20227000000010", "20227000000012", "20227000000…
$ gender <chr> "F", "F", "F", "M", "F", "M", "F", "M", "F", "M", "F", "M", "M", "F", "F", "M", "F", "M", "M", "M", "F", "M", "M", "F", "F", "M", "F", "F", "F", "F", "M", "…
$ zip_code <chr> "21220", "10500", "21045", "21085", "21217", "21244", "21212", "21043", "21206", "08000", "20708", "21040", "20695", "21117", "21218", "21500", "21213", "19…
$ tuition_status <chr> "out-of-state", "out-of-state", "in-state", "in-state", "in-state", "in-state", "in-state", "in-state", "in-state", "out-of-state", "in-state", "in-state", …
$ high_school_gpa <dbl> 2.29, 3.00, 3.10, 1.97, 4.00, 2.71, 2.80, 2.76, 2.72, 3.30, 2.35, 2.45, 3.04, 3.88, 3.00, 3.50, 2.84, 2.96, 3.19, 2.67, 3.65, 2.81, 3.10, 3.08, 3.65, 3.20, …
$ distance_education_enrollment <int> 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 3, 3, 3, 3, 2, 2, 1, 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, …
$ admission_test_flag <int> 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 3, 5, 3, 5, …
$ admission_exemption <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ math_remedial_assessment <int> 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, …
$ english_remedial_assessment <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ reading_remedial_assessment <int> 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, …
$ military_status <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, …
$ majors <chr> "NURS", "PSYC", "SOWK", "AREN", "SWAN", "MKTG", "SWAN", "COSC", "SOCI", "ACCT", "NURS", "ACCT", "HLTH", "SOWK", "PSYC", "BUAD", "SWAN", "AREN", "EEGR", "PHE…
$ cip_code <chr> "51.3801", "42.0101", "44.0701", "4.0902", "10.0304", "52.1401", "10.0304", "11.0101", "45.1101", "52.0301", "51.3801", "52.0301", "51.2207", "44.0701", "42…
$ age <int> 29, 18, 23, 19, 18, 19, 22, 20, 20, 20, 20, 20, 20, 21, 23, 19, 19, 19, 19, 19, 18, 19, 19, 20, 18, 19, 18, 19, 17, 18, 19, 18, 19, 18, 18, 19, 18, 18, 18, …
$ newrace <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 5, 7, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, …
$ term_credit_hours_attempted2022fall <int> 15, 14, 12, 16, 14, 13, 14, 15, 14, 15, 18, 16, 15, 14, 17, 15, 17, 17, 12, 15, 16, 16, 13, 16, 15, 16, 15, 14, 16, 16, 13, 13, 14, 16, 16, 16, 16, 16, 14, …
$ semesterhoursearned2022fall <int> 15, 14, 12, 10, 14, 13, 14, 14, 13, 0, 18, 16, 15, 14, 17, 15, 17, 14, 12, 11, 16, 10, 13, 13, 15, 3, 11, 7, 12, 10, 13, 10, 14, 16, 10, 16, 16, 1, 14, 17, …
$ cumulativegpa2022fall <dbl> 3.36, 3.43, 3.67, 1.15, 3.40, 1.92, 4.00, 3.20, 2.70, 0.00, 2.64, 2.44, 2.20, 3.10, 4.00, 2.50, 4.00, 2.18, 3.67, 1.45, 3.58, 1.90, 1.92, 2.46, 2.73, 0.69, …
$ retained_fall2023 <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TR…
$ casa_student <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
$ returned_spring2023 <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Y…
$ term_credit_hours_attempted2023spring <int> 16, 16, 15, 12, 15, 15, 13, 14, 13, 12, 14, 12, 17, 13, 18, 17, 13, 12, 16, 12, 13, 13, 14, 16, 15, 13, 14, 13, 8, 15, 16, 9, 14, 16, 13, 20, 15, 13, 16, 13…
$ semesterhoursearned2023spring <int> 16, 16, 9, 6, 14, 9, 13, 0, 7, 0, 14, 12, 6, 13, 18, 14, 13, 8, 16, 3, 13, 13, 10, 3, 15, 3, 14, 7, 0, 6, 16, 6, 10, 16, 7, 20, 15, 3, 16, 13, 12, 3, 8, 16,…
$ semestergpa2023spring <dbl> 3.12, 2.81, 2.75, 1.00, 2.73, 1.80, 3.60, 0.00, 1.23, 0.00, 1.93, 2.50, 0.46, 3.08, 4.00, 2.79, 3.77, 3.00, 3.19, 0.00, 3.77, 2.77, 3.20, 0.60, 2.67, 0.46, …
$ cumulativehoursearned2023spring <int> 31, 30, 21, 16, 30, 16, 27, 14, 20, 0, 32, 28, 21, 27, 35, 32, 30, 22, 53, 14, 29, 23, 23, 16, 30, 6, 22, 11, 12, 19, 29, 16, 24, 32, 20, 36, 31, 4, 33, 33,…
$ cumulativegpa2023spring <dbl> 3.22, 3.10, 3.21, 1.23, 3.08, 2.09, 3.83, 3.43, 1.87, 0.00, 2.29, 2.46, 1.39, 3.09, 4.00, 2.62, 3.90, 2.53, 3.39, 0.94, 3.68, 2.39, 2.48, 1.65, 2.70, 0.75, …
$ college <chr> "SCHP", "CLA", "SOWK", "SAP", "CLA", "SBM", "CLA", "SCMNS", "CLA", "SBM", "SCHP", "SBM", "SCHP", "SOWK", "CLA", "SBM", "CLA", "SAP", "SOE", "SEUS", "SCHP", …
$ family_size <dbl> 2, 3, 3, 3, 6, 2, 2, 4, 3, 2, 3, 3, 4, 3, 1, 3, 5, 5, 2, 2, 3, 4, 4, 3, 3, 1, 3, 6, 4, 3, 4, 2, 4, 3, 4, 2, 2, 4, 3, 6, 4, 2, 3, 2, 6, 3, 2, 3, 2, 3, 6, 4, …
$ dependency_status <chr> "Independent", "Dependent", "Unkn", "Independent", "Dependent", "Dependent", "Independent", "Dependent", "Dependent", "Dependent", "Dependent", "Dependent",…
$ expected_family_contribution <dbl> 0, 6451, 2469, 2789, 15651, 0, 0, 26802, 0, 6443, 448, 0, 709, 12127, 0, 2469, 928, 2469, 1350, 10447, 10490, 0, 24413, 0, 0, 0, 9900, 0, 5483, 2469, 2469, …
$ cost_of_attendance <dbl> 39589, 39533, 38249, 39552, 29061, 39589, 39552, 29061, 21229, 39533, 27777, 27777, 38249, 29061, 29117, 24802, 21229, 39533, 29080, 39533, 39533, 38249, 29…
$ financial_aid_disbursement_amount <dbl> 19797.0, 10444.0, 17910.5, 6845.0, 19872.0, 9695.0, 9409.0, 3000.0, 11359.0, 37021.0, 18447.0, 20297.0, 10145.0, 7444.0, 6895.0, 19623.0, 13289.0, 30358.0, …
$ pell_recipient <lgl> TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, T…
$ unmetneed_amount <dbl> 19792.0, 22638.0, 8394.5, 29918.0, -6462.0, 29894.0, 30143.0, -741.0, 9870.0, -3931.0, 8882.0, 7480.0, 27395.0, 9490.0, 22222.0, 8394.5, 7012.0, 8394.5, 191…
$ unmetneed_category <int> 5, 5, 7, 5, 6, 5, 5, 6, 3, 6, 3, 2, 5, 3, 5, 7, 2, 6, 5, 1, 6, 1, 6, 5, 4, 4, 6, 6, 5, 6, 6, 6, 5, 4, 6, 6, 1, 6, 6, 6, 6, 1, 6, 3, 5, 4, 6, 6, 5, 4, 3, 5, …
$ family_income_category <int> 2, 3, 10, 2, 7, 1, 1, 8, 2, 3, 3, 1, 1, 4, 10, 10, 3, 9, 2, 3, 3, 2, 5, 1, 2, 10, 3, 2, 3, 9, 9, 2, 1, 3, 9, 6, 1, 1, 5, 9, 8, 2, 2, 2, 3, 3, 6, 7, 4, 2, 3,…
$ first_generation <chr> "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No", "No", "No", "Yes", "Yes", "No", "No", "Yes", "No", "No", "No", "No", "No", "Yes", "No", "Yes", "Y…
$ family_income_label <chr> "25,001 Thru 50,000", "50,001 Thru 75,000", "225,001+", "25,001 Thru 50,000", "150,001 Thru 175,000", "Lowest Thru 25,000", "Lowest Thru 25,000", "175,001 T…
$ unmetneed_category_label <chr> "$15001 Thru Highest", "$15001 Thru Highest", "Unknown Unmet Need", "$15001 Thru Highest", "No Unmet Need", "$15001 Thru Highest", "$15001 Thru Highest", "N…
$ distance_education_label <chr> "Enrolled in Some but Not All Distance Education", "Enrolled in Some but Not All Distance Education", "Enrolled in Some but Not All Distance Education", "En…
$ admission_test_flag_label <chr> "Because of Admission Exemption", "Because of Admission Exemption", "Because of Admission Exemption", "Either ACT or SAT Required", "Because of Admission Ex…
$ military_status_label <chr> "None of the above", "None of the above", "None of the above", "None of the above", "None of the above", "None of the above", "None of the above", "None of …
$ race_label <chr> "African-Americans", "African-Americans", "African-Americans", "African-Americans", "African-Americans", "African-Americans", "African-Americans", "African-…
$ dependency_code <int> 2, 1, 0, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
$ gender_code <int> 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, …
$ tuition_status_code <int> 3, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 3, 2, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 2, 2, 2, 2, …
$ first_generation_code <dbl> 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, …
$ term_credit_hours_attempted2023fall <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ semester_hoursearned2023fall <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cumulativegpa2023fall <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ retained_fall2024 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ returned_in_2024spring <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ term_credit_hours_attempted2024spring <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ semesterhoursearned2024spring <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ semestergpa2024spring <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cumulativehoursearned2024spring <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cumulativegpa2024spring <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ currenthoursattempted2024fall <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ semesterhoursearned2024fall <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cgpa2024fall <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ retained_fall2025 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ returned_spring2025 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ currenthoursattempted2025spring <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ semesterhoursearned2025spring <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ semestergpa2025spring <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cumulativehoursearned2025spring <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cumulativegpa2025spring <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
write.csv(all_cohorts,
file = "/Users/frenandezlawrence/Downloads/Team_B_Cleaned_Cohort_Data_2022_2024.csv",
row.names = FALSE)