Using Program Info to standardize program code and title matching

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#standardized admitted program titles
PI<-program_info[,c("OLD_BANNER_CODE","PROGRAM_DESC")] #get old banner code and program names
names(PI)[names(PI) == 'OLD_BANNER_CODE'] <- 'Admitted.Program.Code'
data1<-left_join(data,PI, by='Admitted.Program.Code')

data1 <- data1[, c('Cohort', 'Academic.Period', 'ID', 'Transfer.Pathway.ID', 'Previous.Program.Code', 'Previous.Institution.Name', 'Previous.Institution.Type', 'Multiple.Previous.Institution.Count', 'Cumulative.Transfer.Credits.Awarded', 'Admitted.Program.Code', 'PROGRAM_DESC', 'Admitted.GPA', 'Admitted.Academic.Period', 'Current.Credential', 'Current.Program.Code', 'Current.Program.Title', 'Current.GPA', 'Current.Courses.Completed', 'Total.Program.Courses', 'Percent.Complete', 'Credit.Type', 'Registration.Status', 'Gender', 'Birth.Year', 'Immigration.Status', 'Postal.Code', 'Mother.Tongue', 'Semester', 'SZBMTCU_ELEMENT_43_COHORT_START_DATE', 'Current.Program.Length', 'Graduation.Date')]

names(data1)[names(data1) == 'PROGRAM_DESC'] <- 'Admitted.Program.Title'

#standardized current program titles
names(PI)[names(PI) == 'Admitted.Program.Code'] <- 'Current.Program.Code'
data1<-left_join(data1,PI, by='Current.Program.Code')

data1<-data1[,c(2:4,10:15,32,17,23:29,31)]

names(data1)[names(data1) == 'PROGRAM_DESC'] <- 'Current.Program.Title'

Find programs with Pathway into them

d<-na.omit(data1, cols="Transfer.Pathway.ID") #remove non-transfer
d<-d[!d$Transfer.Pathway.ID=="ZZZ",] #remove credit transfer
d2<-unique(d[,c("Admitted.Program.Code","Admitted.Program.Title")]) # get a list of programs with pathways into
d2$Pathway_Program <- "1" # add flag to programs with pathways


library(dplyr)
d3<-left_join(data1, d2, by=c("Admitted.Program.Code", "Admitted.Program.Title"))

d4<-d3[,c(2,7,1,3:6,8:20)]

Block_transfer_program_data <-d4 %>% filter(Pathway_Program == 1) # find all students admitted to programs with pathways

#only look at Degree programs that have transfer pathway 
BA_transfer_programs <- Block_transfer_program_data[grepl("BA$", Block_transfer_program_data$Admitted.Program.Code), ]

Convert Element 43 to Academic terms

BA_transfer_programs$SZBMTCU_ELEMENT_43_COHORT_START_DATE<-as.character(BA_transfer_programs$SZBMTCU_ELEMENT_43_COHORT_START_DATE)

# change 09 ending to 70 and 01 ending to 30
BA_transfer_programs$Element43_translated <- ifelse(
  substring(BA_transfer_programs$SZBMTCU_ELEMENT_43_COHORT_START_DATE, nchar(BA_transfer_programs$SZBMTCU_ELEMENT_43_COHORT_START_DATE) - 1) == "09",
  paste0(substr(BA_transfer_programs$SZBMTCU_ELEMENT_43_COHORT_START_DATE, 1, 4), "70"),
  ifelse(
    substring(BA_transfer_programs$SZBMTCU_ELEMENT_43_COHORT_START_DATE, nchar(BA_transfer_programs$SZBMTCU_ELEMENT_43_COHORT_START_DATE) - 1) == "01",
    paste0(substr(BA_transfer_programs$SZBMTCU_ELEMENT_43_COHORT_START_DATE, 1, 4), "30"),
    NA
  )
)

BA_transfer_programs<-BA_transfer_programs[,c(1:17,21,19,20)]

Find All students SEM2 Year2 Retention and Persistence

Find SEM2 persistence

BA_transfer_programs <- BA_transfer_programs %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(
    Sem2_Persisted = ifelse(
      endsWith(as.character(Admitted.Academic.Period), "70") & (Academic.Period == Admitted.Academic.Period + 60),
      1,
      ifelse(
        endsWith(as.character(Admitted.Academic.Period), "30") & (Academic.Period==Admitted.Academic.Period + 40) ,
        1,
        0
      )
    ),
    SEM2.GPA = ifelse(
      Sem2_Persisted == 1,
      ifelse(
        endsWith(as.character(Admitted.Academic.Period), "70"),
        Current.GPA[Academic.Period == (Admitted.Academic.Period + 60)],
        ifelse(
          endsWith(as.character(Admitted.Academic.Period), "30"),
          Current.GPA[Academic.Period == (Admitted.Academic.Period + 40)],
          NA
        )
      ),
      NA
    )
  )

Find Year 2 persistence

BA_transfer_programs <- BA_transfer_programs %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(
    Year2_Persisted = ifelse((Academic.Period == Admitted.Academic.Period + 100), 1, 0),
    Year2.GPA = ifelse((Academic.Period == Admitted.Academic.Period + 100), Current.GPA[Academic.Period == (Admitted.Academic.Period + 100)], NA)
  )

Find SEM2 retained

BA_transfer_programs <- BA_transfer_programs %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse(endsWith(as.character(Admitted.Academic.Period), "70") & (Academic.Period == Admitted.Academic.Period + 60)  & ((Admitted.Program.Code == Current.Program.Code) & (Academic.Period == Admitted.Academic.Period + 60)), 1,
                    ifelse(endsWith(as.character(Admitted.Academic.Period), "30") & (Academic.Period == Admitted.Academic.Period + 40) & ((Admitted.Program.Code == Current.Program.Code) & (Academic.Period ==Admitted.Academic.Period + 40)), 1,0)))

Find Year2 retained

BA_transfer_programs <- BA_transfer_programs %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == Admitted.Academic.Period + 100) & (Admitted.Program.Code == Current.Program.Code), 1,0)) 

Address FASHION_BA semester issue

Remap semesters of transfer students with graduation records

fashion_BA<- data[data$Admitted.Program.Code=="FASHION_BA",c('ID', 'Admitted.Academic.Period', 'Academic.Period', 'Admitted.Program.Code', 'Semester', 'SZBMTCU_ELEMENT_43_COHORT_START_DATE', 'Graduation.Date', 'Percent.Complete', 'Transfer.Pathway.ID')] 

fashion_BA$Semester<-as.numeric(fashion_BA$Semester)
## Warning: NAs introduced by coercion
fashion_BA <- fashion_BA %>%
  arrange(ID, Admitted.Academic.Period, desc(Academic.Period))

# There are 5 students with multiple admission records where there supposed to only be 1
fashion_BA$Admitted.Academic.Period <- with(fashion_BA, ifelse(ID %in% c("N01029710", "N01271901", "N01288856", "N01313853", "N01068843"), min(Admitted.Academic.Period), Admitted.Academic.Period))

# For transfer students with Graduation records but no later semester levels let's remap the semesters by counting back from 8 -- The largest available Semester value is given value of 8, second largest Semester value is given value of 7 and so on. When Semester is NA (Co-op before converted to numeric) give it value "C"
fashion_BA1 <- fashion_BA %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(rank = dense_rank(desc(Semester)),
         Mapped_SEM = ifelse(!is.na(Graduation.Date),
                             case_when(
                               rank == 1 ~ 8,
                               rank == 2 ~ 7,
                               rank == 3 ~ 6,
                               rank == 4 ~ 5,
                               rank == 5 ~ 4,
                               rank == 6 ~ 3,
                               rank == 7 ~ 2,
                               rank == 8 ~ 1,
                               TRUE ~ 9999  # Assign a default value of 9999 to when rank is NA
                             ),
                             Semester)) %>%
  ungroup()

# FASHION_BA doesn't have any null or NA values in Semester column. The NAs we see in fashion_BA1 used to be "C" before I forced this column into numeric. Let's fill back in the "C"
fashion_BA1$Mapped_SEM <- ifelse(is.na(fashion_BA1$Semester), "C", fashion_BA1$Mapped_SEM)

Remap semesters of transfer students without graduation records

# check the range of Percentage course completed for each Semester level
a3<-fashion_BA1[!is.na(fashion_BA1$Graduation.Date),]
range_df <- aggregate(Percent.Complete ~ Mapped_SEM, data = a3, FUN = function(x) c(min(x), max(x)))

a4<-fashion_BA1[is.na(fashion_BA1$Graduation.Date) & !is.na(fashion_BA1$Transfer.Pathway.ID) & !fashion_BA1$Transfer.Pathway.ID=="ZZZ",]

# get a range of Percent_Complete value for each Semester level of transfer did not graduate students
range_Transfer_NotGrad <- aggregate(Percent.Complete ~ Mapped_SEM, data = a4, FUN = function(x) c(min(x), max(x)))

# For transfer students without graduation record we arbitrarily assign semester values based on Percent course completed. When students' lowest Percent_Complete is between 20 and 50 and this academic period is fall term then assign semester 3, if this is a winter term then assign semester 4. If students' lowest Percent_Complete is over 50 then assign semester 5.
library(stringr)
fashion_BA2 <- a4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(
    lowest_percent = ifelse(all(is.na(Percent.Complete)), NA, min(Percent.Complete, na.rm = TRUE)),
    New_Column = case_when(
      lowest_percent >= 20 & lowest_percent <= 50 & str_sub(Admitted.Academic.Period, -2) == "70" ~ 3,
      lowest_percent >= 20 & lowest_percent <= 50 & str_sub(Admitted.Academic.Period, -2) == "30" ~ 4,
      lowest_percent > 50 ~ 5,
      TRUE ~ NA_integer_
    ),
    Mapped_SEM_FASHION_BA_WithNA = case_when(
      !is.na(Percent.Complete) & !is.na(Semester) & Percent.Complete == lowest_percent ~ as.character(New_Column),
      TRUE ~ NA_character_
    )
  ) %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(
    Mapped_SEM_FASHION_BA = ifelse(!is.na(Mapped_SEM_FASHION_BA_WithNA), Mapped_SEM_FASHION_BA_WithNA, NA_character_),
    Mapped_SEM_FASHION_BA = ifelse(is.na(Mapped_SEM_FASHION_BA), as.character(min(as.numeric(Mapped_SEM_FASHION_BA_WithNA), na.rm = TRUE) + Semester - 1), Mapped_SEM_FASHION_BA),
    Mapped_SEM_FASHION_BA = ifelse(Mapped_SEM == "C", "C", Mapped_SEM_FASHION_BA)
  ) %>%
  ungroup() %>%
  select(-Mapped_SEM_FASHION_BA_WithNA)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Mapped_SEM_FASHION_BA = ifelse(...)`.
## ℹ In group 29: `ID = "N01365851"`, `Admitted.Academic.Period = 202170`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
# A couple students seem to have correct semester assignment to begin with, interesting.
fashion_BA2$Mapped_SEM_FASHION_BA <- with(fashion_BA2, ifelse(ID == "N01365851", 5, Mapped_SEM_FASHION_BA))
fashion_BA2$Mapped_SEM_FASHION_BA <- with(fashion_BA2, ifelse(ID == "N01546170", as.character(Semester), Mapped_SEM_FASHION_BA))

# replace values in Mapped_SEM with the newly calculated mapped semester levels for transfer students who did not graduate
All_df6<-left_join(fashion_BA1[,c(1:9,11)],fashion_BA2[,c(1:3,14)], by=c("ID","Admitted.Academic.Period","Academic.Period"))
All_df6$Mapped_SEM <- ifelse(!is.na(All_df6$Mapped_SEM_FASHION_BA), All_df6$Mapped_SEM_FASHION_BA, All_df6$Mapped_SEM)
fashion_BA_remapped<-All_df6[,1:10]
fashion_BA_remapped$Semester<- as.character(fashion_BA_remapped$Semester)

Filter for Transfer student SEM2 and Year2 retention and persistence

# Join remapped FASHION_BA semesters with all data
BA_transfer_programs2<- left_join(BA_transfer_programs,fashion_BA_remapped[,c(1,3,4,10)], by=c("ID", "Academic.Period", "Admitted.Program.Code"))

#replace semesters in FASHION_BA with remapped values
BA_transfer_programs2$Semester <- ifelse(!is.na(BA_transfer_programs2$Mapped_SEM), BA_transfer_programs2$Mapped_SEM, BA_transfer_programs2$Semester)
BA_transfer_programs2<-BA_transfer_programs2[,1:26]

# There are 5 students in FASHION_BA with multiple admission records where there supposed to only be 1
BA_transfer_programs2$Admitted.Academic.Period <- with(BA_transfer_programs2, ifelse(ID %in% c("N01029710", "N01271901", "N01288856", "N01313853", "N01068843"), min(Admitted.Academic.Period), Admitted.Academic.Period))

# colnames(g)[colnames(g) == "Var1"] <- "ID"
# g<-g[g$Freq==2,]
# datata<-data[,c(2,3,10,13,15,28)]
# abc<-left_join(g,datata,by=c("ID"))
# abc <- abc[order(abc$ID, abc$Academic.Period), ]

#5 more students with multiple admission records where there supposed to only be 1
BA_transfer_programs2$Admitted.Academic.Period <- with(BA_transfer_programs2, ifelse(ID %in% c("N00397703", "N00837554", "N01047068", "N01352123", "N01359941"), min(Admitted.Academic.Period), Admitted.Academic.Period))

TransferStudent_RetentionPersistence<- BA_transfer_programs2 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) > 1) %>%
   mutate(Min_Sem=min(Semester))


TransferStudent_RetentionPersistence$PathwayStudent <- "1"

# count how many students entered at each advanced SEM level
c<- unique(TransferStudent_RetentionPersistence[,c(1,2,27)]) 
table(c$Min_Sem)
## 
##    2    3    4    5    6    7    8   PT 
##   97  515  334 1245   69   57   21    2
# number of Credit Transfer students who did not start from Semester 1
length(unique(subset(TransferStudent_RetentionPersistence, Transfer.Pathway.ID == "ZZZ")$ID))
## [1] 67
# number of students without Pathway ID who did not start from Semester 1
length(unique(subset(TransferStudent_RetentionPersistence, is.na(Transfer.Pathway.ID))$ID))
## [1] 139

Get a count of transfer students by admitted academic period

# find SEM3 transfer students
SEM3_transfers <- TransferStudent_RetentionPersistence[TransferStudent_RetentionPersistence$Min_Sem=="3",]
SEM3_transfers %>%
  group_by(Admitted.Academic.Period) %>%
  summarise(CountDistinctID = n_distinct(ID))
## # A tibble: 10 × 2
##    Admitted.Academic.Period CountDistinctID
##                       <int>           <int>
##  1                   201670              44
##  2                   201730               1
##  3                   201770              78
##  4                   201830               3
##  5                   201870              87
##  6                   201970              86
##  7                   202030               1
##  8                   202070              93
##  9                   202170              76
## 10                   202270              46
# find SEM4 transfer students
SEM4_transfers <- TransferStudent_RetentionPersistence[TransferStudent_RetentionPersistence$Min_Sem=="4",]
SEM4_transfers %>%
  group_by(Admitted.Academic.Period) %>%
  summarise(CountDistinctID = n_distinct(ID))
## # A tibble: 13 × 2
##    Admitted.Academic.Period CountDistinctID
##                       <int>           <int>
##  1                   201670               9
##  2                   201730              17
##  3                   201770               7
##  4                   201830              10
##  5                   201870              10
##  6                   201930              37
##  7                   201970               8
##  8                   202030              45
##  9                   202070              14
## 10                   202130              73
## 11                   202170               1
## 12                   202230              65
## 13                   202330              38
# find SEM5 transfer students
SEM5_transfers <- TransferStudent_RetentionPersistence[TransferStudent_RetentionPersistence$Min_Sem=="5",]
SEM5_transfers %>%
  group_by(Admitted.Academic.Period) %>%
  summarise(CountDistinctID = n_distinct(ID))
## # A tibble: 11 × 2
##    Admitted.Academic.Period CountDistinctID
##                       <int>           <int>
##  1                   201670              64
##  2                   201730               1
##  3                   201770              93
##  4                   201850               1
##  5                   201870             163
##  6                   201970             196
##  7                   202070             247
##  8                   202130               1
##  9                   202170             270
## 10                   202250               1
## 11                   202270             208

SEM3 Transfers

Find SEM3 transfer students by admission cohorts

SEM3_201770 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="201770",]
SEM3_201830 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="201830",] #2 students
SEM3_201870 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="201870",]
SEM3_201930 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="201930",] #1 student
SEM3_201970 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="201970",]
SEM3_202030 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="202030",] #0
SEM3_202070 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="202070",]
SEM3_202130 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="202130",] #0
SEM3_202170 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="202170",]
SEM3_202230 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="202230",] #0 
SEM3_202270 <- SEM3_transfers[SEM3_transfers$Admitted.Academic.Period=="202270",]

Get Non-transfer data

df<-BA_transfer_programs2[,c(1:20)]
Non_Transfer<- df %>%
  group_by(ID, Admitted.Academic.Period, Transfer.Pathway.ID) %>%
  filter(min(Semester)==1) %>%
  mutate(Min_Sem=min(Semester))


Non_Transfer %>%
  group_by(Admitted.Academic.Period) %>%
  summarise(CountDistinctID = n_distinct(ID))
## # A tibble: 14 × 2
##    Admitted.Academic.Period CountDistinctID
##                       <int>           <int>
##  1                   201670             965
##  2                   201730             141
##  3                   201770            1071
##  4                   201830             197
##  5                   201870             992
##  6                   201930             158
##  7                   201970             966
##  8                   202030             156
##  9                   202070             870
## 10                   202130             151
## 11                   202170             798
## 12                   202230             152
## 13                   202270             703
## 14                   202330             164

Find 201670 Non-transfer cohort who are in SEM3 in 201770

# Select only non-transfer students in SEM3 in 201770
# Non_Transfer_201770_SEM3<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201670" & Non_Transfer$Semester>="3",]
subset_201770_SEM3 <- subset(Non_Transfer, Admitted.Academic.Period == 201670 &
                        Academic.Period == 201770 & Semester == 3 & Admitted.Program.Code == Current.Program.Code)
subset_201770_SEM3$Match<-1
Non_Transfer_201770_SEM3<-left_join(Non_Transfer,subset_201770_SEM3[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_201770_SEM3<-Non_Transfer_201770_SEM3[Non_Transfer_201770_SEM3$Match==1,]
Non_Transfer_201770_SEM3 <- Non_Transfer_201770_SEM3[!is.na(Non_Transfer_201770_SEM3$ID), ]
Non_Transfer_201770_SEM3<- Non_Transfer_201770_SEM3[Non_Transfer_201770_SEM3$Semester>=3, ]


Non_Transfer_201770_SEM3<- Non_Transfer_201770_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 3) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_201770_SEM3 <- Non_Transfer_201770_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 201770 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 201770 + 60), Current.GPA[Academic.Period == (201770 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_201770_SEM3 <- Non_Transfer_201770_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 201770 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 201770 + 100), Current.GPA[Academic.Period == (201770 + 100)], NA))

# Add SEM2 retention
Non_Transfer_201770_SEM3 <- Non_Transfer_201770_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 201770 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_201770_SEM3 <- Non_Transfer_201770_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 201770 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_201770_SEM3$PathwayStudent <- "0"
Non_Transfer_201770_SEM3$PathwayStudent <- ifelse(is.na(Non_Transfer_201770_SEM3$Transfer.Pathway.ID), 0, "Credit")


SEM3_201770_AllStudents <- rbind(SEM3_201770, Non_Transfer_201770_SEM3)


SEM3_201770_AllStudents_1<- SEM3_201770_AllStudents[, c(1,2,5,9,12:28)]

OneRowSem3201770 <- SEM3_201770_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSem3201770$Comparison_Period<-"201770"


Summary_Sem3_201770<-OneRowSem3201770 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_Sem3_201770$Comparison_Period<- "201770"
Summary_Sem3_201770
## # A tibble: 3 × 6
##   PathwayStudent Sem2_Persisted2 Year2_Persisted2 Sem2_Retained2 Year2_Retained2
##   <chr>                    <dbl>            <dbl>          <dbl>           <dbl>
## 1 0                        0.920            0.866          0.906           0.846
## 2 1                        0.872            0.744          0.846           0.718
## 3 Credit                   0.901            0.901          0.889           0.889
## # ℹ 1 more variable: Comparison_Period <chr>

Find 201770 cohort who are in SEM3 in 201870

# Select only non-transfer students in SEM3 in 201870
# Non_Transfer_201870_SEM3<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201770" & Non_Transfer$Semester>="3",]
subset_201870_SEM3 <- subset(Non_Transfer, Admitted.Academic.Period == 201770 &
                        Academic.Period == 201870 & Semester == 3 & Admitted.Program.Code == Current.Program.Code)
subset_201870_SEM3$Match<-1
Non_Transfer_201870_SEM3<-left_join(Non_Transfer,subset_201870_SEM3[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_201870_SEM3<-Non_Transfer_201870_SEM3[Non_Transfer_201870_SEM3$Match==1,]
Non_Transfer_201870_SEM3 <- Non_Transfer_201870_SEM3[!is.na(Non_Transfer_201870_SEM3$ID), ]
Non_Transfer_201870_SEM3<- Non_Transfer_201870_SEM3[Non_Transfer_201870_SEM3$Semester>=3, ]


Non_Transfer_201870_SEM3<- Non_Transfer_201870_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 3) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_201870_SEM3 <- Non_Transfer_201870_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 201870 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 201870 + 60), Current.GPA[Academic.Period == (201870 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_201870_SEM3 <- Non_Transfer_201870_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 201870 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 201870 + 100), Current.GPA[Academic.Period == (201870 + 100)], NA))

# Add SEM2 retention
Non_Transfer_201870_SEM3 <- Non_Transfer_201870_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 201870 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_201870_SEM3 <- Non_Transfer_201870_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 201870 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_201870_SEM3$PathwayStudent <- "0"
Non_Transfer_201870_SEM3$PathwayStudent <- ifelse(is.na(Non_Transfer_201870_SEM3$Transfer.Pathway.ID), 0, "Credit")

SEM3_201870_AllStudents <- rbind(SEM3_201870, Non_Transfer_201870_SEM3)


SEM3_201870_AllStudents_1<- SEM3_201870_AllStudents[, c(1,2,5,9,12:28)]

OneRowSem3201870 <- SEM3_201870_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSem3201870$Comparison_Period<-"201870"


Summary_Sem3_201870<-OneRowSem3201870 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_Sem3_201870$Comparison_Period<- "201870"

Find 201870 Non-transfer cohort who are in SEM3 in 201970

# Select only non-transfer students in SEM3 in 201970
# Non_Transfer_201970_SEM3<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201870" & Non_Transfer$Semester>="3",]
subset_201970_SEM3 <- subset(Non_Transfer, Admitted.Academic.Period == 201870 &
                        Academic.Period == 201970 & Semester == 3 & Admitted.Program.Code == Current.Program.Code)
subset_201970_SEM3$Match<-1
Non_Transfer_201970_SEM3<-left_join(Non_Transfer,subset_201970_SEM3[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_201970_SEM3<-Non_Transfer_201970_SEM3[Non_Transfer_201970_SEM3$Match==1,]
Non_Transfer_201970_SEM3 <- Non_Transfer_201970_SEM3[!is.na(Non_Transfer_201970_SEM3$ID), ]
Non_Transfer_201970_SEM3<- Non_Transfer_201970_SEM3[Non_Transfer_201970_SEM3$Semester>=3, ]


Non_Transfer_201970_SEM3<- Non_Transfer_201970_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 3) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_201970_SEM3 <- Non_Transfer_201970_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 201970 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 201970 + 60), Current.GPA[Academic.Period == (201970 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_201970_SEM3 <- Non_Transfer_201970_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 201970 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 201970 + 100), Current.GPA[Academic.Period == (201970 + 100)], NA))

# Add SEM2 retention
Non_Transfer_201970_SEM3 <- Non_Transfer_201970_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 201970 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_201970_SEM3 <- Non_Transfer_201970_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 201970 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_201970_SEM3$PathwayStudent <- "0"
Non_Transfer_201970_SEM3$PathwayStudent <- ifelse(is.na(Non_Transfer_201970_SEM3$Transfer.Pathway.ID), 0, "Credit")

SEM3_201970_AllStudents <- rbind(SEM3_201970, Non_Transfer_201970_SEM3)


SEM3_201970_AllStudents_1<- SEM3_201970_AllStudents[, c(1,2,5,9,12:28)]

OneRowSem3201970 <- SEM3_201970_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSem3201970$Comparison_Period<-"201970"


Summary_Sem3_201970<-OneRowSem3201970 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_Sem3_201970$Comparison_Period<- "201970"

Find 201970 Non-transfer cohort who are in SEM3 in 202070

# Select only non-transfer students in SEM3 in 202070
# Non_Transfer_202070_SEM3<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201970" & Non_Transfer$Semester>="3",]
subset_202070_SEM3 <- subset(Non_Transfer, Admitted.Academic.Period == 201970 &
                        Academic.Period == 202070 & Semester == 3 & Admitted.Program.Code == Current.Program.Code)
subset_202070_SEM3$Match<-1
Non_Transfer_202070_SEM3<-left_join(Non_Transfer,subset_202070_SEM3[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_202070_SEM3<-Non_Transfer_202070_SEM3[Non_Transfer_202070_SEM3$Match==1,]
Non_Transfer_202070_SEM3 <- Non_Transfer_202070_SEM3[!is.na(Non_Transfer_202070_SEM3$ID), ]
Non_Transfer_202070_SEM3<- Non_Transfer_202070_SEM3[Non_Transfer_202070_SEM3$Semester>=3, ]


Non_Transfer_202070_SEM3<- Non_Transfer_202070_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 3) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_202070_SEM3 <- Non_Transfer_202070_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 202070 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 202070 + 60), Current.GPA[Academic.Period == (202070 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_202070_SEM3 <- Non_Transfer_202070_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 202070 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 202070 + 100), Current.GPA[Academic.Period == (202070 + 100)], NA))

# Add SEM2 retention
Non_Transfer_202070_SEM3 <- Non_Transfer_202070_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 202070 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_202070_SEM3 <- Non_Transfer_202070_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 202070 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_202070_SEM3$PathwayStudent <- "0"
Non_Transfer_202070_SEM3$PathwayStudent <- ifelse(is.na(Non_Transfer_202070_SEM3$Transfer.Pathway.ID), 0, "Credit")

SEM3_202070_AllStudents <- rbind(SEM3_202070, Non_Transfer_202070_SEM3)


SEM3_202070_AllStudents_1<- SEM3_202070_AllStudents[, c(1,2,5,9,12:28)]

OneRowSem3202070 <- SEM3_202070_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSem3202070$Comparison_Period<-"202070"


Summary_Sem3_202070<-OneRowSem3202070 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_Sem3_202070$Comparison_Period<- "202070"

Find 202070 Non-transfer cohort who are in SEM3 in 202170

# Select only non-transfer students in SEM3 in 202170
# Non_Transfer_202170_SEM3<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="202070" & Non_Transfer$Semester>="3",]
subset_202170_SEM3 <- subset(Non_Transfer, Admitted.Academic.Period == 202070 &
                        Academic.Period == 202170 & Semester == 3 & Admitted.Program.Code == Current.Program.Code)
subset_202170_SEM3$Match<-1
Non_Transfer_202170_SEM3<-left_join(Non_Transfer,subset_202170_SEM3[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_202170_SEM3<-Non_Transfer_202170_SEM3[Non_Transfer_202170_SEM3$Match==1,]
Non_Transfer_202170_SEM3 <- Non_Transfer_202170_SEM3[!is.na(Non_Transfer_202170_SEM3$ID), ]
Non_Transfer_202170_SEM3<- Non_Transfer_202170_SEM3[Non_Transfer_202170_SEM3$Semester>=3, ]


Non_Transfer_202170_SEM3<- Non_Transfer_202170_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 3) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_202170_SEM3 <- Non_Transfer_202170_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 202170 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 202170 + 60), Current.GPA[Academic.Period == (202170 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_202170_SEM3 <- Non_Transfer_202170_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 202170 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 202170 + 100), Current.GPA[Academic.Period == (202170 + 100)], NA))

# Add SEM2 retention
Non_Transfer_202170_SEM3 <- Non_Transfer_202170_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 202170 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_202170_SEM3 <- Non_Transfer_202170_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 202170 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_202170_SEM3$PathwayStudent <- "0"
Non_Transfer_202170_SEM3$PathwayStudent <- ifelse(is.na(Non_Transfer_202170_SEM3$Transfer.Pathway.ID), 0, "Credit")

SEM3_202170_AllStudents <- rbind(SEM3_202170, Non_Transfer_202170_SEM3)


SEM3_202170_AllStudents_1<- SEM3_202170_AllStudents[, c(1,2,5,9,12:28)]

OneRowSem3202170 <- SEM3_202170_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSem3202170$Comparison_Period<-"202170"


Summary_Sem3_202170<-OneRowSem3202170 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_Sem3_202170$Comparison_Period<- "202170"

Find 202170 Non-transfer cohort who are in SEM3 in 202270

# Select only non-transfer students in SEM3 in 202270
# Non_Transfer_202270_SEM3<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="202170" & Non_Transfer$Semester>="3",]
subset_202270_SEM3 <- subset(Non_Transfer, Admitted.Academic.Period == 202170 &
                        Academic.Period == 202270 & Semester == 3 & Admitted.Program.Code == Current.Program.Code)
subset_202270_SEM3$Match<-1
Non_Transfer_202270_SEM3<-left_join(Non_Transfer,subset_202270_SEM3[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_202270_SEM3<-Non_Transfer_202270_SEM3[Non_Transfer_202270_SEM3$Match==1,]
Non_Transfer_202270_SEM3 <- Non_Transfer_202270_SEM3[!is.na(Non_Transfer_202270_SEM3$ID), ]
Non_Transfer_202270_SEM3<- Non_Transfer_202270_SEM3[Non_Transfer_202270_SEM3$Semester>=3, ]


Non_Transfer_202270_SEM3<- Non_Transfer_202270_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 3) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_202270_SEM3 <- Non_Transfer_202270_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 202270 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 202270 + 60), Current.GPA[Academic.Period == (202270 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_202270_SEM3 <- Non_Transfer_202270_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 202270 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 202270 + 100), Current.GPA[Academic.Period == (202270 + 100)], NA))

# Add SEM2 retention
Non_Transfer_202270_SEM3 <- Non_Transfer_202270_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 202270 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_202270_SEM3 <- Non_Transfer_202270_SEM3 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 202270 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_202270_SEM3$PathwayStudent <- "0"
Non_Transfer_202270_SEM3$PathwayStudent <- ifelse(is.na(Non_Transfer_202270_SEM3$Transfer.Pathway.ID), 0, "Credit")

SEM3_202270_AllStudents <- rbind(SEM3_202270, Non_Transfer_202270_SEM3)


SEM3_202270_AllStudents_1<- SEM3_202270_AllStudents[, c(1,2,5,9,12:28)]

OneRowSem3202270 <- SEM3_202270_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSem3202270$Comparison_Period<-"202270"


Summary_Sem3_202270<-OneRowSem3202270 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_Sem3_202270$Comparison_Period<- "202270"
summary<-rbind(Summary_Sem3_201770, Summary_Sem3_201870, Summary_Sem3_201970, Summary_Sem3_202070, Summary_Sem3_202170)

library(knitr)
kable(summary)
PathwayStudent Sem2_Persisted2 Year2_Persisted2 Sem2_Retained2 Year2_Retained2 Comparison_Period
0 0.9201597 0.8662675 0.9061876 0.8463074 201770
1 0.8717949 0.7435897 0.8461538 0.7179487 201770
Credit 0.9012346 0.9012346 0.8888889 0.8888889 201770
0 0.9377224 0.8843416 0.9270463 0.8629893 201870
1 0.8965517 0.7586207 0.8965517 0.7471264 201870
Credit 0.9764706 0.9529412 0.9647059 0.9411765 201870
0 0.9168174 0.8734177 0.9150090 0.8625678 201970
1 0.8720930 0.8837209 0.8604651 0.8720930 201970
Credit 0.9550562 0.9662921 0.9438202 0.9438202 201970
0 0.9223301 0.8462783 0.9174757 0.8365696 202070
1 0.9462366 0.8279570 0.9354839 0.8172043 202070
Credit 0.9861111 0.9305556 0.9861111 0.9305556 202070
0 0.8970588 0.8125000 0.8878676 0.7996324 202170
1 0.8026316 0.7105263 0.8026316 0.7105263 202170
Credit 0.9268293 0.8292683 0.9024390 0.8292683 202170

Get overall SEM3 retention persistence rates - Collapsing all comparison periods

All_Cohort_SEM3<-rbind(OneRowSem3201770,OneRowSem3201870,OneRowSem3201970,OneRowSem3202070,OneRowSem3202170, OneRowSem3202270)
All_Cohort_SEM3$Transfer_SEM<-"3"

# This include 201770 - 202270 comparison periods
Summary_SEM3_SEM2<-All_Cohort_SEM3[,c(1:3,4,6)] %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted = sum(Sem2_Persisted1)/n_distinct(ID),
            Sem2_Retained = sum(Sem2_Retained1)/n_distinct(ID))

# This include 201870 - 202170 comparison periods
Summary_SEM3_Year2<-All_Cohort_SEM3[!((All_Cohort_SEM3$Admitted.Academic.Period=="202170"& All_Cohort_SEM3$PathwayStudent==0)|(All_Cohort_SEM3$Admitted.Academic.Period=="202270"& All_Cohort_SEM3$PathwayStudent==1)),c(1:3,5,7)] %>%
  group_by(PathwayStudent) %>%
  summarize(Year2_Persisted = sum(Year2_Persisted1)/n_distinct(ID),
            Year2_Retained = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM3_SEM2
## # A tibble: 3 × 3
##   PathwayStudent Sem2_Persisted Sem2_Retained
##   <chr>                   <dbl>         <dbl>
## 1 0                       0.924         0.916
## 2 1                       0.883         0.875
## 3 Credit                  0.951         0.941
Summary_SEM3_Year2
## # A tibble: 3 × 3
##   PathwayStudent Year2_Persisted Year2_Retained
##   <chr>                    <dbl>          <dbl>
## 1 0                        0.859          0.844
## 2 1                        0.794          0.782
## 3 Credit                   0.838          0.828

SEM5 Transfers

# find SEM5 transfer students
SEM5_transfers <- TransferStudent_RetentionPersistence[TransferStudent_RetentionPersistence$Min_Sem=="5",]

SEM5_transfers %>%
  group_by(Admitted.Academic.Period) %>%
  summarise(CountDistinctID = n_distinct(ID))
## # A tibble: 11 × 2
##    Admitted.Academic.Period CountDistinctID
##                       <int>           <int>
##  1                   201670              64
##  2                   201730               1
##  3                   201770              93
##  4                   201850               1
##  5                   201870             163
##  6                   201970             196
##  7                   202070             247
##  8                   202130               1
##  9                   202170             270
## 10                   202250               1
## 11                   202270             208

Find SEM5 transfer students by admission cohorts

SEM5_201770 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="201770",]
SEM5_201830 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="201830",] 
SEM5_201870 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="201870",]
SEM5_201930 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="201930",] 
SEM5_201970 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="201970",]
SEM5_202030 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="202030",] 
SEM5_202070 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="202070",]
SEM5_202130 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="202130",] 
SEM5_202170 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="202170",]
SEM5_202230 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="202230",] 
SEM5_202270 <- SEM5_transfers[SEM5_transfers$Admitted.Academic.Period=="202270",]

Identify transfer students with inaccurate Catalog cohort

# Catalog cohort comes from Block transfer report
table(SEM5_201870$Element43_translated)
## 
## 201670 201770 201870 
##    778      4      7
d1<-SEM5_201870[SEM5_201870$Element43_translated=="201870",]
length(unique(d1$ID)) # number of students with inaccurate catalog cohort
## [1] 3

ONCAT admission period only goes back to 201670, so we don’t have control group for SEM5 in 201770

Find 201670 Non-transfer cohort who are in SEM5 in 201870

# Select only non-transfer students in SEM5 in 201870
# Non_Transfer_201870_SEM5<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201670" & Non_Transfer$Semester>="5",]
subset_201870_SEM5 <- subset(Non_Transfer, Admitted.Academic.Period == 201670 &
                        Academic.Period == 201870 & Semester == 5 & Admitted.Program.Code == Current.Program.Code)
subset_201870_SEM5$Match<-1
Non_Transfer_201870_SEM5<-left_join(Non_Transfer,subset_201870_SEM5[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_201870_SEM5<-Non_Transfer_201870_SEM5[Non_Transfer_201870_SEM5$Match==1,]
Non_Transfer_201870_SEM5 <- Non_Transfer_201870_SEM5[!is.na(Non_Transfer_201870_SEM5$ID), ]
Non_Transfer_201870_SEM5<- Non_Transfer_201870_SEM5[Non_Transfer_201870_SEM5$Semester>=5, ]


Non_Transfer_201870_SEM5<- Non_Transfer_201870_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 5) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_201870_SEM5 <- Non_Transfer_201870_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 201870 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 201870 + 60), Current.GPA[Academic.Period == (201870 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_201870_SEM5 <- Non_Transfer_201870_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 201870 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 201870 + 100), Current.GPA[Academic.Period == (201870 + 100)], NA))

# Add SEM2 retention
Non_Transfer_201870_SEM5 <- Non_Transfer_201870_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 201870 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_201870_SEM5 <- Non_Transfer_201870_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 201870 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_201870_SEM5$PathwayStudent <- "0"
Non_Transfer_201870_SEM5$PathwayStudent <- ifelse(is.na(Non_Transfer_201870_SEM5$Transfer.Pathway.ID), 0, "Credit")

SEM5_201870_AllStudents <- rbind(SEM5_201870, Non_Transfer_201870_SEM5)


SEM5_201870_AllStudents_1<- SEM5_201870_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM5201870 <- SEM5_201870_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM5201870$Comparison_Period<-"201870"


Summary_SEM5_201870<-OneRowSEM5201870 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM5_201870$Comparison_Period<- "201870"

Find 201770 Non-transfer cohort who are in SEM5 in 201970

# Select only non-transfer students in SEM5 in 201970
# Non_Transfer_201970_SEM5<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201770" & Non_Transfer$Semester>="5",]
subset_201970_SEM5 <- subset(Non_Transfer, Admitted.Academic.Period == 201770 &
                        Academic.Period == 201970 & Semester == 5 & Admitted.Program.Code == Current.Program.Code)
subset_201970_SEM5$Match<-1
Non_Transfer_201970_SEM5<-left_join(Non_Transfer,subset_201970_SEM5[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_201970_SEM5<-Non_Transfer_201970_SEM5[Non_Transfer_201970_SEM5$Match==1,]
Non_Transfer_201970_SEM5 <- Non_Transfer_201970_SEM5[!is.na(Non_Transfer_201970_SEM5$ID), ]
Non_Transfer_201970_SEM5<- Non_Transfer_201970_SEM5[Non_Transfer_201970_SEM5$Semester>=5, ]


Non_Transfer_201970_SEM5<- Non_Transfer_201970_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 5) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_201970_SEM5 <- Non_Transfer_201970_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 201970 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 201970 + 60), Current.GPA[Academic.Period == (201970 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_201970_SEM5 <- Non_Transfer_201970_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 201970 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 201970 + 100), Current.GPA[Academic.Period == (201970 + 100)], NA))

# Add SEM2 retention
Non_Transfer_201970_SEM5 <- Non_Transfer_201970_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 201970 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_201970_SEM5 <- Non_Transfer_201970_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 201970 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_201970_SEM5$PathwayStudent <- "0"
Non_Transfer_201970_SEM5$PathwayStudent <- ifelse(is.na(Non_Transfer_201970_SEM5$Transfer.Pathway.ID), 0, "Credit")

SEM5_201970_AllStudents <- rbind(SEM5_201970, Non_Transfer_201970_SEM5)


SEM5_201970_AllStudents_1<- SEM5_201970_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM5201970 <- SEM5_201970_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM5201970$Comparison_Period<-"201970"


Summary_SEM5_201970<-OneRowSEM5201970 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM5_201970$Comparison_Period<- "201970"

Find 201870 Non-transfer cohort who are in SEM5 in 202070

# Select only non-transfer students in SEM5 in 202070
# Non_Transfer_202070_SEM5<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201870" & Non_Transfer$Semester>="5",]
subset_202070_SEM5 <- subset(Non_Transfer, Admitted.Academic.Period == 201870 &
                        Academic.Period == 202070 & Semester == 5 & Admitted.Program.Code == Current.Program.Code)
subset_202070_SEM5$Match<-1
Non_Transfer_202070_SEM5<-left_join(Non_Transfer,subset_202070_SEM5[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_202070_SEM5<-Non_Transfer_202070_SEM5[Non_Transfer_202070_SEM5$Match==1,]
Non_Transfer_202070_SEM5 <- Non_Transfer_202070_SEM5[!is.na(Non_Transfer_202070_SEM5$ID), ]
Non_Transfer_202070_SEM5<- Non_Transfer_202070_SEM5[Non_Transfer_202070_SEM5$Semester>=5, ]


Non_Transfer_202070_SEM5<- Non_Transfer_202070_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 5) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_202070_SEM5 <- Non_Transfer_202070_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 202070 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 202070 + 60), Current.GPA[Academic.Period == (202070 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_202070_SEM5 <- Non_Transfer_202070_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 202070 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 202070 + 100), Current.GPA[Academic.Period == (202070 + 100)], NA))

# Add SEM2 retention
Non_Transfer_202070_SEM5 <- Non_Transfer_202070_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 202070 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_202070_SEM5 <- Non_Transfer_202070_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 202070 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_202070_SEM5$PathwayStudent <- "0"
Non_Transfer_202070_SEM5$PathwayStudent <- ifelse(is.na(Non_Transfer_202070_SEM5$Transfer.Pathway.ID), 0, "Credit")

SEM5_202070_AllStudents <- rbind(SEM5_202070, Non_Transfer_202070_SEM5)


SEM5_202070_AllStudents_1<- SEM5_202070_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM5202070 <- SEM5_202070_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM5202070$Comparison_Period<-"202070"


Summary_SEM5_202070<-OneRowSEM5202070 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM5_202070$Comparison_Period<- "202070"

Find 201970 Non-transfer cohort who are in SEM5 in 202170

# Select only non-transfer students in SEM5 in 202170
# Non_Transfer_202170_SEM5<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201970" & Non_Transfer$Semester>="5",]
subset_202170_SEM5 <- subset(Non_Transfer, Admitted.Academic.Period == 201970 &
                        Academic.Period == 202170 & Semester == 5 & Admitted.Program.Code == Current.Program.Code)
subset_202170_SEM5$Match<-1
Non_Transfer_202170_SEM5<-left_join(Non_Transfer,subset_202170_SEM5[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_202170_SEM5<-Non_Transfer_202170_SEM5[Non_Transfer_202170_SEM5$Match==1,]
Non_Transfer_202170_SEM5 <- Non_Transfer_202170_SEM5[!is.na(Non_Transfer_202170_SEM5$ID), ]
Non_Transfer_202170_SEM5<- Non_Transfer_202170_SEM5[Non_Transfer_202170_SEM5$Semester>=5, ]


Non_Transfer_202170_SEM5<- Non_Transfer_202170_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 5) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_202170_SEM5 <- Non_Transfer_202170_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 202170 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 202170 + 60), Current.GPA[Academic.Period == (202170 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_202170_SEM5 <- Non_Transfer_202170_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 202170 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 202170 + 100), Current.GPA[Academic.Period == (202170 + 100)], NA))

# Add SEM2 retention
Non_Transfer_202170_SEM5 <- Non_Transfer_202170_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 202170 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_202170_SEM5 <- Non_Transfer_202170_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 202170 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_202170_SEM5$PathwayStudent <- "0"
Non_Transfer_202170_SEM5$PathwayStudent <- ifelse(is.na(Non_Transfer_202170_SEM5$Transfer.Pathway.ID), 0, "Credit")

SEM5_202170_AllStudents <- rbind(SEM5_202170, Non_Transfer_202170_SEM5)


SEM5_202170_AllStudents_1<- SEM5_202170_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM5202170 <- SEM5_202170_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM5202170$Comparison_Period<-"202170"


Summary_SEM5_202170<-OneRowSEM5202170 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM5_202170$Comparison_Period<- "202170"

Find 202070 Non-transfer cohort who are in SEM5 in 202270

# Select only non-transfer students in SEM5 in 202270
# Non_Transfer_202270_SEM5<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="202070" & Non_Transfer$Semester>="5",]
subset_202270_SEM5 <- subset(Non_Transfer, Admitted.Academic.Period == 202070 &
                        Academic.Period == 202270 & Semester == 5 & Admitted.Program.Code == Current.Program.Code)
subset_202270_SEM5$Match<-1
Non_Transfer_202270_SEM5<-left_join(Non_Transfer,subset_202270_SEM5[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_202270_SEM5<-Non_Transfer_202270_SEM5[Non_Transfer_202270_SEM5$Match==1,]
Non_Transfer_202270_SEM5 <- Non_Transfer_202270_SEM5[!is.na(Non_Transfer_202270_SEM5$ID), ]
Non_Transfer_202270_SEM5<- Non_Transfer_202270_SEM5[Non_Transfer_202270_SEM5$Semester>=5, ]


Non_Transfer_202270_SEM5<- Non_Transfer_202270_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 5) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_202270_SEM5 <- Non_Transfer_202270_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 202270 + 60), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 202270 + 60), Current.GPA[Academic.Period == (202270 + 60)], NA)
  )

# Add Year2 persistence
Non_Transfer_202270_SEM5 <- Non_Transfer_202270_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 202270 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 202270 + 100), Current.GPA[Academic.Period == (202270 + 100)], NA))

# Add SEM2 retention
Non_Transfer_202270_SEM5 <- Non_Transfer_202270_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 202270 + 60) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_202270_SEM5 <- Non_Transfer_202270_SEM5 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 202270 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_202270_SEM5$PathwayStudent <- "0"
Non_Transfer_202270_SEM5$PathwayStudent <- ifelse(is.na(Non_Transfer_202270_SEM5$Transfer.Pathway.ID), 0, "Credit")

SEM5_202270_AllStudents <- rbind(SEM5_202270, Non_Transfer_202270_SEM5)


SEM5_202270_AllStudents_1<- SEM5_202270_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM5202270 <- SEM5_202270_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM5202270$Comparison_Period<-"202270"


Summary_SEM5_202270<-OneRowSEM5202270 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM5_202270$Comparison_Period<- "202270"
summary1<-rbind(Summary_SEM5_201870, Summary_SEM5_201970, Summary_SEM5_202070, Summary_SEM5_202170, Summary_SEM5_202270)

library(knitr)
kable(summary1)
PathwayStudent Sem2_Persisted2 Year2_Persisted2 Sem2_Retained2 Year2_Retained2 Comparison_Period
0 0.9805353 0.9391727 0.9732360 0.9318735 201870
1 0.9018405 0.8527607 0.8957055 0.8466258 201870
Credit 0.9411765 0.9117647 0.9411765 0.9117647 201870
0 0.9772727 0.9566116 0.9752066 0.9545455 201970
1 0.9030612 0.8469388 0.9030612 0.8469388 201970
Credit 1.0000000 0.9358974 1.0000000 0.9358974 201970
0 0.9476987 0.9309623 0.9476987 0.9309623 202070
1 0.9190283 0.8704453 0.9190283 0.8704453 202070
Credit 0.9879518 0.8795181 0.9879518 0.8795181 202070
0 0.9464286 0.9206349 0.9464286 0.9166667 202170
1 0.8555556 0.7185185 0.8555556 0.7185185 202170
Credit 0.9016393 0.8360656 0.9016393 0.8360656 202170
0 0.9505882 0.0000000 0.9505882 0.0000000 202270
1 0.9134615 0.0000000 0.9134615 0.0000000 202270
Credit 0.9117647 0.0000000 0.9117647 0.0000000 202270

Get overall SEM5 retention persistence rates - Collapsing all comparison periods

All_Cohort_SEM5<-rbind(OneRowSEM5201870,OneRowSEM5201970,OneRowSEM5202070,OneRowSEM5202170,OneRowSEM5202270)
All_Cohort_SEM5$Transfer_SEM<-"5"

# This include 201870 - 202270 comparison periods
Summary_SEM5_SEM2<-All_Cohort_SEM5[,c(1:3,4,6)] %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted = sum(Sem2_Persisted1)/n_distinct(ID),
            Sem2_Retained = sum(Sem2_Retained1)/n_distinct(ID))

# This include 201870 - 202170 comparison periods
Summary_SEM5_Year2<-All_Cohort_SEM5[!((All_Cohort_SEM5$Admitted.Academic.Period=="202070"& All_Cohort_SEM5$PathwayStudent==0)|(All_Cohort_SEM5$Admitted.Academic.Period=="202270"& All_Cohort_SEM5$PathwayStudent==1)),c(1:3,5,7)] %>%
  group_by(PathwayStudent) %>%
  summarize(Year2_Persisted = sum(Year2_Persisted1)/n_distinct(ID),
            Year2_Retained = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM5_SEM2
## # A tibble: 3 × 3
##   PathwayStudent Sem2_Persisted Sem2_Retained
##   <chr>                   <dbl>         <dbl>
## 1 0                       0.960         0.958
## 2 1                       0.899         0.898
## 3 Credit                  0.957         0.957
Summary_SEM5_Year2
## # A tibble: 3 × 3
##   PathwayStudent Year2_Persisted Year2_Retained
##   <chr>                    <dbl>          <dbl>
## 1 0                        0.937          0.933
## 2 1                        0.816          0.815
## 3 Credit                   0.799          0.799

SEM4 Transfers

# find SEM4 transfer students
SEM4_transfers <- TransferStudent_RetentionPersistence[TransferStudent_RetentionPersistence$Min_Sem=="4",]
SEM4_transfers %>%
  group_by(Admitted.Academic.Period) %>%
  summarise(CountDistinctID = n_distinct(ID))
## # A tibble: 13 × 2
##    Admitted.Academic.Period CountDistinctID
##                       <int>           <int>
##  1                   201670               9
##  2                   201730              17
##  3                   201770               7
##  4                   201830              10
##  5                   201870              10
##  6                   201930              37
##  7                   201970               8
##  8                   202030              45
##  9                   202070              14
## 10                   202130              73
## 11                   202170               1
## 12                   202230              65
## 13                   202330              38

Find SEM4 transfer students by admission cohorts

SEM4_201830 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="201830",] 
SEM4_201870 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="201870",]
SEM4_201930 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="201930",] 
SEM4_201970 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="201970",]
SEM4_202030 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="202030",] 
SEM4_202070 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="202070",]
SEM4_202130 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="202130",] 
SEM4_202170 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="202170",]
SEM4_202230 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="202230",] 
SEM4_202270 <- SEM4_transfers[SEM4_transfers$Admitted.Academic.Period=="202270",]

Identify transfer students with inaccurate Catalog cohort

# Catalog cohort comes from Block transfer report
table(SEM4_202230$Element43_translated)
## 
## 202070 202230 
##    191      7
d2<-SEM4_202230[SEM4_202230$Element43_translated=="202230",]
length(unique(d2$ID)) # number of students with inaccurate catalog cohort
## [1] 2

Find 201670 Non-transfer cohort who are in SEM4 in 201830

# Select only non-transfer students in SEM4 in 201830
# Non_Transfer_201830_SEM4<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201670" & Non_Transfer$Semester>="4",]
subset_201830_SEM4 <- subset(Non_Transfer, Admitted.Academic.Period == 201670 &
                        Academic.Period == 201830 & Semester == 4 & Admitted.Program.Code == Current.Program.Code)
subset_201830_SEM4$Match<-1
Non_Transfer_201830_SEM4<-left_join(Non_Transfer,subset_201830_SEM4[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_201830_SEM4<-Non_Transfer_201830_SEM4[Non_Transfer_201830_SEM4$Match==1,]
Non_Transfer_201830_SEM4 <- Non_Transfer_201830_SEM4[!is.na(Non_Transfer_201830_SEM4$ID), ]
Non_Transfer_201830_SEM4<- Non_Transfer_201830_SEM4[Non_Transfer_201830_SEM4$Semester>=4, ]


Non_Transfer_201830_SEM4<- Non_Transfer_201830_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 4) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_201830_SEM4 <- Non_Transfer_201830_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 201830 + 40), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 201830 + 40), Current.GPA[Academic.Period == (201830 + 40)], NA)
  )

# Add Year2 persistence
Non_Transfer_201830_SEM4 <- Non_Transfer_201830_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 201830 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 201830 + 100), Current.GPA[Academic.Period == (201830 + 100)], NA))

# Add SEM2 retention
Non_Transfer_201830_SEM4 <- Non_Transfer_201830_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 201830 + 40) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_201830_SEM4 <- Non_Transfer_201830_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 201830 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_201830_SEM4$PathwayStudent <- "0"
Non_Transfer_201830_SEM4$PathwayStudent <- ifelse(is.na(Non_Transfer_201830_SEM4$Transfer.Pathway.ID), 0, "Credit")

SEM4_201830_AllStudents <- rbind(SEM4_201830, Non_Transfer_201830_SEM4)


SEM4_201830_AllStudents_1<- SEM4_201830_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM4201830 <- SEM4_201830_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM4201830$Comparison_Period<-"201830"


Summary_SEM4_201830<-OneRowSEM4201830 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM4_201830$Comparison_Period<- "201830"

Find 201770 Non-transfer cohort who are in SEM4 in 201930

# Select only non-transfer students in SEM4 in 201930
# Non_Transfer_201930_SEM4<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201770" & Non_Transfer$Semester>="4",]
subset_201930_SEM4 <- subset(Non_Transfer, Admitted.Academic.Period == 201770 &
                        Academic.Period == 201930 & Semester == 4 & Admitted.Program.Code == Current.Program.Code)
subset_201930_SEM4$Match<-1
Non_Transfer_201930_SEM4<-left_join(Non_Transfer,subset_201930_SEM4[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_201930_SEM4<-Non_Transfer_201930_SEM4[Non_Transfer_201930_SEM4$Match==1,]
Non_Transfer_201930_SEM4 <- Non_Transfer_201930_SEM4[!is.na(Non_Transfer_201930_SEM4$ID), ]
Non_Transfer_201930_SEM4<- Non_Transfer_201930_SEM4[Non_Transfer_201930_SEM4$Semester>=4, ]


Non_Transfer_201930_SEM4<- Non_Transfer_201930_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 4) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_201930_SEM4 <- Non_Transfer_201930_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 201930 + 40), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 201930 + 40), Current.GPA[Academic.Period == (201930 + 40)], NA)
  )

# Add Year2 persistence
Non_Transfer_201930_SEM4 <- Non_Transfer_201930_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 201930 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 201930 + 100), Current.GPA[Academic.Period == (201930 + 100)], NA))

# Add SEM2 retention
Non_Transfer_201930_SEM4 <- Non_Transfer_201930_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 201930 + 40) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_201930_SEM4 <- Non_Transfer_201930_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 201930 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_201930_SEM4$PathwayStudent <- "0"
Non_Transfer_201930_SEM4$PathwayStudent <- ifelse(is.na(Non_Transfer_201930_SEM4$Transfer.Pathway.ID), 0, "Credit")

SEM4_201930_AllStudents <- rbind(SEM4_201930, Non_Transfer_201930_SEM4)


SEM4_201930_AllStudents_1<- SEM4_201930_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM4201930 <- SEM4_201930_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM4201930$Comparison_Period<-"201930"


Summary_SEM4_201930<-OneRowSEM4201930 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM4_201930$Comparison_Period<- "201930"

Find 201870 Non-transfer cohort who are in SEM4 in 202030

# Select only non-transfer students in SEM4 in 202030
# Non_Transfer_202030_SEM4<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201870" & Non_Transfer$Semester>="4",]
subset_202030_SEM4 <- subset(Non_Transfer, Admitted.Academic.Period == 201870 &
                        Academic.Period == 202030 & Semester == 4 & Admitted.Program.Code == Current.Program.Code)
subset_202030_SEM4$Match<-1
Non_Transfer_202030_SEM4<-left_join(Non_Transfer,subset_202030_SEM4[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_202030_SEM4<-Non_Transfer_202030_SEM4[Non_Transfer_202030_SEM4$Match==1,]
Non_Transfer_202030_SEM4 <- Non_Transfer_202030_SEM4[!is.na(Non_Transfer_202030_SEM4$ID), ]
Non_Transfer_202030_SEM4<- Non_Transfer_202030_SEM4[Non_Transfer_202030_SEM4$Semester>=4, ]


Non_Transfer_202030_SEM4<- Non_Transfer_202030_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 4) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_202030_SEM4 <- Non_Transfer_202030_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 202030 + 40), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 202030 + 40), Current.GPA[Academic.Period == (202030 + 40)], NA)
  )

# Add Year2 persistence
Non_Transfer_202030_SEM4 <- Non_Transfer_202030_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 202030 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 202030 + 100), Current.GPA[Academic.Period == (202030 + 100)], NA))

# Add SEM2 retention
Non_Transfer_202030_SEM4 <- Non_Transfer_202030_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 202030 + 40) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_202030_SEM4 <- Non_Transfer_202030_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 202030 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_202030_SEM4$PathwayStudent <- "0"
Non_Transfer_202030_SEM4$PathwayStudent <- ifelse(is.na(Non_Transfer_202030_SEM4$Transfer.Pathway.ID), 0, "Credit")

SEM4_202030_AllStudents <- rbind(SEM4_202030, Non_Transfer_202030_SEM4)


SEM4_202030_AllStudents_1<- SEM4_202030_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM4202030 <- SEM4_202030_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM4202030$Comparison_Period<-"202030"


Summary_SEM4_202030<-OneRowSEM4202030 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM4_202030$Comparison_Period<- "202030"

Find 201970 Non-transfer cohort who are in SEM4 in 202130

# Select only non-transfer students in SEM4 in 202130
# Non_Transfer_202130_SEM4<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="201970" & Non_Transfer$Semester>="4",]
subset_202130_SEM4 <- subset(Non_Transfer, Admitted.Academic.Period == 201970 &
                        Academic.Period == 202130 & Semester == 4 & Admitted.Program.Code == Current.Program.Code)
subset_202130_SEM4$Match<-1
Non_Transfer_202130_SEM4<-left_join(Non_Transfer,subset_202130_SEM4[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_202130_SEM4<-Non_Transfer_202130_SEM4[Non_Transfer_202130_SEM4$Match==1,]
Non_Transfer_202130_SEM4 <- Non_Transfer_202130_SEM4[!is.na(Non_Transfer_202130_SEM4$ID), ]
Non_Transfer_202130_SEM4<- Non_Transfer_202130_SEM4[Non_Transfer_202130_SEM4$Semester>=4, ]


Non_Transfer_202130_SEM4<- Non_Transfer_202130_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 4) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_202130_SEM4 <- Non_Transfer_202130_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 202130 + 40), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 202130 + 40), Current.GPA[Academic.Period == (202130 + 40)], NA)
  )

# Add Year2 persistence
Non_Transfer_202130_SEM4 <- Non_Transfer_202130_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 202130 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 202130 + 100), Current.GPA[Academic.Period == (202130 + 100)], NA))

# Add SEM2 retention
Non_Transfer_202130_SEM4 <- Non_Transfer_202130_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 202130 + 40) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_202130_SEM4 <- Non_Transfer_202130_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 202130 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_202130_SEM4$PathwayStudent <- "0"
Non_Transfer_202130_SEM4$PathwayStudent <- ifelse(is.na(Non_Transfer_202130_SEM4$Transfer.Pathway.ID), 0, "Credit")

SEM4_202130_AllStudents <- rbind(SEM4_202130, Non_Transfer_202130_SEM4)


SEM4_202130_AllStudents_1<- SEM4_202130_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM4202130 <- SEM4_202130_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM4202130$Comparison_Period<-"202130"


Summary_SEM4_202130<-OneRowSEM4202130 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM4_202130$Comparison_Period<- "202130"

Find 202070 Non-transfer cohort who are in SEM4 in 202230

# Select only non-transfer students in SEM4 in 202230
# Non_Transfer_202230_SEM4<- Non_Transfer[Non_Transfer$Admitted.Academic.Period=="202070" & Non_Transfer$Semester>="4",]
subset_202230_SEM4 <- subset(Non_Transfer, Admitted.Academic.Period == 202070 &
                        Academic.Period == 202230 & Semester == 4 & Admitted.Program.Code == Current.Program.Code)
subset_202230_SEM4$Match<-1
Non_Transfer_202230_SEM4<-left_join(Non_Transfer,subset_202230_SEM4[,c(1,2,22)], by=c("ID","Admitted.Academic.Period"))
Non_Transfer_202230_SEM4<-Non_Transfer_202230_SEM4[Non_Transfer_202230_SEM4$Match==1,]
Non_Transfer_202230_SEM4 <- Non_Transfer_202230_SEM4[!is.na(Non_Transfer_202230_SEM4$ID), ]
Non_Transfer_202230_SEM4<- Non_Transfer_202230_SEM4[Non_Transfer_202230_SEM4$Semester>=4, ]


Non_Transfer_202230_SEM4<- Non_Transfer_202230_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  filter(min(Semester) == 4) %>%
   mutate(Min_Sem=min(Semester))

# Add SEM2 persistence
Non_Transfer_202230_SEM4 <- Non_Transfer_202230_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Persisted = ifelse((Academic.Period == 202230 + 40), 1, 0),
         SEM2.GPA = ifelse((Academic.Period == 202230 + 40), Current.GPA[Academic.Period == (202230 + 40)], NA)
  )

# Add Year2 persistence
Non_Transfer_202230_SEM4 <- Non_Transfer_202230_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Persisted = ifelse((Academic.Period == 202230 + 100), 1, 0),
         Year2.GPA = ifelse((Academic.Period == 202230 + 100), Current.GPA[Academic.Period == (202230 + 100)], NA))

# Add SEM2 retention
Non_Transfer_202230_SEM4 <- Non_Transfer_202230_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Sem2_Retained = ifelse((Academic.Period == 202230 + 40) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Add Year2 retention
Non_Transfer_202230_SEM4 <- Non_Transfer_202230_SEM4 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(Year2_Retained = ifelse((Academic.Period == 202230 + 100) & Admitted.Program.Code==Current.Program.Code, 1, 0))

# Non_Transfer_202230_SEM4$PathwayStudent <- "0"
Non_Transfer_202230_SEM4$PathwayStudent <- ifelse(is.na(Non_Transfer_202230_SEM4$Transfer.Pathway.ID), 0, "Credit")

SEM4_202230_AllStudents <- rbind(SEM4_202230, Non_Transfer_202230_SEM4)


SEM4_202230_AllStudents_1<- SEM4_202230_AllStudents[, c(1,2,5,9,12:28)]

OneRowSEM4202230 <- SEM4_202230_AllStudents_1 %>%
  group_by(ID, Admitted.Academic.Period, PathwayStudent) %>%
  summarize(Sem2_Persisted1 = sum(Sem2_Persisted),
            Year2_Persisted1 = sum(Year2_Persisted),
            Sem2_Retained1 = sum(Sem2_Retained),
            Year2_Retained1 = sum(Year2_Retained),
            SEM2_GPA1=sum(SEM2.GPA, na.rm = TRUE),
            YEAR2_GPA1=sum(Year2.GPA, na.rm=TRUE),
            Element43=min(Element43_translated))
## `summarise()` has grouped output by 'ID', 'Admitted.Academic.Period'. You can
## override using the `.groups` argument.
OneRowSEM4202230$Comparison_Period<-"202230"


Summary_SEM4_202230<-OneRowSEM4202230 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM4_202230$Comparison_Period<- "202230"
summary2<-rbind(Summary_SEM4_201830, Summary_SEM4_201930, Summary_SEM4_202030, Summary_SEM4_202130, Summary_SEM4_202230)

library(knitr)
kable(summary2)
PathwayStudent Sem2_Persisted2 Year2_Persisted2 Sem2_Retained2 Year2_Retained2 Comparison_Period
0 0.9139073 0.9094923 0.9028698 0.8940397 201830
1 0.8000000 0.7000000 0.8000000 0.7000000 201830
Credit 0.9577465 0.9154930 0.9577465 0.9154930 201830
0 0.9349904 0.9292543 0.9235182 0.9158700 201930
1 0.8648649 0.8378378 0.8648649 0.8378378 201930
Credit 0.9506173 0.9506173 0.9506173 0.9506173 201930
0 0.9388560 0.9013807 0.9309665 0.8934911 202030
1 0.8888889 0.8444444 0.8888889 0.8444444 202030
Credit 1.0000000 0.9879518 0.9879518 0.9759036 202030
0 0.8926056 0.8626761 0.8873239 0.8609155 202130
1 0.8767123 0.8219178 0.8630137 0.8082192 202130
Credit 0.9130435 0.8550725 0.9130435 0.8550725 202130
0 0.8856549 0.8690229 0.8814969 0.8627859 202230
1 0.8923077 0.9230769 0.8923077 0.9230769 202230
Credit 0.9166667 0.8611111 0.9166667 0.8611111 202230

Get overall SEM4 retention persistence rates - Collapsing all comparison periods

All_Cohort_SEM4<-rbind(OneRowSEM4201830,OneRowSEM4201930,OneRowSEM4202030,OneRowSEM4202130, OneRowSEM4202230)

All_Cohort_SEM4$Transfer_SEM<-"4"

# This include 201830 - 202230 comparison periods
Summary_SEM4<-All_Cohort_SEM4 %>%
  group_by(PathwayStudent) %>%
  summarize(Sem2_Persisted = sum(Sem2_Persisted1)/n_distinct(ID),
            Sem2_Retained = sum(Sem2_Retained1)/n_distinct(ID),
            Year2_Persisted = sum(Year2_Persisted1)/n_distinct(ID),
            Year2_Retained = sum(Year2_Retained1)/n_distinct(ID))

Summary_SEM4
## # A tibble: 3 × 5
##   PathwayStudent Sem2_Persisted Sem2_Retained Year2_Persisted Year2_Retained
##   <chr>                   <dbl>         <dbl>           <dbl>          <dbl>
## 1 0                       0.914         0.906           0.895          0.886
## 2 1                       0.878         0.874           0.852          0.848
## 3 Credit                  0.953         0.95            0.924          0.921

Prepare final data for modelling

All_Cohort<- rbind(All_Cohort_SEM3,All_Cohort_SEM4,All_Cohort_SEM5)
colnames(All_Cohort)[colnames(All_Cohort) == "Transfer_SEM"] <- "Comparison_SEM"

a<-as.data.frame(table(All_Cohort$ID))
a<-a[order(a$Freq, decreasing = TRUE), ] 
# some students in df3 don't exist in All_Cohort because they didn't persist into at least SEM3
# b<-as.data.frame(table(df3$ID))
# b<-b[order(b$Freq, decreasing = TRUE), ]

# change multiple Immigration status, postal code, mother tongue to the latest value
BA_transfer_programs3 <- BA_transfer_programs2 %>%
  group_by(ID, Admitted.Academic.Period) %>%
  mutate(largest_period = max(Academic.Period, na.rm = TRUE)) %>%
  mutate(Immigration.Status = ifelse(Academic.Period == largest_period, Immigration.Status, Immigration.Status[which.max(Academic.Period)]),
         Postal.Code = ifelse(Academic.Period == largest_period, Postal.Code, Postal.Code[which.max(Academic.Period)]),
         Mother.Tongue = ifelse(Academic.Period == largest_period, Mother.Tongue, Mother.Tongue[which.max(Academic.Period)])) %>%
  ungroup() %>%
  select(-largest_period)

# for each student change Gender "Not Reported" to the available value
df2<-BA_transfer_programs3 %>%
    mutate(Gender = na_if(Gender, "Not Reported"))
df3 <- df2 %>%
  group_by(ID) %>%
  mutate(Gender = ifelse(is.na(Gender), Gender[!is.na(Gender)][1], Gender))

df3 <- df3 %>%
  group_by(ID) %>%
  mutate(Birth.Year = ifelse(is.na(Birth.Year), Birth.Year[!is.na(Birth.Year)][1], Birth.Year))

# resolve gender transitioning issues
df3$Gender[df3$ID == "N01037047"] <- "Female"
df3$Gender[df3$ID == "N01070542"] <- "Male"
df3$Gender[df3$ID == "N01185978"] <- "Male"
df3$Gender[df3$ID == "N01226236"] <- "Male"
df3$Gender[df3$ID == "N01331096"] <- "Female"
df3$Gender[df3$ID == "N01470508"] <- "Female"
df3$Gender[df3$ID == "N00650738"] <- "Male"
df3$Gender[df3$ID == "N01210934"] <- "Male"

All_df4<- left_join(All_Cohort[,c(1:9,11,12)], unique(df3[,c(1,2,4,5,7,12:16,18,19)]), by=c("ID","Admitted.Academic.Period"))
## Warning in left_join(All_Cohort[, c(1:9, 11, 12)], unique(df3[, c(1, 2, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 8698 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
multiple_Element43<-as.data.frame(table(All_df4[All_df4$PathwayStudent==1,]$ID))
multiple_Element43<-multiple_Element43[order(multiple_Element43$Freq, decreasing = TRUE), ] 

Calculate Graduation at 100%

#Find catalog cohort for transfer students based on their Min semester
Add_Cohort<- All_df4  %>%
  mutate(Calculated_Cohort = case_when(
    PathwayStudent == 0 ~ Admitted.Academic.Period,
    PathwayStudent == "Credit" ~ Admitted.Academic.Period,
    PathwayStudent == 1 & Comparison_SEM %in% c(3, 5) ~ Admitted.Academic.Period - ((as.numeric(Comparison_SEM) - 1) / 2) * 100,
    Comparison_SEM == 4 ~ Admitted.Academic.Period - 160,
    TRUE ~ NA_real_  # If none of the conditions match, set cohort as NA
  ))

Add_Cohort$Supposed_Grad_Period <- as.numeric(Add_Cohort$Calculated_Cohort)+360

All_df5 <- Add_Cohort %>%
  mutate(Grad_at_100 = ifelse(is.na(Graduation.Date)|Supposed_Grad_Period < Graduation.Date, 0, 1))


# All_df5$Element43_translated <- ifelse(
#   substring(All_df5$SZBMTCU_ELEMENT_43_COHORT_START_DATE, nchar(All_df5$SZBMTCU_ELEMENT_43_COHORT_START_DATE) - 1) == "09",
#   paste0(substr(All_df5$SZBMTCU_ELEMENT_43_COHORT_START_DATE, 1, 4), "70"),
#   ifelse(
#     substring(All_df5$SZBMTCU_ELEMENT_43_COHORT_START_DATE, nchar(All_df5$SZBMTCU_ELEMENT_43_COHORT_START_DATE) - 1) == "01",
#     paste0(substr(All_df5$SZBMTCU_ELEMENT_43_COHORT_START_DATE, 1, 4), "30"),
#     NA
#   )
# )


All_df5<-All_df5[,c(1,22,2,21,23,3:19,24)] #remove Element43 column to get one row per student
All_df5<-unique(All_df5)


No_Sem1_No_PathwayCode_IDs <- All_df5$ID[is.na(All_df5$Transfer.Pathway.ID) & All_df5$PathwayStudent == 1] #70
No_Sem1_CreditTransfer <- All_df5$ID[!is.na(All_df5$Transfer.Pathway.ID) & All_df5$Transfer.Pathway.ID == "ZZZ" & All_df5$PathwayStudent == 1] #41
Sem1_BlockTransfer <- All_df5$ID[(!is.na(All_df5$Transfer.Pathway.ID) & All_df5$Transfer.Pathway.ID != "ZZZ") & All_df5$PathwayStudent == 0] #0


All_df5_2<-All_df5[All_df5$PathwayStudent==1,]

# 21 transfer students have multiple Admitted.Academic.Period where there supposed to only be 1
g<-as.data.frame(table(All_df5_2$ID))
g<-g[order(g$Freq, decreasing = TRUE), ] 
# Check how many students there are for each SEM level comparison
Cohort_SEM_Count <- All_df5 %>%
  group_by(Comparison_SEM, PathwayStudent) %>%
  summarize(Count = n_distinct(ID))
## `summarise()` has grouped output by 'Comparison_SEM'. You can override using
## the `.groups` argument.
kable(Cohort_SEM_Count)
Comparison_SEM PathwayStudent Count
3 0 3259
3 1 463
3 Credit 407
4 0 2529
4 1 230
4 Credit 340
5 0 2302
5 1 1081
5 Credit 324

Graph retention and persistence across the Comparison semesters

Get a summary of Sem2 retention persistence for each Comparison_SEM level

Summary_All <-All_df5 %>%
  group_by(PathwayStudent, Comparison_SEM) %>%
  summarize(Sem2_Persisted2 = sum(Sem2_Persisted1)/n_distinct(ID),
            Sem2_Retained2 = sum(Sem2_Retained1)/n_distinct(ID))
## `summarise()` has grouped output by 'PathwayStudent'. You can override using
## the `.groups` argument.
kable(Summary_All)
PathwayStudent Comparison_SEM Sem2_Persisted2 Sem2_Retained2
0 3 0.9235962 0.9159251
0 4 0.9141953 0.9062871
0 5 0.9600348 0.9582971
1 3 0.8833693 0.8747300
1 4 0.8782609 0.8739130
1 5 0.9000925 0.8991674
Credit 3 0.9508600 0.9410319
Credit 4 0.9529412 0.9500000
Credit 5 0.9567901 0.9567901

Graph Sem2 Persistence

library(ggplot2)
ggplot(Summary_All, aes(x = Comparison_SEM, y = Sem2_Persisted2, group = PathwayStudent, color = PathwayStudent)) +
  geom_line() +
  labs(x = "Comparison Semester", y = "SEM2 Persistence") +
  scale_color_manual(values = c("red", "blue","dark green"))

Graph SEM2 Retention

library(ggplot2)
ggplot(Summary_All, aes(x = Comparison_SEM, y = Sem2_Retained2, group = PathwayStudent, color = PathwayStudent)) +
  geom_line() +
  labs(x = "Comparison Semester", y = "SEM2 Retention") +
  scale_color_manual(values = c("red", "blue", "dark green"))

Get a summary of Year2 retention persistence for each Comparison_SEM level

Summary_All2 <-All_df5[All_df5$Comparison_Period!="202270",] %>%
  group_by(PathwayStudent, Comparison_SEM) %>%
  summarize(Year2_Persisted2 = sum(Year2_Persisted1)/n_distinct(ID),
            Year2_Retained2 = sum(Year2_Retained1)/n_distinct(ID))
## `summarise()` has grouped output by 'PathwayStudent'. You can override using
## the `.groups` argument.
kable(Summary_All2)
PathwayStudent Comparison_SEM Year2_Persisted2 Year2_Retained2
0 3 0.8585348 0.8437387
0 4 0.8948201 0.8861210
0 5 0.9366010 0.9334044
1 3 0.7937650 0.7817746
1 4 0.8521739 0.8478261
1 5 0.8171429 0.8160000
Credit 3 0.9266304 0.9157609
Credit 4 0.9235294 0.9205882
Credit 5 0.8931034 0.8931034

Graph Year2 Persistence

ggplot(Summary_All2, aes(x = Comparison_SEM, y = Year2_Persisted2, group = PathwayStudent, color = PathwayStudent)) +
  geom_line() +
  labs(x = "Comparison Semester", y = "Year2 Persistence") +
  scale_color_manual(values = c("red", "blue","dark green"))

Graph Year2 Retention

ggplot(Summary_All2, aes(x = Comparison_SEM, y = Year2_Retained2, group = PathwayStudent, color = PathwayStudent)) +
  geom_line() +
  labs(x = "Comparison Semester", y = "Year2 Retention") +
  scale_color_manual(values = c("red", "blue","dark green"))

Get a summary of Graduation at 100% for each Comparison_SEM level

Summary_All_Grad <-All_df5[All_df5$Supposed_Grad_Period<202350,] %>%
  group_by(PathwayStudent, Comparison_SEM) %>%
  summarize(Grad_at_100 = sum(Grad_at_100)/n_distinct(ID))
## `summarise()` has grouped output by 'PathwayStudent'. You can override using
## the `.groups` argument.
kable(Summary_All_Grad)
PathwayStudent Comparison_SEM Grad_at_100
0 3 0.5605925
0 4 0.6095656
0 5 0.6654235
1 3 0.4721408
1 4 0.5090909
1 5 0.5965714
Credit 3 0.6544343
Credit 4 0.6809211
Credit 5 0.7068966
# All_df5<- All_df5[,c(1:7,17,8:15)]

Graph Graduation at 100%

ggplot(Summary_All_Grad, aes(x = Comparison_SEM, y = Grad_at_100, group = PathwayStudent, color = PathwayStudent)) +
  geom_line() +
  labs(x = "Comparison Semester", y = "Graduation at 100%") +
  scale_color_manual(values = c("red", "blue","dark green"))