library(tidyverse)

set.seed(2018)

ids <- 1:5000
start_dates <- seq(as.Date("2010/01/01"), as.Date("2018/05/18"), by = "day")
departments <- c("Math", "Science", "English", "History", "Modern Languages")

generate_data <- function(reps, samp, size) {
  c(replicate(size, sample(samp, size = size, replace = TRUE)))
}

ids <- generate_data(100, ids, 100)
departments <- generate_data(100, departments, 100)
start_dates <- start_dates %>% 
  sample(500, TRUE) %>% 
  sample(10000, TRUE)

df <- tibble(
  student_id = ids,
  department = departments,
  class_start_date = start_dates
) %>% 
  mutate(
    quarter = lubridate::quarter(class_start_date),
    year = lubridate::year(class_start_date)
    ) %>% 
  mutate(
    quarter = case_when(
      quarter == 1 ~ "Winter",
      quarter == 2 ~ "Spring",
      quarter == 3 ~ "Summer",
      quarter == 4 ~ "Fall"
    ),
    school = case_when(
      department %in% c("Math", "Science") ~ "STEM",
      TRUE ~ "Liberal Arts"
    )
  ) %>% 
  mutate(quarter = paste(quarter, year)) %>%
  select(student_id, school, department, class_start_date, quarter, year)

glimpse(df)
## Observations: 10,000
## Variables: 6
## $ student_id       <int> 1681, 2319, 303, 988, 2372, 1506, 3034, 651, ...
## $ school           <chr> "Liberal Arts", "STEM", "Liberal Arts", "STEM...
## $ department       <chr> "History", "Math", "English", "Science", "Eng...
## $ class_start_date <date> 2010-03-14, 2013-03-29, 2014-08-04, 2011-07-...
## $ quarter          <chr> "Winter 2010", "Winter 2013", "Summer 2014", ...
## $ year             <dbl> 2010, 2013, 2014, 2011, 2013, 2012, 2017, 201...
quarters <- df %>% 
  arrange(class_start_date) %>% 
  pull(quarter) %>% 
  unique() 
quarters <- factor(quarters, levels = quarters)
quarters
##  [1] Winter 2010 Spring 2010 Summer 2010 Fall 2010   Winter 2011
##  [6] Spring 2011 Summer 2011 Fall 2011   Winter 2012 Spring 2012
## [11] Summer 2012 Fall 2012   Winter 2013 Spring 2013 Summer 2013
## [16] Fall 2013   Winter 2014 Spring 2014 Summer 2014 Fall 2014  
## [21] Winter 2015 Spring 2015 Summer 2015 Fall 2015   Winter 2016
## [26] Spring 2016 Summer 2016 Fall 2016   Winter 2017 Spring 2017
## [31] Summer 2017 Fall 2017   Winter 2018 Spring 2018
## 34 Levels: Winter 2010 Spring 2010 Summer 2010 Fall 2010 ... Spring 2018
df <- mutate(df, quarter = factor(quarter, levels = quarters))

calc_new_students <- function(quarter_td, ...) {
  
  quarter_td_integer <- as.integer(factor(quarter_td, levels = quarters))
  
  df %>% 
    filter(as.integer(quarter) <= quarter_td_integer) %>% 
    group_by(student_id) %>%
    mutate(unique_quarters = paste(unique(quarter), collapse = ", ")) %>%
    ungroup() %>% 
    filter(as.integer(quarter) == quarter_td_integer) %>% 
    distinct(student_id, .keep_all = TRUE) %>%
    mutate(student = if_else(unique_quarters == quarter_td, "New", "Returning")) %>% 
    group_by(student, ...) %>% 
    summarize(students = n()) %>% 
    arrange(...) %>% 
    mutate(quarter = quarter_td)
}

all_students <- quarters %>% 
  map_dfr(calc_new_students)
head(all_students)
## # A tibble: 6 x 3
##   student   students quarter    
##   <chr>        <int> <fct>      
## 1 New            272 Winter 2010
## 2 New            182 Spring 2010
## 3 Returning       17 Spring 2010
## 4 New            202 Summer 2010
## 5 Returning       22 Summer 2010
## 6 New            383 Fall 2010
ggplot(all_students, aes(quarter, students, fill = student)) +
  geom_bar(stat = "identity", position = "fill") +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  labs(x = "Quarter",
       y = "",
       title = "Ratio of New and Returning Students",
       subtitle = "2010-2018",
       fill = "Student") +
  hrbrthemes::theme_ipsum() +
  theme(axis.text.x = element_text(angle = 45))