library(tidyverse)
set.seed(2018)
ids <- 1:5000
start_dates <- seq(as.Date("2010/01/01"), as.Date("2018/05/18"), by = "day")
departments <- c("Math", "Science", "English", "History", "Modern Languages")
generate_data <- function(reps, samp, size) {
c(replicate(size, sample(samp, size = size, replace = TRUE)))
}
ids <- generate_data(100, ids, 100)
departments <- generate_data(100, departments, 100)
start_dates <- start_dates %>%
sample(500, TRUE) %>%
sample(10000, TRUE)
df <- tibble(
student_id = ids,
department = departments,
class_start_date = start_dates
) %>%
mutate(
quarter = lubridate::quarter(class_start_date),
year = lubridate::year(class_start_date)
) %>%
mutate(
quarter = case_when(
quarter == 1 ~ "Winter",
quarter == 2 ~ "Spring",
quarter == 3 ~ "Summer",
quarter == 4 ~ "Fall"
),
school = case_when(
department %in% c("Math", "Science") ~ "STEM",
TRUE ~ "Liberal Arts"
)
) %>%
mutate(quarter = paste(quarter, year)) %>%
select(student_id, school, department, class_start_date, quarter, year)
glimpse(df)
## Observations: 10,000
## Variables: 6
## $ student_id <int> 1681, 2319, 303, 988, 2372, 1506, 3034, 651, ...
## $ school <chr> "Liberal Arts", "STEM", "Liberal Arts", "STEM...
## $ department <chr> "History", "Math", "English", "Science", "Eng...
## $ class_start_date <date> 2010-03-14, 2013-03-29, 2014-08-04, 2011-07-...
## $ quarter <chr> "Winter 2010", "Winter 2013", "Summer 2014", ...
## $ year <dbl> 2010, 2013, 2014, 2011, 2013, 2012, 2017, 201...
quarters <- df %>%
arrange(class_start_date) %>%
pull(quarter) %>%
unique()
quarters <- factor(quarters, levels = quarters)
quarters
## [1] Winter 2010 Spring 2010 Summer 2010 Fall 2010 Winter 2011
## [6] Spring 2011 Summer 2011 Fall 2011 Winter 2012 Spring 2012
## [11] Summer 2012 Fall 2012 Winter 2013 Spring 2013 Summer 2013
## [16] Fall 2013 Winter 2014 Spring 2014 Summer 2014 Fall 2014
## [21] Winter 2015 Spring 2015 Summer 2015 Fall 2015 Winter 2016
## [26] Spring 2016 Summer 2016 Fall 2016 Winter 2017 Spring 2017
## [31] Summer 2017 Fall 2017 Winter 2018 Spring 2018
## 34 Levels: Winter 2010 Spring 2010 Summer 2010 Fall 2010 ... Spring 2018
df <- mutate(df, quarter = factor(quarter, levels = quarters))
calc_new_students <- function(quarter_td, ...) {
quarter_td_integer <- as.integer(factor(quarter_td, levels = quarters))
df %>%
filter(as.integer(quarter) <= quarter_td_integer) %>%
group_by(student_id) %>%
mutate(unique_quarters = paste(unique(quarter), collapse = ", ")) %>%
ungroup() %>%
filter(as.integer(quarter) == quarter_td_integer) %>%
distinct(student_id, .keep_all = TRUE) %>%
mutate(student = if_else(unique_quarters == quarter_td, "New", "Returning")) %>%
group_by(student, ...) %>%
summarize(students = n()) %>%
arrange(...) %>%
mutate(quarter = quarter_td)
}
all_students <- quarters %>%
map_dfr(calc_new_students)
head(all_students)
## # A tibble: 6 x 3
## student students quarter
## <chr> <int> <fct>
## 1 New 272 Winter 2010
## 2 New 182 Spring 2010
## 3 Returning 17 Spring 2010
## 4 New 202 Summer 2010
## 5 Returning 22 Summer 2010
## 6 New 383 Fall 2010
ggplot(all_students, aes(quarter, students, fill = student)) +
geom_bar(stat = "identity", position = "fill") +
geom_hline(yintercept = 0.5, linetype = "dashed") +
labs(x = "Quarter",
y = "",
title = "Ratio of New and Returning Students",
subtitle = "2010-2018",
fill = "Student") +
hrbrthemes::theme_ipsum() +
theme(axis.text.x = element_text(angle = 45))
