combined_header <- paste(names(df), df[1, ], sep = "_")

# Remove the first row from the data
df <- df[-1, ]

# Set the combined value as the new header
names(df) <- combined_header
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
df[is.null(df)] <- NA
df[df == ""] <- NA
df$College_ <- na.locf(df$College_, na.rm = FALSE)
subset_df <- subset(df, Credential_ == "ADVANCED DIPLOMA")
clean_column_names <- function(names) {
  cleaned_names <- str_replace_all(names, "\\.{3}", "_")  # Replace "..." with "_"
  cleaned_names <- str_replace_all(cleaned_names, "[^a-zA-Z]_", "")  # Remove anything that is not a letter before "_"
  cleaned_names <- str_replace_all(cleaned_names, "NOVEMBER_|MARCH_|JUNE_", "")  # Remove NOVEMBER_, MARCH_, or JUNE_
  cleaned_names <- str_replace_all(cleaned_names, "_", ".")  # Replace "_" with "."
  return(cleaned_names)
}

library(stringr)

# Apply clean_column_names function to column names
colnames(subset_df) <- clean_column_names(colnames(subset_df))
subset_df<-subset_df[,c(1,3:23)]
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.1
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
long_data_fall <- subset_df %>%
  pivot_longer(
    cols = c(2:8),
    names_to = "Year",
    values_to = "Fall_Count"
  )

long_data_fall<-long_data_fall[,c(1,16,17)]
long_data_fall <- long_data_fall %>%
  mutate(Year = str_replace(Year, "FALL.", ""))

long_data_winter <- subset_df %>%
  pivot_longer(
    cols = c(9:15),
    names_to = "Year",
    values_to = "Winter_Count"
  )

long_data_winter<-long_data_winter[,c(1,16,17)]
long_data_winter <- long_data_winter %>%
  mutate(Year = str_replace(Year, "WINTER.", ""))

long_data_spring <- subset_df %>%
  pivot_longer(
    cols = c(16:22),
    names_to = "Year",
    values_to = "Spring_Count"
  )

long_data_spring<-long_data_spring[,c(1,16,17)]
long_data_spring <- long_data_spring %>%
  mutate(Year = str_replace(Year, "SPRING.", ""))

merged_data <- left_join(long_data_spring, long_data_fall, by = c("College.", "Year"))

# Join merged_data with long_data_winter based on College and Year
merged_data <- left_join(merged_data, long_data_winter, by = c("College.", "Year"))
library(ggplot2)

# Assuming your merged dataframe is named merged_data

merged_data <-merged_data[1:168,]

# Filter the data for the specified colleges
filtered_data <- merged_data %>%
  filter(College. %in% c("SHERIDAN", "SENECA", "GEORGE BROWN", "HUMBER", "CENTENNIAL", "DURHAM"))

filtered_data <- filtered_data %>%
  mutate_at(vars(3:5), ~as.numeric(gsub("[^0-9.]", "", .)))


# Plot Fall_Count across the Year, with each college represented by a different color
ggplot(filtered_data, aes(x = Year, y = Fall_Count, color = College., group=College.)) +
  geom_point(size = 3) +  # Use points to represent each data point
  geom_line() +  # Connect the points with lines
  labs(title = "Fall_Count across Years",
       x = "Year",
       y = "Fall_Count") +
  scale_color_discrete(name = "College") +  # Customize the legend title
  theme_minimal()  # Use a minimalistic theme

# Winter
ggplot(filtered_data, aes(x = Year, y = Winter_Count, color = College., group=College.)) +
  geom_point(size = 3) +  # Use points to represent each data point
  geom_line() +  # Connect the points with lines
  labs(title = "Winter_Count across Years",
       x = "Year",
       y = "Winter_Count") +
  scale_color_discrete(name = "College") +  # Customize the legend title
  theme_minimal() 

# Spring
ggplot(filtered_data, aes(x = Year, y = Spring_Count, color = College., group=College.)) +
  geom_point(size = 3) +  # Use points to represent each data point
  geom_line() +  # Connect the points with lines
  labs(title = "Spring_Count across Years",
       x = "Year",
       y = "Spring_Count") +
  scale_color_discrete(name = "College") +  # Customize the legend title
  theme_minimal() 

For fall enrolment, Humber and Seneca shows very similar pattern across the years, and both are still in recovery.

The enrollment at George Brown College experienced a significant decline due to the pandemic, it dropped below Humber and Senece in 2021 and 2022. However, since 2023, there has been a noticeable recovery in enrollment numbers at George Brown.

Centennial has recovered to pre-pandemic enrolment count.

Seneca has a lot more spring intakes than other colleges.

Looking at market share by year

filtered_data <- filtered_data %>%
  mutate_at(vars(3:5), ~as.numeric(gsub("[^0-9.]", "", .)))

percentage_data <- filtered_data %>%
  group_by(Year) %>%
  summarise(total_count = sum(Fall_Count)) %>%
  left_join(filtered_data, by = "Year") %>%
  mutate(percentage = Fall_Count / total_count * 100)

ggplot(percentage_data, aes(x = Year, y = percentage, color = College., group = College.)) +
  geom_point(size = 3)+
  geom_line() +
  labs(title = "Percentage of Fall_Count by Year",
       x = "Year",
       y = "Percentage") +
  scale_color_discrete(name = "College") +  # Customize the legend title
  theme_minimal()  # Use a minimalistic theme

Because Sheridan did not have many DA enrolment in Fall 2022, the remaining 4 GTA colleges saw an increase in market share.

Centennial’s market share has increased significantly post-pandemic, while Humber and Seneca’s market share still in recovery, so is Sheridan.

George Brown’s market share has returned to pre-pandemic level.

Durham’s market share increased during the pandemic and started to drop post-pandemic

System Market Share

merged_data <- merged_data %>%
  mutate_at(vars(3:5), ~as.numeric(gsub("[^0-9.]", "", .)))

percentage_data1 <- merged_data %>%
  group_by(Year) %>%
  summarise(total_count = sum(Fall_Count)) %>%
  left_join(merged_data, by = "Year") %>%
  mutate(percentage = Fall_Count / total_count * 100)

label_data <- percentage_data1 %>%
  group_by(College.) %>%
  summarise(Year = first(Year),
            percentage = last(percentage))  # You can adjust the summarizing function as needed


ggplot(percentage_data1, aes(x = Year, y = percentage, group = College.)) +
  geom_point(aes(color = College.), size = 3) +
  geom_line(aes(color = College.), show.legend = FALSE) +
  geom_text(data = label_data, aes(label = College., x = Year, y = percentage, color = College.), 
            hjust = -0.2, vjust = -0.2, size = 3) +  # Add labels to the lines
  labs(title = "Percentage of Fall_Count by Year",
       x = "Year",
       y = "Percentage") +
  scale_color_discrete(guide = FALSE) +  # Remove the color legend
  theme_minimal()  # Use a minimalistic
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Fanshawe, Algonquin, Centennial, Conestoga’s market share has increased post-pandemic compared to pre-pandemic