Stadium_Waste_Analysis_3

Author

Jingyi Yang

Install Packages

library(readxl)
library("readr")
library("tidyverse")
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ purrr     1.0.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lme4)   
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Import the Data

Clean the data

data_clean <- data %>% select(`Conference`, `School`, `Area Classification (0-Rural; 1-Urban)`, `Year`, `Tenure Year`, `In-Season_Game`, `S_Diversion`, `Attendance`, `Game Time`,`Game result (Win=1; Loss=0)`,`Athletic Dept Profit`, `Athletic Dept Total Expenses`, `Athletic Dept Total Revenues`) # select the column
data_clean$GameTime_numeric <- as.numeric(format(data_clean$`Game Time`, "%H")) + as.numeric(format(data_clean$`Game Time`, "%M"))/60 # convert game time to numerical variable and create a new column
data_clean$`Game Time`=format(data_clean$`Game Time`, format = "%H:%M") # avoid game time impacted by computer system date
data_clean <- data_clean %>% mutate(`Game Time`= as.character(`Game Time`)) %>% mutate(`Area Classification (0-Rural; 1-Urban)`= as.character(`Area Classification (0-Rural; 1-Urban)`)) %>% mutate(`Attendance`= as.numeric(`Attendance`)) # Convert the variable to its suitable data type. 
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Attendance = as.numeric(Attendance)`.
Caused by warning:
! NAs introduced by coercion
cols_to_factor <- data_clean%>% select_if(is.character) %>% colnames() 
cols_to_factor
[1] "Conference"                            
[2] "School"                                
[3] "Area Classification (0-Rural; 1-Urban)"
[4] "Game Time"                             
[5] "Game result (Win=1; Loss=0)"           
 data_clean <- data_clean %>% 
  mutate(`Game result (Win=1; Loss=0)` = na_if(`Game result (Win=1; Loss=0)`, "N/A")) %>%
          mutate(across(all_of(cols_to_factor), as.factor)) # Make sure there is no NA level.
data_clean <- subset(data_clean, !is.na(`Game result (Win=1; Loss=0)`)) # Clean the game that is being cancelled.
data_clean$GameTime_numeric_c_1 <- with(data_clean, ifelse(
  GameTime_numeric >= 9 &  GameTime_numeric < 12, 1,  # Morning
  ifelse(GameTime_numeric >= 12 & GameTime_numeric < 15.5, 2,   # noon
  ifelse(GameTime_numeric >= 15.5 & GameTime_numeric < 19, 3,   # afternoon
  ifelse(GameTime_numeric >= 19,  4,   # evening
  NA)))                                
)) # Classify game time into four time slots.
data_clean$GameTime_numeric_c_2 <-  data_clean$GameTime_numeric-12 # centralize game time by 12
data_clean$`In-Season_Game_Centered` <- with(data_clean,
  ifelse(`In-Season_Game` == 1, 0,
  ifelse(`In-Season_Game` == 2, 1,
  ifelse(`In-Season_Game` == 3, 2,
  ifelse(`In-Season_Game` == 4, 3,
  ifelse(`In-Season_Game` == 5, 4,
  ifelse(`In-Season_Game` == 6, 5,
  ifelse(`In-Season_Game` == 7, 6,
  ifelse(`In-Season_Game` == 8, 7,
  ifelse(`In-Season_Game` == 9, 8, NA)))))))))
) # Centralize in the in-season game to 0.
data_clean$`Tenure Year Centered` <- with(data_clean,
  ifelse(`Tenure Year` == 1, 0,
  ifelse(`Tenure Year` == 2, 1,
  ifelse(`Tenure Year` == 3, 2,
  ifelse(`Tenure Year` == 4, 3,
  ifelse(`Tenure Year` == 5, 4,
  ifelse(`Tenure Year` == 6, 5,
  ifelse(`Tenure Year` == 7, 6,
  ifelse(`Tenure Year` == 8, 7,
  ifelse(`Tenure Year` == 9, 8,
  ifelse(`Tenure Year` == 10, 9,
  ifelse(`Tenure Year` == 11, 10,
  ifelse(`Tenure Year` == 12, 11,
  ifelse(`Tenure Year` == 13, 12,
  ifelse(`Tenure Year` == 14, 13,
  ifelse(`Tenure Year` == 15, 14,
  ifelse(`Tenure Year` == 16, 15,
  ifelse(`Tenure Year` == 17, 16,
  ifelse(`Tenure Year` == 18, 17,
  ifelse(`Tenure Year` == 19, 18,
  ifelse(`Tenure Year` == 20, 19,
         NA))))))))))))))))))))
)  # Centralize the tenure year to 0.
data_clean$Year_Centered <- with(data_clean,
  ifelse(`Year` == 2003, 0,
  ifelse(`Year` == 2004, 1,
  ifelse(`Year` == 2005, 2,
  ifelse(`Year` == 2006, 3,
  ifelse(`Year` == 2007, 4,
  ifelse(`Year` == 2008, 5,
  ifelse(`Year` == 2009, 6,
  ifelse(`Year` == 2010, 7,
  ifelse(`Year` == 2011, 8,
  ifelse(`Year` == 2012, 9,
  ifelse(`Year` == 2013, 10,
  ifelse(`Year` == 2014, 11,
  ifelse(`Year` == 2015, 12,
  ifelse(`Year` == 2016, 13,
  ifelse(`Year` == 2017, 14,
  ifelse(`Year` == 2018, 15,
  ifelse(`Year` == 2019, 16,
  ifelse(`Year` == 2020, 17,
  ifelse(`Year` == 2021, 18,
  ifelse(`Year` == 2022, 19,
  ifelse(`Year` == 2023, 20,
  ifelse(`Year` == 2024, 21,
         NA))))))))))))))))))))))
) # centralize year to 0.
data_clean <- data_clean %>% dplyr::rename(`conference`= `Conference`,
                                    `school`= `School`,
                                    `area_classification` = `Area Classification (0-Rural; 1-Urban)`,
                                    `year`= `Year`,
                                    `tenure_year` = `Tenure Year`,
                                    `s_game`= `In-Season_Game`,
                                    `s_diversion`= `S_Diversion`,
                                    `attendance`= `Attendance`,
                                    `game_time`= `Game Time`,
                                    `game_result`= `Game result (Win=1; Loss=0)`,
                                    `profit`= `Athletic Dept Profit`,
                                    `total_expenses`= `Athletic Dept Total Expenses`,
                                    `total_revenues`= `Athletic Dept Total Revenues`,
                                    `game_time_chars_c_1`= `GameTime_numeric_c_1`,
                                    `game_time_num_c_2`= `GameTime_numeric_c_2`,
                                    `s_game_c`= `In-Season_Game_Centered`,
                                    `tenure_year_c`= `Tenure Year Centered`,
                                    `year_c`= `Year_Centered`
                                    ) %>% select(- `GameTime_numeric`) # rename the columns.

data_clean$game_time_chars_c_1 <-as.factor(data_clean$game_time_chars_c_1) # Change the variable into a factor variable.
# 
data_clean %<>%
  group_by(school) %>%
  mutate(attendance_mean_school = mean(attendance, na.rm = TRUE)) %>%
  mutate(attendance_cwc_school = attendance - attendance_mean_school) %>%
  ungroup()
data_clean %<>%
  group_by(year) %>%
  mutate(attendance_mean_year = mean(attendance, na.rm = TRUE)) %>%
  mutate(attendance_cwc_year = attendance - attendance_mean_year) %>%
  ungroup()
data_clean %<>%
mutate(total_revenues_mean = mean(total_revenues, na.rm = TRUE)) %>%
mutate(total_revenues_cgm = total_revenues - total_revenues_mean)
str(data_clean)
tibble [1,390 × 24] (S3: tbl_df/tbl/data.frame)
 $ conference            : Factor w/ 5 levels "ACC","Big10",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ school                : Factor w/ 31 levels "Arizona State",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ area_classification   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ year                  : num [1:1390] 2015 2015 2015 2015 2015 ...
 $ tenure_year           : num [1:1390] 1 1 1 1 1 1 1 2 2 2 ...
 $ s_game                : num [1:1390] 1 2 3 4 5 6 7 1 2 3 ...
 $ s_diversion           : num [1:1390] 0.44 0.412 0.315 0.57 0.579 ...
 $ attendance            : num [1:1390] 46500 43310 61904 44157 56534 ...
 $ game_time             : Factor w/ 54 levels "09:00","10:00",..: 49 44 46 44 46 13 15 49 44 44 ...
 $ game_result           : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 2 2 2 2 ...
 $ profit                : num [1:1390] 566524 566524 566524 566524 566524 ...
 $ total_expenses        : num [1:1390] 83873516 83873516 83873516 83873516 83873516 ...
 $ total_revenues        : num [1:1390] 84440040 84440040 84440040 84440040 84440040 ...
 $ game_time_chars_c_1   : Factor w/ 4 levels "1","2","3","4": 4 4 4 4 4 2 2 4 4 4 ...
 $ game_time_num_c_2     : num [1:1390] 8 7 7.5 7 7.5 1 1.5 8 7 7 ...
 $ s_game_c              : num [1:1390] 0 1 2 3 4 5 6 0 1 2 ...
 $ tenure_year_c         : num [1:1390] 0 0 0 0 0 0 0 1 1 1 ...
 $ year_c                : num [1:1390] 12 12 12 12 12 12 12 13 13 13 ...
 $ attendance_mean_school: num [1:1390] 50009 50009 50009 50009 50009 ...
 $ attendance_cwc_school : num [1:1390] -3509 -6699 11895 -5852 6525 ...
 $ attendance_mean_year  : num [1:1390] 70003 70003 70003 70003 70003 ...
 $ attendance_cwc_year   : num [1:1390] -23503 -26693 -8099 -25846 -13469 ...
 $ total_revenues_mean   : num [1:1390] 1.25e+08 1.25e+08 1.25e+08 1.25e+08 1.25e+08 ...
 $ total_revenues_cgm    : num [1:1390] -40621158 -40621158 -40621158 -40621158 -40621158 ...

Analysis & Test

LM

data_clean %>% ggplot(mapping = aes(x = game_time_chars_c_1, y = s_diversion, colour = factor(conference))) +
  geom_point() +
  geom_smooth(mapping = aes(group = conference), method = "lm", se = FALSE, fullrange = TRUE) +
  labs(title = "s_diversion vs. game time (level by conference)",
       colour = "Conference") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
`geom_smooth()` using formula = 'y ~ x'

data_clean %>% ggplot(mapping = aes(x = game_time_num_c_2, y = s_diversion, colour = factor(conference))) +
  geom_point() +
  geom_smooth(mapping = aes(group = conference), method = "lm", se = FALSE, fullrange = TRUE) +
  labs(title = "s_diversion vs. game time (level by conference)",
       colour = "Conference") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
`geom_smooth()` using formula = 'y ~ x'

Boxplot

data_clean  %>% 
  ggplot(aes(x = game_time_chars_c_1, y = s_diversion, fill=school)) +
  geom_point(size=0.5) +
  geom_boxplot() +
  theme_classic()+
  theme(legend.position = "none")

data_clean  %>% 
  ggplot(aes(x = game_time_chars_c_1, y = s_diversion)) +
  geom_point(size=0.5) +
  geom_boxplot() +
  theme_classic() +
  theme(legend.position="none")

data_clean  %>% 
  ggplot(aes(x = game_time_chars_c_1, y = s_diversion, fill=conference)) +
  geom_point(size=0.5) +
  geom_boxplot() +
  theme_classic()+
  theme()

Loess

data_clean %>% ggplot(mapping = aes(x = game_time_chars_c_1, y = s_diversion, colour = factor(conference))) +
  geom_point() +
  geom_smooth(mapping = aes(group = conference), method = "loess", se = FALSE, fullrange = TRUE) +
  labs(title = "s_diversion vs. game time (level by conference)",
       colour = "Conference") +
  theme_classic()+
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
`geom_smooth()` using formula = 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 0.985
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 8.6211e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 4.0602
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 0.985
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 9.2717e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 4.0602
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 4.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 3.3032e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 4.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 6.3755e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 4.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 7.9026e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1

data_clean %>% ggplot(mapping = aes(x = game_time_num_c_2, y = s_diversion, colour = factor(conference))) +
  geom_point() +
  geom_smooth(mapping = aes(group = conference), method = "loess", se = FALSE, fullrange = TRUE) +
  labs(title = "s_diversion vs. game time (level by conference)",
       colour = "Conference") +
  theme_classic()+
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 73 rows containing missing values or values outside the scale range
(`geom_smooth()`).

Multilevel Modeling

Two Level

Year

year_model_1_null <- lmer(s_diversion ~ 1 + (1|year), data=data_clean)

summary(year_model_1_null)
Linear mixed model fit by REML ['lmerMod']
Formula: s_diversion ~ 1 + (1 | year)
   Data: data_clean

REML criterion at convergence: 424.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.1594 -0.7537 -0.2390  0.8449  2.2797 

Random effects:
 Groups   Name        Variance Std.Dev.
 year     (Intercept) 0.01307  0.1143  
 Residual             0.07647  0.2765  
Number of obs: 1390, groups:  year, 22

Fixed effects:
            Estimate Std. Error t value
(Intercept)  0.37894    0.02623   14.45

Random effects year (Intercept): 0.01307 There’s some variability in average s_diversion across years.

Fixed effects (Intercept): 0.37894 The overall average s_diversion is 0.37894 across all years.

library(performance)
icc(year_model_1_null)
# Intraclass Correlation Coefficient

    Adjusted ICC: 0.146
  Unadjusted ICC: 0.146
library("sjPlot")
plot_model(year_model_1_null, type ='re', facet.grid = FALSE, sort.est = "sort.all", y.offset = .4)
Sorting each group of random effects ('sort.all') is not possible when 'facets = TRUE'.

School

school_model_1_null <- lmer(s_diversion ~ 1 + (1|school), data=data_clean)

summary(school_model_1_null)
Linear mixed model fit by REML ['lmerMod']
Formula: s_diversion ~ 1 + (1 | school)
   Data: data_clean

REML criterion at convergence: -1126.1

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.4297 -0.4986 -0.0120  0.5227  3.1356 

Random effects:
 Groups   Name        Variance Std.Dev.
 school   (Intercept) 0.06163  0.2482  
 Residual             0.02346  0.1532  
Number of obs: 1390, groups:  school, 31

Fixed effects:
            Estimate Std. Error t value
(Intercept)  0.37253    0.04489   8.299

Random effects year (Intercept): 0.06163 There is variability in the average s_diversion across schools.

Fixed effects (Intercept): 0.37253 The overall average s_diversion is 0.37253, across all schools.

library("sjPlot")
plot_model(school_model_1_null, type ='re', facet.grid = FALSE, sort.est = "sort.all", y.offset = .4)
Sorting each group of random effects ('sort.all') is not possible when 'facets = TRUE'.

library(performance)
icc(school_model_1_null)
# Intraclass Correlation Coefficient

    Adjusted ICC: 0.724
  Unadjusted ICC: 0.724

Three Level

two_level_model_null <- lmer(s_diversion ~ 1+(1|school)+(1|school:year), data=data_clean)

summary(two_level_model_null)
Linear mixed model fit by REML ['lmerMod']
Formula: s_diversion ~ 1 + (1 | school) + (1 | school:year)
   Data: data_clean

REML criterion at convergence: -2448.2

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.6732 -0.4055 -0.0140  0.3816  5.6324 

Random effects:
 Groups      Name        Variance Std.Dev.
 school:year (Intercept) 0.019235 0.13869 
 school      (Intercept) 0.058885 0.24266 
 Residual                0.005853 0.07651 
Number of obs: 1390, groups:  school:year, 214; school, 31

Fixed effects:
            Estimate Std. Error t value
(Intercept)  0.37444    0.04518   8.288

Random effects year (Intercept): school:year 0.019235 school 0.058885 There is variance between schools, and the schools change over the years.

Fixed effects:0.37444 The overall average s_diversion is 0.37444, across all schools and years.

plot_model(two_level_model_null, type ='re', facet.grid = FALSE, sort.est = "sort.all", y.offset = .4)
Sorting each group of random effects ('sort.all') is not possible when 'facets = TRUE'.
Sorting each group of random effects ('sort.all') is not possible when 'facets = TRUE'.
[[1]]


[[2]]

library(performance)
icc(two_level_model_null)
# Intraclass Correlation Coefficient

    Adjusted ICC: 0.930
  Unadjusted ICC: 0.930
 frequency_table_school <- table(data_clean$school) %>% data.frame()
      frequency_table_school
                  Var1 Freq
1        Arizona State   33
2             Arkansas    6
3               Auburn   36
4              Clemson   28
5  Colorado University   59
6                 Duke   38
7              Florida   43
8              Georgia   36
9         Georgia Tech   33
10            Illinois   28
11                Iowa   16
12            Kentucky   34
13                 LSU   36
14            Maryland   29
15            Michigan  143
16      Michigan State   21
17           Minnesota   42
18            NC State   31
19          Ohio State  123
20            Oklahoma   23
21          Penn State   82
22              Purdue   55
23            Stanford   32
24           Tennessee   77
25                UCLA    7
26                 UNC  100
27             UoTexas   54
28                 USC   16
29          Washington   32
30    Washington State   42
31           Wisconsin   55
      ggplot(frequency_table_school, aes(x = Var1, y = Freq)) +
             geom_bar(stat = "identity", fill = "black") +
             labs(title = "Barplot for School Categories", x =
"Categories", y = "Frequency") +
       theme_classic()+
       theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))

 table(data_clean$school,data_clean$year)
                     
                      2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
  Arizona State          0    0    0    0    0    0    0    0    0    0    0
  Arkansas               0    0    0    0    0    0    0    0    0    0    0
  Auburn                 0    0    0    0    0    0    0    0    0    0    0
  Clemson                0    0    0    0    0    0    0    0    0    0    0
  Colorado University    0    0    0    0    0    0    6    6    5    6    6
  Duke                   0    0    0    0    0    0    0    0    0    0    0
  Florida                0    0    0    0    0    0    0    0    0    0    6
  Georgia                0    0    0    0    0    0    0    0    0    0    0
  Georgia Tech           0    0    0    0    0    0    0    0    0    0    0
  Illinois               0    0    0    0    0    0    0    0    0    0    0
  Iowa                   0    0    0    0    0    0    0    0    0    0    0
  Kentucky               0    0    0    0    0    0    0    0    0    0    0
  LSU                    0    0    0    8    7    8    6    0    0    0    0
  Maryland               0    0    0    0    0    0    0    0    0    0    0
  Michigan               0    6    7    7    8    7    8    7    8    6    7
  Michigan State         0    0    0    0    0    0    0    0    0    0    0
  Minnesota              0    0    0    0    0    0    0    0    0    0    0
  NC State               0    0    0    0    0    0    0    0    0    0    0
  Ohio State             0    0    0    0    7    7    7    8    7    7    6
  Oklahoma               0    0    0    0    0    0    0    0    0    6    6
  Penn State             0    0    0    0    0    0    8    7    7    7    7
  Purdue                 0    0    0    0    0    0    0    0    0    7    7
  Stanford               0    0    0    0    0    0    0    0    0    0    0
  Tennessee              0    0    0    7    7    7    8    1    1    7    0
  UCLA                   0    0    0    0    0    0    0    0    0    0    0
  UNC                    0    6    6    7    6    7    7    6    7    7    7
  UoTexas                0    0    0    0    0    0    0    0    0    0    0
  USC                    0    0    0    0    0    0    0    0    0    0    0
  Washington             7    6    6    6    7    0    0    0    0    0    0
  Washington State       0    0    0    0    0    0    0    0    0    0    5
  Wisconsin              0    0    0    0    0    0    0    0    0    7    7
                     
                      2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
  Arizona State          0    7    6    7    6    7    0    0    0    0    0
  Arkansas               0    0    0    0    6    0    0    0    0    0    0
  Auburn                 0    7    8    7    7    7    0    0    0    0    0
  Clemson                0    0    7    7    7    7    0    0    0    0    0
  Colorado University    6    6    6    6    6    0    0    0    0    0    0
  Duke                   7    6    6    7    6    6    0    0    0    0    0
  Florida                6    7    5    6    7    6    0    0    0    0    0
  Georgia                0    7    6    6    7    7    3    0    0    0    0
  Georgia Tech           0    7    7    6    6    7    0    0    0    0    0
  Illinois               0    0    0    0    0    0    0    7    7    7    7
  Iowa                   0    0    7    6    3    0    0    0    0    0    0
  Kentucky               0    0    7    7    7    8    5    0    0    0    0
  LSU                    0    7    0    0    0    0    0    0    0    0    0
  Maryland               6    6    6    6    5    0    0    0    0    0    0
  Michigan               7    7    8    6    7    7    0    7    8    7    8
  Michigan State         0    0    7    7    7    0    0    0    0    0    0
  Minnesota              0    0    0    7    7    7    0    0    7    7    7
  NC State               0    6    6    6    7    6    0    0    0    0    0
  Ohio State             7    7    7    7    7    7    2    7    8    6    9
  Oklahoma               6    5    0    0    0    0    0    0    0    0    0
  Penn State             7    7    7    7    7    7    0    4    0    0    0
  Purdue                 7    7    7    6    7    7    0    0    0    0    0
  Stanford               0    7    6    6    6    7    0    0    0    0    0
  Tennessee              7    7    7    7    3    8    0    0    0    0    0
  UCLA                   0    0    0    0    7    0    0    0    0    0    0
  UNC                    6    7    6    7    5    3    0    0    0    0    0
  UoTexas                0    0    6    6    6    5    4    6    7    6    8
  USC                    0    7    0    0    0    6    3    0    0    0    0
  Washington             0    0    0    0    0    0    0    0    0    0    0
  Washington State       6    6    7    6    6    6    0    0    0    0    0
  Wisconsin              7    7    6    7    7    7    0    0    0    0    0
     df <- data.frame(table(data_clean$school,data_clean$year))
      ggplot(df, aes(x=Var2, y= Var1, fill=Freq)) +
     geom_tile(color="white", size = 0.25) +
     geom_text(aes(label = Freq)) +
     scale_fill_gradientn(colours = c("white", "gold", "darkorchid"),
values = scales::rescale(c(0,1,40)), limits=c(0, 40))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.