Stadium_Waste_Analysis_1

Author

Jingyi Yang

Install Packages

library(readxl)
library("readr")
library("tidyverse")
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ purrr     1.0.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lme4)   
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Import the Data

Clean the data

data_clean <- data %>% select(`Conference`, `School`, `Area Classification (0-Rural; 1-Urban)`, `Year`, `Tenure Year`, `In-Season_Game`, `S_Diversion`, `Attendance`, `Game Time`,`Game result (Win=1; Loss=0)`,`Athletic Dept Profit`, `Athletic Dept Total Expenses`, `Athletic Dept Total Revenues`) # select the column
data_clean$GameTime_numeric <- as.numeric(format(data_clean$`Game Time`, "%H")) + as.numeric(format(data_clean$`Game Time`, "%M"))/60 # convert game time to numerical variable and create a new column
data_clean$`Game Time`=format(data_clean$`Game Time`, format = "%H:%M") # avoid game time impacted by computer system date
data_clean <- data_clean %>% mutate(`Game Time`= as.character(`Game Time`)) %>% mutate(`Area Classification (0-Rural; 1-Urban)`= as.character(`Area Classification (0-Rural; 1-Urban)`)) %>% mutate(`Attendance`= as.numeric(`Attendance`)) # Convert the variable to its suitable data type. 
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Attendance = as.numeric(Attendance)`.
Caused by warning:
! NAs introduced by coercion
cols_to_factor <- data_clean%>% select_if(is.character) %>% colnames() 
cols_to_factor
[1] "Conference"                            
[2] "School"                                
[3] "Area Classification (0-Rural; 1-Urban)"
[4] "Game Time"                             
[5] "Game result (Win=1; Loss=0)"           
 data_clean <- data_clean %>% 
  mutate(`Game result (Win=1; Loss=0)` = na_if(`Game result (Win=1; Loss=0)`, "N/A")) %>%
          mutate(across(all_of(cols_to_factor), as.factor)) # Make sure there is no NA level.
data_clean <- subset(data_clean, !is.na(`Game result (Win=1; Loss=0)`)) # Clean the game that is being cancelled.
data_clean$GameTime_numeric_c_1 <- with(data_clean, ifelse(
  GameTime_numeric >= 9 &  GameTime_numeric < 12, 1,  # Morning
  ifelse(GameTime_numeric >= 12 & GameTime_numeric < 17.5, 2,   # noon
  ifelse(GameTime_numeric >= 17.5 & GameTime_numeric < 19, 3,   # afternoon
  ifelse(GameTime_numeric >= 19,  4,   # evening
  NA)))                                
)) # Classify game time into four time slots.
data_clean$GameTime_numeric_c_2 <-  data_clean$GameTime_numeric-12 # centralize game time by 12
data_clean$`In-Season_Game_Centered` <- with(data_clean,
  ifelse(`In-Season_Game` == 1, 0,
  ifelse(`In-Season_Game` == 2, 1,
  ifelse(`In-Season_Game` == 3, 2,
  ifelse(`In-Season_Game` == 4, 3,
  ifelse(`In-Season_Game` == 5, 4,
  ifelse(`In-Season_Game` == 6, 5,
  ifelse(`In-Season_Game` == 7, 6,
  ifelse(`In-Season_Game` == 8, 7,
  ifelse(`In-Season_Game` == 9, 8, NA)))))))))
) # Centralize in the in-season game to 0.
data_clean$`Tenure Year Centered` <- with(data_clean,
  ifelse(`Tenure Year` == 1, 0,
  ifelse(`Tenure Year` == 2, 1,
  ifelse(`Tenure Year` == 3, 2,
  ifelse(`Tenure Year` == 4, 3,
  ifelse(`Tenure Year` == 5, 4,
  ifelse(`Tenure Year` == 6, 5,
  ifelse(`Tenure Year` == 7, 6,
  ifelse(`Tenure Year` == 8, 7,
  ifelse(`Tenure Year` == 9, 8,
  ifelse(`Tenure Year` == 10, 9,
  ifelse(`Tenure Year` == 11, 10,
  ifelse(`Tenure Year` == 12, 11,
  ifelse(`Tenure Year` == 13, 12,
  ifelse(`Tenure Year` == 14, 13,
  ifelse(`Tenure Year` == 15, 14,
  ifelse(`Tenure Year` == 16, 15,
  ifelse(`Tenure Year` == 17, 16,
  ifelse(`Tenure Year` == 18, 17,
  ifelse(`Tenure Year` == 19, 18,
  ifelse(`Tenure Year` == 20, 19,
         NA))))))))))))))))))))
)  # Centralize the tenure year to 0.
data_clean$Year_Centered <- with(data_clean,
  ifelse(`Year` == 2003, 0,
  ifelse(`Year` == 2004, 1,
  ifelse(`Year` == 2005, 2,
  ifelse(`Year` == 2006, 3,
  ifelse(`Year` == 2007, 4,
  ifelse(`Year` == 2008, 5,
  ifelse(`Year` == 2009, 6,
  ifelse(`Year` == 2010, 7,
  ifelse(`Year` == 2011, 8,
  ifelse(`Year` == 2012, 9,
  ifelse(`Year` == 2013, 10,
  ifelse(`Year` == 2014, 11,
  ifelse(`Year` == 2015, 12,
  ifelse(`Year` == 2016, 13,
  ifelse(`Year` == 2017, 14,
  ifelse(`Year` == 2018, 15,
  ifelse(`Year` == 2019, 16,
  ifelse(`Year` == 2020, 17,
  ifelse(`Year` == 2021, 18,
  ifelse(`Year` == 2022, 19,
  ifelse(`Year` == 2023, 20,
  ifelse(`Year` == 2024, 21,
         NA))))))))))))))))))))))
) # centralize year to 0.
data_clean <- data_clean %>% rename(`conference`= `Conference`,
                                    `school`= `School`,
                                    `area_classification` = `Area Classification (0-Rural; 1-Urban)`,
                                    `year`= `Year`,
                                    `tenure_year` = `Tenure Year`,
                                    `s_game`= `In-Season_Game`,
                                    `s_diversion`= `S_Diversion`,
                                    `attendance`= `Attendance`,
                                    `game_time`= `Game Time`,
                                    `game_result`= `Game result (Win=1; Loss=0)`,
                                    `profit`= `Athletic Dept Profit`,
                                    `total_expenses`= `Athletic Dept Total Expenses`,
                                    `total_revenues`= `Athletic Dept Total Revenues`,
                                    `game_time_chars_c_1`= `GameTime_numeric_c_1`,
                                    `game_time_num_c_2`= `GameTime_numeric_c_2`,
                                    `s_game_c`= `In-Season_Game_Centered`,
                                    `tenure_year_c`= `Tenure Year Centered`,
                                    `year_c`= `Year_Centered`  
                                    ) %>% select(- `GameTime_numeric`) # rename the columns.

data_clean$game_time_chars_c_1 <-as.factor(data_clean$game_time_chars_c_1) # Change the variable into a factor variable.
str(data_clean)
tibble [1,390 × 18] (S3: tbl_df/tbl/data.frame)
 $ conference         : Factor w/ 5 levels "ACC","Big10",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ school             : Factor w/ 31 levels "Arizona State",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ area_classification: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ year               : num [1:1390] 2015 2015 2015 2015 2015 ...
 $ tenure_year        : num [1:1390] 1 1 1 1 1 1 1 2 2 2 ...
 $ s_game             : num [1:1390] 1 2 3 4 5 6 7 1 2 3 ...
 $ s_diversion        : num [1:1390] 0.44 0.412 0.315 0.57 0.579 ...
 $ attendance         : num [1:1390] 46500 43310 61904 44157 56534 ...
 $ game_time          : Factor w/ 54 levels "09:00","10:00",..: 49 44 46 44 46 13 15 49 44 44 ...
 $ game_result        : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 2 2 2 2 ...
 $ profit             : num [1:1390] 566524 566524 566524 566524 566524 ...
 $ total_expenses     : num [1:1390] 83873516 83873516 83873516 83873516 83873516 ...
 $ total_revenues     : num [1:1390] 84440040 84440040 84440040 84440040 84440040 ...
 $ game_time_chars_c_1: Factor w/ 4 levels "1","2","3","4": 4 4 4 4 4 2 2 4 4 4 ...
 $ game_time_num_c_2  : num [1:1390] 8 7 7.5 7 7.5 1 1.5 8 7 7 ...
 $ s_game_c           : num [1:1390] 0 1 2 3 4 5 6 0 1 2 ...
 $ tenure_year_c      : num [1:1390] 0 0 0 0 0 0 0 1 1 1 ...
 $ year_c             : num [1:1390] 12 12 12 12 12 12 12 13 13 13 ...

Analysis & Test

LM

data_clean %>% ggplot(mapping = aes(x = game_time_chars_c_1, y = s_diversion, colour = factor(conference))) +
  geom_point() +
  geom_smooth(mapping = aes(group = conference), method = "lm", se = FALSE, fullrange = TRUE) +
  labs(title = "s_diversion vs. game time (level by conference)",
       colour = "Conference") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
`geom_smooth()` using formula = 'y ~ x'

data_clean %>% ggplot(mapping = aes(x = game_time_num_c_2, y = s_diversion, colour = factor(conference))) +
  geom_point() +
  geom_smooth(mapping = aes(group = conference), method = "lm", se = FALSE, fullrange = TRUE) +
  labs(title = "s_diversion vs. game time (level by conference)",
       colour = "Conference") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
`geom_smooth()` using formula = 'y ~ x'

Boxplot

data_clean  %>% 
  ggplot(aes(x = game_time_chars_c_1, y = s_diversion, fill=school)) +
  geom_point(size=0.5) +
  geom_boxplot() +
  theme_classic()+
  theme(legend.position = "none")

data_clean  %>% 
  ggplot(aes(x = game_time_chars_c_1, y = s_diversion)) +
  geom_point(size=0.5) +
  geom_boxplot() +
  theme_classic() +
  theme(legend.position="none")

data_clean  %>% 
  ggplot(aes(x = game_time_chars_c_1, y = s_diversion, fill=conference)) +
  geom_point(size=0.5) +
  geom_boxplot() +
  theme_classic()+
  theme()

Loess

data_clean %>% ggplot(mapping = aes(x = game_time_chars_c_1, y = s_diversion, colour = factor(conference))) +
  geom_point() +
  geom_smooth(mapping = aes(group = conference), method = "loess", se = FALSE, fullrange = TRUE) +
  labs(title = "s_diversion vs. game time (level by conference)",
       colour = "Conference") +
  theme_classic()+
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
`geom_smooth()` using formula = 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 0.985
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 9.8031e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 4.0602
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 0.985
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 1.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 0
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 4.0602
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: zero-width neighborhood. make span bigger
Warning: Failed to fit group 2.
Caused by error in `predLoess()`:
! NA/NaN/Inf in foreign function call (arg 5)
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 4.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 2.9153e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 4.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 2.1483e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 4.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 2.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 4.3782e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1

data_clean %>% ggplot(mapping = aes(x = game_time_num_c_2, y = s_diversion, colour = factor(conference))) +
  geom_point() +
  geom_smooth(mapping = aes(group = conference), method = "loess", se = FALSE, fullrange = TRUE) +
  labs(title = "s_diversion vs. game time (level by conference)",
       colour = "Conference") +
  theme_classic()+
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 73 rows containing missing values or values outside the scale range
(`geom_smooth()`).

Multilevel Modeling

One Level

Year

year_model_1_null <- lmer(s_diversion ~ 1 + (1|year), data=data_clean)

summary(year_model_1_null)
Linear mixed model fit by REML ['lmerMod']
Formula: s_diversion ~ 1 + (1 | year)
   Data: data_clean

REML criterion at convergence: 424.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.1594 -0.7537 -0.2390  0.8449  2.2797 

Random effects:
 Groups   Name        Variance Std.Dev.
 year     (Intercept) 0.01307  0.1143  
 Residual             0.07647  0.2765  
Number of obs: 1390, groups:  year, 22

Fixed effects:
            Estimate Std. Error t value
(Intercept)  0.37894    0.02623   14.45

Random effects year (Intercept): 0.01307 There’s some variability in average s_diversion across years.

Fixed effects (Intercept): 0.37894 The overall average s_diversion is 0.37894 across all years.

School

school_model_1_null <- lmer(s_diversion ~ 1 + (1|school), data=data_clean)

summary(school_model_1_null)
Linear mixed model fit by REML ['lmerMod']
Formula: s_diversion ~ 1 + (1 | school)
   Data: data_clean

REML criterion at convergence: -1126.1

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.4297 -0.4986 -0.0120  0.5227  3.1356 

Random effects:
 Groups   Name        Variance Std.Dev.
 school   (Intercept) 0.06163  0.2482  
 Residual             0.02346  0.1532  
Number of obs: 1390, groups:  school, 31

Fixed effects:
            Estimate Std. Error t value
(Intercept)  0.37253    0.04489   8.299

Random effects year (Intercept): 0.06163 There is variability in the average s_diversion across schools.

Fixed effects (Intercept): 0.37253 The overall average s_diversion is 0.37253, across all schools.

Two Level

two_level_model_null <- lmer(s_diversion ~ 1+(1|school)+(1|school:year), data=data_clean)

summary(two_level_model_null)
Linear mixed model fit by REML ['lmerMod']
Formula: s_diversion ~ 1 + (1 | school) + (1 | school:year)
   Data: data_clean

REML criterion at convergence: -2448.2

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.6732 -0.4055 -0.0140  0.3816  5.6324 

Random effects:
 Groups      Name        Variance Std.Dev.
 school:year (Intercept) 0.019235 0.13869 
 school      (Intercept) 0.058885 0.24266 
 Residual                0.005853 0.07651 
Number of obs: 1390, groups:  school:year, 214; school, 31

Fixed effects:
            Estimate Std. Error t value
(Intercept)  0.37444    0.04518   8.288

Random effects year (Intercept): school:year 0.019235 school 0.058885 There is variance between schools, and the schools change over the years.

Fixed effects:0.37444 The overall average s_diversion is 0.37444, across all schools and years.