Warning: package 'ggplot2' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ purrr 1.0.4
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 4.0.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)library(lme4)
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
library(lmerTest)
Warning: package 'lmerTest' was built under R version 4.5.2
Attaching package: 'lmerTest'
The following object is masked from 'package:lme4':
lmer
The following object is masked from 'package:stats':
step
library("effectsize")
Warning: package 'effectsize' was built under R version 4.5.2
Import the Data
Clean the data
# Select the columnsdata_clean <- data %>%select(`Conference`, `School`, `Area Classification (0-Rural; 1-Urban)`, `Year`, `Tenure Year`, `In-Season_Game`, `S_Diversion`, `Attendance`, `Game Time`,`Game result (Win=1; Loss=0)`,`Athletic Dept Profit`, `Athletic Dept Total Expenses`, `Athletic Dept Total Revenues`) # Convert game time to a numerical variable and create a new columndata_clean$GameTime_numeric <-as.numeric(format(data_clean$`Game Time`, "%H")) +as.numeric(format(data_clean$`Game Time`, "%M"))/60# Avoid game time impacted by the computer system datedata_clean$`Game Time`=format(data_clean$`Game Time`, format ="%H:%M") # Convert the categorical variables to the factor data typedata_clean <- data_clean %>%mutate(`Game Time`=as.character(`Game Time`)) %>%mutate(`Area Classification (0-Rural; 1-Urban)`=as.character(`Area Classification (0-Rural; 1-Urban)`)) %>%mutate(`Attendance`=as.numeric(`Attendance`))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Attendance = as.numeric(Attendance)`.
Caused by warning:
! NAs introduced by coercion
# Convert the categorical variables to the factor data typecols_to_factor <- data_clean%>%select_if(is.character) %>%colnames() cols_to_factor
# Make sure there is no NA level data_clean <- data_clean %>%mutate(`Game result (Win=1; Loss=0)`=na_if(`Game result (Win=1; Loss=0)`, "N/A")) %>%mutate(across(all_of(cols_to_factor), as.factor))
# Clean the game that is being cancelleddata_clean <-subset(data_clean, !is.na(`Game result (Win=1; Loss=0)`))
# Classify game time into four time slotsdata_clean$GameTime_numeric_c_1 <-with(data_clean, ifelse( GameTime_numeric >=9& GameTime_numeric <12, 1, # Morningifelse(GameTime_numeric >=12& GameTime_numeric <15.5, 2, # Noonifelse(GameTime_numeric >=15.5& GameTime_numeric <19, 3, # Afternoonifelse(GameTime_numeric >=19, 4, # EveningNA))) ))
# Centralize game time by 12data_clean$GameTime_numeric_c_2 <- data_clean$GameTime_numeric-12
# Centralize in the in-season game to 0data_clean$`In-Season_Game_Centered`<-with(data_clean,ifelse(`In-Season_Game`==1, 0,ifelse(`In-Season_Game`==2, 1,ifelse(`In-Season_Game`==3, 2,ifelse(`In-Season_Game`==4, 3,ifelse(`In-Season_Game`==5, 4,ifelse(`In-Season_Game`==6, 5,ifelse(`In-Season_Game`==7, 6,ifelse(`In-Season_Game`==8, 7,ifelse(`In-Season_Game`==9, 8, NA))))))))))
# Centralize the year to 0data_clean$Year_Centered <-with(data_clean,ifelse(`Year`==2003, 0,ifelse(`Year`==2004, 1,ifelse(`Year`==2005, 2,ifelse(`Year`==2006, 3,ifelse(`Year`==2007, 4,ifelse(`Year`==2008, 5,ifelse(`Year`==2009, 6,ifelse(`Year`==2010, 7,ifelse(`Year`==2011, 8,ifelse(`Year`==2012, 9,ifelse(`Year`==2013, 10,ifelse(`Year`==2014, 11,ifelse(`Year`==2015, 12,ifelse(`Year`==2016, 13,ifelse(`Year`==2017, 14,ifelse(`Year`==2018, 15,ifelse(`Year`==2019, 16,ifelse(`Year`==2020, 17,ifelse(`Year`==2021, 18,ifelse(`Year`==2022, 19,ifelse(`Year`==2023, 20,ifelse(`Year`==2024, 21,NA)))))))))))))))))))))))
# Rename the columnsdata_clean <- data_clean %>% dplyr::rename(`conference`=`Conference`,`school`=`School`,`area_classification`=`Area Classification (0-Rural; 1-Urban)`,`year`=`Year`,`tenure_year`=`Tenure Year`,`s_game`=`In-Season_Game`,`s_diversion`=`S_Diversion`,`attendance`=`Attendance`,`game_time`=`Game Time`,`game_result`=`Game result (Win=1; Loss=0)`,`profit`=`Athletic Dept Profit`,`total_expenses`=`Athletic Dept Total Expenses`,`total_revenues`=`Athletic Dept Total Revenues`,`game_time_chars_c_1`=`GameTime_numeric_c_1`,`game_time_num_c_2`=`GameTime_numeric_c_2`,`s_game_c`=`In-Season_Game_Centered`,`tenure_year_c`=`Tenure Year Centered`,`year_c`=`Year_Centered` ) %>%select(-`GameTime_numeric`) # Change the variable into a factor variabledata_clean$game_time_chars_c_1 <-as.factor(data_clean$game_time_chars_c_1)
# The variable Attendance belongs to the game level (level 1). Use the Centering Within cluster (CWC) method to centralize the attendance variable, making each cluster have a mean of 0.data_clean %<>%group_by(school) %>%mutate(attendance_mean_school =mean(attendance, na.rm =TRUE)) %>%mutate(attendance_cwc_school = attendance - attendance_mean_school) %>%ungroup()
# Rescale the attendance to make the variable have a mean of 0 and a standard deviation of 1 to solve the issue of different scales.data_clean %<>%group_by(school) %>%mutate(attendance_mean_school =mean(attendance, na.rm =TRUE)) %>%mutate(attendance_school_z = (attendance - attendance_mean_school)/sd(attendance, na.rm =TRUE)) %>%ungroup()
# The variable Attendance belongs to the game level (level 1). Use the Centering Within cluster (CWC) method to centralize the attendance variable, making each cluster have a mean of 0.data_clean %<>%group_by(year) %>%mutate(attendance_mean_year =mean(attendance, na.rm =TRUE)) %>%mutate(attendance_cwc_year = attendance - attendance_mean_year) %>%ungroup()
# Rescale the attendance to make the variable have a mean of 0 and a standard deviation of 1 to solve the issue of different scales.data_clean %<>%group_by(year) %>%mutate(attendance_mean_year =mean(attendance, na.rm =TRUE)) %>%mutate(attendance_year_z = (attendance - attendance_mean_year)/sd(attendance, na.rm =TRUE)) %>%ungroup()
# The variable total revenues belongs to the year level (level 2). Use the Centering Grand Mean (CGM) method to centralize the total revenues variable, making all individuals have a mean of 0.data_clean %<>%mutate(total_revenues_mean =mean(total_revenues, na.rm =TRUE)) %>%mutate(total_revenues_cgm = total_revenues - total_revenues_mean)
# Rescale the attendance to make the variable have a mean of 0 and a standard deviation of 1 to solve the issue of different scales.data_clean %<>%mutate(total_revenues_mean =mean(total_revenues, na.rm =TRUE)) %>%mutate(total_revenues_z = (total_revenues - total_revenues_mean)/sd(total_revenues,na.rm =TRUE))
The model that treats tenure years as a random slope performs better.
The ICC increases noticeably in the tenure-year model (model_school_r_2).
In attendance model (model_school_r_1), the random slope variance for attendance is extremely small (0.000872) and essentially zero correlation with the intercept. However, in tenure-year model (model_school_r_2), the random slope variance for tenure year is measurable (0.001163) with a correlation –0.20.
The Residual SD drops from 0.13007 in attendance model (model_school_r_1) to 0.1105 in tenure-year model (model_school_r_2).
Warning: Random slopes not present as fixed effects. This artificially inflates
the conditional random effect variances.
Respecify the fixed effects structure of your model (add random slopes
as fixed effects).
The random intercept roughly follows the reference line, so the random intercepts appear approximately normally distributed.
mf <-model.frame(model_school_r_1)res <-residuals(model_school_r_1)par(mfrow=c(2,3))plot(res ~ mf$game_time_num_c_2,main ="Residuals vs Game Time (centered)",xlab ="Game Time (centered)", ylab ="Residuals")plot(res ~ mf$attendance_school_z,main ="Residuals vs Attendance (Z-scoring)",xlab ="Attendance (Z-scoring)", ylab ="Residuals")plot(res ~ mf$total_revenues_z,main ="Residuals vs Total Revenues",xlab ="Total Revenues (Z-scoring)", ylab ="Residuals")plot(res ~ mf$s_game_c,main ="Residuals vs Season Game (centered)",xlab ="Season Game (centered)", ylab ="Residuals")plot(res ~ mf$tenure_year_c,main ="Residuals vs Tenure (centered)",xlab ="Tenure (centered)", ylab ="Residuals")plot(res ~ mf$area_classification,main ="Residuals vs Area (0/1)",xlab ="Area (0/1)", ylab ="Residuals")
plot(res ~ mf$conference,main ="Residuals vs Conference",xlab ="Conference", ylab ="Residuals")
There is no strong curvature. The linearity assumption is mostly reasonable.
The variability of the residuals seems to be non-constant (along the horizontal line at 0), suggesting a violation of the equal variance assumption of the error term.
The random intercept does not roughly follow the reference line, especially the head and tail, so the random intercepts do not appear approximately normally distributed.
mf <-model.frame(model_school_r_2)res <-residuals(model_school_r_2)par(mfrow=c(2,3))plot(res ~ mf$game_time_num_c_2,main ="Residuals vs Game Time (centered)",xlab ="Game Time (centered)", ylab ="Residuals")plot(res ~ mf$attendance_school_z,main ="Residuals vs Attendance (Z-scoring)",xlab ="Attendance (Z-scoring)", ylab ="Residuals")plot(res ~ mf$total_revenues_z,main ="Residuals vs Total Revenues",xlab ="Total Revenues (Z-scoring)", ylab ="Residuals")plot(res ~ mf$s_game_c,main ="Residuals vs Season Game (centered)",xlab ="Season Game (centered)", ylab ="Residuals")plot(res ~ mf$tenure_year_c,main ="Residuals vs Tenure (centered)",xlab ="Tenure (centered)", ylab ="Residuals")plot(res ~ mf$area_classification,main ="Residuals vs Area (0/1)",xlab ="Area (0/1)", ylab ="Residuals")
plot(res ~ mf$conference,main ="Residuals vs Conference",xlab ="Conference", ylab ="Residuals")