Warning: package 'ggplot2' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ purrr 1.0.4
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 4.0.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)library(lme4)
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
library(lmerTest)
Warning: package 'lmerTest' was built under R version 4.5.2
Attaching package: 'lmerTest'
The following object is masked from 'package:lme4':
lmer
The following object is masked from 'package:stats':
step
library("effectsize")
Warning: package 'effectsize' was built under R version 4.5.2
Import the Data
Clean the data
data_clean <- data %>%select(`Conference`, `School`, `Area Classification (0-Rural; 1-Urban)`, `Year`, `Tenure Year`, `In-Season_Game`, `S_Diversion`, `Attendance`, `Game Time`,`Game result (Win=1; Loss=0)`,`Athletic Dept Profit`, `Athletic Dept Total Expenses`, `Athletic Dept Total Revenues`) # Select the columndata_clean$GameTime_numeric <-as.numeric(format(data_clean$`Game Time`, "%H")) +as.numeric(format(data_clean$`Game Time`, "%M"))/60# Convert game time to a numerical variable and create a new columndata_clean$`Game Time`=format(data_clean$`Game Time`, format ="%H:%M") # Avoid game time impacted by the computer system datedata_clean <- data_clean %>%mutate(`Game Time`=as.character(`Game Time`)) %>%mutate(`Area Classification (0-Rural; 1-Urban)`=as.character(`Area Classification (0-Rural; 1-Urban)`)) %>%mutate(`Attendance`=as.numeric(`Attendance`)) # Convert the categorical variables to the factor data type.
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Attendance = as.numeric(Attendance)`.
Caused by warning:
! NAs introduced by coercion
cols_to_factor <- data_clean%>%select_if(is.character) %>%colnames() # Convert the categorical variables to the factor data type.cols_to_factor
data_clean <- data_clean %>%mutate(`Game result (Win=1; Loss=0)`=na_if(`Game result (Win=1; Loss=0)`, "N/A")) %>%mutate(across(all_of(cols_to_factor), as.factor)) # Make sure there is no NA level.
data_clean <-subset(data_clean, !is.na(`Game result (Win=1; Loss=0)`)) # Clean the game that is being cancelled.
data_clean$GameTime_numeric_c_1 <-with(data_clean, ifelse( GameTime_numeric >=9& GameTime_numeric <12, 1, # Morningifelse(GameTime_numeric >=12& GameTime_numeric <15.5, 2, # noonifelse(GameTime_numeric >=15.5& GameTime_numeric <19, 3, # afternoonifelse(GameTime_numeric >=19, 4, # eveningNA))) )) # Classify game time into four time slots.
data_clean$GameTime_numeric_c_2 <- data_clean$GameTime_numeric-12# Centralize game time by 12
data_clean$`In-Season_Game_Centered`<-with(data_clean,ifelse(`In-Season_Game`==1, 0,ifelse(`In-Season_Game`==2, 1,ifelse(`In-Season_Game`==3, 2,ifelse(`In-Season_Game`==4, 3,ifelse(`In-Season_Game`==5, 4,ifelse(`In-Season_Game`==6, 5,ifelse(`In-Season_Game`==7, 6,ifelse(`In-Season_Game`==8, 7,ifelse(`In-Season_Game`==9, 8, NA)))))))))) # Centralize in the in-season game to 0.
data_clean$Year_Centered <-with(data_clean,ifelse(`Year`==2003, 0,ifelse(`Year`==2004, 1,ifelse(`Year`==2005, 2,ifelse(`Year`==2006, 3,ifelse(`Year`==2007, 4,ifelse(`Year`==2008, 5,ifelse(`Year`==2009, 6,ifelse(`Year`==2010, 7,ifelse(`Year`==2011, 8,ifelse(`Year`==2012, 9,ifelse(`Year`==2013, 10,ifelse(`Year`==2014, 11,ifelse(`Year`==2015, 12,ifelse(`Year`==2016, 13,ifelse(`Year`==2017, 14,ifelse(`Year`==2018, 15,ifelse(`Year`==2019, 16,ifelse(`Year`==2020, 17,ifelse(`Year`==2021, 18,ifelse(`Year`==2022, 19,ifelse(`Year`==2023, 20,ifelse(`Year`==2024, 21,NA))))))))))))))))))))))) # Centralize the year to 0.
data_clean <- data_clean %>% dplyr::rename(`conference`=`Conference`,`school`=`School`,`area_classification`=`Area Classification (0-Rural; 1-Urban)`,`year`=`Year`,`tenure_year`=`Tenure Year`,`s_game`=`In-Season_Game`,`s_diversion`=`S_Diversion`,`attendance`=`Attendance`,`game_time`=`Game Time`,`game_result`=`Game result (Win=1; Loss=0)`,`profit`=`Athletic Dept Profit`,`total_expenses`=`Athletic Dept Total Expenses`,`total_revenues`=`Athletic Dept Total Revenues`,`game_time_chars_c_1`=`GameTime_numeric_c_1`,`game_time_num_c_2`=`GameTime_numeric_c_2`,`s_game_c`=`In-Season_Game_Centered`,`tenure_year_c`=`Tenure Year Centered`,`year_c`=`Year_Centered` ) %>%select(-`GameTime_numeric`) # Rename the columns.data_clean$game_time_chars_c_1 <-as.factor(data_clean$game_time_chars_c_1) # Change the variable into a factor variable.
# The variable Attendance belongs to the game level (level 1). Use the Centering Within cluster (CWC) method to centralize the attendance variable, making each cluster have a mean of 0.data_clean %<>%group_by(school) %>%mutate(attendance_mean_school =mean(attendance, na.rm =TRUE)) %>%mutate(attendance_cwc_school = attendance - attendance_mean_school) %>%ungroup()
# Rescale the attendance to make the variable have a mean of 0 and a standard deviation of 1 to solve the issue of different scales.data_clean %<>%group_by(school) %>%mutate(attendance_mean_school =mean(attendance, na.rm =TRUE)) %>%mutate(attendance_school_z = (attendance - attendance_mean_school)/sd(attendance, na.rm =TRUE)) %>%ungroup()
# The variable Attendance belongs to the game level (level 1). Use the Centering Within cluster (CWC) method to centralize the attendance variable, making each cluster have a mean of 0.data_clean %<>%group_by(year) %>%mutate(attendance_mean_year =mean(attendance, na.rm =TRUE)) %>%mutate(attendance_cwc_year = attendance - attendance_mean_year) %>%ungroup()
# Rescale the attendance to make the variable have a mean of 0 and a standard deviation of 1 to solve the issue of different scales.data_clean %<>%group_by(year) %>%mutate(attendance_mean_year =mean(attendance, na.rm =TRUE)) %>%mutate(attendance_year_z = (attendance - attendance_mean_year)/sd(attendance, na.rm =TRUE)) %>%ungroup()
# The variable total revenues belongs to the year level (level 2). Use the Centering Grand Mean (CGM) method to centralize the total revenues variable, making all individuals have a mean of 0.data_clean %<>%mutate(total_revenues_mean =mean(total_revenues, na.rm =TRUE)) %>%mutate(total_revenues_cgm = total_revenues - total_revenues_mean)
# Rescale the attendance to make the variable have a mean of 0 and a standard deviation of 1 to solve the issue of different scales.data_clean %<>%mutate(total_revenues_mean =mean(total_revenues, na.rm =TRUE)) %>%mutate(total_revenues_z = (total_revenues - total_revenues_mean)/sd(total_revenues,na.rm =TRUE))
Warning: Random slopes not present as fixed effects. This artificially inflates
the conditional random effect variances.
Respecify the fixed effects structure of your model (add random slopes
as fixed effects).
There is no strong curvature. The linearity assumption is mostly reasonable.
The vertical spread of points increases and decreases across the fitted values. Accordingly, the variance is not constant, which indicates heteroscedasticity.
The random intercept roughly follows the reference line. There is a slight deviation, so the normality assumption is being satisfied.
mf <-model.frame(model_school_r_1)res <-residuals(model_school_r_1)par(mfrow=c(2,3))plot(res ~ mf$game_time_num_c_2,main ="Residuals vs Game Time (centered)",xlab ="Game Time (centered)", ylab ="Residuals")plot(res ~ mf$attendance_school_z,main ="Residuals vs Attendance (Z-scoring)",xlab ="Attendance (Z-scoring)", ylab ="Residuals")plot(res ~ mf$total_revenues_z,main ="Residuals vs Total Revenues",xlab ="Total Revenues (Z-scoring)", ylab ="Residuals")plot(res ~ mf$s_game_c,main ="Residuals vs Season Game (centered)",xlab ="Season Game (centered)", ylab ="Residuals")plot(res ~ mf$tenure_year_c,main ="Residuals vs Tenure (centered)",xlab ="Tenure (centered)", ylab ="Residuals")plot(res ~ mf$area_classification,main ="Residuals vs Area (0/1)",xlab ="Area (0/1)", ylab ="Residuals")
plot(res ~ mf$conference,main ="Residuals vs Conference",xlab ="Conference", ylab ="Residuals")
Residuals vs Game Time (centered): No obvious pattern,the variance is constant, which indicates no heteroscedasticity.
Residuals vs Attendance (Z-scoring): Residuals cluster toward the center, which possible indicate heteroscedasticity.
Residuals vs Total Revenues (Z-scoring): Residuals cluster toward the center, which possible indicate heteroscedasticity.
Residuals vs Season Game (centered): No trend, which indicates constant variance.
Residuals vs Tenure (centered): Residual spread increases with higher tenure year, which indicates heteroscedasticity.
Residuals vs Area (0/1): Boxplots for Area 0 vs 1 show similar medians, and the variance is more likely to be constant.
Residuals vs Conference: Boxplots for Conference show similar medians, and the variance is more likely to be constant.
There is no strong curvature. The linearity assumption is mostly reasonable.
The vertical spread of points increases and decreases across the fitted values. Accordingly, the variance is not constant, which indicates heteroscedasticity.
The random intercept is not roughly follows the reference line, especially two tails, so the normality assumption is not being satisfied.
mf <-model.frame(model_school_r_2)res <-residuals(model_school_r_2)par(mfrow=c(2,3))plot(res ~ mf$game_time_num_c_2,main ="Residuals vs Game Time (centered)",xlab ="Game Time (centered)", ylab ="Residuals")plot(res ~ mf$attendance_school_z,main ="Residuals vs Attendance (Z-scoring)",xlab ="Attendance (Z-scoring)", ylab ="Residuals")plot(res ~ mf$total_revenues_z,main ="Residuals vs Total Revenues",xlab ="Total Revenues (Z-scoring)", ylab ="Residuals")plot(res ~ mf$s_game_c,main ="Residuals vs Season Game (centered)",xlab ="Season Game (centered)", ylab ="Residuals")plot(res ~ mf$tenure_year_c,main ="Residuals vs Tenure (centered)",xlab ="Tenure (centered)", ylab ="Residuals")plot(res ~ mf$area_classification,main ="Residuals vs Area (0/1)",xlab ="Area (0/1)", ylab ="Residuals")
plot(res ~ mf$conference,main ="Residuals vs Conference",xlab ="Conference", ylab ="Residuals")
Residuals vs Game Time (centered): No obvious pattern,the variance is constant, which indicates no heteroscedasticity.
Residuals vs Attendance (Z-scoring): Residuals cluster toward the center, which possible indicate heteroscedasticity.
Residuals vs Total Revenues (Z-scoring): Residuals cluster toward the center, which possible indicate heteroscedasticity.
Residuals vs Season Game (centered): No trend, which indicates constant variance.
Residuals vs Tenure (centered): Residual spread increases with higher tenure year, which indicates heteroscedasticity.
Residuals vs Area (0/1): Boxplots for Area 0 vs 1 show similar medians, and the variance is more likely to be constant.
Residuals vs Conference: Boxplots for Conference show similar medians, and the variance is more likely to be constant.