── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ purrr 1.0.4
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)library(lme4)
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
library(lmerTest)
Warning: package 'lmerTest' was built under R version 4.5.2
Attaching package: 'lmerTest'
The following object is masked from 'package:lme4':
lmer
The following object is masked from 'package:stats':
step
library("effectsize")
Warning: package 'effectsize' was built under R version 4.5.2
Import the Data
Clean the data
data_clean <- data %>%select(`Conference`, `School`, `Area Classification (0-Rural; 1-Urban)`, `Year`, `Tenure Year`, `In-Season_Game`, `S_Diversion`, `Attendance`, `Game Time`,`Game result (Win=1; Loss=0)`,`Athletic Dept Profit`, `Athletic Dept Total Expenses`, `Athletic Dept Total Revenues`) # select the columndata_clean$GameTime_numeric <-as.numeric(format(data_clean$`Game Time`, "%H")) +as.numeric(format(data_clean$`Game Time`, "%M"))/60# convert game time to numerical variable and create a new columndata_clean$`Game Time`=format(data_clean$`Game Time`, format ="%H:%M") # avoid game time impacted by computer system datedata_clean <- data_clean %>%mutate(`Game Time`=as.character(`Game Time`)) %>%mutate(`Area Classification (0-Rural; 1-Urban)`=as.character(`Area Classification (0-Rural; 1-Urban)`)) %>%mutate(`Attendance`=as.numeric(`Attendance`)) # Convert the variable to its suitable data type.
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Attendance = as.numeric(Attendance)`.
Caused by warning:
! NAs introduced by coercion
data_clean <- data_clean %>%mutate(`Game result (Win=1; Loss=0)`=na_if(`Game result (Win=1; Loss=0)`, "N/A")) %>%mutate(across(all_of(cols_to_factor), as.factor)) # Make sure there is no NA level.
data_clean <-subset(data_clean, !is.na(`Game result (Win=1; Loss=0)`)) # Clean the game that is being cancelled.
data_clean$GameTime_numeric_c_1 <-with(data_clean, ifelse( GameTime_numeric >=9& GameTime_numeric <12, 1, # Morningifelse(GameTime_numeric >=12& GameTime_numeric <15.5, 2, # noonifelse(GameTime_numeric >=15.5& GameTime_numeric <19, 3, # afternoonifelse(GameTime_numeric >=19, 4, # eveningNA))) )) # Classify game time into four time slots.
data_clean$GameTime_numeric_c_2 <- data_clean$GameTime_numeric-12# centralize game time by 12
data_clean$`In-Season_Game_Centered`<-with(data_clean,ifelse(`In-Season_Game`==1, 0,ifelse(`In-Season_Game`==2, 1,ifelse(`In-Season_Game`==3, 2,ifelse(`In-Season_Game`==4, 3,ifelse(`In-Season_Game`==5, 4,ifelse(`In-Season_Game`==6, 5,ifelse(`In-Season_Game`==7, 6,ifelse(`In-Season_Game`==8, 7,ifelse(`In-Season_Game`==9, 8, NA)))))))))) # Centralize in the in-season game to 0.
data_clean <- data_clean %>% dplyr::rename(`conference`=`Conference`,`school`=`School`,`area_classification`=`Area Classification (0-Rural; 1-Urban)`,`year`=`Year`,`tenure_year`=`Tenure Year`,`s_game`=`In-Season_Game`,`s_diversion`=`S_Diversion`,`attendance`=`Attendance`,`game_time`=`Game Time`,`game_result`=`Game result (Win=1; Loss=0)`,`profit`=`Athletic Dept Profit`,`total_expenses`=`Athletic Dept Total Expenses`,`total_revenues`=`Athletic Dept Total Revenues`,`game_time_chars_c_1`=`GameTime_numeric_c_1`,`game_time_num_c_2`=`GameTime_numeric_c_2`,`s_game_c`=`In-Season_Game_Centered`,`tenure_year_c`=`Tenure Year Centered`,`year_c`=`Year_Centered` ) %>%select(-`GameTime_numeric`) # rename the columns.data_clean$game_time_chars_c_1 <-as.factor(data_clean$game_time_chars_c_1) # Change the variable into a factor variable.
Might violate the heteroscedasticity. The spread of residuals is not constant across the range of fitted values.(fitted values < 0.2, variance is small)