── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ purrr 1.0.4
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)library(lme4)
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Import the Data
Clean the data
data_clean <- data %>%select(`Conference`, `School`, `Area Classification (0-Rural; 1-Urban)`, `Year`, `Tenure Year`, `In-Season_Game`, `S_Diversion`, `Attendance`, `Game Time`,`Game result (Win=1; Loss=0)`,`Athletic Dept Profit`, `Athletic Dept Total Expenses`, `Athletic Dept Total Revenues`) # select the columndata_clean$GameTime_numeric <-as.numeric(format(data_clean$`Game Time`, "%H")) +as.numeric(format(data_clean$`Game Time`, "%M"))/60# convert game time to numerical variable and create a new columndata_clean$`Game Time`=format(data_clean$`Game Time`, format ="%H:%M") # avoid game time impacted by computer system datedata_clean <- data_clean %>%mutate(`Game Time`=as.character(`Game Time`)) %>%mutate(`Area Classification (0-Rural; 1-Urban)`=as.character(`Area Classification (0-Rural; 1-Urban)`)) %>%mutate(`Attendance`=as.numeric(`Attendance`)) # Convert the variable to its suitable data type.
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Attendance = as.numeric(Attendance)`.
Caused by warning:
! NAs introduced by coercion
data_clean <- data_clean %>%mutate(`Game result (Win=1; Loss=0)`=na_if(`Game result (Win=1; Loss=0)`, "N/A")) %>%mutate(across(all_of(cols_to_factor), as.factor)) # Make sure there is no NA level.
data_clean <-subset(data_clean, !is.na(`Game result (Win=1; Loss=0)`)) # Clean the game that is being cancelled.
data_clean$GameTime_numeric_c_1 <-with(data_clean, ifelse( GameTime_numeric >=9& GameTime_numeric <12, 1, # Morningifelse(GameTime_numeric >=12& GameTime_numeric <17.5, 2, # noonifelse(GameTime_numeric >=17.5& GameTime_numeric <19, 3, # afternoonifelse(GameTime_numeric >=19, 4, # eveningNA))) )) # Classify game time into four time slots.
data_clean$GameTime_numeric_c_2 <- data_clean$GameTime_numeric-12# centralize game time by 12
data_clean$`In-Season_Game_Centered`<-with(data_clean,ifelse(`In-Season_Game`==1, 0,ifelse(`In-Season_Game`==2, 1,ifelse(`In-Season_Game`==3, 2,ifelse(`In-Season_Game`==4, 3,ifelse(`In-Season_Game`==5, 4,ifelse(`In-Season_Game`==6, 5,ifelse(`In-Season_Game`==7, 6,ifelse(`In-Season_Game`==8, 7,ifelse(`In-Season_Game`==9, 8, NA)))))))))) # Centralize in the in-season game to 0.
data_clean <- data_clean %>%rename(`conference`=`Conference`,`school`=`School`,`area_classification`=`Area Classification (0-Rural; 1-Urban)`,`year`=`Year`,`tenure_year`=`Tenure Year`,`s_game`=`In-Season_Game`,`s_diversion`=`S_Diversion`,`attendance`=`Attendance`,`game_time`=`Game Time`,`game_result`=`Game result (Win=1; Loss=0)`,`profit`=`Athletic Dept Profit`,`total_expenses`=`Athletic Dept Total Expenses`,`total_revenues`=`Athletic Dept Total Revenues`,`game_time_chars_c_1`=`GameTime_numeric_c_1`,`game_time_num_c_2`=`GameTime_numeric_c_2`,`s_game_c`=`In-Season_Game_Centered`,`tenure_year_c`=`Tenure Year Centered`,`year_c`=`Year_Centered` ) %>%select(-`GameTime_numeric`) # rename the columns.data_clean$game_time_chars_c_1 <-as.factor(data_clean$game_time_chars_c_1) # Change the variable into a factor variable.
data_clean %>%ggplot(mapping =aes(x = game_time_chars_c_1, y = s_diversion, colour =factor(conference))) +geom_point() +geom_smooth(mapping =aes(group = conference), method ="loess", se =FALSE, fullrange =TRUE) +labs(title ="s_diversion vs. game time (level by conference)",colour ="Conference") +theme_classic()+theme(plot.title =element_text(hjust =0.5, face ="bold"))
`geom_smooth()` using formula = 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 0.985
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 4.3782e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1
data_clean %>%ggplot(mapping =aes(x = game_time_num_c_2, y = s_diversion, colour =factor(conference))) +geom_point() +geom_smooth(mapping =aes(group = conference), method ="loess", se =FALSE, fullrange =TRUE) +labs(title ="s_diversion vs. game time (level by conference)",colour ="Conference") +theme_classic()+theme(plot.title =element_text(hjust =0.5, face ="bold"))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 73 rows containing missing values or values outside the scale range
(`geom_smooth()`).
Linear mixed model fit by REML ['lmerMod']
Formula: s_diversion ~ 1 + (1 | year)
Data: data_clean
REML criterion at convergence: 424.5
Scaled residuals:
Min 1Q Median 3Q Max
-2.1594 -0.7537 -0.2390 0.8449 2.2797
Random effects:
Groups Name Variance Std.Dev.
year (Intercept) 0.01307 0.1143
Residual 0.07647 0.2765
Number of obs: 1390, groups: year, 22
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.37894 0.02623 14.45
Random effects year (Intercept): 0.01307 There’s some variability in average s_diversion across years.
Fixed effects (Intercept): 0.37894 The overall average s_diversion is 0.37894 across all years.
Linear mixed model fit by REML ['lmerMod']
Formula: s_diversion ~ 1 + (1 | school)
Data: data_clean
REML criterion at convergence: -1126.1
Scaled residuals:
Min 1Q Median 3Q Max
-3.4297 -0.4986 -0.0120 0.5227 3.1356
Random effects:
Groups Name Variance Std.Dev.
school (Intercept) 0.06163 0.2482
Residual 0.02346 0.1532
Number of obs: 1390, groups: school, 31
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.37253 0.04489 8.299
Random effects year (Intercept): 0.06163 There is variability in the average s_diversion across schools.
Fixed effects (Intercept): 0.37253 The overall average s_diversion is 0.37253, across all schools.
Linear mixed model fit by REML ['lmerMod']
Formula: s_diversion ~ 1 + (1 | school) + (1 | school:year)
Data: data_clean
REML criterion at convergence: -2448.2
Scaled residuals:
Min 1Q Median 3Q Max
-5.6732 -0.4055 -0.0140 0.3816 5.6324
Random effects:
Groups Name Variance Std.Dev.
school:year (Intercept) 0.019235 0.13869
school (Intercept) 0.058885 0.24266
Residual 0.005853 0.07651
Number of obs: 1390, groups: school:year, 214; school, 31
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.37444 0.04518 8.288
Random effects year (Intercept): school:year 0.019235 school 0.058885 There is variance between schools, and the schools change over the years.
Fixed effects:0.37444 The overall average s_diversion is 0.37444, across all schools and years.