── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ purrr 1.0.4
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
Import the Data
Clean the data
data_clean <- data %>%select(`Conference`, `School`, `Area Classification (0-Rural; 1-Urban)`, `Year`, `Tenure Year`, `In-Season_Game`, `S_Diversion`, `Attendance`, `Game Time`,`Game result (Win=1; Loss=0)`,`Athletic Dept Profit`, `Athletic Dept Total Expenses`, `Athletic Dept Total Revenues`) # select the columndata_clean$GameTime_numeric <-as.numeric(format(data_clean$`Game Time`, "%H")) +as.numeric(format(data_clean$`Game Time`, "%M"))/60# convert game time to numerical variable and create a new columndata_clean$`Game Time`=format(data_clean$`Game Time`, format ="%H:%M") # avoid game time impacted by computer system datedata_clean <- data_clean %>%mutate(`Game Time`=as.character(`Game Time`)) %>%mutate(`Area Classification (0-Rural; 1-Urban)`=as.character(`Area Classification (0-Rural; 1-Urban)`)) %>%mutate(`Attendance`=as.numeric(`Attendance`)) # Convert the variable to its suitable data type.
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Attendance = as.numeric(Attendance)`.
Caused by warning:
! NAs introduced by coercion
data_clean <- data_clean %>%mutate(`Game result (Win=1; Loss=0)`=na_if(`Game result (Win=1; Loss=0)`, "N/A")) %>%mutate(across(all_of(cols_to_factor), as.factor)) # Make sure there is no NA level.
data_clean <-subset(data_clean, !is.na(`Game result (Win=1; Loss=0)`)) # Clean the game that is being cancelled.
data_clean$GameTime_numeric_c_1 <-with(data_clean, ifelse( GameTime_numeric >=9& GameTime_numeric <12, 1, # Morningifelse(GameTime_numeric >=12& GameTime_numeric <17.5, 2, # noonifelse(GameTime_numeric >=17.5& GameTime_numeric <19, 3, # afternoonifelse(GameTime_numeric >=19, 4, # eveningNA))) )) # Classify game time into four time slots.
data_clean$GameTime_numeric_c_2 <- data_clean$GameTime_numeric-12# centralize game time by 12
data_clean$`In-Season_Game_Centered`<-with(data_clean,ifelse(`In-Season_Game`==1, 0,ifelse(`In-Season_Game`==2, 1,ifelse(`In-Season_Game`==3, 2,ifelse(`In-Season_Game`==4, 3,ifelse(`In-Season_Game`==5, 4,ifelse(`In-Season_Game`==6, 5,ifelse(`In-Season_Game`==7, 6,ifelse(`In-Season_Game`==8, 7,ifelse(`In-Season_Game`==9, 8, NA)))))))))) # Centralize in the in-season game to 0.
data_clean <- data_clean %>%rename(`conference`=`Conference`,`school`=`School`,`area_classification`=`Area Classification (0-Rural; 1-Urban)`,`year`=`Year`,`tenure_year`=`Tenure Year`,`s_game`=`In-Season_Game`,`s_diversion`=`S_Diversion`,`attendance`=`Attendance`,`game_time`=`Game Time`,`game_result`=`Game result (Win=1; Loss=0)`,`profit`=`Athletic Dept Profit`,`total_expenses`=`Athletic Dept Total Expenses`,`total_revenues`=`Athletic Dept Total Revenues`,`game_time_chars_c_1`=`GameTime_numeric_c_1`,`game_time_num_c_2`=`GameTime_numeric_c_2`,`s_game_c`=`In-Season_Game_Centered`,`tenure_year_c`=`Tenure Year Centered`,`year_c`=`Year_Centered` ) %>%select(-`GameTime_numeric`) # rename the columns.data_clean$game_time_chars_c_1 <-as.factor(data_clean$game_time_chars_c_1) # Change the variable into a factor variable.
data_clean %>%ggplot(mapping =aes(x = game_time_chars_c_1, y = s_diversion, colour =factor(conference))) +geom_point() +geom_smooth(mapping =aes(group = conference), method ="loess", se =FALSE, fullrange =TRUE) +labs(title ="s_diversion vs. game time (level by conference)",colour ="Conference") +theme_classic()+theme(plot.title =element_text(hjust =0.5, face ="bold"))
`geom_smooth()` using formula = 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 0.985
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 4.3782e-16
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1
data_clean %>%ggplot(mapping =aes(x = game_time_num_c_2, y = s_diversion, colour =factor(conference))) +geom_point() +geom_smooth(mapping =aes(group = conference), method ="loess", se =FALSE, fullrange =TRUE) +labs(title ="s_diversion vs. game time (level by conference)",colour ="Conference") +theme_classic()+theme(plot.title =element_text(hjust =0.5, face ="bold"))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 73 rows containing missing values or values outside the scale range
(`geom_smooth()`).