# Season-level win-percentage regressions for three data sets:
#   Q1 - df_2022 / df_2023 / df_2024 (columns wp, rs, ra, TEAM_NAME)
#   Q2 - NFL 2021, 2022, 2023 workbooks (columns WP, PF, PA, TEAM_NAME)
#   Q3 - NHL 2021-22, 2022-23, 2023-24 workbooks (columns WP, GF, GA,
#        TEAM_NAME)
#
# Packages are attached once, up front. Install them once from the console
# (install.packages(c("readxl", "dplyr", "car"))) instead of on every run.
library(readxl)
library(dplyr)
library(car)

# Helpers ----

# Drop every column whose name is NA or the empty string. dplyr verbs refuse
# to work on data frames that carry such columns.
drop_unnamed_cols <- function(df) {
  keep <- !is.na(names(df)) & names(df) != ""
  df[, keep, drop = FALSE]
}

# Test H0: playoff = 0 on a fitted lm. linearHypothesis() aborts when the
# coefficient is aliased (NA), which happens whenever `playoff` is a linear
# combination of other regressors (e.g. team dummies) or is constant. In that
# case report it and return NULL instead of stopping the script.
test_playoff_effect <- function(model) {
  if (is.na(coef(model)[["playoff"]])) {
    message(
      "Skipping test: the 'playoff' coefficient is aliased in ",
      deparse(formula(model))
    )
    return(invisible(NULL))
  }
  linearHypothesis(model, "playoff = 0")
}

# Save the console history. The target must NOT be the .Rproj file: writing
# history there overwrites the RStudio project configuration. History is only
# available in interactive sessions, so do nothing otherwise.
save_session_history <- function(path = "~/project/project.Rhistory") {
  if (interactive()) {
    savehistory(path)
  }
  invisible(path)
}

# Q1 ----

# The three season data frames are not created in this script; they must
# already exist in the workspace.
stopifnot(exists("df_2022"), exists("df_2023"), exists("df_2024"))

# Check column names for each dataset
colnames(df_2022)
colnames(df_2023)
colnames(df_2024)

# Make sure column names are the same.
# NOTE(review): this relabels columns by position, so it assumes all three
# data frames hold the same columns in the same order - confirm.
colnames(df_2023) <- colnames(df_2022)
colnames(df_2024) <- colnames(df_2022)

# Remove columns with NA or empty names (same rule for every season)
df_2022 <- drop_unnamed_cols(df_2022)
df_2023 <- drop_unnamed_cols(df_2023)
df_2024 <- drop_unnamed_cols(df_2024)

# Calculate the average runs scored per season
r_avg_2022 <- mean(df_2022$rs, na.rm = TRUE)
r_avg_2023 <- mean(df_2023$rs, na.rm = TRUE)
r_avg_2024 <- mean(df_2024$rs, na.rm = TRUE)

# Add standardized RS and RA for each season (both scaled by 4 * average rs)
df_2022 <- df_2022 %>%
  mutate(
    rs_standardized = rs / (4 * r_avg_2022),
    ra_standardized = ra / (4 * r_avg_2022)
  )
df_2023 <- df_2023 %>%
  mutate(
    rs_standardized = rs / (4 * r_avg_2023),
    ra_standardized = ra / (4 * r_avg_2023)
  )
df_2024 <- df_2024 %>%
  mutate(
    rs_standardized = rs / (4 * r_avg_2024),
    ra_standardized = ra / (4 * r_avg_2024)
  )

# Combine all seasons
df_all <- bind_rows(df_2022, df_2023, df_2024)

# Convert TEAM_NAME to a factor for fixed effects
df_all$TEAM_NAME <- as.factor(df_all$TEAM_NAME)

# `independent_var` is not built anywhere in this section, so it has to come
# with the input data. Fail with a clear message if it does not.
# NOTE(review): Q2/Q3 define it as (scored - allowed) / (4 * season average);
# presumably Q1 should use (rs - ra) / (4 * r_avg) - confirm and add it here.
if (!"independent_var" %in% names(df_all)) {
  stop("df_all has no 'independent_var' column; create it before Q1.",
    call. = FALSE
  )
}

# Run the fixed effects regression
model_fixed <- lm(wp ~ independent_var + TEAM_NAME, data = df_all)
summary(model_fixed)

# Hypothesis test for beta = 2
linearHypothesis(model_fixed, "independent_var = 2")

# Run the regression with separate RS and RA
model_separate <- lm(
  wp ~ rs_standardized + ra_standardized + TEAM_NAME,
  data = df_all
)
summary(model_separate)

# Hypothesis test for equality of coefficients
# NOTE(review): if the intent is "equal magnitude, opposite sign", the
# hypothesis should be "rs_standardized + ra_standardized = 0" - confirm.
linearHypothesis(model_separate, "rs_standardized = ra_standardized")

# Predict win percentage using the fixed effects model
df_all$WPGUESS <- predict(model_fixed, newdata = df_all)

# Calculate prediction error
df_all$WPERR <- df_all$wp - df_all$WPGUESS

# Example: assuming these are playoff teams (replace with actual names)
playoff_teams <- c(
  "TEAMNYY", "TEAMLAD", "TEAMHOU", "TEAMATL", "TEAMBOS", "TEAMLAA"
)

# Create playoff dummy (1 = playoff team, 0 = otherwise)
df_all$playoff <- as.numeric(df_all$TEAM_NAME %in% playoff_teams)

# Run regression with WPERR as the dependent variable
# NOTE(review): `playoff` is determined entirely by TEAM_NAME, so with team
# dummies in the model its coefficient is aliased (reported as NA). Consider
# dropping TEAM_NAME here or defining playoff per team-season.
model_playoff <- lm(WPERR ~ playoff + TEAM_NAME, data = df_all)
summary(model_playoff)

# Q2 ----

# Load the datasets
nfl_2021 <- read_excel("NFL 2021 Season.xlsx")
nfl_2022 <- read_excel("NFL 2022 Season.xlsx")
nfl_2023 <- read_excel("NFL 2023 Season.xlsx")

# Calculate the average points scored per season
pf_avg_2021 <- mean(nfl_2021$PF, na.rm = TRUE)
pf_avg_2022 <- mean(nfl_2022$PF, na.rm = TRUE)
pf_avg_2023 <- mean(nfl_2023$PF, na.rm = TRUE)

# Create the independent variable for each season. This must happen before
# any model that references `independent_var` is fit.
nfl_2021 <- nfl_2021 %>%
  mutate(independent_var = (PF - PA) / (4 * pf_avg_2021))
nfl_2022 <- nfl_2022 %>%
  mutate(independent_var = (PF - PA) / (4 * pf_avg_2022))
nfl_2023 <- nfl_2023 %>%
  mutate(independent_var = (PF - PA) / (4 * pf_avg_2023))

# Run linear regression for each season
model_2021 <- lm(WP ~ independent_var, data = nfl_2021)
model_2022 <- lm(WP ~ independent_var, data = nfl_2022)
model_2023 <- lm(WP ~ independent_var, data = nfl_2023)

# View summaries
summary(model_2021)
summary(model_2022)
summary(model_2023)

# Hypothesis test for beta = 2
linearHypothesis(model_2021, "independent_var = 2")
linearHypothesis(model_2022, "independent_var = 2")
linearHypothesis(model_2023, "independent_var = 2")

# Combine data from all seasons
nfl_all <- bind_rows(nfl_2021, nfl_2022, nfl_2023)

# Ensure TEAM_NAME is a factor for fixed effects
nfl_all$TEAM_NAME <- as.factor(nfl_all$TEAM_NAME)

# Run fixed effects regression
model_fixed_nfl <- lm(WP ~ independent_var + TEAM_NAME, data = nfl_all)
summary(model_fixed_nfl)

# Hypothesis test for beta = 2
linearHypothesis(model_fixed_nfl, "independent_var = 2")

# Calculate average points for all seasons combined
pf_avg_all <- mean(nfl_all$PF, na.rm = TRUE)

# Create standardized PF and PA
nfl_all <- nfl_all %>%
  mutate(
    pf_standardized = PF / (4 * pf_avg_all),
    pa_standardized = PA / (4 * pf_avg_all)
  )

# Run the regression with separate PF and PA
model_separate_nfl <- lm(
  WP ~ pf_standardized + pa_standardized + TEAM_NAME,
  data = nfl_all
)
summary(model_separate_nfl)

# Hypothesis test for equality of coefficients
linearHypothesis(model_separate_nfl, "pf_standardized = pa_standardized")

# Q3 ----

# Load the NHL datasets
nhl_2021 <- read_excel("NHL 2021-2022 Season.xlsx")
nhl_2022 <- read_excel("NHL 2022-2023 Season.xlsx")
nhl_2023 <- read_excel("NHL 2023-2024 Season.xlsx")

# Calculate average goals scored per season
gf_avg_2021 <- mean(nhl_2021$GF, na.rm = TRUE)
gf_avg_2022 <- mean(nhl_2022$GF, na.rm = TRUE)
gf_avg_2023 <- mean(nhl_2023$GF, na.rm = TRUE)

# Create the independent variable for each season
nhl_2021 <- nhl_2021 %>%
  mutate(independent_var = (GF - GA) / (4 * gf_avg_2021))
nhl_2022 <- nhl_2022 %>%
  mutate(independent_var = (GF - GA) / (4 * gf_avg_2022))
nhl_2023 <- nhl_2023 %>%
  mutate(independent_var = (GF - GA) / (4 * gf_avg_2023))

# Run linear regression for each season
model_2021_nhl <- lm(WP ~ independent_var, data = nhl_2021)
model_2022_nhl <- lm(WP ~ independent_var, data = nhl_2022)
model_2023_nhl <- lm(WP ~ independent_var, data = nhl_2023)

# View summaries
summary(model_2021_nhl)
summary(model_2022_nhl)
summary(model_2023_nhl)

# Combine all NHL seasons
nhl_all <- bind_rows(nhl_2021, nhl_2022, nhl_2023)

# Ensure TEAM_NAME is treated as a factor for fixed effects
nhl_all$TEAM_NAME <- as.factor(nhl_all$TEAM_NAME)

# Run fixed effects regression
model_fixed_nhl <- lm(WP ~ independent_var + TEAM_NAME, data = nhl_all)
summary(model_fixed_nhl)

# Calculate average goals scored for all seasons combined
gf_avg_all <- mean(nhl_all$GF, na.rm = TRUE)

# Create standardized GF and GA
nhl_all <- nhl_all %>%
  mutate(
    gf_standardized = GF / (4 * gf_avg_all),
    ga_standardized = GA / (4 * gf_avg_all)
  )

# Run the regression with separate GF and GA, controlling for team effects.
# The model has to exist before it can be tested.
model_separate_nhl <- lm(
  WP ~ gf_standardized + ga_standardized + TEAM_NAME,
  data = nhl_all
)
summary(model_separate_nhl)

# Hypothesis test for equality of coefficients
linearHypothesis(model_separate_nhl, "gf_standardized = ga_standardized")

# Define playoff teams (replace placeholders with actual team names)
playoff_teams <- c("TEAMABC", "TEAMXYZ", "TEAMDEF", "TEAMGHI")

# Create the playoff dummy variable (1 = playoff team, 0 = otherwise)
nhl_all$playoff <- as.numeric(nhl_all$TEAM_NAME %in% playoff_teams)

# Set one team as the reference category (replace with an actual team name).
# relevel() errors on an unknown level, so only apply it when the level exists.
reference_team <- "TEAMABC"
if (reference_team %in% levels(nhl_all$TEAM_NAME)) {
  nhl_all$TEAM_NAME <- relevel(nhl_all$TEAM_NAME, ref = reference_team)
}

# Regression including playoff dummy variable and team effects
model_playoff_nhl <- lm(
  WP ~ gf_standardized + ga_standardized + playoff + TEAM_NAME,
  data = nhl_all
)
summary(model_playoff_nhl)

# Identify aliased coefficients: `playoff` is a function of TEAM_NAME, so it
# is collinear with the team dummies in this model.
alias(model_playoff_nhl)

# Hypothesis test for the playoff dummy variable (skipped when aliased)
test_playoff_effect(model_playoff_nhl)

# Simplified regression without TEAM_NAME to test the playoff effect
model_playoff_simple <- lm(
  WP ~ gf_standardized + ga_standardized + playoff,
  data = nhl_all
)
test_playoff_effect(model_playoff_simple)

# Model with playoff interactions instead of team effects
model_interaction <- lm(
  WP ~ (gf_standardized + ga_standardized) * playoff,
  data = nhl_all
)
test_playoff_effect(model_interaction)

# Model excluding gf_standardized
model_no_gf <- lm(WP ~ ga_standardized + playoff, data = nhl_all)
test_playoff_effect(model_no_gf)

# Check for missing data
colSums(is.na(nhl_all))

# Create nhl_clean by removing rows with any missing value. This must be done
# before the model that uses it is fit.
nhl_clean <- na.omit(nhl_all)

# Verify missing data is handled
colSums(is.na(nhl_clean))

# Simplified model on the clean dataset
model_playoff_clean <- lm(
  WP ~ gf_standardized + ga_standardized + playoff,
  data = nhl_clean
)

# Hypothesis test to check if the playoff variable has a significant effect
test_playoff_effect(model_playoff_clean)

# Save the console history (interactive sessions only)
save_session_history()