#Load in Data
All_Star_Selection <- read.csv("C://Users/james/Downloads/All-Star Selections.csv")

Player_Per_Game_Stats <- read.csv("C://Users/james/Downloads/Player Per Game.csv")

All_NBA_Selections <- read.csv("C://Users/james/Downloads/End Of Season Teams.csv")

Yearly_Awards <- read.csv("C://Users/james/Downloads/Player Award Shares.csv")

HOF <- read.csv("C://Users/james/Downloads/Player Career Info.csv")

NBA_Champions <- read.csv("C://Users/james/Downloads/NBA Finals and MVP.csv")

Clean & Organize the Data Remove Unwanted Columns From Each Dataset

# Filter years to 1980-2024 for HOF
HOF <- HOF[HOF$first_seas >= 1980, ]

# Get rid of columns we don't want for All-Star data
All_Star_Selection <- subset(All_Star_Selection, select = -c(team, lg, replaced))

# Get rid of columns we don't want for Player Per Game Stats
Player_Per_Game_Stats <- subset(Player_Per_Game_Stats, 
                                select = -c(seas_id, player_id, birth_year, lg,
                                            e_fg_percent))

# Get rid of columns we don't want for All NBA Selections
All_NBA_Selections <- subset(All_NBA_Selections, 
                                select = c(season, type, number_tm,
                                           player))

# Get rid of columns we don't want for Yearly Awards & only show winners
# remove clutch player of the year because it only recently existed
Yearly_Awards <- subset(Yearly_Awards, 
                                select = c(season, award, player, winner))

Yearly_Awards <- Yearly_Awards[Yearly_Awards$winner == TRUE, ]

Yearly_Awards <- Yearly_Awards[Yearly_Awards$award != "clutch_poy", ]

# Get rid of columns we don't want for HOF & only show current Hall of Fame players
HOF <- subset(HOF, select = c(player, num_seasons, hof))

# Get rid of columns we don't want for NBA Champions
NBA_Champions <- subset(NBA_Champions, select = c(Year, NBA.Champion))

#Change Data Layout & Cleanup Data

Change All NBA Selection awards layout

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
All_NBA_Selections <- All_NBA_Selections %>%
  pivot_wider(names_from = type, values_from = number_tm)

colnames(All_NBA_Selections)[which(names(All_NBA_Selections) == 
                                     "All-Defense")] <- "All_Defense"
colnames(All_NBA_Selections)[which(names(All_NBA_Selections) == 
                                     "All-NBA")] <- "All_NBA"
colnames(All_NBA_Selections)[which(names(All_NBA_Selections) == 
                                     "All-Rookie")] <- "All_Rookie"

All_NBA_Selections$'All_Defense' <- ifelse(!is.na(All_NBA_Selections$'All_Defense'),   paste(All_NBA_Selections$'All_Defense', "Team"), All_NBA_Selections$'All_Defense')

All_NBA_Selections$'All_NBA' <- ifelse(!is.na(All_NBA_Selections$'All_NBA'),   paste(All_NBA_Selections$'All_NBA', "Team"), All_NBA_Selections$'All_NBA')

All_NBA_Selections$'All_Rookie' <- ifelse(!is.na(All_NBA_Selections$'All_Rookie'),   paste(All_NBA_Selections$'All_Rookie', "Team"), All_NBA_Selections$'All_Rookie')

# Change Yearly Awards layout
Yearly_Awards <- Yearly_Awards %>%
  pivot_wider(names_from = award, values_from = award, values_fill = NA,
              values_fn = list(award = function(x) "Yes"))

colnames(Yearly_Awards)[which(names(Yearly_Awards) == 
                                     "nba mvp")] <- "mvp"
colnames(Yearly_Awards)[which(names(Yearly_Awards) == 
                                     "nba roy")] <- "roy"

Yearly_Awards <- subset(Yearly_Awards, select = c(season, player, dpoy, mip,
                                                  mvp, roy, smoy))

# Add Yes column to NBA Champions & Add most recent champs
NBA_Champions$tm <- NBA_Champions$NBA.Champion
NBA_Champions$season <- NBA_Champions$Year
NBA_Champions <- subset(NBA_Champions, select = c(season, tm))

recent_champions <- data.frame(
  season = c(2019, 2020, 2021, 2022, 2023, 2024),
  tm = c("TOR", "LAL", "MIL", "GSW", "DEN", "BOS")
)
NBA_Champions <- rbind(NBA_Champions, recent_champions)

NBA_Champions$NBA_Champion <- 'Yes'

Combine Datasets

# Merge Player Per Game Stats & All_Star Selection as NBA Data 

All_Star_Selection$all_star <- "Yes"
NBA_Data <- merge(Player_Per_Game_Stats, All_Star_Selection,
                  by = c("player", "season"), all.x = TRUE)

# Merge All NBA Selections into NBA Data
NBA_Data <- merge(NBA_Data, All_NBA_Selections, 
                  by = c("season", "player"), all.x = TRUE)

# Merge Yearly Awards into NBA Data
NBA_Data <- merge(NBA_Data, Yearly_Awards, 
                  by = c("season", "player"), all.x = TRUE)

# Merge Champions into NBA Data
NBA_Data <- merge(NBA_Data, NBA_Champions, 
                  by = c("season", "tm"), all.x = TRUE)

Add Hall of Famer Accolades to HOF Dataset

library(dplyr)

#All Star Count (all_star)
NBA_Data <- NBA_Data %>% mutate(all_star = ifelse(all_star == "Yes", 1, 0))
all_star_count <- NBA_Data %>% 
  filter(all_star == 1) %>% 
  group_by(player) %>% 
  summarise(all_star = n())
HOF <- left_join(HOF, all_star_count, by = "player")
HOF$all_star[is.na(HOF$all_star)] <- 0

# Defensive Player of The Year Count (dpoy)
NBA_Data <- NBA_Data %>% mutate(dpoy = ifelse(dpoy == "Yes", 1, 0))
dpoy_count <- NBA_Data %>% 
  filter(dpoy == 1) %>% 
  group_by(player) %>% 
  summarise(dpoy = n())
HOF <- left_join(HOF, dpoy_count, by = "player")
HOF$dpoy[is.na(HOF$dpoy)] <- 0

# Most Improved Player Count (MIP)
NBA_Data <- NBA_Data %>% mutate(mip = ifelse(mip == "Yes", 1, 0))
mip_count <- NBA_Data %>% 
  filter(mip == 1) %>% 
  group_by(player) %>% 
  summarise(mip = n())
HOF <- left_join(HOF, mip_count, by = "player")
HOF$mip[is.na(HOF$mip)] <- 0

# Most Valuable Player Count (MVP)
NBA_Data <- NBA_Data %>% mutate(mvp = ifelse(mvp == "Yes", 1, 0))
mvp_count <- NBA_Data %>% 
  filter(mvp == 1) %>% 
  group_by(player) %>% 
  summarise(mvp = n())
HOF <- left_join(HOF, mvp_count, by = "player")
HOF$mvp[is.na(HOF$mvp)] <- 0

# Rookie of the Year Count (ROY)
NBA_Data <- NBA_Data %>% mutate(roy = ifelse(roy == "Yes", 1, 0))
roy_count <- NBA_Data %>% 
  filter(roy == 1) %>% 
  group_by(player) %>% 
  summarise(roy = n())
HOF <- left_join(HOF, roy_count, by = "player")
HOF$roy[is.na(HOF$roy)] <- 0

# Sixth Man of the Year Count (SMOY)
NBA_Data <- NBA_Data %>% mutate(smoy = ifelse(smoy == "Yes", 1, 0))
smoy_count <- NBA_Data %>% 
  filter(smoy == 1) %>% 
  group_by(player) %>% 
  summarise(smoy = n())
HOF <- left_join(HOF, smoy_count, by = "player")
HOF$smoy[is.na(HOF$smoy)] <- 0

# All Defense Team Count (All_Defense)
All_Defense_Count <- NBA_Data %>%
  filter(!is.na(All_Defense)) %>%
  group_by(player) %>%
  summarise(All_Defense = n())
HOF <- left_join(HOF, All_Defense_Count, by = "player")
HOF$All_Defense[is.na(HOF$All_Defense)] <- 0

# All NBA Team Count (All_NBA)
All_NBA_Count <- NBA_Data %>%
  filter(!is.na(All_NBA)) %>%
  group_by(player) %>%
  summarise(All_NBA = n())
HOF <- left_join(HOF, All_NBA_Count, by = "player")
HOF$All_NBA[is.na(HOF$All_NBA)] <- 0

# All Rookie Team Count (All_Rookie)
All_Rookie_Count <- NBA_Data %>%
  filter(!is.na(All_Rookie)) %>%
  group_by(player) %>%
  summarise(All_Rookie = n())
HOF <- left_join(HOF, All_Rookie_Count, by = "player")
HOF$All_Rookie[is.na(HOF$All_Rookie)] <- 0

# Championship Count
Championships_Count <- NBA_Data %>%
  filter(!is.na(NBA_Champion)) %>%
  group_by(player) %>%
  summarise(NBA_Champion = n())
HOF <- left_join(HOF, Championships_Count, by = "player")
HOF$NBA_Champion[is.na(HOF$NBA_Champion)] <- 0

Fix Traded Players in Data

NBA_Data$tm[which(NBA_Data$tm == "TOT")] <- "aaTOT"
NBA_Data <- NBA_Data[order(NBA_Data$tm), ]


id <- paste(NBA_Data$season, NBA_Data$player, sep = "")

NBA_Data <- NBA_Data[!duplicated(id), ]

Fix Positions for Players with Multiple Listed

NBA_Data$pos <- substr(NBA_Data$pos, 1, 2)

Narrow Datasets Down to Years 1980 - 2024 This will reflect the creation of the 3 point line up to the present.

Filter years to 1980-2024 for Player Per Game Stats

Current_Stats <- NBA_Data[NBA_Data$season >= 2025, ]
Player_Per_Game_Stats <- NBA_Data[NBA_Data$season >= 1980 &
                                           NBA_Data$season <= 2024, ]

Create a Scale to Make Stats Comparable Across Years

#NBA_Data
seasons <- unique(NBA_Data$season)
Scaled_NBA_Data <- NBA_Data 
for(i in 1:length(seasons)) {
  Scaled_NBA_Data[which(NBA_Data$season == seasons[i]), 10:30 ] <- scale(Scaled_NBA_Data[which(NBA_Data$season == seasons[i]), 10:30])
}

# Current Data
seasons <- unique(Current_Stats$season)
Scaled_Current_Stats <- Current_Stats 
for(i in 1:length(seasons)) {
  Scaled_Current_Stats[which(Current_Stats$season == seasons[i]), 10:30 ] <- scale(Scaled_Current_Stats[which(Current_Stats$season == seasons[i]), 10:30])
}

Min 65 games played

Scaled_NBA_Data <- Scaled_NBA_Data[Scaled_NBA_Data$g >= 65, ]

Scaled_Current_Stats <- Scaled_Current_Stats

Create Dataset for Conferences

Western_Conf <- c('DAL', 'DEN', 'GSW', 'HOU', 'LAC', 'LAL', 'MEM', 'MIN', 
                  'NOP', 'OKC', 'PHO', 'POR', 'SAC', 'SAS', 'UTA')

Eastern_Conf <- c('ATL', 'BOS', 'BRK', 'CHO', 'CHI', 'CLE', 'DET', 'IND',
                  'MIA', 'MIL', 'NYK', 'ORL', 'PHI', 'TOR', 'WAS')
                  
# Create a dataframe for the Eastern Conference
eastern_df <- data.frame(tm = Eastern_Conf, Conference = "Eastern")

# Create a dataframe for the Western Conference
western_df <- data.frame(tm = Western_Conf, Conference = "Western")

# Combine the two dataframes into one
team_conference <- rbind(eastern_df, western_df)

Main Datasets

head(NBA_Data) # All NBA Player Data
##     season    tm         player pos age experience  g gs mp_per_game
## 150   1947 aaTOT      Bob Duffy   F  24          1 17 NA          NA
## 151   1947 aaTOT Bob Fitzgerald  F-  23          1 60 NA          NA
## 152   1947 aaTOT    Bob Mullens   G  24          1 54 NA          NA
## 153   1947 aaTOT Charlie Hoefer   G  25          1 58 NA          NA
## 154   1947 aaTOT    Dick Murphy   G  25          1 31 NA          NA
## 155   1947 aaTOT    Dick Schulz  F-  30          1 57 NA          NA
##     fg_per_game fga_per_game fg_percent x3p_per_game x3pa_per_game x3p_percent
## 150         0.4          1.9      0.219           NA            NA          NA
## 151         1.2          6.0      0.193           NA            NA          NA
## 152         2.3          8.2      0.281           NA            NA          NA
## 153         2.2          8.9      0.253           NA            NA          NA
## 154         0.5          2.4      0.200           NA            NA          NA
## 155         2.3          9.6      0.237           NA            NA          NA
##     x2p_per_game x2pa_per_game x2p_percent ft_per_game fta_per_game ft_percent
## 150          0.4           1.9       0.219         0.3          0.4      0.714
## 151          1.2           6.0       0.193         1.4          2.2      0.623
## 152          2.3           8.2       0.281         1.2          1.9      0.627
## 153          2.2           8.9       0.253         1.6          2.4      0.655
## 154          0.5           2.4       0.200         0.1          0.3      0.444
## 155          2.3           9.6       0.237         1.6          2.4      0.681
##     orb_per_game drb_per_game trb_per_game ast_per_game stl_per_game
## 150           NA           NA           NA          0.0           NA
## 151           NA           NA           NA          0.6           NA
## 152           NA           NA           NA          1.0           NA
## 153           NA           NA           NA          0.6           NA
## 154           NA           NA           NA          0.3           NA
## 155           NA           NA           NA          1.0           NA
##     blk_per_game tov_per_game pf_per_game pts_per_game all_star All_Defense
## 150           NA           NA         1.0          1.1       NA        <NA>
## 151           NA           NA         2.6          3.7       NA        <NA>
## 152           NA           NA         1.7          5.8       NA        <NA>
## 153           NA           NA         2.4          6.1       NA        <NA>
## 154           NA           NA         0.5          1.1       NA        <NA>
## 155           NA           NA         2.2          6.2       NA        <NA>
##     All_NBA All_Rookie All-ABA All-BAA dpoy mip mvp roy smoy NBA_Champion
## 150    <NA>       <NA>    <NA>    <NA>   NA  NA  NA  NA   NA         <NA>
## 151    <NA>       <NA>    <NA>    <NA>   NA  NA  NA  NA   NA         <NA>
## 152    <NA>       <NA>    <NA>    <NA>   NA  NA  NA  NA   NA         <NA>
## 153    <NA>       <NA>    <NA>    <NA>   NA  NA  NA  NA   NA         <NA>
## 154    <NA>       <NA>    <NA>    <NA>   NA  NA  NA  NA   NA         <NA>
## 155    <NA>       <NA>    <NA>    <NA>   NA  NA  NA  NA   NA         <NA>
head(HOF) # Hall of Fame Player Data
##            player num_seasons   hof all_star dpoy mip mvp roy smoy All_Defense
## 1   Abdul Jeelani           2 FALSE        0    0   0   0   0    0           0
## 2   Allen Leavell          10 FALSE        0    0   0   0   0    0           0
## 3    Arvid Kramer           1 FALSE        0    0   0   0   0    0           0
## 4   Bernard Toone           1 FALSE        0    0   0   0   0    0           0
## 5 Bill Cartwright          15 FALSE        1    0   0   0   0    0           0
## 6 Billy Ray Bates           4 FALSE        0    0   0   0   0    0           0
##   All_NBA All_Rookie NBA_Champion
## 1       0          0            0
## 2       0          0            0
## 3       0          0            0
## 4       0          0            0
## 5       0          1            3
## 6       0          0            0

Variables NBA_Data Predictor Variables player - Player Name season - Season of stats pos - Player Position age - Players Age experience - How many years player has been in league g - Games Played gs - Games started mp_per_game - Mins per game fg_per_game - Field goals per game fga_per_game - Field goals attempted per game fg_percent - Field goal % x3p_per_game - 3 point shots made per game x3pa_per_game - 3 point shots attempted per game x3p_percent - 3 point % x2p_per_game - 2 point shots made per game x2pa_per_game - 2 point shots attempted per game x2p_percent - 2 point % ft_per_game - free throws made per game fta_per_game - free throws attempted per game ft_percent - free throw % orb_per_game - Offensive rebounds per game drb_per_game - defensive rebounds per game trb_per_game - Total rebounds per game ast_per_game - Assists per game stl_per_game - Steals per game blk_per_game - Blocks per game tov_per_game - turnovers per game pf_per_game - personal fouls per game pts_per_game - Pts per game all_star - Was player an all-star (Yes or No) All_Defense - Did player make all defense 1st or 2nd team All_NBA - did player make all nba 1st or 2nd team All_Rookie - did player make all rookie 1st or 2nd team dpoy - did player win defensive player of the year mip - did player win most improved player mvp - did player win most valuable player roy - did player win rookie of the year smoy - did player win sixth man of the year HOF Predictor Variables num_seasons - number of seasons a player played for all_star - number of all star appearances dpoy - number of defensive player of the year awards mip - number of most improved player awards mvp - number of most valuable player awards roy - number of rookie of the year awards (max 1) smoy - number of six man of the year awards All_Defense - number of all defensive team selections All_NBA - number of all NBA team selections All_Rookie - number of all rookie team selections (max 1) Variable Analysis

library(ggplot2)

# Player Position vs Points
ggplot(NBA_Data, aes(x = pos, y = pts_per_game)) +
  geom_boxplot() +
  labs(title = "Player Position vs Points Per Game", x = "Position", 
       y = "Points Per Game") +
  theme_minimal()

# Average Awards and All-Star Appearances for Hall of Famers
library(dplyr)

avg_awards <- HOF %>%
  filter(hof == TRUE) %>%
  summarise(
    avg_num_seasons = mean(num_seasons, na.rm = TRUE),
    avg_all_star = mean(all_star, na.rm = TRUE),
    avg_dpoy = mean(dpoy, na.rm = TRUE),
    avg_mip = mean(mip, na.rm = TRUE),
    avg_mvp = mean(mvp, na.rm = TRUE),
    avg_roy = mean(roy, na.rm = TRUE),
    avg_smoy = mean(smoy, na.rm = TRUE),
    avg_all_defense = mean(All_Defense, na.rm = TRUE),
    avg_all_NBA = mean(All_NBA, na.rm = TRUE),
    avg_all_Rookie = mean(All_Rookie, na.rm = TRUE)
  )
avg_awards_long <- avg_awards %>%
  pivot_longer(cols = everything(), names_to = "Award", values_to = "Average")

ggplot(avg_awards_long, aes(x = Award, y = Average, fill = Award)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Stats of Hall of Famers", x = "Stat", y = "Average Count") +
  theme_minimal() +
  coord_flip()+
  theme(legend.position = "none")

avg_all_star <- NBA_Data %>% 
  filter(all_star == 1) %>%
  summarise(
    avg_ppg = mean(pts_per_game, na.rm = TRUE),
    avg_trb = mean(trb_per_game, na.rm = TRUE),
    avg_apg = mean(ast_per_game, na.rm = TRUE),
    avg_spg = mean(stl_per_game, na.rm = TRUE),
    avg_bpg = mean(blk_per_game, na.rm = TRUE),
    avg_mpg = mean(mp_per_game, na.rm = TRUE)
    )

avg_allstar_long <- avg_all_star %>%
  pivot_longer(cols = everything(), names_to = "Stat", values_to = "Average")

ggplot(avg_allstar_long, aes(x = Stat, y = Average, fill = Stat)) +
  geom_bar(stat = "identity") +
  labs(title = "Per Game Averages of All Stars", x = "Stat", y = "Average Count") +
  theme_minimal() +
  coord_flip()+
  theme(legend.position = "none")

Split data into Test & Training Data

Normal Data

set.seed(7)

total_obs <- dim(Scaled_NBA_Data)[1]
train_data_indices <- sample(1:total_obs, 0.6*total_obs)
train_data <- Scaled_NBA_Data[train_data_indices,]
test_data <- Scaled_NBA_Data[-train_data_indices,]

# HOF Data
set.seed(7)

total_obs_HOF <- dim(HOF)[1]
train_data_indices_HOF <- sample(1:total_obs_HOF, 0.6*total_obs_HOF)
train_data_HOF <- HOF[train_data_indices_HOF,]
test_data_HOF <- HOF[-train_data_indices_HOF,]

Models Logistic Regression

library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.2
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# All-Star Predictor Model
train_data$all_star[is.na(train_data$all_star)] <- 0
test_data$all_star[is.na(test_data$all_star)] <- 0

lm_all_star <- glm(all_star ~ fg_per_game + fga_per_game + fg_percent +
                     x3p_per_game + x3pa_per_game + x3p_percent +
                     x2p_per_game + x2pa_per_game + x2p_percent +
                     ft_per_game + fta_per_game + ft_percent +
                     orb_per_game + drb_per_game + trb_per_game +
                     ast_per_game + stl_per_game + blk_per_game + tov_per_game + pts_per_game, 
                   family = 'binomial', data = train_data)

lm_all_star_pred <- predict(lm_all_star, newdata = test_data, type = 'response')
preds_all_star_cat <- ifelse(lm_all_star_pred>.5, 1, 0)
lm_all_star_acc <- confusionMatrix(factor(preds_all_star_cat),
                                   factor(test_data$all_star), positive = '1')

# Yearly Awards Predictor Model 
#MVP
train_data$mvp[is.na(train_data$mvp)] <- 0
test_data$mvp[is.na(test_data$mvp)] <- 0

lm_MVP <- glm(mvp ~ fg_per_game + fga_per_game + fg_percent +
                     x3p_per_game + x3pa_per_game + x3p_percent +
                     x2p_per_game + x2pa_per_game + x2p_percent +
                     ft_per_game + fta_per_game + ft_percent +
                     orb_per_game + drb_per_game + trb_per_game +
                     ast_per_game + stl_per_game + blk_per_game + tov_per_game + pts_per_game, 
                   family = 'binomial', data = train_data)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm_MVP_pred <- predict(lm_MVP, newdata = test_data, type = 'response')
preds_MVP_cat <- ifelse(lm_MVP_pred>.5, 1, 0)
lm_MVP_acc <- confusionMatrix(factor(preds_MVP_cat),
                                   factor(test_data$mvp), positive = '1')

#DPOY
train_data$dpoy[is.na(train_data$dpoy)] <- 0
test_data$dpoy[is.na(test_data$dpoy)] <- 0

lm_DPOY <- glm(dpoy ~ fg_per_game + fga_per_game + fg_percent +
                     x3p_per_game + x3pa_per_game + x3p_percent +
                     x2p_per_game + x2pa_per_game + x2p_percent +
                     ft_per_game + fta_per_game + ft_percent +
                     orb_per_game + drb_per_game + trb_per_game +
                     ast_per_game + stl_per_game + blk_per_game + tov_per_game + pts_per_game, 
                   family = 'binomial', data = train_data)

lm_dpoy_pred <- predict(lm_DPOY, newdata = test_data, type = 'response')
preds_dpoy_cat <- ifelse(lm_dpoy_pred>.5, 1, 0)
lm_dpoy_acc <- confusionMatrix(factor(preds_dpoy_cat),
                                   factor(test_data$dpoy), positive = '1')

# Hall of Fame Predictor Model
lm_HOF <- glm(hof ~ num_seasons + all_star + dpoy + mip + mvp + roy + smoy +
                All_Defense + All_NBA + All_Rookie + NBA_Champion, 
              family = 'binomial', data = train_data_HOF)

lm_HOF_pred <- predict(lm_HOF, newdata = test_data_HOF, type = 'response')
preds_HOF_cat <- ifelse(lm_HOF_pred>.5, 'TRUE', 'FALSE')
lm_HOF_acc <- confusionMatrix(factor(preds_HOF_cat),
                                   factor(test_data_HOF$hof), positive = 'TRUE')
# All-Star Predictor Model
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.2
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# Replace NAs in target columns with 0
train_data$all_star[is.na(train_data$all_star)] <- 0
test_data$all_star[is.na(test_data$all_star)] <- 0

train_data$all_star <- as.numeric(train_data$all_star)
test_data$all_star <- as.numeric(test_data$all_star)

predictor_columns <- c(
  "fg_per_game", "fga_per_game", "fg_percent",
  "x3p_per_game", "x3pa_per_game", "x3p_percent",
  "x2p_per_game", "x2pa_per_game", "x2p_percent",
  "ft_per_game", "fta_per_game", "ft_percent",
  "orb_per_game", "drb_per_game", "trb_per_game",
  "ast_per_game", "stl_per_game", "blk_per_game",
  "tov_per_game", "pts_per_game"
)

# Create training and test matrices
X_train <- as.matrix(train_data[, predictor_columns])
X_test <- as.matrix(test_data[, predictor_columns])

#XGBoost All-Star
y_train_all_star <- train_data$all_star
y_test_all_star <- test_data$all_star

# Train XGBoost model
xgb_all_star <- xgboost(data = X_train, label = y_train_all_star, 
                        nrounds = 100, objective = "binary:logistic", verbose = 0)

# Predict on test set
all_star_preds <- predict(xgb_all_star, X_test)
all_star_preds_binary <- ifelse(all_star_preds > 0.5, 1, 0)

# Evaluate model
confusionMatrix(factor(all_star_preds_binary), factor(y_test_all_star), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4057  227
##          1  140  469
##                                           
##                Accuracy : 0.925           
##                  95% CI : (0.9173, 0.9322)
##     No Information Rate : 0.8578          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6757          
##                                           
##  Mcnemar's Test P-Value : 7.15e-06        
##                                           
##             Sensitivity : 0.67385         
##             Specificity : 0.96664         
##          Pos Pred Value : 0.77011         
##          Neg Pred Value : 0.94701         
##              Prevalence : 0.14224         
##          Detection Rate : 0.09585         
##    Detection Prevalence : 0.12446         
##       Balanced Accuracy : 0.82025         
##                                           
##        'Positive' Class : 1               
## 
test_data$xgb_all_star_preds <- all_star_preds

# Yearly Awards Predictor Model (MVP & DPOY)

library(xgboost)
library(caret)
library(dplyr)

# Replace NAs in target columns with 0
train_data$mvp[is.na(train_data$mvp)] <- 0
test_data$mvp[is.na(test_data$mvp)] <- 0

train_data$dpoy[is.na(train_data$dpoy)] <- 0
test_data$dpoy[is.na(test_data$dpoy)] <- 0

# Convert target variables to numeric
train_data$mvp <- as.numeric(train_data$mvp)
test_data$mvp <- as.numeric(test_data$mvp)

train_data$dpoy <- as.numeric(train_data$dpoy)
test_data$dpoy <- as.numeric(test_data$dpoy)

predictor_columns <- c(
  "fg_per_game", "fga_per_game", "fg_percent",
  "x3p_per_game", "x3pa_per_game", "x3p_percent",
  "x2p_per_game", "x2pa_per_game", "x2p_percent",
  "ft_per_game", "fta_per_game", "ft_percent",
  "orb_per_game", "drb_per_game", "trb_per_game",
  "ast_per_game", "stl_per_game", "blk_per_game",
  "tov_per_game", "pts_per_game"
)

# Create training and test matrices
X_train <- as.matrix(train_data[, predictor_columns])
X_test <- as.matrix(test_data[, predictor_columns])


#XGBoost MVP
y_train_mvp <- train_data$mvp
y_test_mvp <- test_data$mvp

# Train XGBoost model
xgb_mvp <- xgboost(data = X_train, label = y_train_mvp, 
                   nrounds = 100, objective = "binary:logistic", verbose = 0)

# Predict on test set
mvp_preds <- predict(xgb_mvp, X_test)
mvp_preds_binary <- ifelse(mvp_preds > 0.5, 1, 0)

# Evaluate model
confusionMatrix(factor(mvp_preds_binary), factor(y_test_mvp), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4859   31
##          1    2    1
##                                           
##                Accuracy : 0.9933          
##                  95% CI : (0.9905, 0.9954)
##     No Information Rate : 0.9935          
##     P-Value [Acc > NIR] : 0.6153          
##                                           
##                   Kappa : 0.0561          
##                                           
##  Mcnemar's Test P-Value : 1.093e-06       
##                                           
##             Sensitivity : 0.0312500       
##             Specificity : 0.9995886       
##          Pos Pred Value : 0.3333333       
##          Neg Pred Value : 0.9936605       
##              Prevalence : 0.0065400       
##          Detection Rate : 0.0002044       
##    Detection Prevalence : 0.0006131       
##       Balanced Accuracy : 0.5154193       
##                                           
##        'Positive' Class : 1               
## 
test_data$xgb_mvp_preds <- mvp_preds

#XGBoost DPOY
y_train_dpoy <- train_data$dpoy
y_test_dpoy <- test_data$dpoy

# Train XGBoost model
xgb_dpoy <- xgboost(data = X_train, label = y_train_dpoy, 
                    nrounds = 100, objective = "binary:logistic", verbose = 0, scale_pos_weight = 4876/17,
                    )


x_test_2 <- Scaled_Current_Stats[,colnames(X_test)]
dpoy_preds <- predict(xgb_dpoy, as.matrix(x_test_2))
Scaled_Current_Stats$dpoy <- dpoy_preds
# Predict on test set
dpoy_preds <- predict(xgb_dpoy, X_test)
dpoy_preds_binary <- ifelse(dpoy_preds > 0.5, 1, 0)

# Evaluate model
confusionMatrix(factor(dpoy_preds_binary), factor(y_test_dpoy), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4870    8
##          1   10    5
##                                           
##                Accuracy : 0.9963          
##                  95% CI : (0.9942, 0.9978)
##     No Information Rate : 0.9973          
##     P-Value [Acc > NIR] : 0.9304          
##                                           
##                   Kappa : 0.3553          
##                                           
##  Mcnemar's Test P-Value : 0.8137          
##                                           
##             Sensitivity : 0.384615        
##             Specificity : 0.997951        
##          Pos Pred Value : 0.333333        
##          Neg Pred Value : 0.998360        
##              Prevalence : 0.002657        
##          Detection Rate : 0.001022        
##    Detection Prevalence : 0.003066        
##       Balanced Accuracy : 0.691283        
##                                           
##        'Positive' Class : 1               
## 
test_data$xgb_dpoy_preds <- dpoy_preds

XG Boost

Hall of Fame Predictor Model

library(xgboost)
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(caret)

train_data_HOF$hof <- ifelse(train_data_HOF$hof == TRUE, 1, 0)
test_data_HOF$hof <- ifelse(test_data_HOF$hof == TRUE, 1, 0)

# Select predictor variables
predictors <- c("num_seasons", "all_star", "dpoy", "mip", "mvp", "roy", 
                "smoy", "All_Defense", "All_NBA", "All_Rookie", "NBA_Champion")

# Convert data to matrix format for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(train_data_HOF[, predictors]), 
                      label = train_data_HOF$hof)
dtest <- xgb.DMatrix(data = as.matrix(test_data_HOF[, predictors]), 
                     label = test_data_HOF$hof)

# Set model parameters
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",  # Binary classification
  eval_metric = "logloss",        # Log-loss for evaluation
  max_depth = 6,                  # Maximum tree depth
  eta = 0.1,                      # Learning rate
  gamma = 0,                      # Minimum loss reduction to make a split
  subsample = 0.8,                # Subsample ratio of training data
  colsample_bytree = 0.8          # Subsample ratio of columns
)

# Train the XGBoost model
set.seed(7)
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,  # Number of boosting rounds
  watchlist = list(train = dtrain, test = dtest),
  print_every_n = 10,
  early_stopping_rounds = 10,  # Stop if no improvement in 10 rounds
  maximize = FALSE,
    scale_pos_weight = 1346/22,
)
## [1]  train-logloss:0.606670  test-logloss:0.607166 
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
## 
## [11] train-logloss:0.201228  test-logloss:0.211551 
## [21] train-logloss:0.087788  test-logloss:0.103340 
## [31] train-logloss:0.049749  test-logloss:0.070842 
## [41] train-logloss:0.033950  test-logloss:0.059976 
## [51] train-logloss:0.026648  test-logloss:0.056632 
## [61] train-logloss:0.021273  test-logloss:0.055322 
## Stopping. Best iteration:
## [58] train-logloss:0.022377  test-logloss:0.054873
xgb_predictions <- predict(xgb_model, dtest)
xgb_predictions_class <- ifelse(xgb_predictions > 0.5, 1, 0)

# Evaluate model accuracy
conf_matrix <- confusionMatrix(
  factor(xgb_predictions_class), 
  factor(test_data_HOF$hof), 
  positive = "1"
)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1326    5
##          1   20   17
##                                           
##                Accuracy : 0.9817          
##                  95% CI : (0.9731, 0.9881)
##     No Information Rate : 0.9839          
##     P-Value [Acc > NIR] : 0.77869         
##                                           
##                   Kappa : 0.5675          
##                                           
##  Mcnemar's Test P-Value : 0.00511         
##                                           
##             Sensitivity : 0.77273         
##             Specificity : 0.98514         
##          Pos Pred Value : 0.45946         
##          Neg Pred Value : 0.99624         
##              Prevalence : 0.01608         
##          Detection Rate : 0.01243         
##    Detection Prevalence : 0.02705         
##       Balanced Accuracy : 0.87893         
##                                           
##        'Positive' Class : 1               
## 
test_data_HOF$xgb_predictions <- xgb_predictions

#Final Predictions
# All Star Final Prediction
x_test_1 <- Scaled_Current_Stats[,colnames(X_test)]
all_star_preds <- predict(xgb_all_star, as.matrix(x_test_2))
Scaled_Current_Stats$all_star <- all_star_preds

Scaled_Current_Stats <- merge(Scaled_Current_Stats, team_conference, by = "tm", all.x = TRUE)

# MVP Final Prediction
x_test_2 <- Scaled_Current_Stats[,colnames(X_test)]
mvp_preds <- predict(xgb_mvp, as.matrix(x_test_2))
Scaled_Current_Stats$mvp <- mvp_preds

# DPOY Final Prediction
x_test_3 <- Scaled_Current_Stats[,colnames(X_test)]
dpoy_preds <- predict(xgb_dpoy, as.matrix(x_test_2))
Scaled_Current_Stats$dpoy <- dpoy_preds

# HOF Final Prediction
test_data_HOF$xgb_predictions <- xgb_predictions
#Outcomes
library(dplyr)
# All-Star
top_5_All_Stars <- Scaled_Current_Stats %>%
  group_by(Conference) %>%
  arrange(desc(all_star)) %>%
  slice_head(n = 12)

# MVP
top_5_MVP <- Scaled_Current_Stats %>%
  arrange(desc(mvp)) %>%
  slice_head(n = 5)

# DPOY
top_5_DPOY <- Scaled_Current_Stats %>%
  arrange(desc(dpoy)) %>%
  slice_head(n = 5)

# HOF
top_5_HOF <- test_data_HOF %>%
  filter(player %in% Scaled_Current_Stats$player)

top_5_HOF <- top_5_HOF %>%
  arrange(desc(xgb_predictions)) %>%
  slice_head(n = 5)
# All-Star
top_5_All_Stars <- Scaled_Current_Stats %>%
  group_by(Conference) %>%
  arrange(desc(all_star)) %>%
  slice_head(n = 5)

# MVP
top_5_MVP <- Scaled_Current_Stats %>%
  arrange(desc(mvp)) %>%
  slice_head(n = 5)

# DPOY
top_5_DPOY <- Scaled_Current_Stats %>%
  arrange(desc(dpoy)) %>%
  slice_head(n = 5)

# HOF
top_5_HOF <- test_data_HOF %>%
  filter(player %in% Scaled_Current_Stats$player)

top_5_HOF <- top_5_HOF %>%
  arrange(desc(xgb_predictions)) %>%
  slice_head(n = 5)

print(top_5_All_Stars)
## # A tibble: 10 × 43
## # Groups:   Conference [2]
##    tm    season player            pos     age experience     g    gs mp_per_game
##    <chr>  <int> <chr>             <chr> <int>      <int> <int> <int>       <dbl>
##  1 MIL     2025 Giannis Antetoko… PF       30         12     5     5        34.4
##  2 ORL     2025 Paolo Banchero    SF       22          3     5     5        36.4
##  3 BOS     2025 Jayson Tatum      SF       26          8     5     5        35.4
##  4 PHI     2025 Tyrese Maxey      PG       24          5     4     4        42  
##  5 CHO     2025 LaMelo Ball       PG       23          5     4     4        34  
##  6 LAL     2025 Anthony Davis     PF       31         13     5     5        36  
##  7 OKC     2025 Shai Gilgeous-Al… SG       26          7     4     4        32.3
##  8 DEN     2025 Nikola Jokić      C        29         10     4     4        39  
##  9 SAC     2025 Domantas Sabonis  C        28          9     4     4        36.3
## 10 HOU     2025 Jalen Green       SG       22          4     5     5        34.8
## # ℹ 34 more variables: fg_per_game <dbl>, fga_per_game <dbl>, fg_percent <dbl>,
## #   x3p_per_game <dbl>, x3pa_per_game <dbl>, x3p_percent <dbl>,
## #   x2p_per_game <dbl>, x2pa_per_game <dbl>, x2p_percent <dbl>,
## #   ft_per_game <dbl>, fta_per_game <dbl>, ft_percent <dbl>,
## #   orb_per_game <dbl>, drb_per_game <dbl>, trb_per_game <dbl>,
## #   ast_per_game <dbl>, stl_per_game <dbl>, blk_per_game <dbl>,
## #   tov_per_game <dbl>, pf_per_game <dbl>, pts_per_game <dbl>, …
print(top_5_DPOY)
##    tm season         player pos age experience g gs mp_per_game fg_per_game
## 1 CLE   2025  Jarrett Allen   C  26          8 5  5        28.8   1.3408786
## 2 UTA   2025 Walker Kessler   C  23          3 5  5        26.6   0.1655087
## 3 TOR   2025   Jakob Poeltl   C  29          9 5  5        31.8   0.9001149
## 4 OKC   2025  Chet Holmgren  PF  22          2 4  4        31.5   1.9652939
## 5 CHO   2025  Nick Richards   C  27          5 4  4        30.5   0.7899240
##   fga_per_game fg_percent x3p_per_game x3pa_per_game x3p_percent x2p_per_game
## 1    0.2905114  1.6961626   -0.9176843    -1.0669739          NA    2.2489952
## 2   -0.1307833  0.6739582   -0.9176843    -1.0669739          NA    0.7104725
## 3    0.5362667  0.5717377   -0.9176843    -1.0669739          NA    1.6720492
## 4    1.3261942  0.6982964    0.4168355     0.5780991  0.02864603    2.3451528
## 5    0.1851877  1.0390312   -0.9176843    -0.9573024 -1.47799304    1.5278127
##   x2pa_per_game x2p_percent ft_per_game fta_per_game  ft_percent orb_per_game
## 1     1.1962310   1.2442591   1.5265707    1.4557048  0.26425936     1.385136
## 2     0.5679102   0.3064082  -0.4040882   -0.2523947 -0.77063594     3.090554
## 3     1.5627515   0.2126231   0.6180254    0.6466050  0.07649354     3.303731
## 4     1.5627515   0.8378571   1.4130026    1.7254047 -0.22480509     1.491724
## 5     0.9867908   0.7351401   0.8451617    1.3658048 -0.64400318     4.156440
##   drb_per_game trb_per_game ast_per_game stl_per_game blk_per_game tov_per_game
## 1     2.761598     2.577911  -0.40932652   0.23875360     2.518588   -0.3171764
## 2     1.868667     2.440254  -0.31003028   0.23875360     3.999531    0.0541740
## 3     2.672304     3.128541   0.18645094  -0.09720973     1.407881    1.3539004
## 4     2.761598     2.577911   0.38504342   1.41462525     5.665591    1.7252508
## 5     1.957960     2.853226  -0.01214155  -0.26519139     3.814413    0.3326868
##   pf_per_game pts_per_game   all_star All_Defense All_NBA All_Rookie All-ABA
## 1  -0.7567487    1.1657795 0.03475658        <NA>    <NA>       <NA>    <NA>
## 2  -0.2450916   -0.1074614 0.02336687        <NA>    <NA>       <NA>    <NA>
## 3   1.6309844    0.6460894 0.14272566        <NA>    <NA>       <NA>    <NA>
## 4   0.5223940    1.7764155 0.55158758        <NA>    <NA>       <NA>    <NA>
## 5   1.3751559    0.6071126 0.06854929        <NA>    <NA>       <NA>    <NA>
##   All-BAA       dpoy mip          mvp roy smoy NBA_Champion Conference
## 1    <NA> 0.12999384  NA 2.349836e-04  NA   NA         <NA>    Eastern
## 2    <NA> 0.10269659  NA 2.996007e-05  NA   NA         <NA>    Western
## 3    <NA> 0.02903398  NA 2.165260e-05  NA   NA         <NA>    Eastern
## 4    <NA> 0.02834208  NA 7.413420e-04  NA   NA         <NA>    Western
## 5    <NA> 0.02229822  NA 1.799389e-04  NA   NA         <NA>    Eastern
print(top_5_HOF)
##              player num_seasons hof all_star dpoy mip mvp roy smoy All_Defense
## 1     Stephen Curry          16   0       10    0   0   2   0    0           0
## 2      Kevin Durant          17   0       16    0   0   1   1    0           0
## 3      James Harden          16   0       14    0   0   1   0    1           0
## 4      LeBron James          22   0       20    0   0   4   1    0           6
## 5 Russell Westbrook          17   0        9    0   0   1   0    0           0
##   All_NBA All_Rookie NBA_Champion xgb_predictions
## 1      10          1            4       0.9968592
## 2      11          1            2       0.9939582
## 3       7          1            0       0.9930856
## 4      20          1            4       0.9927462
## 5       9          1            0       0.9852502
print(top_5_MVP)
##    tm season                player pos age experience g gs mp_per_game
## 1 MIL   2025 Giannis Antetokounmpo  PF  30         12 5  5        34.4
## 2 DEN   2025          Nikola Jokić   C  29         10 4  4        39.0
## 3 LAL   2025         Anthony Davis  PF  31         13 5  5        36.0
## 4 ATL   2025            Trae Young  PG  26          7 5  5        38.2
## 5 MIN   2025         Julius Randle  PF  30         11 4  4        35.0
##   fg_per_game fga_per_game fg_percent x3p_per_game x3pa_per_game x3p_percent
## 1    3.324315     2.045906  1.0390312   -0.9176843    -0.8476308  -1.4779930
## 2    3.177394     2.379431  0.6350170    1.5734192     0.7608850   1.0104499
## 3    2.736630     2.186338  0.5035908   -0.5618123    -0.4820591  -0.3468826
## 4    1.708182     2.186338 -0.1973495    1.7513552     2.5887439  -0.1206605
## 5    1.965294     1.238424  0.8005168    1.3065153     0.3953132   1.3497830
##   x2p_per_game x2pa_per_game  x2p_percent ft_per_game fta_per_game  ft_percent
## 1     4.845252      3.657154  0.739606024    2.548684     4.242604 -0.96713505
## 2     3.306730      3.002653  0.306408210    2.151196     2.174905  0.13325995
## 3     3.883676      3.604794  0.270680555    4.252207     4.422404  0.06776025
## 4     1.287418      1.405671 -0.006208769    4.592911     3.883004  0.69219074
## 5     1.864364      1.562752  0.391261390    1.299434     1.500655 -0.14620544
##   orb_per_game drb_per_game trb_per_game ast_per_game stl_per_game blk_per_game
## 1    0.1060724     3.565235    2.7843974    1.8744871   -0.4331731   0.29717458
## 2    3.3037309     2.538365    3.0252980    3.6121713    1.0786619   1.59299916
## 3    1.8114903     3.118770    2.9908836    0.4843397    1.5826069   2.88882375
## 4    0.3192497     0.618565    0.5818778    4.7540781    1.5826069  -0.07306102
## 5    1.1719586     1.288263    1.3734083    1.4773021   -0.2651914  -0.81353221
##   tov_per_game pf_per_game pts_per_game  all_star All_Defense All_NBA
## 1     2.282276   1.2898797     2.802804 0.9896131        <NA>    <NA>
## 2     1.725251   0.6929464     2.945718 0.9539439        <NA>    <NA>
## 3     0.982550  -0.4156440     2.828788 0.9993210        <NA>    <NA>
## 4     4.324704   0.2665655     2.516974 0.7856501        <NA>    <NA>
## 5     1.725251   1.1193273     1.880354 0.6709493        <NA>    <NA>
##   All_Rookie All-ABA All-BAA         dpoy mip         mvp roy smoy NBA_Champion
## 1       <NA>    <NA>    <NA> 3.820065e-04  NA 0.587326765  NA   NA         <NA>
## 2       <NA>    <NA>    <NA> 2.051102e-03  NA 0.009774761  NA   NA         <NA>
## 3       <NA>    <NA>    <NA> 6.348968e-04  NA 0.005514353  NA   NA         <NA>
## 4       <NA>    <NA>    <NA> 8.877353e-05  NA 0.002880467  NA   NA         <NA>
## 5       <NA>    <NA>    <NA> 8.685108e-05  NA 0.002038081  NA   NA         <NA>
##   Conference
## 1    Eastern
## 2    Western
## 3    Western
## 4    Eastern
## 5    Western