#Load in Data
All_Star_Selection <- read.csv("C://Users/james/Downloads/All-Star Selections.csv")
Player_Per_Game_Stats <- read.csv("C://Users/james/Downloads/Player Per Game.csv")
All_NBA_Selections <- read.csv("C://Users/james/Downloads/End Of Season Teams.csv")
Yearly_Awards <- read.csv("C://Users/james/Downloads/Player Award Shares.csv")
HOF <- read.csv("C://Users/james/Downloads/Player Career Info.csv")
NBA_Champions <- read.csv("C://Users/james/Downloads/NBA Finals and MVP.csv")
Clean & Organize the Data Remove Unwanted Columns From Each Dataset
# Filter years to 1980-2024 for HOF
HOF <- HOF[HOF$first_seas >= 1980, ]
# Get rid of columns we don't want for All-Star data
All_Star_Selection <- subset(All_Star_Selection, select = -c(team, lg, replaced))
# Get rid of columns we don't want for Player Per Game Stats
Player_Per_Game_Stats <- subset(Player_Per_Game_Stats,
select = -c(seas_id, player_id, birth_year, lg,
e_fg_percent))
# Get rid of columns we don't want for All NBA Selections
All_NBA_Selections <- subset(All_NBA_Selections,
select = c(season, type, number_tm,
player))
# Get rid of columns we don't want for Yearly Awards & only show winners
# remove clutch player of the year because it only recently existed
Yearly_Awards <- subset(Yearly_Awards,
select = c(season, award, player, winner))
Yearly_Awards <- Yearly_Awards[Yearly_Awards$winner == TRUE, ]
Yearly_Awards <- Yearly_Awards[Yearly_Awards$award != "clutch_poy", ]
# Get rid of columns we don't want for HOF & only show current Hall of Fame players
HOF <- subset(HOF, select = c(player, num_seasons, hof))
# Get rid of columns we don't want for NBA Champions
NBA_Champions <- subset(NBA_Champions, select = c(Year, NBA.Champion))
#Change Data Layout & Cleanup Data
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
All_NBA_Selections <- All_NBA_Selections %>%
pivot_wider(names_from = type, values_from = number_tm)
colnames(All_NBA_Selections)[which(names(All_NBA_Selections) ==
"All-Defense")] <- "All_Defense"
colnames(All_NBA_Selections)[which(names(All_NBA_Selections) ==
"All-NBA")] <- "All_NBA"
colnames(All_NBA_Selections)[which(names(All_NBA_Selections) ==
"All-Rookie")] <- "All_Rookie"
All_NBA_Selections$'All_Defense' <- ifelse(!is.na(All_NBA_Selections$'All_Defense'), paste(All_NBA_Selections$'All_Defense', "Team"), All_NBA_Selections$'All_Defense')
All_NBA_Selections$'All_NBA' <- ifelse(!is.na(All_NBA_Selections$'All_NBA'), paste(All_NBA_Selections$'All_NBA', "Team"), All_NBA_Selections$'All_NBA')
All_NBA_Selections$'All_Rookie' <- ifelse(!is.na(All_NBA_Selections$'All_Rookie'), paste(All_NBA_Selections$'All_Rookie', "Team"), All_NBA_Selections$'All_Rookie')
# Change Yearly Awards layout
Yearly_Awards <- Yearly_Awards %>%
pivot_wider(names_from = award, values_from = award, values_fill = NA,
values_fn = list(award = function(x) "Yes"))
colnames(Yearly_Awards)[which(names(Yearly_Awards) ==
"nba mvp")] <- "mvp"
colnames(Yearly_Awards)[which(names(Yearly_Awards) ==
"nba roy")] <- "roy"
Yearly_Awards <- subset(Yearly_Awards, select = c(season, player, dpoy, mip,
mvp, roy, smoy))
# Add Yes column to NBA Champions & Add most recent champs
NBA_Champions$tm <- NBA_Champions$NBA.Champion
NBA_Champions$season <- NBA_Champions$Year
NBA_Champions <- subset(NBA_Champions, select = c(season, tm))
recent_champions <- data.frame(
season = c(2019, 2020, 2021, 2022, 2023, 2024),
tm = c("TOR", "LAL", "MIL", "GSW", "DEN", "BOS")
)
NBA_Champions <- rbind(NBA_Champions, recent_champions)
NBA_Champions$NBA_Champion <- 'Yes'
Combine Datasets
# Merge Player Per Game Stats & All_Star Selection as NBA Data
All_Star_Selection$all_star <- "Yes"
NBA_Data <- merge(Player_Per_Game_Stats, All_Star_Selection,
by = c("player", "season"), all.x = TRUE)
# Merge All NBA Selections into NBA Data
NBA_Data <- merge(NBA_Data, All_NBA_Selections,
by = c("season", "player"), all.x = TRUE)
# Merge Yearly Awards into NBA Data
NBA_Data <- merge(NBA_Data, Yearly_Awards,
by = c("season", "player"), all.x = TRUE)
# Merge Champions into NBA Data
NBA_Data <- merge(NBA_Data, NBA_Champions,
by = c("season", "tm"), all.x = TRUE)
Add Hall of Famer Accolades to HOF Dataset
library(dplyr)
#All Star Count (all_star)
NBA_Data <- NBA_Data %>% mutate(all_star = ifelse(all_star == "Yes", 1, 0))
all_star_count <- NBA_Data %>%
filter(all_star == 1) %>%
group_by(player) %>%
summarise(all_star = n())
HOF <- left_join(HOF, all_star_count, by = "player")
HOF$all_star[is.na(HOF$all_star)] <- 0
# Defensive Player of The Year Count (dpoy)
NBA_Data <- NBA_Data %>% mutate(dpoy = ifelse(dpoy == "Yes", 1, 0))
dpoy_count <- NBA_Data %>%
filter(dpoy == 1) %>%
group_by(player) %>%
summarise(dpoy = n())
HOF <- left_join(HOF, dpoy_count, by = "player")
HOF$dpoy[is.na(HOF$dpoy)] <- 0
# Most Improved Player Count (MIP)
NBA_Data <- NBA_Data %>% mutate(mip = ifelse(mip == "Yes", 1, 0))
mip_count <- NBA_Data %>%
filter(mip == 1) %>%
group_by(player) %>%
summarise(mip = n())
HOF <- left_join(HOF, mip_count, by = "player")
HOF$mip[is.na(HOF$mip)] <- 0
# Most Valuable Player Count (MVP)
NBA_Data <- NBA_Data %>% mutate(mvp = ifelse(mvp == "Yes", 1, 0))
mvp_count <- NBA_Data %>%
filter(mvp == 1) %>%
group_by(player) %>%
summarise(mvp = n())
HOF <- left_join(HOF, mvp_count, by = "player")
HOF$mvp[is.na(HOF$mvp)] <- 0
# Rookie of the Year Count (ROY)
NBA_Data <- NBA_Data %>% mutate(roy = ifelse(roy == "Yes", 1, 0))
roy_count <- NBA_Data %>%
filter(roy == 1) %>%
group_by(player) %>%
summarise(roy = n())
HOF <- left_join(HOF, roy_count, by = "player")
HOF$roy[is.na(HOF$roy)] <- 0
# Sixth Man of the Year Count (SMOY)
NBA_Data <- NBA_Data %>% mutate(smoy = ifelse(smoy == "Yes", 1, 0))
smoy_count <- NBA_Data %>%
filter(smoy == 1) %>%
group_by(player) %>%
summarise(smoy = n())
HOF <- left_join(HOF, smoy_count, by = "player")
HOF$smoy[is.na(HOF$smoy)] <- 0
# All Defense Team Count (All_Defense)
All_Defense_Count <- NBA_Data %>%
filter(!is.na(All_Defense)) %>%
group_by(player) %>%
summarise(All_Defense = n())
HOF <- left_join(HOF, All_Defense_Count, by = "player")
HOF$All_Defense[is.na(HOF$All_Defense)] <- 0
# All NBA Team Count (All_NBA)
All_NBA_Count <- NBA_Data %>%
filter(!is.na(All_NBA)) %>%
group_by(player) %>%
summarise(All_NBA = n())
HOF <- left_join(HOF, All_NBA_Count, by = "player")
HOF$All_NBA[is.na(HOF$All_NBA)] <- 0
# All Rookie Team Count (All_Rookie)
All_Rookie_Count <- NBA_Data %>%
filter(!is.na(All_Rookie)) %>%
group_by(player) %>%
summarise(All_Rookie = n())
HOF <- left_join(HOF, All_Rookie_Count, by = "player")
HOF$All_Rookie[is.na(HOF$All_Rookie)] <- 0
# Championship Count
Championships_Count <- NBA_Data %>%
filter(!is.na(NBA_Champion)) %>%
group_by(player) %>%
summarise(NBA_Champion = n())
HOF <- left_join(HOF, Championships_Count, by = "player")
HOF$NBA_Champion[is.na(HOF$NBA_Champion)] <- 0
Fix Traded Players in Data
NBA_Data$tm[which(NBA_Data$tm == "TOT")] <- "aaTOT"
NBA_Data <- NBA_Data[order(NBA_Data$tm), ]
id <- paste(NBA_Data$season, NBA_Data$player, sep = "")
NBA_Data <- NBA_Data[!duplicated(id), ]
Fix Positions for Players with Multiple Listed
NBA_Data$pos <- substr(NBA_Data$pos, 1, 2)
Narrow Datasets Down to Years 1980 - 2024 This will reflect the creation of the 3 point line up to the present.
Current_Stats <- NBA_Data[NBA_Data$season >= 2025, ]
Player_Per_Game_Stats <- NBA_Data[NBA_Data$season >= 1980 &
NBA_Data$season <= 2024, ]
Create a Scale to Make Stats Comparable Across Years
#NBA_Data
seasons <- unique(NBA_Data$season)
Scaled_NBA_Data <- NBA_Data
for(i in 1:length(seasons)) {
Scaled_NBA_Data[which(NBA_Data$season == seasons[i]), 10:30 ] <- scale(Scaled_NBA_Data[which(NBA_Data$season == seasons[i]), 10:30])
}
# Current Data
seasons <- unique(Current_Stats$season)
Scaled_Current_Stats <- Current_Stats
for(i in 1:length(seasons)) {
Scaled_Current_Stats[which(Current_Stats$season == seasons[i]), 10:30 ] <- scale(Scaled_Current_Stats[which(Current_Stats$season == seasons[i]), 10:30])
}
Min 65 games played
Scaled_NBA_Data <- Scaled_NBA_Data[Scaled_NBA_Data$g >= 65, ]
Scaled_Current_Stats <- Scaled_Current_Stats
Create Dataset for Conferences
Western_Conf <- c('DAL', 'DEN', 'GSW', 'HOU', 'LAC', 'LAL', 'MEM', 'MIN',
'NOP', 'OKC', 'PHO', 'POR', 'SAC', 'SAS', 'UTA')
Eastern_Conf <- c('ATL', 'BOS', 'BRK', 'CHO', 'CHI', 'CLE', 'DET', 'IND',
'MIA', 'MIL', 'NYK', 'ORL', 'PHI', 'TOR', 'WAS')
# Create a dataframe for the Eastern Conference
eastern_df <- data.frame(tm = Eastern_Conf, Conference = "Eastern")
# Create a dataframe for the Western Conference
western_df <- data.frame(tm = Western_Conf, Conference = "Western")
# Combine the two dataframes into one
team_conference <- rbind(eastern_df, western_df)
Main Datasets
head(NBA_Data) # All NBA Player Data
## season tm player pos age experience g gs mp_per_game
## 150 1947 aaTOT Bob Duffy F 24 1 17 NA NA
## 151 1947 aaTOT Bob Fitzgerald F- 23 1 60 NA NA
## 152 1947 aaTOT Bob Mullens G 24 1 54 NA NA
## 153 1947 aaTOT Charlie Hoefer G 25 1 58 NA NA
## 154 1947 aaTOT Dick Murphy G 25 1 31 NA NA
## 155 1947 aaTOT Dick Schulz F- 30 1 57 NA NA
## fg_per_game fga_per_game fg_percent x3p_per_game x3pa_per_game x3p_percent
## 150 0.4 1.9 0.219 NA NA NA
## 151 1.2 6.0 0.193 NA NA NA
## 152 2.3 8.2 0.281 NA NA NA
## 153 2.2 8.9 0.253 NA NA NA
## 154 0.5 2.4 0.200 NA NA NA
## 155 2.3 9.6 0.237 NA NA NA
## x2p_per_game x2pa_per_game x2p_percent ft_per_game fta_per_game ft_percent
## 150 0.4 1.9 0.219 0.3 0.4 0.714
## 151 1.2 6.0 0.193 1.4 2.2 0.623
## 152 2.3 8.2 0.281 1.2 1.9 0.627
## 153 2.2 8.9 0.253 1.6 2.4 0.655
## 154 0.5 2.4 0.200 0.1 0.3 0.444
## 155 2.3 9.6 0.237 1.6 2.4 0.681
## orb_per_game drb_per_game trb_per_game ast_per_game stl_per_game
## 150 NA NA NA 0.0 NA
## 151 NA NA NA 0.6 NA
## 152 NA NA NA 1.0 NA
## 153 NA NA NA 0.6 NA
## 154 NA NA NA 0.3 NA
## 155 NA NA NA 1.0 NA
## blk_per_game tov_per_game pf_per_game pts_per_game all_star All_Defense
## 150 NA NA 1.0 1.1 NA <NA>
## 151 NA NA 2.6 3.7 NA <NA>
## 152 NA NA 1.7 5.8 NA <NA>
## 153 NA NA 2.4 6.1 NA <NA>
## 154 NA NA 0.5 1.1 NA <NA>
## 155 NA NA 2.2 6.2 NA <NA>
## All_NBA All_Rookie All-ABA All-BAA dpoy mip mvp roy smoy NBA_Champion
## 150 <NA> <NA> <NA> <NA> NA NA NA NA NA <NA>
## 151 <NA> <NA> <NA> <NA> NA NA NA NA NA <NA>
## 152 <NA> <NA> <NA> <NA> NA NA NA NA NA <NA>
## 153 <NA> <NA> <NA> <NA> NA NA NA NA NA <NA>
## 154 <NA> <NA> <NA> <NA> NA NA NA NA NA <NA>
## 155 <NA> <NA> <NA> <NA> NA NA NA NA NA <NA>
head(HOF) # Hall of Fame Player Data
## player num_seasons hof all_star dpoy mip mvp roy smoy All_Defense
## 1 Abdul Jeelani 2 FALSE 0 0 0 0 0 0 0
## 2 Allen Leavell 10 FALSE 0 0 0 0 0 0 0
## 3 Arvid Kramer 1 FALSE 0 0 0 0 0 0 0
## 4 Bernard Toone 1 FALSE 0 0 0 0 0 0 0
## 5 Bill Cartwright 15 FALSE 1 0 0 0 0 0 0
## 6 Billy Ray Bates 4 FALSE 0 0 0 0 0 0 0
## All_NBA All_Rookie NBA_Champion
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 1 3
## 6 0 0 0
Variables NBA_Data Predictor Variables player - Player Name season - Season of stats pos - Player Position age - Players Age experience - How many years player has been in league g - Games Played gs - Games started mp_per_game - Mins per game fg_per_game - Field goals per game fga_per_game - Field goals attempted per game fg_percent - Field goal % x3p_per_game - 3 point shots made per game x3pa_per_game - 3 point shots attempted per game x3p_percent - 3 point % x2p_per_game - 2 point shots made per game x2pa_per_game - 2 point shots attempted per game x2p_percent - 2 point % ft_per_game - free throws made per game fta_per_game - free throws attempted per game ft_percent - free throw % orb_per_game - Offensive rebounds per game drb_per_game - defensive rebounds per game trb_per_game - Total rebounds per game ast_per_game - Assists per game stl_per_game - Steals per game blk_per_game - Blocks per game tov_per_game - turnovers per game pf_per_game - personal fouls per game pts_per_game - Pts per game all_star - Was player an all-star (Yes or No) All_Defense - Did player make all defense 1st or 2nd team All_NBA - did player make all nba 1st or 2nd team All_Rookie - did player make all rookie 1st or 2nd team dpoy - did player win defensive player of the year mip - did player win most improved player mvp - did player win most valuable player roy - did player win rookie of the year smoy - did player win sixth man of the year HOF Predictor Variables num_seasons - number of seasons a player played for all_star - number of all star appearances dpoy - number of defensive player of the year awards mip - number of most improved player awards mvp - number of most valuable player awards roy - number of rookie of the year awards (max 1) smoy - number of six man of the year awards All_Defense - number of all defensive team selections All_NBA - number of all NBA team selections All_Rookie - number of all rookie team selections (max 1) Variable Analysis
library(ggplot2)
# Player Position vs Points
ggplot(NBA_Data, aes(x = pos, y = pts_per_game)) +
geom_boxplot() +
labs(title = "Player Position vs Points Per Game", x = "Position",
y = "Points Per Game") +
theme_minimal()
# Average Awards and All-Star Appearances for Hall of Famers
library(dplyr)
avg_awards <- HOF %>%
filter(hof == TRUE) %>%
summarise(
avg_num_seasons = mean(num_seasons, na.rm = TRUE),
avg_all_star = mean(all_star, na.rm = TRUE),
avg_dpoy = mean(dpoy, na.rm = TRUE),
avg_mip = mean(mip, na.rm = TRUE),
avg_mvp = mean(mvp, na.rm = TRUE),
avg_roy = mean(roy, na.rm = TRUE),
avg_smoy = mean(smoy, na.rm = TRUE),
avg_all_defense = mean(All_Defense, na.rm = TRUE),
avg_all_NBA = mean(All_NBA, na.rm = TRUE),
avg_all_Rookie = mean(All_Rookie, na.rm = TRUE)
)
avg_awards_long <- avg_awards %>%
pivot_longer(cols = everything(), names_to = "Award", values_to = "Average")
ggplot(avg_awards_long, aes(x = Award, y = Average, fill = Award)) +
geom_bar(stat = "identity") +
labs(title = "Average Stats of Hall of Famers", x = "Stat", y = "Average Count") +
theme_minimal() +
coord_flip()+
theme(legend.position = "none")
avg_all_star <- NBA_Data %>%
filter(all_star == 1) %>%
summarise(
avg_ppg = mean(pts_per_game, na.rm = TRUE),
avg_trb = mean(trb_per_game, na.rm = TRUE),
avg_apg = mean(ast_per_game, na.rm = TRUE),
avg_spg = mean(stl_per_game, na.rm = TRUE),
avg_bpg = mean(blk_per_game, na.rm = TRUE),
avg_mpg = mean(mp_per_game, na.rm = TRUE)
)
avg_allstar_long <- avg_all_star %>%
pivot_longer(cols = everything(), names_to = "Stat", values_to = "Average")
ggplot(avg_allstar_long, aes(x = Stat, y = Average, fill = Stat)) +
geom_bar(stat = "identity") +
labs(title = "Per Game Averages of All Stars", x = "Stat", y = "Average Count") +
theme_minimal() +
coord_flip()+
theme(legend.position = "none")
Split data into Test & Training Data
set.seed(7)
total_obs <- dim(Scaled_NBA_Data)[1]
train_data_indices <- sample(1:total_obs, 0.6*total_obs)
train_data <- Scaled_NBA_Data[train_data_indices,]
test_data <- Scaled_NBA_Data[-train_data_indices,]
# HOF Data
set.seed(7)
total_obs_HOF <- dim(HOF)[1]
train_data_indices_HOF <- sample(1:total_obs_HOF, 0.6*total_obs_HOF)
train_data_HOF <- HOF[train_data_indices_HOF,]
test_data_HOF <- HOF[-train_data_indices_HOF,]
Models Logistic Regression
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.2
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# All-Star Predictor Model
train_data$all_star[is.na(train_data$all_star)] <- 0
test_data$all_star[is.na(test_data$all_star)] <- 0
lm_all_star <- glm(all_star ~ fg_per_game + fga_per_game + fg_percent +
x3p_per_game + x3pa_per_game + x3p_percent +
x2p_per_game + x2pa_per_game + x2p_percent +
ft_per_game + fta_per_game + ft_percent +
orb_per_game + drb_per_game + trb_per_game +
ast_per_game + stl_per_game + blk_per_game + tov_per_game + pts_per_game,
family = 'binomial', data = train_data)
lm_all_star_pred <- predict(lm_all_star, newdata = test_data, type = 'response')
preds_all_star_cat <- ifelse(lm_all_star_pred>.5, 1, 0)
lm_all_star_acc <- confusionMatrix(factor(preds_all_star_cat),
factor(test_data$all_star), positive = '1')
# Yearly Awards Predictor Model
#MVP
train_data$mvp[is.na(train_data$mvp)] <- 0
test_data$mvp[is.na(test_data$mvp)] <- 0
lm_MVP <- glm(mvp ~ fg_per_game + fga_per_game + fg_percent +
x3p_per_game + x3pa_per_game + x3p_percent +
x2p_per_game + x2pa_per_game + x2p_percent +
ft_per_game + fta_per_game + ft_percent +
orb_per_game + drb_per_game + trb_per_game +
ast_per_game + stl_per_game + blk_per_game + tov_per_game + pts_per_game,
family = 'binomial', data = train_data)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm_MVP_pred <- predict(lm_MVP, newdata = test_data, type = 'response')
preds_MVP_cat <- ifelse(lm_MVP_pred>.5, 1, 0)
lm_MVP_acc <- confusionMatrix(factor(preds_MVP_cat),
factor(test_data$mvp), positive = '1')
#DPOY
train_data$dpoy[is.na(train_data$dpoy)] <- 0
test_data$dpoy[is.na(test_data$dpoy)] <- 0
lm_DPOY <- glm(dpoy ~ fg_per_game + fga_per_game + fg_percent +
x3p_per_game + x3pa_per_game + x3p_percent +
x2p_per_game + x2pa_per_game + x2p_percent +
ft_per_game + fta_per_game + ft_percent +
orb_per_game + drb_per_game + trb_per_game +
ast_per_game + stl_per_game + blk_per_game + tov_per_game + pts_per_game,
family = 'binomial', data = train_data)
lm_dpoy_pred <- predict(lm_DPOY, newdata = test_data, type = 'response')
preds_dpoy_cat <- ifelse(lm_dpoy_pred>.5, 1, 0)
lm_dpoy_acc <- confusionMatrix(factor(preds_dpoy_cat),
factor(test_data$dpoy), positive = '1')
# Hall of Fame Predictor Model
lm_HOF <- glm(hof ~ num_seasons + all_star + dpoy + mip + mvp + roy + smoy +
All_Defense + All_NBA + All_Rookie + NBA_Champion,
family = 'binomial', data = train_data_HOF)
lm_HOF_pred <- predict(lm_HOF, newdata = test_data_HOF, type = 'response')
preds_HOF_cat <- ifelse(lm_HOF_pred>.5, 'TRUE', 'FALSE')
lm_HOF_acc <- confusionMatrix(factor(preds_HOF_cat),
factor(test_data_HOF$hof), positive = 'TRUE')
# All-Star Predictor Model
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.2
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Replace NAs in target columns with 0
train_data$all_star[is.na(train_data$all_star)] <- 0
test_data$all_star[is.na(test_data$all_star)] <- 0
train_data$all_star <- as.numeric(train_data$all_star)
test_data$all_star <- as.numeric(test_data$all_star)
predictor_columns <- c(
"fg_per_game", "fga_per_game", "fg_percent",
"x3p_per_game", "x3pa_per_game", "x3p_percent",
"x2p_per_game", "x2pa_per_game", "x2p_percent",
"ft_per_game", "fta_per_game", "ft_percent",
"orb_per_game", "drb_per_game", "trb_per_game",
"ast_per_game", "stl_per_game", "blk_per_game",
"tov_per_game", "pts_per_game"
)
# Create training and test matrices
X_train <- as.matrix(train_data[, predictor_columns])
X_test <- as.matrix(test_data[, predictor_columns])
#XGBoost All-Star
y_train_all_star <- train_data$all_star
y_test_all_star <- test_data$all_star
# Train XGBoost model
xgb_all_star <- xgboost(data = X_train, label = y_train_all_star,
nrounds = 100, objective = "binary:logistic", verbose = 0)
# Predict on test set
all_star_preds <- predict(xgb_all_star, X_test)
all_star_preds_binary <- ifelse(all_star_preds > 0.5, 1, 0)
# Evaluate model
confusionMatrix(factor(all_star_preds_binary), factor(y_test_all_star), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4057 227
## 1 140 469
##
## Accuracy : 0.925
## 95% CI : (0.9173, 0.9322)
## No Information Rate : 0.8578
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6757
##
## Mcnemar's Test P-Value : 7.15e-06
##
## Sensitivity : 0.67385
## Specificity : 0.96664
## Pos Pred Value : 0.77011
## Neg Pred Value : 0.94701
## Prevalence : 0.14224
## Detection Rate : 0.09585
## Detection Prevalence : 0.12446
## Balanced Accuracy : 0.82025
##
## 'Positive' Class : 1
##
test_data$xgb_all_star_preds <- all_star_preds
# Yearly Awards Predictor Model (MVP & DPOY)
library(xgboost)
library(caret)
library(dplyr)
# Replace NAs in target columns with 0
train_data$mvp[is.na(train_data$mvp)] <- 0
test_data$mvp[is.na(test_data$mvp)] <- 0
train_data$dpoy[is.na(train_data$dpoy)] <- 0
test_data$dpoy[is.na(test_data$dpoy)] <- 0
# Convert target variables to numeric
train_data$mvp <- as.numeric(train_data$mvp)
test_data$mvp <- as.numeric(test_data$mvp)
train_data$dpoy <- as.numeric(train_data$dpoy)
test_data$dpoy <- as.numeric(test_data$dpoy)
predictor_columns <- c(
"fg_per_game", "fga_per_game", "fg_percent",
"x3p_per_game", "x3pa_per_game", "x3p_percent",
"x2p_per_game", "x2pa_per_game", "x2p_percent",
"ft_per_game", "fta_per_game", "ft_percent",
"orb_per_game", "drb_per_game", "trb_per_game",
"ast_per_game", "stl_per_game", "blk_per_game",
"tov_per_game", "pts_per_game"
)
# Create training and test matrices
X_train <- as.matrix(train_data[, predictor_columns])
X_test <- as.matrix(test_data[, predictor_columns])
#XGBoost MVP
y_train_mvp <- train_data$mvp
y_test_mvp <- test_data$mvp
# Train XGBoost model
xgb_mvp <- xgboost(data = X_train, label = y_train_mvp,
nrounds = 100, objective = "binary:logistic", verbose = 0)
# Predict on test set
mvp_preds <- predict(xgb_mvp, X_test)
mvp_preds_binary <- ifelse(mvp_preds > 0.5, 1, 0)
# Evaluate model
confusionMatrix(factor(mvp_preds_binary), factor(y_test_mvp), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4859 31
## 1 2 1
##
## Accuracy : 0.9933
## 95% CI : (0.9905, 0.9954)
## No Information Rate : 0.9935
## P-Value [Acc > NIR] : 0.6153
##
## Kappa : 0.0561
##
## Mcnemar's Test P-Value : 1.093e-06
##
## Sensitivity : 0.0312500
## Specificity : 0.9995886
## Pos Pred Value : 0.3333333
## Neg Pred Value : 0.9936605
## Prevalence : 0.0065400
## Detection Rate : 0.0002044
## Detection Prevalence : 0.0006131
## Balanced Accuracy : 0.5154193
##
## 'Positive' Class : 1
##
test_data$xgb_mvp_preds <- mvp_preds
#XGBoost DPOY
y_train_dpoy <- train_data$dpoy
y_test_dpoy <- test_data$dpoy
# Train XGBoost model
xgb_dpoy <- xgboost(data = X_train, label = y_train_dpoy,
nrounds = 100, objective = "binary:logistic", verbose = 0, scale_pos_weight = 4876/17,
)
x_test_2 <- Scaled_Current_Stats[,colnames(X_test)]
dpoy_preds <- predict(xgb_dpoy, as.matrix(x_test_2))
Scaled_Current_Stats$dpoy <- dpoy_preds
# Predict on test set
dpoy_preds <- predict(xgb_dpoy, X_test)
dpoy_preds_binary <- ifelse(dpoy_preds > 0.5, 1, 0)
# Evaluate model
confusionMatrix(factor(dpoy_preds_binary), factor(y_test_dpoy), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4870 8
## 1 10 5
##
## Accuracy : 0.9963
## 95% CI : (0.9942, 0.9978)
## No Information Rate : 0.9973
## P-Value [Acc > NIR] : 0.9304
##
## Kappa : 0.3553
##
## Mcnemar's Test P-Value : 0.8137
##
## Sensitivity : 0.384615
## Specificity : 0.997951
## Pos Pred Value : 0.333333
## Neg Pred Value : 0.998360
## Prevalence : 0.002657
## Detection Rate : 0.001022
## Detection Prevalence : 0.003066
## Balanced Accuracy : 0.691283
##
## 'Positive' Class : 1
##
test_data$xgb_dpoy_preds <- dpoy_preds
XG Boost
library(xgboost)
library(Matrix)
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(caret)
train_data_HOF$hof <- ifelse(train_data_HOF$hof == TRUE, 1, 0)
test_data_HOF$hof <- ifelse(test_data_HOF$hof == TRUE, 1, 0)
# Select predictor variables
predictors <- c("num_seasons", "all_star", "dpoy", "mip", "mvp", "roy",
"smoy", "All_Defense", "All_NBA", "All_Rookie", "NBA_Champion")
# Convert data to matrix format for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(train_data_HOF[, predictors]),
label = train_data_HOF$hof)
dtest <- xgb.DMatrix(data = as.matrix(test_data_HOF[, predictors]),
label = test_data_HOF$hof)
# Set model parameters
params <- list(
booster = "gbtree",
objective = "binary:logistic", # Binary classification
eval_metric = "logloss", # Log-loss for evaluation
max_depth = 6, # Maximum tree depth
eta = 0.1, # Learning rate
gamma = 0, # Minimum loss reduction to make a split
subsample = 0.8, # Subsample ratio of training data
colsample_bytree = 0.8 # Subsample ratio of columns
)
# Train the XGBoost model
set.seed(7)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 100, # Number of boosting rounds
watchlist = list(train = dtrain, test = dtest),
print_every_n = 10,
early_stopping_rounds = 10, # Stop if no improvement in 10 rounds
maximize = FALSE,
scale_pos_weight = 1346/22,
)
## [1] train-logloss:0.606670 test-logloss:0.607166
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
##
## [11] train-logloss:0.201228 test-logloss:0.211551
## [21] train-logloss:0.087788 test-logloss:0.103340
## [31] train-logloss:0.049749 test-logloss:0.070842
## [41] train-logloss:0.033950 test-logloss:0.059976
## [51] train-logloss:0.026648 test-logloss:0.056632
## [61] train-logloss:0.021273 test-logloss:0.055322
## Stopping. Best iteration:
## [58] train-logloss:0.022377 test-logloss:0.054873
xgb_predictions <- predict(xgb_model, dtest)
xgb_predictions_class <- ifelse(xgb_predictions > 0.5, 1, 0)
# Evaluate model accuracy
conf_matrix <- confusionMatrix(
factor(xgb_predictions_class),
factor(test_data_HOF$hof),
positive = "1"
)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1326 5
## 1 20 17
##
## Accuracy : 0.9817
## 95% CI : (0.9731, 0.9881)
## No Information Rate : 0.9839
## P-Value [Acc > NIR] : 0.77869
##
## Kappa : 0.5675
##
## Mcnemar's Test P-Value : 0.00511
##
## Sensitivity : 0.77273
## Specificity : 0.98514
## Pos Pred Value : 0.45946
## Neg Pred Value : 0.99624
## Prevalence : 0.01608
## Detection Rate : 0.01243
## Detection Prevalence : 0.02705
## Balanced Accuracy : 0.87893
##
## 'Positive' Class : 1
##
test_data_HOF$xgb_predictions <- xgb_predictions
#Final Predictions
# All Star Final Prediction
x_test_1 <- Scaled_Current_Stats[,colnames(X_test)]
all_star_preds <- predict(xgb_all_star, as.matrix(x_test_2))
Scaled_Current_Stats$all_star <- all_star_preds
Scaled_Current_Stats <- merge(Scaled_Current_Stats, team_conference, by = "tm", all.x = TRUE)
# MVP Final Prediction
x_test_2 <- Scaled_Current_Stats[,colnames(X_test)]
mvp_preds <- predict(xgb_mvp, as.matrix(x_test_2))
Scaled_Current_Stats$mvp <- mvp_preds
# DPOY Final Prediction
x_test_3 <- Scaled_Current_Stats[,colnames(X_test)]
dpoy_preds <- predict(xgb_dpoy, as.matrix(x_test_2))
Scaled_Current_Stats$dpoy <- dpoy_preds
# HOF Final Prediction
test_data_HOF$xgb_predictions <- xgb_predictions
#Outcomes
library(dplyr)
# All-Star
top_5_All_Stars <- Scaled_Current_Stats %>%
group_by(Conference) %>%
arrange(desc(all_star)) %>%
slice_head(n = 12)
# MVP
top_5_MVP <- Scaled_Current_Stats %>%
arrange(desc(mvp)) %>%
slice_head(n = 5)
# DPOY
top_5_DPOY <- Scaled_Current_Stats %>%
arrange(desc(dpoy)) %>%
slice_head(n = 5)
# HOF
top_5_HOF <- test_data_HOF %>%
filter(player %in% Scaled_Current_Stats$player)
top_5_HOF <- top_5_HOF %>%
arrange(desc(xgb_predictions)) %>%
slice_head(n = 5)
# All-Star
top_5_All_Stars <- Scaled_Current_Stats %>%
group_by(Conference) %>%
arrange(desc(all_star)) %>%
slice_head(n = 5)
# MVP
top_5_MVP <- Scaled_Current_Stats %>%
arrange(desc(mvp)) %>%
slice_head(n = 5)
# DPOY
top_5_DPOY <- Scaled_Current_Stats %>%
arrange(desc(dpoy)) %>%
slice_head(n = 5)
# HOF
top_5_HOF <- test_data_HOF %>%
filter(player %in% Scaled_Current_Stats$player)
top_5_HOF <- top_5_HOF %>%
arrange(desc(xgb_predictions)) %>%
slice_head(n = 5)
print(top_5_All_Stars)
## # A tibble: 10 × 43
## # Groups: Conference [2]
## tm season player pos age experience g gs mp_per_game
## <chr> <int> <chr> <chr> <int> <int> <int> <int> <dbl>
## 1 MIL 2025 Giannis Antetoko… PF 30 12 5 5 34.4
## 2 ORL 2025 Paolo Banchero SF 22 3 5 5 36.4
## 3 BOS 2025 Jayson Tatum SF 26 8 5 5 35.4
## 4 PHI 2025 Tyrese Maxey PG 24 5 4 4 42
## 5 CHO 2025 LaMelo Ball PG 23 5 4 4 34
## 6 LAL 2025 Anthony Davis PF 31 13 5 5 36
## 7 OKC 2025 Shai Gilgeous-Al… SG 26 7 4 4 32.3
## 8 DEN 2025 Nikola Jokić C 29 10 4 4 39
## 9 SAC 2025 Domantas Sabonis C 28 9 4 4 36.3
## 10 HOU 2025 Jalen Green SG 22 4 5 5 34.8
## # ℹ 34 more variables: fg_per_game <dbl>, fga_per_game <dbl>, fg_percent <dbl>,
## # x3p_per_game <dbl>, x3pa_per_game <dbl>, x3p_percent <dbl>,
## # x2p_per_game <dbl>, x2pa_per_game <dbl>, x2p_percent <dbl>,
## # ft_per_game <dbl>, fta_per_game <dbl>, ft_percent <dbl>,
## # orb_per_game <dbl>, drb_per_game <dbl>, trb_per_game <dbl>,
## # ast_per_game <dbl>, stl_per_game <dbl>, blk_per_game <dbl>,
## # tov_per_game <dbl>, pf_per_game <dbl>, pts_per_game <dbl>, …
print(top_5_DPOY)
## tm season player pos age experience g gs mp_per_game fg_per_game
## 1 CLE 2025 Jarrett Allen C 26 8 5 5 28.8 1.3408786
## 2 UTA 2025 Walker Kessler C 23 3 5 5 26.6 0.1655087
## 3 TOR 2025 Jakob Poeltl C 29 9 5 5 31.8 0.9001149
## 4 OKC 2025 Chet Holmgren PF 22 2 4 4 31.5 1.9652939
## 5 CHO 2025 Nick Richards C 27 5 4 4 30.5 0.7899240
## fga_per_game fg_percent x3p_per_game x3pa_per_game x3p_percent x2p_per_game
## 1 0.2905114 1.6961626 -0.9176843 -1.0669739 NA 2.2489952
## 2 -0.1307833 0.6739582 -0.9176843 -1.0669739 NA 0.7104725
## 3 0.5362667 0.5717377 -0.9176843 -1.0669739 NA 1.6720492
## 4 1.3261942 0.6982964 0.4168355 0.5780991 0.02864603 2.3451528
## 5 0.1851877 1.0390312 -0.9176843 -0.9573024 -1.47799304 1.5278127
## x2pa_per_game x2p_percent ft_per_game fta_per_game ft_percent orb_per_game
## 1 1.1962310 1.2442591 1.5265707 1.4557048 0.26425936 1.385136
## 2 0.5679102 0.3064082 -0.4040882 -0.2523947 -0.77063594 3.090554
## 3 1.5627515 0.2126231 0.6180254 0.6466050 0.07649354 3.303731
## 4 1.5627515 0.8378571 1.4130026 1.7254047 -0.22480509 1.491724
## 5 0.9867908 0.7351401 0.8451617 1.3658048 -0.64400318 4.156440
## drb_per_game trb_per_game ast_per_game stl_per_game blk_per_game tov_per_game
## 1 2.761598 2.577911 -0.40932652 0.23875360 2.518588 -0.3171764
## 2 1.868667 2.440254 -0.31003028 0.23875360 3.999531 0.0541740
## 3 2.672304 3.128541 0.18645094 -0.09720973 1.407881 1.3539004
## 4 2.761598 2.577911 0.38504342 1.41462525 5.665591 1.7252508
## 5 1.957960 2.853226 -0.01214155 -0.26519139 3.814413 0.3326868
## pf_per_game pts_per_game all_star All_Defense All_NBA All_Rookie All-ABA
## 1 -0.7567487 1.1657795 0.03475658 <NA> <NA> <NA> <NA>
## 2 -0.2450916 -0.1074614 0.02336687 <NA> <NA> <NA> <NA>
## 3 1.6309844 0.6460894 0.14272566 <NA> <NA> <NA> <NA>
## 4 0.5223940 1.7764155 0.55158758 <NA> <NA> <NA> <NA>
## 5 1.3751559 0.6071126 0.06854929 <NA> <NA> <NA> <NA>
## All-BAA dpoy mip mvp roy smoy NBA_Champion Conference
## 1 <NA> 0.12999384 NA 2.349836e-04 NA NA <NA> Eastern
## 2 <NA> 0.10269659 NA 2.996007e-05 NA NA <NA> Western
## 3 <NA> 0.02903398 NA 2.165260e-05 NA NA <NA> Eastern
## 4 <NA> 0.02834208 NA 7.413420e-04 NA NA <NA> Western
## 5 <NA> 0.02229822 NA 1.799389e-04 NA NA <NA> Eastern
print(top_5_HOF)
## player num_seasons hof all_star dpoy mip mvp roy smoy All_Defense
## 1 Stephen Curry 16 0 10 0 0 2 0 0 0
## 2 Kevin Durant 17 0 16 0 0 1 1 0 0
## 3 James Harden 16 0 14 0 0 1 0 1 0
## 4 LeBron James 22 0 20 0 0 4 1 0 6
## 5 Russell Westbrook 17 0 9 0 0 1 0 0 0
## All_NBA All_Rookie NBA_Champion xgb_predictions
## 1 10 1 4 0.9968592
## 2 11 1 2 0.9939582
## 3 7 1 0 0.9930856
## 4 20 1 4 0.9927462
## 5 9 1 0 0.9852502
print(top_5_MVP)
## tm season player pos age experience g gs mp_per_game
## 1 MIL 2025 Giannis Antetokounmpo PF 30 12 5 5 34.4
## 2 DEN 2025 Nikola Jokić C 29 10 4 4 39.0
## 3 LAL 2025 Anthony Davis PF 31 13 5 5 36.0
## 4 ATL 2025 Trae Young PG 26 7 5 5 38.2
## 5 MIN 2025 Julius Randle PF 30 11 4 4 35.0
## fg_per_game fga_per_game fg_percent x3p_per_game x3pa_per_game x3p_percent
## 1 3.324315 2.045906 1.0390312 -0.9176843 -0.8476308 -1.4779930
## 2 3.177394 2.379431 0.6350170 1.5734192 0.7608850 1.0104499
## 3 2.736630 2.186338 0.5035908 -0.5618123 -0.4820591 -0.3468826
## 4 1.708182 2.186338 -0.1973495 1.7513552 2.5887439 -0.1206605
## 5 1.965294 1.238424 0.8005168 1.3065153 0.3953132 1.3497830
## x2p_per_game x2pa_per_game x2p_percent ft_per_game fta_per_game ft_percent
## 1 4.845252 3.657154 0.739606024 2.548684 4.242604 -0.96713505
## 2 3.306730 3.002653 0.306408210 2.151196 2.174905 0.13325995
## 3 3.883676 3.604794 0.270680555 4.252207 4.422404 0.06776025
## 4 1.287418 1.405671 -0.006208769 4.592911 3.883004 0.69219074
## 5 1.864364 1.562752 0.391261390 1.299434 1.500655 -0.14620544
## orb_per_game drb_per_game trb_per_game ast_per_game stl_per_game blk_per_game
## 1 0.1060724 3.565235 2.7843974 1.8744871 -0.4331731 0.29717458
## 2 3.3037309 2.538365 3.0252980 3.6121713 1.0786619 1.59299916
## 3 1.8114903 3.118770 2.9908836 0.4843397 1.5826069 2.88882375
## 4 0.3192497 0.618565 0.5818778 4.7540781 1.5826069 -0.07306102
## 5 1.1719586 1.288263 1.3734083 1.4773021 -0.2651914 -0.81353221
## tov_per_game pf_per_game pts_per_game all_star All_Defense All_NBA
## 1 2.282276 1.2898797 2.802804 0.9896131 <NA> <NA>
## 2 1.725251 0.6929464 2.945718 0.9539439 <NA> <NA>
## 3 0.982550 -0.4156440 2.828788 0.9993210 <NA> <NA>
## 4 4.324704 0.2665655 2.516974 0.7856501 <NA> <NA>
## 5 1.725251 1.1193273 1.880354 0.6709493 <NA> <NA>
## All_Rookie All-ABA All-BAA dpoy mip mvp roy smoy NBA_Champion
## 1 <NA> <NA> <NA> 3.820065e-04 NA 0.587326765 NA NA <NA>
## 2 <NA> <NA> <NA> 2.051102e-03 NA 0.009774761 NA NA <NA>
## 3 <NA> <NA> <NA> 6.348968e-04 NA 0.005514353 NA NA <NA>
## 4 <NA> <NA> <NA> 8.877353e-05 NA 0.002880467 NA NA <NA>
## 5 <NA> <NA> <NA> 8.685108e-05 NA 0.002038081 NA NA <NA>
## Conference
## 1 Eastern
## 2 Western
## 3 Western
## 4 Eastern
## 5 Western