#Packages
library(tidyverse)
library(jsonlite)
library(janitor)
library(tidymodels)
library(ggplot2)
library(rsample)
library(parsnip)
library(BBmisc)
library(corrplot)
library(rvest)
library(reactable)
library(reactablefmtr)
library(class)
library(e1071)
library(pROC)
library(neuralnet)
library(rvest)
library(caret)
#salaries
salaries_21 <- read_html("https://hoopshype.com/salaries/players/2020-2021/") %>%
html_nodes("table") %>%
html_table(fill = TRUE)
salaries_21 <- salaries_21[[1]] %>%
clean_names() %>%
select(x2,
x3)
salaries_21$x3 <- as.numeric(gsub('[$,]','', salaries_21$x3))
salaries_21 <- salaries_21 %>%
mutate( season = 2021) %>%
mutate(minimums = case_when(
x3 < 2600000 ~ "minimum",
TRUE ~ "other"
)
) %>%
filter( minimums == "minimum")
salaries_21 <- salaries_21 %>%
rename(name = x2)
salaries_22 <- read_html("https://hoopshype.com/salaries/players/2021-2022/") %>%
html_nodes("table") %>%
html_table(fill = TRUE)
salaries_22 <- salaries_22[[1]] %>%
clean_names() %>%
select(x2,
x3)
salaries_22$x3 <- as.numeric(gsub('[$,]','', salaries_22$x3))
salaries_22 <- salaries_22 %>%
mutate( season = 2022) %>%
mutate(minimums = case_when(
x3 < 2700000 ~ "minimum",
TRUE ~ "other"
)
) %>%
filter( minimums == "minimum")
salaries_22 <- salaries_22 %>%
rename(name = x2)
salaries_23 <- read_html("https://hoopshype.com/salaries/players/2022-2023/") %>%
html_nodes("table") %>%
html_table(fill = TRUE)
salaries_23 <- salaries_23[[1]] %>%
clean_names() %>%
select(x2,
x3)
salaries_23$x3 <- as.numeric(gsub('[$,]','', salaries_23$x3))
salaries_23 <- salaries_23 %>%
mutate( season = 2023) %>%
mutate(minimums = case_when(
x3 < 3200000 ~ "minimum",
TRUE ~ "other"
)
) %>%
filter( minimums == "minimum") %>%
rename( name = x2)
salaries_24 <- read_html("https://hoopshype.com/salaries/players/") %>%
html_nodes("table") %>%
html_table(fill = TRUE)
salaries_24 <- salaries_24[[1]] %>%
clean_names() %>%
select(x2,
x3)
salaries_24 <- salaries_24[-1,]
salaries_24$x3 <- as.numeric(gsub('[$,]','', salaries_24$x3))
salaries_24 <- salaries_24 %>%
mutate( season = 2024) %>%
mutate(minimums = case_when(
x3 < 3200000 ~ "minimum",
TRUE ~ "other"
)
) %>%
filter( minimums == "minimum") %>%
rename(name = x2)
#playoff data
pbp_play_21_url <- "https://api.pbpstats.com/get-totals/nba?Season=2020-21&SeasonType=Playoffs&Type=Player"
pbp_play_21_json <- read_json(pbp_play_21_url)
pbp_play_21 <- pbp_play_21_json[["multi_row_table_data"]] %>%
bind_rows() %>%
clean_names() %>%
mutate( playoff_minutes_per_game = minutes/games_played) %>%
select(
name,
playoff_minutes_per_game
)
pbp_play_21[is.na(pbp_play_21)] = 0
pbp_play_22_url <- "https://api.pbpstats.com/get-totals/nba?Season=2021-22&SeasonType=Playoffs&Type=Player"
pbp_play_22_json <- read_json(pbp_play_22_url)
pbp_play_22 <- pbp_play_22_json[["multi_row_table_data"]] %>%
bind_rows() %>%
clean_names() %>%
mutate( playoff_minutes_per_game = minutes/games_played) %>%
select(
name,
playoff_minutes_per_game
)
pbp_play_22[is.na(pbp_play_22)] = 0
pbp_play_23_url <- "https://api.pbpstats.com/get-totals/nba?Season=2022-23&SeasonType=Playoffs&Type=Player"
pbp_play_23_json <- read_json(pbp_play_23_url)
pbp_play_23 <- pbp_play_23_json[["multi_row_table_data"]] %>%
bind_rows() %>%
clean_names() %>%
mutate( playoff_minutes_per_game = minutes/games_played) %>%
select(
name,
playoff_minutes_per_game
)
pbp_play_23[is.na(pbp_play_23)] = 0
#regular season
# 2 year data
pbp_reg_20_21_url <- "https://api.pbpstats.com/get-totals/nba?Season=2019-20,2020-21&SeasonType=Regular%2BSeason&Type=Player"
pbp_reg_20_21_json <- read_json(pbp_reg_20_21_url)
pbp_reg_20_21 <- pbp_reg_20_21_json[["multi_row_table_data"]] %>%
bind_rows() %>%
clean_names() %>%
select(
name,
games_played,
minutes,
plus_minus,
assisted2s_pct,
assisted3s_pct,
fg3pct,
second_chance_fg3pct,
fg2pct,
efg_pct,
ts_pct,
fg3a_pct,
usage,
live_ball_turnover_pct,
at_rim_frequency,
at_rim_accuracy,
at_rim_pct_assisted,
short_mid_range_frequency,
short_mid_range_accuracy,
long_mid_range_frequency,
long_mid_range_accuracy,
corner3frequency,
corner3pct_assisted,
arc3frequency,
arc3accuracy,
arc3pct_assisted,
shooting_fouls_drawn_pct,
at_rim_assists,
three_pt_assists
)
pbp_reg_20_21[is.na(pbp_reg_20_21)] = 0
pbp_reg_21_22_url <- "https://api.pbpstats.com/get-totals/nba?Season=2020-21,2021-22&SeasonType=Regular%2BSeason&Type=Player"
pbp_reg_21_22_json <- read_json(pbp_reg_21_22_url)
pbp_reg_21_22 <- pbp_reg_21_22_json[["multi_row_table_data"]] %>%
bind_rows() %>%
clean_names() %>%
select(
name,
games_played,
minutes,
plus_minus,
assisted2s_pct,
assisted3s_pct,
fg3pct,
second_chance_fg3pct,
fg2pct,
efg_pct,
ts_pct,
fg3a_pct,
usage,
live_ball_turnover_pct,
at_rim_frequency,
at_rim_accuracy,
at_rim_pct_assisted,
short_mid_range_frequency,
short_mid_range_accuracy,
long_mid_range_frequency,
long_mid_range_accuracy,
corner3frequency,
corner3pct_assisted,
arc3frequency,
arc3accuracy,
arc3pct_assisted,
shooting_fouls_drawn_pct,
at_rim_assists,
three_pt_assists
)
pbp_reg_21_22[is.na(pbp_reg_21_22)] = 0
pbp_reg_22_23_url <- "https://api.pbpstats.com/get-totals/nba?Season=2022-23,2021-22&SeasonType=Regular%2BSeason&Type=Player"
pbp_reg_22_23_json <- read_json(pbp_reg_22_23_url)
pbp_reg_22_23 <- pbp_reg_22_23_json[["multi_row_table_data"]] %>%
bind_rows() %>%
clean_names() %>%
select(
name,
games_played,
minutes,
plus_minus,
assisted2s_pct,
assisted3s_pct,
fg3pct,
second_chance_fg3pct,
fg2pct,
efg_pct,
ts_pct,
fg3a_pct,
usage,
live_ball_turnover_pct,
at_rim_frequency,
at_rim_accuracy,
at_rim_pct_assisted,
short_mid_range_frequency,
short_mid_range_accuracy,
long_mid_range_frequency,
long_mid_range_accuracy,
corner3frequency,
corner3pct_assisted,
arc3frequency,
arc3accuracy,
arc3pct_assisted,
shooting_fouls_drawn_pct,
at_rim_assists,
three_pt_assists
)
pbp_reg_22_23[is.na(pbp_reg_22_23)] = 0
# 1 year data
pbp_reg_23_url <- "https://api.pbpstats.com/get-totals/nba?Season=2022-23&SeasonType=Regular%2BSeason&Type=Player"
pbp_reg_23_json <- read_json(pbp_reg_23_url)
pbp_reg_23 <- pbp_reg_23_json[["multi_row_table_data"]] %>%
bind_rows() %>%
clean_names() %>%
select(
name,
games_played,
minutes,
plus_minus,
assisted2s_pct,
assisted3s_pct,
fg3pct,
second_chance_fg3pct,
fg2pct,
efg_pct,
ts_pct,
fg3a_pct,
usage,
live_ball_turnover_pct,
at_rim_frequency,
at_rim_accuracy,
at_rim_pct_assisted,
short_mid_range_frequency,
short_mid_range_accuracy,
long_mid_range_frequency,
long_mid_range_accuracy,
corner3frequency,
corner3pct_assisted,
arc3frequency,
arc3accuracy,
arc3pct_assisted,
shooting_fouls_drawn_pct,
at_rim_assists,
three_pt_assists
)
pbp_reg_23[is.na(pbp_reg_23)] = 0
pbp_reg_22_url <- "https://api.pbpstats.com/get-totals/nba?Season=2021-22&SeasonType=Regular%2BSeason&Type=Player"
pbp_reg_22_json <- read_json(pbp_reg_22_url)
pbp_reg_22 <- pbp_reg_22_json[["multi_row_table_data"]] %>%
bind_rows() %>%
clean_names() %>%
select(
name,
games_played,
minutes,
plus_minus,
assisted2s_pct,
assisted3s_pct,
fg3pct,
second_chance_fg3pct,
fg2pct,
efg_pct,
ts_pct,
fg3a_pct,
usage,
live_ball_turnover_pct,
at_rim_frequency,
at_rim_accuracy,
at_rim_pct_assisted,
short_mid_range_frequency,
short_mid_range_accuracy,
long_mid_range_frequency,
long_mid_range_accuracy,
corner3frequency,
corner3pct_assisted,
arc3frequency,
arc3accuracy,
arc3pct_assisted,
shooting_fouls_drawn_pct,
at_rim_assists,
three_pt_assists
)
pbp_reg_22[is.na(pbp_reg_22)] = 0
pbp_reg_21_url <- "https://api.pbpstats.com/get-totals/nba?Season=2020-21&SeasonType=Regular%2BSeason&Type=Player"
pbp_reg_21_json <- read_json(pbp_reg_21_url)
pbp_reg_21 <- pbp_reg_21_json[["multi_row_table_data"]] %>%
bind_rows() %>%
clean_names() %>%
select(
name,
games_played,
minutes,
plus_minus,
assisted2s_pct,
assisted3s_pct,
fg3pct,
second_chance_fg3pct,
fg2pct,
efg_pct,
ts_pct,
fg3a_pct,
usage,
live_ball_turnover_pct,
at_rim_frequency,
at_rim_accuracy,
at_rim_pct_assisted,
short_mid_range_frequency,
short_mid_range_accuracy,
long_mid_range_frequency,
long_mid_range_accuracy,
corner3frequency,
corner3pct_assisted,
arc3frequency,
arc3accuracy,
arc3pct_assisted,
shooting_fouls_drawn_pct,
at_rim_assists,
three_pt_assists
)
pbp_reg_21[is.na(pbp_reg_21)] = 0
# 1 year adv data
url_1 <- "https://www.basketball-reference.com/leagues/NBA_2023_advanced.html"
adv_23 <- url_1 %>%
read_html() %>%
html_node("table") %>%
html_table()
adv_23 <- adv_23 %>%
select(where(function(x) any(!is.na(x)))) %>%
replace(is.na(.), 0) %>%
filter(Player != 'Player' ) %>%
filter(G > 10) %>%
filter(Tm != "TOT") %>%
rename(name = Player) %>%
select(name,
PER,
'TS%',
FTr,
'ORB%',
'AST%',
'STL%',
'TOV%',
OWS,
DWS,
OBPM,
DBPM,
BPM,
VORP)%>%
mutate_at(c('PER',
'TS%',
'FTr',
'ORB%',
'AST%',
'STL%',
'TOV%',
'OWS',
'DWS',
'OBPM',
'DBPM',
'BPM',
'VORP'), as.numeric)
url_2 <- "https://www.basketball-reference.com/leagues/NBA_2022_advanced.html"
adv_22 <- url_2 %>%
read_html() %>%
html_node("table") %>%
html_table()
adv_22 <- adv_22 %>%
select(where(function(x) any(!is.na(x)))) %>%
replace(is.na(.), 0) %>%
filter(Player != 'Player' ) %>%
filter(G > 10) %>%
filter(Tm != "TOT") %>%
rename(name = Player) %>%
select(name,
PER,
'TS%',
FTr,
'ORB%',
'AST%',
'STL%',
'TOV%',
OWS,
DWS,
OBPM,
DBPM,
BPM,
VORP)%>%
mutate_at(c('PER',
'TS%',
'FTr',
'ORB%',
'AST%',
'STL%',
'TOV%',
'OWS',
'DWS',
'OBPM',
'DBPM',
'BPM',
'VORP'), as.numeric)
url_3 <- "https://www.basketball-reference.com/leagues/NBA_2021_advanced.html"
adv_21 <- url_3 %>%
read_html() %>%
html_node("table") %>%
html_table()
adv_21 <- adv_21 %>%
select(where(function(x) any(!is.na(x)))) %>%
replace(is.na(.), 0) %>%
filter(Player != 'Player' ) %>%
filter(G > 10) %>%
filter(Tm != "TOT") %>%
rename(name = Player) %>%
select(name,
PER,
'TS%',
FTr,
'ORB%',
'AST%',
'STL%',
'TOV%',
OWS,
DWS,
OBPM,
DBPM,
BPM,
VORP) %>%
mutate_at(c('PER',
'TS%',
'FTr',
'ORB%',
'AST%',
'STL%',
'TOV%',
'OWS',
'DWS',
'OBPM',
'DBPM',
'BPM',
'VORP'), as.numeric)
url_4 <- "https://www.basketball-reference.com/leagues/NBA_2020_advanced.html"
adv_20 <- url_4 %>%
read_html() %>%
html_node("table") %>%
html_table()
adv_20 <- adv_20 %>%
select(where(function(x) any(!is.na(x)))) %>%
replace(is.na(.), 0) %>%
filter(Player != 'Player' ) %>%
filter(G > 10) %>%
filter(Tm != "TOT") %>%
rename(name = Player) %>%
select(name,
PER,
'TS%',
FTr,
'ORB%',
'AST%',
'STL%',
'TOV%',
OWS,
DWS,
OBPM,
DBPM,
BPM,
VORP) %>%
mutate_at(c('PER',
'TS%',
'FTr',
'ORB%',
'AST%',
'STL%',
'TOV%',
'OWS',
'DWS',
'OBPM',
'DBPM',
'BPM',
'VORP'), as.numeric)
adv_20_21 <- right_join(adv_20, adv_21, by = 'name') %>%
group_by(name) %>%
clean_names() %>%
mutate(
per = (per_x + per_y)/2,
ts_pct = (ts_percent_x + ts_percent_y)/2,
orb_pct = (orb_percent_x + orb_percent_y)/2,
ast_pct = (ast_percent_x + ast_percent_y)/2,
stl_pct = (stl_percent_x + stl_percent_y)/2,
tov_pct = (tov_percent_x + tov_percent_y)/2,
ows = (ows_x + ows_y),
dws = (dws_x + dws_y),
obpm = (obpm_x + obpm_y),
dbpm = (dbpm_x + dbpm_y),
bpm = (bpm_x + bpm_y),
vorp = (vorp_x + vorp_y)
) %>%
select(
name,
per,
ts_pct,
orb_pct,
ast_pct,
stl_pct,
tov_pct,
ows,
dws,
obpm,
dbpm,
bpm,
vorp
)
adv_21_22 <- right_join(adv_21, adv_22, by = 'name') %>%
group_by(name) %>%
clean_names() %>%
mutate(
per = (per_x + per_y)/2,
ts_pct = (ts_percent_x + ts_percent_y)/2,
orb_pct = (orb_percent_x + orb_percent_y)/2,
ast_pct = (ast_percent_x + ast_percent_y)/2,
stl_pct = (stl_percent_x + stl_percent_y)/2,
tov_pct = (tov_percent_x + tov_percent_y)/2,
ows = (ows_x + ows_y),
dws = (dws_x + dws_y),
obpm = (obpm_x + obpm_y),
dbpm = (dbpm_x + dbpm_y),
bpm = (bpm_x + bpm_y),
vorp = (vorp_x + vorp_y)
) %>%
select(
name,
per,
ts_pct,
orb_pct,
ast_pct,
stl_pct,
tov_pct,
ows,
dws,
obpm,
dbpm,
bpm,
vorp
)
adv_22_23 <- right_join(adv_22, adv_23, by = 'name') %>%
group_by(name) %>%
clean_names() %>%
mutate(
per = (per_x + per_y)/2,
ts_pct = (ts_percent_x + ts_percent_y)/2,
orb_pct = (orb_percent_x + orb_percent_y)/2,
ast_pct = (ast_percent_x + ast_percent_y)/2,
stl_pct = (stl_percent_x + stl_percent_y)/2,
tov_pct = (tov_percent_x + tov_percent_y)/2,
ows = (ows_x + ows_y),
dws = (dws_x + dws_y),
obpm = (obpm_x + obpm_y),
dbpm = (dbpm_x + dbpm_y),
bpm = (bpm_x + bpm_y),
vorp = (vorp_x + vorp_y)
) %>%
select(
name,
per,
ts_pct,
orb_pct,
ast_pct,
stl_pct,
tov_pct,
ows,
dws,
obpm,
dbpm,
bpm,
vorp
)
#combining data frames
play_22 <- left_join(salaries_21, pbp_play_22, by = 'name') %>%
drop_na() %>%
select(
name,
playoff_minutes_per_game
)
play_23 <- left_join(salaries_22, pbp_play_23, by = 'name') %>%
drop_na() %>%
select(
name,
playoff_minutes_per_game
)
# 2 year model
b_22 <- left_join(play_22,pbp_reg_20_21) %>%
drop_na() %>%
filter(games_played > 50) %>%
left_join(adv_20_21, by = 'name')
b_23 <- left_join(play_23,pbp_reg_21_22) %>%
drop_na() %>%
filter(games_played > 50) %>%
left_join(adv_21_22, by = 'name')
b <- rbind(b_22,b_23) %>%
mutate( playoff_roto = case_when(
playoff_minutes_per_game > 10 ~ "Yes",
TRUE ~ "No"
)
) %>%
replace(is.na(.), 0) %>%
clean_names()
b$playoff_roto <- as.factor(b$playoff_roto)
# 1 year model
a_21 <- left_join(play_22, pbp_reg_21) %>%
drop_na() %>%
filter(games_played > 15) %>%
left_join(adv_21)
a_22 <- left_join(play_23, pbp_reg_22) %>%
drop_na() %>%
filter(games_played > 15) %>%
left_join(adv_22)
a <- rbind(a_21,a_22) %>%
mutate( playoff_roto = case_when(
playoff_minutes_per_game > 10 ~ "Yes",
TRUE ~ "No"
)
) %>%
replace(is.na(.), 0) %>%
clean_names()
a$playoff_roto <- as.factor(a$playoff_roto)
set.seed(162)
# 2 year
b_split <- initial_split(b, prop = .75, strata = playoff_roto)
b_train <- b_split %>%
training()
b_train_clean <- b_train %>%
select(-c(1:2))
b_test <- b_split %>%
testing()
b_test_clean <- b_test %>%
select(-c(1:2))
# 1 year
a_split <- initial_split(a, prop = .75, strata = playoff_roto)
a_train <- a_split %>%
training()
a_train_clean <- a_train %>%
select(-c(1:2))
a_test <- a_split %>%
testing()
a_test_clean <- a_test %>%
select(-c(1:2))
# 2 year recipe
b_recipe <- recipe(playoff_roto ~ .,
data = b_train_clean) %>%
step_corr(all_numeric(), threshold = .40) %>%
step_normalize(all_numeric())
b_recipe_prep <- b_recipe %>%
prep(training = b_train_clean)
b_training_prep <- b_recipe_prep %>%
bake(new_data = NULL)
b_test_prep <- b_recipe_prep %>%
bake(new_data = b_test_clean)
# 1 year recipe
a_recipe <- recipe(playoff_roto ~ .,
data = a_train_clean) %>%
step_corr(all_numeric(), threshold = .40) %>%
step_normalize(all_numeric())
a_recipe_prep <- a_recipe %>%
prep(training = a_train_clean)
a_training_prep <- a_recipe_prep %>%
bake(new_data = NULL)
a_test_prep <- a_recipe_prep %>%
bake(new_data = a_test_clean)
# k-fold cross validation
# 1 year
a_clean <- a %>%
mutate(factor = case_when(
playoff_roto == "Yes" ~ 1,
TRUE ~ 0
)) %>%
select(-c(1:2,44))
train_control <- trainControl(method = "cv", number = 5)
set.seed(123)
cv_model <- train(factor ~.,
data = a_clean,
method = "knn",
trControl = train_control)
summary(cv_model)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## theDots 0 -none- list
## xNames 41 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 1 -none- logical
## param 0 -none- list
# 2 year
b_clean <- b %>%
mutate(factor = case_when(
playoff_roto == "Yes" ~ 1,
TRUE ~ 0
)) %>%
select(-c(1:2,42))
train_control <- trainControl(method = "cv", number = 10)
set.seed(123)
b_cv_model <- train(factor ~.,
data = b_clean,
method = "knn",
trControl = train_control)
summary(b_cv_model)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## theDots 0 -none- list
## xNames 40 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 1 -none- logical
## param 0 -none- list
# 2 year
b_training_prep1 <- b_training_prep %>%
mutate(factor = case_when(
playoff_roto == "Yes" ~ 1,
TRUE ~ 0
)) %>%
select(-playoff_roto)
b_test_prep1 <- b_test_prep %>%
mutate(factor = case_when(
playoff_roto == "Yes" ~ 1,
TRUE ~ 0
)) %>%
select(-playoff_roto)
b_test_pred_knn <- knn(
train = b_training_prep1,
test = b_test_prep1,
cl = b_training_prep1$factor,
k = 7
)
b_actual <- b_test_prep1$factor
b_cm <- table(b_actual,b_test_pred_knn)
b_cm
## b_test_pred_knn
## b_actual 0 1
## 0 10 0
## 1 1 14
b_accuracy <- sum(diag(b_cm))/length(b_actual)
sprintf("Accuracy: %.2f%%", b_accuracy*100)
## [1] "Accuracy: 96.00%"
# 1 year model
a_training_prep1 <- a_training_prep %>%
mutate(factor = case_when(
playoff_roto == "Yes" ~ 1,
TRUE ~ 0
)) %>%
select(-playoff_roto)
a_test_prep1 <- a_test_prep %>%
mutate(factor = case_when(
playoff_roto == "Yes" ~ 1,
TRUE ~ 0
)) %>%
select(-playoff_roto)
a_test_pred_knn <- knn(
train = a_training_prep1,
test = a_test_prep1,
cl = a_training_prep1$factor,
k = 9
)
a_actual <- a_test_prep1$factor
a_cm <- table(a_actual,a_test_pred_knn)
a_cm
## a_test_pred_knn
## a_actual 0 1
## 0 12 1
## 1 4 11
a_accuracy <- sum(diag(a_cm))/length(a_actual)
sprintf("Accuracy: %.2f%%", a_accuracy*100)
## [1] "Accuracy: 82.14%"
mins_24 <- right_join(pbp_reg_23, adv_23) %>%
right_join(salaries_24) %>%
filter(games_played > 15)
unique_rows_24 <- !duplicated(mins_24$name)
mins_24 <- mins_24[unique_rows_24,]
mins_24_clean <- mins_24 %>%
clean_names() %>%
select(games_played,
plus_minus,
assisted3s_pct,
live_ball_turnover_pct,
at_rim_pct_assisted,
short_mid_range_accuracy,
long_mid_range_accuracy,
stl_percent,
tov_percent,
obpm) %>%
drop_na() %>%
normalize()
mins_24_knn <- knn(
train = a_training_prep1[-11],
test = mins_24_clean,
cl = a_training_prep1$factor,
k = 9,
prob = TRUE
)
probs1 <- attributes(mins_24_knn)
mins_24_df <- cbind(mins_24$name, mins_24_knn) %>%
cbind(mins_24$games_played) %>%
cbind(probs1$prob) %>%
as.data.frame() %>%
rename(Player = V1,
Confidence = V4,
'Sample(Games)' = V3) %>%
mutate('Playoff Rotation' = case_when(
mins_24_knn == 2 ~ "Yes",
TRUE ~ "No"
)) %>%
select(
Player,
'Sample(Games)',
'Playoff Rotation',
Confidence
)
mins_24_df$Confidence <- as.numeric(mins_24_df$Confidence)
mins_24_df %>%
mutate_at(vars(Confidence), funs(round(., 3))) %>%
reactable(sortable = TRUE,
filterable = TRUE,
searchable = TRUE) %>%
add_title("1 Year KNN results")
mins_24_b <- right_join(pbp_reg_22_23, adv_22_23, by = 'name') %>%
right_join(salaries_24) %>%
filter(games_played > 50)
unique_rows_24_b <- !duplicated(mins_24_b$name)
mins_24_b <- mins_24_b[unique_rows_24_b,]
mins_24_b_clean <- mins_24_b %>%
clean_names() %>%
select(plus_minus,
at_rim_accuracy,
short_mid_range_frequency,
long_mid_range_accuracy,
corner3pct_assisted,
stl_pct) %>%
drop_na() %>%
normalize()
mins_24_knn_b <- knn(
train = b_training_prep1[-7],
test = mins_24_b_clean,
cl = b_training_prep1$factor,
k = 7,
prob = TRUE
)
probs1_b <- attributes(mins_24_knn_b)
mins_24_b_df <- cbind(mins_24_b$name, mins_24_knn_b) %>%
cbind(mins_24_b$games_played) %>%
cbind(probs1_b$prob) %>%
as.data.frame() %>%
rename(Player = V1,
Confidence = V4,
'Sample(Games)' = V3) %>%
mutate('Playoff Rotation' = case_when(
mins_24_knn_b == 2 ~ "Yes",
TRUE ~ "No"
)) %>%
select(
Player,
'Sample(Games)',
'Playoff Rotation',
Confidence
)
mins_24_b_df$Confidence <- as.numeric(mins_24_b_df$Confidence)
mins_24_b_df %>%
mutate_at(vars(Confidence), funs(round(., 3))) %>%
reactable(sortable = TRUE,
filterable = TRUE,
searchable = TRUE) %>%
add_title("2 Year KNN results")
#k-fold cross validation for svm
train_control <- trainControl(method = "cv", number = 10)
set.seed(153)
a_svm_model <- train(playoff_roto ~.,
data = a_train_clean,
method = "svmRadial",
trControl = train_control)
# 2 year
b_recipe_svm <- recipe(playoff_roto ~ .,
data = b_train_clean) %>%
step_corr(all_numeric(), threshold = .90) %>%
step_normalize(all_numeric())
b_recipe_prep_svm <- b_recipe_svm %>%
prep(training = b_train_clean)
b_training_prep_svm <- b_recipe_prep_svm %>%
bake(new_data = NULL)
b_test_prep_svm <- b_recipe_prep_svm %>%
bake(new_data = b_test_clean)
b_tune <- tune.svm( x = b_training_prep_svm[,-37],
y = b_training_prep_svm$playoff_roto,
gamma = 5 * 10 ^(-5:5),
cost = c(.01:50),
type = "C-classification",
kernal = "radial")
b_svm <- svm(playoff_roto ~ .,
data = b_training_prep,
type = "C-classification",
kernal = "radial",
cost = b_tune$best.parameters$cost,
gamma = b_tune$best.parameters$gamma,
sigma = 0.00000551727)
b_pred_test <- predict(b_svm, b_test_prep)
mean(b_pred_test == b_test_prep$playoff_roto)
## [1] 0.56
# 1 year
a_recipe_svm <- recipe(playoff_roto ~ .,
data = a_train_clean) %>%
step_corr(all_numeric(), threshold = .90) %>%
step_normalize(all_numeric())
a_recipe_prep_svm <- a_recipe_svm %>%
prep(training = a_train_clean)
a_training_prep_svm <- a_recipe_prep_svm %>%
bake(new_data = NULL)
a_test_prep_svm <- a_recipe_prep_svm %>%
bake(new_data = a_test_clean)
a_tune <- tune.svm( x = a_training_prep_svm[,-39],
y = a_training_prep_svm$playoff_roto,
gamma = 5 * 10 ^(-3:3),
cost = c(.01:50),
type = "C-classification",
kernal = "radial")
a_svm <- svm(playoff_roto ~ bpm + dbpm + games_played + at_rim_frequency + short_mid_range_accuracy + per + obpm,
data = a_training_prep_svm,
type = "C-classification",
kernal = "radial",
cost = a_tune$best.parameters$cost,
gamma = a_tune$best.parameters$gamma,
sigma = .02054)
a_pred_test <- predict(a_svm, a_test_prep_svm)
mean(a_pred_test == a_test_prep_svm$playoff_roto)
## [1] 0.6785714
#k-fold cross validation
log_model <- train(playoff_roto ~ bpm + dbpm + tov_percent + usage + orb_percent + long_mid_range_accuracy,
data = a_train_clean,
method = "glm",
trControl = train_control)
# 2 year
b_recipe_log <- recipe(playoff_roto ~ .,
data = b_train_clean) %>%
step_corr(all_numeric(), threshold = .90) %>%
step_normalize(all_numeric())
b_recipe_prep_log <- b_recipe_log %>%
prep(training = b_train_clean)
b_training_prep_log <- b_recipe_prep_log %>%
bake(new_data = NULL)
b_test_prep_log <- b_recipe_prep_log %>%
bake(new_data = b_test_clean)
b_log_model <- logistic_reg() %>%
set_engine('glm') %>%
set_mode('classification')
b_log_fit <- b_log_model %>%
fit(playoff_roto ~ .,
data = b_training_prep_log)
b_class_preds <- predict(b_log_fit,
new_data = b_test_prep_log,
type = "class")
b_prob_preds <- predict(b_log_fit,
new_data = b_test_prep_log,
type = "prob")
b_results <- b_test_prep_log %>%
select(playoff_roto) %>%
bind_cols(b_class_preds, b_prob_preds)
conf_mat(b_results, truth = playoff_roto,
estimate = .pred_class)
## Truth
## Prediction No Yes
## No 8 7
## Yes 2 8
b_accuracy <- accuracy(b_results, truth = playoff_roto,
estimate = .pred_class)
# 1 year
a_recipe_log <- recipe(playoff_roto ~ .,
data = a_train_clean) %>%
step_corr(all_numeric(), threshold = .90) %>%
step_normalize(all_numeric())
a_recipe_prep_log <- a_recipe_log %>%
prep(training = a_train_clean)
a_training_prep_log <- a_recipe_prep_log %>%
bake(new_data = NULL)
a_test_prep_log <- a_recipe_prep_log %>%
bake(new_data = a_test_clean)
a_log_model <- logistic_reg() %>%
set_engine('glm') %>%
set_mode('classification')
a_log_fit <- a_log_model %>%
fit(playoff_roto ~ bpm + dbpm + tov_percent + usage + orb_percent + long_mid_range_accuracy,
data = a_training_prep_log)
a_class_preds <- predict(a_log_fit,
new_data = a_test_prep_log,
type = "class")
a_prob_preds <- predict(a_log_fit,
new_data = a_test_prep_log,
type = "prob")
a_results <- a_test_prep_log %>%
select(playoff_roto) %>%
bind_cols(a_class_preds, a_prob_preds)
conf_mat(a_results, truth = playoff_roto,
estimate = .pred_class)
## Truth
## Prediction No Yes
## No 10 6
## Yes 3 9
a_accuracy <- accuracy(a_results, truth = playoff_roto,
estimate = .pred_class)
# 2 year model
b_recipe_nn <- recipe(playoff_roto ~ .,
data = b_train_clean) %>%
step_corr(all_numeric(), threshold = .50) %>%
step_normalize(all_numeric())
b_recipe_prep_nn <- b_recipe_nn %>%
prep(training = b_train_clean)
b_training_prep_nn <- b_recipe_prep_nn %>%
bake(new_data = NULL)
b_test_prep_nn <- b_recipe_prep_nn %>%
bake(new_data = b_test_clean)
b_net <- neuralnet(
playoff_roto == "Yes" ~ .,
data = b_training_prep_nn,
hidden = c(24,8,4),
linear.output = FALSE
)
plot(b_net, rep = "best")
b_pred <- predict(b_net, b_test_prep_nn) %>%
as.data.frame()
b_pred <- b_pred %>%
mutate( pred = case_when(
b_pred$V1 > .5 ~ "Yes",
TRUE ~ "No"
))
b_act_values <- b_test_prep_log$playoff_roto
b_pred_values <- ifelse(b_pred > .5, 1, 0)
b_acc <- mean(b_act_values == b_pred$pred)
# 1 year
a_recipe_nn <- recipe(playoff_roto ~ .,
data = a_train_clean) %>%
step_corr(all_numeric(), threshold = .90) %>%
step_normalize(all_numeric())
a_recipe_prep_nn <- a_recipe_nn %>%
prep(training = a_train_clean)
a_training_prep_nn <- a_recipe_prep_nn %>%
bake(new_data = NULL)
a_test_prep_nn <- a_recipe_prep_nn %>%
bake(new_data = a_test_clean)
a_net <- neuralnet(
playoff_roto == "Yes" ~ bpm + dbpm + games_played + at_rim_frequency + per + obpm + tov_percent + efg_pct + usage + at_rim_pct_assisted + at_rim_assists,
data = a_training_prep_nn,
hidden = c(8, 4),
linear.output = FALSE
)
plot(a_net, rep = "best")
a_pred <- predict(a_net, a_test_prep_nn) %>%
as.data.frame()
a_pred <- a_pred %>%
mutate( pred = case_when(
a_pred$V1 > .5 ~ "Yes",
TRUE ~ "No"
))
a_act_values <- a_test_prep_log$playoff_roto
a_pred_values <- ifelse(a_pred > .5, 1, 0)
a_acc <- mean(a_act_values == a_pred$pred)
Ultimately I chose to use only the KNN models because they provided a better accuracy (~80% for one year, ~90% for two year). I did debate aggregating the results of the KNN with one or multiple of the other model types but their accuracies were all sub 70% so I decided the cost in accuracy was not worth the added complexity.
Main Weaknesses of model
q <- full_join(mins_24_b_df, mins_24_df, by = 'Player') %>%
rename('2 year Sample'= 'Sample(Games).x',
'2 year Playoff Projection' = 'Playoff Rotation.x',
'2 year Confidence' = 'Confidence.x',
'1 year Sample' = 'Sample(Games).y',
'1 year Playoff Projection' = 'Playoff Rotation.y',
'1 year Confidence' = 'Confidence.y')
q <- q %>% mutate_at(c(2,4,5,7), as.numeric) %>%
replace(is.na(.), 0) %>%
mutate_at(vars(4,7), funs(round(., 3)))
q %>% reactable(theme = espn(),
searchable = TRUE,
filterable = TRUE,
sortable = TRUE) %>%
add_title('Playoff Projection Model')
p <- q %>%
filter(q$`2 year Playoff Projection` == "Yes",
q$`1 year Playoff Projection` == "Yes")
p %>%
mutate('Combined Confidence' = (p$`2 year Confidence` + p$`1 year Confidence`)/2) %>%
select(Player,
'Combined Confidence',
'2 year Sample',
'1 year Sample') %>%
reactable(theme = espn(),
sortable = TRUE,
filterable = TRUE,
searchable = TRUE) %>%
add_title("Players that received a Yes from both models")