library(tidyverse)
library(dplyr)
library(rvest)
library(janitor)
library(plotly)
library(crosstalk)
library(ggplot2)
library(reactable)
library(rpubs)
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
bball_url <- "https://www.basketball-reference.com/leagues/NBA_2022_advanced.html"
bballref <- bball_url %>%
read_html() %>%
html_elements("table") %>%
html_table() %>%
.[[1]] %>%
clean_names()
bballref[, c(6:29)] <- sapply(bballref[, c(6:29)], as.numeric )
bballref <- bballref %>%
filter( g > 10,
tm != "TOT")
#age analysis
creation <- bballref %>%
mutate(obpm_score = rank(obpm),
usg_score = rank(usg_percent),
creation_score = usg_score + obpm_score ) %>%
filter( player != "Player") %>%
arrange(-creation_score) %>%
select( player, tm, age, creation_score, usg_score, obpm_score)
creation$age <- as.numeric(creation$age)
avg_age <- mean(creation$age)
cq <- quantile(creation$creation_score, probs = c(.20, .40, .60, .80))
creation <- creation %>%
mutate(creation_level = case_when(
creation_score > 794 ~ 'A',
creation_score > 596 ~ 'B',
creation_score > 440 ~ 'C',
creation_score > 292 ~ 'D',
TRUE ~'F'
))
age <- creation %>%
mutate( old_young = case_when(
age > 26 ~ 'old',
TRUE ~ 'young'
)) %>%
filter( creation_level == "A")
plot_ly( age , x = ~obpm_score, y = ~usg_score, color = ~old_young,
hoverinfo = "text",
text = ~paste(" ", player,
"<br>", " ", age,
"<br>", " ", tm)) %>%
add_markers() %>%
layout( title = "Distribution by Age of 'A' level creators")
table(age$old_young)
##
## old young
## 54 54
top_age <- age %>%
head(n = 20)
plot_ly( top_age, x = ~obpm_score, y = ~usg_score, color = ~old_young,
hoverinfo = "text",
text = ~paste(" ", player,
"<br>", " ", age,
"<br>", " ", tm)) %>%
add_markers() %>%
layout( title = "Top 20 Creators")
table(top_age$old_young)
##
## old young
## 9 11
12 of 20 top creators are still on the team that drafted them, but only 4 of 9 of “old” players are still with the team that drafted them.
top_age %>%
filter(creation_level == "A") %>%
plot_ly( x = ~obpm_score, y = ~usg_score, color = ~tm,
hoverinfo = "text",
text = ~paste(" ", player,
"<br>", " ", age,
"<br>", " ", tm)) %>%
add_markers()
table(top_age$tm)
##
## ATL BOS BRK CHI DAL DEN GSW LAL MEM MIL MIN NOP OKC PHI PHO POR UTA WAS
## 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
There is an even distribution of the top creators, with the Bulls and Nets being only teams with two and no team having two of the top young creators.
creation <- bballref %>%
mutate(obpm_score = rank(obpm),
usg_score = rank(usg_percent),
creation_score = usg_score + obpm_score ) %>%
filter( player != "Player") %>%
arrange(-creation_score) %>%
select( player, tm, age, creation_score, usg_score, obpm_score)
creation$age <- as.numeric(creation$age)
avg_age <- mean(creation$age)
cq <- quantile(creation$creation_score, probs = c(.20, .40, .60, .80))
creation <- creation %>%
mutate(creation_level = case_when(
creation_score > 794 ~ 'A',
creation_score > 596 ~ 'B',
creation_score > 440 ~ 'C',
creation_score > 292 ~ 'D',
TRUE ~'F'
))
g2 <- ggplot(creation, aes( x = player, y = creation_score, color = creation_level))+
geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
labs(title = "Creation Distribution", subtitle = "Important to note clustering ")
ggplotly(g2)
g2+facet_wrap(. ~tm)
Fairly even distribution throughout but clustering in the middle supports the common sentiment held throughout the league but maybe a more extreme version. People often say 10 percent of the league would be good anywhere, 10 percent would struggle anywhere and for everyone else performance is dependent upon situation. But this analysis suggests that the splits are closer to 20% than 10%.
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rpubs_0.2.2 reactable_0.3.0 crosstalk_1.2.0 plotly_4.10.0
## [5] janitor_2.1.0 rvest_1.0.2 forcats_0.5.1 stringr_1.4.0
## [9] dplyr_1.0.7 purrr_0.3.4 readr_2.0.1 tidyr_1.2.0
## [13] tibble_3.1.4 ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.7 lubridate_1.7.10 assertthat_0.2.1 digest_0.6.27
## [5] utf8_1.2.2 R6_2.5.1 cellranger_1.1.0 backports_1.2.1
## [9] reprex_2.0.0 evaluate_0.14 highr_0.9 httr_1.4.2
## [13] pillar_1.6.2 rlang_0.4.11 curl_4.3.2 lazyeval_0.2.2
## [17] readxl_1.3.1 rstudioapi_0.13 data.table_1.14.0 jquerylib_0.1.4
## [21] rmarkdown_2.14 labeling_0.4.2 selectr_0.4-2 htmlwidgets_1.5.4
## [25] munsell_0.5.0 broom_0.7.8 compiler_4.1.0 modelr_0.1.8
## [29] xfun_0.31 pkgconfig_2.0.3 htmltools_0.5.2 tidyselect_1.1.1
## [33] viridisLite_0.4.0 fansi_0.5.0 crayon_1.4.1 tzdb_0.1.2
## [37] dbplyr_2.1.1 withr_2.4.2 grid_4.1.0 jsonlite_1.7.2
## [41] gtable_0.3.0 lifecycle_1.0.0 DBI_1.1.1 magrittr_2.0.1
## [45] scales_1.1.1 cli_3.0.1 stringi_1.7.4 farver_2.1.0
## [49] fs_1.5.0 snakecase_0.11.0 xml2_1.3.2 bslib_0.3.1
## [53] ellipsis_0.3.2 generics_0.1.0 vctrs_0.3.8 RColorBrewer_1.1-2
## [57] tools_4.1.0 glue_1.4.2 hms_1.1.0 fastmap_1.1.0
## [61] yaml_2.2.1 colorspace_2.0-2 knitr_1.33 haven_2.4.1
## [65] sass_0.4.0