R Packages
library(rvest)
library(tidyverse)
library(nbastatR)
library(reactable)
library(reactablefmtr)
library(plotly)
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
Data collection and cleaning
# check pro sports transactions for updated page number and adjust number at total_pages
all_tables <- list()
# Set the base URL without the 'start' parameter
base_url <- "https://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&ILChkBx=yes&Submit=Search&start="
# Specify the total number of pages (you may need to find this manually)
total_pages <- 1586 #
# Iterate through each page and fetch data
for (page in 1:total_pages) {
# Construct the URL for the current page
url <- paste0(base_url, (page - 1) * 25 + 1)
# Read the HTML and extract the table
table <- read_html(url) %>%
html_nodes("table") %>%
html_table(fill = TRUE) %>%
as.data.frame()
# Append the table to the list
all_tables[[page]] <- table
}
# Combine all tables into a single data frame
combined_table <- do.call(rbind, all_tables)
colnames(combined_table) <- combined_table[1, ]
combined_table <- combined_table[-1, ]
combined_table$Player <- paste(combined_table$Acquired, combined_table$Relinquished, sep = "")
cleaner_table <- combined_table %>%
select(Date,
Team,
Player,
Notes) %>%
rename(Action = Notes)
cleaner_table$Player <- gsub("•\\s* ", "", cleaner_table$Player)
cleaner_table <- cleaner_table[cleaner_table$Date != "Date", ]
cleaner_table <- separate(cleaner_table, Action, into = c("Action", "Injury"), sep = "with", remove = FALSE)
clean_table1 <- cleaner_table %>%
mutate( Date = as.Date(Date)) %>%
arrange(desc(Date)) %>%
group_by(Player) %>%
mutate(Days_out = lag(Date) - Date) %>%
ungroup()
clean_table2 <- clean_table1 %>%
mutate( body_part = case_when(
str_detect(Injury, "foot") ~ "Foot",
str_detect(Injury, "toe") ~ "Toe",
str_detect(Injury, "ankle") ~ "Ankle",
str_detect(Injury, "heel") ~ "Heel",
str_detect(Injury, "Achilles") ~ "Achilles",
str_detect(Injury, "calf") ~ "Calf",
str_detect(Injury, "shin") ~ "Shin",
str_detect(Injury, "tibia") ~ "Tibia",
str_detect(Injury, "fibula") ~ "Fibula",
str_detect(Injury, "knee") ~ "Knee",
str_detect(Injury, "patella") ~ "Knee",
str_detect(Injury, "hamstring") ~ "Hamstring",
str_detect(Injury, "groin") ~ "Groin",
str_detect(Injury, "quadriceps") ~ "Quadriceps",
str_detect(Injury, "thigh") ~ "Thigh",
str_detect(Injury, "hip") ~ "Hip",
str_detect(Injury, "pelvis") ~ "Pelvis",
str_detect(Injury, "tailbone") ~ "Tailbone",
str_detect(Injury, "adductor") ~ "Adductor",
str_detect(Injury, "rib") ~ "Rib",
str_detect(Injury, "ribcage") ~ "Rib",
str_detect(Injury, "abdominal") ~ "Abdominal",
str_detect(Injury, "core") ~ "Abdominal",
str_detect(Injury, "oblique") ~ "Oblique",
str_detect(Injury, "glute") ~ "Glute",
str_detect(Injury, "back") ~ "Back",
str_detect(Injury, "spine") ~ "Back",
str_detect(Injury, "shoulder") ~ "Shoulder",
str_detect(Injury, "sternum") ~ "Sternum",
str_detect(Injury, "forearm") ~ "Forearm",
str_detect(Injury, "wrist") ~ "Wrist",
str_detect(Injury, "hand") ~ "Hand",
str_detect(Injury, "finger") ~ "Finger",
str_detect(Injury, "thumb") ~ "Finger",
str_detect(Injury, "metatarsal") ~ "Finger",
str_detect(Injury, "elbow") ~ "Elbow",
str_detect(Injury, "neck") ~ "Neck",
str_detect(Injury, "facial") ~ "Facial",
str_detect(Injury, "jaw") ~ "Jaw",
str_detect(Injury, "nasal") ~ "Nasal",
str_detect(Injury, "eye") ~ "Eye",
str_detect(Injury, "dental") ~ "Dental",
str_detect(Injury, "head") ~ "Head",
str_detect(Injury, "lung") ~ "Lung",
str_detect(Injury, "concussion") ~ "Concussion",
str_detect(Injury, "illness" ) ~ "Illness",
str_detect(Injury, "infection") ~ "Illness",
str_detect(Injury, "virus") ~ "Illness",
str_detect(Injury, "flu") ~ "Illness",
str_detect(Injury, "cold") ~ "Illness",
str_detect(Injury, "sinus") ~ "Illness",
str_detect(Injury, "gastrointestinal") ~ "Illness",
str_detect(Injury, "rest") ~ "Rest",
str_detect(Injury, "COVID") ~ "Covid",
str_detect(Injury, "COVID-19") ~ "Covid",
str_detect(Injury, "protocalls") ~ "Covid",
TRUE ~ "other"
)) %>%
mutate( body_region =
case_when(
str_detect(body_part, "Foot") ~ "Leg",
str_detect(body_part, "Ankle") ~ "Leg",
str_detect(body_part, "Heel") ~ "Leg",
str_detect(body_part, "Achilles") ~ "Leg",
str_detect(body_part, "Toe") ~ "Leg",
str_detect(body_part, "Shin") ~ "Leg",
str_detect(body_part, "Calf") ~ "Leg",
str_detect(body_part, "Fibula") ~ "Leg",
str_detect(body_part, "Tibia") ~ "Leg",
str_detect(body_part, "Knee") ~ "Leg",
str_detect(body_part, "Patella") ~ "Leg",
str_detect(body_part, "Thigh") ~ "Leg",
str_detect(body_part, "Groin") ~ "Leg",
str_detect(body_part, "Hamstring") ~ "Leg",
str_detect(body_part, "Quadriceps") ~ "Leg",
str_detect(body_part, "Hip") ~ "Core",
str_detect(body_part, "Pelvis") ~ "Core",
str_detect(body_part, "Tailbone") ~ "Core",
str_detect(body_part, "Adductor") ~ "Core",
str_detect(body_part, "Rib") ~ "Core",
str_detect(body_part, "Abdominal") ~ "Core",
str_detect(body_part, "Core") ~ "Core",
str_detect(body_part, "Oblique") ~ "Core",
str_detect(body_part, "Glute") ~ "Core",
str_detect(body_part, "Back") ~ "Core",
str_detect(body_part, "Sternum") ~ "Core",
str_detect(body_part, "Shoulder") ~ "Arm",
str_detect(body_part, "Forearm") ~ "Arm",
str_detect(body_part, "Wrist") ~ "Arm",
str_detect(body_part, "Finger") ~ "Arm",
str_detect(body_part, "Hand") ~ "Arm",
str_detect(body_part, "Elbow") ~ "Arm",
str_detect(body_part, "Neck") ~ "Head",
str_detect(body_part, "Dental") ~ "Head",
str_detect(body_part, "Nasal") ~ "Head",
str_detect(body_part, "Eye") ~ "Head",
str_detect(body_part, "Head") ~ "Head",
str_detect(body_part, "Jaw") ~ "Head",
str_detect(body_part, "Nasal") ~ "Head",
str_detect(body_part, "Head") ~ "Head",
str_detect(body_part, "Illness") ~ "Illness",
str_detect(body_part, "Covid") ~ "Illness",
str_detect(body_part, "Concussion") ~ "Concussion",
TRUE ~ "other"
)) %>%
mutate(injury_type =
case_when(
str_detect(Injury, "sprained") ~ "Sprain",
str_detect(Injury, "sprain") ~ "Sprain",
str_detect(Injury, "plantar fasciitis") ~ "Plantar Fasciitis",
str_detect(Injury, "spasms") ~ "Spasms",
str_detect(Injury, "hyperextended") ~ "Hyperextended",
str_detect(Injury, "sore") ~ "Sore",
str_detect(Injury, "bruised") ~ "Bruised",
str_detect(Injury, "fractured") ~ "Fracture",
str_detect(Injury, "fracture") ~ "Fracture",
str_detect(Injury, "tendinitis") ~ "Tendinitis",
str_detect(Injury, "impingement") ~ "Impingement",
str_detect(Injury, "dislocated") ~ "Discoloation",
str_detect(Injury, "impingement") ~ "Impingement",
str_detect(Injury, "impingement") ~ "Impingement",
str_detect(Injury, "torn") ~ "Tear",
str_detect(Injury, "tightness") ~ "Tightness",
str_detect(Injury, "impingement") ~ "Impingement",
str_detect(Injury, "concussion") ~ "Concussion",
str_detect(Injury, "inflammation") ~ "Inflammation",
str_detect(Injury, "infection") ~ "Infection",
str_detect(Injury, "laceration") ~ "Laceration",
str_detect(Injury, "collapsed") ~ "Collapsed",
str_detect(Injury, "cramps") ~ "Cramps",
str_detect(Injury, "spasms") ~ "Spasms",
str_detect(Injury, "bursitis") ~ "Bursitis",
str_detect(Injury, "stress reaction") ~ "Stress Reaction",
str_detect(Injury, "subluxation") ~ "Subluxation",
str_detect(Injury, "swollen") ~ "Swollen",
str_detect(Injury, "stiffness") ~ "Stiffness",
str_detect(Injury, "tendinopathy") ~ "Tendinopathy",
str_detect(Injury, "hernia") ~ "Hernia",
TRUE ~ "other"
))
clean_table2 %>%
filter(Action != "activated from IL") %>%
select(Date,
Team,
Player,
Injury,
Days_out) %>%
rename(`Days Missed` = Days_out) %>%
reactable( filterable = TRUE,
searchable = TRUE,
sortable = TRUE,
theme = espn(),
defaultPageSize = 15) %>%
add_subtitle("Data is converted from NBA injury reports")
Notes on table: * An NA value in days missed indicates that the player has not returned yet. * There are some inconsistencies with data due to human error, variance, and arbitrary languange when filling out injury reports. * Date is formatted as: year - month - day
clean_table_teams_22 <- clean_table2 %>%
filter(Date > as.Date("2021-10-10")) %>%
na.omit(Days_out) %>%
group_by(Team) %>%
summarize(
days_out = sum(Days_out)/3
)
clean_table_teams_22$days_out <- as.numeric(clean_table_teams_22$days_out)
clean_table_teams_22 %>%
ungroup() %>%
rename(
`Days Missed` = days_out
) %>%
arrange(desc(`Days Missed`)) %>%
plot_ly(x = ~Team, y = ~`Days Missed`, type = "bar",
marker = list(color = c("red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "blue", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red"))) %>%
layout(xaxis = list(categoryorder = "total descending"),
title = "Days players on roster missed due to injury per season",
plot_bgcolor = "white",
annotations = list(
text = "2022-2024 seasons",
x = .5 , y = 1.05,
xanchor = "center" , yanchor = "top",
showarrow = FALSE
))
clean_table_teams_23 <- clean_table2 %>%
filter(Date > as.Date("2022-10-10")) %>%
na.omit(Days_out) %>%
group_by(Team) %>%
summarize(
days_out = sum(Days_out)/2
)
clean_table_teams_23 %>%
ungroup() %>%
arrange(desc(days_out)) %>%
rename(
`Days Missed` = days_out
) %>%
plot_ly(x = ~Team, y = ~`Days Missed`, type = "bar",
marker = list(color = c("red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "blue", "red", "red", "red", "red"))) %>%
layout(xaxis = list(categoryorder = "total descending"),
title = "Days players on roster missed due to injury per season",
annotations = list(
text = "2023 & 2024 seasons",
x = .5 , y = 1.05,
xanchor = "center" , yanchor = "top",
showarrow = FALSE
))
clean_table_teams_24 <- clean_table2 %>%
filter(Date > as.Date("2023-10-10")) %>%
na.omit(Days_out) %>%
group_by(Team) %>%
summarize(
days_out = sum(Days_out)
)
clean_table_teams_24$days_out <- as.numeric(clean_table_teams_24$days_out)
clean_table_teams_24 %>%
ungroup() %>%
arrange(desc(days_out)) %>%
rename(
`Days Out` = days_out
) %>%
plot_ly(x = ~Team, y = ~`Days Out`, type = "bar",
marker = list(color = c("red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "blue", "red", "red", "red", "red", "red"))) %>%
layout(xaxis = list(categoryorder = "total descending"),
title = "Days players on roster missed due to injury",
annotations = list(
text = "2024 season",
x = .5 , y = 1.05,
xanchor = "center" , yanchor = "top",
showarrow = FALSE
))
clean_table2 %>%
filter(Date > as.Date("2010-10-10"),
body_region != "other") %>%
rename( `Region of body` = body_region) %>%
ggplot(aes(x = `Region of body`, fill = "red")) +
geom_bar() +
scale_fill_manual(values = "red") +
labs(title = "Frequency of injury to region of body",
subtitle = "Limited to last 14 seasons",
y = str_to_title("count")) +
theme_minimal() +
theme(legend.position = "none")
clean_table2 %>%
filter(Date > as.Date("2010-10-10"),
body_region != "other") %>%
na.omit(Days_out) %>%
group_by(body_region) %>%
summarize(
`Days Missed` = sum(Days_out)
) %>%
rename( `Region of body` = body_region) %>%
ggplot( aes(x = `Region of body`, y = `Days Missed`, fill = `Region of body`)) +
geom_col() +
scale_fill_manual( values = c("red", "red", "red", "red", "red", "red")) +
theme_minimal() +
labs(title = "Days lost due to injury to region of body",
subtitle = "Limited to last 14 seasons") +
theme(legend.position = "none")
clean_table2 %>%
filter(Date > as.Date("2010-10-10"),
body_region != "other") %>%
na.omit(Days_out) %>%
group_by(body_region) %>%
summarize(
`Days Missed` = sum(Days_out)
) %>%
rename( `Region of body` = body_region) %>%
reactable() %>%
add_title("Table of days lost due to injury to region of the body")
body_part_freq <- clean_table2 %>%
filter(body_part != "other") %>%
count(body_part, sort = TRUE)
# Select the top ten most frequent values
top_ten_body_part <- body_part_freq %>%
top_n(10)
# Filter the data frame to include only the top ten most frequent values
clean_table_top_ten <- clean_table2 %>%
filter(body_part %in% top_ten_body_part$body_part) %>%
filter(Date > as.Date("2010-10-10"))
# Create the ggplot with the filtered data
clean_table_top_ten %>%
rename(`Body Part` = body_part) %>%
ggplot(aes(x = `Body Part`, fill = "red")) +
geom_bar() +
scale_fill_manual(values = "red") +
labs(title = "Frequency of injury to body part",
subtitle = "Limited to last 14 seasons",
y = str_to_title("count")) +
theme_minimal() +
theme(legend.position = "none")
clean_table2 %>%
filter(Date > as.Date("2010-10-10"),
body_part != "other") %>%
na.omit(Days_out) %>%
group_by(body_part) %>%
summarize(
`Days Missed` = sum(Days_out)
) %>%
arrange(desc(`Days Missed`)) %>%
head(n = 10) %>%
rename(`Body Part` = body_part) %>%
ggplot( aes(x = `Body Part`, y = `Days Missed`, fill = `Body Part`)) +
geom_col() +
scale_fill_manual( values = c("red", "red", "red", "red", "red", "red" , "red" , "red" , "red" , "red")) +
labs(title = "Days lost due to injury to body part",
subtitle = "Limited to last 14 seasons") +
theme_minimal() +
theme(legend.position = "none")
clean_table2 %>%
filter(Date > as.Date("2010-10-10"),
body_part != "other") %>%
na.omit(Days_out) %>%
group_by(body_part) %>%
summarize(
`Days Missed` = sum(Days_out)
) %>%
arrange(desc(`Days Missed`)) %>%
rename(`Body Part` = body_part) %>%
head(n = 10) %>%
reactable() %>%
add_title("Ten body parts that cause the most days lost from last 14 seasons")
injury_freq <- clean_table2 %>%
filter(injury_type != "other") %>%
count(injury_type, sort = TRUE)
# Select the top ten most frequent values
top_ten_injury <- injury_freq %>%
top_n(10)
# Filter the data frame to include only the top ten most frequent values
clean_table_injury <- clean_table2 %>%
filter(injury_type %in% top_ten_injury$injury_type) %>%
filter(Date > as.Date("2010-10-10"))
# Create the ggplot with the filtered data
clean_table_injury %>%
rename(`Injury Type` = injury_type) %>%
ggplot(aes(x = `Injury Type`, fill = "red")) +
geom_bar() +
scale_fill_manual(values = "red") +
labs(title = "Frequency of type of injury",
subtitle = "Limited to last 14 seasons",
y = str_to_title("count")) +
theme_minimal() +
theme(legend.position = "none")
clean_table2 %>%
filter(Date > as.Date("2010-08-10"),
injury_type != "other") %>%
na.omit(Days_out) %>%
group_by(injury_type) %>%
summarize(
`Days Missed` = sum(Days_out)
) %>%
arrange(desc(`Days Missed`)) %>%
rename(`Injury Type` = injury_type) %>%
head(n = 10) %>%
ggplot( aes(x = `Injury Type`, y = `Days Missed`, fill = `Injury Type`)) +
geom_col() +
scale_fill_manual( values = c("red", "red", "red", "red", "red", "red" , "red" , "red" , "red" , "red")) +
labs(title = "Days lost due to injury type",
subtitle = "limited to last 14 seasons") +
theme_minimal() +
theme(legend.position = "none")
clean_table2 %>%
filter(Date > as.Date("2010-10-10"),
injury_type != "other") %>%
na.omit(Days_out) %>%
group_by(injury_type) %>%
summarize(
`Days Missed` = sum(Days_out)
) %>%
arrange(desc(`Days Missed`)) %>%
rename(`Injury Type` = injury_type) %>%
head(n = 10) %>%
reactable() %>%
add_title("Ten types of injuries that caused the most missed days in last 14 seasons")