R Packages

library(rvest)
library(tidyverse)
library(nbastatR)
library(reactable)
library(reactablefmtr)
library(plotly)
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)

Data collection and cleaning

# check pro sports transactions for updated page number and adjust number at total_pages
all_tables <- list()

# Set the base URL without the 'start' parameter
base_url <- "https://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&ILChkBx=yes&Submit=Search&start="

# Specify the total number of pages (you may need to find this manually)
total_pages <- 1586  #

# Iterate through each page and fetch data
for (page in 1:total_pages) {
  # Construct the URL for the current page
  url <- paste0(base_url, (page - 1) * 25 + 1)
  
  # Read the HTML and extract the table
  table <- read_html(url) %>%
    html_nodes("table") %>%
    html_table(fill = TRUE) %>%
    as.data.frame()
  
  # Append the table to the list
  all_tables[[page]] <- table
}

# Combine all tables into a single data frame
combined_table <- do.call(rbind, all_tables)
colnames(combined_table) <- combined_table[1, ]
combined_table <- combined_table[-1, ]
combined_table$Player <- paste(combined_table$Acquired, combined_table$Relinquished, sep = "")

Injury Report data table, 1952 to 2024 seasons

cleaner_table <- combined_table %>%
  select(Date,
         Team,
         Player,
         Notes) %>%
  rename(Action = Notes)
cleaner_table$Player <- gsub("•\\s* ", "", cleaner_table$Player)
cleaner_table <- cleaner_table[cleaner_table$Date != "Date", ]
cleaner_table <- separate(cleaner_table, Action, into = c("Action", "Injury"), sep = "with", remove = FALSE)
clean_table1 <- cleaner_table %>%
  mutate( Date = as.Date(Date)) %>%
  arrange(desc(Date)) %>%
  group_by(Player) %>%
  mutate(Days_out = lag(Date) - Date) %>%
  ungroup()
clean_table2 <- clean_table1 %>%
  mutate( body_part = case_when(
    str_detect(Injury, "foot") ~ "Foot",
    str_detect(Injury, "toe") ~ "Toe",
    str_detect(Injury, "ankle") ~ "Ankle",
    str_detect(Injury, "heel") ~ "Heel",
    str_detect(Injury, "Achilles") ~ "Achilles",
    str_detect(Injury, "calf") ~ "Calf",
    str_detect(Injury, "shin") ~ "Shin",
    str_detect(Injury, "tibia") ~ "Tibia",
    str_detect(Injury, "fibula") ~ "Fibula",
    str_detect(Injury, "knee") ~ "Knee",
    str_detect(Injury, "patella") ~ "Knee",
    str_detect(Injury, "hamstring") ~ "Hamstring",
    str_detect(Injury, "groin") ~ "Groin",
    str_detect(Injury, "quadriceps") ~ "Quadriceps",
    str_detect(Injury, "thigh") ~ "Thigh",
    str_detect(Injury, "hip") ~ "Hip",
    str_detect(Injury, "pelvis") ~ "Pelvis",
    str_detect(Injury, "tailbone") ~ "Tailbone",
    str_detect(Injury, "adductor") ~ "Adductor",
    str_detect(Injury, "rib") ~ "Rib",
    str_detect(Injury, "ribcage") ~ "Rib",
    str_detect(Injury, "abdominal") ~ "Abdominal",
    str_detect(Injury, "core") ~ "Abdominal",
    str_detect(Injury, "oblique") ~ "Oblique",
    str_detect(Injury, "glute") ~ "Glute",
    str_detect(Injury, "back") ~ "Back",
    str_detect(Injury, "spine") ~ "Back",
    str_detect(Injury, "shoulder") ~ "Shoulder",
    str_detect(Injury, "sternum") ~ "Sternum",
    str_detect(Injury, "forearm") ~ "Forearm",
    str_detect(Injury, "wrist") ~ "Wrist",
    str_detect(Injury, "hand") ~ "Hand",
    str_detect(Injury, "finger") ~ "Finger",
    str_detect(Injury, "thumb") ~ "Finger",
    str_detect(Injury, "metatarsal") ~ "Finger",
    str_detect(Injury, "elbow") ~ "Elbow",
    str_detect(Injury, "neck") ~ "Neck",
    str_detect(Injury, "facial") ~ "Facial",
    str_detect(Injury, "jaw") ~ "Jaw",
    str_detect(Injury, "nasal") ~ "Nasal",
    str_detect(Injury, "eye") ~ "Eye",
    str_detect(Injury, "dental") ~ "Dental",
    str_detect(Injury, "head") ~ "Head",
    str_detect(Injury, "lung") ~ "Lung",
    str_detect(Injury, "concussion") ~ "Concussion",
    str_detect(Injury, "illness" ) ~ "Illness",
    str_detect(Injury, "infection") ~ "Illness",
    str_detect(Injury, "virus") ~ "Illness",
    str_detect(Injury, "flu") ~ "Illness",
    str_detect(Injury, "cold") ~ "Illness",
    str_detect(Injury, "sinus") ~ "Illness",
    str_detect(Injury, "gastrointestinal") ~ "Illness",
    str_detect(Injury, "rest") ~ "Rest",
    str_detect(Injury, "COVID") ~ "Covid",
    str_detect(Injury, "COVID-19") ~ "Covid",
    str_detect(Injury, "protocalls") ~ "Covid",
    TRUE ~ "other"
  )) %>%
  mutate( body_region = 
            case_when(
              str_detect(body_part, "Foot") ~ "Leg",
              str_detect(body_part, "Ankle") ~ "Leg",
              str_detect(body_part, "Heel") ~ "Leg",
              str_detect(body_part, "Achilles") ~ "Leg",
              str_detect(body_part, "Toe") ~ "Leg",
              str_detect(body_part, "Shin") ~ "Leg",
              str_detect(body_part, "Calf") ~ "Leg",
              str_detect(body_part, "Fibula") ~ "Leg",
              str_detect(body_part, "Tibia") ~ "Leg",
              str_detect(body_part, "Knee") ~ "Leg",
              str_detect(body_part, "Patella") ~ "Leg",
              str_detect(body_part, "Thigh") ~ "Leg",
              str_detect(body_part, "Groin") ~ "Leg",
              str_detect(body_part, "Hamstring") ~ "Leg",
              str_detect(body_part, "Quadriceps") ~ "Leg",
              str_detect(body_part, "Hip") ~ "Core",
              str_detect(body_part, "Pelvis") ~ "Core",
              str_detect(body_part, "Tailbone") ~ "Core",
              str_detect(body_part, "Adductor") ~ "Core",
              str_detect(body_part, "Rib") ~ "Core",
              str_detect(body_part, "Abdominal") ~ "Core",
              str_detect(body_part, "Core") ~ "Core",
              str_detect(body_part, "Oblique") ~ "Core",
              str_detect(body_part, "Glute") ~ "Core",
              str_detect(body_part, "Back") ~ "Core",
              str_detect(body_part, "Sternum") ~ "Core",
              str_detect(body_part, "Shoulder") ~ "Arm",
              str_detect(body_part, "Forearm") ~ "Arm",
              str_detect(body_part, "Wrist") ~ "Arm",
              str_detect(body_part, "Finger") ~ "Arm",
              str_detect(body_part, "Hand") ~ "Arm",
              str_detect(body_part, "Elbow") ~ "Arm",
              str_detect(body_part, "Neck") ~ "Head",
              str_detect(body_part, "Dental") ~ "Head",
              str_detect(body_part, "Nasal") ~ "Head",
              str_detect(body_part, "Eye") ~ "Head",
              str_detect(body_part, "Head") ~ "Head",
              str_detect(body_part, "Jaw") ~ "Head",
              str_detect(body_part, "Nasal") ~ "Head",
              str_detect(body_part, "Head") ~ "Head",
              str_detect(body_part, "Illness") ~ "Illness",
              str_detect(body_part, "Covid") ~ "Illness",
              str_detect(body_part, "Concussion") ~ "Concussion",
              TRUE ~ "other"
            )) %>%
  mutate(injury_type =
           case_when(
             str_detect(Injury, "sprained") ~ "Sprain",
             str_detect(Injury, "sprain") ~ "Sprain",
             str_detect(Injury, "plantar fasciitis") ~ "Plantar Fasciitis",
             str_detect(Injury, "spasms") ~ "Spasms",
             str_detect(Injury, "hyperextended") ~ "Hyperextended",
             str_detect(Injury, "sore") ~ "Sore",
             str_detect(Injury, "bruised") ~ "Bruised",
             str_detect(Injury, "fractured") ~ "Fracture",
             str_detect(Injury, "fracture") ~ "Fracture",
             str_detect(Injury, "tendinitis") ~ "Tendinitis",
             str_detect(Injury, "impingement") ~ "Impingement",
             str_detect(Injury, "dislocated") ~ "Discoloation",
             str_detect(Injury, "impingement") ~ "Impingement",
             str_detect(Injury, "impingement") ~ "Impingement",
             str_detect(Injury, "torn") ~ "Tear",
             str_detect(Injury, "tightness") ~ "Tightness",
             str_detect(Injury, "impingement") ~ "Impingement",
             str_detect(Injury, "concussion") ~ "Concussion",
             str_detect(Injury, "inflammation") ~ "Inflammation",
             str_detect(Injury, "infection") ~ "Infection",
             str_detect(Injury, "laceration") ~ "Laceration",
             str_detect(Injury, "collapsed") ~ "Collapsed",
             str_detect(Injury, "cramps") ~ "Cramps",
             str_detect(Injury, "spasms") ~ "Spasms",
             str_detect(Injury, "bursitis") ~ "Bursitis",
             str_detect(Injury, "stress reaction") ~ "Stress Reaction",
             str_detect(Injury, "subluxation") ~ "Subluxation",
             str_detect(Injury, "swollen") ~ "Swollen",
             str_detect(Injury, "stiffness") ~ "Stiffness",
             str_detect(Injury, "tendinopathy") ~ "Tendinopathy",
             str_detect(Injury, "hernia") ~ "Hernia",
             TRUE ~ "other"
           ))

clean_table2 %>%
  filter(Action != "activated from IL") %>%
  select(Date,
         Team,
         Player,
         Injury,
         Days_out) %>%
  rename(`Days Missed` = Days_out) %>%
  reactable( filterable = TRUE,
             searchable = TRUE,
             sortable = TRUE,
             theme = espn(),
             defaultPageSize = 15) %>%
  add_subtitle("Data is converted from NBA injury reports")

Data is converted from NBA injury reports

Notes on table: * An NA value in days missed indicates that the player has not returned yet. * There are some inconsistencies with data due to human error, variance, and arbitrary languange when filling out injury reports. * Date is formatted as: year - month - day

Leaguewide chart for 22-24 seasons

clean_table_teams_22 <- clean_table2 %>%
  filter(Date > as.Date("2021-10-10")) %>%
  na.omit(Days_out) %>%
  group_by(Team) %>%
  summarize(
    days_out = sum(Days_out)/3
  )

clean_table_teams_22$days_out <- as.numeric(clean_table_teams_22$days_out)

clean_table_teams_22 %>%
  ungroup() %>%
  rename(
    `Days Missed` = days_out
  ) %>%
  arrange(desc(`Days Missed`)) %>%
  plot_ly(x = ~Team, y = ~`Days Missed`, type = "bar",
          marker = list(color = c("red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "blue", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red"))) %>%
  layout(xaxis = list(categoryorder = "total descending"),
         title = "Days players on roster missed due to injury per season",
         plot_bgcolor = "white",
         annotations = list(
           text = "2022-2024 seasons",
           x = .5 , y = 1.05,
           xanchor = "center" , yanchor = "top",
           showarrow = FALSE
         ))

League chart for 23 & 24 season chart

clean_table_teams_23 <- clean_table2 %>%
  filter(Date > as.Date("2022-10-10")) %>%
  na.omit(Days_out) %>%
  group_by(Team) %>%
  summarize(
    days_out = sum(Days_out)/2
  )

clean_table_teams_23 %>%
  ungroup() %>%
  arrange(desc(days_out)) %>%
  rename(
    `Days Missed` = days_out
  ) %>%
  plot_ly(x = ~Team, y = ~`Days Missed`, type = "bar",
          marker = list(color = c("red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "blue", "red", "red", "red", "red"))) %>%
  layout(xaxis = list(categoryorder = "total descending"),
         title = "Days players on roster missed due to injury per season",
         annotations = list(
           text = "2023 & 2024 seasons",
           x = .5 , y = 1.05,
           xanchor = "center" , yanchor = "top",
           showarrow = FALSE
         ))

League chart for 24 season chart

clean_table_teams_24 <- clean_table2 %>%
  filter(Date > as.Date("2023-10-10")) %>%
  na.omit(Days_out) %>%
  group_by(Team) %>%
  summarize(
    days_out = sum(Days_out)
  )

clean_table_teams_24$days_out <- as.numeric(clean_table_teams_24$days_out)

clean_table_teams_24 %>%
  ungroup() %>%
  arrange(desc(days_out)) %>%
  rename(
    `Days Out` = days_out
  ) %>%
  plot_ly(x = ~Team, y = ~`Days Out`, type = "bar",
          marker = list(color = c("red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "red", "blue", "red", "red", "red", "red", "red"))) %>%
  layout(xaxis = list(categoryorder = "total descending"),
         title = "Days players on roster missed due to injury",
         annotations = list(
           text = "2024 season",
           x = .5 , y = 1.05,
           xanchor = "center" , yanchor = "top",
           showarrow = FALSE
         ))

Injury by region of body

clean_table2 %>%
  filter(Date > as.Date("2010-10-10"),
         body_region != "other") %>%
  rename( `Region of body` = body_region) %>%
  ggplot(aes(x = `Region of body`, fill = "red")) +
  geom_bar() + 
  scale_fill_manual(values = "red") +
  labs(title = "Frequency of injury to region of body",
       subtitle = "Limited to last 14 seasons",
       y = str_to_title("count")) +
  theme_minimal() +
  theme(legend.position = "none")

clean_table2 %>%
  filter(Date > as.Date("2010-10-10"),
         body_region != "other") %>%
  na.omit(Days_out) %>%
  group_by(body_region) %>%
  summarize(
    `Days Missed` = sum(Days_out)
  ) %>%
  rename( `Region of body` = body_region) %>%
  ggplot( aes(x = `Region of body`, y = `Days Missed`, fill = `Region of body`)) + 
  geom_col() +
  scale_fill_manual( values = c("red", "red", "red", "red", "red", "red")) +
  theme_minimal() +
  labs(title = "Days lost due to injury to region of body",
       subtitle = "Limited to last 14 seasons") +
  theme(legend.position = "none")

clean_table2 %>%
  filter(Date > as.Date("2010-10-10"),
         body_region != "other") %>%
  na.omit(Days_out) %>%
  group_by(body_region) %>%
  summarize(
    `Days Missed` = sum(Days_out)
  ) %>%
  rename( `Region of body` = body_region) %>%
  reactable() %>%
  add_title("Table of days lost due to injury to region of the body")

Table of days lost due to injury to region of the body

Injury by body part

body_part_freq <- clean_table2 %>%
  filter(body_part != "other") %>%
  count(body_part, sort = TRUE)

# Select the top ten most frequent values
top_ten_body_part <- body_part_freq %>%
  top_n(10)

# Filter the data frame to include only the top ten most frequent values
clean_table_top_ten <- clean_table2 %>%
  filter(body_part %in% top_ten_body_part$body_part) %>%
  filter(Date > as.Date("2010-10-10"))

# Create the ggplot with the filtered data
clean_table_top_ten %>%
  rename(`Body Part` = body_part) %>%
  ggplot(aes(x = `Body Part`, fill = "red")) +
  geom_bar() +
  scale_fill_manual(values = "red") +
  labs(title = "Frequency of injury to body part",
       subtitle = "Limited to last 14 seasons",
       y = str_to_title("count")) +
  theme_minimal() +
  theme(legend.position = "none")

clean_table2 %>%
  filter(Date > as.Date("2010-10-10"),
         body_part != "other") %>%
  na.omit(Days_out) %>%
  group_by(body_part) %>%
  summarize(
    `Days Missed` = sum(Days_out)
  ) %>%
  arrange(desc(`Days Missed`)) %>%
  head(n = 10) %>%
  rename(`Body Part` = body_part) %>%
  ggplot( aes(x = `Body Part`, y = `Days Missed`, fill = `Body Part`)) + 
  geom_col() +
  scale_fill_manual( values = c("red", "red", "red", "red", "red", "red" , "red" , "red" , "red" , "red")) +
  labs(title = "Days lost due to injury to body part",
       subtitle = "Limited to last 14 seasons") + 
  theme_minimal() +
  theme(legend.position = "none")

clean_table2 %>%
  filter(Date > as.Date("2010-10-10"),
         body_part != "other") %>%
  na.omit(Days_out) %>%
  group_by(body_part) %>%
  summarize(
    `Days Missed` = sum(Days_out)
  ) %>%
  arrange(desc(`Days Missed`)) %>%
  rename(`Body Part` = body_part) %>%
  head(n = 10) %>%
  reactable() %>%
  add_title("Ten body parts that cause the most days lost from last 14 seasons")

Ten body parts that cause the most days lost from last 14 seasons

Type of injury

injury_freq <- clean_table2 %>%
  filter(injury_type != "other") %>%
  count(injury_type, sort = TRUE)

# Select the top ten most frequent values
top_ten_injury <- injury_freq %>%
  top_n(10)

# Filter the data frame to include only the top ten most frequent values
clean_table_injury <- clean_table2 %>%
  filter(injury_type %in% top_ten_injury$injury_type) %>%
  filter(Date > as.Date("2010-10-10"))

# Create the ggplot with the filtered data
clean_table_injury %>%
  rename(`Injury Type` = injury_type) %>%
  ggplot(aes(x = `Injury Type`, fill = "red")) +
  geom_bar() +
  scale_fill_manual(values = "red") +
  labs(title = "Frequency of type of injury",
       subtitle = "Limited to last 14 seasons",
       y = str_to_title("count")) +
  theme_minimal() +
  theme(legend.position = "none")

clean_table2 %>%
  filter(Date > as.Date("2010-08-10"),
         injury_type != "other") %>%
  na.omit(Days_out) %>%
  group_by(injury_type) %>%
  summarize(
    `Days Missed` = sum(Days_out)
  ) %>%
  arrange(desc(`Days Missed`)) %>%
  rename(`Injury Type` = injury_type) %>%
  head(n = 10) %>%
  ggplot( aes(x = `Injury Type`, y = `Days Missed`, fill = `Injury Type`)) + 
  geom_col() +
  scale_fill_manual( values = c("red", "red", "red", "red", "red", "red" , "red" , "red" , "red" , "red")) +
  labs(title = "Days lost due to injury type",
       subtitle = "limited to last 14 seasons") + 
  theme_minimal() +
  theme(legend.position = "none")

clean_table2 %>%
  filter(Date > as.Date("2010-10-10"),
         injury_type != "other") %>%
  na.omit(Days_out) %>%
  group_by(injury_type) %>%
  summarize(
    `Days Missed` = sum(Days_out)
  ) %>%
  arrange(desc(`Days Missed`)) %>%
  rename(`Injury Type` = injury_type) %>%
  head(n = 10) %>%
  reactable() %>%
  add_title("Ten types of injuries that caused the most missed days in last 14 seasons")

Ten types of injuries that caused the most missed days in last 14 seasons