Load Required Libraries

library(tidyverse)
library(ggplot2)
library(dplyr)
library(knitr)

Load and Clean Data

# Read using read.fwf with proper column widths
# This is the most reliable way for this specific format

# Define column specifications
col_spec <- fwf_widths(
  c(3, 15, 8, 4, 4, 3, 3, 5, 4, 5, 5, 3, 4, 5, 3, 4, 5, 6, 3, 4, 5, 4, 4, 4, 4, 4, 4, 4, 3, 5),
  col_names = c("Rk", "Player", "Pos", "Age", "Tm", "G", "GS", "MP", 
                "FG", "FGA", "FG_pct", "X3P", "X3PA", "X3P_pct",
                "X2P", "X2PA", "X2P_pct", "eFG_pct", "FT", "FTA", "FT_pct",
                "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS")
)

# Read the data
nba_data <- read_fwf('2021-2022-NBA-Player-Stats-Playoffs.txt', 
                     col_positions = col_spec,
                     skip = 1)  # Skip header row

# Clean up - trim whitespace
nba_data <- nba_data %>%
  mutate(across(where(is.character), str_trim))

# Convert numeric columns
numeric_cols <- c("Rk", "Age", "G", "GS", "MP", "FG", "FGA", "FG_pct",
                  "X3P", "X3PA", "X3P_pct", "X2P", "X2PA", "X2P_pct",
                  "eFG_pct", "FT", "FTA", "FT_pct", "ORB", "DRB",
                  "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS")

for (col in numeric_cols) {
  nba_data[[col]] <- as.numeric(nba_data[[col]])
}

# Remove rows with missing essential data
nba_clean <- nba_data %>%
  filter(!is.na(PTS), !is.na(MP), !is.na(G), G > 0, MP > 0, !is.na(Player))

cat("Data loaded successfully!\n")
## Data loaded successfully!
cat("Total players:", nrow(nba_clean), "\n")
## Total players: 10
cat("\nFirst few players:\n")
## 
## First few players:
head(nba_clean %>% select(Player, Tm, G, MP, PTS, TRB, AST), 10) %>% kable()
Player Tm G MP PTS TRB AST
Bones Hyland PG NA 5 17.4 0.2 NA NA
CJ McCollum SG 6 6 8.0 0.8 NA 7.0
Ja Morant 9 9 6.0 0.8 7 1.4
Monte Morris PG 5 5 2.0 0.4 5 0.2
Otto Porter Jr. 9 3 5.0 0.8 8 0.6
Bobby Portis C 12 5 8.0 0.8 3 2.1
Jayson Tatum SF 24 24 8.0 0.2 NA 1.0
Klay ThompsonSG 2 22 7.0 0.3 7 0.3
Fred VanVleet P 4 4 4.0 0.3 3 0.5
Nikola Vu?evi? 5 5 2.0 0.2 NA 2.8

Summary Statistics

summary_table <- nba_clean %>%
  summarise(
    Total_Players = n(),
    Avg_Points = round(mean(PTS, na.rm = TRUE), 2),
    Avg_Minutes = round(mean(MP, na.rm = TRUE), 2),
    Avg_Rebounds = round(mean(TRB, na.rm = TRUE), 2),
    Avg_Assists = round(mean(AST, na.rm = TRUE), 2)
  )

kable(summary_table, caption = "Overall Statistics Summary")
Overall Statistics Summary
Total_Players Avg_Points Avg_Minutes Avg_Rebounds Avg_Assists
10 0.48 6.74 5.5 1.77

Plot 1: Distribution of Points Per Game

ggplot(nba_clean, aes(x = PTS)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Points Per Game in 2021-2022 NBA Playoffs",
       x = "Points Per Game",
       y = "Frequency") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
        axis.title = element_text(size = 12))

cat("\nPoints Per Game Summary:\n")
## 
## Points Per Game Summary:
summary(nba_clean$PTS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.200   0.225   0.350   0.480   0.800   0.800

Plot 2: Minutes vs Points

ggplot(nba_clean, aes(x = MP, y = PTS)) +
  geom_point(alpha = 0.6, size = 3, color = "coral") +
  geom_smooth(method = "lm", color = "red", linetype = "dashed", se = TRUE, alpha = 0.2) +
  labs(title = "Relationship Between Minutes Played and Points Scored",
       x = "Minutes Per Game",
       y = "Points Per Game") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
        axis.title = element_text(size = 12))

correlation <- cor(nba_clean$MP, nba_clean$PTS, use = "complete.obs")
cat("\nCorrelation:", round(correlation, 3), "\n")
## 
## Correlation: -0.105
model <- lm(PTS ~ MP, data = nba_clean)
cat("R-squared:", round(summary(model)$r.squared, 3), "\n")
## R-squared: 0.011

Plot 3: Top 10 Scorers

top_scorers <- nba_clean %>%
  arrange(desc(PTS)) %>%
  head(10)

ggplot(top_scorers, aes(x = PTS, y = reorder(Player, PTS))) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  geom_text(aes(label = round(PTS, 1)), hjust = -0.2, size = 3.5) +
  labs(title = "Top 10 Scorers - 2021-2022 NBA Playoffs",
       x = "Points Per Game",
       y = "Player") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14)) +
  xlim(0, max(top_scorers$PTS) * 1.15)

cat("\nTop 10 Scorers:\n")
## 
## Top 10 Scorers:
top_scorers %>% select(Player, Tm, PTS, MP, G) %>% kable(digits = 1)
Player Tm PTS MP G
CJ McCollum SG 6 0.8 8.0 6
Ja Morant 9 0.8 6.0 9
Otto Porter Jr. 9 0.8 5.0 3
Bobby Portis C 12 0.8 8.0 5
Monte Morris PG 5 0.4 2.0 5
Klay ThompsonSG 2 0.3 7.0 22
Fred VanVleet P 4 0.3 4.0 4
Bones Hyland PG NA 0.2 17.4 5
Jayson Tatum SF 24 0.2 8.0 24
Nikola Vu?evi? 5 0.2 2.0 5

Plot 4: Field Goal % vs Points

cat("Plot 4 Debug - Starting with nba_clean rows:", nrow(nba_clean), "\n")
## Plot 4 Debug - Starting with nba_clean rows: 10
cat("FG summary:\n")
## FG summary:
print(summary(nba_clean$FG))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       2       2       2       2       2       2       9
cat("\nFGA summary:\n")
## 
## FGA summary:
print(summary(nba_clean$FGA))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      NA      NA      NA     NaN      NA      NA      10
cat("\nPTS summary:\n")
## 
## PTS summary:
print(summary(nba_clean$PTS))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.200   0.225   0.350   0.480   0.800   0.800
cat("\nChecking NA counts:\n")
## 
## Checking NA counts:
cat("FG NAs:", sum(is.na(nba_clean$FG)), "\n")
## FG NAs: 9
cat("FGA NAs:", sum(is.na(nba_clean$FGA)), "\n")
## FGA NAs: 10
cat("PTS NAs:", sum(is.na(nba_clean$PTS)), "\n")
## PTS NAs: 0
cat("\nFGA > 0 count:", sum(nba_clean$FGA > 0, na.rm = TRUE), "\n")
## 
## FGA > 0 count: 0
cat("\nFirst 20 rows of FG, FGA, PTS:\n")
## 
## First 20 rows of FG, FGA, PTS:
print(head(nba_clean %>% select(Player, FG, FGA, PTS), 20))
## # A tibble: 10 × 4
##    Player             FG   FGA   PTS
##    <chr>           <dbl> <dbl> <dbl>
##  1 Bones Hyland PG     2    NA   0.2
##  2 CJ McCollum SG     NA    NA   0.8
##  3 Ja Morant          NA    NA   0.8
##  4 Monte Morris PG    NA    NA   0.4
##  5 Otto Porter Jr.    NA    NA   0.8
##  6 Bobby Portis C     NA    NA   0.8
##  7 Jayson Tatum SF    NA    NA   0.2
##  8 Klay ThompsonSG    NA    NA   0.3
##  9 Fred VanVleet P    NA    NA   0.3
## 10 Nikola Vu?evi?     NA    NA   0.2
# Try the most minimal filter possible
nba_filtered <- nba_clean %>%
  filter(!is.na(FG), !is.na(FGA), !is.na(PTS))

cat("\nAfter removing NAs only:", nrow(nba_filtered), "\n")
## 
## After removing NAs only: 0
if (nrow(nba_filtered) > 0) {
  # Now filter for FGA > 0
  nba_filtered <- nba_filtered %>%
    filter(FGA > 0) %>%
    mutate(FG_pct_calc = FG / FGA)
  
  cat("After FGA > 0 filter:", nrow(nba_filtered), "\n")
  
  if (nrow(nba_filtered) >= 5) {
    cat("FG_pct_calc range:", range(nba_filtered$FG_pct_calc, na.rm=TRUE), "\n")
    cat("Sample FG_pct_calc values:", head(nba_filtered$FG_pct_calc, 10), "\n\n")
    
    # Remove any non-finite values
    nba_filtered <- nba_filtered %>%
      filter(is.finite(FG_pct_calc), is.finite(PTS))
    
    cat("After removing non-finite:", nrow(nba_filtered), "\n")
    
    if (nrow(nba_filtered) >= 5) {
      # Create plot
      ggplot(nba_filtered, aes(x = FG_pct_calc, y = PTS)) +
        geom_point(alpha = 0.6, size = 3, color = "darkgreen") +
        geom_smooth(method = "lm", color = "blue", se = TRUE, alpha = 0.2) +
        labs(title = "Field Goal Percentage vs Points Per Game",
             subtitle = paste("Players with field goal attempts (n =", nrow(nba_filtered), ")"),
             x = "Field Goal Percentage",
             y = "Points Per Game") +
        theme_minimal() +
        theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
      
      # Calculate correlation
      if (nrow(nba_filtered) >= 2) {
        fg_correlation <- cor(nba_filtered$FG_pct_calc, nba_filtered$PTS, 
                             use = "complete.obs")
        cat("\nCorrelation between FG% and Points:", round(fg_correlation, 3), "\n")
      }
    } else {
      cat("Not enough data after removing non-finite values\n")
    }
  } else {
    cat("Not enough data after FGA > 0 filter\n")
    cat("Sample of data with FGA values:\n")
    print(nba_filtered %>% select(Player, FG, FGA, PTS) %>% head(10))
  }
} else {
  cat("All rows have NA in FG, FGA, or PTS!\n")
}
## All rows have NA in FG, FGA, or PTS!

Plot 5: Assists vs Points by Position

cat("Plot 5 Debug - Starting with nba_clean rows:", nrow(nba_clean), "\n")
## Plot 5 Debug - Starting with nba_clean rows: 10
cat("Non-NA AST:", sum(!is.na(nba_clean$AST)), "\n")
## Non-NA AST: 9
cat("Non-NA Pos:", sum(!is.na(nba_clean$Pos)), "\n")
## Non-NA Pos: 10
cat("Non-NA PTS:", sum(!is.na(nba_clean$PTS)), "\n")
## Non-NA PTS: 10
cat("MP >= 10:", sum(nba_clean$MP >= 10, na.rm=TRUE), "\n")
## MP >= 10: 1
# Try with NO MP filter first
nba_position <- nba_clean %>%
  filter(!is.na(AST), !is.na(Pos), !is.na(PTS))

cat("After filtering (no MP filter):", nrow(nba_position), "\n")
## After filtering (no MP filter): 9
if(nrow(nba_position) > 0) {
  cat("AST range:", range(nba_position$AST, na.rm=TRUE), "\n")
  cat("PTS range:", range(nba_position$PTS, na.rm=TRUE), "\n")
  cat("Positions:", paste(unique(nba_position$Pos), collapse=", "), "\n")
  
  ggplot(nba_position, aes(x = AST, y = PTS, color = Pos)) +
    geom_point(alpha = 0.6, size = 3) +
    geom_smooth(method = "lm", se = FALSE) +
    labs(title = "Assists vs Points Per Game by Position",
         subtitle = paste("All players (n =", nrow(nba_position), ")"),
         x = "Assists Per Game",
         y = "Points Per Game",
         color = "Position") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
} else {
  cat("ERROR: No data to plot!\n")
  cat("Showing first 10 rows of nba_clean:\n")
  print(head(nba_clean %>% select(Player, AST, Pos, PTS, MP), 10))
}
## AST range: 0.2 7 
## PTS range: 0.2 0.8 
## Positions: 30, PG   22, 26, PF   28, 23, 31, G     27, C     31

Plot 6: Rebounds by Position

cat("Plot 6 Debug - Starting with nba_clean rows:", nrow(nba_clean), "\n")
## Plot 6 Debug - Starting with nba_clean rows: 10
cat("TRB summary:\n")
## TRB summary:
print(summary(nba_clean$TRB))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     3.0     3.5     6.0     5.5     7.0     8.0       4
cat("Pos summary:\n")
## Pos summary:
print(table(nba_clean$Pos, useNA = "ifany"))
## 
##        2       23       26       30       31 C     31 G     27  PF   28 
##        1        1        2        1        1        1        1        1 
##  PG   22 
##        1
# Use the absolute minimum filter
nba_rebounds <- nba_clean %>%
  filter(!is.na(TRB), !is.na(Pos), Pos != "")

cat("\nAfter filtering:", nrow(nba_rebounds), "\n")
## 
## After filtering: 6
if(nrow(nba_rebounds) > 0) {
  cat("TRB range:", range(nba_rebounds$TRB, na.rm=TRUE), "\n")
  cat("Number of positions:", length(unique(nba_rebounds$Pos)), "\n")
  cat("Positions:", paste(unique(nba_rebounds$Pos), collapse=", "), "\n\n")
  
  # Explicitly print the plot
  p <- ggplot(nba_rebounds, aes(x = Pos, y = TRB, fill = Pos)) +
    geom_boxplot(alpha = 0.7) +
    labs(title = "Total Rebounds Distribution by Position",
         subtitle = paste("All players (n =", nrow(nba_rebounds), ")"),
         x = "Position",
         y = "Total Rebounds Per Game") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
          legend.position = "none")
  
  print(p)
  
  cat("\nAverage Rebounds by Position:\n")
  nba_rebounds %>%
    group_by(Pos) %>%
    summarise(Count = n(),
              Mean_TRB = round(mean(TRB, na.rm = TRUE), 2), .groups = 'drop') %>%
    arrange(desc(Mean_TRB)) %>%
    kable()
} else {
  cat("ERROR: No data after filtering!\n")
  cat("Sample of raw data:\n")
  print(head(nba_clean %>% select(Player, TRB, Pos), 20))
}
## TRB range: 3 8 
## Number of positions: 5 
## Positions: PG   22, 26, PF   28, 31, G     27

## 
## Average Rebounds by Position:
Pos Count Mean_TRB
PF 28 1 8
31 1 7
PG 22 1 7
26 2 4
G 27 1 3

Plot 7: Three-Point Shooting

three_point_data <- nba_clean %>%
  filter(!is.na(X3P_pct), X3PA >= 2)

ggplot(three_point_data, aes(x = X3P_pct)) +
  geom_histogram(bins = 25, fill = "orange", color = "black", alpha = 0.7) +
  geom_vline(xintercept = mean(three_point_data$X3P_pct, na.rm = TRUE),
             color = "red", linetype = "dashed", linewidth = 1) +
  labs(title = "Distribution of Three-Point Shooting Percentage",
       x = "Three-Point Percentage",
       y = "Frequency") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))

cat("\n3P% Summary:\n")
## 
## 3P% Summary:
summary(three_point_data$X3P_pct)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2980  0.3272  0.3625  0.3608  0.3957  0.4230

Plot 8: Team Performance

cat("Plot 8 Debug - Starting with nba_clean rows:", nrow(nba_clean), "\n")
## Plot 8 Debug - Starting with nba_clean rows: 10
cat("Tm summary:\n")
## Tm summary:
print(table(nba_clean$Tm, useNA = "ifany"))
## 
##   12    2   24    4    5    6    9 <NA> 
##    1    1    1    1    2    1    2    1
cat("\nPTS summary:\n")
## 
## PTS summary:
print(summary(nba_clean$PTS))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.200   0.225   0.350   0.480   0.800   0.800
# Create team stats with minimal filtering
team_stats <- nba_clean %>%
  filter(!is.na(Tm), !is.na(PTS), Tm != "") %>%
  group_by(Tm) %>%
  summarise(Players = n(),
            Avg_PTS = mean(PTS, na.rm = TRUE), .groups = 'drop')

cat("\nTotal teams before filtering:", nrow(team_stats), "\n")
## 
## Total teams before filtering: 7
cat("Teams and player counts:\n")
## Teams and player counts:
print(team_stats)
## # A tibble: 7 × 3
##   Tm    Players Avg_PTS
##   <chr>   <int>   <dbl>
## 1 12          1     0.8
## 2 2           1     0.3
## 3 24          1     0.2
## 4 4           1     0.3
## 5 5           2     0.3
## 6 6           1     0.8
## 7 9           2     0.8
# Use even lower threshold - just 1 player minimum
team_stats <- team_stats %>%
  filter(Players >= 1) %>%
  arrange(desc(Avg_PTS))

cat("\nTeams with >= 1 player:", nrow(team_stats), "\n")
## 
## Teams with >= 1 player: 7
if(nrow(team_stats) > 0 && any(!is.na(team_stats$Avg_PTS))) {
  cat("Avg_PTS range:", range(team_stats$Avg_PTS, na.rm=TRUE), "\n")
  cat("Teams:", paste(team_stats$Tm, collapse=", "), "\n\n")
  
  # Explicitly print the plot
  p <- ggplot(team_stats, aes(x = reorder(Tm, Avg_PTS), y = Avg_PTS)) +
    geom_col(fill = "purple", alpha = 0.7) +
    coord_flip() +
    labs(title = "Average Points Per Game by Team",
         subtitle = paste("All teams (n =", nrow(team_stats), ")"),
         x = "Team",
         y = "Average Points Per Game") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
  
  print(p)
} else {
  cat("ERROR: No valid team data to plot!\n")
  cat("Sample of raw data:\n")
  print(head(nba_clean %>% select(Player, Tm, PTS), 20))
}
## Avg_PTS range: 0.2 0.8 
## Teams: 12, 6, 9, 5, 2, 4, 24

Conclusions

cat("Key Findings:\n")
## Key Findings:
cat("1. Analyzed", nrow(nba_clean), "players from 2021-2022 NBA Playoffs\n")
## 1. Analyzed 10 players from 2021-2022 NBA Playoffs
cat("2. Strong correlation between minutes and points:", round(correlation, 3), "\n")
## 2. Strong correlation between minutes and points: -0.105
cat("3. Top scorer:", top_scorers$Player[1], "with", round(top_scorers$PTS[1], 1), "PPG\n")
## 3. Top scorer: CJ McCollum SG with 0.8 PPG

Session Info

sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] knitr_1.51      lubridate_1.9.4 forcats_1.0.1   stringr_1.5.1  
##  [5] dplyr_1.1.4     purrr_1.0.4     readr_2.1.6     tidyr_1.3.1    
##  [9] tibble_3.2.1    ggplot2_4.0.1   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.4         sass_0.4.9         generics_0.1.3     stringi_1.8.7     
##  [5] lattice_0.22-6     hms_1.1.3          digest_0.6.37      magrittr_2.0.3    
##  [9] evaluate_1.0.5     grid_4.4.2         timechange_0.3.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      Matrix_1.7-1       jsonlite_2.0.0     mgcv_1.9-1        
## [17] scales_1.4.0       jquerylib_0.1.4    cli_3.6.3          rlang_1.1.4       
## [21] crayon_1.5.3       bit64_4.6.0-1      splines_4.4.2      withr_3.0.2       
## [25] cachem_1.1.0       yaml_2.3.12        tools_4.4.2        parallel_4.4.2    
## [29] tzdb_0.5.0         vctrs_0.6.5        R6_2.5.1           lifecycle_1.0.4   
## [33] bit_4.6.0          vroom_1.6.7        pkgconfig_2.0.3    pillar_1.10.1     
## [37] bslib_0.9.0        gtable_0.3.6       glue_1.8.0         xfun_0.55         
## [41] tidyselect_1.2.1   rstudioapi_0.17.1  farver_2.1.2       htmltools_0.5.8.1 
## [45] nlme_3.1-166       rmarkdown_2.30     labeling_0.4.3     compiler_4.4.2    
## [49] S7_0.2.1