Load Required Libraries
library(tidyverse)
library(ggplot2)
library(dplyr)
library(knitr)
Load and Clean Data
# Read the playoff stats file as fixed-width text with readr::read_fwf()
# (not base read.fwf()). The column widths below are hand-specified.
#
# NOTE(review): the rendered output of this report shows position codes fused
# into `Player`, ages inside `Pos`, and game counts inside `Tm`, so these
# widths very likely do not match the file layout. Confirm the real layout
# (fixed-width vs. delimited) against the raw file. The checks after the read
# make any such misalignment visible instead of silently producing garbage.
col_spec <- fwf_widths(
  c(3, 15, 8, 4, 4, 3, 3, 5, 4, 5, 5, 3, 4, 5, 3, 4, 5, 6, 3, 4, 5, 4, 4, 4,
    4, 4, 4, 4, 3, 5),
  col_names = c(
    "Rk", "Player", "Pos", "Age", "Tm", "G", "GS", "MP",
    "FG", "FGA", "FG_pct", "X3P", "X3PA", "X3P_pct",
    "X2P", "X2PA", "X2P_pct", "eFG_pct", "FT", "FTA", "FT_pct",
    "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS"
  )
)

# Read the data; skip = 1 drops the header row.
nba_data <- read_fwf(
  "2021-2022-NBA-Player-Stats-Playoffs.txt",
  col_positions = col_spec,
  skip = 1
)

# Surface parsing failures recorded by readr instead of ignoring them.
n_parse_problems <- nrow(problems(nba_data))
if (n_parse_problems > 0) {
  warning(
    n_parse_problems, " parsing problem(s) reported by readr; ",
    "inspect problems(nba_data).",
    call. = FALSE
  )
}

# Alignment check: `Pos` should contain only position codes such as "PG" or
# "SF-PF" (assumed code set -- TODO confirm against the data dictionary).
# Anything else (e.g. "PG 22") means the column boundaries are wrong.
pos_pattern <- "^(PG|SG|SF|PF|C|G|F)(-(PG|SG|SF|PF|C|G|F))*$"
bad_pos <- !grepl(pos_pattern, nba_data$Pos)
if (any(bad_pos)) {
  warning(
    sum(bad_pos), " of ", nrow(nba_data),
    " rows have a `Pos` value that is not a position code; ",
    "the fixed-width column boundaries probably do not match the file.",
    call. = FALSE
  )
}
# Strip leading/trailing whitespace from every text column.
nba_data <- mutate(nba_data, across(where(is.character), str_trim))

# Columns to coerce to numeric (everything except Player, Pos and Tm).
numeric_cols <- c(
  "Rk", "Age", "G", "GS", "MP",
  "FG", "FGA", "FG_pct",
  "X3P", "X3PA", "X3P_pct",
  "X2P", "X2PA", "X2P_pct",
  "eFG_pct",
  "FT", "FTA", "FT_pct",
  "ORB", "DRB", "TRB",
  "AST", "STL", "BLK", "TOV", "PF", "PTS"
)
nba_data <- nba_data %>%
  mutate(across(all_of(numeric_cols), as.numeric))

# Keep only rows that have a player name plus points, minutes and games,
# and where the player actually appeared (games and minutes both positive).
nba_clean <- nba_data %>%
  drop_na(PTS, MP, G, Player) %>%
  filter(G > 0, MP > 0)

cat("Data loaded successfully!\n")
## Data loaded successfully!
# Report how many rows survived cleaning.
cat("Total players:", nrow(nba_clean), "\n")
## Total players: 10
cat("\nFirst few players:\n")
##
## First few players:
# Show the headline columns for the first ten players as a table.
nba_clean %>%
  select(Player, Tm, G, MP, PTS, TRB, AST) %>%
  head(10) %>%
  kable()
| Bones Hyland PG |
NA |
5 |
17.4 |
0.2 |
NA |
NA |
| CJ McCollum SG |
6 |
6 |
8.0 |
0.8 |
NA |
7.0 |
| Ja Morant |
9 |
9 |
6.0 |
0.8 |
7 |
1.4 |
| Monte Morris PG |
5 |
5 |
2.0 |
0.4 |
5 |
0.2 |
| Otto Porter Jr. |
9 |
3 |
5.0 |
0.8 |
8 |
0.6 |
| Bobby Portis C |
12 |
5 |
8.0 |
0.8 |
3 |
2.1 |
| Jayson Tatum SF |
24 |
24 |
8.0 |
0.2 |
NA |
1.0 |
| Klay ThompsonSG |
2 |
22 |
7.0 |
0.3 |
7 |
0.3 |
| Fred VanVleet P |
4 |
4 |
4.0 |
0.3 |
3 |
0.5 |
| Nikola Vu?evi? |
5 |
5 |
2.0 |
0.2 |
NA |
2.8 |
Summary Statistics
# One-row overview: player count plus the mean of each headline stat,
# rounded to two decimals (missing values ignored).
summary_table <- tibble(
  Total_Players = nrow(nba_clean),
  Avg_Points    = round(mean(nba_clean$PTS, na.rm = TRUE), 2),
  Avg_Minutes   = round(mean(nba_clean$MP, na.rm = TRUE), 2),
  Avg_Rebounds  = round(mean(nba_clean$TRB, na.rm = TRUE), 2),
  Avg_Assists   = round(mean(nba_clean$AST, na.rm = TRUE), 2)
)
summary_table %>%
  kable(caption = "Overall Statistics Summary")
Overall Statistics Summary
| 10 |
0.48 |
6.74 |
5.5 |
1.77 |
Plot 1: Distribution of Points Per Game
# Histogram of points per game across all cleaned rows.
ggplot(nba_clean, aes(PTS)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    colour = "black",
    alpha = 0.7
  ) +
  ggtitle("Distribution of Points Per Game in 2021-2022 NBA Playoffs") +
  xlab("Points Per Game") +
  ylab("Frequency") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    axis.title = element_text(size = 12)
  )

cat("\nPoints Per Game Summary:\n")
##
## Points Per Game Summary:
# Five-number summary plus mean of the PTS column.
summary(nba_clean[["PTS"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.200 0.225 0.350 0.480 0.800 0.800
Plot 2: Minutes vs Points
# Scatter of minutes against points with a dashed linear fit and its
# confidence band.
ggplot(nba_clean, aes(MP, PTS)) +
  geom_point(alpha = 0.6, size = 3, colour = "coral") +
  geom_smooth(
    method = "lm",
    colour = "red",
    linetype = "dashed",
    se = TRUE,
    alpha = 0.2
  ) +
  ggtitle("Relationship Between Minutes Played and Points Scored") +
  xlab("Minutes Per Game") +
  ylab("Points Per Game") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    axis.title = element_text(size = 12)
  )

# Pearson correlation on rows where both values are present.
correlation <- with(nba_clean, cor(MP, PTS, use = "complete.obs"))
cat("\nCorrelation:", round(correlation, 3), "\n")
##
## Correlation: -0.105
# Simple linear regression of points on minutes; report its R-squared.
model <- lm(PTS ~ MP, data = nba_clean)
cat("R-squared:", round(summary(model)$r.squared, 3), "\n")
## R-squared: 0.011
Plot 3: Top 10 Scorers
# The ten rows with the highest PTS, highest first.
top_scorers <- nba_clean %>%
  arrange(desc(PTS)) %>%
  slice_head(n = 10)

# Horizontal bars ordered by points, each labelled with its rounded value.
# The x-axis is widened by 15% so the labels are not clipped.
ggplot(top_scorers, aes(PTS, reorder(Player, PTS))) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  geom_text(aes(label = round(PTS, 1)), hjust = -0.2, size = 3.5) +
  ggtitle("Top 10 Scorers - 2021-2022 NBA Playoffs") +
  xlab("Points Per Game") +
  ylab("Player") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14)) +
  xlim(0, max(top_scorers$PTS) * 1.15)

cat("\nTop 10 Scorers:\n")
##
## Top 10 Scorers:
kable(select(top_scorers, Player, Tm, PTS, MP, G), digits = 1)
| CJ McCollum SG |
6 |
0.8 |
8.0 |
6 |
| Ja Morant |
9 |
0.8 |
6.0 |
9 |
| Otto Porter Jr. |
9 |
0.8 |
5.0 |
3 |
| Bobby Portis C |
12 |
0.8 |
8.0 |
5 |
| Monte Morris PG |
5 |
0.4 |
2.0 |
5 |
| Klay ThompsonSG |
2 |
0.3 |
7.0 |
22 |
| Fred VanVleet P |
4 |
0.3 |
4.0 |
4 |
| Bones Hyland PG |
NA |
0.2 |
17.4 |
5 |
| Jayson Tatum SF |
24 |
0.2 |
8.0 |
24 |
| Nikola Vu?evi? |
5 |
0.2 |
2.0 |
5 |
Plot 4: Field Goal % vs Points
# Diagnostics for Plot 4: inspect FG, FGA and PTS before any filtering.
cat("Plot 4 Debug - Starting with nba_clean rows:", nrow(nba_clean), "\n")
## Plot 4 Debug - Starting with nba_clean rows: 10
cat("FG summary:\n")
## FG summary:
summary(nba_clean[["FG"]]) %>% print()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2 2 2 2 2 2 9
cat("\nFGA summary:\n")
##
## FGA summary:
summary(nba_clean[["FGA"]]) %>% print()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## NA NA NA NaN NA NA 10
cat("\nPTS summary:\n")
##
## PTS summary:
summary(nba_clean[["PTS"]]) %>% print()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.200 0.225 0.350 0.480 0.800 0.800
cat("\nChecking NA counts:\n")
##
## Checking NA counts:
# Number of missing values in each of the three columns.
cat("FG NAs:", sum(is.na(nba_clean[["FG"]])), "\n")
## FG NAs: 9
cat("FGA NAs:", sum(is.na(nba_clean[["FGA"]])), "\n")
## FGA NAs: 10
cat("PTS NAs:", sum(is.na(nba_clean[["PTS"]])), "\n")
## PTS NAs: 0
# Rows with at least one attempt; missing FGA values are not counted.
cat("\nFGA > 0 count:", length(which(nba_clean[["FGA"]] > 0)), "\n")
##
## FGA > 0 count: 0
cat("\nFirst 20 rows of FG, FGA, PTS:\n")
##
## First 20 rows of FG, FGA, PTS:
nba_clean %>%
  select(Player, FG, FGA, PTS) %>%
  head(20) %>%
  print()
## # A tibble: 10 × 4
## Player FG FGA PTS
## <chr> <dbl> <dbl> <dbl>
## 1 Bones Hyland PG 2 NA 0.2
## 2 CJ McCollum SG NA NA 0.8
## 3 Ja Morant NA NA 0.8
## 4 Monte Morris PG NA NA 0.4
## 5 Otto Porter Jr. NA NA 0.8
## 6 Bobby Portis C NA NA 0.8
## 7 Jayson Tatum SF NA NA 0.2
## 8 Klay ThompsonSG NA NA 0.3
## 9 Fred VanVleet P NA NA 0.3
## 10 Nikola Vu?evi? NA NA 0.2
# Plot 4: field-goal percentage (computed as FG / FGA) against points.
# Start from the loosest filter: rows where FG, FGA and PTS are all present.
nba_filtered <- nba_clean %>%
  filter(!is.na(FG), !is.na(FGA), !is.na(PTS))
cat("\nAfter removing NAs only:", nrow(nba_filtered), "\n")
##
## After removing NAs only: 0
if (nrow(nba_filtered) > 0) {
  # Require at least one attempt so the FG / FGA division is defined.
  nba_filtered <- nba_filtered %>%
    filter(FGA > 0) %>%
    mutate(FG_pct_calc = FG / FGA)
  cat("After FGA > 0 filter:", nrow(nba_filtered), "\n")

  if (nrow(nba_filtered) >= 5) {
    cat("FG_pct_calc range:", range(nba_filtered$FG_pct_calc, na.rm = TRUE), "\n")
    cat("Sample FG_pct_calc values:", head(nba_filtered$FG_pct_calc, 10), "\n\n")

    # Remove any non-finite values
    nba_filtered <- nba_filtered %>%
      filter(is.finite(FG_pct_calc), is.finite(PTS))
    cat("After removing non-finite:", nrow(nba_filtered), "\n")

    if (nrow(nba_filtered) >= 5) {
      fg_plot <- ggplot(nba_filtered, aes(x = FG_pct_calc, y = PTS)) +
        geom_point(alpha = 0.6, size = 3, color = "darkgreen") +
        geom_smooth(method = "lm", color = "blue", se = TRUE, alpha = 0.2) +
        labs(
          title = "Field Goal Percentage vs Points Per Game",
          subtitle = paste("Players with field goal attempts (n =", nrow(nba_filtered), ")"),
          x = "Field Goal Percentage",
          y = "Points Per Game"
        ) +
        theme_minimal() +
        theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
      # A ggplot object is only drawn automatically when it is the visible
      # result of a top-level expression. Here it sits inside nested braces
      # and is followed by more statements, so it must be printed explicitly
      # or the plot is silently discarded.
      print(fg_plot)

      # Correlation between FG% and points. This branch already guarantees
      # at least five rows, so no extra size check is needed.
      fg_correlation <- cor(
        nba_filtered$FG_pct_calc, nba_filtered$PTS,
        use = "complete.obs"
      )
      cat("\nCorrelation between FG% and Points:", round(fg_correlation, 3), "\n")
    } else {
      cat("Not enough data after removing non-finite values\n")
    }
  } else {
    cat("Not enough data after FGA > 0 filter\n")
    cat("Sample of data with FGA values:\n")
    print(nba_filtered %>% select(Player, FG, FGA, PTS) %>% head(10))
  }
} else {
  cat("All rows have NA in FG, FGA, or PTS!\n")
}
## All rows have NA in FG, FGA, or PTS!
Plot 5: Assists vs Points by Position
# Diagnostics for Plot 5: how many rows have each required field.
cat("Plot 5 Debug - Starting with nba_clean rows:", nrow(nba_clean), "\n")
## Plot 5 Debug - Starting with nba_clean rows: 10
cat("Non-NA AST:", sum(!is.na(nba_clean[["AST"]])), "\n")
## Non-NA AST: 9
cat("Non-NA Pos:", sum(!is.na(nba_clean[["Pos"]])), "\n")
## Non-NA Pos: 10
cat("Non-NA PTS:", sum(!is.na(nba_clean[["PTS"]])), "\n")
## Non-NA PTS: 10
cat("MP >= 10:", sum(nba_clean[["MP"]] >= 10, na.rm = TRUE), "\n")
## MP >= 10: 1
# No minutes threshold here: keep every row that has assists, position
# and points.
nba_position <- drop_na(nba_clean, AST, Pos, PTS)
cat("After filtering (no MP filter):", nrow(nba_position), "\n")
## After filtering (no MP filter): 9
# Plot assists against points, coloured by position, with one linear fit
# per position. If nothing survived filtering, dump a sample instead.
if (nrow(nba_position) == 0) {
  cat("ERROR: No data to plot!\n")
  cat("Showing first 10 rows of nba_clean:\n")
  print(head(nba_clean %>% select(Player, AST, Pos, PTS, MP), 10))
} else {
  cat("AST range:", range(nba_position$AST, na.rm = TRUE), "\n")
  cat("PTS range:", range(nba_position$PTS, na.rm = TRUE), "\n")
  cat("Positions:", paste(unique(nba_position$Pos), collapse = ", "), "\n")
  # The plot is the last expression of this top-level `if`, so it is drawn
  # through auto-printing.
  ggplot(nba_position, aes(AST, PTS, colour = Pos)) +
    geom_point(alpha = 0.6, size = 3) +
    geom_smooth(method = "lm", se = FALSE) +
    labs(
      title = "Assists vs Points Per Game by Position",
      subtitle = paste("All players (n =", nrow(nba_position), ")"),
      x = "Assists Per Game",
      y = "Points Per Game",
      colour = "Position"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
}
## AST range: 0.2 7
## PTS range: 0.2 0.8
## Positions: 30, PG 22, 26, PF 28, 23, 31, G 27, C 31

Plot 6: Rebounds by Position
# Diagnostics for Plot 6: rebound distribution and position frequencies.
cat("Plot 6 Debug - Starting with nba_clean rows:", nrow(nba_clean), "\n")
## Plot 6 Debug - Starting with nba_clean rows: 10
cat("TRB summary:\n")
## TRB summary:
summary(nba_clean[["TRB"]]) %>% print()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.0 3.5 6.0 5.5 7.0 8.0 4
cat("Pos summary:\n")
## Pos summary:
# Frequency of each Pos value; a missing-value bucket appears only if needed.
table(nba_clean[["Pos"]], useNA = "ifany") %>% print()
##
## 2 23 26 30 31 C 31 G 27 PF 28
## 1 1 2 1 1 1 1 1
## PG 22
## 1
# Keep rows that have a rebound figure and a non-empty position label.
nba_rebounds <- nba_clean %>%
  filter(!is.na(TRB) & !is.na(Pos) & Pos != "")
cat("\nAfter filtering:", nrow(nba_rebounds), "\n")
##
## After filtering: 6
if (nrow(nba_rebounds) == 0) {
  cat("ERROR: No data after filtering!\n")
  cat("Sample of raw data:\n")
  print(head(nba_clean %>% select(Player, TRB, Pos), 20))
} else {
  cat("TRB range:", range(nba_rebounds$TRB, na.rm = TRUE), "\n")
  cat("Number of positions:", n_distinct(nba_rebounds$Pos), "\n")
  cat("Positions:", paste(unique(nba_rebounds$Pos), collapse = ", "), "\n\n")

  # Box plot of rebounds per position; printed explicitly because more
  # statements follow it inside this block.
  p <- ggplot(nba_rebounds, aes(Pos, TRB, fill = Pos)) +
    geom_boxplot(alpha = 0.7) +
    labs(
      title = "Total Rebounds Distribution by Position",
      subtitle = paste("All players (n =", nrow(nba_rebounds), ")"),
      x = "Position",
      y = "Total Rebounds Per Game"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
      legend.position = "none"
    )
  print(p)

  cat("\nAverage Rebounds by Position:\n")
  # Per-position row count and mean rebounds, highest mean first.
  nba_rebounds %>%
    group_by(Pos) %>%
    summarise(
      Count = n(),
      Mean_TRB = round(mean(TRB, na.rm = TRUE), 2),
      .groups = "drop"
    ) %>%
    arrange(desc(Mean_TRB)) %>%
    kable()
}
## TRB range: 3 8
## Number of positions: 5
## Positions: PG 22, 26, PF 28, 31, G 27

##
## Average Rebounds by Position:
| PF 28 |
1 |
8 |
| 31 |
1 |
7 |
| PG 22 |
1 |
7 |
| 26 |
2 |
4 |
| G 27 |
1 |
3 |
Plot 7: Three-Point Shooting
# Rows with a three-point percentage and at least two attempts.
three_point_data <- nba_clean %>%
  filter(!is.na(X3P_pct) & X3PA >= 2)

# Histogram of three-point percentage with a dashed line at the mean.
mean_3p_pct <- mean(three_point_data$X3P_pct, na.rm = TRUE)
ggplot(three_point_data, aes(X3P_pct)) +
  geom_histogram(bins = 25, fill = "orange", colour = "black", alpha = 0.7) +
  geom_vline(
    xintercept = mean_3p_pct,
    colour = "red",
    linetype = "dashed",
    linewidth = 1
  ) +
  ggtitle("Distribution of Three-Point Shooting Percentage") +
  xlab("Three-Point Percentage") +
  ylab("Frequency") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))

cat("\n3P% Summary:\n")
##
## 3P% Summary:
summary(three_point_data[["X3P_pct"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2980 0.3272 0.3625 0.3608 0.3957 0.4230
Plot 8: Team Performance
# Diagnostics for Plot 8: team label frequencies and the points distribution.
cat("Plot 8 Debug - Starting with nba_clean rows:", nrow(nba_clean), "\n")
## Plot 8 Debug - Starting with nba_clean rows: 10
cat("Tm summary:\n")
## Tm summary:
table(nba_clean[["Tm"]], useNA = "ifany") %>% print()
##
## 12 2 24 4 5 6 9 <NA>
## 1 1 1 1 2 1 2 1
cat("\nPTS summary:\n")
##
## PTS summary:
summary(nba_clean[["PTS"]]) %>% print()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.200 0.225 0.350 0.480 0.800 0.800
# Per-team player count and mean points, using only rows that have a
# non-empty team label and a points value.
team_stats <- nba_clean %>%
  filter(!is.na(Tm) & !is.na(PTS) & Tm != "") %>%
  group_by(Tm) %>%
  summarise(
    Players = n(),
    Avg_PTS = mean(PTS, na.rm = TRUE),
    .groups = "drop"
  )
cat("\nTotal teams before filtering:", nrow(team_stats), "\n")
##
## Total teams before filtering: 7
cat("Teams and player counts:\n")
## Teams and player counts:
team_stats %>% print()
## # A tibble: 7 × 3
## Tm Players Avg_PTS
## <chr> <int> <dbl>
## 1 12 1 0.8
## 2 2 1 0.3
## 3 24 1 0.2
## 4 4 1 0.3
## 5 5 2 0.3
## 6 6 1 0.8
## 7 9 2 0.8
# Minimum-players threshold (currently 1), then order teams by mean points,
# highest first.
team_stats <- team_stats %>%
  filter(Players >= 1) %>%
  arrange(desc(Avg_PTS))
cat("\nTeams with >= 1 player:", nrow(team_stats), "\n")
##
## Teams with >= 1 player: 7
# Plot only when there is at least one team with a non-missing average;
# otherwise dump a sample of the underlying rows.
if (nrow(team_stats) == 0 || all(is.na(team_stats$Avg_PTS))) {
  cat("ERROR: No valid team data to plot!\n")
  cat("Sample of raw data:\n")
  print(head(nba_clean %>% select(Player, Tm, PTS), 20))
} else {
  cat("Avg_PTS range:", range(team_stats$Avg_PTS, na.rm = TRUE), "\n")
  cat("Teams:", paste(team_stats$Tm, collapse = ", "), "\n\n")

  # Horizontal bar chart of mean points per team, ordered by value.
  p <- ggplot(team_stats, aes(reorder(Tm, Avg_PTS), Avg_PTS)) +
    geom_col(fill = "purple", alpha = 0.7) +
    coord_flip() +
    labs(
      title = "Average Points Per Game by Team",
      subtitle = paste("All teams (n =", nrow(team_stats), ")"),
      x = "Team",
      y = "Average Points Per Game"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
  print(p)
}
## Avg_PTS range: 0.2 0.8
## Teams: 12, 6, 9, 5, 2, 4, 24

Conclusions
# Headline findings, built from objects computed earlier in the report
# (nba_clean, correlation, top_scorers).
cat("Key Findings:\n")
## Key Findings:
cat("1. Analyzed", nrow(nba_clean), "players from 2021-2022 NBA Playoffs\n")
## 1. Analyzed 10 players from 2021-2022 NBA Playoffs
# Describe the correlation by its actual magnitude instead of always calling
# it "Strong". Cut-offs are a common rule of thumb: |r| >= 0.7 strong,
# |r| >= 0.3 moderate, otherwise weak. An NA correlation is reported as
# undefined.
correlation_strength <- if (is.na(correlation)) {
  "Undefined"
} else if (abs(correlation) >= 0.7) {
  "Strong"
} else if (abs(correlation) >= 0.3) {
  "Moderate"
} else {
  "Weak"
}
cat(
  "2.", correlation_strength, "correlation between minutes and points:",
  round(correlation, 3), "\n"
)
## 2. Weak correlation between minutes and points: -0.105
cat("3. Top scorer:", top_scorers$Player[1], "with", round(top_scorers$PTS[1], 1), "PPG\n")
## 3. Top scorer: CJ McCollum SG with 0.8 PPG
Session Info
# Print the R version, platform, locale and package versions used for this
# run, so the report can be reproduced.
sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.51 lubridate_1.9.4 forcats_1.0.1 stringr_1.5.1
## [5] dplyr_1.1.4 purrr_1.0.4 readr_2.1.6 tidyr_1.3.1
## [9] tibble_3.2.1 ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.4 sass_0.4.9 generics_0.1.3 stringi_1.8.7
## [5] lattice_0.22-6 hms_1.1.3 digest_0.6.37 magrittr_2.0.3
## [9] evaluate_1.0.5 grid_4.4.2 timechange_0.3.0 RColorBrewer_1.1-3
## [13] fastmap_1.2.0 Matrix_1.7-1 jsonlite_2.0.0 mgcv_1.9-1
## [17] scales_1.4.0 jquerylib_0.1.4 cli_3.6.3 rlang_1.1.4
## [21] crayon_1.5.3 bit64_4.6.0-1 splines_4.4.2 withr_3.0.2
## [25] cachem_1.1.0 yaml_2.3.12 tools_4.4.2 parallel_4.4.2
## [29] tzdb_0.5.0 vctrs_0.6.5 R6_2.5.1 lifecycle_1.0.4
## [33] bit_4.6.0 vroom_1.6.7 pkgconfig_2.0.3 pillar_1.10.1
## [37] bslib_0.9.0 gtable_0.3.6 glue_1.8.0 xfun_0.55
## [41] tidyselect_1.2.1 rstudioapi_0.17.1 farver_2.1.2 htmltools_0.5.8.1
## [45] nlme_3.1-166 rmarkdown_2.30 labeling_0.4.3 compiler_4.4.2
## [49] S7_0.2.1