NBA Data Dive Analysis

Load Data

# Read the NBA CSV into a data frame, then preview its first six rows.
nba_data <- read.csv(file = "C:/Statistics/nba.csv")
head(nba_data, n = 6L)

##       bbrID       Date  Tm Opp TRB AST STL BLK PTS GmSc  Season Playoffs Year
## 1 abdelal01 1993-03-16 BOS GSW  10   2   0   0  25 22.7 1992-93    false 1993
## 2 abdulma02 1991-04-02 DEN DAL   2   6   4   0  30 29.7 1990-91    false 1991
## 3 abdulta01 1998-04-19 SAC VAN   2   3   1   0  31 26.4 1997-98    false 1998
## 4 abdursh01 2001-11-23 ATL DET  12   5   2   1  50 46.0 2001-02    false 2002
## 5 abrinal01 2018-11-01 OKC CHO   2   0   0   0  25 17.1 2018-19    false 2019
## 6 achiupr01 2021-01-12 MIA PHI  13   3   0   1  17 16.9 2020-21    false 2021
##   GameIndex GmScMovingZ GmScMovingZTop2Delta      Date2 GmSc2 GmScMovingZ2
## 1       181        4.13                 0.24 1991-12-04  18.6         3.89
## 2        64        3.82                 0.64 1995-12-07  40.1         3.18
## 3        58        4.11                 1.67 1998-01-14  16.9         2.44
## 4       386        4.06                 0.84 2003-11-28  34.3         3.22
## 5       160        3.37                 0.18 2018-11-30  16.6         3.19
## 6         8        2.58                 0.05 2021-02-28  16.8         2.53

Grouping and Summarization

Group 1: Average Points per Team

# Mean of PTS for each team (NAs ignored); one ungrouped row per Tm.
group1 <- summarise(
  group_by(nba_data, Tm),
  mean_pts = mean(PTS, na.rm = TRUE),
  .groups = "drop"
)

# Bar chart of the per-team means, with bars ordered from highest to lowest.
ggplot(
  data = group1,
  mapping = aes(x = reorder(Tm, -mean_pts), y = mean_pts, fill = Tm)
) +
  geom_bar(stat = "identity") +
  labs(title = "Average Points per Team", x = "Team", y = "Average Points") +
  theme_minimal() +
  # Applied after theme_minimal() so the rotated tick labels are not reset.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Group 2: Average Assists per Season

# Mean of AST for each season (NAs ignored); one ungrouped row per Season.
group2 <- summarise(
  group_by(nba_data, Season),
  mean_ast = mean(AST, na.rm = TRUE),
  .groups = "drop"
)

# Line chart of the seasonal means. Season is a discrete label, so the line
# needs a single constant group to connect the points.
ggplot(data = group2, mapping = aes(x = Season, y = mean_ast)) +
  geom_line(colour = "blue", group = 1) +
  labs(title = "Average Assists per Season", x = "Season", y = "Average Assists") +
  theme_minimal()

Group 3: Total Rebounds by Playoff Status

# Sum of TRB (NAs ignored) for each Playoffs value; the result is ungrouped.
group3 <- nba_data |>
  group_by(Playoffs) |>
  summarise(total_rebounds = sum(TRB, na.rm = TRUE), .groups = "drop")

# Visualization: one bar per Playoffs value, coloured by that same value.
ggplot(
  group3,
  aes(x = as.factor(Playoffs), y = total_rebounds, fill = as.factor(Playoffs))
) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  # Name the fill legend explicitly; without `fill =` its title would be the
  # raw mapping expression "as.factor(Playoffs)".
  labs(
    title = "Total Rebounds by Playoff Status",
    x = "Playoffs",
    y = "Total Rebounds",
    fill = "Playoffs"
  )

Probability Analysis

# Derive each team's share of the summed per-team mean points (`prob`) and
# label the team(s) holding the smallest share. `tag` can refer to `prob`
# because mutate() evaluates its arguments in order.
prob_group <- mutate(
  group1,
  prob = mean_pts / sum(mean_pts),
  tag = ifelse(prob == min(prob), "Lowest Probability", "Normal")
)

# Show only the team(s) tagged as having the lowest share.
filter(prob_group, tag == "Lowest Probability")

## # A tibble: 1 × 4
##   Tm    mean_pts   prob tag               
##   <chr>    <dbl>  <dbl> <chr>             
## 1 NOK       14.7 0.0152 Lowest Probability

Missing Combinations

# Team/playoff-status pairs that actually occur in the data.
team_playoff_combinations <- nba_data |>
  select(Tm, Playoffs) |>
  unique()

# Every pair that could occur: the cross product of all teams and all
# playoff-status values. stringsAsFactors = FALSE keeps the columns the same
# type as in `nba_data` instead of converting text to factors, so the join
# below compares like with like.
all_teams <- unique(nba_data$Tm)
all_playoffs <- unique(nba_data$Playoffs)
expected_combinations <- expand.grid(
  Tm = all_teams,
  Playoffs = all_playoffs,
  stringsAsFactors = FALSE
)

# Pairs that are expected but never observed. The join keys are spelled out so
# the result does not depend on which column names the two tables happen to
# share (and no "Joining with ..." message is emitted).
missing_combinations <- anti_join(
  expected_combinations,
  team_playoff_combinations,
  by = c("Tm", "Playoffs")
)

# Display missing combinations
missing_combinations

##     Tm Playoffs
## 1  OKC     true
## 2  ORL     true
## 3  WSB     true
## 4  LAC     true
## 5  IND     true
## 6  NJN     true
## 7  PHI     true
## 8  VAN     true
## 9  TOR     true
## 10 CHA     true
## 11 CHI     true
## 12 CHO     true
## 13 CHH     true
## 14 NOK     true

Hypothesis Testing: Points and Playoff Qualification

# Mean points for playoff and non-playoff rows.
#
# `Playoffs` holds the text labels "true"/"false" in this dataset, so comparing
# it with the numbers 1 and 0 matches no rows and both means come out as NaN.
# The labels are normalised (case and surrounding whitespace) before matching;
# "1"/"0" are also accepted so a numeric or logical encoding keeps working.
playoff_teams <- nba_data |>
  filter(tolower(trimws(as.character(Playoffs))) %in% c("true", "1")) |>
  summarise(mean_pts = mean(PTS, na.rm = TRUE))
non_playoff_teams <- nba_data |>
  filter(tolower(trimws(as.character(Playoffs))) %in% c("false", "0")) |>
  summarise(mean_pts = mean(PTS, na.rm = TRUE))

# Two-sample t-test of PTS between the two Playoffs groups.
# var.equal = TRUE requests the pooled-variance (Student) form of the test.
t_test_result <- t.test(nba_data$PTS ~ nba_data$Playoffs, var.equal = TRUE)

# Display results
list(
  playoff_teams = playoff_teams,
  non_playoff_teams = non_playoff_teams,
  t_test_result = t_test_result
)

## $playoff_teams
##   mean_pts
## 1      NaN
## 
## $non_playoff_teams
##   mean_pts
## 1      NaN
## 
## $t_test_result
## 
##  Two Sample t-test
## 
## data:  nba_data$PTS by nba_data$Playoffs
## t = -4.7158, df = 1701, p-value = 0.000002604
## alternative hypothesis: true difference in means between group false and group true is not equal to 0
## 95 percent confidence interval:
##  -10.017957  -4.132572
## sample estimates:
## mean in group false  mean in group true 
##            25.86224            32.93750

Insights & Conclusions

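Some teams have a noticeably lower "probability of selection" — defined here as the team's share of the summed per-team average points — with NOK the lowest at 0.0152 (14.7 average points), which likely indicates underperformance.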
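Playoff teams tend to have higher rebounds, suggesting a correlation between strong defensive play and postseason success. Note that Group 3 compares total rebounds, which also depend on how many rows fall into each group, so a per-game average would be needed to confirm this.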
Missing team-playoff combinations could indicate that certain teams have never made the playoffs in the dataset timeframe.
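A hypothesis was tested: “Teams with higher average points are more likely to qualify for playoffs.” The two-sample t-test (t = -4.72, df = 1701, p ≈ 0.0000026) shows a statistically significant difference: mean points are about 32.9 in the playoff group versus 25.9 in the non-playoff group.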

This notebook provides a structured analysis of the NBA dataset, highlighting team performance trends, probability distributions, missing data patterns, and a hypothesis test on points per game and playoff qualification.