# Load the NBA data set from its CSV file and preview the first six rows.
nba_csv_path <- "C:/Statistics/nba.csv"
nba_data <- read.csv(file = nba_csv_path)
head(nba_data, n = 6L)
## bbrID Date Tm Opp TRB AST STL BLK PTS GmSc Season Playoffs Year
## 1 abdelal01 1993-03-16 BOS GSW 10 2 0 0 25 22.7 1992-93 false 1993
## 2 abdulma02 1991-04-02 DEN DAL 2 6 4 0 30 29.7 1990-91 false 1991
## 3 abdulta01 1998-04-19 SAC VAN 2 3 1 0 31 26.4 1997-98 false 1998
## 4 abdursh01 2001-11-23 ATL DET 12 5 2 1 50 46.0 2001-02 false 2002
## 5 abrinal01 2018-11-01 OKC CHO 2 0 0 0 25 17.1 2018-19 false 2019
## 6 achiupr01 2021-01-12 MIA PHI 13 3 0 1 17 16.9 2020-21 false 2021
## GameIndex GmScMovingZ GmScMovingZTop2Delta Date2 GmSc2 GmScMovingZ2
## 1 181 4.13 0.24 1991-12-04 18.6 3.89
## 2 64 3.82 0.64 1995-12-07 40.1 3.18
## 3 58 4.11 1.67 1998-01-14 16.9 2.44
## 4 386 4.06 0.84 2003-11-28 34.3 3.22
## 5 160 3.37 0.18 2018-11-30 16.6 3.19
## 6 8 2.58 0.05 2021-02-28 16.8 2.53
# Average points (PTS) for each team code (Tm); missing values are ignored.
group1 <- summarise(
  group_by(nba_data, Tm),
  mean_pts = mean(PTS, na.rm = TRUE),
  .groups = "drop"
)

# Bar chart of the team averages, highest average first. The x-axis labels
# are rotated 90 degrees so the team codes do not overlap.
ggplot(
  data = group1,
  mapping = aes(x = reorder(Tm, -mean_pts), y = mean_pts, fill = Tm)
) +
  geom_col() +
  labs(
    title = "Average Points per Team",
    x = "Team",
    y = "Average Points"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Average assists (AST) for each Season value; missing values are ignored.
group2 <- summarise(
  group_by(nba_data, Season),
  mean_ast = mean(AST, na.rm = TRUE),
  .groups = "drop"
)

# Line chart of the seasonal averages. group = 1 puts every season into a
# single series so that one continuous line is drawn across the x axis.
ggplot(data = group2, mapping = aes(x = Season, y = mean_ast)) +
  geom_line(color = "blue", group = 1) +
  labs(
    title = "Average Assists per Season",
    x = "Season",
    y = "Average Assists"
  ) +
  theme_minimal()
# Total rebounds (TRB) for each value of the Playoffs flag; missing values
# are ignored.
group3 <- nba_data |>
  group_by(Playoffs) |>
  summarise(total_rebounds = sum(TRB, na.rm = TRUE), .groups = "drop")

# Bar chart of the totals, one bar per playoff status.
# The fill legend is labelled explicitly: without `fill = "Playoffs"` ggplot
# would title the legend with the raw expression "as.factor(Playoffs)".
ggplot(
  group3,
  aes(
    x = as.factor(Playoffs),
    y = total_rebounds,
    fill = as.factor(Playoffs)
  )
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Total Rebounds by Playoff Status",
    x = "Playoffs",
    y = "Total Rebounds",
    fill = "Playoffs"
  )
# Turn the per-team averages into shares of their overall sum (used here as
# each team's "probability"), and tag the team(s) holding the smallest share.
prob_group <- mutate(
  group1,
  prob = mean_pts / sum(mean_pts),
  tag = ifelse(prob == min(prob), "Lowest Probability", "Normal")
)

# Show only the row(s) tagged as lowest probability
filter(prob_group, tag == "Lowest Probability")
## # A tibble: 1 × 4
## Tm mean_pts prob tag
## <chr> <dbl> <dbl> <chr>
## 1 NOK 14.7 0.0152 Lowest Probability
# Team / playoff-status pairs that actually occur in the data
team_playoff_combinations <- unique(select(nba_data, Tm, Playoffs))
# Every pair that could occur: the cross product of the observed team codes
# and the observed playoff flags
all_teams <- unique(nba_data[["Tm"]])
all_playoffs <- unique(nba_data[["Playoffs"]])
expected_combinations <- expand.grid(Tm = all_teams, Playoffs = all_playoffs)
# Keep the possible pairs that were never observed; the join keys are
# inferred from the shared column names
missing_combinations <- expected_combinations |>
  anti_join(team_playoff_combinations)
## Joining with `by = join_by(Tm, Playoffs)`
# Show the pairs that never occur
missing_combinations
## Tm Playoffs
## 1 OKC true
## 2 ORL true
## 3 WSB true
## 4 LAC true
## 5 IND true
## 6 NJN true
## 7 PHI true
## 8 VAN true
## 9 TOR true
## 10 CHA true
## 11 CHI true
## 12 CHO true
## 13 CHH true
## 14 NOK true
# Mean points for playoff rows and for non-playoff rows.
# The Playoffs column holds the text labels "true" / "false", so comparing it
# with 1 or 0 matches no rows and makes both means NaN. The flag is therefore
# normalised to trimmed lower-case text (which also covers a logical column)
# and compared against the labels.
playoff_teams <- nba_data |>
  filter(trimws(tolower(as.character(Playoffs))) == "true") |>
  summarise(mean_pts = mean(PTS, na.rm = TRUE))
non_playoff_teams <- nba_data |>
  filter(trimws(tolower(as.character(Playoffs))) == "false") |>
  summarise(mean_pts = mean(PTS, na.rm = TRUE))
# Two-sample t-test of PTS between the two Playoffs groups; var.equal = TRUE
# requests the pooled-variance form of the test
t_test_result <- t.test(nba_data$PTS ~ nba_data$Playoffs, var.equal = TRUE)
# Display the two subgroup means together with the test result
list(
  playoff_teams = playoff_teams,
  non_playoff_teams = non_playoff_teams,
  t_test_result = t_test_result
)
## $playoff_teams
## mean_pts
## 1 NaN
##
## $non_playoff_teams
## mean_pts
## 1 NaN
##
## $t_test_result
##
## Two Sample t-test
##
## data: nba_data$PTS by nba_data$Playoffs
## t = -4.7158, df = 1701, p-value = 0.000002604
## alternative hypothesis: true difference in means between group false and group true is not equal to 0
## 95 percent confidence interval:
## -10.017957 -4.132572
## sample estimates:
## mean in group false mean in group true
## 25.86224 32.93750
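This notebook provides a structured analysis of the NBA dataset: average points per team, average assists per season, total rebounds by playoff status, each team's share of the summed team scoring averages, team–playoff combinations that never occur in the data, and a two-sample t-test comparing points between playoff and non-playoff games.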