# Load the NBA game log from a local CSV file into `data1`, then preview
# the first six rows.
# NOTE(review): the path is an absolute Windows location, so the script only
# runs on a machine that has the file at exactly this place.
data1 <- read.csv(file = "C:\\Statistics\\nba.csv")
head(data1, n = 6L)
## bbrID Date Tm Opp TRB AST STL BLK PTS GmSc Season Playoffs Year
## 1 abdelal01 1993-03-16 BOS GSW 10 2 0 0 25 22.7 1992-93 false 1993
## 2 abdulma02 1991-04-02 DEN DAL 2 6 4 0 30 29.7 1990-91 false 1991
## 3 abdulta01 1998-04-19 SAC VAN 2 3 1 0 31 26.4 1997-98 false 1998
## 4 abdursh01 2001-11-23 ATL DET 12 5 2 1 50 46.0 2001-02 false 2002
## 5 abrinal01 2018-11-01 OKC CHO 2 0 0 0 25 17.1 2018-19 false 2019
## 6 achiupr01 2021-01-12 MIA PHI 13 3 0 1 17 16.9 2020-21 false 2021
## GameIndex GmScMovingZ GmScMovingZTop2Delta Date2 GmSc2 GmScMovingZ2
## 1 181 4.13 0.24 1991-12-04 18.6 3.89
## 2 64 3.82 0.64 1995-12-07 40.1 3.18
## 3 58 4.11 1.67 1998-01-14 16.9 2.44
## 4 386 4.06 0.84 2003-11-28 34.3 3.22
## 5 160 3.37 0.18 2018-11-30 16.6 3.19
## 6 8 2.58 0.05 2021-02-28 16.8 2.53
# Categorical example: number of rows recorded for each team code in `Tm`.
# NOTE(review): this needs dplyr attached; library(tidyverse) only appears
# further down in this file, so confirm it is loaded before this point.
data1 %>%
  group_by(Tm) %>%
  tally(name = "Count")
## # A tibble: 38 × 2
## Tm Count
## <chr> <int>
## 1 ATL 62
## 2 BOS 61
## 3 BRK 24
## 4 CHA 17
## 5 CHH 20
## 6 CHI 50
## 7 CHO 14
## 8 CLE 68
## 9 DAL 53
## 10 DEN 57
## # ℹ 28 more rows
# Numeric example: five-number summary plus mean of the points column `PTS`.
summary(data1[["PTS"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 19.00 24.00 26.06 32.00 81.00
# Additional statistics: quartiles of `PTS` (minimum, 25%, median, 75%,
# maximum).
quantile(data1[["PTS"]], probs = c(0, 0.25, 0.5, 0.75, 1))
## 0% 25% 50% 75% 100%
## 4 19 24 32 81
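# Set of novel questions
# What are the top 5 teams with the highest average points per game?
# Is there a relationship between players' height and their average points scored?
# How does the distribution of salaries vary across different positions?
# Note: the columns shown by head(data1) include no height, salary, or position
# fields, so the last two questions would need additional data.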
# Rank teams by their mean `PTS` (missing values ignored) and keep the five
# highest; the result is stored in `top_teams` and printed.
top_teams <- data1 %>%
  group_by(Tm) %>%
  summarise(AveragePoints = mean(PTS, na.rm = TRUE)) %>%
  arrange(-AveragePoints) %>%
  slice_head(n = 5)
print(top_teams)
## # A tibble: 5 × 2
## Tm AveragePoints
## <chr> <dbl>
## 1 PHO 29.9
## 2 NOP 28.9
## 3 CHI 28.5
## 4 HOU 28.4
## 5 POR 27.8
# Distribution visualization: histogram of `PTS` using 5-point-wide bins,
# blue bars with black outlines.
ggplot(data = data1, mapping = aes(x = PTS)) +
  geom_histogram(binwidth = 5, fill = "blue", colour = "black") +
  labs(title = "Distribution of Points")
# The histogram of points provides the following insights:
# The majority of values fall within a specific range of points, likely around 15–25
# (the summary above reports a median of 24 and quartiles of 19 and 32).
# This range represents the typical performance level.
# Performance analysis: the shape of the histogram helps in understanding whether scoring is typically consistent or varies.
# Do any specific player or team attributes correlate with high or low scoring ranges?
# Are the outliers associated with specific players, teams, or time periods?
``` r
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors