library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(car) # For VIF analysis
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the NBA game data from CSV into a tibble.
# NOTE(review): absolute Windows path; it only resolves on a machine that has
# this exact directory layout.
nba_data <- readr::read_csv(file = "C:/Statistics/nba.csv")
## Rows: 1703 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): bbrID, Tm, Opp, Season
## dbl (12): TRB, AST, STL, BLK, PTS, GmSc, Year, GameIndex, GmScMovingZ, GmSc...
## lgl (1): Playoffs
## date (2): Date, Date2
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show each column's type together with its first few values.
utils::str(object = nba_data)
## spc_tbl_ [1,703 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ bbrID : chr [1:1703] "abdelal01" "abdulma02" "abdulta01" "abdursh01" ...
## $ Date : Date[1:1703], format: "1993-03-16" "1991-04-02" ...
## $ Tm : chr [1:1703] "BOS" "DEN" "SAC" "ATL" ...
## $ Opp : chr [1:1703] "GSW" "DAL" "VAN" "DET" ...
## $ TRB : num [1:1703] 10 2 2 12 2 13 10 14 2 10 ...
## $ AST : num [1:1703] 2 6 3 5 0 3 1 1 8 3 ...
## $ STL : num [1:1703] 0 4 1 2 0 0 0 1 5 1 ...
## $ BLK : num [1:1703] 0 0 0 1 0 1 0 0 0 3 ...
## $ PTS : num [1:1703] 25 30 31 50 25 17 18 19 31 17 ...
## $ GmSc : num [1:1703] 22.7 29.7 26.4 46 17.1 16.9 19.2 20.7 33.2 20.6 ...
## $ Season : chr [1:1703] "1992-93" "1990-91" "1997-98" "2001-02" ...
## $ Playoffs : logi [1:1703] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Year : num [1:1703] 1993 1991 1998 2002 2019 ...
## $ GameIndex : num [1:1703] 181 64 58 386 160 8 236 124 100 4 ...
## $ GmScMovingZ : num [1:1703] 4.13 3.82 4.11 4.06 3.37 2.58 4.27 4.15 3.16 4.68 ...
## $ GmScMovingZTop2Delta: num [1:1703] 0.24 0.64 1.67 0.84 0.18 0.05 0.02 0.93 0.22 1.16 ...
## $ Date2 : Date[1:1703], format: "1991-12-04" "1995-12-07" ...
## $ GmSc2 : num [1:1703] 18.6 40.1 16.9 34.3 16.6 16.8 19.6 18.5 42.3 29.5 ...
## $ GmScMovingZ2 : num [1:1703] 3.89 3.18 2.44 3.22 3.19 2.53 4.25 3.22 2.94 3.52 ...
## - attr(*, "spec")=
## .. cols(
## .. bbrID = col_character(),
## .. Date = col_date(format = ""),
## .. Tm = col_character(),
## .. Opp = col_character(),
## .. TRB = col_double(),
## .. AST = col_double(),
## .. STL = col_double(),
## .. BLK = col_double(),
## .. PTS = col_double(),
## .. GmSc = col_double(),
## .. Season = col_character(),
## .. Playoffs = col_logical(),
## .. Year = col_double(),
## .. GameIndex = col_double(),
## .. GmScMovingZ = col_double(),
## .. GmScMovingZTop2Delta = col_double(),
## .. Date2 = col_date(format = ""),
## .. GmSc2 = col_double(),
## .. GmScMovingZ2 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary statistics for the whole data set.
summary(object = nba_data)
## bbrID Date Tm Opp
## Length:1703 Min. :1984-12-11 Length:1703 Length:1703
## Class :character 1st Qu.:1998-03-22 Class :character Class :character
## Mode :character Median :2008-04-02 Mode :character Mode :character
## Mean :2007-04-20
## 3rd Qu.:2016-12-27
## Max. :2022-05-20
## TRB AST STL BLK
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. :0.0000
## 1st Qu.: 4.00 1st Qu.: 1.00 1st Qu.: 1.000 1st Qu.:0.0000
## Median : 7.00 Median : 3.00 Median : 1.000 Median :0.0000
## Mean : 7.37 Mean : 3.74 Mean : 1.669 Mean :0.8949
## 3rd Qu.:10.00 3rd Qu.: 5.00 3rd Qu.: 2.000 3rd Qu.:1.0000
## Max. :29.00 Max. :22.00 Max. :10.000 Max. :9.0000
## PTS GmSc Season Playoffs
## Min. : 4.00 Min. : 6.40 Length:1703 Mode :logical
## 1st Qu.:19.00 1st Qu.:18.90 Class :character FALSE:1655
## Median :24.00 Median :24.10 Mode :character TRUE :48
## Mean :26.06 Mean :25.14
## 3rd Qu.:32.00 3rd Qu.:30.10
## Max. :81.00 Max. :64.60
## Year GameIndex GmScMovingZ GmScMovingZTop2Delta
## Min. :1985 Min. : 0.0 Min. :2.170 Min. :0.0000
## 1st Qu.:1998 1st Qu.: 70.0 1st Qu.:3.240 1st Qu.:0.1500
## Median :2008 Median : 148.0 Median :3.630 Median :0.3500
## Mean :2007 Mean : 251.1 Mean :3.691 Mean :0.5057
## 3rd Qu.:2017 3rd Qu.: 369.0 3rd Qu.:4.050 3rd Qu.:0.7050
## Max. :2022 Max. :1592.0 Max. :6.750 Max. :3.7300
## Date2 GmSc2 GmScMovingZ2
## Min. :1984-11-21 Min. : 5.30 Min. :1.840
## 1st Qu.:1998-02-14 1st Qu.:16.90 1st Qu.:2.860
## Median :2008-02-27 Median :21.60 Median :3.170
## Mean :2007-03-14 Mean :22.68 Mean :3.185
## 3rd Qu.:2016-04-10 3rd Qu.:27.40 3rd Qu.:3.485
## Max. :2022-05-03 Max. :53.80 Max. :5.110
# Recode the Playoffs flag and the Season label as factors so that they are
# treated as categorical variables in plots and models.
nba_data[["Playoffs"]] <- as.factor(nba_data[["Playoffs"]])
nba_data[["Season"]] <- as.factor(nba_data[["Season"]])
# Distribution of PTS drawn as a 30-bin histogram.
ggplot(data = nba_data, mapping = aes(x = PTS)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  ggtitle("Distribution of Points per Game") +
  xlab("Points") +
  ylab("Frequency")
# Compare the PTS distribution across the two levels of Playoffs, one box per
# level, with the fill colour keyed to the same variable.
ggplot(
  data = nba_data,
  mapping = aes(x = Playoffs, y = PTS, fill = Playoffs)
) +
  geom_boxplot() +
  ggtitle("Points by Playoff Status") +
  xlab("Playoffs") +
  ylab("Points")
# Regress PTS on AST, TRB and STL, then report the variance inflation factor
# of each predictor as a multicollinearity check.
vif_model <- lm(formula = PTS ~ AST + TRB + STL, data = nba_data)
car::vif(vif_model)
## AST TRB STL
## 1.087890 1.024103 1.063847
# Linear regression of PTS on AST, TRB and STL; print the coefficient table
# and the overall fit statistics.
lm_model <- lm(formula = PTS ~ AST + TRB + STL, data = nba_data)
summary(object = lm_model)
##
## Call:
## lm(formula = PTS ~ AST + TRB + STL, data = nba_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.611 -7.066 -1.684 5.606 56.387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.91659 0.62604 35.008 < 2e-16 ***
## AST 0.58152 0.07798 7.457 1.40e-13 ***
## TRB 0.27694 0.05575 4.968 7.45e-07 ***
## STL -0.04264 0.17259 -0.247 0.805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.11 on 1699 degrees of freedom
## Multiple R-squared: 0.04118, Adjusted R-squared: 0.03948
## F-statistic: 24.32 on 3 and 1699 DF, p-value: 2.077e-15
# Draw the standard lm diagnostic plots (Residuals vs Fitted, Q-Q Plot,
# Scale-Location, Residuals vs Leverage) in a 2 x 2 grid.
# par() returns the previous settings when it changes them; keep those and put
# them back afterwards so later plots are not forced into the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(lm_model)
par(old_par)
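Each coefficient represents the expected change in points per game (PTS) for a one-unit increase in that predictor, holding the other predictors constant: each additional assist (AST) is associated with about 0.58 more points, each additional rebound (TRB) with about 0.28 more points, and steals (STL) show no statistically significant association with points (estimate -0.04, p = 0.805).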
# 95% confidence interval for the AST slope of the fitted model
confint(object = lm_model, parm = "AST", level = 0.95)
## 2.5 % 97.5 %
## AST 0.4285716 0.734478
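The 95% confidence interval for the AST coefficient gives the range of values that, at the 95% confidence level, is consistent with the true association between assists and points per game. If the interval does not include zero, the association is statistically significant at the 5% level. Here the interval runs from about 0.43 to 0.73 and excludes zero, so the positive association between assists and points is statistically significant.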
Further improvements could include testing for interaction effects or using alternative model evaluation techniques like cross-validation.