Load Libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(car)  # For VIF analysis
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode

Load Dataset

# Read the NBA CSV into a tibble; readr guesses the column types.
# NOTE(review): absolute, machine-specific path - confirm before running elsewhere.
nba_data <- readr::read_csv(file = "C:/Statistics/nba.csv")
## Rows: 1703 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): bbrID, Tm, Opp, Season
## dbl  (12): TRB, AST, STL, BLK, PTS, GmSc, Year, GameIndex, GmScMovingZ, GmSc...
## lgl   (1): Playoffs
## date  (2): Date, Date2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the structure of the loaded data: column names, types, and leading values
str(nba_data)
## spc_tbl_ [1,703 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ bbrID               : chr [1:1703] "abdelal01" "abdulma02" "abdulta01" "abdursh01" ...
##  $ Date                : Date[1:1703], format: "1993-03-16" "1991-04-02" ...
##  $ Tm                  : chr [1:1703] "BOS" "DEN" "SAC" "ATL" ...
##  $ Opp                 : chr [1:1703] "GSW" "DAL" "VAN" "DET" ...
##  $ TRB                 : num [1:1703] 10 2 2 12 2 13 10 14 2 10 ...
##  $ AST                 : num [1:1703] 2 6 3 5 0 3 1 1 8 3 ...
##  $ STL                 : num [1:1703] 0 4 1 2 0 0 0 1 5 1 ...
##  $ BLK                 : num [1:1703] 0 0 0 1 0 1 0 0 0 3 ...
##  $ PTS                 : num [1:1703] 25 30 31 50 25 17 18 19 31 17 ...
##  $ GmSc                : num [1:1703] 22.7 29.7 26.4 46 17.1 16.9 19.2 20.7 33.2 20.6 ...
##  $ Season              : chr [1:1703] "1992-93" "1990-91" "1997-98" "2001-02" ...
##  $ Playoffs            : logi [1:1703] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Year                : num [1:1703] 1993 1991 1998 2002 2019 ...
##  $ GameIndex           : num [1:1703] 181 64 58 386 160 8 236 124 100 4 ...
##  $ GmScMovingZ         : num [1:1703] 4.13 3.82 4.11 4.06 3.37 2.58 4.27 4.15 3.16 4.68 ...
##  $ GmScMovingZTop2Delta: num [1:1703] 0.24 0.64 1.67 0.84 0.18 0.05 0.02 0.93 0.22 1.16 ...
##  $ Date2               : Date[1:1703], format: "1991-12-04" "1995-12-07" ...
##  $ GmSc2               : num [1:1703] 18.6 40.1 16.9 34.3 16.6 16.8 19.6 18.5 42.3 29.5 ...
##  $ GmScMovingZ2        : num [1:1703] 3.89 3.18 2.44 3.22 3.19 2.53 4.25 3.22 2.94 3.52 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   bbrID = col_character(),
##   ..   Date = col_date(format = ""),
##   ..   Tm = col_character(),
##   ..   Opp = col_character(),
##   ..   TRB = col_double(),
##   ..   AST = col_double(),
##   ..   STL = col_double(),
##   ..   BLK = col_double(),
##   ..   PTS = col_double(),
##   ..   GmSc = col_double(),
##   ..   Season = col_character(),
##   ..   Playoffs = col_logical(),
##   ..   Year = col_double(),
##   ..   GameIndex = col_double(),
##   ..   GmScMovingZ = col_double(),
##   ..   GmScMovingZTop2Delta = col_double(),
##   ..   Date2 = col_date(format = ""),
##   ..   GmSc2 = col_double(),
##   ..   GmScMovingZ2 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary statistics of the loaded data
summary(nba_data)
##     bbrID                Date                 Tm                Opp           
##  Length:1703        Min.   :1984-12-11   Length:1703        Length:1703       
##  Class :character   1st Qu.:1998-03-22   Class :character   Class :character  
##  Mode  :character   Median :2008-04-02   Mode  :character   Mode  :character  
##                     Mean   :2007-04-20                                        
##                     3rd Qu.:2016-12-27                                        
##                     Max.   :2022-05-20                                        
##       TRB             AST             STL              BLK        
##  Min.   : 0.00   Min.   : 0.00   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.: 4.00   1st Qu.: 1.00   1st Qu.: 1.000   1st Qu.:0.0000  
##  Median : 7.00   Median : 3.00   Median : 1.000   Median :0.0000  
##  Mean   : 7.37   Mean   : 3.74   Mean   : 1.669   Mean   :0.8949  
##  3rd Qu.:10.00   3rd Qu.: 5.00   3rd Qu.: 2.000   3rd Qu.:1.0000  
##  Max.   :29.00   Max.   :22.00   Max.   :10.000   Max.   :9.0000  
##       PTS             GmSc          Season           Playoffs      
##  Min.   : 4.00   Min.   : 6.40   Length:1703        Mode :logical  
##  1st Qu.:19.00   1st Qu.:18.90   Class :character   FALSE:1655     
##  Median :24.00   Median :24.10   Mode  :character   TRUE :48       
##  Mean   :26.06   Mean   :25.14                                     
##  3rd Qu.:32.00   3rd Qu.:30.10                                     
##  Max.   :81.00   Max.   :64.60                                     
##       Year        GameIndex       GmScMovingZ    GmScMovingZTop2Delta
##  Min.   :1985   Min.   :   0.0   Min.   :2.170   Min.   :0.0000      
##  1st Qu.:1998   1st Qu.:  70.0   1st Qu.:3.240   1st Qu.:0.1500      
##  Median :2008   Median : 148.0   Median :3.630   Median :0.3500      
##  Mean   :2007   Mean   : 251.1   Mean   :3.691   Mean   :0.5057      
##  3rd Qu.:2017   3rd Qu.: 369.0   3rd Qu.:4.050   3rd Qu.:0.7050      
##  Max.   :2022   Max.   :1592.0   Max.   :6.750   Max.   :3.7300      
##      Date2                GmSc2        GmScMovingZ2  
##  Min.   :1984-11-21   Min.   : 5.30   Min.   :1.840  
##  1st Qu.:1998-02-14   1st Qu.:16.90   1st Qu.:2.860  
##  Median :2008-02-27   Median :21.60   Median :3.170  
##  Mean   :2007-03-14   Mean   :22.68   Mean   :3.185  
##  3rd Qu.:2016-04-10   3rd Qu.:27.40   3rd Qu.:3.485  
##  Max.   :2022-05-03   Max.   :53.80   Max.   :5.110

Data Cleaning

# Treat Playoffs (logical) and Season (character) as categorical variables
nba_data[["Playoffs"]] <- factor(nba_data[["Playoffs"]])
nba_data[["Season"]] <- factor(nba_data[["Season"]])

Exploratory Data Analysis

# Distribution of PTS, drawn as a 30-bin histogram
ggplot(data = nba_data) +
  geom_histogram(mapping = aes(x = PTS), fill = "blue", alpha = 0.7, bins = 30) +
  ggtitle("Distribution of Points per Game") +
  xlab("Points") +
  ylab("Frequency")

# Compare the PTS distribution between the two Playoffs levels
ggplot(data = nba_data, mapping = aes(x = Playoffs, y = PTS)) +
  geom_boxplot(mapping = aes(fill = Playoffs)) +
  ggtitle("Points by Playoff Status") +
  xlab("Playoffs") +
  ylab("Points")

Checking for Multicollinearity

# Regress PTS on AST, TRB and STL, then compute a variance inflation factor
# for each predictor to check for multicollinearity.
vif_model <- lm(formula = PTS ~ AST + TRB + STL, data = nba_data)
car::vif(vif_model)
##      AST      TRB      STL 
## 1.087890 1.024103 1.063847

Linear Regression Model

# Ordinary least squares fit of PTS on AST, TRB and STL, followed by the
# coefficient table and overall fit statistics.
lm_model <- lm(formula = PTS ~ AST + TRB + STL, data = nba_data)
summary(lm_model)
## 
## Call:
## lm(formula = PTS ~ AST + TRB + STL, data = nba_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.611  -7.066  -1.684   5.606  56.387 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 21.91659    0.62604  35.008  < 2e-16 ***
## AST          0.58152    0.07798   7.457 1.40e-13 ***
## TRB          0.27694    0.05575   4.968 7.45e-07 ***
## STL         -0.04264    0.17259  -0.247    0.805    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.11 on 1699 degrees of freedom
## Multiple R-squared:  0.04118,    Adjusted R-squared:  0.03948 
## F-statistic: 24.32 on 3 and 1699 DF,  p-value: 2.077e-15

Model Diagnostics

# Show the standard lm diagnostic plots in a 2 x 2 grid. par() returns the
# previous settings, which are restored afterwards so that later plots are not
# forced into the same grid.
old_par <- par(mfrow = c(2, 2))  # Arrange plots
plot(lm_model)  # Residuals vs Fitted, Q-Q Plot, Scale-Location, Residuals vs Leverage
par(old_par)

Interpretation of Coefficients

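Each coefficient represents the expected change in points per game (PTS) for a one-unit increase in the predictor, holding the other predictors constant:

- AST (0.58): each additional assist is associated with about 0.58 more points (p < 0.001).
- TRB (0.28): each additional rebound is associated with about 0.28 more points (p < 0.001).
- STL (-0.04): the estimated effect of steals is close to zero and not statistically significant (p = 0.805).
- Intercept (21.92): the expected points when AST, TRB, and STL are all zero.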

Confidence Interval for a Coefficient

# 95% confidence interval for the AST slope of the fitted model
confint(object = lm_model, parm = "AST", level = 0.95)
##         2.5 %   97.5 %
## AST 0.4285716 0.734478

Interpretation

The 95% confidence interval for the AST coefficient (about 0.43 to 0.73) is the range of plausible values for the true effect of assists on points per game, holding the other predictors constant. Because the interval does not include zero, the effect is statistically significant at the 5% level.

Conclusion

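Assists and rebounds are statistically significant predictors of points, but the model explains only about 4% of the variance in PTS (R-squared = 0.041), so these predictors alone have little predictive power. Further improvements could include testing for interaction effects or using alternative model evaluation techniques like cross-validation.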