Loading data

# Read the NBA seasons CSV from its local path into a data frame.
player_data <- read.csv(
  file = "C:/Users/rohan/OneDrive/Desktop/INTRO TO STATISTICS IN R/DATA SETS/Datasets/Data/Nba_all_seasons_1996_2021.csv"
)

# Show a per-column summary of the loaded data.
summary(player_data)
##        X         player_name        team_abbreviation       age       
##  Min.   :    0   Length:12305       Length:12305       Min.   :18.00  
##  1st Qu.: 3076   Class :character   Class :character   1st Qu.:24.00  
##  Median : 6152   Mode  :character   Mode  :character   Median :26.00  
##  Mean   : 6152                                         Mean   :27.08  
##  3rd Qu.: 9228                                         3rd Qu.:30.00  
##  Max.   :12304                                         Max.   :44.00  
##  player_height   player_weight      college            country         
##  Min.   :160.0   Min.   : 60.33   Length:12305       Length:12305      
##  1st Qu.:193.0   1st Qu.: 90.72   Class :character   Class :character  
##  Median :200.7   Median : 99.79   Mode  :character   Mode  :character  
##  Mean   :200.6   Mean   :100.37                                        
##  3rd Qu.:208.3   3rd Qu.:108.86                                        
##  Max.   :231.1   Max.   :163.29                                        
##   draft_year        draft_round        draft_number             gp       
##  Length:12305       Length:12305       Length:12305       Min.   : 1.00  
##  Class :character   Class :character   Class :character   1st Qu.:31.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :57.00  
##                                                           Mean   :51.29  
##                                                           3rd Qu.:73.00  
##                                                           Max.   :85.00  
##       pts              reb              ast           net_rating      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :-250.000  
##  1st Qu.: 3.600   1st Qu.: 1.800   1st Qu.: 0.600   1st Qu.:  -6.400  
##  Median : 6.700   Median : 3.000   Median : 1.200   Median :  -1.300  
##  Mean   : 8.173   Mean   : 3.559   Mean   : 1.814   Mean   :  -2.256  
##  3rd Qu.:11.500   3rd Qu.: 4.700   3rd Qu.: 2.400   3rd Qu.:   3.200  
##  Max.   :36.100   Max.   :16.300   Max.   :11.700   Max.   : 300.000  
##     oreb_pct          dreb_pct        usg_pct           ts_pct      
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.02100   1st Qu.:0.096   1st Qu.:0.1490   1st Qu.:0.4800  
##  Median :0.04100   Median :0.131   Median :0.1810   Median :0.5240  
##  Mean   :0.05447   Mean   :0.141   Mean   :0.1849   Mean   :0.5111  
##  3rd Qu.:0.08400   3rd Qu.:0.180   3rd Qu.:0.2170   3rd Qu.:0.5610  
##  Max.   :1.00000   Max.   :1.000   Max.   :1.0000   Max.   :1.5000  
##     ast_pct          season         
##  Min.   :0.0000   Length:12305      
##  1st Qu.:0.0660   Class :character  
##  Median :0.1030   Mode  :character  
##  Mean   :0.1314                     
##  3rd Qu.:0.1780                     
##  Max.   :1.0000

Generalised Linear Model

# Load necessary libraries
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.2
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Response variable: the column the model will predict.
response_variable <- "pts"

# Explanatory variables: the columns used as predictors.
explanatory_variables <- c(
  "age", "player_height", "player_weight", "gp",
  "reb", "ast", "usg_pct", "ts_pct"
)

# Keep only the response and explanatory columns (response first).
# The column names live in character vectors, so they are wrapped in
# all_of(): passing a bare external vector to select() is deprecated since
# tidyselect 1.1.0 and triggers a deprecation warning.
subset_data <- player_data %>%
  select(all_of(c(response_variable, explanatory_variables)))
# Drop every row with a missing value in any selected column, so the model is
# fitted and evaluated on complete rows only.
subset_data <- na.omit(subset_data)

# Stop early when fewer than two complete rows remain.
if (nrow(subset_data) < 2) {
  stop("Insufficient data points for the response variable.")
}

# Split the data into training and testing sets.
set.seed(123)  # fixed seed so the partition is reproducible
train_index <- createDataPartition(
  subset_data[[response_variable]],
  p = 0.8,  # proportion of rows requested for the training set
  list = FALSE
)
train_data <- subset_data[train_index, ]
test_data <- subset_data[-train_index, ]

# Fit a Gaussian GLM of the response on all remaining columns.
# reformulate() builds "<response> ~ ." as a real formula object, rather than
# handing glm() a pasted string and relying on implicit coercion.
glm_model <- glm(
  reformulate(".", response = response_variable),
  data = train_data,
  family = gaussian
)

# Predict the response for the held-out test rows.
predictions <- predict(glm_model, newdata = test_data)

# Evaluate the model: mean squared difference between observed and predicted
# values on the test set.
mse <- mean((test_data[[response_variable]] - predictions)^2)
print(paste("Mean Squared Error (MSE):", mse))
## [1] "Mean Squared Error (MSE): 7.06216455532447"

The explanatory variables chosen are age, player height, player weight, games played, rebounds, assists, usage percentage, and true shooting percentage. The response variable is pts (points scored per game). Depending on your analysis objectives, you can change response_variable and explanatory_variables.

Diagnosing the model

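To check the explanatory variables for multicollinearity, we compute their pairwise correlation matrix. The numerical columns to include are the ones listed in explanatory_variables; the dataset is subset so that only these columns remain, and the cor() function then calculates the correlation coefficient between each pair of variables. Coefficients close to 1 or -1 (for example, about 0.83 between player_height and player_weight) indicate predictors that carry overlapping information, which makes their individual coefficients less stable.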

# Pairwise correlations between the explanatory variables only; the response
# column is excluded by selecting just the explanatory columns.
correlation_matrix <- cor(
  subset_data[, explanatory_variables, drop = FALSE]
)

# Print a heading, then the matrix itself.
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(correlation_matrix)
##                        age player_height player_weight          gp        reb
## age            1.000000000  -0.008954385    0.05854449 0.055217522 0.03562267
## player_height -0.008954385   1.000000000    0.82542135 0.002202118 0.42327549
## player_weight  0.058544493   0.825421354    1.00000000 0.019651800 0.43792982
## gp             0.055217522   0.002202118    0.01965180 1.000000000 0.47094829
## reb            0.035622669   0.423275490    0.43792982 0.470948288 1.00000000
## ast            0.090208519  -0.449033300   -0.37878377 0.385836033 0.24101766
## usg_pct       -0.119416223  -0.104713789   -0.06738550 0.146806004 0.23055212
## ts_pct         0.025175472   0.072154033    0.06814165 0.375816016 0.31456949
##                       ast    usg_pct     ts_pct
## age            0.09020852 -0.1194162 0.02517547
## player_height -0.44903330 -0.1047138 0.07215403
## player_weight -0.37878377 -0.0673855 0.06814165
## gp             0.38583603  0.1468060 0.37581602
## reb            0.24101766  0.2305521 0.31456949
## ast            1.00000000  0.3925326 0.17608334
## usg_pct        0.39253260  1.0000000 0.12336769
## ts_pct         0.17608334  0.1233677 1.00000000

Interpreting the coefficients

# This step expects the data set to be loaded already as `player_data`.

# Logistic regression with a single predictor: the outcome is TRUE when
# draft_round equals "1" and FALSE otherwise, modelled as a function of pts.
logistic_model <- glm(
  formula = draft_round == "1" ~ pts,
  family = binomial,
  data = player_data
)

# Show the coefficient table and fit statistics.
print(summary(logistic_model))
## 
## Call:
## glm(formula = draft_round == "1" ~ pts, family = binomial, data = player_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.938031   0.035436  -26.47   <2e-16 ***
## pts          0.165192   0.004324   38.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16783  on 12304  degrees of freedom
## Residual deviance: 14808  on 12303  degrees of freedom
## AIC: 14812
## 
## Number of Fisher Scoring iterations: 4
# Pull the fitted coefficient for `pts` out of the model (kept as a named
# element) and print it with a label.
pts_coefficient <- coefficients(logistic_model)["pts"]
print(paste("Coefficient for pts:", pts_coefficient, sep = " "))
## [1] "Coefficient for pts: 0.165191797078324"

Here we model whether a player was selected in the first round of the draft (draft_round == "1") as a function of their points per game (pts). Because pts is the only predictor in this model, its coefficient shows how the log-odds of having been drafted in the first round change with each additional point per game. The estimate of about 0.165 means that each extra point per game multiplies the odds of being a first-round pick by roughly exp(0.165) ≈ 1.18.

The interpretation of this coefficient is specific to the fitted logistic regression model and the dataset used. A positive coefficient means that a player's likelihood of having been selected in the first round increases with their points per game; a negative coefficient would imply the opposite. The coefficient's magnitude reflects the strength of the association between points scored per game and the chance of being selected in the first round, measured on the log-odds scale.