# Load necessary libraries
library(readr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the team-level table from CSV (path is relative to the working
# directory) and print its column types as a sanity check.
nbaStats <- read.csv(file = "NBA_Stats202324.csv")
str(object = nbaStats)
## 'data.frame': 30 obs. of 19 variables:
## $ TEAM : chr "Washington" "Atlanta" "Utah" "Indiana" ...
## $ PPG : num 114 118 116 123 110 ...
## $ oPPG : num 123 120 120 120 119 ...
## $ pDIFF: num -9.3 -2.2 -4.8 3.1 -9.1 -6.4 -6.5 0.6 -10.2 2.6 ...
## $ PACE : num 102.6 100 99.4 101.7 99.8 ...
## $ oEFF : num 110 117 116 121 110 ...
## $ dEFF : num 120 119 120 118 119 ...
## $ eDIFF: num -9.1 -2.1 -4.9 3 -9.1 -6.4 -6.4 0.6 -10.5 2.6 ...
## $ SOS : num 1.06 -0.61 0.38 0.15 0.79 0.36 0.5 -0.08 0.86 -0.44 ...
## $ SAR : num -8.04 -2.71 -4.52 3.15 -8.31 -6.04 -5.9 0.52 -9.64 2.16 ...
## $ CONS : num 12.5 14.7 15.9 15.2 12 15.7 14.4 15 14.4 16.1 ...
## $ A4F : num -0.052 -0.05 0.004 0.012 -0.01 -0.045 -0.015 0.026 -0.068 0.05 ...
## $ W : int 15 36 31 47 14 25 22 47 21 49 ...
## $ L : int 67 46 51 35 68 57 60 35 61 33 ...
## $ WIN. : num 0.183 0.439 0.378 0.573 0.171 0.305 0.268 0.573 0.256 0.598 ...
## $ eWIN.: num 0.234 0.443 0.379 0.578 0.224 0.342 0.329 0.516 0.233 0.564 ...
## $ pWIN.: num 0.194 0.428 0.342 0.602 0.2 0.289 0.286 0.52 0.164 0.586 ...
## $ ACH : num -0.051 -0.004 -0.001 -0.005 -0.053 -0.037 -0.061 0.057 0.023 0.034 ...
## $ STRK : int -6 -6 -1 1 -1 -4 2 2 1 -2 ...
# Build the predictor table: drop the team label, the target itself (WIN.),
# and every column that is computed from the outcome, so nothing leaks the
# answer into the model.
#   TEAM - character label, not a numeric predictor
#   W, L - win/loss counts; WIN. is W / (W + L) (e.g. 15 / 82 = 0.183)
#   STRK - already excluded by the original script
#   ACH  - in the rows printed by str(nbaStats), ACH equals WIN. - eWIN.
#          (0.183 - 0.234 = -0.051, 0.439 - 0.443 = -0.004), so keeping it
#          alongside eWIN. lets a linear model rebuild WIN. exactly.
# NOTE(review): eWIN. and pWIN. look like win-percentage estimates; confirm
# they are computed without W/L before relying on them as predictors.
# NOTE(review): the prediction summary and error metrics recorded further
# down this file (RMSE ~ 1e-17, R2 = 1) were produced while ACH was still a
# predictor; re-run the script to refresh them.
features <- nbaStats %>%
  select(-TEAM, -W, -L, -WIN., -STRK, -ACH)
str(features)
## 'data.frame': 30 obs. of 13 variables:
## $ PPG : num 114 118 116 123 110 ...
## $ oPPG : num 123 120 120 120 119 ...
## $ pDIFF: num -9.3 -2.2 -4.8 3.1 -9.1 -6.4 -6.5 0.6 -10.2 2.6 ...
## $ PACE : num 102.6 100 99.4 101.7 99.8 ...
## $ oEFF : num 110 117 116 121 110 ...
## $ dEFF : num 120 119 120 118 119 ...
## $ eDIFF: num -9.1 -2.1 -4.9 3 -9.1 -6.4 -6.4 0.6 -10.5 2.6 ...
## $ SOS : num 1.06 -0.61 0.38 0.15 0.79 0.36 0.5 -0.08 0.86 -0.44 ...
## $ SAR : num -8.04 -2.71 -4.52 3.15 -8.31 -6.04 -5.9 0.52 -9.64 2.16 ...
## $ CONS : num 12.5 14.7 15.9 15.2 12 15.7 14.4 15 14.4 16.1 ...
## $ A4F : num -0.052 -0.05 0.004 0.012 -0.01 -0.045 -0.015 0.026 -0.068 0.05 ...
## $ eWIN.: num 0.234 0.443 0.379 0.578 0.224 0.342 0.329 0.516 0.233 0.564 ...
## $ pWIN.: num 0.194 0.428 0.342 0.602 0.2 0.289 0.286 0.52 0.164 0.586 ...
# Response vector: the WIN. column of the raw table
target <- nbaStats[["WIN."]]
# Partition row indices with a fixed seed so the split is reproducible;
# p = 0.8 sends about 80% of the rows to the training side.
set.seed(42)
train_index <- createDataPartition(y = target, p = 0.8, list = FALSE)
# Responses for each side of the split
y_train <- target[train_index]
y_test <- target[-train_index]
# Matching predictor rows for each side of the split
X_train <- features[train_index, ]
X_test <- features[-train_index, ]
# Fit a linear model ("lm") on the training rows through caret's train();
# no trControl is supplied, so caret's default resampling settings apply.
model <- train(x = X_train, y = y_train, method = "lm")
# Cross-tabulate every team against whether its WIN. value exceeds 0.56
table(nbaStats[["TEAM"]], nbaStats[["WIN."]] > 0.56)
##
## FALSE TRUE
## Atlanta 1 0
## Boston 0 1
## Brooklyn 1 0
## Charlotte 1 0
## Chicago 1 0
## Cleveland 0 1
## Dallas 0 1
## Denver 0 1
## Detroit 1 0
## Golden State 0 1
## Houston 1 0
## Indiana 0 1
## LA Clippers 0 1
## LA Lakers 0 1
## Memphis 1 0
## Miami 0 1
## Milwaukee 0 1
## Minnesota 0 1
## New Orleans 0 1
## New York 0 1
## Oklahoma City 0 1
## Orlando 0 1
## Philadelphia 0 1
## Phoenix 0 1
## Portland 1 0
## Sacramento 0 1
## San Antonio 1 0
## Toronto 1 0
## Utah 1 0
## Washington 1 0
# Add a PTSdiff column (PPG minus oPPG) to the raw table and summarise it
nbaStats[["PTSdiff"]] <- nbaStats[["PPG"]] - nbaStats[["oPPG"]]
summary(nbaStats[["PTSdiff"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -10.20000 -4.32500 1.95000 0.01667 3.10000 11.40000
# Scatter plot of W against PTSdiff (argument expressions are kept as-is so
# the default axis labels do not change)
plot(x = nbaStats$PTSdiff, y = nbaStats$W)

# Largest WIN. value in the table
max(nbaStats[["WIN."]])
## [1] 0.78
# Score the held-out rows with the fitted model and summarise the predictions
y_pred <- predict(model, newdata = X_test)
summary(object = y_pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.256 0.439 0.549 0.491 0.601 0.610
# Evaluate the model's performance on the held-out rows.
# NOTE(review): the recorded output in this file shows errors at machine
# precision and R2 = 1, which is implausible for real data and suggests the
# predictors can reproduce the target exactly; check for target leakage.
test_errors <- y_test - y_pred
mae <- mean(abs(test_errors))
mse <- mean(test_errors^2)
rmse <- sqrt(mse)
# R2 here is the squared Pearson correlation of observed vs. predicted values
r2 <- cor(y_test, y_pred)^2
rmse
## [1] 2.775558e-17
# Output the evaluation metrics
list(
  MAE = mae,
  MSE = mse,
  RMSE = rmse,
  R2 = r2
)
## $MAE
## [1] 1.387779e-17
##
## $MSE
## [1] 7.70372e-34
##
## $RMSE
## [1] 2.775558e-17
##
## $R2
## [1] 1