# Load necessary libraries
library(readr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the team statistics CSV (path is relative to the working directory)
# and print the column structure as a quick sanity check.
nbaStats <- read.csv(file = "NBA_Stats202324.csv")
str(object = nbaStats)
## 'data.frame':    30 obs. of  19 variables:
##  $ TEAM : chr  "Washington" "Atlanta" "Utah" "Indiana" ...
##  $ PPG  : num  114 118 116 123 110 ...
##  $ oPPG : num  123 120 120 120 119 ...
##  $ pDIFF: num  -9.3 -2.2 -4.8 3.1 -9.1 -6.4 -6.5 0.6 -10.2 2.6 ...
##  $ PACE : num  102.6 100 99.4 101.7 99.8 ...
##  $ oEFF : num  110 117 116 121 110 ...
##  $ dEFF : num  120 119 120 118 119 ...
##  $ eDIFF: num  -9.1 -2.1 -4.9 3 -9.1 -6.4 -6.4 0.6 -10.5 2.6 ...
##  $ SOS  : num  1.06 -0.61 0.38 0.15 0.79 0.36 0.5 -0.08 0.86 -0.44 ...
##  $ SAR  : num  -8.04 -2.71 -4.52 3.15 -8.31 -6.04 -5.9 0.52 -9.64 2.16 ...
##  $ CONS : num  12.5 14.7 15.9 15.2 12 15.7 14.4 15 14.4 16.1 ...
##  $ A4F  : num  -0.052 -0.05 0.004 0.012 -0.01 -0.045 -0.015 0.026 -0.068 0.05 ...
##  $ W    : int  15 36 31 47 14 25 22 47 21 49 ...
##  $ L    : int  67 46 51 35 68 57 60 35 61 33 ...
##  $ WIN. : num  0.183 0.439 0.378 0.573 0.171 0.305 0.268 0.573 0.256 0.598 ...
##  $ eWIN.: num  0.234 0.443 0.379 0.578 0.224 0.342 0.329 0.516 0.233 0.564 ...
##  $ pWIN.: num  0.194 0.428 0.342 0.602 0.2 0.289 0.286 0.52 0.164 0.586 ...
##  $ ACH  : num  -0.051 -0.004 -0.001 -0.005 -0.053 -0.037 -0.061 0.057 0.023 0.034 ...
##  $ STRK : int  -6 -6 -1 1 -1 -4 2 2 1 -2 ...
# Build the predictor set by dropping columns that must not be predictors:
#   TEAM, STRK - team label and streak, excluded from the model
#   W, L, WIN. - the target itself, or counts it is computed from
#                (e.g. 15 / (15 + 67) = 0.183 for the first row)
#   ACH        - target leakage: in this data ACH equals WIN. - eWIN.
#                (e.g. 0.183 - 0.234 = -0.051 for the first row), so keeping
#                it next to eWIN. lets a linear model rebuild WIN. exactly.
# Any model metrics recorded while ACH was still a predictor (a perfect
# R2 = 1 fit) are an artifact of that leak and need to be regenerated.
# NOTE(review): eWIN. and pWIN. look like other win-percentage estimates;
# confirm whether they should remain as predictors.
features <- nbaStats %>%
  select(-TEAM, -W, -L, -WIN., -STRK, -ACH)
str(features)
## 'data.frame':    30 obs. of  13 variables:
##  $ PPG  : num  114 118 116 123 110 ...
##  $ oPPG : num  123 120 120 120 119 ...
##  $ pDIFF: num  -9.3 -2.2 -4.8 3.1 -9.1 -6.4 -6.5 0.6 -10.2 2.6 ...
##  $ PACE : num  102.6 100 99.4 101.7 99.8 ...
##  $ oEFF : num  110 117 116 121 110 ...
##  $ dEFF : num  120 119 120 118 119 ...
##  $ eDIFF: num  -9.1 -2.1 -4.9 3 -9.1 -6.4 -6.4 0.6 -10.5 2.6 ...
##  $ SOS  : num  1.06 -0.61 0.38 0.15 0.79 0.36 0.5 -0.08 0.86 -0.44 ...
##  $ SAR  : num  -8.04 -2.71 -4.52 3.15 -8.31 -6.04 -5.9 0.52 -9.64 2.16 ...
##  $ CONS : num  12.5 14.7 15.9 15.2 12 15.7 14.4 15 14.4 16.1 ...
##  $ A4F  : num  -0.052 -0.05 0.004 0.012 -0.01 -0.045 -0.015 0.026 -0.068 0.05 ...
##  $ eWIN.: num  0.234 0.443 0.379 0.578 0.224 0.342 0.329 0.516 0.233 0.564 ...
##  $ pWIN.: num  0.194 0.428 0.342 0.602 0.2 0.289 0.286 0.52 0.164 0.586 ...
# Response variable: the WIN. column of the raw data.
target <- nbaStats[["WIN."]]
# Seeded partition of the rows; caret draws roughly 80% of them for
# training, sampling on the response values.
set.seed(42)
train_index <- createDataPartition(y = target, p = 0.8, list = FALSE)
# Training portion: predictors and response for the selected rows.
X_train <- features[train_index, ]
y_train <- target[train_index]
# Held-out portion: every row that was not selected above.
X_test <- features[-train_index, ]
y_test <- target[-train_index]
# Fit a linear regression ("lm") through caret's train() interface.
# No trainControl is supplied, so caret's default resampling is used.
model <- train(x = X_train, y = y_train, method = "lm")
# Cross-tabulate each team against whether its WIN. value exceeds 0.56.
table(nbaStats[["TEAM"]], nbaStats[["WIN."]] > 0.56)
##                
##                 FALSE TRUE
##   Atlanta           1    0
##   Boston            0    1
##   Brooklyn          1    0
##   Charlotte         1    0
##   Chicago           1    0
##   Cleveland         0    1
##   Dallas            0    1
##   Denver            0    1
##   Detroit           1    0
##   Golden State      0    1
##   Houston           1    0
##   Indiana           0    1
##   LA Clippers       0    1
##   LA Lakers         0    1
##   Memphis           1    0
##   Miami             0    1
##   Milwaukee         0    1
##   Minnesota         0    1
##   New Orleans       0    1
##   New York          0    1
##   Oklahoma City     0    1
##   Orlando           0    1
##   Philadelphia      0    1
##   Phoenix           0    1
##   Portland          1    0
##   Sacramento        0    1
##   San Antonio       1    0
##   Toronto           1    0
##   Utah              1    0
##   Washington        1    0
# Add a per-team margin column: PPG minus oPPG.
# NOTE(review): the data already has a pDIFF column that looks like the
# same quantity - confirm whether PTSdiff is needed.
nbaStats[["PTSdiff"]] <- nbaStats[["PPG"]] - nbaStats[["oPPG"]]
summary(nbaStats[["PTSdiff"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -10.20000  -4.32500   1.95000   0.01667   3.10000  11.40000
# Scatter plot of W against the margin computed above.
plot(nbaStats$PTSdiff, nbaStats$W)

# Largest WIN. value in the data.
max(nbaStats[["WIN."]])
## [1] 0.78
# Score the held-out rows with the fitted model.
y_pred <- predict(model, newdata = X_test)
summary(y_pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.256   0.439   0.549   0.491   0.601   0.610
# Hold-out error metrics. R2 is computed as the squared Pearson correlation
# between observed and predicted values.
# NOTE(review): the recorded values (RMSE ~ 1e-17, R2 = 1) are an exact fit
# up to floating-point noise. In this data ACH equals WIN. - eWIN., so a
# result like this points to target-derived predictors rather than real
# predictive skill - verify the predictor set before trusting the numbers.
mse <- mean((y_pred - y_test)^2)
rmse <- sqrt(mse)
mae <- mean(abs(y_pred - y_test))
r2 <- cor(y_test, y_pred)^2
rmse
## [1] 2.775558e-17
# Show all metrics together as one named list.
list(MAE = mae, MSE = mse, RMSE = rmse, R2 = r2)
## $MAE
## [1] 1.387779e-17
## 
## $MSE
## [1] 7.70372e-34
## 
## $RMSE
## [1] 2.775558e-17
## 
## $R2
## [1] 1