# Set working directory and path to data
# NOTE(review): this absolute path is machine-specific; adjust it before
# running the script on another computer.
  setwd("C:/Users/LENOVO/Downloads/Regression Model/Project")  # Example path on Windows


# Clear the workspace
# rm(list = ls()) removes every object in the current environment, so anything
# created before this point is lost.
  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 521728 27.9    1160458   62   660385 35.3
## Vcells 948793  7.3    8388608   64  1769879 13.6
  cat("\f")       # Clear the console
# graphics.off() closes every open graphics device and does nothing when none
# is open. The previous bare `dev.off` (no parentheses) only printed the
# function definition and never closed a device.
  graphics.off()  # Clear the charts
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the four downloaded CSV exports, one data frame per file
df  <- read.csv(file = "C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download.csv")
df1 <- read.csv(file = "C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download (1).csv")
df2 <- read.csv(file = "C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download (2).csv")
df3 <- read.csv(file = "C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download (3).csv")
# Stack the four data frames row-wise, in file order, into one data frame
merged_df <- df %>%
  bind_rows(df1, df2, df3)
# Per-column summary (quartiles and mean, or length/class for text columns)
# of the 25 columns selected by name
merged_df %>%
  select(
    "Rk", "Team", "G", "MP", "FG", "FGA", "FG.", "X3P", "X3PA", "X3P.",
    "X2P", "X2PA", "X2P.", "FT", "FTA", "FT.", "ORB", "DRB", "TRB", "AST",
    "STL", "BLK", "TOV", "PF", "PTS"
  ) %>%
  summary()
##        Rk           Team                 G               MP       
##  Min.   : 1.0   Length:120         Min.   :64.00   Min.   :240.0  
##  1st Qu.: 8.0   Class :character   1st Qu.:72.00   1st Qu.:241.0  
##  Median :15.5   Mode  :character   Median :78.50   Median :241.5  
##  Mean   :15.5                      Mean   :76.65   Mean   :241.6  
##  3rd Qu.:23.0                      3rd Qu.:82.00   3rd Qu.:242.1  
##  Max.   :30.0                      Max.   :82.00   Max.   :243.7  
##        FG             FGA             FG.              X3P       
##  Min.   :37.30   Min.   :83.70   Min.   :0.4290   Min.   : 9.60  
##  1st Qu.:40.08   1st Qu.:86.78   1st Qu.:0.4570   1st Qu.:11.30  
##  Median :41.30   Median :88.40   Median :0.4680   Median :12.20  
##  Mean   :41.16   Mean   :88.41   Mean   :0.4657   Mean   :12.42  
##  3rd Qu.:42.20   3rd Qu.:90.12   3rd Qu.:0.4750   3rd Qu.:13.40  
##  Max.   :44.70   Max.   :94.40   Max.   :0.5040   Max.   :16.70  
##       X3PA            X3P.             X2P             X2PA      
##  Min.   :28.00   Min.   :0.3230   Min.   :24.50   Min.   :43.30  
##  1st Qu.:31.80   1st Qu.:0.3488   1st Qu.:27.25   1st Qu.:50.98  
##  Median :34.20   Median :0.3585   Median :28.70   Median :53.95  
##  Mean   :34.53   Mean   :0.3594   Mean   :28.75   Mean   :53.87  
##  3rd Qu.:37.00   3rd Qu.:0.3700   3rd Qu.:30.32   3rd Qu.:56.83  
##  Max.   :45.30   Max.   :0.4110   Max.   :33.90   Max.   :62.10  
##       X2P.              FT             FTA             FT.        
##  Min.   :0.4760   Min.   :13.80   Min.   :17.50   Min.   :0.6940  
##  1st Qu.:0.5198   1st Qu.:16.40   1st Qu.:21.30   1st Qu.:0.7558  
##  Median :0.5320   Median :17.50   Median :22.40   Median :0.7790  
##  Mean   :0.5340   Mean   :17.54   Mean   :22.57   Mean   :0.7770  
##  3rd Qu.:0.5480   3rd Qu.:18.60   3rd Qu.:23.80   3rd Qu.:0.7953  
##  Max.   :0.5860   Max.   :21.00   Max.   :26.60   Max.   :0.8390  
##       ORB              DRB             TRB             AST       
##  Min.   : 7.600   Min.   :30.30   Min.   :38.80   Min.   :20.60  
##  1st Qu.: 9.475   1st Qu.:32.90   1st Qu.:42.90   1st Qu.:23.70  
##  Median :10.150   Median :34.05   Median :44.20   Median :24.70  
##  Mean   :10.172   Mean   :34.08   Mean   :44.24   Mean   :24.79  
##  3rd Qu.:10.700   3rd Qu.:35.12   3rd Qu.:45.40   3rd Qu.:25.93  
##  Max.   :14.100   Max.   :42.20   Max.   :51.70   Max.   :29.80  
##       STL              BLK             TOV              PF       
##  Min.   : 6.100   Min.   :3.000   Min.   :11.10   Min.   :17.20  
##  1st Qu.: 7.000   1st Qu.:4.375   1st Qu.:13.30   1st Qu.:18.90  
##  Median : 7.450   Median :4.750   Median :14.15   Median :19.90  
##  Mean   : 7.538   Mean   :4.787   Mean   :14.06   Mean   :19.92  
##  3rd Qu.: 8.000   3rd Qu.:5.200   3rd Qu.:14.80   3rd Qu.:20.90  
##  Max.   :10.000   Max.   :6.600   Max.   :16.50   Max.   :23.10  
##       PTS       
##  Min.   :102.9  
##  1st Qu.:109.8  
##  Median :112.8  
##  Mean   :112.3  
##  3rd Qu.:115.1  
##  Max.   :120.7
# Save the merged data as "merged_dataset.csv" (relative to the working
# directory), without a row-name column
write.csv(x = merged_df, file = "merged_dataset.csv", row.names = FALSE)
# Show the first ten rows of the merged data
head(x = merged_df, n = 10)
##    Rk                    Team  G    MP   FG  FGA   FG.  X3P X3PA  X3P.  X2P
## 1   1        Milwaukee Bucks* 73 241.0 43.3 90.9 0.476 13.8 38.9 0.355 29.5
## 2   2        Houston Rockets* 72 241.4 40.8 90.4 0.451 15.6 45.3 0.345 25.1
## 3   3       Dallas Mavericks* 75 242.3 41.7 90.3 0.461 15.1 41.3 0.367 26.5
## 4   4   Los Angeles Clippers* 72 241.4 41.6 89.2 0.466 12.4 33.5 0.371 29.1
## 5   5    New Orleans Pelicans 72 242.1 42.6 91.6 0.465 13.6 36.9 0.370 28.9
## 6   6 Portland Trail Blazers* 74 241.0 42.2 91.2 0.463 12.9 34.1 0.377 29.3
## 7   7      Washington Wizards 72 241.0 41.5 90.9 0.457 12.0 32.6 0.368 29.5
## 8   8       San Antonio Spurs 71 242.5 42.2 89.4 0.472 10.7 28.5 0.376 31.5
## 9   9         Boston Celtics* 72 242.1 41.3 89.6 0.461 12.6 34.5 0.364 28.7
## 10 10            Phoenix Suns 73 241.0 41.2 88.1 0.468 11.4 31.8 0.358 29.8
##    X2PA  X2P.   FT  FTA   FT.  ORB  DRB  TRB  AST STL BLK  TOV   PF   PTS
## 1  52.0 0.567 18.3 24.7 0.742  9.5 42.2 51.7 25.9 7.2 5.9 15.1 19.6 118.7
## 2  45.2 0.557 20.6 26.1 0.791  9.8 34.5 44.3 21.6 8.7 5.2 14.7 21.8 117.8
## 3  49.0 0.541 18.6 23.8 0.779 10.5 36.4 46.9 24.7 6.1 4.8 12.7 19.5 117.0
## 4  55.8 0.522 20.8 26.3 0.791 10.7 37.0 47.7 23.7 7.1 4.7 14.6 22.1 116.3
## 5  54.8 0.528 17.1 23.4 0.729 11.1 35.4 46.5 26.8 7.5 5.0 16.4 21.2 115.8
## 6  57.1 0.514 17.7 22.1 0.804 10.2 35.1 45.3 20.6 6.3 6.1 12.8 21.7 115.0
## 7  58.3 0.506 19.4 24.6 0.788 10.2 31.9 42.0 25.0 8.0 4.3 14.2 22.7 114.4
## 8  61.0 0.516 19.0 23.4 0.810  9.0 35.6 44.6 24.7 7.3 5.5 12.6 19.4 114.1
## 9  55.0 0.522 18.6 23.2 0.801 10.7 35.4 46.1 23.0 8.3 5.6 13.8 21.6 113.7
## 10 56.3 0.529 19.9 23.8 0.834  9.8 33.8 43.5 27.2 7.7 4.0 14.8 22.0 113.6
# Compact overview of the merged data: dimensions, column types, first values
utils::str(merged_df)
## 'data.frame':    120 obs. of  25 variables:
##  $ Rk  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Team: chr  "Milwaukee Bucks*" "Houston Rockets*" "Dallas Mavericks*" "Los Angeles Clippers*" ...
##  $ G   : int  73 72 75 72 72 74 72 71 72 73 ...
##  $ MP  : num  241 241 242 241 242 ...
##  $ FG  : num  43.3 40.8 41.7 41.6 42.6 42.2 41.5 42.2 41.3 41.2 ...
##  $ FGA : num  90.9 90.4 90.3 89.2 91.6 91.2 90.9 89.4 89.6 88.1 ...
##  $ FG. : num  0.476 0.451 0.461 0.466 0.465 0.463 0.457 0.472 0.461 0.468 ...
##  $ X3P : num  13.8 15.6 15.1 12.4 13.6 12.9 12 10.7 12.6 11.4 ...
##  $ X3PA: num  38.9 45.3 41.3 33.5 36.9 34.1 32.6 28.5 34.5 31.8 ...
##  $ X3P.: num  0.355 0.345 0.367 0.371 0.37 0.377 0.368 0.376 0.364 0.358 ...
##  $ X2P : num  29.5 25.1 26.5 29.1 28.9 29.3 29.5 31.5 28.7 29.8 ...
##  $ X2PA: num  52 45.2 49 55.8 54.8 57.1 58.3 61 55 56.3 ...
##  $ X2P.: num  0.567 0.557 0.541 0.522 0.528 0.514 0.506 0.516 0.522 0.529 ...
##  $ FT  : num  18.3 20.6 18.6 20.8 17.1 17.7 19.4 19 18.6 19.9 ...
##  $ FTA : num  24.7 26.1 23.8 26.3 23.4 22.1 24.6 23.4 23.2 23.8 ...
##  $ FT. : num  0.742 0.791 0.779 0.791 0.729 0.804 0.788 0.81 0.801 0.834 ...
##  $ ORB : num  9.5 9.8 10.5 10.7 11.1 10.2 10.2 9 10.7 9.8 ...
##  $ DRB : num  42.2 34.5 36.4 37 35.4 35.1 31.9 35.6 35.4 33.8 ...
##  $ TRB : num  51.7 44.3 46.9 47.7 46.5 45.3 42 44.6 46.1 43.5 ...
##  $ AST : num  25.9 21.6 24.7 23.7 26.8 20.6 25 24.7 23 27.2 ...
##  $ STL : num  7.2 8.7 6.1 7.1 7.5 6.3 8 7.3 8.3 7.7 ...
##  $ BLK : num  5.9 5.2 4.8 4.7 5 6.1 4.3 5.5 5.6 4 ...
##  $ TOV : num  15.1 14.7 12.7 14.6 16.4 12.8 14.2 12.6 13.8 14.8 ...
##  $ PF  : num  19.6 21.8 19.5 22.1 21.2 21.7 22.7 19.4 21.6 22 ...
##  $ PTS : num  119 118 117 116 116 ...
# Count the missing values in each column
merged_df %>%
  is.na() %>%
  colSums()
##   Rk Team    G   MP   FG  FGA  FG.  X3P X3PA X3P.  X2P X2PA X2P.   FT  FTA  FT. 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  ORB  DRB  TRB  AST  STL  BLK  TOV   PF  PTS 
##    0    0    0    0    0    0    0    0    0
# Report the class of every column as a named character vector.
# vapply() guarantees the result type; sapply() would silently switch to a
# list as soon as one column carried more than one class. Columns with
# several classes are collapsed into a single "a/b" label so each column
# always yields exactly one string.
vapply(
  merged_df,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
##          Rk        Team           G          MP          FG         FGA 
##   "integer" "character"   "integer"   "numeric"   "numeric"   "numeric" 
##         FG.         X3P        X3PA        X3P.         X2P        X2PA 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##        X2P.          FT         FTA         FT.         ORB         DRB 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##         TRB         AST         STL         BLK         TOV          PF 
##   "numeric"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##         PTS 
##   "numeric"
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Plain-text descriptive statistics table: mean, minimum, maximum, standard
# deviation and median
stargazer(
  merged_df,
  type = "text",
  summary.stat = c("mean", "min", "max", "sd", "median")
)
## 
## ==================================================
## Statistic  Mean     Min     Max   St. Dev. Median 
## --------------------------------------------------
## Rk        15.500     1      30     8.692    15.5  
## G         76.650    64      82     5.640    78.5  
## MP        241.583 240.000 243.700  0.810   241.500
## FG        41.163  37.300  44.700   1.563   41.300 
## FGA       88.407  83.700  94.400   2.234   88.400 
## FG.        0.466   0.429   0.504   0.015    0.468 
## X3P       12.420   9.600  16.700   1.495   12.200 
## X3PA      34.532  28.000  45.300   3.609   34.200 
## X3P.       0.359   0.323   0.411   0.016    0.358 
## X2P       28.747  24.500  33.900   2.107   28.700 
## X2PA      53.873  43.300  62.100   4.066   53.950 
## X2P.       0.534   0.476   0.586   0.021    0.532 
## FT        17.536  13.800  21.000   1.455   17.500 
## FTA       22.575  17.500  26.600   1.816   22.400 
## FT.        0.777   0.694   0.839   0.028    0.779 
## ORB       10.172   7.600  14.100   1.127   10.150 
## DRB       34.077  30.300  42.200   1.708   34.050 
## TRB       44.242  38.800  51.700   1.982   44.200 
## AST       24.788  20.600  29.800   1.774   24.700 
## STL        7.538   6.100  10.000   0.790    7.450 
## BLK        4.787   3.000   6.600   0.683    4.750 
## TOV       14.063  11.100  16.500   1.095   14.150 
## PF        19.922  17.200  23.100   1.272   19.900 
## PTS       112.266 102.900 120.700  3.853   112.850
## --------------------------------------------------
library(visdat)
## Warning: package 'visdat' was built under R version 4.3.2
# Plot a visual overview of the merged data frame with visdat
visdat::vis_dat(merged_df)

# List the column names of the merged data frame
colnames(merged_df)
##  [1] "Rk"   "Team" "G"    "MP"   "FG"   "FGA"  "FG."  "X3P"  "X3PA" "X3P."
## [11] "X2P"  "X2PA" "X2P." "FT"   "FTA"  "FT."  "ORB"  "DRB"  "TRB"  "AST" 
## [21] "STL"  "BLK"  "TOV"  "PF"   "PTS"

Basic Regression:

The estimating equation for this linear model is:

$$PTS = \beta_0 + \beta_1 FG + \epsilon$$

Where:

PTS is the dependent variable (points scored per game). FG is the independent variable (field goals made per game; attempts are recorded separately in FGA). $\beta_0$ is the intercept of the model. $\beta_1$ is the coefficient for the independent variable FG, representing the expected change in points scored for one additional field goal made. $\epsilon$ is the error term, which captures the variability in PTS that is not explained by FG.

# Simple linear regression of PTS on FG, estimated on the merged data
model <- lm(formula = PTS ~ FG, data = merged_df)

# Print residual quantiles, the coefficient table and the fit statistics
summary(object = model)
## 
## Call:
## lm(formula = PTS ~ FG, data = merged_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8322 -1.4754 -0.2299  1.3163  6.2945 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.9297     4.9091   5.282 5.92e-07 ***
## FG            2.0974     0.1192  17.599  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.032 on 118 degrees of freedom
## Multiple R-squared:  0.7241, Adjusted R-squared:  0.7218 
## F-statistic: 309.7 on 1 and 118 DF,  p-value: < 2.2e-16
# Plot diagnostic plots for the model
# Arrange the lm diagnostic plots in a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so that later plots are not forced
# into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

Linear regression with multiple variables

The estimating equation for this multiple linear regression model is:

$$PTS = \beta_0 + \beta_1 FG + \beta_2 AST + \beta_3 ORB + \epsilon$$

Where:

PTS is the dependent variable (points scored per game). FG is the number of field goals made per game. AST is the number of assists per game. ORB is the number of offensive rebounds per game. $\beta_0$ is the intercept, representing the baseline value of PTS when FG, AST, and ORB are all zero. $\beta_1$ is the coefficient for FG, indicating the effect of an additional field goal on points scored, holding AST and ORB constant. $\beta_2$ is the coefficient for AST, indicating the effect of an additional assist on points scored, holding FG and ORB constant. $\beta_3$ is the coefficient for ORB, indicating the effect of an additional offensive rebound on points scored, holding FG and AST constant. $\epsilon$ is the error term, capturing the variability in PTS that is not explained by FG, AST, and ORB.

# Multiple linear regression of PTS on FG, AST and ORB, estimated on the
# merged data
model1 <- lm(formula = PTS ~ FG + AST + ORB, data = merged_df)

# Print residual quantiles, the coefficient table and the fit statistics
summary(object = model1)
## 
## Call:
## lm(formula = PTS ~ FG + AST + ORB, data = merged_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1766 -1.2938 -0.1248  1.3321  5.5435 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  27.2515     4.9538   5.501 2.28e-07 ***
## FG            2.2706     0.1473  15.412  < 2e-16 ***
## AST          -0.2213     0.1289  -1.717   0.0886 .  
## ORB          -0.2911     0.1671  -1.742   0.0841 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.006 on 116 degrees of freedom
## Multiple R-squared:  0.7358, Adjusted R-squared:  0.7289 
## F-statistic: 107.7 on 3 and 116 DF,  p-value: < 2.2e-16
# Diagnostic plots for the multiple regression in a 2 x 2 grid. par() returns
# the previous settings, which are restored afterwards so that later plots
# are not forced into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(model1)
par(old_par)