# Set working directory and path to data
# NOTE(review): this absolute path is machine-specific; the relative output
# file written later in the script lands in this directory.
setwd("C:/Users/LENOVO/Downloads/Regression Model/Project") # Example path on Windows
# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 521728 27.9 1160458 62 660385 35.3
## Vcells 948793 7.3 8388608 64 1769879 13.6
cat("\f") # Clear the console
# Close every open graphics device. The previous `dev.off` (no parentheses)
# never called the function; it only printed its definition. graphics.off()
# is used instead of dev.off() because it does not fail when no device is open.
graphics.off() # Clear the charts
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the four downloaded tables, one data frame per CSV file
df <- read.csv(
  file = "C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download.csv"
)
df1 <- read.csv(
  file = "C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download (1).csv"
)
df2 <- read.csv(
  file = "C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download (2).csv"
)
df3 <- read.csv(
  file = "C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download (3).csv"
)
# Stack the four tables row-wise into a single data frame
merged_df <- bind_rows(list(df, df1, df2, df3))
# Per-column summary statistics for the listed columns
merged_df %>%
  select(
    "Rk", "Team", "G", "MP",
    "FG", "FGA", "FG.",
    "X3P", "X3PA", "X3P.",
    "X2P", "X2PA", "X2P.",
    "FT", "FTA", "FT.",
    "ORB", "DRB", "TRB",
    "AST", "STL", "BLK", "TOV", "PF", "PTS"
  ) %>%
  summary()
## Rk Team G MP
## Min. : 1.0 Length:120 Min. :64.00 Min. :240.0
## 1st Qu.: 8.0 Class :character 1st Qu.:72.00 1st Qu.:241.0
## Median :15.5 Mode :character Median :78.50 Median :241.5
## Mean :15.5 Mean :76.65 Mean :241.6
## 3rd Qu.:23.0 3rd Qu.:82.00 3rd Qu.:242.1
## Max. :30.0 Max. :82.00 Max. :243.7
## FG FGA FG. X3P
## Min. :37.30 Min. :83.70 Min. :0.4290 Min. : 9.60
## 1st Qu.:40.08 1st Qu.:86.78 1st Qu.:0.4570 1st Qu.:11.30
## Median :41.30 Median :88.40 Median :0.4680 Median :12.20
## Mean :41.16 Mean :88.41 Mean :0.4657 Mean :12.42
## 3rd Qu.:42.20 3rd Qu.:90.12 3rd Qu.:0.4750 3rd Qu.:13.40
## Max. :44.70 Max. :94.40 Max. :0.5040 Max. :16.70
## X3PA X3P. X2P X2PA
## Min. :28.00 Min. :0.3230 Min. :24.50 Min. :43.30
## 1st Qu.:31.80 1st Qu.:0.3488 1st Qu.:27.25 1st Qu.:50.98
## Median :34.20 Median :0.3585 Median :28.70 Median :53.95
## Mean :34.53 Mean :0.3594 Mean :28.75 Mean :53.87
## 3rd Qu.:37.00 3rd Qu.:0.3700 3rd Qu.:30.32 3rd Qu.:56.83
## Max. :45.30 Max. :0.4110 Max. :33.90 Max. :62.10
## X2P. FT FTA FT.
## Min. :0.4760 Min. :13.80 Min. :17.50 Min. :0.6940
## 1st Qu.:0.5198 1st Qu.:16.40 1st Qu.:21.30 1st Qu.:0.7558
## Median :0.5320 Median :17.50 Median :22.40 Median :0.7790
## Mean :0.5340 Mean :17.54 Mean :22.57 Mean :0.7770
## 3rd Qu.:0.5480 3rd Qu.:18.60 3rd Qu.:23.80 3rd Qu.:0.7953
## Max. :0.5860 Max. :21.00 Max. :26.60 Max. :0.8390
## ORB DRB TRB AST
## Min. : 7.600 Min. :30.30 Min. :38.80 Min. :20.60
## 1st Qu.: 9.475 1st Qu.:32.90 1st Qu.:42.90 1st Qu.:23.70
## Median :10.150 Median :34.05 Median :44.20 Median :24.70
## Mean :10.172 Mean :34.08 Mean :44.24 Mean :24.79
## 3rd Qu.:10.700 3rd Qu.:35.12 3rd Qu.:45.40 3rd Qu.:25.93
## Max. :14.100 Max. :42.20 Max. :51.70 Max. :29.80
## STL BLK TOV PF
## Min. : 6.100 Min. :3.000 Min. :11.10 Min. :17.20
## 1st Qu.: 7.000 1st Qu.:4.375 1st Qu.:13.30 1st Qu.:18.90
## Median : 7.450 Median :4.750 Median :14.15 Median :19.90
## Mean : 7.538 Mean :4.787 Mean :14.06 Mean :19.92
## 3rd Qu.: 8.000 3rd Qu.:5.200 3rd Qu.:14.80 3rd Qu.:20.90
## Max. :10.000 Max. :6.600 Max. :16.50 Max. :23.10
## PTS
## Min. :102.9
## 1st Qu.:109.8
## Median :112.8
## Mean :112.3
## 3rd Qu.:115.1
## Max. :120.7
# Save the stacked data to a CSV file in the working directory,
# without a row-name column
write.csv(x = merged_df, file = "merged_dataset.csv", row.names = FALSE)
# Show the first 10 rows
head(x = merged_df, n = 10)
## Rk Team G MP FG FGA FG. X3P X3PA X3P. X2P
## 1 1 Milwaukee Bucks* 73 241.0 43.3 90.9 0.476 13.8 38.9 0.355 29.5
## 2 2 Houston Rockets* 72 241.4 40.8 90.4 0.451 15.6 45.3 0.345 25.1
## 3 3 Dallas Mavericks* 75 242.3 41.7 90.3 0.461 15.1 41.3 0.367 26.5
## 4 4 Los Angeles Clippers* 72 241.4 41.6 89.2 0.466 12.4 33.5 0.371 29.1
## 5 5 New Orleans Pelicans 72 242.1 42.6 91.6 0.465 13.6 36.9 0.370 28.9
## 6 6 Portland Trail Blazers* 74 241.0 42.2 91.2 0.463 12.9 34.1 0.377 29.3
## 7 7 Washington Wizards 72 241.0 41.5 90.9 0.457 12.0 32.6 0.368 29.5
## 8 8 San Antonio Spurs 71 242.5 42.2 89.4 0.472 10.7 28.5 0.376 31.5
## 9 9 Boston Celtics* 72 242.1 41.3 89.6 0.461 12.6 34.5 0.364 28.7
## 10 10 Phoenix Suns 73 241.0 41.2 88.1 0.468 11.4 31.8 0.358 29.8
## X2PA X2P. FT FTA FT. ORB DRB TRB AST STL BLK TOV PF PTS
## 1 52.0 0.567 18.3 24.7 0.742 9.5 42.2 51.7 25.9 7.2 5.9 15.1 19.6 118.7
## 2 45.2 0.557 20.6 26.1 0.791 9.8 34.5 44.3 21.6 8.7 5.2 14.7 21.8 117.8
## 3 49.0 0.541 18.6 23.8 0.779 10.5 36.4 46.9 24.7 6.1 4.8 12.7 19.5 117.0
## 4 55.8 0.522 20.8 26.3 0.791 10.7 37.0 47.7 23.7 7.1 4.7 14.6 22.1 116.3
## 5 54.8 0.528 17.1 23.4 0.729 11.1 35.4 46.5 26.8 7.5 5.0 16.4 21.2 115.8
## 6 57.1 0.514 17.7 22.1 0.804 10.2 35.1 45.3 20.6 6.3 6.1 12.8 21.7 115.0
## 7 58.3 0.506 19.4 24.6 0.788 10.2 31.9 42.0 25.0 8.0 4.3 14.2 22.7 114.4
## 8 61.0 0.516 19.0 23.4 0.810 9.0 35.6 44.6 24.7 7.3 5.5 12.6 19.4 114.1
## 9 55.0 0.522 18.6 23.2 0.801 10.7 35.4 46.1 23.0 8.3 5.6 13.8 21.6 113.7
## 10 56.3 0.529 19.9 23.8 0.834 9.8 33.8 43.5 27.2 7.7 4.0 14.8 22.0 113.6
# Compact overview of the merged data: dimensions, column types, first values
str(object = merged_df)
## 'data.frame': 120 obs. of 25 variables:
## $ Rk : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Team: chr "Milwaukee Bucks*" "Houston Rockets*" "Dallas Mavericks*" "Los Angeles Clippers*" ...
## $ G : int 73 72 75 72 72 74 72 71 72 73 ...
## $ MP : num 241 241 242 241 242 ...
## $ FG : num 43.3 40.8 41.7 41.6 42.6 42.2 41.5 42.2 41.3 41.2 ...
## $ FGA : num 90.9 90.4 90.3 89.2 91.6 91.2 90.9 89.4 89.6 88.1 ...
## $ FG. : num 0.476 0.451 0.461 0.466 0.465 0.463 0.457 0.472 0.461 0.468 ...
## $ X3P : num 13.8 15.6 15.1 12.4 13.6 12.9 12 10.7 12.6 11.4 ...
## $ X3PA: num 38.9 45.3 41.3 33.5 36.9 34.1 32.6 28.5 34.5 31.8 ...
## $ X3P.: num 0.355 0.345 0.367 0.371 0.37 0.377 0.368 0.376 0.364 0.358 ...
## $ X2P : num 29.5 25.1 26.5 29.1 28.9 29.3 29.5 31.5 28.7 29.8 ...
## $ X2PA: num 52 45.2 49 55.8 54.8 57.1 58.3 61 55 56.3 ...
## $ X2P.: num 0.567 0.557 0.541 0.522 0.528 0.514 0.506 0.516 0.522 0.529 ...
## $ FT : num 18.3 20.6 18.6 20.8 17.1 17.7 19.4 19 18.6 19.9 ...
## $ FTA : num 24.7 26.1 23.8 26.3 23.4 22.1 24.6 23.4 23.2 23.8 ...
## $ FT. : num 0.742 0.791 0.779 0.791 0.729 0.804 0.788 0.81 0.801 0.834 ...
## $ ORB : num 9.5 9.8 10.5 10.7 11.1 10.2 10.2 9 10.7 9.8 ...
## $ DRB : num 42.2 34.5 36.4 37 35.4 35.1 31.9 35.6 35.4 33.8 ...
## $ TRB : num 51.7 44.3 46.9 47.7 46.5 45.3 42 44.6 46.1 43.5 ...
## $ AST : num 25.9 21.6 24.7 23.7 26.8 20.6 25 24.7 23 27.2 ...
## $ STL : num 7.2 8.7 6.1 7.1 7.5 6.3 8 7.3 8.3 7.7 ...
## $ BLK : num 5.9 5.2 4.8 4.7 5 6.1 4.3 5.5 5.6 4 ...
## $ TOV : num 15.1 14.7 12.7 14.6 16.4 12.8 14.2 12.6 13.8 14.8 ...
## $ PF : num 19.6 21.8 19.5 22.1 21.2 21.7 22.7 19.4 21.6 22 ...
## $ PTS : num 119 118 117 116 116 ...
# Number of missing values in each column
colSums(x = is.na(merged_df))
## Rk Team G MP FG FGA FG. X3P X3PA X3P. X2P X2PA X2P. FT FTA FT.
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## ORB DRB TRB AST STL BLK TOV PF PTS
## 0 0 0 0 0 0 0 0 0
# Class of every column in the merged data
sapply(X = merged_df, FUN = class)
## Rk Team G MP FG FGA
## "integer" "character" "integer" "numeric" "numeric" "numeric"
## FG. X3P X3PA X3P. X2P X2PA
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## X2P. FT FTA FT. ORB DRB
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## TRB AST STL BLK TOV PF
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## PTS
## "numeric"
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Plain-text table of descriptive statistics (mean, min, max, standard
# deviation, median); the character column Team does not appear in the table
stargazer(
  merged_df,
  type = "text",
  summary.stat = c("mean", "min", "max", "sd", "median")
)
##
## ==================================================
## Statistic Mean Min Max St. Dev. Median
## --------------------------------------------------
## Rk 15.500 1 30 8.692 15.5
## G 76.650 64 82 5.640 78.5
## MP 241.583 240.000 243.700 0.810 241.500
## FG 41.163 37.300 44.700 1.563 41.300
## FGA 88.407 83.700 94.400 2.234 88.400
## FG. 0.466 0.429 0.504 0.015 0.468
## X3P 12.420 9.600 16.700 1.495 12.200
## X3PA 34.532 28.000 45.300 3.609 34.200
## X3P. 0.359 0.323 0.411 0.016 0.358
## X2P 28.747 24.500 33.900 2.107 28.700
## X2PA 53.873 43.300 62.100 4.066 53.950
## X2P. 0.534 0.476 0.586 0.021 0.532
## FT 17.536 13.800 21.000 1.455 17.500
## FTA 22.575 17.500 26.600 1.816 22.400
## FT. 0.777 0.694 0.839 0.028 0.779
## ORB 10.172 7.600 14.100 1.127 10.150
## DRB 34.077 30.300 42.200 1.708 34.050
## TRB 44.242 38.800 51.700 1.982 44.200
## AST 24.788 20.600 29.800 1.774 24.700
## STL 7.538 6.100 10.000 0.790 7.450
## BLK 4.787 3.000 6.600 0.683 4.750
## TOV 14.063 11.100 16.500 1.095 14.150
## PF 19.922 17.200 23.100 1.272 19.900
## PTS 112.266 102.900 120.700 3.853 112.850
## --------------------------------------------------
library(visdat)
## Warning: package 'visdat' was built under R version 4.3.2
# Plot an overview of the column types and any missing values
vis_dat(x = merged_df)
# List the column names of the merged data
names(merged_df)
## [1] "Rk" "Team" "G" "MP" "FG" "FGA" "FG." "X3P" "X3PA" "X3P."
## [11] "X2P" "X2PA" "X2P." "FT" "FTA" "FT." "ORB" "DRB" "TRB" "AST"
## [21] "STL" "BLK" "TOV" "PF" "PTS"
The estimating equation for this simple linear regression model is:
$$PTS = \beta_0 + \beta_1 FG + \epsilon$$
where:
PTS is the dependent variable (points scored). FG is the independent variable (field goals made; attempts are recorded separately in FGA). $\beta_0$ is the intercept of the model. $\beta_1$ is the coefficient for the independent variable FG, representing the effect of one additional field goal made on points scored. $\epsilon$ is the error term, which captures the variability in PTS that is not explained by FG.
# Simple linear regression: points scored (PTS) explained by field goals (FG)
model <- lm(formula = PTS ~ FG, data = merged_df)
# Coefficient table, residual summary and goodness-of-fit statistics
summary(object = model)
##
## Call:
## lm(formula = PTS ~ FG, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8322 -1.4754 -0.2299 1.3163 6.2945
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.9297 4.9091 5.282 5.92e-07 ***
## FG 2.0974 0.1192 17.599 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.032 on 118 degrees of freedom
## Multiple R-squared: 0.7241, Adjusted R-squared: 0.7218
## F-statistic: 309.7 on 1 and 118 DF, p-value: < 2.2e-16
# Plot diagnostic plots for the model
# Arrange the lm diagnostic plots on one 2 x 2 page. par() returns the
# previous settings; they are restored afterwards so that later plots are
# not left in the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)
The estimating equation for this multiple linear regression model is:
$$PTS = \beta_0 + \beta_1 FG + \beta_2 AST + \beta_3 ORB + \epsilon$$
where:
PTS is the dependent variable (points scored). FG is the number of field goals made. AST is the number of assists. ORB is the number of offensive rebounds. $\beta_0$ is the intercept, representing the expected value of PTS when FG, AST, and ORB are all zero. $\beta_1$ is the coefficient for FG, indicating the effect of an additional field goal on points scored, holding AST and ORB constant. $\beta_2$ is the coefficient for AST, indicating the effect of an additional assist on points scored, holding FG and ORB constant. $\beta_3$ is the coefficient for ORB, indicating the effect of an additional offensive rebound on points scored, holding FG and AST constant. $\epsilon$ is the error term, capturing the variability in PTS that is not explained by FG, AST, and ORB.
# Multiple linear regression: points scored (PTS) explained by field goals
# (FG), assists (AST) and offensive rebounds (ORB)
model1 <- lm(formula = PTS ~ FG + AST + ORB, data = merged_df)
# Coefficient table, residual summary and goodness-of-fit statistics
summary(object = model1)
##
## Call:
## lm(formula = PTS ~ FG + AST + ORB, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1766 -1.2938 -0.1248 1.3321 5.5435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.2515 4.9538 5.501 2.28e-07 ***
## FG 2.2706 0.1473 15.412 < 2e-16 ***
## AST -0.2213 0.1289 -1.717 0.0886 .
## ORB -0.2911 0.1671 -1.742 0.0841 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.006 on 116 degrees of freedom
## Multiple R-squared: 0.7358, Adjusted R-squared: 0.7289
## F-statistic: 107.7 on 3 and 116 DF, p-value: < 2.2e-16
# Plot diagnostic plots for the multiple regression model
# Arrange the lm diagnostic plots on one 2 x 2 page. par() returns the
# previous settings; they are restored afterwards so that later plots are
# not left in the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(model1)
par(old_par)